"""
This is to clean the data pulled by the photos_detail.py script so as to
further delete useless columns and reorganize the dataset as this form:
| locations | amount | time | license | content_categories | highest_comment | total_view | # noqa: E501
| -------------------------------- | -----: | ---------- | ------: | ------------------ | --------------: | ---------: | # noqa: E501
| Minneapolis, United States | 20 | 2022-10-22 | 4 | football, life | 105 | 100000 | # noqa: E501
| São José do Rio Preto SP, Brasil | 30 | 2022-10-22 | 4 | football, life | 50 | 300000 | # noqa: E501
...
Note:
content_categories will be got from basic NLP on the tags column
"""
# Standard library
import os
import sys

# Third-party
import pandas as pd

# Make the repository root importable before the first-party import below.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

# First-party/Local
import quantify  # noqa: E402

# Set up only LOGGER using quantify.setup()
_, _, _, _, LOGGER = quantify.setup(__file__)
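
# ---------------------------------------------------------------------------
# Illustrative sketch only: the "basic NLP on the tags column" step promised
# in the module docstring is not implemented in this script. The keyword map
# and the function below are assumptions, not part of the original pipeline;
# a real implementation might use tokenization or topic modeling instead of
# plain keyword matching.
# ---------------------------------------------------------------------------
CATEGORY_KEYWORDS = {
    "football": {"football", "soccer", "match", "goal"},
    "life": {"life", "family", "street", "people"},
}


def extract_content_categories(tags):
    """Map a space-separated tag string to comma-joined coarse categories."""
    tokens = set(str(tags).lower().split())
    matched = [
        category
        for category, keywords in CATEGORY_KEYWORDS.items()
        if tokens & keywords
    ]
    return ", ".join(matched)


# Example: extract_content_categories("football goal street") returns
# "football, life".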


def drop_empty_column(csv_path, new_csv_path):
    """
    Drop columns with 'Unnamed' in the name from the CSV file.

    Args:
    - csv_path (str): Path to the original CSV file.
    - new_csv_path (str): Path to save the cleaned CSV file.
    """
    df = pd.read_csv(csv_path)
    # Collect all matching columns first: dropping inside the loop kept only
    # the last drop and raised a NameError when no column matched.
    unnamed_cols = [col for col in df.columns if "Unnamed" in col]
    for col in unnamed_cols:
        print("Dropping column", col)
    data = df.drop(columns=unnamed_cols)
    # index=False keeps pandas from writing a new unnamed index column.
    data.to_csv(new_csv_path, index=False)
    print("Dropping empty columns")


def drop_duplicate_id(csv_path, new_csv_path):
    """
    Drop duplicate rows based on the 'id' column from the CSV file.

    Args:
    - csv_path (str): Path to the original CSV file.
    - new_csv_path (str): Path to save the cleaned CSV file.
    """
    df = pd.read_csv(csv_path)
    data = df.drop_duplicates(subset=["id"])
    data.to_csv(new_csv_path, index=False)
    print("Dropping duplicates")


def save_new_data(csv_path, column_name_list, new_csv_path):
    """
    Save specified columns from the original CSV file to a new CSV file.

    Args:
    - csv_path (str): Path to the original CSV file.
    - column_name_list (list of str): Names of the columns to keep
      (must all exist in the original CSV).
    - new_csv_path (str): Path to save the new CSV file.
    """
    df = pd.read_csv(csv_path)
    for col in column_name_list:
        print("Saving column", col)
    # Select the columns directly instead of copying them one by one
    # through Python lists.
    new_df = df[column_name_list]
    new_df.to_csv(new_csv_path, index=False)
    print("Saving new data to new csv")


def main():
drop_empty_column("hs.csv", "dataset/cleaned_license10.csv")
drop_duplicate_id(
"dataset/cleaned_license10.csv", "dataset/cleaned_license10.csv"
)
save_new_data(
"dataset/cleaned_license10.csv",
[
"location",
"dates",
"license",
"description",
"tags",
"views",
"comments",
],
"dataset/cleaned_license10.csv",
)


if __name__ == "__main__":
try:
main()
except SystemExit as e:
LOGGER.error("System exit with code: %d", e.code)
sys.exit(e.code)
except KeyboardInterrupt:
LOGGER.info("Halted via KeyboardInterrupt.")
sys.exit(130)
except Exception:
LOGGER.exception("Unhandled exception:")
sys.exit(1)