data_cleaning.py
"""
This is to clean the data pulled by the photos_detail.py script so as to
further delete useless columns and reorganize the dataset as this form:
| locations | amount | time | license | content_categories | highest_comment | total_view | # noqa: E501
| -------------------------------- | -----: | ---------- | ------: | ------------------ | --------------: | ---------: | # noqa: E501
| Minneapolis, United States | 20 | 2022-10-22 | 4 | football, life | 105 | 100000 | # noqa: E501
| São José do Rio Preto SP, Brasil | 30 | 2022-10-22 | 4 | football, life | 50 | 300000 | # noqa: E501
...
Note:
content_categories will be got from basic NLP on the tags column
"""

# Standard library
import sys
import traceback

# Third-party
import pandas as pd

sys.path.append(".")

# First-party/Local
import quantify  # noqa: E402

# Setup only LOGGER using quantify.setup()
_, _, _, _, LOGGER = quantify.setup(__file__)


def drop_empty_column(csv_path, new_csv_path):
    """
    Drops columns with 'Unnamed' in the name from the CSV file.

    Args:
    - csv_path (str): Path to the original CSV file.
    - new_csv_path (str): Path to save the cleaned CSV file.
    """
    LOGGER.info("Dropping 'Unnamed' columns from the CSV file.")
    df = pd.read_csv(csv_path)
    # Collect the matching columns first and drop them in one call.
    # Dropping inside the loop with df.drop(col, axis=1) would start from
    # the original DataFrame each time, so only the last 'Unnamed' column
    # would actually be removed.
    unnamed_columns = [col for col in df.columns if "Unnamed" in col]
    for col in unnamed_columns:
        LOGGER.info(f"Dropping column {col}")
    data = df.drop(columns=unnamed_columns)
    # Write without the index so a fresh 'Unnamed: 0' column is not added.
    data.to_csv(new_csv_path, index=False)
    LOGGER.info("Dropping empty columns completed.")


def drop_duplicate_id(csv_path, new_csv_path):
    """
    Drops duplicate rows based on the 'id' column from the CSV file.

    Args:
    - csv_path (str): Path to the original CSV file.
    - new_csv_path (str): Path to save the cleaned CSV file.
    """
    LOGGER.info(
        "Dropping duplicate rows based on the 'id' column from the CSV file."
    )
    df = pd.read_csv(csv_path)
    # drop_duplicates keeps the first occurrence of each id by default.
    data = df.drop_duplicates(subset=["id"])
    data.to_csv(new_csv_path, index=False)
    LOGGER.info("Dropping duplicates completed.")


def save_new_data(csv_path, column_name_list, new_csv_path):
    """
    Saves specified columns from the original CSV file to a new CSV file.

    Args:
    - csv_path (str): Path to the original CSV file.
    - column_name_list (list of str): List of column names to be saved;
      each name must be an existing column in the original CSV, otherwise
      df[col] raises a KeyError.
    - new_csv_path (str): Path to save the new CSV file.
    """
    LOGGER.info("Saving columns from the original CSV to a new CSV.")
    df = pd.read_csv(csv_path)
    new_df = pd.DataFrame()
    for col in column_name_list:
        new_df[col] = list(df[col])
        LOGGER.info(f"Saving column {col}")
    new_df.to_csv(new_csv_path, index=False)
    LOGGER.info("Saving new data to a new CSV completed.")


def main():
    # Each step reads from and writes back to the same cleaned CSV, so the
    # file is overwritten in place after the initial copy from hs.csv.
    drop_empty_column("hs.csv", "dataset/cleaned_license10.csv")
    drop_duplicate_id(
        "dataset/cleaned_license10.csv", "dataset/cleaned_license10.csv"
    )
    save_new_data(
        "dataset/cleaned_license10.csv",
        [
            "location",
            "dates",
            "license",
            "description",
            "tags",
            "views",
            "comments",
        ],
        "dataset/cleaned_license10.csv",
    )


if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
        sys.exit(1)
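
# Usage: run from the repository root, since sys.path.append(".") assumes
# the quantify module lives there, e.g.:
#
#     python data_cleaning.py
#
# The exact invocation path is an assumption; adjust it to wherever this
# script sits in the repository.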