Commit 48d5104

CONFLICTS REMOVED

1 parent 7a55164 commit 48d5104

File tree

2 files changed: +26 -41 lines changed

analyze/data_analysis.py (+18 -33)
@@ -29,25 +29,6 @@
 # Set the current working directory
 CWD = os.path.dirname(os.path.abspath(__file__))
 
-# Set up the logger
-LOG = logging.getLogger(__name__)
-LOG.setLevel(logging.INFO)
-
-# Define both the handler and the formatter
-handler = logging.StreamHandler()
-formatter = logging.Formatter(
-    "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
-)
-
-# Add formatter to the handler
-handler.setFormatter(formatter)
-
-# Add handler to the logger
-LOG.addHandler(handler)
-
-# Log the start of the script execution
-LOG.info("Script execution started.")
-
 
 def tags_frequency(csv_path, column_names):
     """
@@ -60,7 +41,7 @@ def tags_frequency(csv_path, column_names):
         Example: ["tags", "description"]
 
     """
-    LOG.info("Generating word cloud based on tags.")
+    LOGGER.info("Generating word cloud based on tags.")
 
     df = pd.read_csv(csv_path)
     # Process each column containing tags
@@ -80,7 +61,7 @@ def tags_frequency(csv_path, column_names):
             and str(row) != ""
            and str(row) != "nan"
         ):
-            LOG.debug(f"Processing row: {row}")
+            LOGGER.debug(f"Processing row: {row}")
             if "ChineseinUS.org" in str(row):
                 row = "ChineseinUS"
             list2 += re.split(r"\s|(?<!\d)[,.](?!\d)", str(row))
@@ -169,7 +150,7 @@ def time_trend_helper(df):
     Returns:
     - DataFrame: DataFrame with counts of entries per year.
     """
-    LOG.info("Extracting year-wise count of entries.")
+    LOGGER.info("Extracting year-wise count of entries.")
 
     year_list = []
     for date_row in df["dates"][0:]:
@@ -197,7 +178,7 @@ def time_trend(csv_path):
     Args:
     - csv_path (str): Path to the CSV file.
     """
-    LOG.info("Generating time trend line graph.")
+    LOGGER.info("Generating time trend line graph.")
 
     df = pd.read_csv(csv_path)
     count_df = time_trend_helper(df)
@@ -240,7 +221,7 @@ def time_trend_compile_helper(yearly_count):
     Returns:
     - DataFrame: Filtered yearly count data.
     """
-    LOG.info("Filtering yearly trend data.")
+    LOGGER.info("Filtering yearly trend data.")
 
     Years = np.arange(2018, 2023)
     yearly_count["year"] = list(yearly_count.index)
@@ -250,7 +231,7 @@
             int(yearly_count["year"][num]) >= 2018
         ):
             counts.append(yearly_count["Counts"][num])
-    LOG.info(f"{counts}")
+    LOGGER.info(f"{counts}")
     final_yearly_count = pd.DataFrame(
         list(zip(Years, counts)), columns=["Years", "Yearly_counts"]
     )
@@ -261,7 +242,7 @@ def time_trend_compile():
     """
     Compile yearly trends for different licenses and plot them.
     """
-    LOG.info("Compiling yearly trends for different licenses.")
+    LOGGER.info("Compiling yearly trends for different licenses.")
 
     license1 = pd.read_csv("../flickr/dataset/cleaned_license1.csv")
     license2 = pd.read_csv("../flickr/dataset/cleaned_license2.csv")
@@ -320,7 +301,7 @@ def time_trend_compile():
     yearly_count6 = time_trend_compile_helper(yearly_count6)
     yearly_count9 = time_trend_compile_helper(yearly_count9)
     yearly_count10 = time_trend_compile_helper(yearly_count10)
-    LOG.info(f"{yearly_count1}")
+    LOGGER.info(f"{yearly_count1}")
 
     # Plot yearly trend for all licenses
     plt.plot(
@@ -409,20 +390,22 @@ def view_compare_helper(df):
     Returns:
     - int: Maximum views.
     """
-    LOG.info("Calculating maximum views of pictures under a license.")
+    LOGGER.info("Calculating maximum views of pictures under a license.")
 
     highest_view = int(max(df["views"]))
     df = df.sort_values("views", ascending=False)
-    LOG.info(f"DataFrame sorted by views in descending order: {df}")
-    LOG.info(f"Maximum views found: {highest_view}")
+    LOGGER.info(f"DataFrame sorted by views in descending order: {df}")
+    LOGGER.info(f"Maximum views found: {highest_view}")
     return highest_view
 
 
 def view_compare():
     """
     Compare maximum views of pictures under different licenses.
     """
-    LOG.info("Comparing maximum views of pictures under different licenses.")
+    LOGGER.info(
+        "Comparing maximum views of pictures under different licenses."
+    )
 
     license1 = pd.read_csv(
         os.path.join(PATH_WORK_DIR, "../flickr/dataset/cleaned_license1.csv")
@@ -462,7 +445,7 @@ def view_compare():
     maxs = []
     for lic in licenses:
         maxs.append(view_compare_helper(lic))
-    LOG.info(f"{maxs}")
+    LOGGER.info(f"{maxs}")
     # Create DataFrame to store license and their maximum views
     temp_data = pd.DataFrame()
     temp_data["Licenses"] = [
@@ -518,7 +501,9 @@ def total_usage():
     """
     Generate a bar plot showing the total usage of different licenses.
     """
-    LOG.info("Generating bar plot showing total usage of different licenses.")
+    LOGGER.info(
+        "Generating bar plot showing total usage of different licenses."
+    )
 
     # Reads the license total file as the input dataset
    df = pd.read_csv(
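
The hunks above delete the module-level logger setup and switch every call from LOG to LOGGER, but the diff never shows where LOGGER is now defined; presumably a shared helper that each script imports. Below is a minimal sketch of what that central setup might look like, assuming a module named shared.py and a setup_logger() helper (both hypothetical, not shown in this commit); its body simply reproduces the handler/formatter block deleted from data_analysis.py.

# shared.py - hypothetical central logging module; the module name and
# function are assumptions, not shown in this commit.
import logging


def setup_logger(name):
    """Return a stream logger configured like the removed module-level setup."""
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
        )
    )
    logger.addHandler(handler)
    return logger


# Each script would then replace the deleted block with (hypothetical):
# LOGGER = setup_logger(__name__)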

flickr/data_cleaning.py (+8 -8)
@@ -34,15 +34,15 @@ def drop_empty_column(csv_path, new_csv_path):
     - csv_path (str): Path to the original CSV file.
     - new_csv_path (str): Path to save the cleaned CSV file.
     """
-    LOG.info("Dropping 'Unnamed' columns from the CSV file.")
+    LOGGER.info("Dropping 'Unnamed' columns from the CSV file.")
 
     df = pd.read_csv(csv_path)
     for col in df.columns:
         if "Unnamed" in col:
             data = df.drop(col, axis=1)
-            LOG.info(f"Dropping column {col}")
+            LOGGER.info(f"Dropping column {col}")
     data.to_csv(new_csv_path)
-    LOG.info("Dropping empty columns completed.")
+    LOGGER.info("Dropping empty columns completed.")
 
 
 def drop_duplicate_id(csv_path, new_csv_path):
@@ -53,14 +53,14 @@ def drop_duplicate_id(csv_path, new_csv_path):
     - csv_path (str): Path to the original CSV file.
     - new_csv_path (str): Path to save the cleaned CSV file.
     """
-    LOG.info(
+    LOGGER.info(
         "Dropping duplicate rows based on the 'id' column from the CSV file."
     )
 
     df = pd.read_csv(csv_path)
     data = df.drop_duplicates(subset=["id"])
     data.to_csv(new_csv_path)
-    LOG.info("Dropping duplicates completed.")
+    LOGGER.info("Dropping duplicates completed.")
 
 
 def save_new_data(csv_path, column_name_list, new_csv_path):
@@ -73,15 +73,15 @@ def save_new_data(csv_path, column_name_list, new_csv_path):
         (belongs to the existing column names from original csv)
     - new_csv_path (str): Path to save the new CSV file.
     """
-    LOG.info("Saving columns from the original CSV to a new CSV.")
+    LOGGER.info("Saving columns from the original CSV to a new CSV.")
 
     df = pd.read_csv(csv_path)
     new_df = pd.DataFrame()
     for col in column_name_list:
         new_df[col] = list(df[col])
-        LOG.info(f"Saving column {col}")
+        LOGGER.info(f"Saving column {col}")
     new_df.to_csv(new_csv_path)
-    LOG.info("Saving new data to new csv")
+    LOGGER.info("Saving new data to new csv")
 
 
 def main():
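
For context, the three functions touched in this file form a small cleaning pipeline: drop empty columns, deduplicate by id, then save a column subset. A hypothetical end-to-end pass is sketched below; the file paths are illustrative placeholders, and the column list is assembled from names that appear elsewhere in this diff.

# Hypothetical cleaning pass chaining the three functions diffed above
# (assumes this runs alongside flickr/data_cleaning.py).
from data_cleaning import drop_duplicate_id, drop_empty_column, save_new_data

drop_empty_column("dataset/license1.csv", "dataset/license1_no_empty.csv")
drop_duplicate_id("dataset/license1_no_empty.csv", "dataset/license1_deduped.csv")
save_new_data(
    "dataset/license1_deduped.csv",
    ["id", "tags", "views", "dates"],  # illustrative column subset
    "dataset/cleaned_license1.csv",
)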
