Commit 48d5104

CONFLICTS REMOVED

1 parent 7a55164 commit 48d5104

File tree

2 files changed: +26 -41 lines changed

analyze/data_analysis.py (+18 -33)
@@ -29,25 +29,6 @@
 # Set the current working directory
 CWD = os.path.dirname(os.path.abspath(__file__))
 
-# Set up the logger
-LOG = logging.getLogger(__name__)
-LOG.setLevel(logging.INFO)
-
-# Define both the handler and the formatter
-handler = logging.StreamHandler()
-formatter = logging.Formatter(
-    "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
-)
-
-# Add formatter to the handler
-handler.setFormatter(formatter)
-
-# Add handler to the logger
-LOG.addHandler(handler)
-
-# Log the start of the script execution
-LOG.info("Script execution started.")
-
 
 def tags_frequency(csv_path, column_names):
     """
@@ -60,7 +41,7 @@ def tags_frequency(csv_path, column_names):
         Example: ["tags", "description"]
 
     """
-    LOG.info("Generating word cloud based on tags.")
+    LOGGER.info("Generating word cloud based on tags.")
 
     df = pd.read_csv(csv_path)
     # Process each column containing tags
@@ -80,7 +61,7 @@ def tags_frequency(csv_path, column_names):
             and str(row) != ""
            and str(row) != "nan"
         ):
-            LOG.debug(f"Processing row: {row}")
+            LOGGER.debug(f"Processing row: {row}")
             if "ChineseinUS.org" in str(row):
                 row = "ChineseinUS"
             list2 += re.split(r"\s|(?<!\d)[,.](?!\d)", str(row))
@@ -169,7 +150,7 @@ def time_trend_helper(df):
     Returns:
     - DataFrame: DataFrame with counts of entries per year.
     """
-    LOG.info("Extracting year-wise count of entries.")
+    LOGGER.info("Extracting year-wise count of entries.")
 
     year_list = []
     for date_row in df["dates"][0:]:
@@ -197,7 +178,7 @@ def time_trend(csv_path):
     Args:
     - csv_path (str): Path to the CSV file.
     """
-    LOG.info("Generating time trend line graph.")
+    LOGGER.info("Generating time trend line graph.")
 
     df = pd.read_csv(csv_path)
     count_df = time_trend_helper(df)
@@ -240,7 +221,7 @@ def time_trend_compile_helper(yearly_count):
     Returns:
     - DataFrame: Filtered yearly count data.
     """
-    LOG.info("Filtering yearly trend data.")
+    LOGGER.info("Filtering yearly trend data.")
 
     Years = np.arange(2018, 2023)
     yearly_count["year"] = list(yearly_count.index)
@@ -250,7 +231,7 @@
             int(yearly_count["year"][num]) >= 2018
         ):
             counts.append(yearly_count["Counts"][num])
-    LOG.info(f"{counts}")
+    LOGGER.info(f"{counts}")
     final_yearly_count = pd.DataFrame(
         list(zip(Years, counts)), columns=["Years", "Yearly_counts"]
     )
@@ -261,7 +242,7 @@ def time_trend_compile():
     """
     Compile yearly trends for different licenses and plot them.
     """
-    LOG.info("Compiling yearly trends for different licenses.")
+    LOGGER.info("Compiling yearly trends for different licenses.")
 
     license1 = pd.read_csv("../flickr/dataset/cleaned_license1.csv")
     license2 = pd.read_csv("../flickr/dataset/cleaned_license2.csv")
@@ -320,7 +301,7 @@ def time_trend_compile():
     yearly_count6 = time_trend_compile_helper(yearly_count6)
     yearly_count9 = time_trend_compile_helper(yearly_count9)
     yearly_count10 = time_trend_compile_helper(yearly_count10)
-    LOG.info(f"{yearly_count1}")
+    LOGGER.info(f"{yearly_count1}")
 
     # Plot yearly trend for all licenses
     plt.plot(
@@ -409,20 +390,22 @@ def view_compare_helper(df):
     Returns:
     - int: Maximum views.
     """
-    LOG.info("Calculating maximum views of pictures under a license.")
+    LOGGER.info("Calculating maximum views of pictures under a license.")
 
     highest_view = int(max(df["views"]))
     df = df.sort_values("views", ascending=False)
-    LOG.info(f"DataFrame sorted by views in descending order: {df}")
-    LOG.info(f"Maximum views found: {highest_view}")
+    LOGGER.info(f"DataFrame sorted by views in descending order: {df}")
+    LOGGER.info(f"Maximum views found: {highest_view}")
     return highest_view
 
 
 def view_compare():
     """
     Compare maximum views of pictures under different licenses.
     """
-    LOG.info("Comparing maximum views of pictures under different licenses.")
+    LOGGER.info(
+        "Comparing maximum views of pictures under different licenses."
+    )
 
     license1 = pd.read_csv(
         os.path.join(PATH_WORK_DIR, "../flickr/dataset/cleaned_license1.csv")
@@ -462,7 +445,7 @@ def view_compare():
     maxs = []
     for lic in licenses:
         maxs.append(view_compare_helper(lic))
-    LOG.info(f"{maxs}")
+    LOGGER.info(f"{maxs}")
     # Create DataFrame to store license and their maximum views
     temp_data = pd.DataFrame()
     temp_data["Licenses"] = [
@@ -518,7 +501,9 @@ def total_usage():
     """
     Generate a bar plot showing the total usage of different licenses.
     """
-    LOG.info("Generating bar plot showing total usage of different licenses.")
+    LOGGER.info(
+        "Generating bar plot showing total usage of different licenses."
+    )
 
     # Reads the license total file as the input dataset
    df = pd.read_csv(
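
The hunks above delete the module-level logger setup and switch every call from LOG to LOGGER, but the diff never shows where LOGGER is now defined; presumably a shared helper that each script imports. Below is a minimal sketch of what that central setup might look like, assuming a module named shared.py and a setup_logger() helper (both hypothetical, not shown in this commit); its body simply reproduces the handler/formatter block deleted from data_analysis.py.

# shared.py - hypothetical central logging module; the module name and
# function are assumptions, not shown in this commit.
import logging


def setup_logger(name):
    """Return a stream logger configured like the removed module-level setup."""
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
        )
    )
    logger.addHandler(handler)
    return logger


# Each script would then replace the deleted block with (hypothetical):
# LOGGER = setup_logger(__name__)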

flickr/data_cleaning.py (+8 -8)
@@ -34,15 +34,15 @@ def drop_empty_column(csv_path, new_csv_path):
     - csv_path (str): Path to the original CSV file.
     - new_csv_path (str): Path to save the cleaned CSV file.
     """
-    LOG.info("Dropping 'Unnamed' columns from the CSV file.")
+    LOGGER.info("Dropping 'Unnamed' columns from the CSV file.")
 
     df = pd.read_csv(csv_path)
     for col in df.columns:
         if "Unnamed" in col:
             data = df.drop(col, axis=1)
-            LOG.info(f"Dropping column {col}")
+            LOGGER.info(f"Dropping column {col}")
     data.to_csv(new_csv_path)
-    LOG.info("Dropping empty columns completed.")
+    LOGGER.info("Dropping empty columns completed.")
 
 
 def drop_duplicate_id(csv_path, new_csv_path):
@@ -53,14 +53,14 @@ def drop_duplicate_id(csv_path, new_csv_path):
     - csv_path (str): Path to the original CSV file.
     - new_csv_path (str): Path to save the cleaned CSV file.
     """
-    LOG.info(
+    LOGGER.info(
         "Dropping duplicate rows based on the 'id' column from the CSV file."
     )
 
     df = pd.read_csv(csv_path)
     data = df.drop_duplicates(subset=["id"])
     data.to_csv(new_csv_path)
-    LOG.info("Dropping duplicates completed.")
+    LOGGER.info("Dropping duplicates completed.")
 
 
 def save_new_data(csv_path, column_name_list, new_csv_path):
@@ -73,15 +73,15 @@ def save_new_data(csv_path, column_name_list, new_csv_path):
         (belongs to the existing column names from original csv)
     - new_csv_path (str): Path to save the new CSV file.
     """
-    LOG.info("Saving columns from the original CSV to a new CSV.")
+    LOGGER.info("Saving columns from the original CSV to a new CSV.")
 
     df = pd.read_csv(csv_path)
     new_df = pd.DataFrame()
     for col in column_name_list:
         new_df[col] = list(df[col])
-        LOG.info(f"Saving column {col}")
+        LOGGER.info(f"Saving column {col}")
     new_df.to_csv(new_csv_path)
-    LOG.info("Saving new data to new csv")
+    LOGGER.info("Saving new data to new csv")
 
 
 def main():
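
For context, the three functions touched in this file form a small cleaning pipeline: drop empty columns, deduplicate by id, then save a column subset. A hypothetical end-to-end pass is sketched below; the file paths are illustrative placeholders, and the column list is assembled from names that appear elsewhere in this diff.

# Hypothetical cleaning pass chaining the three functions diffed above
# (assumes this runs alongside flickr/data_cleaning.py).
from data_cleaning import drop_duplicate_id, drop_empty_column, save_new_data

drop_empty_column("dataset/license1.csv", "dataset/license1_no_empty.csv")
drop_duplicate_id("dataset/license1_no_empty.csv", "dataset/license1_deduped.csv")
save_new_data(
    "dataset/license1_deduped.csv",
    ["id", "tags", "views", "dates"],  # illustrative column subset
    "dataset/cleaned_license1.csv",
)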
