"""
This file is the script of data analysis and visualization
"""

# Standard library
import os
import re
import sys
import warnings

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib.ticker import StrMethodFormatter
from wordcloud import STOPWORDS, WordCloud  # noqa: E402

# First-party/Local
import quantify

# Warning suppression /!\ Caution /!\
warnings.filterwarnings("ignore")

# Setup PATH_WORK_DIR, and LOGGER using quantify.setup()
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
_, PATH_WORK_DIR, _, _, LOGGER = quantify.setup(__file__)

# NOTE(review): the reviewed copy of this file was damaged. Everything between
# the regular expression inside tags_frequency() and the closing statements of
# time_trend_compile_helper() was missing, which covers the rest of
# tags_frequency(), all of time_trend_helper() and possibly further
# definitions. Those parts are NOT re-invented below: each gap is marked and
# raises NotImplementedError. Restore them from version control. The np,
# STOPWORDS and WordCloud imports are presumably used by that lost code, so
# they are kept.
_LOST_SOURCE_NOTE = (
    "the body of this function was missing from the reviewed copy of this "
    "file; restore it from version control"
)

# Flickr license ids that have a cleaned dataset, paired with the display name
# used in plot legends and axis labels.
_FLICKR_LICENSES = (
    (1, "CC BY-NC-SA 2.0"),
    (2, "CC BY-NC 2.0"),
    (3, "CC BY-NC-ND 2.0"),
    (4, "CC BY 2.0"),
    (5, "CC BY-SA 2.0"),
    (6, "CC BY-ND 2.0"),
    (9, "CC0 1.0"),
    (10, "Public Domain Mark 1.0"),
)


def _read_license_csv(license_id):
    """
    Load the cleaned Flickr dataset of one license.

    Args:
    - license_id (int): Flickr license id, e.g. 1 or 10.

    Returns:
    - DataFrame: Content of cleaned_license<license_id>.csv.
    """
    return pd.read_csv(
        os.path.join(
            PATH_WORK_DIR,
            f"../flickr/dataset/cleaned_license{license_id}.csv",
        )
    )


def tags_frequency(csv_path, column_names):
    """
    Generate a word cloud based on all the tags of each license.
    Each license has its own cloud.

    NOTE(review): only the token-collection half of this function was present
    in the reviewed copy. The part that consumed the collected tokens is
    missing, so the function currently stops with NotImplementedError once
    the tokens are collected.

    Args:
    - csv_path (str): Path to the CSV file containing data.
    - column_names (list): List of column names to process.
        Example: ["tags", "description"]

    Raises:
    - NotImplementedError: Always, until the lost remainder is restored.
    """
    df = pd.read_csv(csv_path)

    # Both accumulators are created once, so tokens survive across columns
    # whatever the order of column_names.
    list_tags = []
    list2 = []

    # Process each column containing tags
    for column_name in column_names:
        if column_name == "tags":
            # Each cell holds the text form of a list ("['a', 'b']");
            # converting string to list
            for row in df[column_name]:
                list_tags += str(row).strip("]'[").split("', '")
        else:
            # NOTE(review): the first row is skipped here, exactly as in the
            # original code - confirm whether that is intended.
            for row in df[column_name][1:]:
                cell = str(row)
                if cell in ("", "nan"):
                    continue
                print(cell)
                if "ChineseinUS.org" in cell:
                    cell = "ChineseinUS"
                # NOTE(review): the reviewed text breaks off after "(?" in
                # this pattern; the tail is a reconstruction - verify.
                list2 += re.split(r"\s|(?<!\d)[,.](?!\d)", cell)

    # Fail loudly instead of silently producing no word cloud.
    raise NotImplementedError(f"tags_frequency: {_LOST_SOURCE_NOTE}")


def time_trend_helper(df):
    """
    Placeholder for the per-license date counting helper.

    NOTE(review): the definition was entirely missing from the reviewed copy.
    The only visible contract comes from time_trend_compile(), which passes a
    license DataFrame and reads the "Dates" and "Counts" columns of the
    result.

    Args:
    - df (DataFrame): Input DataFrame.

    Raises:
    - NotImplementedError: Always, until the lost body is restored.
    """
    raise NotImplementedError(f"time_trend_helper: {_LOST_SOURCE_NOTE}")


def time_trend_compile_helper(yearly_count):
    """
    Placeholder for the helper that filters yearly counts.

    NOTE(review): only the closing statements of this function were present
    in the reviewed copy (kept below as a comment). time_trend_compile()
    passes a frame with a "Counts" column and reads the "Years" and
    "Yearly_counts" columns of the result.

    Args:
    - yearly_count (DataFrame): Yearly counts of one license.

    Raises:
    - NotImplementedError: Always, until the lost body is restored.
    """
    # Surviving tail of the original body, verbatim:
    #     ...= 2018 ): counts.append(yearly_count["Counts"][num])
    #     print(counts)
    #     final_yearly_count = pd.DataFrame(
    #         list(zip(Years, counts)), columns=["Years", "Yearly_counts"]
    #     )
    #     return final_yearly_count
    raise NotImplementedError(
        f"time_trend_compile_helper: {_LOST_SOURCE_NOTE}"
    )


def time_trend_compile():
    """
    Compile yearly trends for different licenses and plot them.

    For every license the cleaned dataset is loaded, reduced to per-date
    counts by time_trend_helper(), summed per year and filtered by
    time_trend_compile_helper(). All licenses are then drawn into one line
    graph, which is saved as line_graphs/licenses_yearly_trend.png and shown.

    Input and output paths are resolved against PATH_WORK_DIR, like the other
    functions of this file, instead of the current working directory.
    """
    # Line style per license, in _FLICKR_LICENSES order. None keeps
    # matplotlib's default style for the last series.
    line_styles = ("-", "--", "-.", ":", "-", "--", ":", None)

    yearly_counts = []
    for license_id, _ in _FLICKR_LICENSES:
        raw_counts = time_trend_helper(_read_license_csv(license_id))
        # Split date to year: the year is the part before the first "-"
        raw_counts["Years"] = [
            row.split("-")[0] for row in raw_counts["Dates"]
        ]
        per_year = (
            raw_counts.drop("Dates", axis=1).groupby("Years")["Counts"].sum()
        )
        # Filter yearly count data for the years between 2018 and 2022
        yearly_counts.append(time_trend_compile_helper(per_year.to_frame()))
    print(yearly_counts[0])

    # Plot yearly trend for all licenses
    for (_, label), counts, line_style in zip(
        _FLICKR_LICENSES, yearly_counts, line_styles
    ):
        style = {} if line_style is None else {"linestyle": line_style}
        plt.plot(
            counts["Years"],
            counts["Yearly_counts"],
            label=label,
            alpha=0.7,
            **style,
        )
    plt.legend()
    plt.xlabel("Date of photos taken", fontsize=10)
    plt.ylabel("Amount of photos", fontsize=10)
    plt.title(
        "Data range: first 4000 pictures for each license",
        fontsize=13,
        alpha=0.75,
    )
    plt.suptitle(
        "Yearly Trend of All Licenses 2018-2022",
        fontsize=15,
        fontweight="bold",
    )
    plt.savefig(
        os.path.join(
            PATH_WORK_DIR, "../analyze/line_graphs/licenses_yearly_trend.png"
        ),
        dpi=300,
        bbox_inches="tight",
    )
    plt.show()


def view_compare_helper(df):
    """
    Calculate maximum views of pictures under a license.

    Args:
    - df (DataFrame): Input DataFrame with a "views" column.

    Returns:
    - int: Maximum views.
    """
    return int(df["views"].max())


def view_compare():
    """
    Compare maximum views of pictures under different licenses.

    Draws one bar per license with its highest view count, saves the graph as
    compare_graphs/max_views.png and shows it.
    """
    # Calculate maximum views for each license
    maxs = [
        view_compare_helper(_read_license_csv(license_id))
        for license_id, _ in _FLICKR_LICENSES
    ]
    print(maxs)

    # Create DataFrame to store license and their maximum views
    temp_data = pd.DataFrame(
        {
            "Licenses": [label for _, label in _FLICKR_LICENSES],
            "views": maxs,
        }
    )

    # Plot bar graph
    _, ax = plt.subplots(figsize=(13, 10))
    # The flag is passed positionally: its keyword was renamed from "b" to
    # "visible" in Matplotlib 3.5, so the positional form works on both.
    ax.grid(True, color="grey", linestyle="-.", linewidth=0.5, alpha=0.6)
    sns.set_style("dark")
    sns.barplot(
        data=temp_data,
        x="Licenses",
        y="views",
        palette="flare",
        errorbar="sd",
        ax=ax,
    )
    # Label every container: depending on the seaborn version the bars are
    # grouped into one container or into one container per bar.
    for container in ax.containers:
        ax.bar_label(container)
    ax.text(
        x=0.5,
        y=1.1,
        s="Maximum Views of Pictures under all Licenses",
        fontsize=15,
        weight="bold",
        ha="center",
        va="bottom",
        transform=ax.transAxes,
    )
    ax.text(
        x=0.5,
        y=1.05,
        s="Data range: first 4000 pictures for each license",
        fontsize=13,
        alpha=0.75,
        ha="center",
        va="bottom",
        transform=ax.transAxes,
    )
    # Thousands separators on the y axis. A formatter keeps labels and tick
    # positions in sync, unlike fixed tick label strings.
    ax.yaxis.set_major_formatter(StrMethodFormatter("{x:,.0f}"))
    plt.savefig(
        os.path.join(PATH_WORK_DIR, "../analyze/compare_graphs/max_views.png"),
        dpi=300,
        bbox_inches="tight",
    )
    plt.show()


def total_usage():
    """
    Generate a bar plot showing the total usage of different licenses.

    The plot is written to total_usage.html.
    """
    # Reads the license total file as the input dataset
    df = pd.read_csv(
        os.path.join(PATH_WORK_DIR, "../flickr/dataset/license_total.csv")
    )
    # License ids become strings so plotly colours them as categories
    df["License"] = [str(x) for x in df["License"]]
    fig = px.bar(df, x="License", y="Total amount", color="License")
    fig.write_html(os.path.join(PATH_WORK_DIR, "../analyze/total_usage.html"))


def main():
    """
    Run the tag frequency analysis on the merged, cleaned dataset.
    """
    tags_frequency(
        os.path.join(PATH_WORK_DIR, "merged_all_cleaned.csv"), ["tags"]
    )


if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        # e.code may be None or a string, so it is logged with %s, not %d
        LOGGER.error("System exit with code: %s", e.code)
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        LOGGER.exception("Unhandled exception:")
        sys.exit(1)