diff --git a/README.md b/README.md index b61eec4a..1b995ae4 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ directories to check: - [ppypa/pipenv][pipenv]: _Python Development Workflow for Humans._ - [pre-commit][pre-commit]: _A framework for managing and maintaining multi-language pre-commit hooks._ +- [Logging][logging]: _Built-in Python logging module to implement a flexible logging system across shared modules._ [ccospyguide]: https://opensource.creativecommons.org/contributing-code/python-guidelines/ [black]: https://github.com/psf/black @@ -143,6 +144,7 @@ directories to check: [isort]: https://pycqa.github.io/isort/ [pipenv]: https://github.com/pypa/pipenv [pre-commit]: https://pre-commit.com/ +[logging]: https://docs.python.org/3/howto/logging.html ### GitHub Actions diff --git a/analyze/data_analysis.py b/analyze/data_analysis.py index 5feb9991..fccdc779 100644 --- a/analyze/data_analysis.py +++ b/analyze/data_analysis.py @@ -3,6 +3,7 @@ """ # Standard library +import logging import os.path import re import sys @@ -24,6 +25,28 @@ # Set the current working directory PATH_WORK_DIR = os.path.dirname(os.path.abspath(__file__)) +# Set the current working directory +CWD = os.path.dirname(os.path.abspath(__file__)) + +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def tags_frequency(csv_path, column_names): """ @@ -36,6 +59,8 @@ def tags_frequency(csv_path, column_names): Example: ["tags", "description"] """ + LOG.info("Generating word cloud based on tags.") + df = pd.read_csv(csv_path) # Process each column containing tags for column_name in column_names: @@ -54,7 +79,7 @@ def tags_frequency(csv_path, column_names): and str(row) != "" and str(row) != "nan" ): - print(str(row)) + LOG.debug(f"Processing row: {row}") if "ChineseinUS.org" in str(row): row = "ChineseinUS" list2 += re.split(r"\s|(?= 2018 ): counts.append(yearly_count["Counts"][num]) - print(counts) + LOG.info(f"{counts}") final_yearly_count = pd.DataFrame( list(zip(Years, counts)), columns=["Years", "Yearly_counts"] ) @@ -229,6 +260,8 @@ def time_trend_compile(): """ Compile yearly trends for different licenses and plot them. """ + LOG.info("Compiling yearly trends for different licenses.") + license1 = pd.read_csv("../flickr/dataset/cleaned_license1.csv") license2 = pd.read_csv("../flickr/dataset/cleaned_license2.csv") license3 = pd.read_csv("../flickr/dataset/cleaned_license3.csv") @@ -286,7 +319,7 @@ def time_trend_compile(): yearly_count6 = time_trend_compile_helper(yearly_count6) yearly_count9 = time_trend_compile_helper(yearly_count9) yearly_count10 = time_trend_compile_helper(yearly_count10) - print(yearly_count1) + LOG.info(f"{yearly_count1}") # Plot yearly trend for all licenses plt.plot( @@ -375,17 +408,21 @@ def view_compare_helper(df): Returns: - int: Maximum views. 
""" + LOG.info("Calculating maximum views of pictures under a license.") + highest_view = int(max(df["views"])) df = df.sort_values("views", ascending=False) + LOG.info(f"DataFrame sorted by views in descending order: {df}") + LOG.info(f"Maximum views found: {highest_view}") return highest_view - print(df) - print(highest_view) def view_compare(): """ Compare maximum views of pictures under different licenses. """ + LOG.info("Comparing maximum views of pictures under different licenses.") + license1 = pd.read_csv( os.path.join(PATH_WORK_DIR, "../flickr/dataset/cleaned_license1.csv") ) @@ -424,7 +461,7 @@ def view_compare(): maxs = [] for lic in licenses: maxs.append(view_compare_helper(lic)) - print(maxs) + LOG.info(f"{maxs}") # Create DataFrame to store license and their maximum views temp_data = pd.DataFrame() temp_data["Licenses"] = [ @@ -480,6 +517,8 @@ def total_usage(): """ Generate a bar plot showing the total usage of different licenses. """ + LOG.info("Generating bar plot showing total usage of different licenses.") + # Reads the license total file as the input dataset df = pd.read_csv( os.path.join(PATH_WORK_DIR, "../flickr/dataset/license_total.csv") @@ -499,15 +538,15 @@ def main(): if __name__ == "__main__": - # Exception handling + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) - sys.exit(1) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") + sys.exit(1) diff --git a/deviantart/deviantart_scratcher.py b/deviantart/deviantart_scratcher.py index 29058df8..2e4e8dff 100755 --- a/deviantart/deviantart_scratcher.py +++ b/deviantart/deviantart_scratcher.py @@ -4,6 +4,7 @@ data. """ # Standard library +import logging import os import sys import traceback @@ -36,6 +37,25 @@ # Retrieve Programmable Search Engine key from environment variables PSE_KEY = os.getenv("PSE_KEY") +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def get_license_list(): """ @@ -45,6 +65,8 @@ def get_license_list(): - np.array: An array containing all license types that should be searched via Programmable Search Engine. """ + LOG.info("Retrieving list of license from Creative Commons' record.") + # Read license data from file cc_license_data = pd.read_csv( os.path.join(PATH_WORK_DIR, "legal-tool-paths.txt"), header=None @@ -70,6 +92,8 @@ def get_request_url(license): Returns: - str: The API Endpoint URL for the query specified by parameters. 
""" + LOG.info(f"Generating API Endpoint URL for specified license: {license}") + try: api_key = API_KEYS[API_KEYS_IND] return ( @@ -80,7 +104,7 @@ def get_request_url(license): ) except Exception as e: if isinstance(e, IndexError): - print("Depleted all API Keys provided", file=sys.stderr) + LOG.exception("Depleted all API Keys provided") else: raise e @@ -97,6 +121,8 @@ def get_response_elems(license): - dict: A dictionary mapping metadata to its value provided from the API query. """ + LOG.info("Making a request to the API and handling potential retries.") + try: # Make a request to the API and handle potential retries request_url = get_request_url(license) @@ -120,9 +146,7 @@ def get_response_elems(license): # If quota limit exceeded, switch to the next API key global API_KEYS_IND API_KEYS_IND += 1 - print( - "Changing API KEYS due to depletion of quota", file=sys.stderr - ) + LOG.exception("Changing API KEYS due to depletion of quota") return get_response_elems(license) else: raise e @@ -130,6 +154,8 @@ def get_response_elems(license): def set_up_data_file(): """Writes the header row to the file to contain DeviantArt data.""" + LOG.info("Setting up data file by writing the header row.") + header_title = "LICENSE TYPE,Document Count" with open(DATA_WRITE_FILE, "w") as f: f.write(f"{header_title}\n") @@ -142,6 +168,11 @@ def record_license_data(license_type): It's a segment of the URL towards the license description. If not provided, it defaults to None, indicating no assumption about the license type. """ + LOG.info( + "Writing the row for license type %s to contain DeviantArt data", + license_type, + ) + data_log = ( f"{license_type}," f"{get_response_elems(license_type)['totalResults']}" @@ -156,6 +187,8 @@ def record_all_licenses(): list and writes this data into the DATA_WRITE_FILE, as specified by the constant. """ + LOG.info("Recording data for all available license types.") + # Get the list of license types license_list = get_license_list() # Record data for each license types @@ -169,14 +202,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") sys.exit(1) diff --git a/flickr/data_cleaning.py b/flickr/data_cleaning.py index 993cba29..3dec9109 100644 --- a/flickr/data_cleaning.py +++ b/flickr/data_cleaning.py @@ -13,12 +13,32 @@ """ # Standard library +import logging import sys import traceback # Third-party import pandas as pd +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def drop_empty_column(csv_path, new_csv_path): """ @@ -27,13 +47,15 @@ def drop_empty_column(csv_path, new_csv_path): - csv_path (str): Path to the original CSV file. - new_csv_path (str): Path to save the cleaned CSV file. 
""" + LOG.info("Dropping 'Unnamed' columns from the CSV file.") + df = pd.read_csv(csv_path) for col in df.columns: if "Unnamed" in col: data = df.drop(col, axis=1) - print("Dropping column", col) + LOG.info(f"Dropping column {col}") data.to_csv(new_csv_path) - print("Dropping empty columns") + LOG.info("Dropping empty columns completed.") def drop_duplicate_id(csv_path, new_csv_path): @@ -44,10 +66,14 @@ def drop_duplicate_id(csv_path, new_csv_path): - csv_path (str): Path to the original CSV file. - new_csv_path (str): Path to save the cleaned CSV file. """ + LOG.info( + "Dropping duplicate rows based on the 'id' column from the CSV file." + ) + df = pd.read_csv(csv_path) data = df.drop_duplicates(subset=["id"]) data.to_csv(new_csv_path) - print("Dropping duplicates") + LOG.info("Dropping duplicates completed.") def save_new_data(csv_path, column_name_list, new_csv_path): @@ -60,13 +86,15 @@ def save_new_data(csv_path, column_name_list, new_csv_path): (belongs to the existing column names from original csv) - new_csv_path (str): Path to save the new CSV file. """ + LOG.info("Saving columns from the original CSV to a new CSV.") + df = pd.read_csv(csv_path) new_df = pd.DataFrame() for col in column_name_list: new_df[col] = list(df[col]) - print("Saving column", col) + LOG.info(f"Saving column {col}") new_df.to_csv(new_csv_path) - print("Saving new data to new csv") + LOG.info("Saving new data to new csv") def main(): @@ -90,14 +118,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) - sys.exit(1) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") + sys.exit(1) diff --git a/flickr/photos.py b/flickr/photos.py index 9a40dcc5..48ae2a00 100644 --- a/flickr/photos.py +++ b/flickr/photos.py @@ -5,6 +5,7 @@ # Standard library import json +import logging import os import os.path import sys @@ -23,6 +24,25 @@ ) load_dotenv(PATH_DOTENV) +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def main(): # Initialize Flickr API instance @@ -45,14 +65,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) - sys.exit(1) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") + sys.exit(1) diff --git a/flickr/photos_detail.py b/flickr/photos_detail.py index ae18d62d..7084db7b 100644 --- a/flickr/photos_detail.py +++ b/flickr/photos_detail.py @@ -9,6 +9,7 @@ # Standard library import json 
+import logging import os import os.path import sys @@ -32,6 +33,25 @@ # Global variable: Number of retries for error handling RETRIES = 0 +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def to_df(datalist, namelist): """ @@ -44,6 +64,8 @@ def to_df(datalist, namelist): Returns: - df (DataFrame): DataFrame constructed from the data. """ + LOG.info("Transforming data into a DataFrame.") + df = [pd.DataFrame() for ind in range(len(datalist))] df = pd.DataFrame(datalist).transpose() df.columns = namelist @@ -61,6 +83,8 @@ def df_to_csv(temp_list, name_list, temp_csv, final_csv): - temp_csv (str): Temporary CSV file path. - final_csv (str): Final CSV file path. """ + LOG.info("Saving data to temporary CSV and merging with final CSV.") + df = to_df(temp_list, name_list) df.to_csv(temp_csv) # Merge temporary CSV with final CSV, ignoring index to avoid duplication @@ -79,6 +103,8 @@ def creat_lisoflis(size): Returns: - temp_list (list): List of empty lists. """ + LOG.info("Saving all the columns with each column as a list") + temp_list = [[] for i in range(size)] return temp_list @@ -91,6 +117,8 @@ def clean_saveas_csv(old_csv_str, new_csv_str): - old_csv_str (str): Path to the old CSV file. - new_csv_str (str): Path to the new CSV file. """ + LOG.info("Cleaning empty columns and save CSV to a new file.") + data = pd.read_csv(old_csv_str, low_memory=False) for col in list(data.columns): if "Unnamed" in col: @@ -141,6 +169,11 @@ def query_data(raw_data, name_list, data_list): - name_list (list): List of column names. - data_list (list): List of lists to store data. """ + + LOG.info( + "Querying useful data from raw pulled data and storing it in lists." + ) + for a in range(0, len(name_list)): if (0 <= a < 4) or a == 9: temp = query_helper2(raw_data, name_list[a], data_list, a) @@ -183,6 +216,8 @@ def page1_reset(final_csv, raw_data): Returns: - int: Total number of pages. 
""" + LOG.info("Reset page count and update total picture count.") + data = pd.read_csv(final_csv, low_memory=False) for col in list(data.columns): data.drop(col, inplace=True, axis=1) @@ -253,31 +288,18 @@ def main(): ) time.sleep(1) photos_detail = json.loads(detailJson.decode("utf-8")) - print( - index, - "id out of", - len(id), - "in license", - i, - "page", - j, - "out of", - total, + LOG.info( + f"{index} id out of {len(id)} in license {i}, " + f"page {j} out of {total}" ) # query process of useful data query_data(photos_detail, name_list, temp_list) j += 1 - print( - "page", - j, - "out of", - total, - "in license", - i, - "with retry number", - RETRIES, + LOG.info( + f"Page {j} out of {total} in license {i}" + f"with retry number {RETRIES}" ) # save data to csv @@ -312,14 +334,14 @@ def main(): try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: RETRIES += 1 - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") if RETRIES <= 20: continue else: diff --git a/google_custom_search/google_scratcher.py b/google_custom_search/google_scratcher.py index d1e12f11..4657e134 100755 --- a/google_custom_search/google_scratcher.py +++ b/google_custom_search/google_scratcher.py @@ -5,6 +5,7 @@ """ # Standard library +import logging import os import sys import traceback @@ -47,6 +48,25 @@ SEARCH_HALFYEAR_SPAN = 20 PSE_KEY = os.getenv("PSE_KEY") +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def get_license_list(): """Provides the list of license from 2018's record of Creative Commons. @@ -55,6 +75,8 @@ def get_license_list(): np.array: An np array containing all license types that should be searched via Programmable Search Engine. """ + LOG.info("Providing the list of licenses from Creative Commons' records.") + cc_license_data = pd.read_csv( os.path.join(PATH_WORK_DIR, "legal-tool-paths.txt"), header=None ) @@ -75,6 +97,11 @@ def get_lang_list(): pd.DataFrame: A Dataframe whose index is language name and has a column for the corresponding language code. """ + LOG.info( + "Providing the list of languages " + "to find Creative Commons usage data on." + ) + languages = pd.read_csv( os.path.join(PATH_WORK_DIR, "google_lang.txt"), sep=": ", @@ -112,6 +139,11 @@ def get_country_list(select_all=False): pd.DataFrame: A Dataframe whose index is country name and has a column for the corresponding country code. """ + LOG.info( + "Providing the list of countries " + "to find Creative Commons usage data on." + ) + countries = pd.read_csv( os.path.join(PATH_WORK_DIR, "google_countries.tsv"), sep="\t" ) @@ -163,6 +195,10 @@ def get_request_url(license=None, country=None, language=None, time=False): string: A string representing the API Endpoint URL for the query specified by this function's parameters. 
""" + LOG.info( + "Providing the API Endpoint URL for specified parameter combinations." + ) + try: api_key = API_KEYS[API_KEYS_IND] base_url = ( @@ -184,7 +220,7 @@ def get_request_url(license=None, country=None, language=None, time=False): return base_url except Exception as e: if isinstance(e, IndexError): - print("Depleted all API Keys provided", file=sys.stderr) + LOG.exception("Depleted all API Keys provided") else: raise e @@ -215,6 +251,8 @@ def get_response_elems(license=None, country=None, language=None, time=False): dict: A dictionary mapping metadata to its value provided from the API query of specified parameters. """ + LOG.info("Providing the metadata for a query of specified parameters.") + try: request_url = get_request_url(license, country, language, time) max_retries = Retry( @@ -236,17 +274,17 @@ def get_response_elems(license=None, country=None, language=None, time=False): if isinstance(e, requests.exceptions.HTTPError): global API_KEYS_IND API_KEYS_IND += 1 - print( - "Changing API KEYS due to depletion of quota", file=sys.stderr - ) + LOG.exception("Changing API KEYS due to depletion of quota") return get_response_elems(license, country, language, time) else: - print(f"Request URL was {request_url}", file=sys.stderr) + LOG.exception(f"Request URL was {request_url}") raise e def set_up_data_file(): """Writes the header row to file to contain Google Query data.""" + LOG.info("Writing the header row to file to contain Google Query data.") + header_title = "LICENSE TYPE,No Priori," selected_countries = get_country_list() all_countries = get_country_list(select_all=True) @@ -285,6 +323,11 @@ def record_license_data(license_type=None, time=False, country=False): A boolean indicating whether this query is related to country occurrence. """ + LOG.info( + "Writing the row for LICENSE_TYPE " + "to file to contain Google Query data." + ) + if license_type is None: data_log = "all" else: @@ -334,6 +377,12 @@ def record_all_licenses(): records these data into the DATA_WRITE_FILE and DATA_WRITE_FILE_TIME as specified in that constant. 
""" + LOG.info( + "Recording the data of all license types " + "findable in the license list into " + "DATA_WRITE_FILE and DATA_WRITE_FILE_TIME" + ) + license_list = get_license_list() record_license_data(time=False) record_license_data(time=True) @@ -349,14 +398,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") sys.exit(1) diff --git a/internetarchive/internetarchive_scratcher.py b/internetarchive/internetarchive_scratcher.py index 7cd86727..0cea61ba 100644 --- a/internetarchive/internetarchive_scratcher.py +++ b/internetarchive/internetarchive_scratcher.py @@ -4,6 +4,7 @@ """ # Standard library +import logging import os import sys import traceback @@ -30,6 +31,25 @@ f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv", ) +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def get_license_list(): """Provides the list of license from a Creative Commons provided tool list. @@ -37,9 +57,15 @@ def get_license_list(): np.array: An np array containing all license types that should be searched via Programmable Search Engine. """ + LOG.info( + "Providing the list of licenses from a " + "Creative Commons provided tool list." + ) + cc_license_data = pd.read_csv( os.path.join(PATH_WORK_DIR, "legal-tool-paths.txt"), header=None ) + license_list = cc_license_data[0].unique() return license_list @@ -57,6 +83,8 @@ def get_response_elems(license): dict: A dictionary mapping metadata to its value provided from the API query of specified parameters. """ + LOG.info("Providing the metadata for query of specified parameters.") + try: max_retries = Retry( total=5, @@ -80,6 +108,8 @@ def get_response_elems(license): def set_up_data_file(): """Writes the header row to file to contain IA data.""" + LOG.info("Writing the header row to file to contain IA data.") + header_title = "LICENSE TYPE,Document Count" with open(DATA_WRITE_FILE, "w") as f: f.write(f"{header_title}\n") @@ -94,6 +124,10 @@ def record_license_data(license_type): default None value stands for having no assumption about license type. """ + LOG.info( + "Writing the row for LICENSE_TYPE to file to contain IA Query data." + ) + data_log = ( f"{license_type}," f"{get_response_elems(license_type)['totalResults']}" @@ -106,6 +140,12 @@ def record_all_licenses(): """Records the data of all license types findable in the license list and records these data into the DATA_WRITE_FILE as specified in that constant. 
""" + LOG.info( + "Recording the data of all license types " + "findable in the license list into " + "DATA_WRITE_FILE and DATA_WRITE_FILE_TIME" + ) + license_list = get_license_list() for license_type in license_list: record_license_data(license_type) @@ -117,14 +157,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") sys.exit(1) diff --git a/metmuseum/metmuseum_scratcher.py b/metmuseum/metmuseum_scratcher.py index 30670ecd..2c1aeb1a 100755 --- a/metmuseum/metmuseum_scratcher.py +++ b/metmuseum/metmuseum_scratcher.py @@ -5,6 +5,7 @@ """ # Standard library +import logging import os import sys import traceback @@ -28,6 +29,25 @@ f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv", ) +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def get_request_url(): """Provides the API Endpoint URL for specified parameter combinations. @@ -35,6 +55,10 @@ def get_request_url(): string: A string representing the API Endpoint URL for the query specified by this function's parameters. """ + LOG.info( + "Providing the API Endpoint URL for specified parameter combinations." + ) + return "https://collectionapi.metmuseum.org/public/collection/v1/objects" @@ -45,6 +69,8 @@ def get_response_elems(): dict: A dictionary mapping metadata to its value provided from the API query of specified parameters. """ + LOG.info("Providing the metadata for query of specified parameters.") + try: request_url = get_request_url() max_retries = Retry( @@ -60,7 +86,7 @@ def get_response_elems(): return search_data except Exception as e: if "pageInfo" not in search_data: - print(f"search data is: \n{search_data}", file=sys.stderr) + LOG.exception(f"search data is: \n{search_data}") sys.exit(1) else: raise e @@ -68,6 +94,8 @@ def get_response_elems(): def set_up_data_file(): """Writes the header row to file to contain metmuseum data.""" + LOG.info("Writing the header row to file to contain metmuseum data.") + header_title = "LICENSE TYPE,Document Count" with open(DATA_WRITE_FILE, "w") as f: f.write(f"{header_title}\n") @@ -77,6 +105,12 @@ def record_all_licenses(): """Records the data of all license types findable in the license list and records these data into the DATA_WRITE_FILE as specified in that constant. 
""" + LOG.info( + "Recording the data of all license types " + "in the license list and " + "recording them into DATA_WRITE_FILE" + ) + with open(DATA_WRITE_FILE, "a") as f: f.write(f"publicdomain/zero/1.0,{get_response_elems()['total']}\n") @@ -87,14 +121,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") sys.exit(1) diff --git a/vimeo/vimeo_scratcher.py b/vimeo/vimeo_scratcher.py index b616a669..b791fb34 100755 --- a/vimeo/vimeo_scratcher.py +++ b/vimeo/vimeo_scratcher.py @@ -8,6 +8,7 @@ """ # Standard library +import logging import os import sys import traceback @@ -35,6 +36,25 @@ f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv", ) +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def get_license_list(): """Provides the list of license from a Creative Commons searched licenses. @@ -42,6 +62,12 @@ def get_license_list(): List: A list containing all license types that should be searched in all possible license filters of Vimeo API. """ + + LOG.info( + "Providing the list of licenses " + "from a Creative Commons searched license." + ) + return [ "CC", "CC-BY", @@ -68,6 +94,10 @@ def get_request_url(license="CC"): string: A string representing the API Endpoint URL for the query specified by this function's parameters. """ + LOG.info( + "Providing the API Endpoint URL for specified parameter combinations." + ) + return ( f"https://api.vimeo.com/videos?filter={license}" f"&client_id={CLIENT_ID}&access_token={ACCESS_TOKEN}" @@ -86,6 +116,8 @@ def get_response_elems(license): dict: A dictionary mapping metadata to its value provided from the API query of specified parameters. """ + LOG.info("Providing the metadata for query of specified parameters.") + try: request_url = get_request_url(license=license) max_retries = Retry( @@ -105,6 +137,8 @@ def get_response_elems(license): def set_up_data_file(): """Writes the header row to file to contain Vimeo data.""" + LOG.info("Writing the header row to file to contain Vimeo data.") + header_title = "LICENSE TYPE,Document Count" with open(DATA_WRITE_FILE, "w") as f: f.write(f"{header_title}\n") @@ -119,6 +153,8 @@ def record_license_data(license_type): default None value stands for having no assumption about license type. """ + LOG.info("Writing the header row to file to contain Vimeo Query data.") + data_log = ( f"{license_type}," f"{get_response_elems(license_type)['totalResults']}" @@ -131,6 +167,12 @@ def record_all_licenses(): """Records the data of all license types findable in the license list and records these data into the DATA_WRITE_FILE as specified in that constant. 
""" + LOG.info( + "Recording the data of all license types " + "in the license list and recording them " + "into DATA_WRITE_FILE" + ) + license_list = get_license_list() for license_type in license_list: record_license_data(license_type) @@ -142,14 +184,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") sys.exit(1) diff --git a/wikicommons/wikicommons_scratcher.py b/wikicommons/wikicommons_scratcher.py index 275f7150..9149204d 100755 --- a/wikicommons/wikicommons_scratcher.py +++ b/wikicommons/wikicommons_scratcher.py @@ -5,6 +5,7 @@ """ # Standard library +import logging import os import sys import traceback @@ -28,6 +29,25 @@ f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv", ) +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def get_content_request_url(license): """Provides the API Endpoint URL for specified parameters' WikiCommons @@ -44,6 +64,10 @@ def get_content_request_url(license): string: A string representing the API Endpoint URL for the query specified by this function's parameters. """ + LOG.info( + "Providing the API Endpoint URL for specified parameters' WikiCommons." + ) + return ( r"https://commons.wikimedia.org/w/api.php?" r"action=query&prop=categoryinfo&titles=" @@ -66,6 +90,12 @@ def get_subcat_request_url(license): string: A string representing the API Endpoint URL for the query specified by this function's parameters. """ + LOG.info( + "Providing the API Endpoint URL " + "for specified parameters' WikiCommons " + "subcategories for recursive searching." + ) + base_url = ( r"https://commons.wikimedia.org/w/api.php?" r"action=query&cmtitle=" @@ -94,6 +124,12 @@ def get_subcategories(license, session): in WikiCommons dataset from a provided API Endpoint URL for the query specified by this function's parameters. """ + LOG.info( + "Obtaining the subcategories of " + "LICENSE in WikiCommons Database " + "for recursive searching." + ) + try: request_url = get_subcat_request_url(license) with session.get(request_url) as response: @@ -107,12 +143,9 @@ def get_subcategories(license, session): return category_list except Exception as e: if "queries" not in search_data: - print( - ( - f"search data is: \n{search_data} for license {license}" - f"This query will not be processed due to empty result." - ), - file=sys.stderr, + LOG.exception( + f"search data is: \n{search_data} " + f"for license {license} - empty result" ) sys.exit(1) else: @@ -136,6 +169,8 @@ def get_license_contents(license, session): dict: A dictionary mapping metadata to its value provided from the API query of specified parameters. 
""" + LOG.info("Providing the metadata for query of specified parameters.") + try: request_url = get_content_request_url(license) with session.get(request_url) as response: @@ -154,12 +189,9 @@ def get_license_contents(license, session): return search_data_dict except Exception as e: if "queries" not in search_data: - print( - ( - f"search data is: \n{search_data} for license {license}" - f"This query will not be processed due to empty result." - ), - file=sys.stderr, + LOG.exception( + f"search data is: \n{search_data} for license {license}" + f"This query will not be processed due to empty result." ) sys.exit(1) else: @@ -168,6 +200,10 @@ def get_license_contents(license, session): def set_up_data_file(): """Writes the header row to file to contain WikiCommons Query data.""" + LOG.info( + "Writing the header row to file to contain WikiCommons Query data." + ) + header_title = "LICENSE TYPE,File Count,Page Count\n" with open(DATA_WRITE_FILE, "w") as f: f.write(header_title) @@ -190,6 +226,10 @@ def record_license_data(license_type, license_alias, session): A requests.Session object for accessing API endpoints and retrieving API endpoint responses. """ + LOG.info( + "Writing the row for LICENSE_TYPE to file to contain WikiCommon Query." + ) + search_result = get_license_contents(license_type, session) cleaned_alias = license_alias.replace(",", "|") data_log = ( @@ -216,6 +256,12 @@ def recur_record_all_licenses(license_alias="Free_Creative_Commons_licenses"): eventual efforts of aggregating data. Defaults to "Free_Creative_Commons_licenses". """ + LOG.info( + "Recursively recording the data of all " + "license types findable in the license " + "lists and recording into DATA_WRITE_FILE" + ) + license_cache = {} session = requests.Session() max_retries = Retry( @@ -244,14 +290,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") sys.exit(1) diff --git a/wikipedia/wikipedia_scratcher.py b/wikipedia/wikipedia_scratcher.py index dcdc0d3a..13271b0f 100755 --- a/wikipedia/wikipedia_scratcher.py +++ b/wikipedia/wikipedia_scratcher.py @@ -4,6 +4,7 @@ """ # Standard library +import logging import os import sys import traceback @@ -28,6 +29,25 @@ f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv", ) +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def get_wiki_langs(): """Provides the list of language to find Creative Commons usage data on. @@ -41,6 +61,10 @@ def get_wiki_langs(): pd.DataFrame: A Dataframe containing information of each Wikipedia language and its respective encoding on web address. """ + LOG.info( + "Providing the list of languages " + "to find Creative Commons usage data on." 
diff --git a/wikipedia/wikipedia_scratcher.py b/wikipedia/wikipedia_scratcher.py
index dcdc0d3a..13271b0f 100755
--- a/wikipedia/wikipedia_scratcher.py
+++ b/wikipedia/wikipedia_scratcher.py
@@ -4,6 +4,7 @@
 """
 # Standard library
+import logging
 import os
 import sys
 import traceback
@@ -28,6 +29,25 @@
     f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv",
 )

+# Set up the logger
+LOG = logging.getLogger(__name__)
+LOG.setLevel(logging.INFO)
+
+# Define both the handler and the formatter
+handler = logging.StreamHandler()
+formatter = logging.Formatter(
+    "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
+)
+
+# Add formatter to the handler
+handler.setFormatter(formatter)
+
+# Add handler to the logger
+LOG.addHandler(handler)
+
+# Log the start of the script execution
+LOG.info("Script execution started.")
+

 def get_wiki_langs():
     """Provides the list of language to find Creative Commons usage data on.
@@ -41,6 +61,10 @@ def get_wiki_langs():
         pd.DataFrame: A Dataframe containing information of each Wikipedia
         language and its respective encoding on web address.
     """
+    LOG.info(
+        "Providing the list of languages "
+        "to find Creative Commons usage data on."
+    )
     return pd.read_csv(os.path.join(PATH_WORK_DIR, "language-codes_csv.csv"))


@@ -57,6 +81,10 @@ def get_request_url(lang="en"):
         string: A string representing the API Endpoint URL for the query
         specified by this function's parameters.
     """
+    LOG.info(
+        "Providing the API Endpoint URL for specified parameter combinations."
+    )
+
     base_url = (
         r"wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=statistics"
         r"&format=json"
@@ -77,6 +105,8 @@ def get_response_elems(language="en"):
         - dict: A dictionary mapping metadata to its value provided from the API
         query of specified parameters.
     """
+    LOG.info("Providing the metadata for query of specified parameters.")
+
     search_data = None
     try:
         request_url = get_request_url(language)
@@ -93,11 +123,10 @@ def get_response_elems(language="en"):
         search_data = response.json()

         if search_data is None:
-            print(
-                f"Received Result is None due to Language {language} absent as"
-                "an available Wikipedia client. Will therefore return an empty"
-                "dictionary for result, but will continue querying.",
-                file=sys.stderr,
+            LOG.error(
+                f"Received Result is None due to Language {language} absent "
+                "as an available Wikipedia client. Will therefore return an "
+                "empty dictionary for result, but will continue querying."
             )
             return {}

@@ -106,18 +135,22 @@ def get_response_elems(language="en"):
         return search_data_dict

     except requests.HTTPError as e:
-        print(f"HTTP Error: {e}", file=sys.stderr)
+        LOG.exception(f"HTTP Error: {e}")
         raise

     except requests.RequestException as e:
-        print(f"Request Exception: {e}", file=sys.stderr)
+        LOG.exception(f"Request Exception: {e}")
         raise

     except KeyError as e:
-        print(f"KeyError: {e}. Search data is: {search_data}", file=sys.stderr)
+        LOG.exception(
+            f"KeyError: {e}. Search data is: {search_data}",
+        )
         raise


 def set_up_data_file():
     """Writes the header row to file to contain Wikipedia Query data."""
+    LOG.info("Writing the header row to file to contain Wikipedia Query data.")
+
     header_title = ",".join(get_response_elems())
     with open(DATA_WRITE_FILE, "w") as f:
         f.write(f"{header_title}\n")
@@ -132,6 +165,11 @@ def record_lang_data(lang="en"):
         presented in. Alternatively, the default value is by Wikipedia customs
         "en".
     """
+    LOG.info(
+        "Writing the row for language LANG "
+        "to file to contain Wikipedia Query data."
+    )
+
     response = get_response_elems(lang)
     if response != {}:
         response_values = response.values()
@@ -144,6 +182,12 @@ def record_all_licenses():
     """Records the data of all language types findable in the language list
     and records these data into the DATA_WRITE_FILE as specified in that
     constant.
     """
+    LOG.info(
+        "Recording the data of all language "
+        "types findable in the language list "
+        "and recording into DATA_WRITE_FILE"
+    )
+
     wiki_langs = get_wiki_langs()
     for iso_language_code in wiki_langs["alpha2"]:
         record_lang_data(iso_language_code)
@@ -157,6 +201,9 @@ def get_current_data():
         pd.DataFrame: A DataFrame recording the number of CC-licensed documents
         per search query of assumption.
""" + LOG.info( + "Returning a DataFrame for the Creative Commons usage data collected" + ) return pd.read_csv(DATA_WRITE_FILE).set_index("language") @@ -166,14 +213,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") sys.exit(1) diff --git a/youtube/youtube_scratcher.py b/youtube/youtube_scratcher.py index dfdc2b65..d62f8fa6 100755 --- a/youtube/youtube_scratcher.py +++ b/youtube/youtube_scratcher.py @@ -5,6 +5,7 @@ """ # Standard library +import logging import os import sys import traceback @@ -38,6 +39,25 @@ f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv", ) +# Set up the logger +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + +# Define both the handler and the formatter +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +) + +# Add formatter to the handler +handler.setFormatter(formatter) + +# Add handler to the logger +LOG.addHandler(handler) + +# Log the start of the script execution +LOG.info("Script execution started.") + def get_next_time_search_interval(): """ @@ -49,6 +69,11 @@ def get_next_time_search_interval(): via 2 RFC 3339 formatted date-time values (by YouTube API Standards), and the current starting year and month of the interval. """ + LOG.info( + "Providing the next searching interval " + "of time for Creative Commons licensed video." + ) + cur_year, cur_month = 2009, 1 while ( cur_year * 100 + cur_month @@ -92,6 +117,10 @@ def get_request_url(time=None): - string: A string representing the API Endpoint URL for the query specified by this function's parameters. """ + LOG.info( + "Providing the API Endpoint URL for specified parameter combinations." + ) + base_url = ( r"https://youtube.googleapis.com/youtube/v3/search?part=snippet" r"&type=video&videoLicense=creativeCommon&" @@ -119,6 +148,8 @@ def get_response_elems(time=None): - dict: A dictionary mapping metadata to its value provided from the API query of specified parameters. """ + LOG.info("Provides the metadata for query of specified parameters.") + search_data = None try: request_url = get_request_url(time=time) @@ -136,7 +167,7 @@ def get_response_elems(time=None): return search_data except Exception as e: if "pageInfo" not in search_data: - print(f"search data is: \n{search_data}", file=sys.stderr) + LOG.exception(f"search data is: \n{search_data}") sys.exit(1) else: raise e @@ -144,6 +175,8 @@ def get_response_elems(time=None): def set_up_data_file(): """Writes the header row to file to contain YouTube data.""" + LOG.info("Writing the header row to file to contain YouTube data.") + with open(DATA_WRITE_FILE, "w") as f: f.write("LICENSE TYPE,Document Count\n") with open(DATA_WRITE_FILE_TIME, "w") as f: @@ -154,6 +187,12 @@ def record_all_licenses(): """Records the data of all license types findable in the license list and records these data into the DATA_WRITE_FILE as specified in that constant. 
""" + LOG.info( + "Recording the data of all license types " + "findable in the license list " + "and records into DATA_WRITE_FILE" + ) + with open(DATA_WRITE_FILE, "a") as f: f.write( "licenses/by/3.0," @@ -165,6 +204,12 @@ def record_all_licenses_time(): """Records the data of all license types findable in the license list and records these data into the DATA_WRITE_FILE as specified in that constant. """ + LOG.info( + "Recording the data of all license types " + "findable in the license list and records " + "into DATA_WRITE_FILE, incorporating time" + ) + with open(DATA_WRITE_FILE_TIME, "a") as f: for time in get_next_time_search_interval(): f.write( @@ -182,14 +227,15 @@ def main(): if __name__ == "__main__": + # Exception Handling try: main() except SystemExit as e: + LOG.error(f"System exit with code: {e.code}") sys.exit(e.code) except KeyboardInterrupt: - print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr) + LOG.info("(130) Halted via KeyboardInterrupt.") sys.exit(130) except Exception: - print("ERROR (1) Unhandled exception:", file=sys.stderr) - print(traceback.print_exc(), file=sys.stderr) + LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}") sys.exit(1)