data_cleaning.py
"""
This is to clean the data pulled by the photos_detail.py script so as to
further delete useless columns and reorganize the dataset as this form:
| locations | amount | time | license | content_categories | highest_comment | total_view | # noqa: E501
| -------------------------------- | -----: | ---------- | ------: | ------------------ | --------------: | ---------: | # noqa: E501
| Minneapolis, United States | 20 | 2022-10-22 | 4 | football, life | 105 | 100000 | # noqa: E501
| São José do Rio Preto SP, Brasil | 30 | 2022-10-22 | 4 | football, life | 50 | 300000 | # noqa: E501
...
Note:
content_categories will be got from basic NLP on the tags column
"""

# Standard library
import sys
import traceback

# Third-party
import pandas as pd

sys.path.append(".")

# First-party/Local
import quantify  # noqa: E402

# Setup only LOGGER using quantify.setup()
_, _, _, _, LOGGER = quantify.setup(__file__)


def drop_empty_column(csv_path, new_csv_path):
    """
    Drops columns with 'Unnamed' in the name from the CSV file.

    Args:
    - csv_path (str): Path to the original CSV file.
    - new_csv_path (str): Path to save the cleaned CSV file.
    """
    LOGGER.info("Dropping 'Unnamed' columns from the CSV file.")
    df = pd.read_csv(csv_path)
    # Collect the matching columns first and drop them in one call.
    # Dropping inside the loop with df.drop(col, axis=1) would start from
    # the original DataFrame each time, so only the last 'Unnamed' column
    # would actually be removed.
    unnamed_columns = [col for col in df.columns if "Unnamed" in col]
    for col in unnamed_columns:
        LOGGER.info(f"Dropping column {col}")
    data = df.drop(columns=unnamed_columns)
    # Write without the index so a fresh 'Unnamed: 0' column is not added.
    data.to_csv(new_csv_path, index=False)
    LOGGER.info("Dropping empty columns completed.")


def drop_duplicate_id(csv_path, new_csv_path):
    """
    Drops duplicate rows based on the 'id' column from the CSV file.

    Args:
    - csv_path (str): Path to the original CSV file.
    - new_csv_path (str): Path to save the cleaned CSV file.
    """
    LOGGER.info(
        "Dropping duplicate rows based on the 'id' column from the CSV file."
    )
    df = pd.read_csv(csv_path)
    # drop_duplicates keeps the first occurrence of each id by default.
    data = df.drop_duplicates(subset=["id"])
    data.to_csv(new_csv_path, index=False)
    LOGGER.info("Dropping duplicates completed.")


def save_new_data(csv_path, column_name_list, new_csv_path):
    """
    Saves specified columns from the original CSV file to a new CSV file.

    Args:
    - csv_path (str): Path to the original CSV file.
    - column_name_list (list of str): List of column names to be saved;
      each name must be an existing column in the original CSV, otherwise
      df[col] raises a KeyError.
    - new_csv_path (str): Path to save the new CSV file.
    """
    LOGGER.info("Saving columns from the original CSV to a new CSV.")
    df = pd.read_csv(csv_path)
    new_df = pd.DataFrame()
    for col in column_name_list:
        new_df[col] = list(df[col])
        LOGGER.info(f"Saving column {col}")
    new_df.to_csv(new_csv_path, index=False)
    LOGGER.info("Saving new data to a new CSV completed.")


def main():
    # Each step reads from and writes back to the same cleaned CSV, so the
    # file is overwritten in place after the initial copy from hs.csv.
    drop_empty_column("hs.csv", "dataset/cleaned_license10.csv")
    drop_duplicate_id(
        "dataset/cleaned_license10.csv", "dataset/cleaned_license10.csv"
    )
    save_new_data(
        "dataset/cleaned_license10.csv",
        [
            "location",
            "dates",
            "license",
            "description",
            "tags",
            "views",
            "comments",
        ],
        "dataset/cleaned_license10.csv",
    )


if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
        sys.exit(1)
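
# Usage: run from the repository root, since sys.path.append(".") assumes
# the quantify module lives there, e.g.:
#
#     python data_cleaning.py
#
# The exact invocation path is an assumption; adjust it to wherever this
# script sits in the repository.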