Skip to content

Commit 8412423

Browse files
authored
Merge pull request #96 from IamMQaisar/shared_module
Fixed: Refactor scripts #91, Adds logging #85 and Cleans code and comments #86 by Putting it in a Shared Library/Module
2 parents f2c4b4c + b3d5e21 commit 8412423

18 files changed

+44437
-1835
lines changed

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -133,15 +133,15 @@ directories to check:
133133
- [pypa/pipenv][pipenv]: _Python Development Workflow for Humans._
134134
- [pre-commit][pre-commit]: _A framework for managing and maintaining
135135
multi-language pre-commit hooks._
136-
- [Logging][logging]: _Built-in Python logging module to implement a flexible logging system across shared modules._
136+
- [Logging][logging]: _Utilize the built-in Python logging module to implement a flexible logging system from a shared module._
137137

138138
[ccospyguide]: https://opensource.creativecommons.org/contributing-code/python-guidelines/
139139
[black]: https://github.com/psf/black
140140
[flake8]: https://github.com/PyCQA/flake8
141141
[isort]: https://pycqa.github.io/isort/
142142
[pipenv]: https://github.com/pypa/pipenv
143143
[pre-commit]: https://pre-commit.com/
144-
[logging]: https://docs.python.org/3/howto/logging.html
144+
[logging]: https://docs.python.org/3/library/logging.html
145145

146146

147147
### GitHub Actions

analyze/data_analysis.py

+29-46
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
"""
44

55
# Standard library
6-
import logging
7-
import os.path
6+
import os
87
import re
98
import sys
109
import traceback
@@ -16,36 +15,17 @@
1615
import pandas as pd
1716
import plotly.express as px
1817
import seaborn as sns
19-
20-
warnings.filterwarnings("ignore")
21-
22-
# Third-party
2318
from wordcloud import STOPWORDS, WordCloud # noqa: E402
2419

25-
# Set the current working directory
26-
PATH_WORK_DIR = os.path.dirname(os.path.abspath(__file__))
27-
28-
# Set the current working directory
29-
CWD = os.path.dirname(os.path.abspath(__file__))
30-
31-
# Set up the logger
32-
LOG = logging.getLogger(__name__)
33-
LOG.setLevel(logging.INFO)
20+
sys.path.append(".")
21+
# First-party/Local
22+
import quantify # noqa: E402
3423

35-
# Define both the handler and the formatter
36-
handler = logging.StreamHandler()
37-
formatter = logging.Formatter(
38-
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
39-
)
40-
41-
# Add formatter to the handler
42-
handler.setFormatter(formatter)
43-
44-
# Add handler to the logger
45-
LOG.addHandler(handler)
24+
# Warning suppression /!\ Caution /!\
25+
warnings.filterwarnings("ignore")
4626

47-
# Log the start of the script execution
48-
LOG.info("Script execution started.")
27+
# Setup PATH_WORK_DIR, and LOGGER using quantify.setup()
28+
_, PATH_WORK_DIR, _, _, LOGGER = quantify.setup(__file__)
4929

5030

5131
def tags_frequency(csv_path, column_names):
@@ -59,7 +39,7 @@ def tags_frequency(csv_path, column_names):
5939
Example: ["tags", "description"]
6040
6141
"""
62-
LOG.info("Generating word cloud based on tags.")
42+
LOGGER.info("Generating word cloud based on tags.")
6343

6444
df = pd.read_csv(csv_path)
6545
# Process each column containing tags
@@ -79,7 +59,7 @@ def tags_frequency(csv_path, column_names):
7959
and str(row) != ""
8060
and str(row) != "nan"
8161
):
82-
LOG.debug(f"Processing row: {row}")
62+
LOGGER.debug(f"Processing row: {row}")
8363
if "ChineseinUS.org" in str(row):
8464
row = "ChineseinUS"
8565
list2 += re.split(r"\s|(?<!\d)[,.](?!\d)", str(row))
@@ -168,7 +148,7 @@ def time_trend_helper(df):
168148
Returns:
169149
- DataFrame: DataFrame with counts of entries per year.
170150
"""
171-
LOG.info("Extracting year-wise count of entries.")
151+
LOGGER.info("Extracting year-wise count of entries.")
172152

173153
year_list = []
174154
for date_row in df["dates"][0:]:
@@ -196,7 +176,7 @@ def time_trend(csv_path):
196176
Args:
197177
- csv_path (str): Path to the CSV file.
198178
"""
199-
LOG.info("Generating time trend line graph.")
179+
LOGGER.info("Generating time trend line graph.")
200180

201181
df = pd.read_csv(csv_path)
202182
count_df = time_trend_helper(df)
@@ -239,7 +219,7 @@ def time_trend_compile_helper(yearly_count):
239219
Returns:
240220
- DataFrame: Filtered yearly count data.
241221
"""
242-
LOG.info("Filtering yearly trend data.")
222+
LOGGER.info("Filtering yearly trend data.")
243223

244224
Years = np.arange(2018, 2023)
245225
yearly_count["year"] = list(yearly_count.index)
@@ -249,7 +229,7 @@ def time_trend_compile_helper(yearly_count):
249229
int(yearly_count["year"][num]) >= 2018
250230
):
251231
counts.append(yearly_count["Counts"][num])
252-
LOG.info(f"{counts}")
232+
LOGGER.info(f"{counts}")
253233
final_yearly_count = pd.DataFrame(
254234
list(zip(Years, counts)), columns=["Years", "Yearly_counts"]
255235
)
@@ -260,7 +240,7 @@ def time_trend_compile():
260240
"""
261241
Compile yearly trends for different licenses and plot them.
262242
"""
263-
LOG.info("Compiling yearly trends for different licenses.")
243+
LOGGER.info("Compiling yearly trends for different licenses.")
264244

265245
license1 = pd.read_csv("../flickr/dataset/cleaned_license1.csv")
266246
license2 = pd.read_csv("../flickr/dataset/cleaned_license2.csv")
@@ -319,7 +299,7 @@ def time_trend_compile():
319299
yearly_count6 = time_trend_compile_helper(yearly_count6)
320300
yearly_count9 = time_trend_compile_helper(yearly_count9)
321301
yearly_count10 = time_trend_compile_helper(yearly_count10)
322-
LOG.info(f"{yearly_count1}")
302+
LOGGER.info(f"{yearly_count1}")
323303

324304
# Plot yearly trend for all licenses
325305
plt.plot(
@@ -408,20 +388,22 @@ def view_compare_helper(df):
408388
Returns:
409389
- int: Maximum views.
410390
"""
411-
LOG.info("Calculating maximum views of pictures under a license.")
391+
LOGGER.info("Calculating maximum views of pictures under a license.")
412392

413393
highest_view = int(max(df["views"]))
414394
df = df.sort_values("views", ascending=False)
415-
LOG.info(f"DataFrame sorted by views in descending order: {df}")
416-
LOG.info(f"Maximum views found: {highest_view}")
395+
LOGGER.info(f"DataFrame sorted by views in descending order: {df}")
396+
LOGGER.info(f"Maximum views found: {highest_view}")
417397
return highest_view
418398

419399

420400
def view_compare():
421401
"""
422402
Compare maximum views of pictures under different licenses.
423403
"""
424-
LOG.info("Comparing maximum views of pictures under different licenses.")
404+
LOGGER.info(
405+
"Comparing maximum views of pictures under different licenses."
406+
)
425407

426408
license1 = pd.read_csv(
427409
os.path.join(PATH_WORK_DIR, "../flickr/dataset/cleaned_license1.csv")
@@ -461,7 +443,7 @@ def view_compare():
461443
maxs = []
462444
for lic in licenses:
463445
maxs.append(view_compare_helper(lic))
464-
LOG.info(f"{maxs}")
446+
LOGGER.info(f"{maxs}")
465447
# Create DataFrame to store license and their maximum views
466448
temp_data = pd.DataFrame()
467449
temp_data["Licenses"] = [
@@ -517,7 +499,9 @@ def total_usage():
517499
"""
518500
Generate a bar plot showing the total usage of different licenses.
519501
"""
520-
LOG.info("Generating bar plot showing total usage of different licenses.")
502+
LOGGER.info(
503+
"Generating bar plot showing total usage of different licenses."
504+
)
521505

522506
# Reads the license total file as the input dataset
523507
df = pd.read_csv(
@@ -538,15 +522,14 @@ def main():
538522

539523

540524
if __name__ == "__main__":
541-
# Exception Handling
542525
try:
543526
main()
544527
except SystemExit as e:
545-
LOG.error(f"System exit with code: {e.code}")
528+
LOGGER.error(f"System exit with code: {e.code}")
546529
sys.exit(e.code)
547530
except KeyboardInterrupt:
548-
LOG.info("(130) Halted via KeyboardInterrupt.")
531+
LOGGER.info("(130) Halted via KeyboardInterrupt.")
549532
sys.exit(130)
550533
except Exception:
551-
LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}")
534+
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
552535
sys.exit(1)

deviantart/deviantart_scratcher.py

+36-48
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
data.
55
"""
66
# Standard library
7-
import logging
87
import os
98
import sys
109
import traceback
@@ -20,56 +19,47 @@
2019
# First-party/Local
2120
import quantify # noqa: E402
2221

23-
PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup(
24-
__file__
22+
# Setup paths, Date and LOGGER using quantify.setup()
23+
PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY, LOGGER = (
24+
quantify.setup(__file__)
2525
)
26+
27+
# Load environment variables
2628
load_dotenv(PATH_DOTENV)
2729

28-
# Retrieve API keys
29-
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
30+
31+
# Global Variable for API_KEYS indexing
3032
API_KEYS_IND = 0
33+
34+
# Gets API_KEYS and PSE_KEY from .env file
35+
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
36+
PSE_KEY = os.getenv("PSE_KEY")
37+
3138
# Set up file path for CSV report
3239
DATA_WRITE_FILE = os.path.join(
3340
PATH_WORK_DIR,
34-
f"data_deviantart_"
41+
f"/data_deviantart_"
3542
f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv",
3643
)
37-
# Retrieve Programmable Search Engine key from environment variables
38-
PSE_KEY = os.getenv("PSE_KEY")
39-
40-
# Set up the logger
41-
LOG = logging.getLogger(__name__)
42-
LOG.setLevel(logging.INFO)
43-
44-
# Define both the handler and the formatter
45-
handler = logging.StreamHandler()
46-
formatter = logging.Formatter(
47-
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
48-
)
49-
50-
# Add formatter to the handler
51-
handler.setFormatter(formatter)
52-
53-
# Add handler to the logger
54-
LOG.addHandler(handler)
5544

5645
# Log the start of the script execution
57-
LOG.info("Script execution started.")
46+
LOGGER.info("Script execution started.")
5847

5948

6049
def get_license_list():
6150
"""
6251
Provides the list of license from 2018's record of Creative Commons.
6352
6453
Returns:
65-
- np.array: An array containing all license types that should be
66-
searched via Programmable Search Engine.
54+
- np.array:
55+
An np array containing all license types that should be searched
56+
via Programmable Search Engine (PSE).
6757
"""
68-
LOG.info("Retrieving list of license from Creative Commons' record.")
58+
LOGGER.info("Retrieving list of license from Creative Commons' record.")
6959

7060
# Read license data from file
7161
cc_license_data = pd.read_csv(
72-
os.path.join(PATH_WORK_DIR, "legal-tool-paths.txt"), header=None
62+
f"{PATH_REPO_ROOT}/legal-tool-paths.txt", header=None
7363
)
7464
# Define regex pattern to extract license types
7565
license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
@@ -92,7 +82,9 @@ def get_request_url(license):
9282
Returns:
9383
- str: The API Endpoint URL for the query specified by parameters.
9484
"""
95-
LOG.info(f"Generating API Endpoint URL for specified license: {license}")
85+
LOGGER.info(
86+
f"Generating API Endpoint URL for specified license: {license}"
87+
)
9688

9789
try:
9890
api_key = API_KEYS[API_KEYS_IND]
@@ -104,7 +96,7 @@ def get_request_url(license):
10496
)
10597
except Exception as e:
10698
if isinstance(e, IndexError):
107-
LOG.exception("Depleted all API Keys provided")
99+
LOGGER.error("Depleted all API Keys provided")
108100
else:
109101
raise e
110102

@@ -121,7 +113,7 @@ def get_response_elems(license):
121113
- dict: A dictionary mapping metadata to its value provided from the API
122114
query.
123115
"""
124-
LOG.info("Making a request to the API and handling potential retries.")
116+
LOGGER.info("Making a request to the API and handling potential retries.")
125117

126118
try:
127119
# Make a request to the API and handle potential retries
@@ -146,16 +138,14 @@ def get_response_elems(license):
146138
# If quota limit exceeded, switch to the next API key
147139
global API_KEYS_IND
148140
API_KEYS_IND += 1
149-
LOG.exception("Changing API KEYS due to depletion of quota")
141+
LOGGER.error("Changing API KEYS due to depletion of quota")
150142
return get_response_elems(license)
151143
else:
152144
raise e
153145

154146

155147
def set_up_data_file():
156-
"""Writes the header row to the file to contain DeviantArt data."""
157-
LOG.info("Setting up data file by writing the header row.")
158-
148+
# Writes the header row to the file to contain DeviantArt data.
159149
header_title = "LICENSE TYPE,Document Count"
160150
with open(DATA_WRITE_FILE, "w") as f:
161151
f.write(f"{header_title}\n")
@@ -164,11 +154,13 @@ def set_up_data_file():
164154
def record_license_data(license_type):
165155
"""Writes the row for LICENSE_TYPE to the file to contain DeviantArt data.
166156
Args:
167-
- license_type(str): A string representing the type of license.
168-
It's a segment of the URL towards the license description. If not provided,
169-
it defaults to None, indicating no assumption about the license type.
157+
- license_type:
158+
A string representing the type of license, and should be a segment
159+
of its URL towards the license description. Alternatively, the
160+
default None value stands for having no assumption about license
161+
type.
170162
"""
171-
LOG.info(
163+
LOGGER.info(
172164
"Writing the row for license type %s to contain DeviantArt data",
173165
license_type,
174166
)
@@ -187,11 +179,8 @@ def record_all_licenses():
187179
list and writes this data into the DATA_WRITE_FILE, as specified by the
188180
constant.
189181
"""
190-
LOG.info("Recording data for all available license types.")
191-
192-
# Get the list of license types
182+
# Gets the list of license types and record data for each license type
193183
license_list = get_license_list()
194-
# Record data for each license types
195184
for license_type in license_list:
196185
record_license_data(license_type)
197186

@@ -202,15 +191,14 @@ def main():
202191

203192

204193
if __name__ == "__main__":
205-
# Exception Handling
206194
try:
207195
main()
208196
except SystemExit as e:
209-
LOG.error(f"System exit with code: {e.code}")
197+
LOGGER.error(f"System exit with code: {e.code}")
210198
sys.exit(e.code)
211199
except KeyboardInterrupt:
212-
LOG.info("(130) Halted via KeyboardInterrupt.")
200+
LOGGER.info("(130) Halted via KeyboardInterrupt.")
213201
sys.exit(130)
214202
except Exception:
215-
LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}")
203+
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
216204
sys.exit(1)

0 commit comments

Comments
 (0)