Skip to content

Commit 5657052

Browse files
committed
complete base codebase for gcs_reports.py
1 parent 1baecce commit 5657052

File tree

13 files changed

+228
-20
lines changed

13 files changed

+228
-20
lines changed

data/2024Q2/1-fetched/gcs_fetched.csv

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,10 @@ https://creativecommons.org/publicdomain/zero/1.0,32700000,30000000,20200,18200,
1313
https://creativecommons.org/licenses/nc-sa/2.0,25000000,17700000,19700,4640,80700,14700,11900,23400000,368000,1340000,1490,25000000,36100
1414
https://creativecommons.org/licenses/sa/1.0,108000000,91400000,23000,17100,159000,38200,21600,104000000,270000,337000,28600,108000000,224000
1515
https://creativecommons.org/licenses/by-sa/1.0,102000000,85900000,21900,15800,150000,36100,21200,97400000,267000,330000,28300,102000000,219000
16+
https://creativecommons.org/licenses/by-nd/2.5,49500000,35500000,45100,8640,133000,24600,23600,47800000,367000,37900,2200,49500000,11200
17+
https://creativecommons.org/licenses/by-nd-nc/1.0,15800000,10300000,15600,2940,66800,10200,21400,15300000,62400,29000,1980,15800000,27500
18+
https://creativecommons.org/licenses/nd/1.0,220000000,194000000,89900,56500,424000,147000,88400,213000000,490000,297000,69500,220000000,2570000
19+
https://creativecommons.org/licenses/by-nc-sa/4.0,62900000,44700000,49900,32900,163000,32900,79400,58900000,1020000,3610000,11500,62900000,102000
20+
https://creativecommons.org/licenses/sampling+/1.0,169000000,151000000,27300,22900,274000,57300,34700,168000000,123000,62800,1310,169000000,200000
21+
https://creativecommons.org/licenses/by-nc-sa/2.5,31600000,22800000,20100,5400,83100,15200,10400,29900000,258000,250000,1570,31600000,7290
22+
https://creativecommons.org/licenses/nc-sa/1.0,24100000,17500000,10400,3820,58800,11800,10600,23300000,67500,280000,1200,24100000,42400
54.9 KB
Loading
55.2 KB
Loading
85.3 KB
Loading

data/2024Q2/state.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
total_records_retrieved: 182
1+
total_records_retrieved: 273

data/2024Q3/1-fetched/gcs_fetched.csv

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
LICENSE TYPE, No Priori, United States, Canada, India, United Kingdom, Australia, Japan, English, Spanish, French, Arabic, Chinese (Simplified), Indonesian
2+
https://creativecommons.org/licenses/by/2.5,235000000,208000000,112000,82400,406000,701000,76800,225000000,3940000,835000,88300,235000000,115000
3+
https://creativecommons.org/licenses/by/4.0,412000000,334000000,702000,360000,7250000,770000,675000,381000000,16100000,5260000,255000,412000000,11900000
4+
https://creativecommons.org/licenses/by-nc-sa/2.5,31400000,22700000,21200,5400,83000,15000,10300,29800000,254000,248000,1580,31400000,7480
5+
https://creativecommons.org/licenses/nc/1.0,54600000,43400000,25100,8390,128000,22200,30500,53600000,136000,479000,3760,54500000,81500
6+
https://creativecommons.org/licenses/by-nc-sa/1.0,24000000,17400000,11300,3810,59100,11900,10600,23300000,66900,276000,1220,24000000,42900
7+
https://creativecommons.org/licenses/by/2.1,219000000,189000000,132000,56200,477000,139000,269000,211000000,494000,232000,19800,219000000,119000
8+
https://creativecommons.org/licenses/nc-sampling+/1.0,41200000,32000000,12900,4590,82300,14600,18800,41000000,52800,37800,381,41200000,31100
54.7 KB
Loading
50.3 KB
Loading
77.3 KB
Loading

data/2024Q3/state.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
total_records_retrieved: 91

scripts/1-fetched/gcs_fetched.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ def main():
335335
state = load_state()
336336
total_records_retrieved = state["total_records_retrieved"]
337337
LOGGER.info(f"Initial total_records_retrieved: {total_records_retrieved}")
338-
goal_records = 200 # Set goal number of records
338+
goal_records = 1000 # Set goal number of records
339339

340340
if total_records_retrieved >= goal_records:
341341
LOGGER.info(

scripts/3-reports/gcs_reports.py

Lines changed: 208 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@
33
This file is dedicated to visualizing and analyzing the data collected.
44
"""
55
# Standard library
6+
import argparse
67
import os
78
import sys
89
import traceback
910

1011
# Third-party
1112
import matplotlib.pyplot as plt
13+
import matplotlib.ticker as ticker
1214
import pandas as pd
15+
import seaborn as sns
1316

1417
# Add parent directory so shared can be imported
1518
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -21,11 +24,31 @@
2124
LOGGER, PATHS = shared.setup(__file__)
2225

2326

24-
def load_data():
27+
def parse_arguments():
    """
    Build the command-line parser for this report and parse the arguments.

    Returns the argparse namespace; its `quarter` attribute holds the value
    of the required --quarter option (for example 2024Q2).
    """
    LOGGER.info("Parsing command-line arguments")
    quarter_option = {
        "type": str,
        "required": True,
        "help": "Data quarter in format YYYYQx, e.g., 2024Q2",
    }
    cli = argparse.ArgumentParser(description="Google Custom Search Report")
    cli.add_argument("--quarter", **quarter_option)
    return cli.parse_args()
40+
41+
42+
def load_data(args):
2543
"""
2644
Load the collected data from the CSV file.
2745
"""
28-
file_path = os.path.join(PATHS["data_phase"], "gcs_fetched.csv")
46+
selected_quarter = args.quarter
47+
48+
file_path = os.path.join(
49+
PATHS["data"], f"{selected_quarter}", "1-fetched", "gcs_fetched.csv"
50+
)
51+
2952
if not os.path.exists(file_path):
3053
LOGGER.error(f"Data file not found: {file_path}")
3154
return pd.DataFrame()
@@ -35,38 +58,205 @@ def load_data():
3558
return data
3659

3760

38-
def process_data(data):
61+
# By country, by license type, by license language
62+
63+
64+
def visualize_by_country(data, args):
    """
    Create a bar chart for the number of webpages licensed by country.

    Args:
        data: DataFrame of fetched counts. Column names may carry
            surrounding whitespace; the country columns are taken as the
            contiguous run from "United States" through "Japan".
        args: Parsed command-line arguments; args.quarter is used in the
            chart title and in the output path.

    The chart is saved as gcs_country_report.png under the quarter's
    3-reports directory and then shown.

    Raises:
        ValueError: If the "United States" or "Japan" column is missing.
    """
    LOGGER.info(
        "Creating a bar chart for the number of webpages licensed by country."
    )

    selected_quarter = args.quarter

    # Strip header whitespace on a renamed copy so the caller's DataFrame
    # is not modified as a side effect of plotting
    data = data.rename(columns=str.strip)
    columns = data.columns.tolist()

    # Get the list of country columns dynamically
    start_index = columns.index("United States")
    end_index = columns.index("Japan") + 1
    countries = columns[start_index:end_index]

    LOGGER.info(f"Cleaned Columns: {columns}")

    # Aggregate the data by summing the counts for each country
    country_data = data[countries].sum()

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x=country_data.index, y=country_data.values)
    plt.title(
        f"Number of Google Webpages Licensed by Country ({selected_quarter})"
    )
    plt.xlabel("Country")
    plt.ylabel("Number of Webpages")
    plt.xticks(rotation=45)

    # Add value numbers to the top of each bar
    for p in ax.patches:
        ax.annotate(
            format(p.get_height(), ",.0f"),
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="center",
            xytext=(0, 9),
            textcoords="offset points",
        )

    # Show plain, comma-grouped integers on the y-axis. Installing the
    # formatter replaces the default one, so no scientific-notation toggle
    # on the old formatter is needed.
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda x, loc: "{:,}".format(int(x)))
    )

    output_directory = os.path.join(
        PATHS["data"], f"{selected_quarter}", "3-reports"
    )

    LOGGER.info(f"Output directory: {output_directory}")

    # Create the directory if it does not exist
    os.makedirs(output_directory, exist_ok=True)
    plt.savefig(os.path.join(output_directory, "gcs_country_report.png"))

    plt.show()

    LOGGER.info("Visualization by country created.")
128+
129+
130+
def visualize_by_license_type(data, args):
    """
    Create a bar chart for the number of webpages licensed by license type.

    Args:
        data: DataFrame of fetched counts with a "LICENSE TYPE" column
            (header whitespace is tolerated) and numeric count columns.
        args: Parsed command-line arguments; args.quarter is used in the
            chart title and in the output path.

    The chart is saved as gcs_licensetype_report.png under the quarter's
    3-reports directory and then shown.
    """
    LOGGER.info(
        "Creating a bar chart for the number of"
        " webpages licensed by license type."
    )

    selected_quarter = args.quarter

    # Strip any leading/trailing spaces from the columns. This is done on a
    # renamed copy so the caller's DataFrame is not modified.
    cleaned = data.rename(columns=str.strip)

    # Sum the values across all columns except 'LICENSE TYPE', which
    # becomes the index (one total per license)
    license_data = cleaned.set_index("LICENSE TYPE").sum(axis=1)

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x=license_data.index, y=license_data.values)
    plt.title(
        f"Number of Webpages Licensed by License Type ({selected_quarter})"
    )
    plt.xlabel("License Type")
    plt.ylabel("Number of Webpages")
    plt.xticks(rotation=45, ha="right")

    # Use the millions formatter for y-axis
    def millions_formatter(x, pos):
        """The two args are the value and tick position"""
        return f"{x * 1e-6:.1f}M"

    ax.yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))

    plt.tight_layout()

    output_directory = os.path.join(
        PATHS["data"], f"{selected_quarter}", "3-reports"
    )

    LOGGER.info(f"Output directory: {output_directory}")

    # Create the directory if it does not exist
    os.makedirs(output_directory, exist_ok=True)
    plt.savefig(os.path.join(output_directory, "gcs_licensetype_report.png"))

    plt.show()

    LOGGER.info("Visualization by license type created.")
178+
179+
180+
def visualize_by_language(data, args):
    """
    Create a bar chart for the number of webpages licensed by language.

    Args:
        data: DataFrame of fetched counts. Column names may carry
            surrounding whitespace; the language columns are taken as the
            contiguous run from "English" through "Indonesian".
        args: Parsed command-line arguments; args.quarter is used in the
            chart title and in the output path.

    The chart is saved as gcs_language_report.png under the quarter's
    3-reports directory and then shown.

    Raises:
        ValueError: If the "English" or "Indonesian" column is missing.
    """
    LOGGER.info(
        "Creating a bar chart for the number of webpages licensed by language."
    )

    selected_quarter = args.quarter

    # Strip header whitespace on a renamed copy so the caller's DataFrame
    # is not modified as a side effect of plotting
    data = data.rename(columns=str.strip)
    columns = data.columns.tolist()

    # Get the list of language columns dynamically
    start_index = columns.index("English")
    end_index = columns.index("Indonesian") + 1
    languages = columns[start_index:end_index]

    LOGGER.info(f"Cleaned Columns: {columns}")

    # Aggregate the data by summing the counts for each language
    language_data = data[languages].sum()

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x=language_data.index, y=language_data.values)
    # Title and axis label describe languages (previously copy-pasted
    # from the country chart)
    plt.title(
        f"Number of Google Webpages Licensed by Language ({selected_quarter})"
    )
    plt.xlabel("Language")
    plt.ylabel("Number of Webpages")
    plt.xticks(rotation=45)

    # Add value numbers to the top of each bar
    for p in ax.patches:
        ax.annotate(
            format(p.get_height(), ",.0f"),
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="center",
            xytext=(0, 9),
            textcoords="offset points",
        )

    # Show plain, comma-grouped integers on the y-axis. Installing the
    # formatter replaces the default one, so no scientific-notation toggle
    # on the old formatter is needed.
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda x, loc: "{:,}".format(int(x)))
    )

    output_directory = os.path.join(
        PATHS["data"], f"{selected_quarter}", "3-reports"
    )

    LOGGER.info(f"Output directory: {output_directory}")

    # Create the directory if it does not exist
    os.makedirs(output_directory, exist_ok=True)
    plt.savefig(os.path.join(output_directory, "gcs_language_report.png"))

    plt.show()

    LOGGER.info("Visualization by language created.")
61244

62245

63246
def main():
    """
    Parse the arguments, load the selected quarter's data and, unless the
    loaded frame is empty, render the country, license type and language
    reports in that order.
    """
    args = parse_arguments()
    data = load_data(args)
    if data.empty:
        return

    LOGGER.info(f"Current working directory: {os.getcwd()}")

    # Each report takes the same (data, args) pair
    for render_report in (
        visualize_by_country,
        visualize_by_license_type,
        visualize_by_language,
    ):
        render_report(data, args)
70260

71261

72262
if __name__ == "__main__":

scripts/shared.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ def setup(current_file):
3636
paths["state"] = os.path.join(data_quarter, "state.yaml")
3737
paths["data_phase"] = os.path.join(data_quarter, phase)
3838

39+
paths["data_quarter"] = data_quarter
40+
3941
return logger, paths
4042

4143

0 commit comments

Comments
 (0)