3
3
This file is dedicated to visualizing and analyzing the data collected.
4
4
"""
5
5
# Standard library
6
+ import argparse
6
7
import os
7
8
import sys
8
9
import traceback
9
10
10
11
# Third-party
11
12
import matplotlib .pyplot as plt
13
+ import matplotlib .ticker as ticker
12
14
import pandas as pd
15
+ import seaborn as sns
13
16
14
17
# Add parent directory so shared can be imported
15
18
sys .path .append (os .path .join (os .path .dirname (__file__ ), ".." ))
21
24
LOGGER , PATHS = shared .setup (__file__ )
22
25
23
26
24
- def load_data ():
27
def parse_arguments():
    """
    Build the command-line parser for this report and parse the arguments.

    The only option is the required "--quarter" string, documented in the
    help text as YYYYQx (e.g., 2024Q2). Returns the parsed arguments.
    """
    LOGGER.info("Parsing command-line arguments")

    # Keyword settings for the single required option
    quarter_option = {
        "type": str,
        "required": True,
        "help": "Data quarter in format YYYYQx, e.g., 2024Q2",
    }

    cli = argparse.ArgumentParser(description="Google Custom Search Report")
    cli.add_argument("--quarter", **quarter_option)
    parsed = cli.parse_args()
    return parsed
40
+
41
+
42
+ def load_data (args ):
25
43
"""
26
44
Load the collected data from the CSV file.
27
45
"""
28
- file_path = os .path .join (PATHS ["data_phase" ], "gcs_fetched.csv" )
46
+ selected_quarter = args .quarter
47
+
48
+ file_path = os .path .join (
49
+ PATHS ["data" ], f"{ selected_quarter } " , "1-fetched" , "gcs_fetched.csv"
50
+ )
51
+
29
52
if not os .path .exists (file_path ):
30
53
LOGGER .error (f"Data file not found: { file_path } " )
31
54
return pd .DataFrame ()
@@ -35,38 +58,205 @@ def load_data():
35
58
return data
36
59
37
60
38
- def process_data (data ):
61
+ # By country, by license type, by license language
62
+
63
+
64
def visualize_by_country(data, args):
    """
    Create a bar chart for the number of webpages licensed by country.

    The country columns are the contiguous run of columns from
    "United States" through "Japan" (inclusive), matched after surrounding
    whitespace is stripped from the column names. Each of those columns is
    summed over all rows and drawn as one bar. The chart is saved as
    "gcs_country_report.png" in the quarter's "3-reports" directory and then
    shown.

    Args:
        data: DataFrame holding the fetched counts. It is not modified.
        args: Parsed command-line arguments; only args.quarter is read.

    Raises:
        ValueError: If the "United States" or "Japan" column is missing.
    """
    LOGGER.info(
        "Creating a bar chart for the number of webpages licensed by country."
    )

    selected_quarter = args.quarter

    # Strip the column names on a renamed copy so the caller's DataFrame
    # keeps its original labels
    data = data.rename(columns=str.strip)
    columns = data.columns.tolist()

    # Get the list of country columns dynamically
    start_index = columns.index("United States")
    end_index = columns.index("Japan") + 1
    countries = columns[start_index:end_index]

    LOGGER.info(f"Cleaned Columns: {columns}")

    # Aggregate the data by summing the counts for each country
    country_data = data[countries].sum()

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x=country_data.index, y=country_data.values)
    plt.title(
        f"Number of Google Webpages Licensed by Country ({selected_quarter})"
    )
    plt.xlabel("Country")
    plt.ylabel("Number of Webpages")
    plt.xticks(rotation=45)

    # Add value numbers to the top of each bar
    for p in ax.patches:
        ax.annotate(
            format(p.get_height(), ",.0f"),
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="center",
            xytext=(0, 9),
            textcoords="offset points",
        )

    # Show full, comma-grouped integers on the y-axis. Installing this
    # formatter replaces the default one, so no scientific-notation toggle
    # is needed on the old formatter.
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _pos: f"{int(value):,}")
    )

    output_directory = os.path.join(
        PATHS["data"], f"{selected_quarter}", "3-reports"
    )

    LOGGER.info(f"Output directory: {output_directory}")

    # Create the directory if it does not exist
    os.makedirs(output_directory, exist_ok=True)
    plt.savefig(os.path.join(output_directory, "gcs_country_report.png"))

    plt.show()

    LOGGER.info("Visualization by country created.")
128
+
129
+
130
def visualize_by_license_type(data, args):
    """
    Create a bar chart for the number of webpages licensed by license type.

    Each row is totalled across every column except "LICENSE TYPE", which
    labels the bars. The y-axis is shown in millions. The chart is saved as
    "gcs_licensetype_report.png" in the quarter's "3-reports" directory and
    then shown.

    Args:
        data: DataFrame holding the fetched counts. It is not modified.
        args: Parsed command-line arguments; only args.quarter is read.

    Raises:
        KeyError: If there is no "LICENSE TYPE" column.
    """
    LOGGER.info(
        "Creating a bar chart for the number of"
        " webpages licensed by license type."
    )

    selected_quarter = args.quarter

    # Strip any leading/trailing spaces from the column names on a renamed
    # copy so the caller's DataFrame keeps its original labels
    data = data.rename(columns=str.strip)

    # Sum the values across all columns except 'LICENSE TYPE', which becomes
    # the index
    license_data = data.set_index("LICENSE TYPE").sum(axis=1)

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x=license_data.index, y=license_data.values)
    plt.title(
        f"Number of Webpages Licensed by License Type ({selected_quarter})"
    )
    plt.xlabel("License Type")
    plt.ylabel("Number of Webpages")
    plt.xticks(rotation=45, ha="right")

    # Use the millions formatter for y-axis
    def millions_formatter(x, pos):
        """Format tick value x in millions; the tick position is unused."""
        return f"{x * 1e-6:.1f}M"

    ax.yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))

    plt.tight_layout()

    output_directory = os.path.join(
        PATHS["data"], f"{selected_quarter}", "3-reports"
    )

    LOGGER.info(f"Output directory: {output_directory}")

    # Create the directory if it does not exist
    os.makedirs(output_directory, exist_ok=True)
    plt.savefig(os.path.join(output_directory, "gcs_licensetype_report.png"))

    plt.show()

    LOGGER.info("Visualization by license type created.")
178
+
179
+
180
def visualize_by_language(data, args):
    """
    Create a bar chart for the number of webpages licensed by language.

    The language columns are the contiguous run of columns from "English"
    through "Indonesian" (inclusive), matched after surrounding whitespace is
    stripped from the column names. Each of those columns is summed over all
    rows and drawn as one bar. The chart is saved as
    "gcs_language_report.png" in the quarter's "3-reports" directory and then
    shown.

    Args:
        data: DataFrame holding the fetched counts. It is not modified.
        args: Parsed command-line arguments; only args.quarter is read.

    Raises:
        ValueError: If the "English" or "Indonesian" column is missing.
    """
    LOGGER.info(
        "Creating a bar chart for the number of webpages licensed by language."
    )

    selected_quarter = args.quarter

    # Strip the column names on a renamed copy so the caller's DataFrame
    # keeps its original labels
    data = data.rename(columns=str.strip)
    columns = data.columns.tolist()

    # Get the list of language columns dynamically
    start_index = columns.index("English")
    end_index = columns.index("Indonesian") + 1
    languages = columns[start_index:end_index]

    LOGGER.info(f"Cleaned Columns: {columns}")

    # Aggregate the data by summing the counts for each language
    language_data = data[languages].sum()

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x=language_data.index, y=language_data.values)
    # Title and x-axis label name the language dimension this chart plots
    plt.title(
        f"Number of Google Webpages Licensed by Language ({selected_quarter})"
    )
    plt.xlabel("Language")
    plt.ylabel("Number of Webpages")
    plt.xticks(rotation=45)

    # Add value numbers to the top of each bar
    for p in ax.patches:
        ax.annotate(
            format(p.get_height(), ",.0f"),
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="center",
            xytext=(0, 9),
            textcoords="offset points",
        )

    # Show full, comma-grouped integers on the y-axis. Installing this
    # formatter replaces the default one, so no scientific-notation toggle
    # is needed on the old formatter.
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _pos: f"{int(value):,}")
    )

    output_directory = os.path.join(
        PATHS["data"], f"{selected_quarter}", "3-reports"
    )

    LOGGER.info(f"Output directory: {output_directory}")

    # Create the directory if it does not exist
    os.makedirs(output_directory, exist_ok=True)
    plt.savefig(os.path.join(output_directory, "gcs_language_report.png"))

    plt.show()

    LOGGER.info("Visualization by language created.")
61
244
62
245
63
246
def main():
    """
    Parse the command-line arguments, load the data for the requested
    quarter, and create the country, license type, and language charts.

    Returns early, without creating any chart, when the loaded data is empty.
    """
    args = parse_arguments()
    data = load_data(args)
    if data.empty:
        return

    LOGGER.info(f"Current working directory: {os.getcwd()}")

    # Each visualizer takes the same (data, args) pair; run them in order
    for visualize in (
        visualize_by_country,
        visualize_by_license_type,
        visualize_by_language,
    ):
        visualize(data, args)
70
260
71
261
72
262
if __name__ == "__main__" :
0 commit comments