1
1
#!/usr/bin/env python
2
2
"""
3
- This file is dedicated to processing Google Custom Search data
4
- for analysis and comparison between quarters.
3
+ Process Google Custom Search (GCS) data.
5
4
"""
6
5
# Standard library
7
6
import argparse
@@ -205,7 +204,7 @@ def data_to_csv(args, data, file_path):
205
204
206
205
207
206
def process_top_25_tools (args , count_data ):
208
- LOGGER .info ("Processing top 25 tools" )
207
+ LOGGER .info ("Processing count data: top 25 tools" )
209
208
data = count_data .sort_values ("COUNT" , ascending = False )
210
209
data .reset_index (drop = True , inplace = True )
211
210
data = data .iloc [:25 ]
@@ -218,7 +217,7 @@ def process_top_25_tools(args, count_data):
218
217
219
218
220
219
def process_totals_by_product (args , count_data ):
221
- LOGGER .info ("Processing totals by product" )
220
+ LOGGER .info ("Processing count data: totals by product" )
222
221
data = {
223
222
"Licenses version 4.0" : 0 ,
224
223
"Licenses version 3.0" : 0 ,
@@ -259,7 +258,7 @@ def process_totals_by_product(args, count_data):
259
258
260
259
261
260
def process_totals_by_unit (args , count_data ):
262
- LOGGER .info ("Processing totals by unit" )
261
+ LOGGER .info ("Processing count data: totals by unit" )
263
262
data = {}
264
263
for row in count_data .itertuples (index = False ):
265
264
tool = row [0 ]
@@ -287,11 +286,14 @@ def process_totals_by_unit(args, count_data):
287
286
data_to_csv (args , data , file_path )
288
287
289
288
289
+ # https://creativecommons.org/public-domain/freeworks/
290
290
def process_totals_by_free_cultural (args , count_data ):
291
- LOGGER .info ("Processing totals by Approved for Free Cultural Works" )
291
+ LOGGER .info (
292
+ "Processing count data: totals by Approved for Free Cultural Works"
293
+ )
292
294
data = {
293
295
"Approved for Free Cultural Works" : 0 ,
294
- "Limited uses " : 0 ,
296
+ "Limited use " : 0 ,
295
297
}
296
298
for row in count_data .itertuples (index = False ):
297
299
tool = row [0 ]
@@ -304,7 +306,7 @@ def process_totals_by_free_cultural(args, count_data):
304
306
if unit in ["by-sa" , "by" , "sa" , "sampling+" ]:
305
307
key = "Approved for Free Cultural Works"
306
308
else :
307
- key = "Limited uses "
309
+ key = "Limited use "
308
310
data [key ] += count
309
311
310
312
data = pd .DataFrame (data .items (), columns = ["Category" , "Count" ])
@@ -317,22 +319,27 @@ def process_totals_by_free_cultural(args, count_data):
317
319
318
320
319
321
def process_totals_by_restrictions(args, count_data):
    """
    Aggregate per-tool counts into four restriction levels and write the
    totals to a CSV file.

    Level 0 is public domain (PDM, CC0, PUBLICDOMAIN); levels 1-3 are
    assigned from the license unit: Free-Cultural units (by, by-sa, sa,
    sampling+) are level 1, NC-family units level 2, everything else
    (ND-family) level 3.

    :param args: parsed command-line arguments (forwarded to data_to_csv)
    :param count_data: DataFrame whose rows are (TOOL_IDENTIFIER, COUNT)
    """
    LOGGER.info("Processing count data: totals by restriction")
    data = {
        "level 0 - unrestricted": 0,
        "level 1 - few restrictions": 0,
        "level 2 - some restrictions": 0,
        "level 3 - many restrictions": 0,
    }
    for row in count_data.itertuples(index=False):
        tool = row[0]
        count = row[1]
        # Bug fix: the keys assigned below previously carried a trailing
        # space (e.g. "level 0 - unrestricted ") and therefore never
        # matched the keys initialized above, so `data[key] += count`
        # raised KeyError on the first row processed.
        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
            key = "level 0 - unrestricted"
        else:
            # Tool identifiers look like "CC <unit> <version>"; the unit
            # is the second whitespace-separated token.
            parts = tool.split()
            unit = parts[1].lower()
            if unit in ["by-sa", "by", "sa", "sampling+"]:
                key = "level 1 - few restrictions"
            elif unit in ["by-nc", "by-nc-sa", "sampling", "nc", "nc-sa"]:
                key = "level 2 - some restrictions"
            else:
                key = "level 3 - many restrictions"
        data[key] += count

    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
    # NOTE(review): the file_path lines were elided in the diff hunk; the
    # pattern below matches the sibling process_totals_by_* functions —
    # TODO confirm the exact CSV filename against the full file.
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
    )
    data_to_csv(args, data, file_path)
343
350
344
351
352
def process_totals_by_langauage(args, data):
    """
    Aggregate GCS language data into per-language totals, sorted by
    descending count, and write them to a CSV file.

    NOTE(review): "langauage" is a misspelling of "language"; it is kept
    in the function name and the CSV filename so existing callers and
    downstream consumers of the file keep working.

    :param args: parsed command-line arguments (forwarded to data_to_csv)
    :param data: DataFrame with LANGUAGE and COUNT columns
    """
    LOGGER.info("Processing language data: totals by language")
    totals = (
        data.groupby(["LANGUAGE"], as_index=False)["COUNT"]
        .sum()
        .sort_values("COUNT", ascending=False)
        .reset_index(drop=True)
        .rename(columns={"LANGUAGE": "Language", "COUNT": "Count"})
    )
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_langauage.csv"
    )
    data_to_csv(args, totals, file_path)
368
+
369
+
370
def process_totals_by_country(args, data):
    """
    Aggregate GCS country data into per-country totals, sorted by
    descending count, and write them to a CSV file.

    :param args: parsed command-line arguments (forwarded to data_to_csv)
    :param data: DataFrame with COUNTRY and COUNT columns
    """
    LOGGER.info("Processing country data: totals by country")
    totals = (
        data.groupby(["COUNTRY"], as_index=False)["COUNT"]
        .sum()
        .sort_values("COUNT", ascending=False)
        .reset_index(drop=True)
        .rename(columns={"COUNTRY": "Country", "COUNT": "Count"})
    )
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_country.csv"
    )
    data_to_csv(args, totals, file_path)
386
+
387
+
388
+ # Data is already limited to licenses 4.0, CC0, and PDM
389
+ #
390
+ # def process_license_40_totals_by_langauage(args, data):
391
+ # LOGGER.info("Processing language data: top 25 languages")
392
+ # data = data[data["TOOL_IDENTIFIER"].str.contains("CC BY")]
393
+ # data = data[data["TOOL_IDENTIFIER"].str.contains("4.0")]
394
+ # data = data.groupby(["LANGUAGE"], as_index=False)['COUNT'].sum()
395
+ # data = data.sort_values("COUNT", ascending=False)
396
+ # data.reset_index(drop=True, inplace=True)
397
+ # data.rename(
398
+ # columns={
399
+ # "LANGUAGE": "Language",
400
+ # "COUNT": "Count",
401
+ # },
402
+ # inplace=True,
403
+ # )
404
+ # file_path = shared.path_join(
405
+ # PATHS["data_phase"], "gcs_license_40_totals_by_langauage.csv"
406
+ # )
407
+ # data_to_csv(args, data, file_path)
408
+
409
+
345
410
def main ():
346
411
args = parse_arguments ()
347
412
shared .log_paths (LOGGER , PATHS )
@@ -355,15 +420,18 @@ def main():
355
420
process_totals_by_free_cultural (args , count_data )
356
421
process_totals_by_restrictions (args , count_data )
357
422
358
- # # Langauge data
359
- # langauge_data = pd.read_csv(
360
- # FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
361
- # )
423
+ # Langauge data
424
+ language_data = pd .read_csv (
425
+ FILE2_LANGUAGE , usecols = ["TOOL_IDENTIFIER" , "LANGUAGE" , "COUNT" ]
426
+ )
427
+ process_totals_by_langauage (args , language_data )
428
+ # process_license_40_totals_by_langauage(args, language_data)
362
429
363
- # # Country data
364
- # country_data = pd.read_csv(
365
- # FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
366
- # )
430
+ # Country data
431
+ country_data = pd .read_csv (
432
+ FILE3_COUNTRY , usecols = ["TOOL_IDENTIFIER" , "COUNTRY" , "COUNT" ]
433
+ )
434
+ process_totals_by_country (args , country_data )
367
435
368
436
args = shared .git_add_and_commit (
369
437
args ,
0 commit comments