Commit f05b713

add totals_by_country and totals_by_langauage
1 parent 6cfacd2 commit f05b713

1 file changed: +90 -22 lines

scripts/2-process/gcs_process.py

@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 """
-This file is dedicated to processing Google Custom Search data
-for analysis and comparison between quarters.
+Process Google Custom Search (GCS) data.
 """
 # Standard library
 import argparse
@@ -205,7 +204,7 @@ def data_to_csv(args, data, file_path):
 
 
 def process_top_25_tools(args, count_data):
-    LOGGER.info("Processing top 25 tools")
+    LOGGER.info("Processing count data: top 25 tools")
     data = count_data.sort_values("COUNT", ascending=False)
     data.reset_index(drop=True, inplace=True)
     data = data.iloc[:25]
@@ -218,7 +217,7 @@ def process_top_25_tools(args, count_data):
 
 
 def process_totals_by_product(args, count_data):
-    LOGGER.info("Processing totals by product")
+    LOGGER.info("Processing count data: totals by product")
     data = {
         "Licenses version 4.0": 0,
         "Licenses version 3.0": 0,
@@ -259,7 +258,7 @@ def process_totals_by_product(args, count_data):
 
 
 def process_totals_by_unit(args, count_data):
-    LOGGER.info("Processing totals by unit")
+    LOGGER.info("Processing count data: totals by unit")
     data = {}
     for row in count_data.itertuples(index=False):
         tool = row[0]
@@ -287,11 +286,14 @@ def process_totals_by_unit(args, count_data):
     data_to_csv(args, data, file_path)
 
 
+# https://creativecommons.org/public-domain/freeworks/
 def process_totals_by_free_cultural(args, count_data):
-    LOGGER.info("Processing totals by Approved for Free Cultural Works")
+    LOGGER.info(
+        "Processing count data: totals by Approved for Free Cultural Works"
+    )
     data = {
         "Approved for Free Cultural Works": 0,
-        "Limited uses": 0,
+        "Limited use": 0,
     }
     for row in count_data.itertuples(index=False):
         tool = row[0]
@@ -304,7 +306,7 @@ def process_totals_by_free_cultural(args, count_data):
             if unit in ["by-sa", "by", "sa", "sampling+"]:
                 key = "Approved for Free Cultural Works"
             else:
-                key = "Limited uses"
+                key = "Limited use"
         data[key] += count
 
     data = pd.DataFrame(data.items(), columns=["Category", "Count"])
@@ -317,22 +319,27 @@ def process_totals_by_free_cultural(args, count_data):
 
 
 def process_totals_by_restrictions(args, count_data):
-    LOGGER.info("Processing totals by restriction")
-    data = {"level 0": 0, "level 1": 0, "level 2": 0, "level 3": 0}
+    LOGGER.info("Processing count data: totals by restriction")
+    data = {
+        "level 0 - unrestricted": 0,
+        "level 1 - few restrictions": 0,
+        "level 2 - some restrictions": 0,
+        "level 3 - many restrictions": 0,
+    }
     for row in count_data.itertuples(index=False):
         tool = row[0]
         count = row[1]
         if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
-            key = "level 0"
+            key = "level 0 - unrestricted"
         else:
             parts = tool.split()
             unit = parts[1].lower()
             if unit in ["by-sa", "by", "sa", "sampling+"]:
-                key = "level 1"
+                key = "level 1 - few restrictions"
             elif unit in ["by-nc", "by-nc-sa", "sampling", "nc", "nc-sa"]:
-                key = "level 2"
+                key = "level 2 - some restrictions"
             else:
-                key = "level 3"
+                key = "level 3 - many restrictions"
         data[key] += count
 
     data = pd.DataFrame(data.items(), columns=["Category", "Count"])
@@ -342,6 +349,64 @@ def process_totals_by_restrictions(args, count_data):
     data_to_csv(args, data, file_path)
 
 
+def process_totals_by_langauage(args, data):
+    LOGGER.info("Processing language data: totals by language")
+    data = data.groupby(["LANGUAGE"], as_index=False)["COUNT"].sum()
+    data = data.sort_values("COUNT", ascending=False)
+    data.reset_index(drop=True, inplace=True)
+    data.rename(
+        columns={
+            "LANGUAGE": "Language",
+            "COUNT": "Count",
+        },
+        inplace=True,
+    )
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_langauage.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_country(args, data):
+    LOGGER.info("Processing country data: totals by country")
+    data = data.groupby(["COUNTRY"], as_index=False)["COUNT"].sum()
+    data = data.sort_values("COUNT", ascending=False)
+    data.reset_index(drop=True, inplace=True)
+    data.rename(
+        columns={
+            "COUNTRY": "Country",
+            "COUNT": "Count",
+        },
+        inplace=True,
+    )
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_country.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+# Data is already limited to licenses 4.0, CC0, and PDM
+#
+# def process_license_40_totals_by_langauage(args, data):
+#     LOGGER.info("Processing language data: top 25 languages")
+#     data = data[data["TOOL_IDENTIFIER"].str.contains("CC BY")]
+#     data = data[data["TOOL_IDENTIFIER"].str.contains("4.0")]
+#     data = data.groupby(["LANGUAGE"], as_index=False)['COUNT'].sum()
+#     data = data.sort_values("COUNT", ascending=False)
+#     data.reset_index(drop=True, inplace=True)
+#     data.rename(
+#         columns={
+#             "LANGUAGE": "Language",
+#             "COUNT": "Count",
+#         },
+#         inplace=True,
+#     )
+#     file_path = shared.path_join(
+#         PATHS["data_phase"], "gcs_license_40_totals_by_langauage.csv"
+#     )
+#     data_to_csv(args, data, file_path)
+
+
 def main():
     args = parse_arguments()
     shared.log_paths(LOGGER, PATHS)
@@ -355,15 +420,18 @@ def main():
     process_totals_by_free_cultural(args, count_data)
     process_totals_by_restrictions(args, count_data)
 
-    # # Langauge data
-    # langauge_data = pd.read_csv(
-    #     FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
-    # )
+    # Langauge data
+    language_data = pd.read_csv(
+        FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    )
+    process_totals_by_langauage(args, language_data)
+    # process_license_40_totals_by_langauage(args, language_data)
 
-    # # Country data
-    # country_data = pd.read_csv(
-    #     FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
-    # )
+    # Country data
+    country_data = pd.read_csv(
+        FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
+    )
+    process_totals_by_country(args, country_data)
 
     args = shared.git_add_and_commit(
         args,
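
Both new helpers follow the same aggregation pattern: group the raw GCS rows by one dimension (LANGUAGE or COUNTRY), sum COUNT, sort descending, rename the columns, and pass the result to data_to_csv(). A minimal standalone sketch of that pattern follows; the sample rows and the print call are illustrative only and are not part of the repository.

import pandas as pd

# Hypothetical rows shaped like the FILE3_COUNTRY input
# (TOOL_IDENTIFIER, COUNTRY, COUNT); for illustration only.
country_data = pd.DataFrame(
    {
        "TOOL_IDENTIFIER": ["CC BY 4.0", "CC BY-SA 4.0", "CC0 1.0"],
        "COUNTRY": ["Brazil", "Brazil", "India"],
        "COUNT": [120, 80, 95],
    }
)

# Same steps as process_totals_by_country(): group, sum, sort, rename.
totals = country_data.groupby(["COUNTRY"], as_index=False)["COUNT"].sum()
totals = totals.sort_values("COUNT", ascending=False)
totals.reset_index(drop=True, inplace=True)
totals.rename(columns={"COUNTRY": "Country", "COUNT": "Count"}, inplace=True)
print(totals)  # Brazil 200, India 95 (sorted descending)

In the script itself the final DataFrame is written with data_to_csv(args, data, file_path) rather than printed.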
