Skip to content

Commit 24741cf

Browse files
committed
add GCS intro and references
1 parent 955a29e commit 24741cf

File tree

2 files changed

+202
-34
lines changed

2 files changed

+202
-34
lines changed

scripts/3-report/gcs_report.py

+67-34
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
# Constants
3232
QUARTER = os.path.basename(PATHS["data_quarter"])
33+
SECTION = "Google Custom Search (GCS)"
3334

3435

3536
def parse_arguments():
@@ -61,34 +62,73 @@ def parse_arguments():
6162
args = parser.parse_args()
6263
if not args.enable_save and args.enable_git:
6364
parser.error("--enable-git requires --enable-save")
65+
args.logger = LOGGER
66+
args.paths = PATHS
6467
return args
6568

6669

70+
def gcs_intro(args):
    """
    Write Google Custom Search (GCS) introduction.

    Reads gcs_totals_by_product.csv from the 2-process data directory, sums
    its "Count" column, and passes an "Overview" entry for SECTION to
    shared.update_readme.
    """
    # Log this function's own summary. Previously this logged the docstring
    # of plot_totals_by_product, which mislabeled this step in the log.
    LOGGER.info(gcs_intro.__doc__.strip())
    file_path = shared.path_join(
        PATHS["data_2-process"], "gcs_totals_by_product.csv"
    )
    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
    data = pd.read_csv(file_path)
    # The two None arguments occupy the positions where the plot functions
    # in this file pass an image path and its description; the introduction
    # has no image.
    shared.update_readme(
        args,
        SECTION,
        "Overview",
        None,
        None,
        "Google Custom Search (GCS) data uses the `totalResults` returned by"
        " API for search queries of the legal tool URLs (quoted and using"
        " `linkSite` for accuracy), country codes, and language codes.\n"
        "\n"
        f"**The results show there are a total of {data['Count'].sum():,d}"
        " online documents in the commons--documents that are licensed or put"
        " in the public domain using a Creative Commons (CC) legal tool.**\n"
        "\n"
        "Thank you Google for providing the Programmable Search Engine: Custom"
        " Search JSON API!\n",
    )
97+
98+
6799
def plot_top_25_tools(args):
68100
"""
69101
Create a bar chart for the top 25 legal tools
70102
"""
103+
# Log this function's own summary (not plot_totals_by_product's docstring)
LOGGER.info(plot_top_25_tools.__doc__.strip())
71104
file_path = shared.path_join(
72105
PATHS["data_2-process"], "gcs_top_25_tools.csv"
73106
)
74-
LOGGER.info("Create a bar chart for the top 25 legal tools")
75107
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
76108
data = pd.read_csv(file_path)
77109

78110
plt.figure(figsize=(10, 10))
79-
ax = sns.barplot(data, x="Count", y="CC legal tool")
111+
y_column = "CC legal tool"
112+
ax = sns.barplot(
113+
data,
114+
x="Count",
115+
y=y_column,
116+
hue=y_column,
117+
palette="pastel",
118+
legend=False,
119+
)
80120
for index, row in data.iterrows():
81121
ax.annotate(
82122
f"{row['Count']:,d}",
83-
(4, index),
123+
(4 + 80, index),
84124
xycoords=("axes points", "data"),
85-
color="white",
86-
fontsize="x-small",
87-
horizontalalignment="left",
125+
color="black",
126+
fontsize="small",
127+
horizontalalignment="right",
88128
verticalalignment="center",
89129
)
90130
plt.title(f"Top 25 legal tools ({args.quarter})")
91-
plt.xlabel("Number of references")
131+
plt.xlabel("Number of works")
92132
plt.ylabel("Creative Commons (CC) legal tool")
93133

94134
# Use the millions formatter for x-axis
@@ -113,13 +153,11 @@ def millions_formatter(x, pos):
113153
plt.savefig(image_path)
114154

115155
shared.update_readme(
116-
PATHS,
117-
image_path,
118-
"Google Custom Search",
119-
"Bar chart showing the top 25 legal tools based on the count of"
120-
" search results for each legal tool's URL.",
121-
"Top 25 legal tools",
122156
args,
157+
SECTION,
158+
"Top 25 legal tools",
159+
image_path,
160+
"Bar chart showing the top 25 individual legal tools.",
123161
)
124162

125163
LOGGER.info("Visualization by license type created.")
@@ -129,10 +167,10 @@ def plot_totals_by_product(args):
129167
"""
130168
Create a bar chart of the totals by product
131169
"""
170+
LOGGER.info(plot_totals_by_product.__doc__.strip())
132171
file_path = shared.path_join(
133172
PATHS["data_2-process"], "gcs_totals_by_product.csv"
134173
)
135-
LOGGER.info(__doc__)
136174
LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
137175
data = pd.read_csv(file_path)
138176

@@ -152,14 +190,14 @@ def plot_totals_by_product(args):
152190
(0 + 80, index),
153191
xycoords=("axes points", "data"),
154192
color="black",
155-
fontsize="x-small",
193+
fontsize="small",
156194
horizontalalignment="right",
157195
verticalalignment="center",
158196
)
159197
plt.title(f"Totals by product ({args.quarter})")
160198
plt.ylabel("Creative Commons (CC) legal tool product")
161199
plt.xscale("log")
162-
plt.xlabel("Number of references")
200+
plt.xlabel("Number of works")
163201

164202
# Use the millions formatter for x-axis
165203
def millions_formatter(x, pos):
@@ -185,15 +223,12 @@ def millions_formatter(x, pos):
185223
plt.savefig(image_path)
186224

187225
shared.update_readme(
188-
PATHS,
226+
args,
227+
SECTION,
228+
"Totals by product",
189229
image_path,
190-
"Google Custom Search",
191230
"Bar chart showing how many documents there are for each Creative"
192-
" Commons (CC) legal tool. **There are a total of"
193-
f" {data['Count'].sum():,d} documents that are either CC licensed"
194-
" or put in the public domain using a CC legal tool.**",
195-
"Totals by product",
196-
args,
231+
" Commons (CC) legal tool product.",
197232
)
198233

199234
LOGGER.info("Visualization by license type created.")
@@ -234,7 +269,7 @@ def millions_formatter(x, pos):
234269
# plt.xticks(rotation=45)
235270
#
236271
# # Add value numbers to the top of each bar
237-
# for p in ax.patches:
272+
# for p in ax.patches:
238273
# ax.annotate(
239274
# format(p.get_height(), ",.0f"),
240275
# (p.get_x() + p.get_width() / 2.0, p.get_height()),
@@ -265,12 +300,11 @@ def millions_formatter(x, pos):
265300
# plt.show()
266301
#
267302
# shared.update_readme(
268-
# PATHS,
303+
# args,
304+
# SECTION,
305+
# "Country Report",
269306
# image_path,
270-
# "Google Custom Search",
271307
# "Number of Google Webpages Licensed by Country",
272-
# "Country Report",
273-
# args,
274308
# )
275309
#
276310
# LOGGER.info("Visualization by country created.")
@@ -343,25 +377,24 @@ def millions_formatter(x, pos):
343377
# plt.show()
344378
#
345379
# shared.update_readme(
346-
# PATHS,
380+
# args,
381+
# SECTION,
382+
# "Language Report",
347383
# image_path,
348-
# "Google Custom Search",
349384
# "Number of Google Webpages Licensed by Language",
350-
# "Language Report",
351-
# args,
352385
# )
353386
#
354387
# LOGGER.info("Visualization by language created.")
355388

356389

357390
def main():
358391
args = parse_arguments()
359-
args.logger = LOGGER
360392
shared.log_paths(LOGGER, PATHS)
361393
shared.git_fetch_and_merge(args, PATHS["repo"])
362394

363-
plot_top_25_tools(args)
395+
gcs_intro(args)
364396
plot_totals_by_product(args)
397+
plot_top_25_tools(args)
365398
# plot_by_country(data, args)
366399
# plot_by_language(data, args)
367400

scripts/3-report/references.py

+135
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#!/usr/bin/env python
2+
"""
3+
Add project references.
4+
"""
5+
# Standard library
6+
import argparse
7+
import os
8+
import sys
9+
import textwrap
10+
import traceback
11+
12+
# Third-party
13+
from pygments import highlight
14+
from pygments.formatters import TerminalFormatter
15+
from pygments.lexers import PythonTracebackLexer
16+
17+
# Add parent directory so shared can be imported
18+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
19+
20+
# First-party/Local
21+
import shared # noqa: E402
22+
23+
# Setup
24+
LOGGER, PATHS = shared.setup(__file__)
25+
26+
# Constants
27+
QUARTER = os.path.basename(PATHS["data_quarter"])
28+
SECTION = "References"
29+
30+
31+
def parse_arguments():
    """
    Build the command-line parser, parse the arguments, and return them.

    The returned namespace also carries the module-level LOGGER and PATHS
    as args.logger and args.paths.
    """
    LOGGER.info("Parsing command-line arguments")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--quarter",
        default=QUARTER,
        help="Data quarter in format YYYYQx, e.g., 2024Q2",
    )
    # Boolean switches: each one is False unless given on the command line
    switches = (
        (
            "--show-plots",
            "Show generated plots (in addition to saving them)",
        ),
        (
            "--enable-save",
            "Enable saving results",
        ),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)
    args = parser.parse_args()
    # Git actions only make sense when results are being saved
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    args.logger, args.paths = LOGGER, PATHS
    return args
63+
64+
65+
def data_locations(args):
    """
    Write References

    Passes a "Data locations" entry for SECTION to shared.update_readme. The
    entry names the source repository and links to the fetched, processed,
    and report data directories.
    """
    # The two None arguments occupy the positions that follow the entry
    # title; this entry is text only.
    shared.update_readme(
        args,
        SECTION,
        "Data locations",
        None,
        None,
        "This report was generated as part of:\n"
        "\n"
        "**[creativecommons/quantifying][repo]:** *quantify the size and"
        " diversity of the commons--the collection of works that are openly"
        " licensed or in the public domain*\n"
        "\nThe data used to generate this report is available in that"
        " repository at the following locations:\n"
        "\n"
        " | Resource | Location |\n"
        " | --------------- | -------- |\n"
        " | Fetched data: | [`../1-fetch/`](../1-fetch) |\n"
        " | Processed data: | [`../2-process/`](../2-process) |\n"
        # The report phase is numbered 3 (1-fetch, 2-process, 3-report)
        " | Report data: | [`../3-report/`](../3-report) |\n"
        "\n"
        "[repo]: https://github.com/creativecommons/quantifying\n",
    )
91+
92+
93+
def main():
    """
    Entry point for the References report script.

    Calls, in order: parse_arguments, shared.log_paths,
    shared.git_fetch_and_merge, data_locations, shared.git_add_and_commit,
    and shared.git_push_changes.
    """
    args = parse_arguments()
    repo_path = PATHS["repo"]
    shared.log_paths(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, repo_path)

    data_locations(args)

    commit_message = f"Add and commit References for {QUARTER}"
    # git_add_and_commit returns the args object used for the push step
    args = shared.git_add_and_commit(
        args, repo_path, PATHS["data_quarter"], commit_message
    )
    shared.git_push_changes(args, repo_path)
107+
108+
109+
if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as err:
        # Exit code 0 is reported at info level, anything else as an error
        report = LOGGER.info if err.exit_code == 0 else LOGGER.error
        report(err.message)
        sys.exit(err.exit_code)
    except SystemExit as err:
        # Log non-zero exits, then re-exit with the same code
        if err.code != 0:
            LOGGER.error(f"System exit with code: {err.code}")
        sys.exit(err.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Syntax-highlight the traceback for the terminal, then indent it
        # beneath the log message
        colorized = highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        )
        traceback_formatted = textwrap.indent(colorized, " ")
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)

0 commit comments

Comments
 (0)