Skip to content

Commit 656591f

Browse files
committed
add successful report generation of all data sources (including README generation)
1 parent 18d9633 commit 656591f

14 files changed

+1654
-88
lines changed
+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
LICENSE_TYPE,Repository Count
2+
CC0-1.0,239474
3+
CC-BY-4.0,89918
4+
CC-BY-SA-4.0,23318
1.02 KB
Loading

data/2024Q3/README.md

+2-8
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# 2024Q3 Quantifying the Commons
2-
<!-- GCS Start -->
32
## Data Source: Google Custom Search
4-
3+
<!-- Google Custom Search Start -->
54
<!-- Country Report Start -->
65
### Country Report
76
![Number of Google Webpages Licensed by Country](3-report/gcs_country_report.png)
@@ -17,9 +16,4 @@ Number of Webpages Licensed by License Type
1716
![Number of Google Webpages Licensed by Language](3-report/gcs_language_report.png)
1817
Number of Google Webpages Licensed by Language
1918
<!-- Language Report End -->
20-
<!-- GitHub License Type Report Start -->
21-
### GitHub License Type Report
22-
![Number of Repositories Licensed by License Type](../3-report/github_license_type_report.png)
23-
Number of Repositories Licensed by License Type
24-
<!-- GitHub License Type Report End -->
25-
<!-- GCS End -->
19+
<!-- Google Custom Search End -->
+193
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
#!/usr/bin/env python
2+
"""
3+
This file is dedicated to visualizing and analyzing the data collected
4+
from Deviantart.
5+
"""
6+
# Standard library
7+
import argparse
8+
import os
9+
import sys
10+
import traceback
11+
from datetime import datetime, timezone
12+
13+
# Third-party
14+
import matplotlib.pyplot as plt
15+
import pandas as pd
16+
import seaborn as sns
17+
from pandas import PeriodIndex
18+
19+
# Add parent directory so shared can be imported
20+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
21+
22+
# First-party/Local
23+
import shared # noqa: E402
24+
25+
# Setup
26+
LOGGER, PATHS = shared.setup(__file__)
27+
28+
29+
def parse_arguments():
    """
    Build the Deviantart report command-line interface and parse it.

    The --quarter option defaults to the quarter containing the current UTC
    date, and --skip-commit also turns on --skip-push. Returns the parsed
    arguments.
    """
    LOGGER.info("Parsing command-line arguments")

    # Taken from shared module, fix later
    today_utc = datetime.now(timezone.utc).date()
    current_quarter = PeriodIndex([today_utc], freq="Q")[0]

    cli = argparse.ArgumentParser(description="Deviantart Reports")
    cli.add_argument(
        "--quarter",
        "-q",
        type=str,
        required=False,
        default=f"{current_quarter}",
        help="Data quarter in format YYYYQx, e.g., 2024Q2",
    )

    # The remaining options are plain on/off switches
    switches = (
        (
            "--skip-commit",
            "Don't git commit changes (also skips git push changes)",
        ),
        ("--skip-push", "Don't git push changes"),
        (
            "--show-plots",
            "Show generated plots (in addition to saving them)",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    parsed = cli.parse_args()
    # Skipping the commit also skips the push
    parsed.skip_push = parsed.skip_push or parsed.skip_commit
    return parsed
67+
68+
69+
def load_data(args):
    """
    Read the fetched Deviantart CSV file for the selected quarter.

    Returns the file contents as a DataFrame. When the file does not exist,
    an error is logged and an empty DataFrame is returned instead.
    """
    csv_path = os.path.join(
        PATHS["data"],
        f"{args.quarter}",
        "1-fetch",
        "deviantart_fetched.csv",
    )

    if os.path.exists(csv_path):
        frame = pd.read_csv(csv_path)
        LOGGER.info(f"Data loaded from {csv_path}")
        return frame

    LOGGER.error(f"Data file not found: {csv_path}")
    return pd.DataFrame()
89+
90+
91+
def visualize_by_license_type(data, args):
    """
    Create a bar chart for the number of documents by license type.

    The chart is saved as deviantart_license_report.png in the 3-report
    directory of the selected quarter, optionally shown on screen, and then
    passed to shared.update_readme together with its title and section name.

    Args:
        data: DataFrame with "LICENSE TYPE" and "Document Count" columns
            (surrounding whitespace in the column names is tolerated).
        args: Parsed command-line arguments; quarter and show_plots are read
            here and the whole object is passed on to shared.update_readme.
    """
    LOGGER.info(
        "Creating a bar chart for the number of documents by license type."
    )

    selected_quarter = args.quarter

    # Strip any leading/trailing spaces from the column names on a renamed
    # copy, so the caller's DataFrame is left untouched
    data = data.rename(columns=str.strip)

    figure = plt.figure(figsize=(12, 8))
    try:
        ax = sns.barplot(x=data["LICENSE TYPE"], y=data["Document Count"])
        plt.title("Number of DeviantArt Documents by License Type")
        plt.xlabel("License Type")
        plt.ylabel("Document Count")
        plt.xticks(rotation=45, ha="right")

        # Add value numbers to the top of each bar
        for p in ax.patches:
            ax.annotate(
                format(p.get_height(), ",.0f"),
                (p.get_x() + p.get_width() / 2.0, p.get_height()),
                ha="center",
                va="center",
                xytext=(0, 9),
                textcoords="offset points",
            )

        output_directory = os.path.join(
            PATHS["data"], f"{selected_quarter}", "3-report"
        )

        LOGGER.info(f"Output directory: {output_directory}")

        os.makedirs(output_directory, exist_ok=True)
        image_path = os.path.join(
            output_directory, "deviantart_license_report.png"
        )
        plt.savefig(image_path)

        if args.show_plots:
            plt.show()
    finally:
        # Always release the figure, even if plotting or saving failed
        plt.close(figure)

    shared.update_readme(
        PATHS,
        image_path,
        "DeviantArt",
        "Number of DeviantArt Documents by License Type",
        "License Type Report",
        args,
    )

    LOGGER.info("Visualization by license type created.")
147+
148+
149+
def main():
    """
    Run the Deviantart report: sync the repository, load the quarter's data,
    create the license type chart, then commit and push unless skipped.
    """
    repo_path = PATHS["repo"]

    # Fetch and merge changes
    shared.fetch_and_merge(repo_path)

    args = parse_arguments()

    data = load_data(args)
    if data.empty:
        # Nothing to report on (load_data returns an empty frame when the
        # CSV file is missing)
        return

    LOGGER.info(f"Current working directory: {os.getcwd()}")

    visualize_by_license_type(data, args)

    # Add and commit changes
    if not args.skip_commit:
        commit_message = "Added and committed new Deviantart reports"
        shared.add_and_commit(repo_path, commit_message)

    # Push changes
    if not args.skip_push:
        shared.push_changes(repo_path)
174+
175+
176+
if __name__ == "__main__":
    # Run the script and map every outcome to a process exit code
    try:
        main()
    except shared.QuantifyingException as e:
        # These exceptions carry their own exit code; code 0 is logged as
        # information rather than as an error
        if e.exit_code == 0:
            LOGGER.info(e.message)
        else:
            LOGGER.error(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        # A clean exit (for example argparse --help) is not an error
        if e.code not in (0, None):
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # The message already embeds the traceback, so log it with error()
        # rather than exception(), which would append it a second time
        LOGGER.error(f"(1) Unhandled exception: {traceback.format_exc()}")
        sys.exit(1)

scripts/3-report/flickr_reports.py

+48-62
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,25 @@ def parse_arguments():
4646
default=f"{quarter}",
4747
help="Data quarter in format YYYYQx, e.g., 2024Q2",
4848
)
49-
return parser.parse_args()
49+
parser.add_argument(
50+
"--skip-commit",
51+
action="store_true",
52+
help="Don't git commit changes (also skips git push changes)",
53+
)
54+
parser.add_argument(
55+
"--skip-push",
56+
action="store_true",
57+
help="Don't git push changes",
58+
)
59+
parser.add_argument(
60+
"--show-plots",
61+
action="store_true",
62+
help="Show generated plots (in addition to saving them)",
63+
)
64+
args = parser.parse_args()
65+
if args.skip_commit:
66+
args.skip_push = True
67+
return args
5068

5169

5270
def load_data(args):
@@ -60,7 +78,7 @@ def load_data(args):
6078
f"{selected_quarter}",
6179
"1-fetch",
6280
"flickr_fetched",
63-
"license_total.csv",
81+
"final.csv",
6482
)
6583

6684
if not os.path.exists(file_path):
@@ -72,79 +90,47 @@ def load_data(args):
7290
return data
7391

7492

75-
def update_readme(image_path, description, section_title, args):
76-
"""
77-
Update the README.md file with the generated images and descriptions.
78-
"""
79-
readme_path = os.path.join(PATHS["data"], args.quarter, "README.md")
80-
section_marker_start = "<!-- Flickr Start -->"
81-
section_marker_end = "<!-- Flickr End -->"
82-
data_source_title = "## Data Source: Flickr"
83-
84-
# Convert image path to a relative path
85-
rel_image_path = os.path.relpath(image_path, os.path.dirname(readme_path))
86-
87-
if os.path.exists(readme_path):
88-
with open(readme_path, "r") as f:
89-
lines = f.readlines()
90-
else:
91-
lines = []
92-
93-
section_start = None
94-
section_end = None
95-
for i, line in enumerate(lines):
96-
if section_marker_start in line:
97-
section_start = i
98-
if section_marker_end in line:
99-
section_end = i
100-
101-
if section_start is None or section_end is None:
102-
# If the section does not exist, add it at the end
103-
lines.append(f"\n# {args.quarter} Quantifying the Commons\n")
104-
lines.append(f"{section_marker_start}\n")
105-
lines.append(f"{data_source_title}\n\n")
106-
lines.append(f"{section_marker_end}\n")
107-
section_start = len(lines) - 3
108-
section_end = len(lines) - 1
109-
110-
# Prepare the content to be added
111-
new_content = [
112-
f"\n### {section_title}\n",
113-
f"![{description}]({rel_image_path})\n",
114-
f"{description}\n",
115-
]
116-
117-
# Insert the new content before the section end marker
118-
lines = lines[:section_end] + new_content + lines[section_end:]
119-
120-
# Write back to the README.md file
121-
with open(readme_path, "w") as f:
122-
f.writelines(lines)
123-
124-
LOGGER.info(f"Updated {readme_path} with new image and description.")
125-
126-
12793
# Add functions for individual license graphs + word clouds + total license
12894

12995

13096
def main():
    """
    Run the Flickr report: sync the repository, load the quarter's data,
    then commit and push unless skipped.

    No Flickr visualizations are generated yet; see the TODO below.
    """
    # Fetch and merge changes
    shared.fetch_and_merge(PATHS["repo"])

    args = parse_arguments()

    data = load_data(args)
    if data.empty:
        return

    current_directory = os.getcwd()
    LOGGER.info(f"Current working directory: {current_directory}")

    # TODO: Insert functions for Flickr

    # Add and commit changes
    if not args.skip_commit:
        shared.add_and_commit(
            PATHS["repo"], "Added and committed new Flickr reports"
        )

    # Push changes
    if not args.skip_push:
        shared.push_changes(PATHS["repo"])
143123

144124

145125
if __name__ == "__main__":
146126
try:
147127
main()
128+
except shared.QuantifyingException as e:
129+
if e.exit_code == 0:
130+
LOGGER.info(e.message)
131+
else:
132+
LOGGER.error(e.message)
133+
sys.exit(e.exit_code)
148134
except SystemExit as e:
149135
LOGGER.error(f"System exit with code: {e.code}")
150136
sys.exit(e.code)

scripts/3-report/gcs_reports.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python
22
"""
3-
This file is dedicated to visualizing and analyzing the data collected.
3+
This file is dedicated to visualizing and analyzing the data collected
4+
from Google Custom Search.
45
"""
56
# Standard library
67
import argparse

0 commit comments

Comments
 (0)