Commit 963aa2c

update GitHub fetch script

1 parent cea4957 commit 963aa2c

1 file changed: +138 -148 lines changed

scripts/1-fetch/github_fetch.py

Lines changed: 138 additions & 148 deletions
@@ -1,18 +1,22 @@
 #!/usr/bin/env python
 """
-This file is dedicated to querying data from the GitHub API.
+Fetch CC Legal Tool usage from GitHub API.
 """

 # Standard library
 import argparse
 import csv
 import os
 import sys
+import textwrap
 import traceback
+import urllib.parse

 # Third-party
 import requests
-import yaml
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry

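Note: in the import changes above, yaml drops out because the YAML state-file
bookkeeping is removed below, and pygments comes in for the colorized
traceback logging at the end of this diff. The retry plumbing (HTTPAdapter,
Retry) is unchanged and now lives in get_requests_session(). A minimal
standalone sketch of that session setup, assuming requests and urllib3 are
installed (urllib3 sleeps an exponentially growing multiple of backoff_factor
between attempts):

    # Sketch: a requests session that retries transient GitHub API errors.
    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    max_retries = Retry(
        total=5,            # at most five retries per request
        backoff_factor=10,  # exponential backoff between attempts
        status_forcelist=[408, 422, 429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=max_retries))
    session.headers.update({"Accept": "application/vnd.github+json"})

Unlike the old inline retry list, the new GITHUB_RETRY_STATUS_FORCELIST
(below) no longer retries on 403, which GitHub can return for rate-limited
requests.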
@@ -25,174 +29,152 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)

-# Log the start of the script execution
-LOGGER.info("Script execution started.")
+# Constants
+FILE1_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
+GITHUB_RETRY_STATUS_FORCELIST = [
+    408,  # Request Timeout
+    422,  # Unprocessable Content
+    #      (Validation failed, or the endpoint has been spammed)
+    429,  # Too Many Requests
+    500,  # Internal Server Error
+    502,  # Bad Gateway
+    503,  # Service Unavailable
+    504,  # Gateway Timeout
+]
+# Also see: https://en.wikipedia.org/wiki/Public-domain-equivalent_license
+GITHUB_TOOLS = [
+    {"TOOL_IDENTIFIER": "BSD Zero Clause License", "SPDX_IDENTIFIER": "0BSD"},
+    {"TOOL_IDENTIFIER": "CC0 1.0", "SPDX_IDENTIFIER": "CC0-1.0"},
+    {"TOOL_IDENTIFIER": "CC BY 4.0", "SPDX_IDENTIFIER": "CC-BY-4.0"},
+    {"TOOL_IDENTIFIER": "CC BY-SA 4.0", "SPDX_IDENTIFIER": "CC-BY-SA-4.0"},
+    {"TOOL_IDENTIFIER": "MIT No Attribution", "SPDX_IDENTIFIER": "MIT-0"},
+    {"TOOL_IDENTIFIER": "Unlicense", "SPDX_IDENTIFIER": "Unlicense"},
+    {"TOOL_IDENTIFIER": "Total public repositories", "SPDX_IDENTIFIER": "N/A"},
+]
+HEADER1_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
+QUARTER = os.path.basename(PATHS["data_quarter"])


 def parse_arguments():
     """
-    Parses command-line arguments, returns parsed arguments.
+    Parse command-line options and return the parsed argument namespace.
     """
-    LOGGER.info("Parsing command-line arguments")
-    parser = argparse.ArgumentParser(description="GitHub Data Fetching Script")
+    LOGGER.info("Parsing command-line options")
+    parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument(
-        "--licenses", type=int, default=3, help="Number of licenses to query"
+        "--enable-save",
+        action="store_true",
+        help="Enable saving results",
+    )
+    parser.add_argument(
+        "--enable-git",
+        action="store_true",
+        help="Enable git actions (fetch, merge, add, commit, and push)",
     )
     return parser.parse_args()


-def set_up_data_file():
-    """
-    Sets up the data file for recording results.
-    """
-    LOGGER.info("Setting up the data file for recording results.")
-    header = "LICENSE_TYPE,Repository Count\n"
-    with open(
-        os.path.join(PATHS["data_phase"], "github_fetched.csv"), "w"
-    ) as f:
-        f.write(header)
-
-
-def get_response_elems(license_type):
-    """
-    Provides the metadata for a query of
-    specified license type from GitHub API.
-
-    Args:
-        license_type: A string representing the type of license.
-    Returns:
-        dict: A dictionary mapping metadata
-        to its value provided from the API query.
-    """
-    LOGGER.info(f"Querying metadata for license: {license_type}")
+def check_for_completion():
     try:
-        base_url = "https://api.github.com/search/repositories?q=license:"
-        request_url = f"{base_url}{license_type}"
-        max_retries = Retry(
-            total=5,
-            backoff_factor=10,
-            status_forcelist=[403, 408, 429, 500, 502, 503, 504],
-        )
-        session = requests.Session()
-        session.mount("https://", HTTPAdapter(max_retries=max_retries))
-        with session.get(request_url) as response:
-            response.raise_for_status()
-            search_data = response.json()
-        return {"totalResults": search_data["total_count"]}
-    except requests.HTTPError as e:
-        LOGGER.error(f"HTTP Error: {e}")
-        raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
-    except requests.RequestException as e:
-        LOGGER.error(f"Request Exception: {e}")
-        raise shared.QuantifyingException(f"Request Exception: {e}", 1)
-    except KeyError as e:
-        LOGGER.error(f"KeyError: {e}.")
-        raise shared.QuantifyingException(f"KeyError: {e}", 1)
-
-
-def retrieve_license_data(args):
-    """
-    Retrieves the data of all license types specified.
-    """
-    LOGGER.info("Retrieving the data for all license types.")
-    licenses = ["CC0-1.0", "CC-BY-4.0", "CC-BY-SA-4.0"][: args.licenses]
-
-    data = []
-    total_repos_retrieved = 0
-
-    for license_type in licenses:
-        data_dict = get_response_elems(license_type)
-        total_repos_retrieved += data_dict["totalResults"]
-        record_results(license_type, data_dict)
+        with open(FILE1_COUNT, "r", newline="") as file_obj:
+            reader = csv.DictReader(file_obj, dialect="unix")
+            if len(list(reader)) == len(GITHUB_TOOLS):
+                raise shared.QuantifyingException(
+                    f"Data fetch completed for {QUARTER}", 0
+                )
+    except FileNotFoundError:
+        pass  # File may not be found without --enable-save, etc.
+
+
+def get_requests_session():
+    max_retries = Retry(
+        total=5,
+        backoff_factor=10,
+        status_forcelist=GITHUB_RETRY_STATUS_FORCELIST,
+    )
+    session = requests.Session()
+    session.mount("https://", HTTPAdapter(max_retries=max_retries))
+    session.headers.update({"Accept": "application/vnd.github+json"})

-    for row in data:
-        LOGGER.info(f"Collected data row: {row}")
+    return session

-    return data

+def write_data(args, tool_data):
+    if not args.enable_save:
+        return args

-def record_results(license_type, data):
-    """
-    Records the data for a specific license type into the CSV file.
-    """
-    LOGGER.info(f"Recording data for license: {license_type}")
-    row = [license_type, data["totalResults"]]
-    with open(
-        os.path.join(PATHS["data_phase"], "github_fetched.csv"),
-        "a",
-        newline="",
-    ) as f:
-        writer = csv.writer(f, dialect="unix")
-        writer.writerow(row)
-
-
-def load_state():
-    """
-    Loads the state from a YAML file, returns the last recorded state.
-    """
-    if os.path.exists(PATHS["state"]):
-        with open(PATHS["state"], "r") as f:
-            return yaml.safe_load(f)
-    return {"total_records_retrieved (github)": 0}
+    # Create data directory for this phase
+    os.makedirs(PATHS["data_phase"], exist_ok=True)

+    if len(tool_data) < len(GITHUB_TOOLS):
+        LOGGER.error("Unable to fetch all records. Aborting.")
+        return args

-def save_state(state: dict):
-    """
-    Saves the state to a YAML file.
-    Parameters:
-        state_file: Path to the state file.
-        state: The state dictionary to save.
-    """
-    with open(PATHS["state"], "w") as f:
-        yaml.safe_dump(state, f)
+    with open(FILE1_COUNT, "w", newline="") as file_obj:
+        writer = csv.DictWriter(
+            file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
+        )
+        writer.writeheader()
+        for row in tool_data:
+            writer.writerow(row)
+    return args
+
+
+def query_github(args, session):
+    tool_data = []
+    for tool in GITHUB_TOOLS:
+        tool_identifier = tool["TOOL_IDENTIFIER"]
+        spdx_identifier = tool["SPDX_IDENTIFIER"]
+        LOGGER.info(f"Query: tool: {tool_identifier}, spdx: {spdx_identifier}")
+
+        base_url = "https://api.github.com/search/repositories?per_page=1&q="
+        search_parameters = "is:public"
+        if tool_identifier != "Total public repositories":
+            search_parameters = (
+                f"{search_parameters} license:{spdx_identifier.lower()}"
+            )
+        search_parameters = urllib.parse.quote(search_parameters, safe=":/")
+        request_url = f"{base_url}{search_parameters}"
+
+        try:
+            with session.get(request_url) as response:
+                response.raise_for_status()
+                search_data = response.json()
+            count = search_data["total_count"]
+            tool_data.append(
+                {
+                    "TOOL_IDENTIFIER": tool_identifier,
+                    "SPDX_IDENTIFIER": spdx_identifier,
+                    "COUNT": count,
+                }
+            )
+            LOGGER.info(f"count: {count}")
+        except requests.HTTPError as e:
+            LOGGER.error(f"HTTP Error: {e}")
+            raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
+        except requests.RequestException as e:
+            LOGGER.error(f"Request Exception: {e}")
+            raise shared.QuantifyingException(f"Request Exception: {e}", 1)
+        except KeyError as e:
+            LOGGER.error(f"KeyError: {e}.")
+            raise shared.QuantifyingException(f"KeyError: {e}", 1)
+    return tool_data


 def main():
-
-    # Fetch and merge changes
-    shared.fetch_and_merge(PATHS["repo"])
-
     args = parse_arguments()
-
-    state = load_state()
-    total_records_retrieved = state["total_records_retrieved (github)"]
-    LOGGER.info(f"Initial total_records_retrieved: {total_records_retrieved}")
-    goal_records = 1000  # Set goal number of records
-
-    if total_records_retrieved >= goal_records:
-        LOGGER.info(
-            f"Goal of {goal_records} records already achieved."
-            " No further action required."
-        )
-        return
-
-    # Log the paths being used
     shared.log_paths(LOGGER, PATHS)
-
-    # Create data directory for this phase
-    os.makedirs(PATHS["data_phase"], exist_ok=True)
-
-    if total_records_retrieved == 0:
-        set_up_data_file()
-
-    # Retrieve and record data
-    repos_retrieved = retrieve_license_data(args)
-
-    # Update the state with the new count of retrieved records
-    total_records_retrieved += repos_retrieved
-    LOGGER.info(
-        f"Total records retrieved after fetching: {total_records_retrieved}"
+    check_for_completion()
+    session = get_requests_session()
+    tool_data = query_github(args, session)
+    args = write_data(args, tool_data)
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new GitHub data for {QUARTER}",
     )
-    state["total_records_retrieved (github)"] = total_records_retrieved
-    save_state(state)
-
-    # Add and commit changes
-    shared.add_and_commit(
-        PATHS["repo"], PATHS["data_quarter"], "Add and commit GitHub data"
-    )
-
-    # Push changes
-    shared.push_changes(PATHS["repo"])
+    shared.git_push_changes(args, PATHS["repo"])


 if __name__ == "__main__":
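
Note: for each entry in GITHUB_TOOLS, query_github() issues a single search
request with per_page=1, so the response body stays small while total_count
still reports the full number of matching repositories. A minimal sketch of
the URL built for the CC0 1.0 row, using the quoting call from the diff above
(spaces are percent-encoded; ":" is kept via safe=":/"):

    import urllib.parse

    base_url = "https://api.github.com/search/repositories?per_page=1&q="
    search_parameters = urllib.parse.quote(
        "is:public license:cc0-1.0", safe=":/"
    )
    print(f"{base_url}{search_parameters}")
    # .../search/repositories?per_page=1&q=is:public%20license:cc0-1.0

With --enable-save, write_data() then serializes one row per tool using the
csv "unix" dialect (all fields quoted, LF line endings), so github_1_count.csv
looks roughly like this (counts are placeholders):

    "TOOL_IDENTIFIER","SPDX_IDENTIFIER","COUNT"
    "CC0 1.0","CC0-1.0","<count>"
    "CC BY 4.0","CC-BY-4.0","<count>"
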
@@ -211,5 +193,13 @@ def main():
         LOGGER.info("(130) Halted via KeyboardInterrupt.")
         sys.exit(130)
     except Exception:
-        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
+        traceback_formatted = textwrap.indent(
+            highlight(
+                traceback.format_exc(),
+                PythonTracebackLexer(),
+                TerminalFormatter(),
+            ),
+            " ",
+        )
+        LOGGER.exception(f"(1) Unhandled exception:\n{traceback_formatted}")
         sys.exit(1)
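
Note: the new handler routes the traceback through Pygments before logging,
so unhandled exceptions appear colorized and indented in terminal output. A
self-contained usage sketch, assuming pygments is installed (the sample
exception is only illustrative):

    import textwrap
    import traceback

    from pygments import highlight
    from pygments.formatters import TerminalFormatter
    from pygments.lexers import PythonTracebackLexer

    try:
        1 / 0
    except ZeroDivisionError:
        # highlight() wraps the traceback text in ANSI color escapes;
        # textwrap.indent() prefixes every line for readability in logs.
        colorized = highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        )
        print(textwrap.indent(colorized, "    "))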
