Skip to content

Commit 5b4f4ae

Browse files
authored
Merge pull request #101 from Darylgolden/add-github-source
Add GitHub repository tracking
2 parents 8412423 + 5e0475c commit 5b4f4ae

File tree

3 files changed

+123
-0
lines changed

3 files changed

+123
-0
lines changed

github/data_github_2024_4_5.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
LICENSE_TYPE,Repository Count
2+
CC0-1.0,239474
3+
CC-BY-4.0,89918
4+
CC-BY-SA-4.0,23318

github/github_scratcher.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Standard library
2+
import os
3+
import os.path
4+
import sys
5+
import traceback
6+
7+
# Third-party
8+
import requests
9+
from requests.adapters import HTTPAdapter
10+
from urllib3.util.retry import Retry
11+
12+
sys.path.append(".")
13+
# First-party/Local
14+
import quantify # noqa: E402
15+
16+
# Project-wide setup: quantify.setup() hands back four values that are bound
# to the module constants below (presumably repo/work/dotenv paths and
# today's date -- confirm against quantify.setup).
(
    PATH_REPO_ROOT,
    PATH_WORK_DIR,
    PATH_DOTENV,
    DATETIME_TODAY,
) = quantify.setup(__file__)

# Dated CSV output file inside the work directory; year, month and day are
# written without zero padding (e.g. data_github_2024_4_5.csv).
DATA_WRITE_FILE = os.path.join(
    PATH_WORK_DIR,
    "data_github_{0.year}_{0.month}_{0.day}.csv".format(DATETIME_TODAY),
)
25+
26+
27+
def set_up_data_file():
    """Create (or truncate) DATA_WRITE_FILE and write the CSV header row."""
    columns = "LICENSE_TYPE,Repository Count"
    # Mode "w" starts the file from scratch, so the header is always line 1.
    with open(DATA_WRITE_FILE, "w") as data_file:
        data_file.write(columns + "\n")
32+
33+
34+
def get_response_elems(license):
    """Query the GitHub repository search API for a given license.

    Args:
        license:
            A string holding the license identifier to search for (for
            example "CC0-1.0"). It is appended verbatim to the "license:"
            search qualifier of the request URL.
    Returns:
        dict: A dictionary with the single key "totalResults", mapped to the
        "total_count" field of the API's JSON response.
    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or still answers with an error status once the retries
            are exhausted.
        KeyError: If the JSON response has no "total_count" field.
    """
    base_url = "https://api.github.com/search/repositories?q=license:"
    request_url = f"{base_url}{license}"
    # Retry transient failures and rate-limit responses with backoff.
    max_retries = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=[403, 408, 429, 500, 502, 503, 504],
    )
    # Use the session as a context manager so its connection pool is
    # released even when the request raises.
    with requests.Session() as session:
        session.mount("https://", HTTPAdapter(max_retries=max_retries))
        # requests never times out by default; bound the wait so a stalled
        # connection cannot hang the whole run.
        with session.get(request_url, timeout=30) as response:
            response.raise_for_status()
            search_data = response.json()
    return {"totalResults": search_data["total_count"]}
62+
63+
64+
def record_license_data(license_type):
    """Append one CSV row with the GitHub repository count for a license.

    Args:
        license_type:
            A string naming the license to look up. It is passed unchanged
            to get_response_elems() and written as the first column of the
            row.
    """
    repository_count = get_response_elems(license_type)["totalResults"]
    row = f"{license_type},{repository_count}"
    # Append mode keeps the header and any rows written earlier.
    with open(DATA_WRITE_FILE, "a") as data_file:
        data_file.write(row + "\n")
79+
80+
81+
def record_all_licenses():
    """Record a data row in DATA_WRITE_FILE for every tracked license.

    The tracked licenses are queried and written one after another, in the
    order they are listed here.
    """
    for license_type in ("CC0-1.0", "CC-BY-4.0", "CC-BY-SA-4.0"):
        record_license_data(license_type)
88+
89+
90+
def main():
    """Write the data file header, then one row per tracked license."""
    set_up_data_file()
    record_all_licenses()
93+
94+
95+
# Script entry point: run main() and map each outcome to an exit status.
if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        # Pass an explicit exit request through with its original code.
        sys.exit(e.code)
    except KeyboardInterrupt:
        print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr)
        sys.exit(130)
    except Exception:
        print("ERROR (1) Unhandled exception:", file=sys.stderr)
        # print_exc() writes the traceback itself and returns None, so
        # wrapping it in print() would add a stray "None" line to stderr.
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)

sources.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,19 @@ The Flickr Developer Guide](https://www.flickr.com/services/developer/))
3636
- Query limit: 3600 requests per hour
3737
- Data available through CSV format
3838

39+
## GitHub
40+
41+
**Description:** A development platform for hosting and managing code.
42+
43+
**API documentation link:**
44+
- [GitHub REST API v3](https://docs.github.com/en/rest)
45+
46+
**API information:**
47+
- API key not required but recommended by GitHub
48+
- Query limit: 60 requests per hour if unauthenticated,
49+
5000 requests per hour if authenticated
50+
- Data available through JSON format
51+
3952

4053
## Google Custom Search JSON API
4154

0 commit comments

Comments
 (0)