1
1
#!/usr/bin/env python
2
2
"""
3
- This file is dedicated to querying data from the GitHub API.
3
+ Fetch CC Legal Tool usage from GitHub API.
4
4
"""
5
5
6
6
# Standard library
7
7
import argparse
8
8
import csv
9
9
import os
10
10
import sys
11
+ import textwrap
11
12
import traceback
13
+ import urllib .parse
12
14
13
15
# Third-party
14
16
import requests
15
- import yaml
17
+ from pygments import highlight
18
+ from pygments .formatters import TerminalFormatter
19
+ from pygments .lexers import PythonTracebackLexer
16
20
from requests .adapters import HTTPAdapter
17
21
from urllib3 .util .retry import Retry
18
22
25
29
# Setup
# Shared project helper: configures logging and resolves the data paths
# (data_phase, data_quarter, repo, ...) for this script.
LOGGER, PATHS = shared.setup(__file__)

# Constants
# Output CSV holding one repository-count row per legal tool for the quarter.
FILE1_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
# HTTP status codes that should trigger an automatic retry of the request.
GITHUB_RETRY_STATUS_FORCELIST = [
    408,  # Request Timeout
    422,  # Unprocessable Content
    # (Validation failed, or the endpoint has been spammed)
    429,  # Too Many Requests
    500,  # Internal Server Error
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
]
# Legal tools to query, keyed by the SPDX identifier GitHub's license
# filter understands ("N/A" marks the total-repositories baseline row).
# Also see: https://en.wikipedia.org/wiki/Public-domain-equivalent_license
GITHUB_TOOLS = [
    {"TOOL_IDENTIFIER": "BSD Zero Clause License", "SPDX_IDENTIFIER": "0BSD"},
    {"TOOL_IDENTIFIER": "CC0 1.0", "SPDX_IDENTIFIER": "CC0-1.0"},
    {"TOOL_IDENTIFIER": "CC BY 4.0", "SPDX_IDENTIFIER": "CC-BY-4.0"},
    {"TOOL_IDENTIFIER": "CC BY-SA 4.0", "SPDX_IDENTIFIER": "CC-BY-SA-4.0"},
    {"TOOL_IDENTIFIER": "MIT No Attribution", "SPDX_IDENTIFIER": "MIT-0"},
    {"TOOL_IDENTIFIER": "Unlicense", "SPDX_IDENTIFIER": "Unlicense"},
    {"TOOL_IDENTIFIER": "Total public repositories", "SPDX_IDENTIFIER": "N/A"},
]
# Column order for FILE1_COUNT.
HEADER1_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
# Quarter label taken from the data directory name (e.g. "2024Q3" —
# presumed format; confirm against shared.setup()).
QUARTER = os.path.basename(PATHS["data_quarter"])
30
56
31
57
32
58
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    # Both options are opt-in boolean flags (default False).
    flag_specs = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, help_text in flag_specs:
        parser.add_argument(flag, action="store_true", help=help_text)
    return parser.parse_args()
42
75
43
76
44
- def set_up_data_file ():
45
- """
46
- Sets up the data file for recording results.
47
- """
48
- LOGGER .info ("Setting up the data file for recording results." )
49
- header = "LICENSE_TYPE,Repository Count\n "
50
- with open (
51
- os .path .join (PATHS ["data_phase" ], "github_fetched.csv" ), "w"
52
- ) as f :
53
- f .write (header )
54
-
55
-
56
- def get_response_elems (license_type ):
57
- """
58
- Provides the metadata for a query of
59
- specified license type from GitHub API.
60
-
61
- Args:
62
- license_type: A string representing the type of license.
63
- Returns:
64
- dict: A dictionary mapping metadata
65
- to its value provided from the API query.
66
- """
67
- LOGGER .info (f"Querying metadata for license: { license_type } " )
77
def check_for_completion():
    """
    Stop early if this quarter's data fetch has already completed.

    Raises shared.QuantifyingException with exit code 0 when the count
    CSV already holds one row per entry in GITHUB_TOOLS.
    """
    try:
        with open(FILE1_COUNT, "r", newline="") as file_obj:
            rows = sum(1 for _ in csv.DictReader(file_obj, dialect="unix"))
    except FileNotFoundError:
        return  # File may not be found without --enable-save, etc.
    if rows == len(GITHUB_TOOLS):
        raise shared.QuantifyingException(
            f"Data fetch completed for {QUARTER}", 0
        )
87
+
88
+
89
def get_requests_session():
    """
    Return a requests.Session with retry behavior and GitHub headers.

    Retries up to 5 times with backoff on the status codes listed in
    GITHUB_RETRY_STATUS_FORCELIST, and requests the GitHub REST API
    JSON media type on every call.
    """
    session = requests.Session()
    retry_strategy = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=GITHUB_RETRY_STATUS_FORCELIST,
    )
    session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
    session.headers.update({"Accept": "application/vnd.github+json"})
    return session
110
100
111
- return data
112
101
102
def write_data(args, tool_data):
    """
    Write fetched per-tool counts to FILE1_COUNT when saving is enabled.

    Skips writing (without error) when --enable-save was not given, and
    aborts with a logged error when tool_data is incomplete. Returns
    args unchanged in every case.
    """
    if not args.enable_save:
        return args

    # Create data directory for this phase
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    if len(tool_data) < len(GITHUB_TOOLS):
        LOGGER.error("Unable to fetch all records. Aborting.")
        return args

    with open(FILE1_COUNT, "w", newline="") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(tool_data)
    return args
121
+
122
+
123
def _build_search_url(tool_identifier, spdx_identifier):
    # Build the repository-search URL for one tool; per_page=1 because
    # only the total_count field of the response is used.
    base_url = "https://api.github.com/search/repositories?per_page=1&q="
    search_parameters = "is:public"
    if tool_identifier != "Total public repositories":
        search_parameters = (
            f"{search_parameters} license:{spdx_identifier.lower()}"
        )
    return base_url + urllib.parse.quote(search_parameters, safe=":/")


def query_github(args, session):
    """
    Query the GitHub search API for each tool in GITHUB_TOOLS.

    Returns a list of dicts matching HEADER1_COUNT (tool, SPDX id,
    repository count). Raises shared.QuantifyingException (exit code 1)
    on HTTP errors, request failures, or an unexpected response shape.
    """
    tool_data = []
    for tool in GITHUB_TOOLS:
        tool_identifier = tool["TOOL_IDENTIFIER"]
        spdx_identifier = tool["SPDX_IDENTIFIER"]
        LOGGER.info(f"Query: tool: {tool_identifier}, spdx: {spdx_identifier}")

        request_url = _build_search_url(tool_identifier, spdx_identifier)

        try:
            with session.get(request_url) as response:
                response.raise_for_status()
                count = response.json()["total_count"]
        except requests.HTTPError as e:
            LOGGER.error(f"HTTP Error: {e}")
            raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
        except requests.RequestException as e:
            LOGGER.error(f"Request Exception: {e}")
            raise shared.QuantifyingException(f"Request Exception: {e}", 1)
        except KeyError as e:
            LOGGER.error(f"KeyError: {e}.")
            raise shared.QuantifyingException(f"KeyError: {e}", 1)

        tool_data.append(
            {
                "TOOL_IDENTIFIER": tool_identifier,
                "SPDX_IDENTIFIER": spdx_identifier,
                "COUNT": count,
            }
        )
        LOGGER.info(f"count: {count}")
    return tool_data
148
162
149
163
150
164
def main():
    """
    Fetch CC legal tool usage counts from the GitHub API and, when the
    corresponding flags are enabled, save the results and commit/push
    them to the data repository.
    """
    args = parse_arguments()
    shared.log_paths(LOGGER, PATHS)
    # Exits early (via QuantifyingException, code 0) if this quarter's
    # data file is already complete.
    check_for_completion()
    session = get_requests_session()
    tool_data = query_github(args, session)
    args = write_data(args, tool_data)
    # git actions are no-ops unless --enable-git was given (handled by
    # the shared helpers).
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        # Fixed typo in commit message: "GitHUB" -> "GitHub"
        f"Add and commit new GitHub data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
196
178
197
179
198
180
if __name__ == "__main__" :
@@ -211,5 +193,13 @@ def main():
211
193
LOGGER .info ("(130) Halted via KeyboardInterrupt." )
212
194
sys .exit (130 )
213
195
except Exception :
214
- LOGGER .exception (f"(1) Unhandled exception: { traceback .format_exc ()} " )
196
+ traceback_formatted = textwrap .indent (
197
+ highlight (
198
+ traceback .format_exc (),
199
+ PythonTracebackLexer (),
200
+ TerminalFormatter (),
201
+ ),
202
+ " " ,
203
+ )
204
+ LOGGER .exception (f"(1) Unhandled exception:\n { traceback_formatted } " )
215
205
sys .exit (1 )
0 commit comments