12
12
import time
13
13
import traceback
14
14
import urllib .parse
15
+ from copy import copy
15
16
16
17
# Third-party
17
18
import googleapiclient .discovery
34
35
load_dotenv (PATHS ["dotenv" ])
35
36
36
37
# Constants
37
- DEVELOPER_KEY = os .getenv ("GCS_DEVELOPER_KEY" )
38
- CX = os .getenv ("GCS_CX" )
39
38
BASE_URL = "https://www.googleapis.com/customsearch/v1"
40
39
FILE1_COUNT = os .path .join (PATHS ["data_phase" ], "gcs_1_count.csv" )
41
40
FILE2_LANGUAGE = os .path .join (
42
41
PATHS ["data_phase" ], "gcs_2_count_by_language.csv"
43
42
)
44
43
FILE3_COUNTRY = os .path .join (PATHS ["data_phase" ], "gcs_3_count_by_country.csv" )
44
+ GCS_CX = os .getenv ("GCS_CX" )
45
+ GCS_DEVELOPER_KEY = os .getenv ("GCS_DEVELOPER_KEY" )
45
46
HEADER1_COUNT = ["PLAN_INDEX" , "TOOL_IDENTIFIER" , "COUNT" ]
46
47
HEADER2_LANGUAGE = ["PLAN_INDEX" , "TOOL_IDENTIFIER" , "LANGUAGE" , "COUNT" ]
47
48
HEADER3_COUNTRY = ["PLAN_INDEX" , "TOOL_IDENTIFIER" , "COUNTRY" , "COUNT" ]
@@ -87,7 +88,11 @@ def get_search_service():
87
88
"""
88
89
LOGGER .info ("Getting Google Custom Search API Service." )
89
90
return googleapiclient .discovery .build (
90
- "customsearch" , "v1" , developerKey = DEVELOPER_KEY , cache_discovery = False
91
+ "customsearch" ,
92
+ "v1" ,
93
+ developerKey = GCS_DEVELOPER_KEY ,
94
+ cache_discovery = False ,
95
+ num_retries = 5 ,
91
96
)
92
97
93
98
@@ -184,21 +189,15 @@ def query_gcs(args, service, last_completed_plan_index, plan):
184
189
185
190
max_tries = 5
186
191
initial_delay = 1 # in seconds
192
+ rate_delay = copy (initial_delay ) # query gently
187
193
start = last_completed_plan_index + 1
188
194
stop = start + args .limit
189
195
190
196
for plan_row in plan [start :stop ]: # noqa: E203
191
197
index = plan .index (plan_row )
192
198
query_info = f"index: { index } , tool: { plan_row ['TOOL_IDENTIFIER' ]} "
193
199
encoded_tool_url = urllib .parse .quote (plan_row ["TOOL_URL" ], safe = ":/" )
194
- query_params = {
195
- "cx" : CX ,
196
- # "num": records_per_query,
197
- # "start": start_index,
198
- # "cr": cr,
199
- # "lr": lr,
200
- "q" : encoded_tool_url ,
201
- }
200
+ query_params = {"cx" : GCS_CX , "q" : encoded_tool_url }
202
201
if plan_row ["COUNTRY" ]:
203
202
query_info = f"{ query_info } , country: { plan_row ['COUNTRY' ]} "
204
203
query_params ["cr" ] = plan_row ["CR" ]
@@ -222,6 +221,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
222
221
results .get ("searchInformation" , {}).get ("totalResults" , 0 )
223
222
)
224
223
success = True
224
+ time .sleep (rate_delay )
225
225
break # no need to try again
226
226
227
227
except HttpError as e :
@@ -230,7 +230,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
230
230
"Quota exceeded" in e .reason
231
231
and "Queries per day" in e .reason
232
232
):
233
- LOGGER .warning (f"{ e .status_code } : { e .reason } . " )
233
+ LOGGER .warning (f"{ e .status_code } : { e .reason } " )
234
234
return # abort queries
235
235
else :
236
236
LOGGER .warning (
0 commit comments