Skip to content

Commit 5b2f7d7

Browse files
authored
Merge pull request #142 from creativecommons/update-gcs-fetch
GCS fetch: add discovery num_retries and query rate_delay
2 parents 670d1a6 + 6a5e557 commit 5b2f7d7

File tree

1 file changed

+12
-12
lines changed

1 file changed

+12
-12
lines changed

scripts/1-fetch/gcs_fetch.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import time
1313
import traceback
1414
import urllib.parse
15+
from copy import copy
1516

1617
# Third-party
1718
import googleapiclient.discovery
@@ -34,14 +35,14 @@
3435
load_dotenv(PATHS["dotenv"])
3536

3637
# Constants
37-
DEVELOPER_KEY = os.getenv("GCS_DEVELOPER_KEY")
38-
CX = os.getenv("GCS_CX")
3938
BASE_URL = "https://www.googleapis.com/customsearch/v1"
4039
FILE1_COUNT = os.path.join(PATHS["data_phase"], "gcs_1_count.csv")
4140
FILE2_LANGUAGE = os.path.join(
4241
PATHS["data_phase"], "gcs_2_count_by_language.csv"
4342
)
4443
FILE3_COUNTRY = os.path.join(PATHS["data_phase"], "gcs_3_count_by_country.csv")
44+
GCS_CX = os.getenv("GCS_CX")
45+
GCS_DEVELOPER_KEY = os.getenv("GCS_DEVELOPER_KEY")
4546
HEADER1_COUNT = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNT"]
4647
HEADER2_LANGUAGE = ["PLAN_INDEX", "TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
4748
HEADER3_COUNTRY = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
@@ -87,7 +88,11 @@ def get_search_service():
8788
"""
8889
LOGGER.info("Getting Google Custom Search API Service.")
8990
return googleapiclient.discovery.build(
90-
"customsearch", "v1", developerKey=DEVELOPER_KEY, cache_discovery=False
91+
"customsearch",
92+
"v1",
93+
developerKey=GCS_DEVELOPER_KEY,
94+
cache_discovery=False,
95+
num_retries=5,
9196
)
9297

9398

@@ -184,21 +189,15 @@ def query_gcs(args, service, last_completed_plan_index, plan):
184189

185190
max_tries = 5
186191
initial_delay = 1 # in seconds
192+
rate_delay = copy(initial_delay) # query gently
187193
start = last_completed_plan_index + 1
188194
stop = start + args.limit
189195

190196
for plan_row in plan[start:stop]: # noqa: E203
191197
index = plan.index(plan_row)
192198
query_info = f"index: {index}, tool: {plan_row['TOOL_IDENTIFIER']}"
193199
encoded_tool_url = urllib.parse.quote(plan_row["TOOL_URL"], safe=":/")
194-
query_params = {
195-
"cx": CX,
196-
# "num": records_per_query,
197-
# "start": start_index,
198-
# "cr": cr,
199-
# "lr": lr,
200-
"q": encoded_tool_url,
201-
}
200+
query_params = {"cx": GCS_CX, "q": encoded_tool_url}
202201
if plan_row["COUNTRY"]:
203202
query_info = f"{query_info}, country: {plan_row['COUNTRY']}"
204203
query_params["cr"] = plan_row["CR"]
@@ -222,6 +221,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
222221
results.get("searchInformation", {}).get("totalResults", 0)
223222
)
224223
success = True
224+
time.sleep(rate_delay)
225225
break # no need to try again
226226

227227
except HttpError as e:
@@ -230,7 +230,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
230230
"Quota exceeded" in e.reason
231231
and "Queries per day" in e.reason
232232
):
233-
LOGGER.warning(f"{e.status_code}: {e.reason}.")
233+
LOGGER.warning(f"{e.status_code}: {e.reason}")
234234
return # abort queries
235235
else:
236236
LOGGER.warning(

0 commit comments

Comments (0)