Skip to content

Commit 121187e

Browse files
committed
expo backoff, code quality, general update
1 parent f318e6c commit 121187e

File tree

2 files changed

+73
-92
lines changed

2 files changed

+73
-92
lines changed

wikicommons/data_wikicommons_2022_10_18.txt renamed to wikicommons/data_wikicommons_2022_10_21.csv

+26-26
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
LICENSE TYPE,File Count, Page Count
1+
LICENSE TYPE,File Count,Page Count
22
Free_Creative_Commons_licenses,0,0
33
Free_Creative_Commons_licenses/CC-BY,1,0
44
Free_Creative_Commons_licenses/CC-BY/CC-BY-1.0,7722,0
55
Free_Creative_Commons_licenses/CC-BY/CC-BY-1.0/CC-BY-1.0+,7722,0
66
Free_Creative_Commons_licenses/CC-BY/CC-BY-1.0/CC-BY-1.0+/CC-BY-1.0-FI,93,0
77
Free_Creative_Commons_licenses/CC-BY/CC-BY-1.0/CC-BY-1.0+/CC-BY-1.0-IL,1,0
88
Free_Creative_Commons_licenses/CC-BY/CC-BY-1.0/CC-BY-1.0+/CC-BY-1.0-NL,45,0
9-
Free_Creative_Commons_licenses/CC-BY/CC-BY-1.0/CC-BY-1.0+/CC-BY-3.0,2.5,2.0,1.0,18410,0
10-
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.0,6122659,0
9+
Free_Creative_Commons_licenses/CC-BY/CC-BY-1.0/CC-BY-1.0+/CC-BY-3.0|2.5|2.0|1.0,18410,0
10+
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.0,6127008,0
1111
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.0/CC-BY-2.0-AT,231,0
1212
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.0/CC-BY-2.0-AU,0,0
1313
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.0/CC-BY-2.0-BE,74,0
@@ -21,8 +21,8 @@ Free_Creative_Commons_licenses/CC-BY/CC-BY-2.1,0,0
2121
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.1/CC-BY-2.1-AU,21,0
2222
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.1/CC-BY-2.1-ES,33,0
2323
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.1/CC-BY-2.1-JP,6061,0
24-
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5,360264,0
25-
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5/CC-BY-2.5-AR,5712,0
24+
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5,360378,0
25+
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5/CC-BY-2.5-AR,5709,0
2626
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5/CC-BY-2.5-AU,6658,0
2727
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5/CC-BY-2.5-BG,399,0
2828
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5/CC-BY-2.5-BR,252,0
@@ -31,9 +31,9 @@ Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5/CC-BY-2.5-CH,312,0
3131
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5/CC-BY-2.5-CN,88,0
3232
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5/CC-BY-2.5-CO,6,0
3333
Free_Creative_Commons_licenses/CC-BY/CC-BY-2.5/CC-BY-2.5-DK,5914,0
34-
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0,3053828,0
34+
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0,3054729,0
3535
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-AT,5811,0
36-
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-AU,8883,0
36+
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-AU,8886,0
3737
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-BR,13465,0
3838
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-BR/Photographs by Agencia Brasil,10712,0
3939
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-BR/Photographs by Agencia Brasil/Agência Brasil related uploads affected by license change,1,0
@@ -51,60 +51,60 @@ Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-BR/Images by TV Brasil,
5151
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-BR/Images by TV Brasil/TV Brasil related deletion requests,1,5
5252
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-BR/Images by TV Brasil/Photographs by TV Brasil,350,0
5353
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CH,153,0
54-
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CL,3838,0
54+
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CL,3830,0
5555
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CL/CC-Alejandro Hales BCN,25,0
56-
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CL/CC-Gobierno de Chile,2432,0
56+
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CL/CC-Gobierno de Chile,2424,0
5757
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CL/CC-Historia Política BCN,1326,0
5858
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CN,102,0
5959
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CR,0,0
6060
Free_Creative_Commons_licenses/CC-BY/CC-BY-3.0/CC-BY-3.0-CZ,437,0
61-
Free_Creative_Commons_licenses/CC-BY/CC-BY-4.0,1957394,0
61+
Free_Creative_Commons_licenses/CC-BY/CC-BY-4.0,1958761,0
6262
Free_Creative_Commons_licenses/CC-BY-SA,4,0
63-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0,30472,1
64-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+,30472,1
63+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0,30475,1
64+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+,30475,1
6565
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+/CC-BY-SA-1.0-FI,148,0
6666
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+/CC-BY-SA-1.0-IL,0,0
6767
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+/CC-BY-SA-1.0-NL,2,0
68-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+/CC-BY-SA-2.5,2.0,1.0,293277,0
69-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+/CC-BY-SA-3.0,2.5,2.0,1.0,1120674,0
70-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+/CC-BY-SA-4.0,3.0,2.5,2.0,1.0,413538,0
71-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0,8288464,17
72-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+,8288464,17
68+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+/CC-BY-SA-2.5|2.0|1.0,293304,0
69+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+/CC-BY-SA-3.0|2.5|2.0|1.0,1120719,0
70+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-1.0/CC-BY-SA-1.0+/CC-BY-SA-4.0|3.0|2.5|2.0|1.0,413605,0
71+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0,8293847,17
72+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+,8293847,17
7373
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+/CC-BY-SA-2.0-AT,773,0
7474
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+/CC-BY-SA-2.0-AU,3,0
7575
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+/CC-BY-SA-2.0-BE,1032,0
7676
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+/CC-BY-SA-2.0-BR,218,0
7777
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+/CC-BY-SA-2.0-CA,316,0
7878
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+/CC-BY-SA-2.0-CL,15,0
79-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+/CC-BY-SA-2.0-DE,37204,0
79+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+/CC-BY-SA-2.0-DE,37206,0
8080
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.0/CC-BY-SA-2.0+/CC-BY-SA-2.0-ES,44,0
8181
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.1,0,0
8282
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.1/CC-BY-SA-2.1-AU,61,0
8383
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.1/CC-BY-SA-2.1-ES,715,0
8484
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.1/CC-BY-SA-2.1-JP,23612,0
85-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.5,338304,0
85+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.5,338337,0
8686
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.5/CC-BY-SA-2.5-AR,998,0
8787
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.5/CC-BY-SA-2.5-AU,1774,0
8888
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.5/CC-BY-SA-2.5-BG,0,0
8989
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.5/CC-BY-SA-2.5-BR,521,0
9090
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.5/CC-BY-SA-2.5-CA,1701,0
9191
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.5/CC-BY-SA-2.5-CH,3009,0
9292
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-2.5/CC-BY-SA-2.5-CN,38,0
93-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0,10310341,0
93+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0,10311807,0
9494
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0/CC-BY-SA-3.0-AT,111610,0
95-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0/CC-BY-SA-3.0-AU,168579,0
95+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0/CC-BY-SA-3.0-AU,168580,0
9696
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0/CC-BY-SA-3.0-BG,0,0
9797
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0/CC-BY-SA-3.0-BR,3665,0
98-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0/CC-BY-SA-3.0-CH,1666,0
98+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0/CC-BY-SA-3.0-CH,1664,0
9999
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0/CC-BY-SA-3.0-CL,2798,0
100100
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-3.0/CC-BY-SA-3.0-CN,142,0
101-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-4.0,22015789,11
102-
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-4.0/CC-BY-SA-4.0,3.0,33,0
101+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-4.0,22031587,11
102+
Free_Creative_Commons_licenses/CC-BY-SA/CC-BY-SA-4.0/CC-BY-SA-4.0|3.0,33,0
103103
Free_Creative_Commons_licenses/CC-PD,9910,0
104104
Free_Creative_Commons_licenses/CC-PD/All LibriVox recordings,5461,0
105105
Free_Creative_Commons_licenses/CC-PD/LibriVox - The Federalist Papers,85,0
106106
Free_Creative_Commons_licenses/CC-SA-1.0,4446,0
107-
Free_Creative_Commons_licenses/CC-Zero,5390598,2
107+
Free_Creative_Commons_licenses/CC-Zero,5391981,2
108108
Free_Creative_Commons_licenses/CC-Zero/PD OpenClipart,2666,0
109109
Free_Creative_Commons_licenses/CC-Zero/PD OpenClipart/OpenClipart playing cards,0,0
110110
Free_Creative_Commons_licenses/CC-Zero/PD OpenClipart/OpenClipart playing cards/OpenClipart bordered playing cards,54,0
@@ -116,7 +116,7 @@ Free_Creative_Commons_licenses/CC-Zero/Collections of the Biblioteca Virtual de
116116
Free_Creative_Commons_licenses/CC-Zero/Collections of the Biblioteca Virtual de Defensa/Documents from the Biblioteca Virtual de Defensa,17,0
117117
Free_Creative_Commons_licenses/CC-Zero/Collections of the Biblioteca Virtual de Defensa/Images from the Biblioteca Virtual de Defensa,163,0
118118
Free_Creative_Commons_licenses/CC-Zero/Media from piviso.com,1,0
119-
Free_Creative_Commons_licenses/CC-Zero/CC0 files by PantheraLeo1359531,573,0
119+
Free_Creative_Commons_licenses/CC-Zero/CC0 files by PantheraLeo1359531,576,0
120120
Free_Creative_Commons_licenses/CC-Zero/Files from Pexels,271,0
121121
Free_Creative_Commons_licenses/CC-Zero/Images from Pixabay,5408,0
122122
Free_Creative_Commons_licenses/CC-Zero/Images from Pixabay/Files from Pixabay by user,0,2

wikicommons/wikicommons-scratcher.py

+47-66
Original file line numberDiff line numberDiff line change
@@ -6,42 +6,22 @@
66
# Standard library
77
import datetime as dt
88
import os
9-
import random
109
import sys
11-
import time
1210
import traceback
1311

1412
# Third-party
1513
import requests
14+
from requests.adapters import HTTPAdapter
15+
from urllib3.util.retry import Retry
1616

17+
today = dt.datetime.today()
1718
CWD = os.path.dirname(os.path.abspath(__file__))
18-
CALLBACK_INDEX = 2
19-
CALLBACK_EXPO = 0
20-
MAX_WAIT = 64
21-
DATA_WRITE_FILE = CWD
19+
DATA_WRITE_FILE = (
20+
f"{CWD}" f"/data_wikicommons_{today.year}_{today.month}_{today.day}.csv"
21+
)
2222
LICENSE_CACHE = {}
2323

2424

25-
def expo_backoff():
26-
"""Performs exponential backoff upon call.
27-
The function will force a wait of CALLBACK_INDEX ** CALLBACK_EXPO + r
28-
seconds, where r is a decimal number between 0.001 and 0.999, inclusive.
29-
If that value is higher than MAX_WAIT, then it will just wait MAX_WAIT
30-
seconds instead.
31-
"""
32-
global CALLBACK_EXPO
33-
backoff = random.randint(1, 1000) / 1000 + CALLBACK_INDEX**CALLBACK_EXPO
34-
time.sleep(min(backoff, MAX_WAIT))
35-
if backoff < MAX_WAIT:
36-
CALLBACK_EXPO += 1
37-
38-
39-
def expo_backoff_reset():
40-
"""Resets the CALLBACK_EXPO to 0."""
41-
global CALLBACK_EXPO
42-
CALLBACK_EXPO = 0
43-
44-
4525
def get_content_request_url(license):
4626
"""Provides the API Endpoint URL for specified parameters' WikiCommons
4727
contents.
@@ -57,12 +37,11 @@ def get_content_request_url(license):
5737
string: A string representing the API Endpoint URL for the query
5838
specified by this function's parameters.
5939
"""
60-
base_url = (
40+
return (
6141
r"https://commons.wikimedia.org/w/api.php?"
6242
r"action=query&prop=categoryinfo&titles="
6343
f"Category:{license}&format=json"
6444
)
65-
return base_url
6645

6746

6847
def get_subcat_request_url(license):
@@ -89,7 +68,7 @@ def get_subcat_request_url(license):
8968
return base_url
9069

9170

92-
def get_subcategories(license, eb=False):
71+
def get_subcategories(license):
9372
"""Obtain the subcategories of LICENSE in WikiCommons Database for
9473
recursive searching.
9574
@@ -99,9 +78,6 @@ def get_subcategories(license, eb=False):
9978
of its URL towards the license description. Alternatively, the
10079
default None value stands for having no assumption about license
10180
type.
102-
eb:
103-
A boolean indicating whether there should be exponential callback.
104-
Is by default False.
10581
10682
Returns:
10783
list: A list representing the subcategories of current license type
@@ -110,18 +86,24 @@ def get_subcategories(license, eb=False):
11086
"""
11187
try:
11288
request_url = get_subcat_request_url(license)
113-
search_data = requests.get(request_url).json()
114-
cat_list = []
89+
max_retries = Retry(
90+
total=5,
91+
backoff_factor=10,
92+
status_forcelist=[403, 408, 429, 500, 502, 503, 504],
93+
)
94+
session = requests.Session()
95+
session.mount("https://", HTTPAdapter(max_retries=max_retries))
96+
with session.get(request_url) as response:
97+
response.raise_for_status()
98+
search_data = response.json()
99+
category_list = []
115100
for members in search_data["query"]["categorymembers"]:
116-
cat_list.append(
101+
category_list.append(
117102
members["title"].replace("Category:", "").replace("&", "%26")
118103
)
119-
return cat_list
120-
except Exception as e:
121-
if eb:
122-
expo_backoff()
123-
get_subcategories(license)
124-
elif "query" not in search_data:
104+
return category_list
105+
except Exception:
106+
if "query" not in search_data:
125107
print(search_data)
126108
print("This query will not be processed due to empty subcats.")
127109
else:
@@ -130,7 +112,7 @@ def get_subcategories(license, eb=False):
130112
sys.exit(1)
131113

132114

133-
def get_license_contents(license, eb=False):
115+
def get_license_contents(license):
134116
"""Provides the metadata for query of specified parameters.
135117
136118
Args:
@@ -139,17 +121,23 @@ def get_license_contents(license, eb=False):
139121
of its URL towards the license description. Alternatively, the
140122
default None value stands for having no assumption about license
141123
type.
142-
eb:
143-
A boolean indicating whether there should be exponential callback.
144-
Is by default False.
145124
146125
Returns:
147126
dict: A dictionary mapping metadata to its value provided from the API
148127
query of specified parameters.
149128
"""
150129
try:
151-
url = get_content_request_url(license)
152-
search_data = requests.get(url).json()
130+
request_url = get_content_request_url(license)
131+
max_retries = Retry(
132+
total=5,
133+
backoff_factor=10,
134+
status_forcelist=[403, 408, 429, 500, 502, 503, 504],
135+
)
136+
session = requests.Session()
137+
session.mount("https://", HTTPAdapter(max_retries=max_retries))
138+
with session.get(request_url) as response:
139+
response.raise_for_status()
140+
search_data = response.json()
153141
file_cnt = 0
154142
page_cnt = 0
155143
for id in search_data["query"]["pages"]:
@@ -161,11 +149,8 @@ def get_license_contents(license, eb=False):
161149
"total_page_cnt": page_cnt,
162150
}
163151
return search_data_dict
164-
except Exception as e:
165-
if eb:
166-
expo_backoff()
167-
get_license_contents(license)
168-
elif "queries" not in search_data:
152+
except Exception:
153+
if "queries" not in search_data:
169154
print(search_data)
170155
print("This query will not be processed due to empty result.")
171156
else:
@@ -176,7 +161,7 @@ def get_license_contents(license, eb=False):
176161

177162
def set_up_data_file():
178163
"""Writes the header row to file to contain WikiCommons Query data."""
179-
header_title = "LICENSE TYPE,File Count, Page Count"
164+
header_title = "LICENSE TYPE,File Count,Page Count"
180165
with open(DATA_WRITE_FILE, "a") as f:
181166
f.write(header_title + "\n")
182167

@@ -196,8 +181,9 @@ def record_license_data(license_type, license_alias):
196181
eventual efforts of aggregating data.
197182
"""
198183
search_result = get_license_contents(license_type)
184+
cleaned_alias = license_alias.replace(",", "|")
199185
data_log = (
200-
f"{license_alias},"
186+
f"{cleaned_alias},"
201187
f"{search_result['total_file_cnt']},{search_result['total_page_cnt']}"
202188
)
203189
with open(DATA_WRITE_FILE, "a") as f:
@@ -216,25 +202,20 @@ def recur_record_all_licenses(alias="Free_Creative_Commons_licenses"):
216202
eventual efforts of aggregating data. Defaults to
217203
"Free_Creative_Commons_licenses".
218204
"""
219-
cur_cat = alias.split("/")[-1]
220-
subcategories = get_subcategories(cur_cat)
221-
if cur_cat not in LICENSE_CACHE:
222-
record_license_data(cur_cat, alias)
223-
LICENSE_CACHE[cur_cat] = True
224-
print("DEBUG", f"Logged {cur_cat} from {alias}")
205+
alias.replace(",", "|")
206+
cur_category = alias.split("/")[-1]
207+
subcategories = get_subcategories(cur_category)
208+
if cur_category not in LICENSE_CACHE:
209+
record_license_data(cur_category, alias)
210+
LICENSE_CACHE[cur_category] = True
211+
print("DEBUG", f"Logged {cur_category} from {alias}")
225212
for cats in subcategories:
226213
recur_record_all_licenses(alias=f"{alias}/{cats}")
227214

228215

229216
def main():
230-
global DATA_WRITE_FILE
231-
today = dt.datetime.today()
232-
DATA_WRITE_FILE += (
233-
f"/data_wikicommons_{today.year}_{today.month}_{today.day}.txt"
234-
)
235217
set_up_data_file()
236218
recur_record_all_licenses()
237-
DATA_WRITE_FILE = CWD
238219

239220

240221
if __name__ == "__main__":

0 commit comments

Comments
 (0)