Skip to content

Commit f77ea63

Browse files
committed
internetarchive_scratcher refined
1 parent 1e1bf8e commit f77ea63

File tree

1 file changed

+36
-14
lines changed

1 file changed

+36
-14
lines changed

internetarchive/internetarchive_scratcher.py

+36-14
Original file line numberDiff line numberDiff line change
@@ -18,37 +18,55 @@
1818
from internetarchive.search import Search
1919
from internetarchive.session import ArchiveSession
2020

21-
today = dt.datetime.today()
21+
# Set up the script's own directory (CWD) and its parent directory (root_path)
2222
CWD = os.path.dirname(os.path.abspath(__file__))
23+
root_path = os.path.dirname(CWD)
24+
25+
# Get today's date and build the dated output file path inside CWD
26+
today = dt.datetime.today()
2327
DATA_WRITE_FILE = (
2428
f"{CWD}"
2529
f"/data_internetarchive_{today.year}_{today.month}_{today.day}.csv"
2630
)
2731

2832

2933
def get_license_list():
30-
"""Provides the list of license from a Creative Commons provided tool list.
34+
"""
35+
Provides the list of licenses from a Creative Commons-provided tool list.
36+
3137
Returns:
32-
np.array: An np array containing all license types that should be
33-
searched via Programmable Search Engine.
38+
- np.array:
39+
An np array containing all license types that should be
40+
searched from Internet Archive.
3441
"""
35-
cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
36-
license_list = cc_license_data[0].unique()
42+
# Read license data from file
43+
cc_license_data = pd.read_csv(f"{root_path}/legal-tool-paths.txt", header=None)
44+
# Define regex pattern to extract license types
45+
license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
46+
license_list = (
47+
cc_license_data[0]
48+
.str.extract(license_pattern, expand=False)
49+
.dropna()
50+
.unique()
51+
)
3752
return license_list
3853

3954

4055
def get_response_elems(license):
41-
"""Provides the metadata for query of specified parameters
56+
"""
57+
Provides the metadata for a query with the specified parameters.
58+
4259
Args:
43-
license:
60+
- license:
4461
A string representing the type of license, and should be a segment
4562
of its URL towards the license description. Alternatively, the
4663
default None value stands for having no assumption about license
4764
type.
4865
4966
Returns:
50-
dict: A dictionary mapping metadata to its value provided from the API
51-
query of specified parameters.
67+
- dict:
68+
A dictionary mapping metadata to its value provided from the API
69+
query of specified parameters.
5270
"""
5371
try:
5472
max_retries = Retry(
@@ -72,16 +90,18 @@ def get_response_elems(license):
7290

7391

7492
def set_up_data_file():
75-
"""Writes the header row to file to contain IA data."""
93+
# Writes the header row to file to contain IA data.
7694
header_title = "LICENSE TYPE,Document Count"
7795
with open(DATA_WRITE_FILE, "w") as f:
7896
f.write(f"{header_title}\n")
7997

8098

8199
def record_license_data(license_type):
82-
"""Writes the row for LICENSE_TYPE to file to contain IA Query data.
100+
"""
101+
Writes the row for LICENSE_TYPE to file to contain IA Query data.
102+
83103
Args:
84-
license_type:
104+
- license_type:
85105
A string representing the type of license, and should be a segment
86106
of its URL towards the license description. Alternatively, the
87107
default None value stands for having no assumption about license
@@ -96,9 +116,11 @@ def record_license_data(license_type):
96116

97117

98118
def record_all_licenses():
99-
"""Records the data of all license types findable in the license list and
119+
"""
120+
Records the data of all license types findable in the license list and
100121
records these data into the DATA_WRITE_FILE as specified in that constant.
101122
"""
123+
# Gets the list of license types and records data for each license type
102124
license_list = get_license_list()
103125
for license_type in license_list:
104126
record_license_data(license_type)

0 commit comments

Comments
 (0)