18
18
from internetarchive .search import Search
19
19
from internetarchive .session import ArchiveSession
20
20
21
- today = dt . datetime . today ()
21
+ # Set up current working directory (CWD) and root_path
22
22
CWD = os .path .dirname (os .path .abspath (__file__ ))
23
+ root_path = os .path .dirname (CWD )
24
+
25
+ # Get today's date; it is used to name the data file written in CWD
26
+ today = dt .datetime .today ()
23
27
DATA_WRITE_FILE = (
24
28
f"{ CWD } "
25
29
f"/data_internetarchive_{ today .year } _{ today .month } _{ today .day } .csv"
26
30
)
27
31
28
32
29
33
def get_license_list ():
30
- """Provides the list of license from a Creative Commons provided tool list.
34
+ """
35
+ Provides the list of licenses from a Creative Commons provided tool list.
36
+
31
37
Returns:
32
- np.array: An np array containing all license types that should be
33
- searched via Programmable Search Engine.
38
+ - np.array:
39
+ An np array containing all license types that should be
40
+ searched from Internet Archive.
34
41
"""
35
- cc_license_data = pd .read_csv (f"{ CWD } /legal-tool-paths.txt" , header = None )
36
- license_list = cc_license_data [0 ].unique ()
42
+ # Read license data from file
43
+ cc_license_data = pd .read_csv (f"{ root_path } /legal-tool-paths.txt" , header = None )
44
+ # Regex capturing the first three "/"-separated segments of each path
45
+ license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
46
+ license_list = (
47
+ cc_license_data [0 ]
48
+ .str .extract (license_pattern , expand = False )
49
+ .dropna ()
50
+ .unique ()
51
+ )
37
52
return license_list
38
53
39
54
40
55
def get_response_elems (license ):
41
- """Provides the metadata for query of specified parameters
56
+ """
57
+ Provides the metadata for query of specified parameters
58
+
42
59
Args:
43
- license:
60
+ - license:
44
61
A string representing the type of license, and should be a segment
45
62
of its URL towards the license description. Alternatively, the
46
63
default None value stands for having no assumption about license
47
64
type.
48
65
49
66
Returns:
50
- dict: A dictionary mapping metadata to its value provided from the API
51
- query of specified parameters.
67
+ - dict:
68
+ A dictionary mapping metadata to its value provided from the API
69
+ query of specified parameters.
52
70
"""
53
71
try :
54
72
max_retries = Retry (
@@ -72,16 +90,18 @@ def get_response_elems(license):
72
90
73
91
74
92
def set_up_data_file ():
75
- """ Writes the header row to file to contain IA data."""
93
+ # Writes the header row to file to contain IA data.
76
94
header_title = "LICENSE TYPE,Document Count"
77
95
with open (DATA_WRITE_FILE , "w" ) as f :
78
96
f .write (f"{ header_title } \n " )
79
97
80
98
81
99
def record_license_data (license_type ):
82
- """Writes the row for LICENSE_TYPE to file to contain IA Query data.
100
+ """
101
+ Writes the row for LICENSE_TYPE to file to contain IA Query data.
102
+
83
103
Args:
84
- license_type:
104
+ - license_type:
85
105
A string representing the type of license, and should be a segment
86
106
of its URL towards the license description. Alternatively, the
87
107
default None value stands for having no assumption about license
@@ -96,9 +116,11 @@ def record_license_data(license_type):
96
116
97
117
98
118
def record_all_licenses ():
99
- """Records the data of all license types findable in the license list and
119
+ """
120
+ Records the data of all license types findable in the license list and
100
121
records these data into the DATA_WRITE_FILE as specified in that constant.
101
122
"""
123
+ # Get the list of license types and record data for each one
102
124
license_list = get_license_list ()
103
125
for license_type in license_list :
104
126
record_license_data (license_type )
0 commit comments