Skip to content

Commit cf9a5b3

Browse files
author
Greg Lindahl
committed
fix: code cleanups and refactors
1 parent 29fdf9b commit cf9a5b3

File tree

6 files changed

+92
-27
lines changed

6 files changed

+92
-27
lines changed

README.md

+2-2
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,6 @@ example programs need to be surt_host_names -- e.g. org,commoncrawl
101101
for commoncrawl.org, or org,commoncrawl, for *.commoncrawl.org.
102102

103103
- host.py -- generates a bunch of pngs, csvs, and a webpage summarizing a host or wildcarded host
104-
- count-hosts.py -- given a file containing a bunch of surt hostnames, plot fetch\_200 over time
105104

106105
## Example SQL queries
107106

@@ -200,12 +199,13 @@ LIMIT 10
200199
## Expected changes in test v3
201200

202201
- warc\_record\_length\_av will be renamed to \_avg (that was a typo)
203-
- more _pct columns
202+
- more \_pct columns
204203
- addition of indegree and outdegree for all hosts from the web graph
205204
- add unicode block information, similar to languages
206205
- improve language details to be more than only LOTE and LOTE_pct
207206
- prank10 needs its power law touched up (hcrank10 is much better)
208207
- there's a sort problem that .com shards have a smattering of not-.com hosts. This hurts performance.
208+
- add domain prank/hcrank
209209
- CI running against S3
210210

211211
## Contributing

duck_utils.py

+7-1
Original file line number | Diff line number | Diff line change
@@ -72,4 +72,10 @@ def open_host_index(paths=None, bucket='https://data.commoncrawl.org', grep=None
7272
else:
7373
if verbose:
7474
print(f'{len(paths)} paths found')
75-
return duckdb.read_parquet(paths, hive_partitioning=True)
75+
try:
76+
return duckdb.read_parquet(paths, hive_partitioning=True)
77+
except Exception:
78+
print('exception:')
79+
print('HOST_INDEX=', os.environ['HOST_INDEX'])
80+
print('HOST_INDEX_BUCKET=', os.environ['HOST_INDEX_BUCKET'])
81+
raise

host.py renamed to graph.py

+67-19
Original file line number | Diff line number | Diff line change
@@ -15,19 +15,30 @@
1515
SELECT
1616
{cols}
1717
FROM host_index
18-
WHERE surt_host_name = '{surt_host_name}'
18+
WHERE surt_host_name = '{surt_host_name}'{and_tld}
1919
ORDER BY crawl ASC
2020
'''
2121

22-
sub_host_sql = '''
22+
subdomain_sql = '''
2323
SELECT
2424
{cols}
2525
FROM host_index
26-
WHERE surt_host_name LIKE '{surt_host_name}'
26+
WHERE surt_host_name LIKE '{surt_host_name}'{and_tld}
2727
GROUP BY crawl
2828
ORDER BY crawl ASC
2929
'''
3030

31+
many_host_sql = '''
32+
SELECT
33+
crawl,
34+
CAST(SUM(fetch_200) AS INT64) AS sum_fetch_200
35+
FROM host_index
36+
WHERE contains(ARRAY [{surt_list}], surt_host_name){and_tld}
37+
GROUP BY crawl
38+
ORDER BY crawl ASC
39+
'''
40+
41+
3142
host_columns = {
3243
# list order does matter for the graphs
3344
'rank': ['crawl', 'fetch_200', 'fetch_200_lote', 'prank10', 'hcrank10'],
@@ -39,6 +50,9 @@
3950
domain_columns = {
4051
'sum': ['crawl', 'fetch_200', 'fetch_200_lote'],
4152
}
53+
many_host_columns = {
54+
'sum': ['crawl', 'fetch_200'],
55+
}
4256

4357
def surt_host_name_to_title(surt_host_name):
4458
parts = list(reversed(surt_host_name.split(',')))
@@ -48,14 +62,31 @@ def surt_host_name_to_title(surt_host_name):
4862

4963

5064
def get_surt_host_name_values(host_index, surt_host_name, col_names):
65+
if not isinstance(surt_host_name, str):
66+
# if not a string, it's a list of strings
67+
surt_list = ','.join(f"'{s}'" for s in surt_host_name)
68+
69+
tlds = set([s.split(',', 1)[0] for s in surt_host_name])
70+
if len(tlds) == 1:
71+
tld = next(iter(tlds))
72+
and_tld = f" AND url_host_tld = '{tld}'"
73+
else:
74+
and_tld = ''
75+
76+
sql = many_host_sql.format(surt_list=surt_list, and_tld=and_tld)
77+
return duckdb.sql(sql).arrow()
78+
79+
tld = surt_host_name.split(',', 1)[0]
80+
and_tld = f" AND url_host_tld = '{tld}'"
5181
if surt_host_name.endswith(','):
82+
col_names.remove('crawl')
5283
cols = ', '.join(f'CAST(SUM({col}) AS INT64) AS sum_{col}' for col in col_names)
5384
cols = 'crawl, '+cols
54-
sql = sub_host_sql.format(cols=cols, surt_host_name=surt_host_name+'%')
85+
sql = subdomain_sql.format(cols=cols, surt_host_name=surt_host_name+'%', and_tld=and_tld)
5586
print(sql)
5687
else:
5788
cols = ', '.join(col_names)
58-
sql = host_sql.format(cols=cols, surt_host_name=surt_host_name)
89+
sql = host_sql.format(cols=cols, surt_host_name=surt_host_name, and_tld=and_tld)
5990
print(sql)
6091
return duckdb.sql(sql).arrow()
6192

@@ -81,7 +112,8 @@ def host_plot_values(table, col_names, title):
81112
side = 'r'
82113
else:
83114
side = 'l'
84-
lines.append(['crawl', name, side, 'o', name])
115+
# x, y, side, marker, label
116+
lines.append(['crawl', name, side, None, name])
85117
return do_plot(df, lines, title)
86118

87119

@@ -99,7 +131,9 @@ def do_plot(df, lines, title):
99131
continue
100132
xvalues = df[x].astype(str)
101133
xvalues = [x.replace('CC-MAIN-', '') for x in xvalues]
102-
color, ls = graph_utils.get_color(i)
134+
ls = None
135+
#color, ls = graph_utils.get_color_ls(i)
136+
color, marker = graph_utils.get_color_marker(i)
103137
if side == 'l':
104138
our_line, = ax1.plot(xvalues, yvalues, marker=marker, label=label, color=color, ls=ls)
105139
else:
@@ -145,7 +179,7 @@ def get_host(host_index, surt_host_name, title):
145179

146180

147181
def plot_host(host_index, surt_host_name, title,
148-
do_csv=False, do_png=False, do_html=False, html_template = 'host.html'):
182+
do_csv=False, do_png=False, do_html=False, verbose=0, html_template = 'host.html'):
149183
tables, plots = get_host(host_index, surt_host_name, title)
150184
for key in tables:
151185
if do_csv:
@@ -168,7 +202,7 @@ def plot_host(host_index, surt_host_name, title,
168202
f.write(page)
169203

170204

171-
def get_domain(host_index, surt_host_name, title):
205+
def get_domain(host_index, surt_host_name, title, verbose=0):
172206
plots = {}
173207
tables = {}
174208
for key, cols in domain_columns.items():
@@ -183,11 +217,11 @@ def get_domain(host_index, surt_host_name, title):
183217

184218

185219
def plot_domain(host_index, surt_host_name, title,
186-
do_csv=False, do_png=False, do_html=False, html_template = 'domain.html'):
187-
tables, plots = get_domain(host_index, surt_host_name, title)
220+
do_csv=False, do_png=False, do_html=False, verbose=0, html_template = 'domain.html'):
221+
tables, plots = get_domain(host_index, surt_host_name, title, verbose=verbose)
188222
for key in tables:
189223
if do_csv:
190-
out = surt_host_name + '_' + key
224+
out = title + '_' + key
191225
host_csv(tables[key], out+'.csv')
192226
if do_png:
193227
with open (out+'.png', 'wb') as fd:
@@ -202,19 +236,19 @@ def plot_domain(host_index, surt_host_name, title,
202236
)
203237
template = env.get_template(html_template)
204238
page = template.render(title=title, plots=plots)
205-
with open(surt_host_name + '.html', 'w') as f:
239+
with open(title + '.html', 'w') as f:
206240
f.write(page)
207241

208242

209-
def make_plot(surt_host_name, host_index):
210-
title = surt_host_name_to_title(surt_host_name)
211-
if surt_host_name.endswith(','):
243+
def make_plot(surt_host_name, host_index, title, verbose=0):
244+
if not isinstance(surt_host_name, str) or surt_host_name.endswith(','):
212245
plot_domain(host_index, surt_host_name, title,
213-
do_csv=True, do_png=True, do_html=True)
246+
do_csv=False, do_png=False, do_html=True, verbose=verbose)
214247
return
215248

249+
print('plot host')
216250
plot_host(host_index, surt_host_name, title,
217-
do_csv=True, do_png=True, do_html=True)
251+
do_csv=True, do_png=True, do_html=True, verbose=verbose)
218252

219253

220254
def main():
@@ -223,11 +257,25 @@ def main():
223257
grep = None
224258
#grep = 'CC-MAIN-2022'
225259
host_index = duck_utils.open_host_index(grep=grep, verbose=verbose)
260+
if len(sys.argv) > 2 and sys.argv[1] == '-f':
261+
assert len(sys.argv) == 3
262+
surts = []
263+
title = sys.argv[2]
264+
with open(sys.argv[2], encoding='utf8') as fd:
265+
for thing in fd:
266+
surt_host_name = utils.thing_to_surt_host_name(thing.rstrip(), verbose=verbose)
267+
if surt_host_name:
268+
surts.append(surt_host_name)
269+
if verbose:
270+
print(f'making a plot for {len(surts)} hosts')
271+
make_plot(surts, host_index, title, verbose=verbose)
272+
return
226273
for thing in sys.argv[1:]:
227274
surt_host_name = utils.thing_to_surt_host_name(thing)
228275
if not surt_host_name:
229276
continue
230-
make_plot(surt_host_name, host_index)
277+
title = surt_host_name_to_title(surt_host_name)
278+
make_plot(surt_host_name, host_index, title, verbose=verbose)
231279

232280

233281
if __name__ == '__main__':

graph_utils.py

+6-1
Original file line number | Diff line number | Diff line change
@@ -16,11 +16,16 @@
1616
combinations = len(color_list) * len(ls_list)
1717

1818

19-
def get_color(i):
19+
def get_color_ls(i):
2020
assert i < combinations
2121
return color_list[i % 5], ls_list[i // 5]
2222

2323

24+
def get_color_marker(i):
25+
assert i < combinations
26+
return color_list[i % 5], marker_list[i // 5]
27+
28+
2429
def png_to_embed(png):
2530
png_b64 = base64.b64encode(png).decode('utf8')
2631
return '<img src="data:image/jpeg;base64,'+png_b64+'">'

test_utils.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
def test_thing_to_surt_host_name():
77
tests = [
8-
('va', 'va'),
8+
('va', 'va,'),
99
('va,', 'va,'),
1010
('va,*', 'va,'),
1111

@@ -17,10 +17,11 @@ def test_thing_to_surt_host_name():
1717
#('.example.com/', 'com,example,'), # python's library drops that leading dot
1818
('sub.example.com', 'com,example,sub'),
1919
('*.sub.example.com', 'com,example,sub,'),
20+
21+
('example.com/foo', None),
2022
]
2123

2224
value_error = [
23-
('example.com/foo', ''),
2425
('example.com,', ''),
2526
]
2627

utils.py

+7-2
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import surt
22

33

4-
def thing_to_surt_host_name(thing):
4+
def thing_to_surt_host_name(thing, verbose=0):
55
'''convert a thing, probably an url of some kind, to a surt host name'''
66
orig = thing
77
if '/' not in thing and '.' not in thing:
@@ -11,10 +11,15 @@ def thing_to_surt_host_name(thing):
1111
raise ValueError('unexpected * in '+thing)
1212
if ',,' in thing:
1313
raise ValueError('unexpected ,, in '+thing)
14+
if ',' not in thing:
15+
# assume it is a tld wildcard
16+
thing += ','
1417
return thing
1518
surt_host_name, extra = surt.surt(thing).split(')/', 1)
1619
if extra:
17-
raise ValueError(f'skipping {orig} because extra is {extra}')
20+
if verbose:
21+
print(f'skipping {orig} because extra is {extra}')
22+
return
1823
if surt_host_name.endswith(',*'):
1924
surt_host_name = surt_host_name[:-1]
2025
if ',,' in surt_host_name:

0 commit comments

Comments
 (0)