15
15
SELECT
16
16
{cols}
17
17
FROM host_index
18
- WHERE surt_host_name = '{surt_host_name}'
18
+ WHERE surt_host_name = '{surt_host_name}'{and_tld}
19
19
ORDER BY crawl ASC
20
20
'''
21
21
22
- sub_host_sql = '''
22
+ subdomain_sql = '''
23
23
SELECT
24
24
{cols}
25
25
FROM host_index
26
- WHERE surt_host_name LIKE '{surt_host_name}'
26
+ WHERE surt_host_name LIKE '{surt_host_name}'{and_tld}
27
27
GROUP BY crawl
28
28
ORDER BY crawl ASC
29
29
'''
30
30
31
+ many_host_sql = '''
32
+ SELECT
33
+ crawl,
34
+ CAST(SUM(fetch_200) AS INT64) AS sum_fetch_200
35
+ FROM host_index
36
+ WHERE contains(ARRAY [{surt_list}], surt_host_name){and_tld}
37
+ GROUP BY crawl
38
+ ORDER BY crawl ASC
39
+ '''
40
+
41
+
31
42
host_columns = {
32
43
# list order does matter for the graphs
33
44
'rank' : ['crawl' , 'fetch_200' , 'fetch_200_lote' , 'prank10' , 'hcrank10' ],
39
50
domain_columns = {
40
51
'sum' : ['crawl' , 'fetch_200' , 'fetch_200_lote' ],
41
52
}
53
+ many_host_columns = {
54
+ 'sum' : ['crawl' , 'fetch_200' ],
55
+ }
42
56
43
57
def surt_host_name_to_title (surt_host_name ):
44
58
parts = list (reversed (surt_host_name .split (',' )))
@@ -48,14 +62,31 @@ def surt_host_name_to_title(surt_host_name):
48
62
49
63
50
64
def _sql_escape(text):
    # Double embedded single quotes so text is safe inside a '...' SQL literal.
    return text.replace("'", "''")


def get_surt_host_name_values(host_index, surt_host_name, col_names):
    """Run the host-index query matching the kind of `surt_host_name` given.

    Three query shapes are supported:

    * a non-string iterable of SURT host names: `many_host_sql` sums
      `fetch_200` per crawl over all listed hosts (`col_names` is not used);
    * a string ending in ',': `subdomain_sql` prefix-matches
      (`LIKE surt_host_name + '%'`) and sums every column except `crawl`,
      grouped by crawl;
    * any other string: `host_sql` selects `col_names` for that exact host.

    The first comma-separated component of a SURT host name is used as an
    extra `url_host_tld` filter. For a list of hosts the filter is only
    added when every host shares the same first component.

    Args:
        host_index: not referenced by name in the Python code below; the SQL
            templates select `FROM host_index`.
            NOTE(review): presumably DuckDB resolves that table name from
            this local variable, so the `duckdb.sql` calls are kept inside
            this function and the parameter is not renamed -- confirm.
        surt_host_name: a SURT host name string, or an iterable of them.
        col_names: column names to select or sum. The caller's list is
            never modified.

    Returns:
        The result of `duckdb.sql(sql).arrow()`.
    """
    if not isinstance(surt_host_name, str):
        # Anything that is not a string is treated as a collection of SURT
        # strings. Materialize it once: it is walked twice below, which
        # would silently yield nothing the second time for a generator.
        surts = list(surt_host_name)
        surt_list = ','.join(f"'{_sql_escape(s)}'" for s in surts)

        tlds = {s.split(',', 1)[0] for s in surts}
        if len(tlds) == 1:
            tld = next(iter(tlds))
            and_tld = f" AND url_host_tld = '{_sql_escape(tld)}'"
        else:
            # Mixed (or no) TLDs: a single equality filter cannot be used.
            and_tld = ''

        sql = many_host_sql.format(surt_list=surt_list, and_tld=and_tld)
        return duckdb.sql(sql).arrow()

    tld = surt_host_name.split(',', 1)[0]
    and_tld = f" AND url_host_tld = '{_sql_escape(tld)}'"
    escaped_name = _sql_escape(surt_host_name)

    if surt_host_name.endswith(','):
        # 'crawl' is the GROUP BY key, not something to SUM. Filter into a
        # new list instead of calling col_names.remove('crawl'): the caller
        # passes lists that live in module-level dicts, and mutating them
        # made a second domain query fail with ValueError.
        sum_cols = [col for col in col_names if col != 'crawl']
        cols = ', '.join(f'CAST(SUM({col}) AS INT64) AS sum_{col}' for col in sum_cols)
        cols = 'crawl, ' + cols
        sql = subdomain_sql.format(cols=cols, surt_host_name=escaped_name + '%', and_tld=and_tld)
    else:
        cols = ', '.join(col_names)
        sql = host_sql.format(cols=cols, surt_host_name=escaped_name, and_tld=and_tld)

    print(sql)
    return duckdb.sql(sql).arrow()
61
92
@@ -81,7 +112,8 @@ def host_plot_values(table, col_names, title):
81
112
side = 'r'
82
113
else :
83
114
side = 'l'
84
- lines .append (['crawl' , name , side , 'o' , name ])
115
+ # x, y, side, marker, label
116
+ lines .append (['crawl' , name , side , None , name ])
85
117
return do_plot (df , lines , title )
86
118
87
119
@@ -99,7 +131,9 @@ def do_plot(df, lines, title):
99
131
continue
100
132
xvalues = df [x ].astype (str )
101
133
xvalues = [x .replace ('CC-MAIN-' , '' ) for x in xvalues ]
102
- color , ls = graph_utils .get_color (i )
134
+ ls = None
135
+ #color, ls = graph_utils.get_color_ls(i)
136
+ color , marker = graph_utils .get_color_marker (i )
103
137
if side == 'l' :
104
138
our_line , = ax1 .plot (xvalues , yvalues , marker = marker , label = label , color = color , ls = ls )
105
139
else :
@@ -145,7 +179,7 @@ def get_host(host_index, surt_host_name, title):
145
179
146
180
147
181
def plot_host (host_index , surt_host_name , title ,
148
- do_csv = False , do_png = False , do_html = False , html_template = 'host.html' ):
182
+ do_csv = False , do_png = False , do_html = False , verbose = 0 , html_template = 'host.html' ):
149
183
tables , plots = get_host (host_index , surt_host_name , title )
150
184
for key in tables :
151
185
if do_csv :
@@ -168,7 +202,7 @@ def plot_host(host_index, surt_host_name, title,
168
202
f .write (page )
169
203
170
204
171
- def get_domain (host_index , surt_host_name , title ):
205
+ def get_domain (host_index , surt_host_name , title , verbose = 0 ):
172
206
plots = {}
173
207
tables = {}
174
208
for key , cols in domain_columns .items ():
@@ -183,11 +217,11 @@ def get_domain(host_index, surt_host_name, title):
183
217
184
218
185
219
def plot_domain (host_index , surt_host_name , title ,
186
- do_csv = False , do_png = False , do_html = False , html_template = 'domain.html' ):
187
- tables , plots = get_domain (host_index , surt_host_name , title )
220
+ do_csv = False , do_png = False , do_html = False , verbose = 0 , html_template = 'domain.html' ):
221
+ tables , plots = get_domain (host_index , surt_host_name , title , verbose = verbose )
188
222
for key in tables :
189
223
if do_csv :
190
- out = surt_host_name + '_' + key
224
+ out = title + '_' + key
191
225
host_csv (tables [key ], out + '.csv' )
192
226
if do_png :
193
227
with open (out + '.png' , 'wb' ) as fd :
@@ -202,19 +236,19 @@ def plot_domain(host_index, surt_host_name, title,
202
236
)
203
237
template = env .get_template (html_template )
204
238
page = template .render (title = title , plots = plots )
205
- with open (surt_host_name + '.html' , 'w' ) as f :
239
+ with open (title + '.html' , 'w' ) as f :
206
240
f .write (page )
207
241
208
242
209
def make_plot(surt_host_name, host_index, title, verbose=0):
    """Dispatch to the single-host or the domain/many-host plotter.

    A plain string that does not end in ',' is plotted with `plot_host`
    (CSV, PNG and HTML output all enabled). Everything else -- a string
    ending in ',' or a non-string collection of SURT host names -- goes to
    `plot_domain` with only HTML output enabled.
    """
    is_single_host = isinstance(surt_host_name, str) and not surt_host_name.endswith(',')

    if is_single_host:
        print('plot host')
        plot_host(
            host_index, surt_host_name, title,
            do_csv=True, do_png=True, do_html=True, verbose=verbose,
        )
        return

    plot_domain(
        host_index, surt_host_name, title,
        do_csv=False, do_png=False, do_html=True, verbose=verbose,
    )
218
252
219
253
220
254
def main ():
@@ -223,11 +257,25 @@ def main():
223
257
grep = None
224
258
#grep = 'CC-MAIN-2022'
225
259
host_index = duck_utils .open_host_index (grep = grep , verbose = verbose )
260
+ if len (sys .argv ) > 2 and sys .argv [1 ] == '-f' :
261
+ assert len (sys .argv ) == 3
262
+ surts = []
263
+ title = sys .argv [2 ]
264
+ with open (sys .argv [2 ], encoding = 'utf8' ) as fd :
265
+ for thing in fd :
266
+ surt_host_name = utils .thing_to_surt_host_name (thing .rstrip (), verbose = verbose )
267
+ if surt_host_name :
268
+ surts .append (surt_host_name )
269
+ if verbose :
270
+ print (f'making a plot for { len (surts )} hosts' )
271
+ make_plot (surts , host_index , title , verbose = verbose )
272
+ return
226
273
for thing in sys .argv [1 :]:
227
274
surt_host_name = utils .thing_to_surt_host_name (thing )
228
275
if not surt_host_name :
229
276
continue
230
- make_plot (surt_host_name , host_index )
277
+ title = surt_host_name_to_title (surt_host_name )
278
+ make_plot (surt_host_name , host_index , title , verbose = verbose )
231
279
232
280
233
281
if __name__ == '__main__' :
0 commit comments