From e6856936c623e4a4d47ffa457ca707f248f53a6c Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 21 Jun 2026 18:15:55 +0000 Subject: [PATCH 1/2] feat: count WAF blocking of robots.txt --- Makefile | 5 +++- http_class.py | 13 +++++++++ post_robots_http.py | 69 +++++++++++++++++++++++++++++++++++++++++++-- quad_class.py | 3 +- survey_http.py | 3 ++ 5 files changed, 88 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 3fefa41..868fb60 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ sursurvey_robots_txt: # all segments of survey_robotstxt_txt survey_all_robots_http: - cat files_robotstxt | xargs -IFILE python survey_http.py s3://commoncrawl FILE > all_robotstxt.out 2> all_robotstxt_diag.out + cat files_robotstxt | parallel -j25 python survey_http.py s3://commoncrawl {} > all_robotstxt.out 2> all_robotstxt_diag.out survey_all_crawldiagnostics_http: cat files_crawldiagnostics | xargs -IFILE python survey_http.py s3://commoncrawl FILE > all_crawldiagnostics.out 2> all_crawldiagnostics_diag.out @@ -68,6 +68,9 @@ survey_all_warc_http: survey_all_robots_txt: cat files_robotstxt | xargs -IFILE python survey_robotstxt_txt.py s3://commoncrawl FILE > all_robotstxt_txt.out 2> all_robotstxt_txt_diag.out +check_ADD_robotstxt: + grep ADD all_robotstxt.out | jq -c .kinds | sort | uniq -c | sort -nr + post-robotstxt: python post_robots_http.py all_robotstxt.out diff --git a/http_class.py b/http_class.py index a435f5a..bc0a9c2 100644 --- a/http_class.py +++ b/http_class.py @@ -53,3 +53,16 @@ def analyze_http(url, status, ip, http_headers, classifier, verbose=0): print(*nm, file=sys.stderr) return ret + + +def facet_to_kind(classifier, techs, kinds): + # given a list of techs, pass back kinds that match kinds + conf = classifier.configuration + ret = defaultdict(list) + + for tech in techs: + ks = conf[tech]['kind'] + for k in ks: + if k in kinds: + ret[k].append(tech) + return ret diff --git a/post_robots_http.py b/post_robots_http.py index ec8686f..d4319b8 100644 --- a/post_robots_http.py +++ b/post_robots_http.py @@ -15,6 +15,9 @@ status = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) outredir = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) +waf_blocked = defaultdict(set) +waf_not_blocked = defaultdict(set) + with open(sys.argv[1], 'rb', buffering=1024*1024) as f: for line in f: record = orjson.loads(line) @@ -25,9 +28,24 @@ status[quads.quad_host]['status'][record['status']] += 1 status[quads.quad_host][quads.scheme_host][record['status']] += 1 + # block if record['status'].startswith(('4', '5')) and record['status'] not in ('404', '410'): status[quads.quad_host]['bad_status']['all'] += 1 - + if 'kinds' in record: + # only robotstxt -- not present for robotstxt_txt + if 'Web Application Firewall' in record['kinds']: + for k in record['kinds']['Web Application Firewall']: + waf_blocked[k].add(quads.quad_host) + + # not blocked + if record['status'] in ('200', '404', '410'): + status[quads.quad_host]['good_status']['all'] += 1 + if 'kinds' in record: + if 'Web Application Firewall' in record['kinds']: + for k in record['kinds']['Web Application Firewall']: + waf_not_blocked[k].add(quads.quad_host) + + # redirect if 'location' in record: location_quads = quad_class.make_quad(record['location']) # XXX can fail # is the location quad host equal to the quad host @@ -36,6 +54,7 @@ status[quads.quad_host]['outredir'][record['location']] += 1 #status[quads.quad_host]['outredir']['all'] += 1 + # all records should have this if 'analysis' in record: # robotstxt if any('CloudFlare' in k for k in record['analysis']): @@ -67,8 +86,6 @@ for status_code in ('200', '404', '410'): quads_with_status[status_code] = len([True for x in status if status[x]['status'][status_code]]) print(f'quads with {status_code}: {quads_with_status[status_code]}') - for x in status: - status[quads.quad_host]['good_status']['all'] += 1 print('quads with outredir', len(outredir)) @@ -77,6 +94,9 @@ good_and_bad = len([status[x]['status'][status_code] for x in status if status[x]['bad_status']['all'] and status[x]['good_status']['all']]) print('quads that are both good and bad', good_and_bad) +#for x in status: +# if status[x]['bad_status']['all'] and status[x]['good_status']['all']: +# print(x, status[x]) multiple_outredir = len([True for x in status if len(status[x]['outredir']) > 1]) print('multiple outredir', multiple_outredir) @@ -106,7 +126,50 @@ foo = sum(status[x].get(thing, False) for x in status) print(thing, foo) +all_wafs = set(waf_blocked) +all_wafs.update(set(waf_not_blocked)) +sum_blocked = 0 +sum_not_blocked = 0 +for waf in sorted(all_wafs): + blocked = len(waf_blocked[waf]) + not_blocked = len(waf_not_blocked[waf]) + pct = round(100 * blocked / (blocked + not_blocked)) + print('WAF blocked / not blocked', waf, blocked, '/', not_blocked, '('+str(pct)+'%)') + sum_blocked += blocked + sum_not_blocked += not_blocked +pct = round(100 * sum_blocked / (sum_blocked + sum_not_blocked)) +print('WAF blocked / not blocked sums', sum_blocked, '/', sum_not_blocked,'('+str(pct)+'%)') + ''' +robotstxt +quad count 9757 +quad cloudflare count 1738 +quads with 200: 6638 +quads with 404: 2113 +quads with 410: 5 +quads with outredir 523 +quads with bad status 729 +quads that are both good and bad 115 <==== +multiple outredir 2 +quads with cf bad status 179 + quads with cf blocked status 162 +WAF blocked / not blocked Amazon Cloudfront bots 6 / 0 (100%) +WAF blocked / not blocked Azure Front Door 3 / 27 (10%) +WAF blocked / not blocked BIG-IP Application Security Manager (F5 Networks) 1 / 140 (1%) +WAF blocked / not blocked Barracuda Load Balancer ADC and WAF 0 / 3 (0%) +WAF blocked / not blocked CleanTalk 0 / 1 (0%) +WAF blocked / not blocked CloudFlare-bot 94 / 586 (14%) +WAF blocked / not blocked CloudProxy WebSite Firewall (Sucuri) 2 / 24 (8%) +WAF blocked / not blocked DDoS-Guard 2 / 4 (33%) +WAF blocked / not blocked Ergon Airlock WAF 0 / 1 (0%) +WAF blocked / not blocked Generic WAF 0 / 21 (0%) +WAF blocked / not blocked Incapsula 3 / 31 (9%) +WAF blocked / not blocked QRATOR Labs WAF 1 / 7 (12%) +WAF blocked / not blocked Stingray Application Firewall (Riverbed / Brocade) 0 / 2 (0%) +WAF blocked / not blocked Wordfence WordPress Plugins 0 / 1 (0%) +WAF blocked / not blocked Zenedge Cybersecurity Suite (Oracle) 0 / 2 (0%) +WAF blocked / not blockd sums 112 850 (12%) + robotstxt quad count 9757 quad cloudflare count 256 diff --git a/quad_class.py b/quad_class.py index 2c09480..fb3f35a 100644 --- a/quad_class.py +++ b/quad_class.py @@ -14,7 +14,8 @@ def make_quad(url): if host.startswith('www.'): # this is a cheat, but we probably don't care about 'www.com' - host = host.replace('www.', '', 1) + host = '^' + host + host = host.replace('^www.', '', 1).replace('^', '') # keeps the port # XXX consider normalizing https plus :443 or http plus :80 diff --git a/survey_http.py b/survey_http.py index 3846bc0..735e50a 100644 --- a/survey_http.py +++ b/survey_http.py @@ -34,8 +34,11 @@ #payload = record.content_stream().read().decode('utf-8', errors='replace') ret = http_class.analyze_http(uri, status, ip, http_headers, classifier, verbose=verbose) + kinds = http_class.facet_to_kind(classifier, ret, ('Web Application Firewall', 'ADD')) out = {'status': status, 'url': uri, 'date': date, 'analysis': ret} + if kinds: + out['kinds'] = kinds if status in {'301', '302', '303', '307', '308'}: location = record.http_headers.get_header('Location') # XXX this can fail From afbc962e311dfad07941b0719434bc20a343cdce Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 21 Jun 2026 18:28:11 +0000 Subject: [PATCH 2/2] feat: count blocked but no WAF seen --- post_robots_http.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/post_robots_http.py b/post_robots_http.py index d4319b8..72317e5 100644 --- a/post_robots_http.py +++ b/post_robots_http.py @@ -18,6 +18,8 @@ waf_blocked = defaultdict(set) waf_not_blocked = defaultdict(set) +nowaf_blocked = set() + with open(sys.argv[1], 'rb', buffering=1024*1024) as f: for line in f: record = orjson.loads(line) @@ -36,6 +38,8 @@ if 'Web Application Firewall' in record['kinds']: for k in record['kinds']['Web Application Firewall']: waf_blocked[k].add(quads.quad_host) + else: + nowaf_blocked.add(quads.quad_host) # not blocked if record['status'] in ('200', '404', '410'): @@ -140,6 +144,9 @@ pct = round(100 * sum_blocked / (sum_blocked + sum_not_blocked)) print('WAF blocked / not blocked sums', sum_blocked, '/', sum_not_blocked,'('+str(pct)+'%)') +print('blocked but no visible WAF', len(nowaf_blocked)) +print(' ', '\n '.join(x for x in nowaf_blocked)) + ''' robotstxt quad count 9757 @@ -168,7 +175,9 @@ WAF blocked / not blocked Stingray Application Firewall (Riverbed / Brocade) 0 / 2 (0%) WAF blocked / not blocked Wordfence WordPress Plugins 0 / 1 (0%) WAF blocked / not blocked Zenedge Cybersecurity Suite (Oracle) 0 / 2 (0%) -WAF blocked / not blockd sums 112 850 (12%) +WAF blocked / not blocked sums 112 / 850 (12%) +blocked but no visible WAF 1 + quad://philsci.com robotstxt quad count 9757