Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions server_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,19 @@ def process_record(self, record):
payload = record['Envelope']['Payload-Metadata']
if 'HTTP-Response-Metadata' in payload:
try:
server_name = payload['HTTP-Response-Metadata'] \
['Headers'] \
['Server'] \
.strip()
if server_name and server_name != '':
yield server_name, 1
server_names = []
headers = payload['HTTP-Response-Metadata']['Headers']
for header in headers:
if header.lower() == 'server':
if isinstance(headers[header], list):
for server_name in headers[header]:
server_names.append(server_name.strip())
else:
server_names.append(headers[header].strip())
if server_names:
for server_name in server_names:
if server_name != '':
yield server_name, 1
else:
yield ServerCountJob.fallback_server_name, 1
except KeyError:
Expand Down
35 changes: 24 additions & 11 deletions wat_extract_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,12 +156,30 @@ def yield_redirect(self, src, target, http_status_line):
if src != target:
yield src, target

def extract_http_header_links(self, url, headers):
    """Extract link targets from WAT HTTP response headers.

    Header names are compared case-insensitively. A header value may be
    a single value or a list of values (the header was repeated); both
    forms go through the same code path. ``None`` values are skipped
    because they cannot hold a link.

    Args:
        url: URL of the response. Not used by the extraction itself;
            kept so existing callers do not need to change.
        headers: mapping of header name to a value or a list of values.

    Returns:
        A list with every ``Content-Location`` value (unchanged) and,
        for every ``Link`` value, group 1 of each match of
        ``ExtractLinksJob.http_link_pattern``.
    """
    links = []
    for header in headers:
        header_name = header.lower()
        if header_name not in ('content-location', 'link'):
            continue
        values = headers[header]
        if not isinstance(values, list):
            # normalise the single-value form so both forms share one path
            values = [values]
        for value in values:
            if value is None:
                # nothing to extract; also avoids a TypeError in finditer()
                continue
            if header_name == 'content-location':
                links.append(value)
            else:
                for m in ExtractLinksJob.http_link_pattern.finditer(value):
                    links.append(m.group(1))
    return links

def yield_http_header_links(self, url, headers):
    """Yield a ``(url, link)`` pair for each link in the response headers.

    Extraction is delegated to ``self.extract_http_header_links`` so that
    header-name matching and multi-valued headers are handled in one
    place; this method only pairs each extracted link with the source URL.

    Args:
        url: URL of the response, used as the first element of each pair.
        headers: HTTP response headers, passed through unchanged.

    Yields:
        ``(url, link)`` tuples, in the order the links were extracted.
    """
    for link in self.extract_http_header_links(url, headers):
        yield url, link

def yield_links(self, src_url, base_url, links, url_attr, opt_attr=None):
# base_url = urlparse(base)
Expand Down Expand Up @@ -437,12 +455,7 @@ def yield_link(self, src, target):
yield src_host, thost

def yield_http_header_links(self, url, headers, src_host=None):
links = []
if 'Content-Location' in headers:
links.append(headers['Content-Location'])
if 'Link' in headers:
for m in ExtractLinksJob.http_link_pattern.finditer(headers['Link']):
links.append(m.group(1))
links = self.extract_http_header_links(url, headers)
if links:
if not src_host:
src_host = ExtractHostLinksJob.get_surt_host(url)
Expand Down