From 7e0b9c85e73b8c9fa115773299c3fa1a04a0b6ca Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 13 Jan 2026 16:14:26 +0100 Subject: [PATCH 1/2] Host-level link extraction: preserve the www. prefix in host names Resolves #56. --- wat_extract_links.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/wat_extract_links.py b/wat_extract_links.py index c09e103..4623f64 100644 --- a/wat_extract_links.py +++ b/wat_extract_links.py @@ -345,8 +345,7 @@ class ExtractHostLinksJob(ExtractLinksJob): re.IGNORECASE|re.ASCII) # match IP addresses - # - including IPs with leading `www.' (stripped) - ip_pattern = re.compile(r'^(?:www\.)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\Z') + ip_pattern = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\Z') # valid host names, relaxed allowing underscore, allowing also IDNAs # https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames @@ -382,10 +381,6 @@ def get_surt_host(url): if len(parts) <= 1: # do not accept single-word hosts, must be at least `domain.tld' return None - if len(parts) > 2 and parts[0] == 'www': - # strip leading 'www' to reduce number of "duplicate" hosts, - # but leave at least 2 trailing parts (www.com is a valid domain) - parts = parts[1:] for (i, part) in enumerate(parts): if len(part) > 63: return None From cd588a6b0317a8972d26ead75bf656f0ce56b653 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 13 Jan 2026 17:22:13 +0100 Subject: [PATCH 2/2] Host-level link extraction: early skip IPv6 addresses --- wat_extract_links.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/wat_extract_links.py b/wat_extract_links.py index 4623f64..a14effc 100644 --- a/wat_extract_links.py +++ b/wat_extract_links.py @@ -344,8 +344,9 @@ class ExtractHostLinksJob(ExtractLinksJob): global_link_pattern = re.compile(r'^(?:[a-z][a-z0-9]{1,5}:)?//', re.IGNORECASE|re.ASCII) - # match IP addresses - ip_pattern = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\Z') + # simple 
pattern to match common IPv4 and IPv6 addresses + # (short-circuit check so that IP addresses are not validated as host names) + host_ip_pattern = re.compile(r'^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[0-9a-f]{0,4}:[0-9a-f:]+)\Z') # valid host names, relaxed allowing underscore, allowing also IDNAs # https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames @@ -372,7 +373,7 @@ def get_surt_host(url): host = host.strip().lower() if len(host) < 1 or len(host) > 253: return None - if ExtractHostLinksJob.ip_pattern.match(host): + if ExtractHostLinksJob.host_ip_pattern.match(host): return None parts = host.split('.') if parts[-1] == '':