diff --git a/wat_extract_links.py b/wat_extract_links.py index c09e103..a14effc 100644 --- a/wat_extract_links.py +++ b/wat_extract_links.py @@ -344,9 +344,9 @@ class ExtractHostLinksJob(ExtractLinksJob): global_link_pattern = re.compile(r'^(?:[a-z][a-z0-9]{1,5}:)?//', re.IGNORECASE|re.ASCII) - # match IP addresses - # - including IPs with leading `www.' (stripped) - ip_pattern = re.compile(r'^(?:www\.)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\Z') + # simple pattern to match common IPv4 and IPv6 addresses + # (short link to avoid that IP addresses are validated as host names) + host_ip_pattern = re.compile(r'^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[0-9a-f]{0,4}:[0-9a-f:]+)\Z') # valid host names, relaxed allowing underscore, allowing also IDNAs # https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames @@ -373,7 +373,7 @@ def get_surt_host(url): host = host.strip().lower() if len(host) < 1 or len(host) > 253: return None - if ExtractHostLinksJob.ip_pattern.match(host): + if ExtractHostLinksJob.host_ip_pattern.match(host): return None parts = host.split('.') if parts[-1] == '': @@ -382,10 +382,6 @@ def get_surt_host(url): if len(parts) <= 1: # do not accept single-word hosts, must be at least `domain.tld' return None - if len(parts) > 2 and parts[0] == 'www': - # strip leading 'www' to reduce number of "duplicate" hosts, - # but leave at least 2 trailing parts (www.com is a valid domain) - parts = parts[1:] for (i, part) in enumerate(parts): if len(part) > 63: return None