From 8586881d3e353701ab9a71dcdd175a22e72afe65 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 31 May 2017 15:20:36 +0200
Subject: [PATCH] Fix URL ID generator _fast_make_domain_id:
 - do not overwrite local variable "domain" used as key for cache
 - avoid hash collisions for pure domains (host without subdomain)
   by using a 64-bit hash value on domain.suffix
 - avoid hash collisions inside large domains (aka. public suffixes,
   e.g., deviantart.com, wordpress.org): replace two-part hash value
   (32-bit subdomain + 32-bit domain.suffix) by 64-bit hash on
   subdomain.domain.suffix
 - add notice that _fast_make_domain_id is not compatible with
   make_domain_id

---
 urlserver/id_generator.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/urlserver/id_generator.py b/urlserver/id_generator.py
index 62374f4..1e8f92e 100644
--- a/urlserver/id_generator.py
+++ b/urlserver/id_generator.py
@@ -52,22 +52,22 @@ def make_subdomain_id(url):
     return mmh3.hash(url.normalized_subdomain)
 
 
-def _fast_make_domain_id(domain):
-    """ Experimental fast version bypassing cosrlib.URL """
+def _fast_make_domain_id(host):
+    """ Experimental fast version bypassing cosrlib.URL
+        Note: not compatible with make_domain_id"""
 
-    if domain not in URL_DOMAIN_IDS_CACHE:
+    if host not in URL_DOMAIN_IDS_CACHE:
 
-        subdomain, domain, suffix = tld_extract(domain)
+        subdomain, domain, suffix = tld_extract(host)
 
         if subdomain == "www" or not subdomain:
-            URL_DOMAIN_IDS_CACHE[domain] = mmh3.hash("%s.%s" % (domain, suffix))
+            URL_DOMAIN_IDS_CACHE[host] = \
+                mmh3.hash64("%s.%s" % (domain, suffix))[0]
         else:
             while subdomain.startswith("www."):
                 subdomain = subdomain[4:]
-            URL_DOMAIN_IDS_CACHE[domain] = (
-                (mmh3.hash(subdomain) << 32) +
-                mmh3.hash("%s.%s" % (domain, suffix))
-            )
+            URL_DOMAIN_IDS_CACHE[host] = \
+                mmh3.hash64("%s.%s.%s" % (subdomain, domain, suffix))[0]
 
-    return URL_DOMAIN_IDS_CACHE[domain]
+    return URL_DOMAIN_IDS_CACHE[host]