# urlparse4/benchmarks/urls.py at master - commonsearch/urlparse4 - GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from numpy import median, percentile, mean
from time import clock
import os
import gc
import tabulate
import sys

import urlparse
import urlparse2
from uritools import urisplit as uritools_urisplit
from uritools import urijoin as uritools_urijoin

from yurl import URL as yurl_url
import pygurl

# Disabled benchmarks
# import slimurl
# import urlparse3
import cyuri

sys.path.insert(-1, os.path.dirname(os.path.dirname(__file__)))
import urlparse4

# Switch off automatic garbage collection for the whole run (presumably so
# collector pauses do not end up inside the timed calls).
gc.disable()

# Number of full passes each benchmark makes over URLS.
REPEATS = 10

# Every line of every file found in tests/urls/. The path is relative to the
# current working directory. Lines are stored as read (line endings included).
URLS = []
for filename in os.listdir("tests/urls/"):
    with open("tests/urls/%s" % filename) as url_file:
        URLS.extend(url_file.readlines())

# Rows of the final results table, in the order they were appended.
data = []


def benchmark(name, func, debug=False):
    """Time `func` over all of URLS and record one summary row.

    Every entry of URLS is stripped of surrounding whitespace and passed to
    `func`; the whole list is processed REPEATS times. Only the call to
    `func` sits between the two clock() readings.

    name  -- label stored in the first column of the result row.
    func  -- callable taking a single URL string; its return value is ignored.
    debug -- when true, print each URL right before it is timed.

    The row [name, sum, mean, median, 90th percentile] of the measured
    durations is printed and appended to the module-level `data` list.
    """
    durations = []
    for _ in range(REPEATS):
        for raw_url in URLS:
            target = raw_url.strip()
            if debug:
                print(target)
            started = clock()
            func(target)
            durations.append(clock() - started)

    summary = [
        name,
        sum(durations),
        mean(durations),
        median(durations),
        percentile(durations, 90),
    ]
    print(summary)
    data.append(summary)


def title(name):
    """Append a section heading for `name` to the results table.

    Three five-column rows are added to the module-level `data` list: an
    empty spacer row, a row whose first cell is "<name>:", and a row of
    "----" separators.
    """
    data.extend([
        [""] * 5,
        ["%s:" % name] + [""] * 4,
        ["----"] * 5,
    ])

# Segfault: https://github.com/mitghi/cyuri/issues/1
cyuri_parser = cyuri.uriparser()

# Each section below adds a heading to the results table and then runs the
# listed (label, callable) pairs through benchmark() in the order written.

title("urlsplit")
for label, call in (
    ("urlparse4", lambda url: urlparse4.urlsplit(url)),
    ("pygurl", lambda url: pygurl.ParseStandard(url)),
    ("uritools", lambda url: uritools_urisplit(url)),
    ("yurl", lambda url: yurl_url(url)),
    ("urlparse2", lambda url: urlparse2.urlsplit(url)),
    ("urlparse", lambda url: urlparse.urlsplit(url)),
    ("cyuri", lambda url: cyuri_parser.components(url)),
):
    benchmark(label, call)

title("urljoin_sibling")
for label, call in (
    ("urlparse4", lambda url: urlparse4.urljoin(url, "sibling.html?q=1#e=b")),
    ("pygurl", lambda url: pygurl.URL(url).Resolve("sibling.html?q=1#e=b")),
    ("uritools", lambda url: uritools_urijoin(url, "sibling.html?q=1#e=b")),
    ("yurl", lambda url: yurl_url(url) + yurl_url("sibling.html?q=1#e=b")),
    ("urlparse2", lambda url: urlparse2.urljoin(url, "sibling.html?q=1#e=b")),
    ("urlparse", lambda url: urlparse.urljoin(url, "sibling.html?q=1#e=b")),
    ("cyuri", lambda url: cyuri_parser.join(url, "sibling.html?q=1#e=b")),
):
    benchmark(label, call)

# Not very representative, because some libraries can return the host
# directly without parsing the rest of the URL.
# Might still be useful for some people!
title("hostname")
for label, call in (
    ("urlparse4", lambda url: urlparse4.urlsplit(url).hostname),
    ("pygurl", lambda url: pygurl.URL(url).host()),
    ("uritools", lambda url: uritools_urisplit(url).host),
    ("yurl", lambda url: yurl_url(url).host),
    ("urlparse2", lambda url: urlparse2.urlsplit(url).hostname),
    ("urlparse", lambda url: urlparse.urlsplit(url).hostname),
    ("cyuri", lambda url: cyuri_parser.components(url)["host"]),
):
    benchmark(label, call)

# Very slow!
# benchmark("slimurl", lambda url: slimurl.URL(url))

# Breaks on simple URLs like http://1-14th.com/timeline-4-66T.htm
# benchmark("urlparse3_urlsplit", lambda url: urlparse3.parse_url(url))


# Final report. print("") emits a bare newline, and the single-argument
# print(...) form prints the same text as the Python 2 print statement.
print("")
print("Benchmark results on %s URLs x %s times, in seconds:" % (len(URLS), REPEATS))
print("")
print(tabulate.tabulate(data, headers=["Name", "Sum", "Mean", "Median", "90%"]))
print("")