diff --git a/data/warc-record-size-truncation-by-mime-type-CC-SUPPLEMENTAL-2026-22.csv b/data/warc-record-size-truncation-by-mime-type-CC-SUPPLEMENTAL-2026-22.csv
new file mode 100644
index 0000000..9d77407
--- /dev/null
+++ b/data/warc-record-size-truncation-by-mime-type-CC-SUPPLEMENTAL-2026-22.csv
@@ -0,0 +1,223 @@
+"n_pages","perc_pages","avg_warc_record_length","sum_warc_record_length","max_warc_record_length","percentiles_warc_record_length","perc_warc_storage","perc_truncated","sum_warc_record_length_truncated","perc_warc_storage_truncated","sum_warc_record_length_truncated_length","perc_warc_storage_truncated_length","content_mime_detected","reason_truncated","common_url_path_suffixes","uniq_tlds","uniq_domains","uniq_hosts","top_tlds","top_domains"
+"40613284","84.27772679536315","25121.29692430191","1020258366435","19848197","[967, 1096, 1755, 3354, 4558, 7006, 13738, 28021, 41307, 53646, 76266, 118616, 200195, 279373]","15.245210456074519","0.002075675535127866","1194770674","0.017852860580326774","1136067210","0.016975684080115627","text/html","{length=255, disconnect=432, time=156}","[{.html, 6044216}, {.php, 1290782}, {.htm, 831170}, {.aspx, 268616}, {.do, 217619}, {.cgi, 127792}, {.shtml, 111921}, {.pl, 103811}, {.asp, 95325}, {.jsp, 72030}, {.cfm, 51596}, {.en, 36700}, {.ja, 31105}, {.pdf, 19461}, {.HTM, 19411}, {.showcfp, 15060}, {.HTML, 11873}, {.1, 8844}, {.xml, 8764}, {.2, 4938}, {.de, 4772}, {.es, 4704}, {.2003, 4376}, {.2004, 4369}, {.2002, 4329}]","184","5109","7491","[{com, 8204849}, {org, 7988496}, {edu, 4364031}, {de, 2390003}, {gov, 2110414}, {uk, 1125095}, {jp, 899297}, {ca, 754304}, {nl, 748835}, {fr, 738358}, {ru, 714493}, {se, 705555}, {net, 620669}, {ch, 537122}, {it, 496428}, {es, 490055}, {au, 472364}, {pl, 464163}, {eu, 369866}, {at, 305854}, {kr, 295846}, {no, 292552}, {br, 279125}, {cn, 277346}, {fi, 275921}]","{epfl.ch=158864, berkeley.edu=138929, mendeley.com=138141, stanford.edu=132611, nii.ac.jp=240992, nih.gov=282627, nasa.gov=636987, noaa.gov=296578, arcgis.com=152254, wisc.edu=193077, mit.edu=201479, copernicus.org=671032, mpg.de=137885, usda.gov=159482, proteinatlas.org=154996, canada.ca=138984, duke.edu=154950, uni-bielefeld.de=130305, harvard.edu=251278, caltech.edu=190863, cern.ch=148213, uni-tuebingen.de=147999, rutgers.edu=156560, apache.org=132279, scholaris.ca=140291}"
+"3169664","6.577455706982423","1551893.1150027257","4918979738472","26223850","[6707, 12398, 32128, 58837, 84962, 135729, 371410, 1174367, 2332993, 3737661, 7393187, 15214114, 22333101, 25389147]","73.50185385316284","1.050616090538303","772971459789","11.550125898206202","765314298195","11.435708762471656","application/pdf","{length=31373, disconnect=1713, time=215}","[{.pdf, 2773396}, {.PDF, 41523}, {.php, 11324}, {.do, 4972}, {.shtml, 3517}, {.html, 797}, {.htm, 592}, {.asp, 282}, {.aspx, 217}, {.ashx, 148}, {.jsp, 40}, {.Pdf, 19}, {.2013, 12}, {.pdf-1, 8}, {.1, 8}, {.31, 6}, {.cfm, 6}, {.x, 6}, {.2, 5}, {.3, 5}]","133","2937","4180","[{org, 593956}, {edu, 388561}, {de, 329161}, {com, 176972}, {jp, 173769}, {gov, 147427}, {uk, 96604}, {it, 88637}, {cc, 81456}, {fr, 80750}, {ru, 78284}, {ca, 77882}, {ch, 72648}, {press, 62092}, {es, 58806}, {nl, 49031}, {br, 45249}, {au, 41305}, {int, 41210}, {in, 38181}, {at, 34008}, {se, 31710}, {hr, 28334}, {eu, 25081}, {il, 25061}]","{numdam.org=32809, epj-conferences.org=20771, uni-jena.de=43279, nasa.gov=21793, ems.press=39803, mlr.press=22289, copernicus.org=41100, columbia.edu=26624, tamu.edu=23175, thecvf.com=26294, istra-istria.hr=23640, questjournals.org=21275, sissa.it=28795, bom.gov.au=26922, kyoto-u.ac.jp=21963, academie-sciences.fr=19277, nips.cc=25674, uam.es=25876, unict.it=19275, isca-archive.org=39786, cwi.nl=18513, neurips.cc=48224, iodp.org=31513, cbd.int=19612, dagstuhl.de=23780}"
+"2417807","5.017256860831953","14985.212882996864","36231352605","19637007","[1013, 1052, 1317, 1826, 2392, 3576, 6944, 13475, 23742, 30594, 46195, 67803, 87161, 114045]","0.5413869797525046","0.0013235134152560564","321576818","0.004805161544849834","321442842","0.004803159608494046","application/xhtml+xml","{length=19, disconnect=6, time=7}","[{.html, 883563}, {.php, 114515}, {.aspx, 88958}, {.htm, 76121}, {.shtml, 70952}, {.cfc, 49044}, {.do, 32103}, {.cfm, 31236}, {.asp, 27035}, {.xml, 18208}, {.page, 16374}, {.cgi, 5695}, {.dk, 2807}, {.pdf, 1138}, {.genomic, 724}, {.cdna, 496}, {.xht, 487}, {.de, 470}, {.en, 392}, {.xhtml, 262}, {.1, 143}, {.txt, 118}, {.xhtm, 115}, {.pl, 109}, {.0, 100}]","66","786","1039","[{org, 566035}, {com, 529973}, {uk, 336407}, {edu, 230617}, {jp, 124401}, {be, 74858}, {gov, 67496}, {ca, 63671}, {ch, 61890}, {de, 58806}, {cern, 51238}, {br, 48024}, {net, 35349}, {cn, 26630}, {ar, 22286}, {au, 12842}, {pl, 12258}, {it, 9296}, {at, 8763}, {dk, 8341}, {nl, 7652}, {in, 7264}, {ru, 7177}, {is, 6505}, {kr, 5661}]","{soton.ac.uk=68546, stanford.edu=46586, lancs.ac.uk=56328, postgis.net=34186, nasa.gov=34022, naturalsciences.be=46448, uqam.ca=51326, blogspot.com=93519, arlis.org=69653, kyushu-u.ac.jp=54801, whiterose.ac.uk=32843, universaldependencies.org=34301, edge.org=30826, ikonet.com=43195, mapsales.com=53898, harvard.edu=36310, w3.org=82396, sketchport.com=35521, marchionnilab.org=81381, ncl.ac.uk=32527, techscience.com=57809, nvidia.com=45877, root.cern=51238, jmp.com=58606, bgs.ac.uk=79214}"
+"875344","1.8164500680112536","106948.8749577309","93617056001","25946033","[698, 733, 798, 859, 917, 1098, 2221, 12128, 35385, 85521, 354051, 1175070, 2520550, 4183063]","1.3988728423765253","0.5704043210440696","32299486148","0.48263506592934663","32234006677","0.4816566389457625","text/plain","{length=4923, disconnect=54, time=16}","[{.txt, 238878}, {.dat, 48266}, {.tab, 37502}, {.pdb, 31297}, {.ris, 24805}, {.php, 21494}, {.tbl, 14274}, {.lst, 14209}, {.fasta, 13109}, {.WEA, 12005}, {.sasf, 9970}, {.asc, 9709}, {.fa, 8735}, {.TAB, 8303}, {.Rmd, 8216}, {.ipynb, 7703}, {.cgi, 5990}, {.TXT, 5155}, {.lbl, 4525}, {.BAK, 4518}, {.md5, 4373}, {.ibg, 3759}, {.tsc, 3577}, {.bak, 2889}, {.input, 2667}]","81","804","1044","[{edu, 232105}, {org, 216516}, {gov, 114550}, {com, 67710}, {de, 64479}, {fr, 54982}, {pl, 46363}, {ca, 11512}, {es, 10756}, {ch, 8743}, {jp, 5748}, {uk, 5077}, {ir, 4703}, {br, 4068}, {net, 3620}, {id, 2308}, {uz, 2258}, {ec, 1955}, {cz, 1713}, {nl, 1667}, {it, 1486}, {pt, 1364}, {se, 1291}, {cc, 1290}, {io, 1038}]","{obspm.fr=53069, freedesktop.org=12510, fishtedb.com=31193, noaa.gov=53025, nasa.gov=44586, bioconductor.org=36691, wisc.edu=7626, mit.edu=24145, uni-bayreuth.de=58290, copernicus.org=11093, rcin.org.pl=7578, cnio.es=9109, astrouw.edu.pl=29156, rstudio.com=11197, ucla.edu=8372, nmsu.edu=40900, predictioncenter.org=35114, lowell.edu=69697, salmobase.org=42364, colorado.edu=11663, caltech.edu=23848, usgs.gov=7983, cern.ch=7006, techscience.com=15574, nyu.edu=6864}"
+"147997","0.30711258741187636","1453.5783968593958","215125242","1789966","[791, 831, 882, 918, 947, 984, 1049, 1201, 1475, 1674, 1905, 2292, 2913, 7038]","0.0032145088897076977","0.0","0","0.0","0","0.0","application/x-bibtex-text-file",,"[{.bib, 110109}, {.txt, 2020}, {.do, 1230}, {.php, 848}, {.bibtex, 35}]","41","177","206","[{cc, 50482}, {org, 33607}, {pl, 11701}, {de, 11372}, {uk, 11078}, {gov, 4517}, {com, 4469}, {br, 4110}, {id, 2317}, {uz, 2258}, {ec, 1954}, {pt, 1864}, {edu, 1374}, {es, 1294}, {it, 973}, {fr, 651}, {press, 628}, {lu, 595}, {vn, 520}, {kz, 430}, {at, 315}, {dk, 237}, {science, 236}, {cn, 234}, {nl, 232}]","{ox.ac.uk=11049, prin.or.id=1938, ufpel.edu.br=2747, copernicus.org=11097, rcin.org.pl=7595, mpg.de=7191, jmlr.org=9902, nist.gov=2930, sissa.it=955, efdeportes.com=1303, ufla.br=902, nips.cc=16587, mapress.com=911, lanl.gov=1238, ucuenca.edu.ec=1954, openscience.uz=2258, umk.pl=3893, xjegi.com=1230, oeko.de=3227, haematologica.org=4349, uc.pt=1864, icml.cc=633, ijcai.org=5043, ugr.es=719, neurips.cc=33262}"
+"108643","0.22544803498846924","21801.838765498007","2368617169","19273055","[834, 856, 925, 1124, 1353, 2061, 5256, 12610, 32581, 41139, 56055, 79346, 153737, 440973]","0.03539306161972745","0.13806687959647654","355011199","0.005304754776900093","355011199","0.005304754776900093","application/xml","{length=150}","[{.xml, 79557}, {.mw, 10159}, {.html, 2443}, {.XML, 1421}, {.lblx, 1394}, {.the, 827}, {.dno, 748}, {.dco, 746}, {.dfr, 718}, {.def, 709}, {.dcl, 656}, {.jsp, 478}, {.xml_fmt, 161}, {.aspx, 153}, {.sch, 151}, {.gpml, 150}, {.ounav, 147}, {.php, 140}, {.rels, 115}, {.xsd, 109}, {.txt, 79}, {.drd, 66}, {.jnlp, 50}, {.cps, 39}, {.htm, 37}]","49","350","425","[{org, 38418}, {edu, 23676}, {gov, 12914}, {com, 12352}, {uk, 9660}, {pl, 4883}, {ir, 2575}, {ch, 2346}, {jp, 816}, {de, 192}, {nl, 173}, {es, 125}, {fr, 67}, {ca, 59}, {at, 55}, {ru, 46}, {io, 36}, {it, 24}, {net, 21}, {si, 20}, {cz, 17}, {kr, 15}, {dk, 15}, {community, 14}, {press, 10}]","{berkeley.edu=2833, opentopography.org=477, esf.edu=712, noaa.gov=6614, nasa.gov=3362, adho.org=430, areeo.ac.ir=2515, mit.edu=1567, ceda.ac.uk=9348, copernicus.org=35194, columbia.edu=3154, kek.jp=771, lemoyne.edu=593, ucla.edu=4882, uwb.edu.pl=4774, hixie.ch=2275, nmsu.edu=463, fau.edu=4845, caltech.edu=2281, usgs.gov=2724, openexoplanetcatalogue.com=1273, rutgers.edu=1649, maplesoft.com=10278, frontiersin.org=308, satobs.org=391}"
+"89137","0.18497060551316866","2756.045996611957","245665672","1453229","[1102, 1289, 1360, 1534, 1563, 1599, 1746, 2460, 3594, 4746, 6769, 11289, 14183, 17597]","0.00367085925946345","0.0","0","0.0","0","0.0","application/atom+xml",,"[{.atom, 25349}, {.xml, 734}, {.php, 420}]","35","141","182","[{com, 53364}, {org, 26093}, {eu, 3531}, {gov, 2960}, {pt, 1139}, {edu, 531}, {de, 415}, {it, 277}, {il, 176}, {uk, 165}, {au, 105}, {es, 95}, {ca, 75}, {eus, 47}, {jp, 46}, {cat, 40}, {br, 19}, {io, 13}, {fi, 11}, {at, 8}, {info, 6}, {ch, 5}]","{nasa.gov=2586, uni-hamburg.de=117, upc.edu=478, copernicus.org=441, defra.gov.uk=163, usda.gov=183, ehu.eus=47, blogspot.com=49915, ibamendes.com=2917, produccioncientificaluz.org=33, thespaceacademy.org=534, uni-augsburg.de=276, bgu.ac.il=176, nist.gov=143, us.es=78, diba.cat=40, uevora.pt=1136, anu.edu.au=105, w3.org=505, spinroot.com=418, theoj.org=24483, scientix.eu=3530, globe.gov=27, geoitaliani.it=276, scholaris.ca=51}"
+"87210","0.18097183556551644","1668.3001834651989","145492459","731196","[816, 875, 947, 1001, 1068, 1168, 1411, 1766, 2066, 2339, 2643, 3027, 3554, 4623]","0.0021740210423141916","0.0","0","0.0","0","0.0","text/calendar",,"[{.ics, 30888}, {.php, 2541}, {.ashx, 631}, {.jsp, 226}]","40","199","243","[{ch, 22536}, {de, 21305}, {com, 7180}, {edu, 6587}, {dk, 5813}, {ca, 5496}, {au, 2427}, {uk, 2050}, {at, 1629}, {gov, 1601}, {it, 1328}, {se, 1264}, {no, 1249}, {org, 1199}, {nl, 664}, {za, 631}, {eu, 624}, {si, 606}, {ru, 478}, {mx, 461}, {il, 431}, {hr, 313}, {pl, 295}, {lu, 292}, {fr, 245}]","{epfl.ch=13312, fu-berlin.de=4018, spacepolicyonline.com=7120, uni-jena.de=3109, wisc.edu=2542, au.dk=5770, isi.edu=933, ed.gov=929, fau.de=855, mpg.de=5895, usyd.edu.au=1399, unibas.ch=7252, birmingham.ac.uk=838, uni-halle.de=753, yorku.ca=2055, fz-juelich.de=1737, uq.edu.au=775, uit.no=1182, ncl.ac.uk=810, cern.ch=1912, scilifelab.se=1078, uni-potsdam.de=710, uni-regensburg.de=987, carleton.ca=2542, olemiss.edu=1535}"
+"81950","0.17005666694867644","4906268.0499816965","402068666696","26223382","[709, 712, 916, 1565, 5952, 89236, 1382213, 5680499, 12621314, 18810103, 24599693, 25807283, 26092386, 26219668]","6.007910979443376","16.019524100061012","237110930611","3.543030037804325","237097914742","3.542835548184145","application/octet-stream","{length=12744, disconnect=383, time=1}","[{.bc, 18805}, {.DAT, 9515}, {.bb, 9481}, {.bw, 5734}, {.bai, 4261}, {.ldb, 3984}, {.bpc, 3472}, {.ck, 1988}, {.BC, 1536}, {.root, 976}, {.dat, 957}, {.bigWig, 914}, {.bds, 823}, {.cdb, 807}, {.hdb, 798}, {.imh, 796}, {.foot, 605}, {.b16, 556}, {.bes, 549}, {.2bit, 504}, {.m2t, 475}, {.aspx, 423}, {.pix, 397}, {.tns, 388}, {.glb, 378}]","37","183","219","[{gov, 26663}, {edu, 24338}, {es, 6367}, {org, 5472}, {ca, 4913}, {ch, 4891}, {jp, 3446}, {fr, 2637}, {com, 2164}, {ru, 218}, {pl, 201}, {nl, 159}, {de, 146}, {uk, 100}, {ee, 50}, {cern, 36}, {net, 29}, {kr, 22}, {dk, 19}, {cz, 17}, {se, 12}, {in, 9}, {eu, 7}, {cn, 7}, {it, 5}]","{obspm.fr=2102, 1001genomes.org=1743, nasa.gov=25907, nationsreportcard.gov=423, wisc.edu=3282, mit.edu=9502, cnio.es=6364, u-bordeaux.fr=424, ti.com=1599, riken.jp=3441, hawaii.edu=340, uzh.ch=4034, ucsc.edu=4298, ucla.edu=510, harvard.edu=690, nmsu.edu=2215, esahubble.org=475, salmobase.org=2204, colorado.edu=1142, utoronto.ca=1706, caltech.edu=550, cern.ch=853, bcgsc.ca=3198, personality-project.org=397, nyu.edu=611}"
+"80351","0.16673853869424163","10603.003173575935","851961908","18210375","[1094, 1205, 1285, 1339, 1362, 1371, 1610, 2023, 2280, 3219, 4971, 12429, 23928, 63008]","0.012730440656323965","0.002489079165162848","24649689","3.6832797342782326E-4","24649689","3.6832797342782326E-4","application/x-pds","{length=2}","[{.LBL, 58509}, {.lbl, 11670}, {.CAT, 4157}, {.TXT, 3787}, {.txt, 512}, {.cat, 236}, {.imq, 225}, {.DIM, 16}, {.MOS, 15}, {.CPS, 13}, {.MGL, 13}, {.SLP, 11}, {.DEM, 10}, {.DTM, 8}, {.MEX, 6}]","5","8","10","[{edu, 54217}, {org, 20123}, {gov, 5785}, {com, 225}]","{ucla.edu=14699, nmsu.edu=937, jaxa.jp=1, nasa.gov=5785, uahirise.org=20123, colorado.edu=6717, msss.com=225, mit.edu=31864}"
+"68737","0.14263801239842797","8667.214833350306","595758346","6057312","[843, 985, 1076, 1170, 1225, 1407, 2734, 6034, 10510, 17331, 24843, 43370, 98412, 151584]","0.008902118977439916","0.004364461643656255","1693049","2.529838437653322E-5","1689757","2.524919366712815E-5","application/rss+xml","{length=1, disconnect=2}","[{.xml, 19776}, {.php, 2228}, {.rss, 1798}, {.html, 807}, {.feed, 391}, {.ashx, 197}, {.en, 74}, {.asp, 67}, {.page, 34}, {.aspx, 24}, {.jsp, 13}, {.axd, 10}, {.portlet, 8}]","76","632","838","[{edu, 18663}, {de, 15847}, {gov, 6938}, {it, 5460}, {com, 5242}, {org, 4069}, {cern, 2013}, {ch, 1683}, {ca, 1629}, {pt, 1161}, {fi, 1023}, {se, 918}, {fr, 830}, {press, 620}, {gr, 450}, {za, 409}, {br, 335}, {net, 301}, {be, 155}, {uk, 145}, {il, 93}, {eu, 69}, {io, 69}, {berlin, 58}, {jp, 49}]","{umd.edu=1177, college-de-france.fr=463, nasa.gov=3670, noaa.gov=2472, wombania.com=1937, mit.edu=2700, ucsb.edu=688, mlr.press=620, copernicus.org=880, ucar.edu=2743, ulethbridge.ca=1423, sciencedaily.com=555, mongabay.com=1489, famedisud.it=5405, edge.org=876, gatech.edu=2790, uevora.pt=1136, harvard.edu=937, maaseutuverkosto.fi=989, uni-bremen.de=14753, colorado.edu=4580, cms.cern=2013, brightsurf.com=605, olemiss.edu=1302, patientensicht.ch=1450}"
+"68071","0.1412559777408585","858717.3423043587","58453748208","26223135","[8296, 12893, 24827, 45312, 72845, 119662, 328552, 779186, 1158504, 1592807, 2962388, 7052735, 10811233, 15849797]","0.8734451220343143","0.13809111075200894","2435344823","0.03639013957756347","2432751518","0.036351389118069635","image/vnd.djvu","{length=93, time=1}","[{.djvu, 68069}]","15","26","30","[{ru, 37717}, {org, 30195}, {nl, 56}, {su, 31}, {fr, 23}, {uk, 19}, {edu, 9}, {it, 5}]","{numdam.org=30195, lirmm.fr=21, usp.br=2, eimi.ru=1, berkeley.edu=2, msu.ru=36, utexas.edu=1, ac.ru=2, ethz.ch=2, hse.ru=8, mit.edu=3, unipi.it=5, dauphine.fr=2, rochester.edu=1, cam.ac.uk=19, cimat.mx=4, columbia.edu=2, ras.ru=2650, uva.nl=56, uj.edu.pl=1, bme.hu=2, epizodyspace.ru=34949, mccme.ru=71, msu.su=31, millsian.com=4}"
+"41691","0.08651412448758108","20023.327408793266","834792543","8173213","[873, 923, 1103, 1513, 2086, 5598, 11671, 14695, 15940, 16890, 18876, 37017, 121135, 347625]","0.012473887422914302","0.10313976637643617","172358589","0.0025754681849839613","172358589","0.0025754681849839613","application/json","{length=43}","[{.json, 32266}, {.php, 716}, {.ipynb, 295}, {.geojson, 40}, {.aspx, 9}]","32","125","144","[{cc, 28320}, {gov, 5465}, {org, 4151}, {uk, 1003}, {at, 708}, {com, 592}, {edu, 504}, {press, 257}, {io, 146}, {ca, 87}, {dk, 86}, {cn, 76}, {ch, 61}, {info, 45}, {jp, 34}, {co, 31}, {tn, 30}, {de, 30}, {fr, 24}, {se, 11}, {net, 8}]","{ebi.ac.uk=721, nasa.gov=3297, noaa.gov=1242, mcmaster.ca=54, ems.press=257, tuwien.ac.at=708, au.dk=86, cognitiveatlas.org=1774, deims.org=1343, usda.gov=179, proteinatlas.org=242, cncb.ac.cn=76, data.gov=484, github.io=138, ioccc.org=207, biogps.org=167, nips.cc=9422, fau.edu=432, salmobase.org=126, usgs.gov=187, ibm.com=273, exaly.com=227, neurips.cc=18898, bas.ac.uk=277, kbroman.org=78}"
+"30879","0.06407785013676852","7124.460377602902","219996212","5765561","[844, 989, 1044, 1086, 1109, 1207, 1287, 1327, 1359, 6625, 17324, 67069, 161440, 188142]","0.003287293358052419","0.01943068104537064","11137910","1.6642821816216372E-4","11137910","1.6642821816216372E-4","text/x-vcard","{length=6}","[{.vcf, 2735}]","10","36","43","[{de, 28854}, {no, 1880}, {edu, 71}, {org, 29}, {cern, 28}, {ch, 6}, {ca, 6}]","{ebi.ac.uk=2, uni-jena.de=20106, gfz.de=2015, mit.edu=23, centos.org=20, isi.edu=17, fsu.edu=2, senckenberg.de=1504, eawag.ch=4, uni-augsburg.de=4554, uni-bielefeld.de=74, cassavabase.org=7, tu-dresden.de=3, uni-freiburg.de=12, hzdr.de=4, andrews.edu=7, gustavus.edu=17, zeiss.de=207, uni-heidelberg.de=369, root.cern=28, mutationtaster.org=2, bcgsc.ca=6, uni-wuerzburg.de=3, hvl.no=1880, uni-goettingen.de=2}"
+"25252","0.052401109869285874","3700.947568509425","93456328","330364","[948, 1005, 1018, 1031, 1042, 1066, 2057, 2670, 4329, 8888, 15362, 23643, 30219, 37573]","0.0013964711642506296","0.0","0","0.0","0","0.0","application/x-tex",,"[{.tex, 23506}, {.Rnw, 603}, {.aux, 189}, {.ltx, 130}, {.rnw, 19}, {.Stex, 12}, {.texfile, 12}, {.Snw, 10}, {.sty, 9}, {.toc, 7}, {.Rtex, 6}, {.txt, 6}]","33","164","195","[{gov, 10426}, {il, 9100}, {edu, 1949}, {fr, 1273}, {com, 808}, {org, 799}, {it, 203}, {ru, 153}, {net, 103}, {uk, 81}, {br, 62}, {hu, 54}, {de, 47}, {kr, 42}, {cz, 33}, {ca, 24}, {dk, 24}, {nl, 13}, {jp, 11}, {eu, 11}, {ch, 8}, {pl, 5}]","{numdam.org=358, illinois.edu=116, usp.br=62, berkeley.edu=418, r-project.org=248, msu.ru=131, wisc.edu=227, mit.edu=103, ucdavis.edu=99, knopper.net=91, unipi.it=51, cam.ac.uk=42, northeastern.edu=55, rstudio.com=400, bgu.ac.il=9100, cheatography.com=379, nist.gov=10414, netlib.org=95, academie-sciences.fr=1009, unica.it=128, bme.hu=54, kasi.re.kr=42, univ-grenoble-alpes.fr=237, txstate.edu=671, nyu.edu=63}"
+"24388","0.0506082000432498","2450.3920780711824","59760162","6607854","[741, 837, 880, 901, 916, 948, 1104, 1216, 1268, 1281, 1305, 1320, 1335, 1344]","8.9296620988518E-4","0.020501886173527963","25961556","3.8793054583824345E-4","25961556","3.8793054583824345E-4","text/x-sql","{length=5}","[{.sql, 24388}]","7","13","13","[{edu, 24363}, {ca, 12}, {fr, 5}]","{zitogiuseppe.com=2, lirmm.fr=3, daylight.com=1, uci.edu=3, umb.edu=464, postgis.net=2, bioconductor.org=1, wisc.edu=14, mirbase.org=1, bcgsc.ca=12, loria.fr=2, uzh.ch=1, ucsc.edu=23882}"
+"22962","0.047649068779444886","17108.406279940773","392843225","13944405","[714, 751, 949, 1357, 1481, 1547, 2225, 3521, 3663, 3826, 5454, 14118, 42855, 189881]","0.005870059818568113","0.008710042679209128","22202473","3.3176044879008263E-4","22202473","3.3176044879008263E-4","text/x-python","{length=2}","[{.py, 22880}, {.1, 10}]","33","121","138","[{org, 12932}, {ch, 6744}, {dk, 894}, {edu, 666}, {net, 525}, {fr, 404}, {com, 220}, {ca, 184}, {io, 63}, {be, 60}, {uk, 56}, {pl, 44}, {nl, 35}, {de, 28}, {gov, 23}, {cz, 17}, {hu, 15}, {info, 15}, {at, 11}, {in, 5}]","{scipy.org=155, ucsf.edu=100, wisc.edu=56, consc.net=499, au.dk=894, uwaterloo.ca=103, networkx.org=215, sfepy.org=212, statsmodels.org=88, github.io=62, astroml.org=752, rufat.be=60, gallantlab.org=55, uci.edu=164, greenteapress.com=123, umb.edu=95, scikit-learn.org=6734, utoronto.ca=77, cern.ch=6727, qutip.org=79, fmriprep.org=4054, numpy.org=60, scikit-image.org=280, loria.fr=369, brown.edu=58}"
+"15803","0.03279323377412976","20723.192495095867","327488611","2855905","[713, 719, 722, 725, 728, 742, 1175, 5241, 37039, 96505, 105798, 144055, 146305, 146632]","0.004893498510683958","0.0","0","0.0","0","0.0","text/csv",,"[{.php, 176}, {.nccsv, 9}, {.shtml, 9}]","14","29","32","[{info, 8425}, {de, 6803}, {gov, 175}, {org, 170}, {uk, 98}, {int, 64}, {edu, 48}, {nl, 9}]","{utwente.nl=9, usp.br=1, stanford.edu=1, metoffice.gov.uk=98, jvolcanica.org=3, noaa.gov=10, senescence.info=20, mpg.de=16, usda.gov=160, qiagen.com=2, getalp.org=2, duke.edu=3, in-the-sky.org=153, bayern.de=6787, nmsu.edu=44, predictioncenter.org=2, lco.global=2, officinae.bio=1, usgs.gov=3, addgene.org=4, esa.int=64, cuni.cz=4, marinespecies.org=3, mitoage.info=8405, neonscience.org=2}"
+"13930","0.028906520690604794","270571.4360373295","3769060104","26077175","[5932, 7161, 9047, 11304, 13378, 18248, 59215, 294168, 324438, 361409, 682771, 1903792, 3647067, 6146467]","0.056319179922877764","0.17946877243359655","549175134","0.008206049340548405","549175134","0.008206049340548405","application/x-tika-msoffice","{length=25}","[{.php, 6469}, {.do, 3818}, {.db, 146}, {.hwp, 47}, {.DB, 25}, {.jsp, 22}, {.mso, 13}, {.APR, 12}, {.gi, 10}, {.pdf, 5}]","27","79","88","[{kr, 10071}, {de, 3161}, {sk, 153}, {edu, 112}, {org, 106}, {ca, 55}, {bg, 54}, {se, 40}, {cn, 30}, {cz, 29}, {tw, 21}, {gov, 19}, {cern, 16}, {ru, 15}, {com, 10}, {no, 8}, {ch, 6}]","{nthu.edu.tw=20, msu.ru=12, cas.cz=29, columbia.edu=36, qcenter.kr=174, postech.ac.kr=162, sav.sk=153, arlis.org=26, bas.bg=54, ps-taiwan.org=24, bayern.de=3130, fz-juelich.de=26, triumf.ca=10, andrews.edu=22, fzu.edu.cn=22, kasi.re.kr=17, utoronto.ca=45, usgs.gov=10, yonsei.ac.kr=2823, root.cern=16, amap.no=8, unist.ac.kr=6892, skku.edu=16, ki.se=32, iodp.org=36}"
+"10992","0.022809797231236748","2133.158751819505","23447681","78430","[694, 750, 857, 938, 1032, 1216, 1671, 2464, 3028, 3655, 4634, 6420, 8704, 10760]","3.503669691050494E-4","0.0","0","0.0","0","0.0","text/x-rsrc",,"[{.R, 10762}, {.r, 230}]","22","60","62","[{com, 5892}, {org, 4664}, {edu, 93}, {fr, 64}, {in, 56}, {uk, 56}, {ch, 49}, {works, 44}, {de, 25}, {ca, 14}, {hu, 8}, {es, 5}]","{zju.edu.cn=4, r-project.org=2447, bioconductor.org=450, ethz.ch=38, wisc.edu=13, mathinsight.org=10, microstatslab.ca=8, dauphine.fr=45, cam.ac.uk=56, rstudio.com=5889, uni-bielefeld.de=20, logarithmic.net=3, uzh.ch=7, icrisat.org=4, bme.hu=8, umich.edu=56, france-bioinformatique.fr=12, jef.works=44, ugr.es=4, personality-project.org=1638, loria.fr=7, kbroman.org=100, nyu.edu=16, unimelb.edu.au=3, edrub.in=56}"
+"10542","0.02187599003017629","13249.037943464238","139671358","7037045","[749, 807, 898, 944, 989, 1440, 2060, 3634, 9184, 17694, 43891, 159270, 165720, 291534]","0.0020870392416736778","0.01897173211914248","13678238","2.043870688430772E-4","13678238","2.043870688430772E-4","text/x-matlab","{length=2}","[{.txt, 2366}, {.m, 1159}, {.tex, 544}, {.Rnw, 439}, {.asis, 263}, {.bib, 195}, {.ltx, 182}, {.twx, 168}, {.sty, 136}, {.t, 85}, {.cls, 78}, {.endnote, 45}, {.ail, 32}, {.pl, 30}, {.input, 26}, {.rev, 20}, {.abc, 15}, {.bst, 15}, {.ly, 15}, {.rnw, 14}, {.M, 10}, {.inc, 9}, {.cnf, 9}, {.pvs, 8}, {.dat, 8}]","41","199","256","[{org, 2681}, {ir, 2349}, {edu, 2212}, {com, 1918}, {pl, 244}, {fr, 203}, {cz, 124}, {de, 120}, {uk, 101}, {info, 62}, {gov, 59}, {ca, 55}, {it, 51}, {nl, 50}, {il, 49}, {br, 27}, {science, 24}, {net, 24}, {ch, 23}, {hu, 21}, {jp, 17}, {ru, 17}, {dk, 15}, {au, 14}, {lu, 13}]","{illinois.edu=48, berkeley.edu=191, termedia.pl=189, r-project.org=218, wisc.edu=115, areeo.ac.ir=2299, mit.edu=104, muni.cz=61, cam.ac.uk=91, rcin.org.pl=45, rstudio.com=446, cvut.cz=54, netlib.org=555, toronto.edu=112, uva.nl=47, sbu.ac.ir=50, artint.info=62, sciarena.com=1445, molvis.org=1144, rutgers.edu=43, univ-grenoble-alpes.fr=91, ico2s.org=521, personality-project.org=52, loria.fr=45, nyu.edu=1145}"
+"10412","0.021606223505425494","1516.6410872070687","15791267","2467","[1210, 1230, 1262, 1300, 1328, 1376, 1492, 1630, 1715, 1771, 1866, 1963, 2031, 2098]","2.3596100429371187E-4","0.0","0","0.0","0","0.0","application/mathml-presentation+xml",,"[{.pmml, 10412}]","1","1","1","[{gov, 10412}]","{nist.gov=10412}"
+"10338","0.021452664099028886","6105.520119945831","63118867","4124526","[916, 1078, 1153, 1194, 1290, 1766, 2361, 2764, 3012, 3221, 3662, 5673, 17413, 110098]","9.431536587406969E-4","0.01934610176049526","2203329","3.292324318431573E-5","2203329","3.292324318431573E-5","text/x-log","{length=2}","[{.LOG, 6509}, {.log, 3829}]","22","63","69","[{ca, 7609}, {org, 2087}, {edu, 242}, {ru, 183}, {it, 67}, {com, 31}, {gov, 24}, {uk, 18}, {hu, 12}, {fr, 11}, {community, 10}, {net, 7}, {br, 6}, {de, 6}, {ch, 5}]","{daylight.com=15, obspm.fr=7, r-project.org=7, noaa.gov=13, nasa.gov=10, wisc.edu=60, ox.ac.uk=11, knopper.net=6, unipi.it=22, uwaterloo.ca=8, beast.community=10, tamu.edu=82, cassavabase.org=63, osgeo.org=737, lowell.edu=8, umb.edu=43, unica.it=45, bme.hu=12, salmobase.org=1205, utoronto.ca=7551, caltech.edu=24, rwgrayprojects.com=11, sao.ru=178, bcgsc.ca=50, personality-project.org=62}"
+"9391","0.01948751872257499","8696.589500585667","81669672","2533490","[852, 853, 951, 1163, 1631, 1869, 2131, 2866, 3405, 3894, 48015, 87291, 93023, 100632]","0.0012203490591007069","0.10648493238206794","12964375","1.9372017109458607E-4","12964375","1.9372017109458607E-4","application/rdf+xml","{length=10}","[{.xml, 1550}, {.rdf, 169}, {.owl, 50}, {.rdfs, 20}, {.xmp, 12}, {.php, 8}, {.rss, 8}, {.txt, 5}]","29","78","83","[{gov, 6591}, {pt, 1136}, {org, 711}, {com, 564}, {pl, 96}, {cz, 61}, {edu, 35}, {se, 33}, {de, 32}, {ch, 32}, {br, 22}, {es, 20}, {be, 18}, {jp, 10}, {io, 6}]","{fernuni-hagen.de=21, noaa.gov=972, cas.cz=61, usal.es=10, naturalsciences.be=18, mit.edu=13, ufpel.edu.br=10, copernicus.org=441, rcin.org.pl=22, crossref.org=10, usda.gov=5450, biolscigroup.com=7, produccioncientificaluz.org=33, nist.gov=169, uzh.ch=25, fieldsites.se=11, uevora.pt=1136, um.es=8, bibsonomy.org=142, ufrj.br=9, mdpi.com=511, umk.pl=74, cidoc-crm.org=63, gu.se=22, franz.com=35}"
+"9313","0.01932565880772451","7442506.444754644","69312062520","25434347","[1864, 2303, 6760, 19768, 66537, 410672, 4563187, 12650395, 19848008, 22410276, 23402499, 23787149, 24052061, 24517294]","1.0356954816790664","35.35917534629013","54057241820","0.8077503261261583","54057241820","0.8077503261261583","model/vnd.valve.source.compiled-map","{length=3293}","[{.bsp, 8021}, {.BSP, 1291}]","2","2","4","[{gov, 9311}]","{nmsu.edu=2, nasa.gov=9311}"
+"9242","0.019178324782668305","1189.2483228738367","10991033","213909","[932, 945, 957, 967, 974, 985, 1012, 1054, 1281, 1715, 1902, 2067, 2159, 2249]","1.6423350861620724E-4","0.0","0","0.0","0","0.0","application/x-endnote-refer",,"[{.do, 1232}, {.abs, 155}, {.enw, 30}, {.txt, 9}, {.btg, 7}, {.ris, 5}]","9","14","14","[{com, 9032}, {edu, 159}, {de, 37}, {net, 7}]","{ucla.edu=2, emorychem.science=1, sciarena.com=14, noaa.gov=1, wisc.edu=157, consc.net=7, xjegi.com=1232, techscience.com=7786, pik-potsdam.de=13, sigops.org=1, lmu.de=24, cncb.ac.cn=2, qmul.ac.uk=1, netlib.org=1}"
+"8659","0.01796852567551665","672.4856218962929","5823053","677","[669, 669, 670, 671, 671, 672, 673, 673, 674, 674, 675, 675, 676, 676]","8.701096840016142E-5","0.0","0","0.0","0","0.0","save/fasta-format",,"[{.php, 8659}]","1","1","1","[{jp, 8659}]","{riken.jp=8659}"
+"8025","0.01665289508557814","16946.100062305297","135992453","2908705","[972, 1237, 1288, 1349, 1354, 1359, 1646, 1715, 1734, 1774, 23802, 199502, 400793, 595427]","0.002032067204376027","0.13707165109034267","8731033","1.3046345902463304E-4","8731033","1.3046345902463304E-4","application/vnd.google-earth.kml+xml","{length=11}","[{.kml, 7016}]","16","42","50","[{gov, 6759}, {org, 832}, {eu, 201}, {jp, 162}, {edu, 49}, {com, 7}]","{earthbyte.org=18, nii.ac.jp=55, noaa.gov=6683, nasa.gov=35, wisc.edu=14, csiro.au=2, deims.org=806, sinica.edu.tw=2, copernicus.eu=201, canada.ca=2, komazawa-u.ac.jp=106, gpsvisualizer.com=3, planet4589.org=2, richmond.edu=2, iqm.tech=1, weather.gov=24, mst.edu=7, colorado.edu=2, caltech.edu=12, usgs.gov=16, mtu.edu=5, bgs.ac.uk=2, usf.edu=3, ucf.edu=2, franz.com=3}"
+"7594","0.015758515299673568","253244.4880168554","1923138642","26218035","[755, 775, 852, 885, 953, 1279, 2797, 3365, 3430, 3464, 5359, 31175, 9475644, 26207949]","0.02873649881053656","0.9349486436660521","1847891922","0.0276121247105351","1847891922","0.0276121247105351","application/x-sh","{length=71}","[{.sh, 5539}, {.py, 560}, {.run, 243}, {.sig, 98}, {.csh, 27}, {.txt, 26}, {.pl, 25}, {.tcl, 20}, {.uu, 20}, {.shar, 14}, {.bash, 14}, {.TXT, 12}, {.7, 10}, {.php, 10}, {.in, 9}, {.sif, 8}, {.6, 7}, {.10, 7}, {.3, 7}, {.8, 7}, {.5, 7}, {.1, 7}, {.9, 7}, {.4, 7}, {.2, 7}]","30","139","170","[{ch, 3425}, {org, 1339}, {gov, 1300}, {edu, 509}, {com, 506}, {io, 172}, {ca, 63}, {jp, 60}, {uk, 52}, {net, 38}, {nl, 27}, {fr, 24}, {it, 15}, {ru, 11}, {at, 11}, {cz, 6}, {de, 6}, {int, 5}]","{drive5.com=33, scm.com=371, daylight.com=39, berkeley.edu=76, nasa.gov=1251, noaa.gov=29, wisc.edu=115, consc.net=22, ucsb.edu=47, columbia.edu=21, kek.jp=37, sfepy.org=40, github.io=19, netlib.org=162, uva.nl=18, stsci.edu=18, ioccc.org=825, scikit-learn.org=69, caltech.edu=83, hutton.ac.uk=44, cern.ch=3420, isthe.com=22, bcgsc.ca=48, readthedocs.io=153, gnu.org=103}"
+"7424","0.015405743690384062","3876.4919181034484","28779076","813381","[734, 773, 890, 1036, 1185, 1447, 2215, 3456, 4521, 5446, 7974, 16979, 32698, 46690]","4.30031337929063E-4","0.0","0","0.0","0","0.0","text/x-fortran",,"[{.f, 6694}, {.f90, 527}, {.F, 99}, {.for, 89}, {.F90, 7}]","21","52","63","[{org, 5855}, {gov, 656}, {jp, 317}, {edu, 305}, {br, 114}, {ca, 48}, {uk, 42}, {cz, 32}, {pl, 14}, {net, 12}, {de, 9}, {it, 5}]","{hitran.org=70, llnl.gov=19, usp.br=11, nasa.gov=31, noaa.gov=19, utexas.edu=46, wisc.edu=44, ox.ac.uk=41, mit.edu=16, muni.cz=32, kek.jp=6, nist.gov=586, uni-bielefeld.de=5, netlib.org=5696, uj.edu.pl=13, ucsd.edu=32, kyoto-u.ac.jp=60, lowell.edu=119, euroforth.org=5, lk.net=8, utoronto.ca=47, lamost.org=79, nyu.edu=35, gsj.jp=251, ufrgs.br=103}"
+"6465","0.013415696788568557","2974.6972931167825","19231418","1048499","[714, 740, 776, 871, 964, 1218, 2060, 3112, 3547, 4153, 5993, 9527, 15414, 22932]","2.873654599895099E-4","0.0","0","0.0","0","0.0","text/x-csrc",,"[{.c, 4551}, {.orl, 1201}, {.cpp, 302}, {.cc, 158}, {.m, 80}, {.C, 54}, {.h, 36}, {.txt, 25}, {.php, 10}, {.sqc, 9}, {.cxx, 7}]","25","109","125","[{edu, 2939}, {org, 2180}, {com, 359}, {tw, 197}, {br, 164}, {jp, 163}, {net, 84}, {dk, 71}, {gov, 62}, {ca, 49}, {nl, 32}, {fr, 29}, {de, 25}, {cz, 20}, {cern, 19}, {uk, 16}, {ch, 15}, {in, 15}, {it, 14}]","{daylight.com=55, usp.br=164, cmu.edu=152, stanford.edu=35, boost.org=149, wisc.edu=1042, knopper.net=84, au.dk=71, mit.edu=56, columbia.edu=137, nist.gov=38, netlib.org=862, toronto.edu=39, uva.nl=32, ucsd.edu=31, zitogiuseppe.com=188, harvard.edu=1209, ioccc.org=933, euroforth.org=152, maine.edu=58, ksu.edu=38, ntu.edu.tw=197, isthe.com=32, swtch.com=31, gsj.jp=161}"
+"6375","0.013228935348356466","3439072.769882353","21924088908","26223344","[13132, 19075, 32442, 92102, 161593, 476573, 1737784, 4300279, 6435404, 8358684, 12926923, 22666087, 26209116, 26220734]","0.3276006945456821","1.7098039215686274","2788786731","0.04167144522397967","2772069532","0.04142164848094329","application/epub+zip","{length=106, disconnect=3}","[{.epub, 3634}]","25","62","64","[{org, 2581}, {com, 1210}, {ec, 1056}, {it, 711}, {br, 389}, {es, 205}, {gov, 73}, {eu, 52}, {ch, 22}, {edu, 17}, {cl, 11}, {net, 9}, {ca, 8}, {cern, 6}, {de, 6}, {fr, 6}]","{r-project.org=36, nasa.gov=15, swsc-journal.org=544, usal.es=172, kmae-journal.org=415, ed.gov=7, ufpel.edu.br=378, rstudio.com=18, produccioncientificaluz.org=279, sissa.it=709, ercim.eu=52, europhysicsnews.org=45, textualvisualmedia.com=51, um.es=27, ufrj.br=11, ucuenca.edu.ec=1056, sfn.org=13, openbioinformaticsjournal.com=114, uautonoma.cl=10, techscience.com=1021, fourmilab.ch=22, cancer.gov=34, globe.gov=14, frontiersin.org=31, aanda.org=1216}"
+"5429","0.011265865099016038","2650.2096150303923","14387988","57752","[1094, 1118, 1182, 1232, 1280, 1411, 1972, 3003, 3827, 4723, 6099, 8794, 10974, 14621]","2.149925080898116E-4","0.0","0","0.0","0","0.0","text/x-c++hdr",,"[{.hpp, 5375}, {.H, 32}, {.hh, 20}]","4","8","8","[{org, 5376}, {edu, 33}, {de, 16}]","{uni-freiburg.de=16, ucla.edu=3, illinois.edu=6, root.cern=4, boost.org=5374, wisc.edu=24, iodp.org=1, netlib.org=1}"
+"5291","0.0109794975573575","1991.8380268380267","10538815","62265","[1575, 1595, 1626, 1658, 1686, 1732, 1882, 2091, 2221, 2340, 2542, 2869, 3115, 3402]","1.574762412329318E-4","0.0","0","0.0","0","0.0","text/turtle",,"[{.ttl, 15}]","4","8","8","[{gov, 5272}, {se, 11}, {org, 7}]","{fieldsites.se=11, rdf4j.org=1, usda.gov=5272, obofoundry.org=2, cidoc-crm.org=1, getalp.org=2, stoa.org=1, bgs.ac.uk=1}"
+"4780","0.009919107602375514","2746.315690376569","13127389","557809","[805, 926, 1118, 1223, 1291, 1405, 1803, 2469, 3091, 3470, 4516, 6427, 8630, 14256]","1.961560077601263E-4","0.0","0","0.0","0","0.0","text/x-c++src",,"[{.cpp, 4370}, {.cc, 237}, {.C, 161}, {.cxx, 7}]","20","55","62","[{org, 4208}, {edu, 244}, {com, 98}, {ch, 67}, {de, 50}, {cern, 33}, {cz, 17}, {net, 14}, {fr, 14}, {tw, 7}, {dk, 6}]","{illinois.edu=9, stanford.edu=8, berkeley.edu=6, boost.org=4187, wisc.edu=82, princeton.edu=15, knopper.net=14, au.dk=6, mit.edu=5, software-lab.org=10, lemoyne.edu=63, hawaii.edu=6, toronto.edu=13, uni-freiburg.de=43, ucla.edu=13, zitogiuseppe.com=90, stroustrup.com=4, algorithmicbotany.org=5, cern.ch=67, uni-heidelberg.de=6, root.cern=33, ntu.edu.tw=7, cuni.cz=15, univ-grenoble-alpes.fr=12, nyu.edu=7}"
+"4172","0.008657430317387164","318408.15987535956","1328398843","26220868","[10525, 11226, 12514, 14556, 16837, 23151, 60945, 152633, 325027, 561760, 1397990, 2913773, 4800934, 8218191]","0.019849599471460075","0.023969319271332695","26220868","3.9180531535138076E-4","26220868","3.9180531535138076E-4","application/vnd.openxmlformats-officedocument.wordprocessingml.document","{length=1}","[{.do, 417}, {.shtml, 177}, {.aspx, 42}, {.php, 27}, {.pdf, 14}]","35","101","111","[{br, 835}, {se, 626}, {com, 491}, {kr, 416}, {edu, 383}, {nl, 220}, {org, 211}, {hr, 128}, {jp, 126}, {ch, 97}, {de, 93}, {gov, 90}, {au, 65}, {int, 65}, {fr, 43}, {eus, 42}, {no, 40}, {fi, 39}, {uk, 32}, {es, 22}, {id, 22}, {tr, 20}, {cz, 15}, {cat, 13}, {net, 10}]","{utwente.nl=177, ans.org=53, ntnu.edu=53, jvolcanica.org=37, ufpel.edu.br=575, fhnw.ch=84, columbia.edu=115, ehu.eus=42, ti.com=436, stuk.fi=37, ufla.br=251, uva.nl=42, irb.hr=128, liu.se=45, fz-juelich.de=58, unav.edu=55, chalmers.se=52, osaka-u.ac.jp=126, ansto.gov.au=65, keene.edu=41, bipm.org=115, yonsei.ac.kr=348, ki.se=529, globe.gov=72, cbd.int=41}"
+"3210","0.006661158034231255","50389.08348909657","161748958","14399877","[1037, 1142, 1324, 1514, 1680, 1932, 6049, 21143, 33701, 44772, 59988, 86775, 121909, 263443]","0.002416933775683827","0.2803738317757009","106459131","0.001590765545728271","106459131","0.001590765545728271","application/mbox","{length=9}","[{.txt, 489}, {.b, 410}, {.d, 402}, {.c, 398}, {.e, 376}, {.a, 318}, {.patch, 49}, {.f, 38}, {.rxte, 35}, {.html, 20}, {.mm, 17}, {.uu, 13}, {.feb, 8}, {.dec, 8}, {.jul, 8}, {.apr, 8}, {.nov, 8}, {.aug, 8}, {.jun, 8}, {.200203, 6}, {.shar, 6}, {.200207, 6}, {.may, 6}, {.jan, 6}, {.200112, 6}]","11","28","32","[{edu, 2838}, {gov, 191}, {org, 109}, {ca, 35}, {uk, 9}, {de, 9}, {ru, 7}]","{usp.br=4, berkeley.edu=4, miketaylor.org.uk=3, msu.ru=7, nasa.gov=191, wisc.edu=2379, centos.org=49, povray.org=2, dbaron.org=1, mit.edu=1, uwaterloo.ca=14, cam.ac.uk=6, cimat.mx=4, mpg.de=9, hawaii.edu=34, netlib.org=51, toronto.edu=1, stsci.edu=2, lowell.edu=413, purdue.edu=1, euroforth.org=5, unica.it=3, rice.edu=2, utoronto.ca=20, realclimate.org=1}"
+"2974","0.0061714280354528825","439093.8880295898","1305865223","12486114","[900, 930, 1045, 1288, 1588, 2552, 9889, 224550, 440574, 1306633, 3128594, 4585857, 5540255, 7076262]","0.019512890858682335","3.160726294552791","454591103","0.006792727474424074","454591103","0.006792727474424074","text/tab-separated-values","{length=94}","[{.tsv, 2724}]","13","44","47","[{org, 2704}, {com, 57}, {ca, 53}, {edu, 53}, {jp, 31}, {de, 22}, {gov, 19}, {es, 17}, {fr, 8}]","{metabolic-economics.de=9, kazusa.or.jp=4, illumina.com=54, uni-jena.de=10, noaa.gov=15, bioconductor.org=7, expasy.org=23, mit.edu=10, cnio.es=15, obofoundry.org=12, proteinatlas.org=240, lemoyne.edu=4, riken.jp=26, hawaii.edu=6, planet4589.org=449, ucsc.edu=26, salmobase.org=879, ergoso.me=4, peptideatlas.org=121, interactome-atlas.org=9, usgs.gov=3, france-bioinformatique.fr=8, bindingdb.org=95, bcgsc.ca=53, iodp.org=856}"
+"2720","0.0056443457486320915","30722.500735294117","83565202","2960738","[837, 839, 842, 1414, 5258, 6078, 10121, 18643, 29003, 42895, 87058, 231590, 430396, 881877]","0.0012486730157831478","0.0","0","0.0","0","0.0","application/vnd.apple.installer+xml",,"[{.dist, 2393}]","3","4","4","[{org, 2717}]","{usp.br=1, salmobase.org=2390, freedesktop.org=327, bcgsc.ca=2}"
+"2528","0.005245921342846297","4091.9319620253164","10344404","295536","[838, 841, 879, 882, 980, 1033, 1499, 3400, 3778, 4490, 5793, 7025, 143014, 158273]","1.5457125490056563E-4","0.0","0","0.0","0","0.0","application/javascript",,"[{.mem, 28}, {.php, 13}]","7","9","10","[{uk, 2373}, {edu, 102}, {fr, 28}, {org, 19}]","{swri.edu=101, stsci.edu=1, hixie.ch=2, kwarc.info=3, u-bordeaux.fr=28, glbrc.org=6, ox.ac.uk=2373, genetic.edu.ph=1, in-the-sky.org=13}"
+"2409","0.004998981216343643","479613.6762141968","1155389346","26222682","[1494, 1538, 1679, 3558, 6553, 15488, 33378, 168119, 367810, 465803, 1308718, 7097629, 11446025, 18327302]","0.017264405093803743","0.29057700290577004","183228915","0.0027378980292744825","183228915","0.0027378980292744825","application/vnd.google-earth.kmz","{length=7}","[{.kmz, 2347}]","16","48","60","[{org, 1317}, {gov, 763}, {edu, 177}, {es, 62}, {jp, 36}, {ca, 28}, {au, 6}, {ch, 6}]","{earthbyte.org=13, broermapsonline.org=37, nii.ac.jp=13, quantum.info=4, nasa.gov=561, noaa.gov=140, wisc.edu=17, csiro.au=3, wsl.ch=6, komazawa-u.ac.jp=11, occultations.org=7, in-the-sky.org=1240, ucsd.edu=3, gva.es=60, tos.org=3, agr.gc.ca=28, colorado.edu=72, ms.gov=8, caltech.edu=4, usgs.gov=52, fnai.org=3, nmt.edu=73, unavco.org=7, idahogeology.org=3, gsj.jp=9}"
+"2340","0.004855797445514373","2070.9837606837605","4846102","21139","[770, 797, 857, 945, 997, 1123, 1494, 2127, 3069, 3769, 5963, 8500, 9590, 9770]","7.24128782592154E-5","0.0","0","0.0","0","0.0","application/java-vm",,"[{.class, 2334}, {.sav, 6}]","9","22","23","[{com, 1147}, {edu, 959}, {net, 110}, {gov, 59}, {org, 46}, {dk, 6}, {ca, 5}, {at, 5}]","{ucla.edu=18, zitogiuseppe.com=1136, park.org=6, fu-berlin.de=1, triumf.ca=3, umb.edu=102, euroforth.org=31, wolfk-wk.de=2, princeton.edu=59, wisc.edu=25, perisic.com=8, knopper.net=110, au.dk=6, metamath.org=9, rwgrayprojects.com=3, jku.at=5, bcgsc.ca=2, lemoyne.edu=746, nist.gov=59, uml.edu=4, toronto.edu=4, brown.edu=1}"
+"2279","0.0047292146915928444","2056864.8674857393","4687595033","26222911","[736, 790, 903, 1256, 1643, 4694, 13510, 111212, 488151, 2731761, 24434984, 26172015, 26210549, 26221392]","0.0700443879334632","7.810443176831944","4069460360","0.060807910693869614","4069460360","0.060807910693869614","text/troff","{length=178}","[{.map, 832}, {.dat, 271}, {.bb, 255}, {.py, 104}, {.tr, 96}, {.t, 67}, {.1, 65}, {.ms, 49}, {.5, 43}, {.bigwig, 42}, {.man, 37}, {.2bit, 27}, {.me, 27}, {.2, 26}, {.3, 22}, {.6, 17}, {.4, 16}, {.bw, 14}, {.troff, 11}, {.fasta, 11}, {.fa, 11}, {.8, 10}, {.bild, 8}, {.groups, 7}, {.bmrk, 7}]","18","47","55","[{pl, 1086}, {edu, 605}, {ca, 132}, {org, 118}, {uk, 107}, {gov, 77}, {nl, 77}, {at, 40}, {info, 10}, {io, 6}, {net, 5}, {dk, 5}]","{berkeley.edu=34, stanford.edu=6, metoffice.gov.uk=97, noaa.gov=53, nasa.gov=16, ucsf.edu=8, wisc.edu=146, astrouw.edu.pl=1086, nist.gov=7, statsmodels.org=14, github.io=6, uva.nl=77, toronto.edu=65, netlib.org=56, ucsc.edu=281, harvard.edu=10, scikit-learn.org=13, colorado.edu=13, utoronto.ca=39, peptideatlas.org=19, kwarc.info=10, bcgsc.ca=92, qmul.ac.uk=9, uibk.ac.at=40, nyu.edu=36}"
+"2228","0.0046233832088059925","298451.8711849192","664950769","21220246","[919, 3657, 6460, 7786, 9103, 13466, 89252, 207438, 317524, 514954, 1114618, 1864764, 4618921, 8639715]","0.009936026745613004","0.0","0","0.0","0","0.0","application/msword",,"[{.do, 92}, {.dot, 65}, {.shtml, 34}, {.jsp, 13}, {.php, 11}, {.aspx, 10}, {.pdf, 10}]","30","77","82","[{com, 865}, {br, 426}, {edu, 262}, {org, 107}, {kr, 89}, {nl, 83}, {hr, 61}, {ca, 49}, {jp, 36}, {se, 36}, {gov, 31}, {cz, 26}, {de, 24}, {fr, 17}, {cn, 17}, {int, 16}, {es, 16}, {no, 15}, {uk, 12}, {eus, 10}, {ro, 9}, {ch, 6}]","{utwente.nl=34, ans.org=19, ntnu.edu=20, davidson.edu=11, ufpel.edu.br=199, columbia.edu=131, yale.edu=12, onf.fr=17, ti.com=853, lemoyne.edu=36, eartharxiv.org=13, ufla.br=225, uva.nl=49, irb.hr=61, ntnu.no=15, unav.edu=21, fz-juelich.de=18, fzu.edu.cn=13, osaka-u.ac.jp=32, bipm.org=72, yonsei.ac.kr=86, ki.se=24, globe.gov=22, cuni.cz=20, scholaris.ca=49}"
+"2007","0.004164780116729635","2454.875934230194","4926936","426811","[699, 728, 821, 897, 980, 1178, 1704, 2303, 2744, 3022, 3699, 5607, 10950, 33288]","7.362074028960714E-5","0.0","0","0.0","0","0.0","text/x-chdr",,"[{.h, 1570}, {.hpp, 404}, {.c, 14}]","17","60","67","[{edu, 1103}, {org, 568}, {com, 125}, {net, 79}, {cern, 44}, {ca, 19}, {gov, 16}, {it, 12}, {de, 9}, {ch, 9}, {fr, 8}, {cz, 8}]","{cmu.edu=13, berkeley.edu=16, stanford.edu=9, boost.org=403, wisc.edu=768, princeton.edu=17, knopper.net=79, ucdavis.edu=16, unipi.it=12, muni.cz=8, worrydream.com=7, columbia.edu=113, lemoyne.edu=26, nist.gov=14, netlib.org=116, toronto.edu=17, zitogiuseppe.com=92, lowell.edu=52, ioccc.org=33, umb.edu=16, maine.edu=7, root.cern=44, swtch.com=10, bcgsc.ca=16, nyu.edu=17}"
+"1841","0.0038203090158940006","169505.46713742532","312059565","26206946","[7209, 8145, 8892, 9155, 10362, 18744, 36938, 66449, 99999, 149317, 504918, 974205, 2208764, 5198801]","0.004662949993617286","0.16295491580662683","78597355","0.0011744409628834341","78597355","0.0011744409628834341","application/vnd.oasis.opendocument.text","{length=3}","[{.odt, 1804}, {.php, 14}]","30","94","98","[{org, 1451}, {tw, 87}, {it, 63}, {br, 35}, {pl, 27}, {uk, 27}, {edu, 23}, {fr, 22}, {de, 18}, {ru, 16}, {es, 15}, {sk, 11}, {com, 8}, {lt, 6}, {pf, 5}]","{upjs.sk=11, gobiernodecanarias.org=1411, nycu.edu.tw=28, nuk.edu.tw=9, nthu.edu.tw=8, ukri.org=10, nccu.edu.tw=4, meteo.lt=6, service-public.pf=5, uw.edu.pl=20, dcc.ac.uk=5, rd-alliance.org=5, southampton.ac.uk=12, mapress.com=5, ncl.ac.uk=7, ouvrirlascience.fr=12, unizar.es=14, ntu.edu.tw=26, scienceeurope.org=8, cidoc-crm.org=8, unict.it=51, www.gov.br=29, nchu.edu.tw=8, sao.ru=12, brown.edu=14}"
+"1780","0.003693726261972472","57347.065730337075","102077777","2464259","[10514, 11563, 12645, 14182, 14962, 19180, 34920, 39843, 42430, 48848, 89618, 476938, 888582, 1259877]","0.0015252971643750666","0.056179775280898875","758743","1.1337517140380428E-5","758743","1.1337517140380428E-5","application/vnd.ms-word2006ml","{length=1}","[{.xml, 1768}]","2","3","3","[{hr, 1767}, {org, 13}]","{produccioncientificaluz.org=12, oecd-nea.org=1, istra-istria.hr=1767}"
+"1779","0.003691651134859004","1615.8426082068577","2874584","18016","[891, 1087, 1296, 1493, 1500, 1509, 1520, 1534, 1554, 1610, 1860, 3007, 5135, 7604]","4.2953470900506936E-5","0.0","0","0.0","0","0.0","text/x-rst",,"[{.rst, 1778}]","8","11","12","[{dev, 1578}, {org, 123}, {io, 44}, {de, 15}, {com, 8}, {net, 7}]","{earthbyte.org=19, gallantlab.org=1, usp.br=1, jax.dev=1578, boost.org=103, rub.de=15, deepmodeling.com=8, readthedocs.io=6, ucsf.edu=3, consc.net=7, github.io=38}"
+"1775","0.0036833506264051332","347107.0033802817","616114931","23772216","[8799, 8889, 9306, 9676, 10144, 11027, 21645, 81529, 202464, 445585, 1305646, 3863967, 9565662, 12885467]","0.009206297245123587","0.28169014084507044","63425231","9.47731502755331E-4","62580671","9.351116651079593E-4","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet","{length=4, disconnect=1}","[{.do, 741}, {.shtml, 30}, {.php, 11}]","25","64","70","[{kr, 741}, {org, 256}, {com, 125}, {hr, 122}, {se, 114}, {es, 99}, {jp, 89}, {edu, 62}, {nl, 32}, {fi, 26}, {no, 26}, {gov, 24}, {de, 10}, {int, 10}, {br, 9}, {uk, 7}, {fr, 7}, {cat, 6}]","{utwente.nl=30, maanmittauslaitos.fi=8, cis.es=53, ntnu.edu=9, jvolcanica.org=69, columbia.edu=13, yale.edu=15, qiagen.com=50, openrepository.com=33, stuk.fi=18, nist.gov=9, irb.hr=122, ntnu.no=25, gva.es=44, fz-juelich.de=8, unav.edu=7, osaka-u.ac.jp=89, mapress.com=37, keene.edu=10, bipm.org=27, esa.int=10, unist.ac.kr=739, haematologica.org=141, ki.se=107, phyphox.org=9}"
+"1740","0.0036107211774337646","1259.971264367816","2192350","8166","[903, 910, 974, 1010, 1022, 1072, 1165, 1251, 1335, 1440, 1792, 2313, 3762, 6676]","3.275918947879985E-5","0.0","0","0.0","0","0.0","text/x-fasta",,,"1","1","1","[{org, 1740}]","{rcsb.org=1740}"
+"1651","0.003426034864335141","1054.9400363416112","1741706","1545","[802, 916, 921, 924, 926, 929, 937, 1202, 1330, 1449, 1499, 1508, 1516, 1522]","2.6025441590239956E-5","0.0","0","0.0","0","0.0","application/pgp-signature",,"[{.sig, 1650}]","3","5","5","[{org, 1631}, {ch, 14}, {com, 6}]","{fourmilab.ch=14, gnu.org=1526, nfshost.com=5, euroforth.org=105, mapsmarker.com=1}"
+"1629","0.003380382067838852","27827.468385512584","45330946","18090807","[754, 822, 1031, 1346, 1598, 2569, 4707, 5094, 11611, 43759, 45744, 82432, 456439, 496474]","6.773576524128191E-4","0.061387354205033766","18090807","2.703218803281404E-4","18090807","2.703218803281404E-4","text/x-perl","{length=1}","[{.pl, 588}, {.cgi, 410}, {.txt, 37}, {.pm, 26}, {.perl, 8}, {.rpar, 6}]","20","64","67","[{edu, 811}, {nl, 449}, {org, 148}, {com, 47}, {jp, 42}, {gov, 36}, {ru, 33}, {uk, 12}, {ch, 10}, {info, 9}, {ca, 7}]","{mrob.com=18, daylight.com=13, einsteintoolkit.org=6, berkeley.edu=62, miketaylor.org.uk=12, brandeis.edu=134, msu.ru=33, upenn.edu=6, ethz.ch=5, senescence.info=9, dbaron.org=4, astrouw.edu.pl=4, nist.gov=35, uva.nl=141, zitogiuseppe.com=8, park.org=66, harvard.edu=484, jirka.org=26, colorado.edu=50, caltech.edu=55, washington.edu=4, caida.org=23, vu.nl=308, gsj.jp=42, squid-cache.org=7}"
+"1629","0.003380382067838852","4540.487415592388","7396454","172995","[894, 1132, 1417, 1628, 1788, 2453, 3322, 4280, 5282, 5973, 7136, 32082, 34028, 35562]","1.1052151255831736E-4","0.0","0","0.0","0","0.0","application/x-subrip",,"[{.srt, 1626}]","5","9","10","[{org, 1225}, {gov, 297}, {ca, 88}, {ch, 18}]","{eso.org=111, harvard.edu=1, hixie.ch=18, esahubble.org=1110, freedesktop.org=4, nasa.gov=290, goes-r.gov=6, noaa.gov=1, uwaterloo.ca=88}"
+"1579","0.003276625712165468","3176.9347688410385","5016380","69444","[892, 977, 1057, 1160, 1258, 1478, 2003, 3134, 4192, 5403, 8387, 15244, 23458, 38197]","7.495725724344288E-5","0.0","0","0.0","0","0.0","text/x-prolog",,"[{.pro, 1577}]","10","20","31","[{edu, 1167}, {gov, 157}, {es, 101}, {org, 93}, {au, 52}]","{ucla.edu=8, stsci.edu=3, uni-freiburg.de=2, hitran.org=93, uni.lu=2, harvard.edu=304, nmsu.edu=229, lowell.edu=552, nasa.gov=116, noaa.gov=31, csiro.au=52, wisc.edu=13, colorado.edu=7, ox.ac.uk=1, caltech.edu=50, utoronto.ca=2, caha.es=101, lbl.gov=10, sao.ru=2, nyu.edu=1}"
+"1514","0.003141742449790069","1281436.4511228534","1940094787","26212511","[8209, 8589, 9645, 11178, 12131, 15171, 53128, 471334, 1790527, 3134836, 7159792, 15859936, 24144492, 26115888]","0.028989866004134755","1.453104359313078","458551266","0.006851902209335896","458551266","0.006851902209335896","application/x-tika-ooxml","{length=22}","[{.php, 619}, {.do, 314}, {.3mf, 58}, {.dot, 33}, {.jsp, 33}, {.pdf, 24}, {.asp, 20}, {.html, 8}, {.mlx, 7}]","27","63","68","[{kr, 675}, {org, 255}, {sk, 137}, {edu, 93}, {fr, 70}, {de, 64}, {cn, 44}, {com, 40}, {bg, 34}, {ca, 29}, {uk, 20}, {tw, 13}, {cz, 9}, {net, 8}]","{ucas.ac.cn=10, antarea.fr=69, nthu.edu.tw=11, cas.cz=9, upenn.edu=4, ufpel.edu.br=4, ices.dk=3, page-meeting.org=4, yale.edu=8, qcenter.kr=11, sav.sk=137, duke.edu=66, bas.bg=34, eartharxiv.org=222, ps-taiwan.org=13, rocketpy.org=5, fz-juelich.de=4, fzu.edu.cn=33, uni-potsdam.de=59, techscience.com=33, yonsei.ac.kr=314, unist.ac.kr=350, irsrm.net=8, wherwellprimary.co.uk=20, scholaris.ca=29}"
+"1505","0.0031230663057688597","5176887.134883721","7791215138","12397106","[157533, 163259, 169546, 188343, 274170, 577287, 3947902, 10747970, 11286234, 11427487, 11786714, 12042584, 12151858, 12180053]","0.11642023079154136","16.677740863787374","45922048","6.861901939409956E-4","45922048","6.861901939409956E-4","chemical/x-cache","{length=251}","[{.cache, 1505}]","1","1","1","[{org, 1505}]","{earthbyte.org=1505}"
+"1471","0.0030525119839109583","3413.9775662814413","5021961","81695","[717, 796, 891, 1115, 1210, 1495, 2470, 4053, 5242, 6224, 8126, 11373, 15686, 23466]","7.504065133493428E-5","0.0","0","0.0","0","0.0","text/x-web-markdown",,"[{.md, 1470}]","22","77","81","[{org, 650}, {edu, 482}, {dev, 81}, {es, 75}, {com, 60}, {io, 46}, {ca, 21}, {ch, 15}, {fr, 11}, {info, 7}]","{jax.dev=81, xianghuang.org=5, r-project.org=12, upenn.edu=221, cnio.es=74, rstudio.com=18, data8.org=27, networkx.org=16, deepmodeling.com=15, cassavabase.org=6, github.io=46, pelagios.org=15, ucla.edu=244, osgeo.org=22, gallantlab.org=5, ioccc.org=438, theoryandpractice.org=72, cern.ch=5, ubc.ca=5, france-bioinformatique.fr=10, kwarc.info=7, bcgsc.ca=11, personality-project.org=6, wolfram.com=11, patientensicht.ch=10}"
+"1432","0.002971582026485719","44064.71997206704","63100679","2139388","[1420, 1619, 2074, 2610, 3334, 6031, 20840, 47243, 76740, 101898, 172936, 275798, 339449, 418238]","9.428818845539839E-4","0.0","0","0.0","0","0.0","application/x-dvi",,"[{.dvi, 1365}, {.DVI, 65}]","24","53","57","[{edu, 255}, {org, 179}, {de, 144}, {au, 137}, {uk, 135}, {io, 105}, {ru, 95}, {il, 83}, {it, 75}, {hu, 50}, {fr, 42}, {br, 37}, {ca, 25}, {pl, 25}, {mx, 12}, {gov, 7}, {net, 6}]","{usp.br=37, berkeley.edu=34, wisc.edu=48, mit.edu=56, unipi.it=60, uwaterloo.ca=25, uni-bayreuth.de=12, cam.ac.uk=101, cimat.mx=12, usyd.edu.au=133, columbia.edu=73, u-bordeaux.fr=12, bgu.ac.il=81, lmu.de=125, github.io=105, netlib.org=168, uwb.edu.pl=25, aimath.org=9, bme.hu=50, unica.it=15, fau.edu=13, mccme.ru=90, univ-grenoble-alpes.fr=26, qmul.ac.uk=33, nyu.edu=10}"
+"1432","0.002971582026485719","3723.0733240223462","5331441","296817","[858, 990, 2072, 2389, 2984, 3110, 3243, 3302, 3331, 3359, 3393, 3623, 5092, 25493]","7.966505617900525E-5","0.0","0","0.0","0","0.0","text/x-config",,"[{.cfg, 1361}, {.conf, 60}, {.config, 11}]","15","36","39","[{gov, 1303}, {edu, 54}, {org, 50}, {fr, 6}]","{lirmm.fr=3, jaist.ac.jp=1, stanford.edu=6, nasa.gov=1302, ucsf.edu=1, wisc.edu=3, consc.net=2, centos.org=4, knopper.net=1, usyd.edu.au=2, ucla.edu=1, zitogiuseppe.com=4, ioccc.org=3, bellard.org=9, euroforth.org=17, maine.edu=3, jhu.edu=2, washington.edu=33, dsky.co=1, cuni.cz=1, caida.org=11, bcgsc.ca=2, nrao.edu=3, ico2s.org=3, loria.fr=3}"
+"1250","0.002593908891834601","818.3624","1022953","833","[812, 813, 814, 815, 815, 816, 818, 819, 821, 823, 827, 829, 830, 831]","1.528547501763256E-5","0.0","0","0.0","0","0.0","application/x-netcdf",,"[{.cdf, 1250}]","1","1","1","[{edu, 1250}]","{caltech.edu=1250}"
+"1035","0.0021477565624390496","270114.51400966186","279568522","1039849","[69033, 101008, 186768, 197033, 202375, 244727, 255884, 263050, 264817, 268981, 275502, 1027365, 1036634, 1038255]","0.004177452589461548","0.0","0","0.0","0","0.0","application/x-compress",,"[{.03D_Z, 376}, {.02D_Z, 368}, {.04D_Z, 251}]","4","4","4","[{fr, 995}, {org, 37}]","{obspm.fr=995, osgeo.org=37, cwi.nl=2, wisc.edu=1}"
+"1026","0.0021290804184178405","928.7153996101365","952862","3838","[874, 876, 879, 882, 883, 885, 890, 911, 942, 986, 1070, 1377, 1655, 1817]","1.4238140262799363E-5","0.0","0","0.0","0","0.0","text/prs.lines.tag",,"[{.dsc, 1026}]","4","4","4","[{net, 927}, {fr, 86}, {edu, 12}]","{bme.hu=1, univ-grenoble-alpes.fr=86, wisc.edu=12, knopper.net=927}"
+"885","0.0018364874954188975","230711.3525423729","204179547","16141131","[1369, 1568, 2153, 2812, 2963, 3529, 7053, 35247, 93037, 255283, 806718, 2526603, 4846701, 11633215]","0.0030509528441482973","0.3389830508474576","36497205","5.453594791167474E-4","36497205","5.453594791167474E-4","application/mathematica","{length=3}","[{.nb, 461}, {.cdf, 205}, {.v, 88}, {.pi, 34}, {.nbp, 26}, {.pv, 15}, {.m, 13}, {.ma, 9}, {.thy, 9}, {.txt, 7}]","21","60","69","[{org, 368}, {edu, 213}, {cz, 88}, {de, 50}, {lu, 34}, {io, 22}, {com, 19}, {fr, 18}, {ru, 17}, {nl, 13}, {ca, 10}, {net, 10}, {il, 7}]","{illinois.edu=11, berkeley.edu=6, ac.ru=6, upenn.edu=6, ucdavis.edu=8, muni.cz=26, andrej.com=7, uwaterloo.ca=9, jasss.org=7, mpg.de=6, uchicago.edu=48, cvut.cz=62, uni-saarland.de=20, github.io=22, weizmann.ac.il=6, toronto.edu=16, uva.nl=13, ucsd.edu=93, pirsa.org=7, uni.lu=34, abstractmath.org=343, uni-wuerzburg.de=21, loria.fr=18, wolfram.com=10, spintwo.net=9}"
+"839","0.0017410316481993842","153935.31823599522","129151732","21128129","[3936, 3998, 4154, 4338, 4673, 6134, 30927, 66655, 88906, 115675, 178452, 660357, 2041193, 5520960]","0.001929849732069778","0.0","0","0.0","0","0.0","application/vnd.oasis.opendocument.spreadsheet",,"[{.ods, 819}]","20","51","52","[{org, 500}, {es, 108}, {si, 73}, {de, 33}, {be, 32}, {br, 20}, {edu, 20}, {it, 17}, {ch, 14}, {fr, 10}]","{usp.br=6, gobiernodecanarias.org=340, jvolcanica.org=2, ukri.org=156, noaa.gov=2, uni-lj.si=73, upenn.edu=2, dauphine.fr=3, mpg.de=4, sav.sk=2, unige.it=13, oma.be=32, hawaii.edu=12, gva.es=9, nmsu.edu=3, ouvrirlascience.fr=2, tu-darmstadt.de=3, fourmilab.ch=3, www.gov.br=11, ifrj.edu.br=3, uni-wuerzburg.de=3, ugr.es=99, uni-greifswald.de=19, kit.edu=1, patientensicht.ch=10}"
+"782","0.0016227494027317264","7863550.5971867","6149296567","26217640","[35416, 65629, 126213, 268632, 487040, 1183133, 4130742, 11865025, 20900363, 24271100, 25921314, 26141152, 26177320, 26182863]","0.09188586284110038","13.29923273657289","2509108949","0.037492360016987865","2492781903","0.03724839313508348","application/vnd.openxmlformats-officedocument.presentationml.presentation","{length=100, disconnect=4}","[{.shtml, 88}, {.php, 17}, {.do, 12}]","23","45","47","[{gov, 154}, {int, 152}, {org, 129}, {nl, 93}, {edu, 65}, {no, 39}, {se, 36}, {com, 25}, {de, 22}, {hr, 13}, {jp, 12}, {kr, 12}, {at, 12}, {uk, 8}]","{utwente.nl=88, ans.org=65, ntnu.edu=27, tuwien.at=12, tu-braunschweig.de=4, ed.gov=40, columbia.edu=12, yale.edu=7, openrepository.com=3, ti.com=21, uva.nl=5, irb.hr=13, liu.se=8, ntnu.no=39, fz-juelich.de=18, osaka-u.ac.jp=12, rss.org.uk=4, bipm.org=62, esa.int=152, unist.ac.kr=11, cancer.gov=8, globe.gov=106, ki.se=28, bgs.ac.uk=3, nrao.edu=11}"
+"734","0.0015231433012852778","388359.2029972752","285055655","26119671","[669, 732, 819, 895, 921, 972, 1936, 111124, 602499, 1185004, 2306094, 4130169, 5503568, 6157593]","0.004259444073322416","0.1362397820163488","26119671","3.9029317919716904E-4","26119671","3.9029317919716904E-4","application/zlib","{length=1}","[{.txt, 347}, {.HPF, 5}]","4","4","5","[{edu, 723}, {org, 9}]","{tu-braunschweig.de=1, iodp.org=9, mit.edu=723, enn.com=1}"
+"712","0.0014774905047889887","28689.023876404495","20426585","5693811","[891, 983, 1093, 1212, 1304, 1529, 2107, 3341, 5102, 7918, 20983, 68958, 114698, 2153401]","3.0522424267102004E-4","0.0","0","0.0","0","0.0","message/rfc822",,"[{.txt, 368}, {.seq, 33}, {.eml, 16}, {.pub, 12}, {.ten, 9}, {.asis, 5}]","18","65","72","[{org, 303}, {edu, 285}, {gov, 38}, {uk, 27}, {ch, 10}, {ca, 10}, {it, 8}, {de, 8}, {at, 5}]","{miketaylor.org.uk=27, stanford.edu=9, fu-berlin.de=7, berkeley.edu=5, talkorigins.org=4, nasa.gov=34, utexas.edu=47, python.org=10, woudc.org=29, wisc.edu=12, mit.edu=6, centos.org=4, uwaterloo.ca=8, columbia.edu=49, hawaii.edu=52, netlib.org=55, planet4589.org=174, harvard.edu=3, hixie.ch=4, lowell.edu=69, euroforth.org=11, unica.it=7, fau.edu=6, colorado.edu=9, cern.ch=5}"
+"699","0.001450513852313909","59739.83261802575","41758143","8546435","[1151, 1434, 1601, 3051, 4042, 6021, 14712, 33009, 61156, 90721, 161381, 397290, 960280, 1431340]","6.239710442309939E-4","0.1430615164520744","8546435","1.277051034429935E-4","8546435","1.277051034429935E-4","text/x-yaml","{length=1}","[{.yaml, 658}, {.yml, 40}]","10","21","22","[{press, 628}, {org, 38}, {gov, 9}, {io, 7}, {com, 7}, {edu, 5}]","{scm.com=4, earthbyte.org=2, inverseprobability.com=2, nasa.gov=8, noaa.gov=1, wisc.edu=2, salmobase.org=19, caltech.edu=3, molssi.org=1, mlr.press=628, malramsay.com=1, erikdemaine.org=1, obofoundry.org=10, nga.mil=1, bcgsc.ca=1, caida.org=1, cassavabase.org=2, riken.jp=1, github.io=7, kbroman.org=2, uva.nl=2}"
+"687","0.0014256123269522968","8068649.50363901","5543162209","22985648","[7066, 28589, 129562, 516694, 1695117, 3539284, 9375575, 11295563, 12722423, 13399754, 14582017, 15539141, 16698470, 17014023]","0.08282870030622562","22.12518195050946","1801099611","0.02691293061187388","1801099611","0.02691293061187388","application/vnd.ms-pki.stl","{length=152}","[{.stl, 617}, {.STL, 70}]","9","18","21","[{edu, 538}, {gov, 83}, {ca, 29}, {com, 14}, {org, 12}, {ch, 7}]","{swri.org=4, harvard.edu=346, si.edu=168, asteroidmission.org=4, asc-csa.gc.ca=24, uol.de=2, nasa.gov=77, ucsf.edu=14, ulalaunch.com=14, songho.ca=5, ucdavis.edu=5, knopper.net=1, swri.edu=4, usda.gov=6, km3net.org=4, spacerobotics.eu=1, hawaii.edu=1, uzh.ch=7}"
+"681","0.0014131615642714907","2268.3906020558","1544774","71108","[701, 757, 876, 1010, 1103, 1255, 1665, 2429, 2897, 3685, 5156, 7239, 10344, 16447]","2.3082785215829386E-5","0.0","0","0.0","0","0.0","text/x-common-lisp",,"[{.lisp, 462}, {.cl, 185}, {.jl, 20}, {.lsp, 14}]","7","20","22","[{edu, 563}, {com, 70}, {ca, 26}, {fr, 9}, {org, 7}]","{illinois.edu=1, berkeley.edu=261, stanford.edu=2, lowell.edu=92, ioccc.org=3, amstat.org=1, utexas.edu=64, wisc.edu=34, upenn.edu=2, utoronto.ca=26, ucdavis.edu=4, caltech.edu=1, uni-paderborn.de=1, dauphine.fr=9, jasss.org=3, uni-bayreuth.de=1, juliahub.com=6, northwestern.edu=102, nist.gov=4, franz.com=64}"
+"635","0.0013177057170519772","4294.20157480315","2726818","405962","[717, 733, 794, 805, 833, 945, 1323, 2421, 4233, 6955, 18085, 33066, 35842, 61982]","4.074547747221112E-5","0.0","0","0.0","0","0.0","text/x-objcsrc",,"[{.m, 628}, {.M, 7}]","19","58","68","[{edu, 307}, {uk, 77}, {de, 54}, {cz, 48}, {org, 37}, {il, 24}, {nl, 13}, {ir, 11}, {gov, 11}, {ca, 10}, {com, 10}, {fr, 8}, {au, 8}, {it, 7}]","{utwente.nl=10, illinois.edu=13, stanford.edu=14, berkeley.edu=11, wisc.edu=15, mit.edu=102, muni.cz=39, mathinsight.org=27, uni-bayreuth.de=9, uwaterloo.ca=8, cam.ac.uk=71, usyd.edu.au=7, cvut.cz=9, lmu.de=38, nist.gov=5, weizmann.ac.il=23, ucsd.edu=9, purdue.edu=14, cornell.edu=23, iasbs.ac.ir=11, rutgers.edu=7, qmul.ac.uk=6, loria.fr=7, nyu.edu=75, berkeleyscience.com=10}"
+"604","0.0012533767765344791","234768.940397351","141800440","22747350","[782, 783, 789, 816, 844, 871, 1047, 1677, 1864, 1981, 2493, 5298, 4879225, 22540743]","0.002118853049073911","0.9933774834437086","134029120","0.002002730101378339","134029120","0.002002730101378339","text/x-d","{length=6}","[{.d, 604}]","6","14","14","[{edu, 431}, {de, 126}, {org, 24}, {gov, 13}, {net, 8}]","{earthbyte.org=3, osgeo.org=2, llnl.gov=6, cmu.edu=4, ioccc.org=3, wisc.edu=294, woudc.org=1, knopper.net=8, mit.edu=1, cern.ch=2, mpg.de=126, rutgers.edu=132, nist.gov=7, netlib.org=15}"
+"551","0.001143395039520692","4868.791288566244","2682704","193388","[882, 1021, 1234, 1510, 1673, 2003, 2765, 4628, 5817, 7164, 11070, 23170, 35588, 77898]","4.008630403518337E-5","0.0","0","0.0","0","0.0","application/x-object",,"[{.o, 551}]","6","11","12","[{edu, 305}, {org, 122}, {cz, 94}, {ca, 26}]","{hitran.org=68, usp.br=2, lowell.edu=5, euroforth.org=54, bcgsc.ca=24, univ-grenoble-alpes.fr=2, wisc.edu=298, mit.edu=2, utoronto.ca=1, muni.cz=94, ubc.ca=1}"
+"520","0.001079066099003194","16334.465384615385","8493922","448316","[921, 969, 1208, 1329, 1424, 1639, 2324, 3511, 4508, 5183, 28379, 435217, 447737, 448314]","1.2692042794998362E-4","0.0","0","0.0","0","0.0","text/x-verilog",,"[{.v, 520}]","4","8","8","[{edu, 512}, {de, 6}]","{mpg.de=1, nii.ac.jp=1, ioc.ee=1, maine.edu=4, wisc.edu=498, upenn.edu=7, uni-saarland.de=5, brown.edu=3}"
+"510","0.0010583148278685173","5211.917647058824","2658078","71779","[1118, 1123, 1208, 1384, 1559, 1898, 2973, 5603, 7985, 10647, 13744, 25873, 40166, 71690]","3.971833003463377E-5","0.0","0","0.0","0","0.0","application/x-python-code",,"[{.pyc, 510}]","3","3","3","[{net, 508}]","{python.org=1, consc.net=508, muni.cz=1}"
+"481","9.981361415779544E-4","2070.866943866944","996087","3303","[1782, 1822, 1872, 1904, 1920, 1940, 1990, 2132, 2293, 2381, 2578, 2788, 2896, 2970]","1.4884029817487767E-5","0.0","0","0.0","0","0.0","text/json",,,"1","2","2","[{org, 481}]","{opentopography.org=477, bindingdb.org=4}"
+"387","8.030741929119925E-4","32304.48062015504","12501834","1901024","[6030, 6874, 8183, 8880, 9558, 15511, 22063, 22777, 25739, 33586, 53728, 89562, 473702, 796458]","1.8680865228567621E-4","0.0","0","0.0","0","0.0","application/vnd.ms-wordml",,"[{.xml, 378}]","2","3","3","[{hr, 377}, {org, 10}]","{produccioncientificaluz.org=9, woudc.org=1, istra-istria.hr=377}"
+"380","7.885483031177187E-4","12567.955263157895","4775823","983996","[807, 840, 905, 992, 1222, 1360, 2001, 7184, 8934, 12830, 64693, 131578, 134398, 137337]","7.136273431441618E-5","0.0","0","0.0","0","0.0","text/x-diff",,"[{.patch, 268}, {.diff, 63}, {.txt, 38}]","12","35","38","[{gov, 120}, {edu, 113}, {org, 101}, {net, 12}, {com, 8}, {de, 6}, {cern, 5}, {ch, 5}]","{berkeley.edu=7, stanford.edu=2, boost.org=2, nasa.gov=3, python.org=5, mercurylang.org=1, wisc.edu=67, knopper.net=11, mit.edu=9, centos.org=5, mirbase.org=5, nist.gov=117, netlib.org=5, toronto.edu=5, uni-freiburg.de=4, jirka.org=43, euroforth.org=7, maine.edu=21, root.cern=5, isthe.com=7, iresite.org=1, gnu.org=26, loria.fr=4, gsj.jp=4, patientensicht.ch=4}"
+"380","7.885483031177187E-4","908372.9631578948","345181726","25829178","[3076, 3536, 4829, 7998, 21844, 63600, 165901, 916960, 1351129, 1706865, 3224276, 8221660, 12131563, 21835179]","0.00515787787837397","1.5789473684210527","80410033","0.001201526903570848","80410033","0.001201526903570848","application/x-executable","{length=6}","[{.elf, 21}, {.x, 19}, {.out, 6}]","17","41","48","[{edu, 159}, {gov, 116}, {ca, 43}, {org, 28}, {uk, 9}, {com, 6}]","{stanford.edu=12, miketaylor.org.uk=9, nasa.gov=116, ucsf.edu=2, utexas.edu=2, wisc.edu=85, mit.edu=9, columbia.edu=4, mpg.de=3, astrouw.edu.pl=3, toronto.edu=4, ucsd.edu=3, zitogiuseppe.com=4, park.org=2, predictioncenter.org=2, lowell.edu=7, bellard.org=3, euroforth.org=18, utoronto.ca=42, washington.edu=4, caltech.edu=4, ntu.edu.tw=2, univ-grenoble-alpes.fr=2, ugr.es=2, nyu.edu=20}"
+"366","7.594965235291712E-4","301670.7076502732","110411479","9580873","[1422, 1494, 2327, 10551, 23919, 51030, 151364, 319340, 421772, 582897, 933163, 2142942, 3495910, 4551042]","0.001649823505004005","0.0","0","0.0","0","0.0","multipart/related",,"[{.mht, 274}]","13","21","22","[{org, 112}, {am, 100}, {ec, 81}, {edu, 34}, {cz, 10}, {in, 9}, {fr, 8}, {uk, 5}]","{chymist.com=1, cmu.edu=2, stanford.edu=26, aras.am=100, fau.edu=1, ucuenca.edu.ec=81, unl.edu=2, mit.edu=1, utoronto.ca=1, muni.cz=10, uni-tuebingen.de=2, cam.ac.uk=2, millenniumassessment.org=104, usda.gov=1, ernet.in=9, produccioncientificaluz.org=8, univ-grenoble-alpes.fr=8, lemoyne.edu=2, cwi.nl=1, ruhr-uni-bochum.de=1, bournemouth.ac.uk=3}"
+"347","7.200691083732852E-4","5288095.645533141","1834969189","26093992","[55160, 61928, 67795, 107143, 221436, 689007, 2323592, 7860953, 10672689, 16187780, 22447746, 25475817, 25961860, 26003968]","0.02741902677501799","4.322766570605188","336637982","0.00503021298519799","336557201","0.005029005915120089","application/vnd.openxmlformats-officedocument.presentationml.slideshow","{length=14, time=1}","[{.ppsx, 328}]","26","66","72","[{edu, 90}, {de, 56}, {org, 40}, {br, 37}, {uk, 29}, {hr, 22}, {ca, 12}, {gov, 9}, {ch, 9}, {es, 6}, {com, 6}, {int, 5}]","{ceos.org=4, usp.br=37, berkeley.edu=13, esf.edu=19, uwaterloo.ca=11, unizg.hr=18, unican.es=6, wsl.ch=6, umt.edu=38, istra-istria.hr=4, ti.com=6, hawaii.edu=4, imperial.ac.uk=15, tu-chemnitz.de=48, greenclimate.fund=4, purdue.edu=3, weather.gov=4, ncl.ac.uk=5, physicsfoundations.org=15, bipm.org=8, uni-tuebingen.de=4, stm-assoc.org=3, rutgers.edu=5, esa.int=5, reading.ac.uk=5}"
+"338","7.013929643520761E-4","3326401.2899408285","1124323636","26148352","[13678, 22371, 58436, 126857, 181679, 371915, 1588731, 4425534, 6873199, 9078385, 12164636, 21568496, 25923887, 25982737]","0.01680020572774292","2.366863905325444","154796055","0.0023130400238628515","125567373","0.0018762904483600412","application/vnd.ms-powerpoint","{length=5, disconnect=3}","[{.pot, 30}, {.php, 14}, {.do, 11}]","20","39","40","[{int, 98}, {org, 66}, {edu, 33}, {de, 33}, {gov, 32}, {nl, 23}, {ca, 11}, {no, 10}, {kr, 9}, {ru, 5}]","{ans.org=2, berkeley.edu=4, ntnu.edu=2, whoi.edu=2, tu-braunschweig.de=2, cas.cz=3, ucdavis.edu=8, ed.gov=29, uva.nl=22, diba.cat=2, ntnu.no=10, fz-juelich.de=31, unav.edu=3, ijs.si=4, osaka-u.ac.jp=2, unipd.it=1, irk.ru=4, bipm.org=47, salmobase.org=17, yonsei.ac.kr=9, esa.int=98, rutgers.edu=5, globe.gov=2, nrao.edu=4, scholaris.ca=11}"
+"338","7.013929643520761E-4","9678643.639053255","3271381550","26213932","[170369, 185832, 678746, 1503652, 1910795, 2701417, 5993327, 16256332, 23048129, 25590878, 26054048, 26178919, 26194643, 26213931]","0.04888261821967293","27.514792899408285","1816407224","0.027141664616359997","1816407224","0.027141664616359997","application/vnd.apple.keynote","{length=93}","[{.key, 320}]","18","45","50","[{edu, 167}, {org, 56}, {int, 41}, {gov, 19}, {ca, 15}, {ch, 7}, {hk, 6}, {au, 5}]","{unam.mx=2, berkeley.edu=8, si.edu=4, nasa.gov=18, bioconductor.org=4, csiro.au=5, ox.ac.uk=4, mit.edu=24, unipi.it=3, hkust.edu.hk=6, uahirise.org=50, harvard.edu=29, nagoya-u.ac.jp=2, lowell.edu=18, triumf.ca=5, cornell.edu=11, uam.es=2, caltech.edu=7, utoronto.ca=6, cern.ch=6, ubc.ca=3, esa.int=41, nrao.edu=23, loria.fr=2, brown.edu=30}"
+"335","6.95167583011673E-4","2429.6029850746268","813917","5326","[1340, 1400, 1553, 1602, 1792, 2045, 2256, 2478, 3182, 3647, 3964, 5004, 5256, 5294]","1.2161954625409417E-5","0.0","0","0.0","0","0.0","application/x-x509-cert; format=pem",,"[{.pem, 247}, {.crt, 84}]","14","29","29","[{de, 203}, {ch, 101}, {edu, 12}]","{umaryland.edu=1, unifr.ch=1, cmu.edu=8, fu-berlin.de=4, iitbhu.ac.in=1, knopper.net=1, uni-paderborn.de=1, sinica.edu.tw=1, cvut.cz=3, uni-bielefeld.de=4, uni-saarland.de=2, uzh.ch=100, tu-chemnitz.de=7, purdue.edu=2, exeter.ac.uk=1, univie.ac.at=1, seattlechildrens.org=2, dkfz.de=1, uni-muenster.de=179, kwarc.info=3, rub.de=3, cuni.cz=1, ias.edu=1, uibk.ac.at=1, uchile.cl=2}"
+"332","6.889422016712701E-4","4050426.611445783","1344741635","6145282","[20027, 20775, 89814, 309639, 493478, 4042319, 4753110, 5718439, 5878418, 5934976, 6007468, 6060036, 6145197, 6145248]","0.020093801637966615","0.0","0","0.0","0","0.0","application/x-mach-o",,"[{.0-osx, 30}, {.1-osx, 26}, {.2-osx, 14}, {.3-osx, 14}, {.dylib, 10}, {.4-osx, 10}, {.5-osx, 6}]","5","9","10","[{fr, 254}, {gov, 41}, {edu, 29}]","{uni-freiburg.de=4, ucsd.edu=1, u-bordeaux.fr=254, nasa.gov=41, wisc.edu=16, personality-project.org=4, bcm.edu=1, mit.edu=8, caltech.edu=3}"
+"327","6.785665661039317E-4","3958225.721712538","1294339811","26212996","[36425, 47790, 62947, 69104, 104393, 124119, 255055, 1198466, 8644717, 26067962, 26177734, 26211228, 26212651, 26212928]","0.0193406723919552","11.926605504587156","982414226","0.014679724394463701","968650356","0.0144740577694761","application/illustrator","{length=38, disconnect=1}","[{.ai, 180}, {.AI, 147}]","18","33","35","[{org, 212}, {cz, 28}, {edu, 15}, {fr, 14}, {eu, 9}, {de, 9}, {es, 9}, {com, 8}, {ch, 5}]","{utwente.nl=3, usp.br=2, snazzymaps.com=6, jst.go.jp=2, brandeis.edu=2, nasa.gov=3, mit.edu=6, biovis.net=2, uni-bayreuth.de=2, psl.eu=1, egu.eu=8, unican.es=9, u-bordeaux.fr=14, unawe.org=10, uzh.ch=5, esahubble.org=26, uskudar.edu.tr=3, ksu.edu=2, uni-bremen.de=1, caltech.edu=2, uni-potsdam.de=6, cuni.cz=4, nmt.edu=2, iodp.org=174, avcr.cz=24}"
+"327","6.785665661039317E-4","1207169.749235474","394744508","25833596","[73922, 76974, 89756, 112635, 126312, 160428, 273607, 597394, 1362598, 2533309, 6708347, 10444800, 13514877, 25642155]","0.0058984697394520134","0.6116207951070336","51475751","7.691764000148338E-4","51475751","7.691764000148338E-4","application/x-mobipocket-ebook","{length=2}","[{.mobi, 45}]","8","13","13","[{org, 277}, {gov, 41}]","{eloquentjavascript.net=3, miketaylor.org.uk=1, um.es=2, nih.gov=1, nasa.gov=7, ed.gov=2, intelligence.org=1, page-meeting.org=1, scienzainrete.it=1, cancer.gov=31, produccioncientificaluz.org=275, ruthenia.ru=1, informationphilosopher.com=1}"
+"318","6.598904220827225E-4","141933.18867924527","45134754","3959535","[2810, 3462, 4455, 5397, 6623, 9946, 22322, 119960, 189252, 307209, 649266, 1205919, 1492213, 3604400]","6.744260534882749E-4","0.0","0","0.0","0","0.0","application/vnd.ms-excel",,"[{.shtml, 40}, {.php, 29}, {.jsp, 17}, {.do, 12}, {.xlt, 5}]","24","50","51","[{org, 71}, {es, 42}, {nl, 40}, {kr, 34}, {cn, 17}, {gov, 16}, {com, 14}, {edu, 12}, {jp, 10}, {hr, 10}, {br, 9}, {se, 9}, {int, 8}, {fi, 7}, {ch, 5}]","{utwente.nl=40, jvolcanica.org=9, ufpel.edu.br=5, postech.ac.kr=5, qiagen.com=3, stuk.fi=7, ufla.br=4, irb.hr=10, admin.ch=4, gva.es=40, fzu.edu.cn=17, osaka-u.ac.jp=10, mapress.com=5, keene.edu=5, mccme.ru=2, bipm.org=23, yonsei.ac.kr=5, esa.int=8, unist.ac.kr=24, haematologica.org=10, globe.gov=14, ki.se=9, phyphox.org=5, iodp.org=15, personality-project.org=8}"
+"309","6.412142780615134E-4","700.3074433656958","216395","1423","[683, 685, 687, 689, 690, 692, 696, 700, 702, 703, 706, 709, 712, 1180]","3.233482248393228E-6","0.0","0","0.0","0","0.0","application/zip",,"[]","3","4","4","[{org, 306}]","{ti.com=1, earthbyte.org=306, lbl.gov=1, codehs.com=1}"
+"307","6.37064023834578E-4","336161.28013029316","103201513","13793051","[797, 826, 897, 1137, 1417, 1674, 10196, 109639, 797957, 833014, 963491, 1139305, 5556653, 13793050]","0.0015420885893519857","0.3257328990228013","6862112","1.0253710731988331E-4","6862112","1.0253710731988331E-4","application/x-shapefile","{length=1}","[{.shp, 99}, {.shx, 96}, {.sbx, 56}, {.sbn, 56}]","2","4","4","[{org, 266}, {gov, 41}]","{noaa.gov=23, earthbyte.org=264, ceos.org=2, utah.gov=18}"
+"299","6.204630069268365E-4","140852.97658862875","42115040","3069502","[1745, 1852, 3630, 5093, 6141, 10620, 34400, 205758, 237490, 292751, 678400, 997266, 1420690, 1433353]","6.293039775889958E-4","0.33444816053511706","178130","2.661707492808455E-6","0","0.0","application/postscript","{disconnect=1}","[{.ai, 73}, {.tex, 22}, {.txt, 11}, {.pdf, 9}, {.dvi-alw, 8}, {.print1, 8}]","17","39","44","[{edu, 86}, {org, 79}, {eu, 50}, {fi, 18}, {gov, 16}, {de, 13}, {int, 13}, {uk, 9}, {ru, 6}]","{illinois.edu=20, berkeley.edu=8, stanford.edu=2, uni-jena.de=12, msu.ru=6, nasa.gov=4, wisc.edu=17, au.dk=1, tamu.edu=2, imgt.org=2, stuk.fi=18, utah.edu=8, netlib.org=15, ucsc.edu=22, esahubble.org=46, euroforth.org=12, colorado.edu=2, bipm.org=2, utoronto.ca=2, lbl.gov=4, ijpam.eu=50, esa.int=13, globe.gov=8, qmul.ac.uk=8, loria.fr=1}"
+"296","6.142376255864335E-4","1667.7635135135135","493658","9889","[673, 719, 737, 760, 788, 880, 1020, 1438, 1857, 2146, 7379, 9844, 9886, 9889]","7.376484575786428E-6","0.0","0","0.0","0","0.0","text/x-makefile",,"[]","13","42","48","[{edu, 201}, {org, 37}, {net, 20}, {br, 12}, {de, 7}, {gov, 6}]","{usp.br=12, stanford.edu=11, berkeley.edu=9, nasa.gov=4, python.org=2, utexas.edu=2, wisc.edu=142, princeton.edu=4, knopper.net=20, dbaron.org=2, mit.edu=2, columbia.edu=13, toronto.edu=12, netlib.org=6, uva.nl=1, uni-freiburg.de=7, zitogiuseppe.com=2, hixie.ch=1, lowell.edu=2, euroforth.org=15, bcgsc.ca=2, caida.org=2, univ-grenoble-alpes.fr=2, qmul.ac.uk=1, kbroman.org=3}"
+"293","6.080122442460305E-4","3776.286689419795","1106452","168424","[1147, 1151, 1157, 1190, 1378, 1710, 2250, 3080, 3710, 4250, 6176, 27159, 38024, 61599]","1.653315881004267E-5","0.0","0","0.0","0","0.0","text/vtt",,"[{.vtt, 291}]","8","9","9","[{gov, 277}, {ch, 6}]","{tu-chemnitz.de=2, harvard.edu=2, carleton.ca=1, uu.se=3, gla.ac.uk=1, nasa.gov=262, noaa.gov=15, cern.ch=6, swinburne.edu.au=1}"
+"277","5.748102104305476E-4","412526.2346570397","114269767","5726502","[2211, 2385, 2476, 3010, 3289, 3812, 8029, 137545, 506302, 1534713, 3069762, 5057688, 5538272, 5674243]","0.0017074759727467375","15.16245487364621","82280026","0.001229469273635404","55907931","8.354042488638529E-4","text/event-stream","{length=14, disconnect=28}",,"1","1","1","[{org, 277}]","{lmfdb.org=277}"
+"257","5.333076681611939E-4","1882.0972762645915","483699","9254","[652, 732, 807, 940, 983, 1138, 1544, 2142, 2727, 3133, 3946, 6416, 7076, 8126]","7.227672220086213E-6","0.0","0","0.0","0","0.0","text/x-haskell",,"[{.hs, 236}, {.lhs, 21}]","9","13","13","[{ee, 74}, {uk, 72}, {nl, 65}, {edu, 20}, {org, 8}, {it, 7}, {fr, 5}]","{umd.edu=2, ioccc.org=7, uci.edu=5, lowell.edu=1, ioc.ee=74, ox.ac.uk=72, au.dk=3, dbaron.org=1, unipi.it=7, andrej.com=3, dauphine.fr=5, uva.nl=65, brown.edu=12}"
+"256","5.312325410477262E-4","1.620906053515625E7","4149519497","21934678","[292334, 516885, 2064506, 8837918, 13903000, 16529068, 17973511, 18177093, 18424980, 19412730, 20587121, 21801755, 21910225, 21917183]","0.06200419433402388","87.890625","4019985244","0.06006862878198072","4019985244","0.06006862878198072","chemical/x-cif","{length=225}","[{.cif, 256}]","1","1","1","[{uk, 256}]","{ebi.ac.uk=256}"
+"250","5.187817783669202E-4","5284550.064","1321137516","22924778","[13254, 21283, 28438, 31438, 47550, 81950, 222805, 4201726, 21552555, 22144376, 22503764, 22862410, 22922379, 22922780]","0.019741097094074837","21.2","1155561825","0.017266982361230074","1155561825","0.017266982361230074","application/vnd.ms-excel.sheet.macroenabled.12","{length=53}","[{.xlsm, 221}]","26","76","78","[{ca, 63}, {gov, 34}, {edu, 29}, {se, 19}, {uk, 16}, {org, 14}, {fr, 14}, {ch, 11}, {dk, 8}, {ru, 7}, {de, 6}]","{ceos.org=2, stanford.edu=8, royalholloway.ac.uk=6, epa.ie=4, ntnu.edu=3, noaa.gov=4, sandia.gov=7, ed.gov=14, dauphine.fr=2, uvm.dk=5, usda.gov=6, canada.ca=57, stuk.fi=2, service-public.pf=3, admin.ch=7, ntnu.no=2, yorku.ca=5, gva.es=2, keldysh.ru=6, oeko.de=4, forestcarbonpartnership.org=2, openmopac.net=2, ki.se=19, iodp.org=3, anr.fr=12}"
+"246","5.104812699130495E-4","2947125.1097560977","724992777","26223325","[796, 936, 1074, 1345, 1784, 2446, 11466, 311375, 2197914, 13797082, 25929133, 26223254, 26223312, 26223322]","0.010833204439302248","9.34959349593496","571311809","0.00853682660273538","571311809","0.00853682660273538","application/x-matlab-data","{length=23}","[{.mat, 222}, {.fig, 21}]","16","40","47","[{edu, 70}, {uk, 42}, {de, 36}, {gov, 26}, {cz, 20}, {ir, 13}, {il, 7}, {se, 6}, {pl, 6}, {fr, 6}]","{utwente.nl=3, illinois.edu=5, lirmm.fr=3, uu.se=6, bbci.de=8, whoi.edu=5, noaa.gov=21, nasa.gov=5, wisc.edu=3, mit.edu=5, ucdavis.edu=4, muni.cz=4, cam.ac.uk=42, cvut.cz=14, bgu.ac.il=4, weizmann.ac.il=3, uj.edu.pl=6, ucsd.edu=23, uni-freiburg.de=2, uol.de=26, unica.it=4, iasbs.ac.ir=13, tugraz.at=3, loria.fr=3, nyu.edu=11}"
+"232","4.8142949032450194E-4","5230804.797413793","1213546713","13891616","[905900, 1503864, 1750683, 1946493, 2181215, 3056571, 4741121, 6910022, 8280491, 9150367, 10673572, 12662963, 13140063, 13573706]","0.01813342153969108","2.586206896551724","72842958","0.0010884558867508651","72842958","0.0010884558867508651","application/onenote; format=one","{length=6}","[{.one, 232}]","2","2","2","[{ca, 230}]","{ucsd.edu=2, uwaterloo.ca=230}"
+"214","4.440772022820837E-4","6009826.73364486","1286102921","17099397","[1400, 8467, 68430, 871955, 1681050, 4884539, 6575676, 7683752, 8807915, 9277504, 9474948, 9727597, 12959883, 16694071]","0.019217592664618765","76.16822429906541","1195030448","0.017856742253274827","1180626577","0.017641512413460386","application/x-sqlite3","{length=161, time=2}","[{.php, 122}, {.gpkg, 72}, {.sqlite, 7}, {.db, 5}]","11","15","15","[{mil, 122}, {gov, 62}, {org, 9}, {net, 7}, {edu, 5}]","{gva.es=1, uci.edu=2, noaa.gov=61, arcgis.com=2, wisc.edu=3, au.dk=2, martinfleischmann.net=7, cern.ch=2, nga.mil=122, bcgsc.ca=1, uni-bielefeld.de=1, nist.gov=1, geopackage.org=4, personality-project.org=2, netlib.org=3}"
+"207","4.295513124878099E-4","3772.6280193236717","780934","44988","[656, 674, 803, 809, 822, 953, 1342, 1869, 2889, 5031, 30848, 36889, 40835, 44630]","1.1669106153870085E-5","0.0","0","0.0","0","0.0","application/xml-dtd",,"[{.mod, 167}, {.dtd, 40}]","9","17","17","[{cz, 144}, {org, 25}, {edu, 23}, {net, 5}]","{zitogiuseppe.com=2, hixie.ch=4, berkeley.edu=2, stanford.edu=1, nemates.org=3, umb.edu=2, wisc.edu=18, inductive.com=1, rssboard.org=4, muni.cz=144, dauphine.fr=1, cidoc-crm.org=1, aerith.net=5, nist.gov=1, matroska.org=1, openmath.org=16, dagstuhl.de=1}"
+"202","4.1917567692047154E-4","25800.20297029703","5211641","1060492","[1242, 1394, 1440, 1718, 1949, 2445, 3083, 3133, 3974, 6093, 198142, 299651, 689729, 769131]","7.787494470065542E-5","0.0","0","0.0","0","0.0","application/pkcs7-signature",,"[{.p7s, 126}, {.p7b, 51}, {.pdf, 11}, {.sig, 10}]","8","10","10","[{ru, 83}, {de, 47}, {org, 46}, {it, 11}, {edu, 6}]","{zitogiuseppe.com=2, uni-muenster.de=47, unige.it=11, u-bordeaux.fr=3, w3.org=2, lemoyne.edu=2, centos.org=44, mit.edu=4, sgu.ru=83, admin.ch=4}"
+"200","4.1502542269353617E-4","5440.56","1088112","74991","[769, 782, 964, 1054, 1168, 1496, 2378, 5124, 6437, 7464, 20421, 43653, 61633, 74991]","1.6259113363357064E-5","0.0","0","0.0","0","0.0","text/x-assembly",,"[{.S, 86}, {.s, 70}, {.asm, 31}, {.ASM, 13}]","10","23","24","[{edu, 135}, {com, 23}, {org, 22}, {ca, 7}, {fr, 5}]","{usp.br=1, ioccc.org=3, amstat.org=1, euroforth.org=12, vividmachines.com=6, maine.edu=26, wisc.edu=88, princeton.edu=1, utoronto.ca=7, au.dk=3, unipi.it=2, worrydream.com=15, columbia.edu=6, swtch.com=1, u-bordeaux.fr=2, sao.ru=1, univ-grenoble-alpes.fr=3, nist.gov=1, uml.edu=13, nyu.edu=1, netlib.org=5, meatfighter.com=1, planet4589.org=1}"
+"199","4.129502955800685E-4","991.4522613065327","197299","2728","[631, 653, 663, 705, 715, 775, 982, 1071, 1137, 1238, 1728, 2023, 2507, 2728]","2.9481402718442452E-6","0.0","0","0.0","0","0.0","application/x-bat",,"[{.cmd, 131}, {.bat, 67}]","14","27","28","[{edu, 101}, {org, 66}, {cz, 10}]","{earthbyte.org=1, stanford.edu=2, gobiernodecanarias.org=1, ucsf.edu=71, wisc.edu=24, mit.edu=2, muni.cz=9, cam.ac.uk=2, usyd.edu.au=1, nist.gov=4, cassavabase.org=3, netlib.org=59, uva.nl=1, planet4589.org=1, zitogiuseppe.com=2, hixie.ch=1, univie.ac.at=1, cern.ch=2, abstractmath.org=1, root.cern=1, jku.at=1, cuni.cz=1, bcgsc.ca=1, ufrgs.br=4, uchile.cl=1}"
+"195","4.0464978712619775E-4","376791.1743589744","73474279","18381863","[811, 812, 813, 814, 814, 816, 821, 13285, 303182, 1130444, 1818213, 1962420, 17941981, 18381863]","0.0010978894006792733","1.0256410256410255","36323844","5.427690324055772E-4","36323844","5.427690324055772E-4","binary/octet-stream","{length=2}","[{.mask, 107}, {.shk, 47}, {.gsm, 18}, {.RVC, 6}, {.rvc, 5}, {.euc, 5}]","4","5","5","[{edu, 111}, {com, 50}, {org, 18}, {jp, 16}]","{worrydream.com=50, park.org=18, colorado.edu=4, caltech.edu=107, gsj.jp=16}"
+"190","3.9427415155885933E-4","4645818.036842105","882705427","26222836","[18088, 24447, 39570, 68647, 87596, 302207, 2200188, 5157275, 10541139, 13509855, 21499512, 26190575, 26222784, 26222836]","0.013189825683425515","3.6842105263157894","181359517","0.002709964549997319","181359517","0.002709964549997319","application/vnd.oasis.opendocument.presentation","{length=7}","[{.odp, 187}]","22","49","52","[{org, 56}, {edu, 50}, {ca, 24}, {at, 17}, {pl, 6}, {de, 6}, {ch, 5}]","{usp.br=3, si.edu=3, gobiernodecanarias.org=2, freedesktop.org=2, ioc.ee=3, ukri.org=35, tuwien.ac.at=16, mit.edu=19, uwaterloo.ca=24, harvard.edu=9, ncbj.gov.pl=5, vu.lt=2, fz-juelich.de=2, rd-alliance.org=3, cern.ch=5, umk.pl=1, sigops.org=3, cidoc-crm.org=3, esa.int=4, ubbcluj.ro=2, caida.org=2, ucf.edu=7, ias.edu=5, loria.fr=3, brown.edu=3}"
+"188","3.9012389733192397E-4","853.5265957446809","160463","1409","[721, 737, 743, 755, 771, 786, 835, 884, 918, 971, 1026, 1289, 1370, 1409]","2.3977183485012245E-6","0.0","0","0.0","0","0.0","application/x-adobe-indesign-interchange",,"[{.inx, 188}]","1","1","1","[{edu, 188}]","{berkeley.edu=188}"
+"182","3.776731346511179E-4","1569.8021978021977","285704","2395","[1239, 1256, 1287, 1346, 1409, 1459, 1555, 1673, 1728, 1768, 1875, 2030, 2125, 2395]","4.269131968367748E-6","0.0","0","0.0","0","0.0","text/x-bibtex",,,"1","1","1","[{de, 182}]","{oeko.de=182}"
+"176","3.652223719703118E-4","224132.23863636365","39447274","6865731","[12173, 13627, 16586, 26454, 32402, 42207, 61484, 153872, 278073, 526172, 918204, 1914620, 2884780, 6865731]","5.894408846161128E-4","0.0","0","0.0","0","0.0","application/vnd.openxmlformats-officedocument.wordprocessingml.template",,"[{.dotx, 162}]","17","55","57","[{de, 49}, {se, 32}, {ch, 26}, {fi, 15}, {org, 10}, {edu, 9}, {gov, 8}, {at, 7}, {uk, 6}, {sk, 6}]","{upjs.sk=6, ans.org=3, uu.se=7, fernuni-hagen.de=5, aka.fi=15, unibas.ch=12, su.se=8, uni-bielefeld.de=4, uni-saarland.de=5, admin.ch=9, imperial.ac.uk=2, bnl.gov=4, kth.se=6, uni-due.de=4, uol.de=2, jacsm.ro=2, cern.ch=3, dg-pflegewissenschaft.de=2, uni-heidelberg.de=18, wcrp-climate.org=4, ki.se=9, tugraz.at=4, uibk.ac.at=3, reading.ac.uk=2, uni-goettingen.de=2}"
+"174","3.6107211774337645E-4","2065.155172413793","359337","65579","[994, 1001, 1102, 1131, 1172, 1318, 1568, 1971, 2173, 2292, 2740, 4060, 4994, 65579]","5.369393057560837E-6","0.0","0","0.0","0","0.0","text/x-vcalendar",,,"3","3","3","[{com, 142}, {br, 30}]","{stoa.org=2, spacefoxies.com=142, www.gov.br=30}"
+"169","3.5069648217603803E-4","1775097.4911242602","299991476","24032942","[1233, 1659, 1733, 8702, 29222, 92551, 406171, 940134, 2726649, 4717389, 10162936, 18839019, 21363455, 24032942]","0.0044826225759157234","34.9112426035503","204250102","0.0030520071122230466","204250102","0.0030520071122230466","application/x-idl-save-file","{length=59}","[{.sav, 129}, {.dat, 17}, {.xdr_1, 7}, {.idl, 6}]","3","8","11","[{gov, 112}, {edu, 52}, {org, 5}]","{ucla.edu=1, harvard.edu=1, nmsu.edu=12, noaa.gov=104, nasa.gov=8, sdss.org=5, colorado.edu=12, caltech.edu=26}"
+"165","3.4239597372216735E-4","1199576.6","197930139","25598901","[1430, 1436, 1438, 3167, 4316, 4967, 38401, 403635, 563096, 1228615, 5689950, 25598595, 25598900, 25598901]","0.0029575710662376863","2.4242424242424243","102394994","0.0015300371793402392","102394994","0.0015300371793402392","application/x-sharedlib","{length=4}","[{.1, 46}, {.so, 33}, {.mexa64, 7}, {.mexglx, 7}, {.mexs64, 7}, {.mexsol, 5}]","10","12","12","[{edu, 100}, {org, 38}, {de, 8}, {ru, 7}]","{uni-freiburg.de=8, predictioncenter.org=3, berkeley.edu=2, exeter.ac.uk=1, euroforth.org=35, wisc.edu=98, cern.ch=4, root.cern=1, sao.ru=7, univ-grenoble-alpes.fr=3, cncb.ac.cn=1, franz.com=2}"
+"159","3.2994521104136125E-4","1802.9622641509434","286671","8704","[791, 793, 803, 827, 840, 866, 931, 1833, 3607, 4557, 6328, 7412, 8158, 8704]","4.283581365692992E-6","0.0","0","0.0","0","0.0","text/vnd.sosi",,"[{.sos, 156}]","1","1","1","[{gov, 159}]","{noaa.gov=159}"
+"151","3.133441941336198E-4","153173.42384105962","23129187","1863468","[1214, 1243, 1331, 1679, 2694, 4486, 22706, 231952, 306004, 466031, 590428, 857708, 1032280, 1863468]","3.4560787256760747E-4","0.0","0","0.0","0","0.0","application/rtf",,"[{.php, 80}, {.txt, 5}]","15","22","23","[{sk, 78}, {cz, 18}, {br, 11}, {edu, 11}, {org, 10}, {com, 7}]","{nmsu.edu=4, purdue.edu=1, jvolcanica.org=7, mapress.com=6, ufrj.br=9, colorado.edu=2, unipi.it=3, spasb.ro=1, columbia.edu=3, ku.dk=1, sav.sk=78, esa.int=3, unist.ac.kr=2, cuni.cz=18, produccioncientificaluz.org=3, ti.com=1, tugraz.at=1, duke.edu=1, ruhr-uni-bochum.de=1, scholaris.ca=1, uva.nl=3, ufla.br=2}"
+"146","3.029685585662814E-4","9179.22602739726","1340167","373571","[727, 728, 761, 801, 808, 866, 1111, 2258, 4205, 5971, 16980, 179026, 258737, 373571]","2.002544515530584E-5","0.0","0","0.0","0","0.0","text/x-pascal",,"[{.P, 70}, {.pas, 53}, {.p, 10}, {.pp, 8}]","9","16","16","[{edu, 66}, {cz, 38}, {de, 9}, {org, 8}, {fr, 7}, {br, 6}, {space, 6}]","{schemers.org=1, usp.br=6, umd.edu=2, euroforth.org=4, helioforecast.space=6, maths.org=1, wisc.edu=62, muni.cz=37, kek.jp=2, cuni.cz=1, cwi.nl=1, uni-saarland.de=9, uva.nl=3, toronto.edu=2, netlib.org=2, univ-evry.fr=7}"
+"141","2.92592922998943E-4","31628.90780141844","4459676","4249204","[669, 675, 756, 835, 886, 944, 1156, 1691, 1804, 1836, 2015, 15980, 15981, 4249204]","6.663870782405009E-5","0.0","0","0.0","0","0.0","application/smil+xml",,"[{.smi, 61}, {.smil, 41}, {.sml, 38}]","9","11","11","[{com, 56}, {net, 36}, {se, 31}, {edu, 11}]","{daylight.com=56, magnenat.net=36, ebi.ac.uk=1, fz-juelich.de=1, kwarc.info=2, bindingdb.org=1, nrm.se=31, lemoyne.edu=1, upenn.edu=10, ufrgs.br=1, sparks-lab.org=1}"
+"134","2.780670332046692E-4","464049.447761194","62182626","4935773","[881, 959, 5183, 12505, 23405, 40190, 171475, 359036, 527320, 1027439, 2741281, 4593916, 4928527, 4935773]","9.291638777673938E-4","0.0","0","0.0","0","0.0","application/x-archive",,"[{.a, 67}, {.lib, 47}, {.old, 8}, {.LIB, 5}]","7","9","10","[{edu, 54}, {org, 47}, {cz, 27}]","{root.cern=3, lowell.edu=1, bcgsc.ca=1, nist.gov=1, wisc.edu=43, loria.fr=1, mit.edu=10, netlib.org=47, muni.cz=27}"
+"125","2.593908891834601E-4","1072.512","134064","16884","[695, 696, 753, 783, 786, 789, 794, 802, 809, 1071, 1077, 1304, 16013, 16884]","2.0032512957720358E-6","0.0","0","0.0","0","0.0","multipart/appledouble",,"[]","3","3","3","[{edu, 122}]","{lbl.gov=2, iodp.org=1, lowell.edu=122}"
+"118","2.4486499938918633E-4","5774.483050847458","681389","30331","[676, 677, 683, 790, 848, 902, 1734, 7721, 16979, 17772, 20269, 29234, 29240, 30331]","1.0181655009359797E-5","0.0","0","0.0","0","0.0","text/x-php",,"[{.html, 46}, {.txt, 22}, {.php, 19}]","19","35","35","[{net, 30}, {edu, 17}, {com, 16}, {ch, 15}, {fr, 8}, {org, 6}]","{einsteintoolkit.org=2, ioc.ee=3, cp2k.org=2, whoi.edu=3, noaa.gov=1, utexas.edu=1, wisc.edu=1, carleton.edu=1, ox.ac.uk=3, mit.edu=8, caha.es=2, unizg.hr=4, uzh.ch=3, planetwaves.net=30, zitogiuseppe.com=14, gavinr.com=1, tu-chemnitz.de=2, ihmc.us=2, hixie.ch=12, umb.edu=1, nikhef.nl=2, univ-grenoble-alpes.fr=4, rajana.in=1, loria.fr=3, bcm.edu=2}"
+"114","2.365644909353156E-4","2648.7280701754385","301955","25994","[797, 809, 830, 845, 1042, 1163, 1601, 2192, 3125, 6649, 8665, 14563, 19162, 25994]","4.511962532930877E-6","0.0","0","0.0","0","0.0","text/x-tcl",,"[{.tcl, 108}, {.tk, 6}]","3","7","9","[{edu, 92}, {gov, 11}, {org, 11}]","{espressomd.org=11, berkeley.edu=23, nasa.gov=5, nist.gov=6, wisc.edu=22, isi.edu=46, toronto.edu=1}"
+"114","2.365644909353156E-4","1975.9561403508771","225259","4484","[1201, 1214, 1220, 1404, 1413, 1557, 1873, 2243, 2352, 2363, 3210, 4479, 4482, 4484]","3.3659325667913312E-6","0.0","0","0.0","0","0.0","application/x-x509-cert; format=der",,"[{.cer, 57}, {.der, 54}]","6","10","10","[{de, 104}]","{cam.ac.uk=2, bayern.de=1, tu-chemnitz.de=1, uni-muenster.de=95, uni-bielefeld.de=4, uam.es=1, uni-hamburg.de=3, admin.ch=4, uchile.cl=2, sinica.edu.tw=1}"
+"109","2.261888553679772E-4","887.2293577981651","96708","1290","[765, 791, 829, 835, 839, 847, 855, 940, 945, 958, 987, 1078, 1080, 1290]","1.4450592725229893E-6","0.0","0","0.0","0","0.0","application/x-ms-asx",,"[{.asx, 109}]","6","8","8","[{gov, 63}, {edu, 37}, {jp, 5}]","{usda.gov=63, cmu.edu=2, nii.ac.jp=5, erwinschwab.de=2, msu.ru=1, fau.edu=1, msu.su=1, mit.edu=34}"
+"106","2.1996347402757416E-4","8236.566037735849","873076","44962","[799, 819, 825, 912, 994, 1135, 1829, 2996, 28174, 39422, 43313, 44958, 44959, 44962]","1.304593797221824E-5","0.0","0","0.0","0","0.0","application/xslt+xml",,"[{.xsl, 91}, {.xml, 12}]","5","9","9","[{edu, 57}, {com, 20}, {pl, 14}, {org, 12}]","{uwb.edu.pl=14, ucla.edu=3, zitogiuseppe.com=20, stanford.edu=1, vu.nl=3, dlib.org=1, ksu.edu=53, openmath.org=7, dbaron.org=4}"
+"105","2.1788834691410648E-4","4655.419047619048","488819","33722","[757, 791, 933, 1105, 1193, 1637, 2938, 5052, 7880, 11408, 14932, 21203, 25882, 33722]","7.304177819160929E-6","0.0","0","0.0","0","0.0","text/x-scheme",,"[{.scm, 97}, {.ss, 8}]","3","4","4","[{org, 88}, {edu, 14}]","{schemers.org=88, uni-saarland.de=3, umb.edu=12, brown.edu=2}"
+"105","2.1788834691410648E-4","1819.2380952380952","191020","6094","[775, 838, 880, 931, 964, 1150, 1516, 2190, 2519, 3203, 3754, 5332, 5390, 6094]","2.8543163154789822E-6","0.0","0","0.0","0","0.0","text/x-csharp",,"[{.cs, 105}]","6","6","6","[{cz, 49}, {at, 32}, {edu, 19}]","{metamath.org=1, johndcook.com=1, cuni.cz=49, jku.at=32, umb.edu=19, uva.nl=3}"
+"104","2.158132198006388E-4","4509.625","469001","122341","[893, 894, 895, 897, 916, 922, 1244, 1607, 2943, 2960, 18502, 62917, 62927, 122341]","7.008047357742426E-6","0.0","0","0.0","0","0.0","application/vnd.mif",,"[{.mif, 98}]","3","5","6","[{edu, 95}, {gov, 8}]","{nasa.gov=2, nist.gov=6, python.org=1, wisc.edu=94, stanford.edu=1}"
+"100","2.0751271134676809E-4","73569.7","7356970","916799","[2401, 2402, 2406, 2408, 2409, 2414, 2428, 2476, 144670, 235978, 686723, 916754, 916799, 916799]","1.0993152289545288E-4","0.0","0","0.0","0","0.0","application/x-httpresponse",,"[{.pdf, 17}]","2","3","3","[{de, 98}]","{uni-bielefeld.de=17, ki.se=2, podcast.de=81}"
+"90","1.8676144021209128E-4","834796.4888888889","75131684","26101281","[4613, 6733, 13891, 17526, 17529, 17533, 20382, 20389, 26787, 33876, 86277, 26044483, 26101281, 26101281]","0.001122655174592248","2.2222222222222223","52145764","7.791880691462494E-4","52145764","7.791880691462494E-4","application/coreldraw","{length=2}","[{.cdr, 89}]","5","8","9","[{edu, 64}, {ca, 14}, {gov, 5}]","{ucla.edu=62, bnl.gov=5, usp.br=3, stanford.edu=2, triumf.ca=14, keldysh.ru=1, ibch.ru=2, ufpel.edu.br=1}"
+"89","1.846863130986236E-4","222877.75280898876","19836120","2995187","[11611, 13041, 13551, 15231, 17439, 23896, 43075, 90817, 180550, 498333, 1238012, 2777531, 2995187, 2995187]","2.9640121951522854E-4","0.0","0","0.0","0","0.0","application/x-msaccess",,"[{.mdb, 56}, {.MDB, 17}, {.accdb, 11}]","8","13","13","[{edu, 68}, {gov, 8}]","{bayern.de=1, uevora.pt=4, exeter.ac.uk=3, esf.edu=1, wisc.edu=1, dauphine.fr=1, bo.berlin=1, defra.gov.uk=1, usda.gov=6, utah.gov=2, lemoyne.edu=64, clarkson.edu=2, planet4589.org=2}"
+"86","1.7846093175822054E-4","1229.2325581395348","105714","4314","[875, 878, 884, 888, 890, 931, 948, 1271, 1416, 2070, 3077, 3640, 4314, 4314]","1.5796314258954306E-6","0.0","0","0.0","0","0.0","application/x-plist",,"[{.webloc, 44}, {.plist, 18}]","6","12","12","[{edu, 62}, {jp, 14}, {org, 6}]","{harvard.edu=1, ioccc.org=1, ucsf.edu=3, wisc.edu=44, caltech.edu=2, dream.jp=14, usyd.edu.au=1, uni-muenster.de=2, stm-assoc.org=1, bcgsc.ca=1, bcm.edu=12, personality-project.org=4}"
+"79","1.6393504196394678E-4","5275.341772151899","416752","18440","[1109, 1112, 1798, 2210, 2618, 3503, 4933, 6069, 7666, 8832, 11503, 13545, 18440, 18440]","6.227316684684833E-6","0.0","0","0.0","0","0.0","message/news",,"[{.txt, 6}]","3","5","5","[{edu, 57}, {org, 20}]","{ugr.es=2, berkeley.edu=57, dbaron.org=1, netlib.org=1, euroforth.org=18}"
+"79","1.6393504196394678E-4","365100.96202531643","28842976","3600466","[679, 688, 708, 729, 774, 910, 1232, 132076, 660820, 893101, 2772134, 3600461, 3600466, 3600466]","4.30986163667515E-4","0.0","0","0.0","0","0.0","text/vnd.dvb.subtitle",,"[{.sub, 57}, {.peaks, 22}]","2","2","2","[{org, 57}, {ca, 22}]","{netlib.org=57, bcgsc.ca=22}"
+"68","1.411086437158023E-4","848570.5588235294","57702798","8976892","[817, 867, 1189, 1621, 1650, 1812, 3407, 13586, 58476, 5234351, 8965065, 8976289, 8976892, 8976892]","8.622240486869856E-4","11.764705882352942","56849064","8.49467128546272E-4","56849064","8.49467128546272E-4","text/vnd.ascii-art","{length=8}","[{.ascii, 63}, {.table, 5}]","3","6","6","[{edu, 47}, {gov, 12}, {org, 9}]","{hitran.org=8, noaa.gov=12, wisc.edu=36, nyu.edu=8, caltech.edu=3, netlib.org=1}"
+"68","1.411086437158023E-4","7384127.029411765","502120638","26068708","[2607, 30669, 35403, 44100, 350011, 831640, 4532217, 12511889, 16151451, 23028698, 25334318, 25431130, 26068708, 26068708]","0.007502937542572066","22.058823529411764","203425566","0.003039686483192056","203425566","0.003039686483192056","application/mxf","{length=15}","[{.mxf, 68}]","2","2","2","[{org, 66}]","{nasa.gov=2, freedesktop.org=66}"
+"60","1.2450762680806085E-4","46522.933333333334","2791376","968499","[822, 825, 961, 999, 1005, 1035, 2679, 77784, 116684, 116749, 123388, 123388, 968499, 968499]","4.1710135375544236E-5","0.0","0","0.0","0","0.0","text/vnd.graphviz",,"[{.dot, 52}, {.gv, 5}]","8","11","11","[{org, 21}, {ca, 19}, {jp, 8}, {com, 8}]","{zitogiuseppe.com=2, johndcook.com=5, lirmm.fr=1, boost.org=20, bcgsc.ca=19, euroforth.org=1, csic.es=1, riken.jp=8, knopper.net=1, uzh.ch=1, ptrckprry.com=1}"
+"58","1.2035737258112548E-4","6921.137931034483","401426","266385","[879, 891, 909, 985, 1074, 1513, 1762, 1803, 2014, 2150, 7484, 35847, 266385, 266385]","5.998307932454538E-6","0.0","0","0.0","0","0.0","application/x-dtbresource+xml",,"[{.res, 57}]","7","9","9","[{org, 26}, {pl, 14}, {fr, 12}]","{obspm.fr=12, astrouw.edu.pl=14, esa.int=1, sdss.org=25, nasa.gov=2, noaa.gov=1, uni-saarland.de=1, ox.ac.uk=1, netlib.org=1}"
+"56","1.1620711835419013E-4","2853046.4285714286","159770600","18033228","[519897, 533533, 742173, 765625, 813875, 1048484, 1416064, 2690419, 3909919, 4680782, 15175705, 16300755, 18033228, 18033228]","0.0023873721616263547","0.0","0","0.0","0","0.0","text/sgml",,"[{.SGM, 56}]","1","1","1","[{edu, 56}]","{tamu.edu=56}"
+"56","1.1620711835419013E-4","1027.4107142857142","57535","1800","[798, 798, 801, 803, 842, 851, 900, 1060, 1072, 1794, 1798, 1799, 1800, 1800]","8.597167271023099E-7","0.0","0","0.0","0","0.0","text/x-less",,"[{.less, 56}]","1","1","1","[{edu, 56}]","{andrews.edu=56}"
+"56","1.1620711835419013E-4","5250270.214285715","294015132","26134937","[59186, 77712, 90915, 96135, 100465, 154568, 691323, 4772042, 16568596, 21093906, 25716104, 25716149, 26134937, 26134937]","0.004393321056775765","10.714285714285714","144861758","0.0021645967927356718","144861758","0.0021645967927356718","application/vnd.ms-powerpoint.presentation.macroenabled.12","{length=6}","[{.pptm, 55}]","11","21","22","[{at, 17}, {org, 11}, {edu, 8}, {ch, 6}]","{upjs.sk=2, eurasip.org=1, ceos.org=4, nagoya-u.ac.jp=3, doe.gov=1, towson.edu=1, exeter.ac.uk=2, u-tokyo.ac.jp=1, utexas.edu=3, wisc.edu=1, mit.edu=2, cern.ch=1, uni-heidelberg.de=1, unizg.hr=1, forestcarbonpartnership.org=6, esa.int=1, jku.at=17, uni-bielefeld.de=1, hawaii.edu=1, bournemouth.ac.uk=1, uzh.ch=5}"
+"56","1.1620711835419013E-4","11296.857142857143","632624","189544","[678, 680, 683, 695, 697, 700, 1214, 1800, 1832, 3164, 89566, 189455, 189544, 189544]","9.452984005672578E-6","0.0","0","0.0","0","0.0","text/x-ini",,"[{.ini, 56}]","8","15","15","[{de, 18}, {pl, 14}, {org, 12}, {edu, 7}]","{uwb.edu.pl=14, stsci.edu=1, earthbyte.org=1, fu-berlin.de=18, stanford.edu=3, lifs-tools.org=1, lowell.edu=3, nasa.gov=2, salmobase.org=4, povray.org=4, utoronto.ca=1, abstractmath.org=1, wgtn.ac.nz=1, usyd.edu.au=1, netlib.org=1}"
+"54","1.1205686412725476E-4","460039.0","24842106","3163999","[708, 716, 737, 821, 1032, 1338, 2748, 13734, 35759, 3047051, 3119751, 3163418, 3163999, 3163999]","3.7120316441554977E-4","0.0","0","0.0","0","0.0","text/x-basic",,"[{.bas, 29}, {.BAS, 25}]","8","11","12","[{edu, 22}, {com, 12}, {jp, 6}]","{kek.jp=6, miketaylor.org.uk=4, ioccc.org=2, euroforth.org=1, philrutherford.com=12, noaa.gov=3, lemoyne.edu=14, caltech.edu=8, utoronto.ca=2, in-the-sky.org=1, unipi.it=1}"
+"54","1.1205686412725476E-4","2715723.537037037","146649071","26131556","[44093, 86399, 112473, 186277, 225118, 272809, 1058807, 2102099, 5813474, 7276716, 13166309, 25946135, 26131556, 26131556]","0.0021913037169151695","3.7037037037037037","52077691","7.781708883560515E-4","52077691","7.781708883560515E-4","application/vnd.openxmlformats-officedocument.presentationml.template","{length=2}","[{.potx, 54}]","8","21","21","[{edu, 12}, {de, 12}, {gov, 11}, {org, 6}, {se, 6}]","{bnl.gov=11, uni-freiburg.de=1, ceos.org=2, tu-chemnitz.de=1, umd.edu=6, berkeley.edu=3, ulm.edu=1, gdch.de=1, earthobservations.org=1, uni-leipzig.de=5, uu.se=5, uol.de=1, uit.no=4, iscb.org=1, cern.ch=1, uni-potsdam.de=1, wcrp-climate.org=2, su.se=1, nrao.edu=2, uni-bielefeld.de=2, reading.ac.uk=2}"
+"52","1.079066099003194E-4","916572.0769230769","47661748","12195963","[2130, 154746, 154749, 164547, 164701, 281025, 443685, 786811, 791372, 795586, 2440770, 12195953, 12195963, 12195963]","7.121856608765981E-4","1.9230769230769231","2440770","3.647120528390554E-5","2440770","3.647120528390554E-5","text/vnd.trolltech.linguist","{length=1}","[{.ts, 52}]","2","2","2","[{org, 51}]","{noaa.gov=1, freedesktop.org=51}"
+"51","1.0583148278685173E-4","35257.294117647056","1798122","582043","[910, 917, 935, 1136, 1163, 1734, 4157, 38374, 66551, 66554, 105087, 140262, 582043, 582043]","2.686843766004449E-5","0.0","0","0.0","0","0.0","application/winhlp",,"[{.hlp, 27}, {.HLP, 24}]","3","5","5","[{edu, 38}, {in, 12}]","{ucla.edu=10, lemoyne.edu=26, iiit.ac.in=12, hawaii.edu=2, uwaterloo.ca=1}"
+"49","1.0168122855991636E-4","1173089.9795918367","57481409","23182695","[805, 805, 810, 817, 828, 7670, 803474, 1139653, 1419448, 1636431, 1976051, 23182695, 23182695, 23182695]","8.589159435944949E-4","2.0408163265306123","23182695","3.4640741584793754E-4","23182695","3.4640741584793754E-4","text/vnd.a","{length=1}","[{.dat, 28}, {.a, 18}]","6","7","7","[{edu, 29}, {cz, 12}]","{bcgsc.ca=2, nasa.gov=2, woudc.org=1, caltech.edu=28, muni.cz=12, uva.nl=3, ucsc.edu=1}"
+"49","1.0168122855991636E-4","9110611.306122448","446419954","25912679","[1574, 1574, 56692, 555983, 1983740, 5123720, 6363257, 8311802, 25098352, 25573128, 25588654, 25912679, 25912679, 25912679]","0.006670630081968259","81.63265306122449","438067718","0.006545826797047669","438067718","0.006545826797047669","application/x-gtar","{length=40}","[{.ova, 9}]","9","12","13","[{uk, 30}]","{metoffice.gov.uk=30, noaa.gov=2, nasa.gov=1, tu-braunschweig.de=2, wisc.edu=1, jhuapl.edu=2, caltech.edu=1, astrouw.edu.pl=1, esa.int=1, ico2s.org=3, uzh.ch=4, fzu.cz=1}"
+"41","8.508021165217492E-5","9355.243902439024","383565","258038","[723, 723, 726, 733, 769, 771, 1436, 1624, 2103, 2469, 3204, 258038, 258038, 258038]","5.731419943182368E-6","0.0","0","0.0","0","0.0","text/x-ruby",,"[{.rb, 41}]","4","6","6","[{org, 30}, {edu, 6}]","{software-lab.org=12, stanford.edu=1, ioccc.org=18, exampler.com=3, bcgsc.ca=2, ias.edu=5}"
+"41","8.508021165217492E-5","23358.365853658535","957693","65032","[865, 865, 1008, 1535, 1559, 1596, 7829, 57147, 63151, 64448, 64930, 65032, 65032, 65032]","1.4310327479426306E-5","0.0","0","0.0","0","0.0","text/x-emacs-lisp",,"[{.el, 41}]","5","9","10","[{edu, 15}, {pl, 13}, {org, 7}, {de, 5}]","{uwb.edu.pl=13, berkeley.edu=11, umb.edu=2, euroforth.org=1, spinroot.com=1, python.org=5, gnu.org=1, uni-saarland.de=5, upenn.edu=2}"
+"40","8.300508453870723E-5","1823.575","72943","4112","[673, 673, 676, 696, 778, 1094, 1437, 2270, 4106, 4108, 4110, 4112, 4112, 4112]","1.0899507643177856E-6","0.0","0","0.0","0","0.0","text/x-idl",,"[{.idl, 40}]","2","3","3","[{com, 39}]","{zitogiuseppe.com=6, berkeley.edu=1, franz.com=33}"
+"38","7.885483031177187E-5","51839.23684210526","1969891","171729","[1706, 1706, 1711, 1713, 1715, 20835, 58177, 72464, 98990, 100904, 123552, 171729, 171729, 171729]","2.9435095911502504E-5","0.0","0","0.0","0","0.0","application/vnd.ms-fontobject",,"[{.eot, 38}]","7","7","7","[{se, 12}, {edu, 8}, {ca, 6}, {gov, 6}]","{iasbs.ac.ir=4, andrews.edu=8, vr.se=12, bcgsc.ca=6, noaa.gov=6, genetic.edu.ph=1, leiza.de=1}"
+"36","7.470457608483651E-5","103288.33333333333","3718380","675406","[3549, 3549, 3911, 4535, 5615, 8667, 29407, 119908, 302453, 316117, 555564, 675406, 675406, 675406]","5.556189247801664E-5","0.0","0","0.0","0","0.0","text/comma-separated-values",,"[]","1","3","3","[{edu, 36}]","{columbia.edu=33, nrao.edu=2, wisc.edu=1}"
+"35","7.262944897136883E-5","2.2100800742857143E7","773528026","26223361","[1817592, 1817592, 1902698, 2593032, 24200983, 24900019, 25213359, 25340122, 26223334, 26223342, 26223352, 26223361, 26223361, 26223361]","0.011558442388713489","85.71428571428571","760832742","0.011368743109837771","760832742","0.011368743109837771","application/vnd.lotus-wordpro","{length=30}","[{.bw, 24}, {.lrz, 6}, {.bai, 5}]","2","2","2","[{edu, 28}, {ca, 7}]","{bcgsc.ca=7, wisc.edu=28}"
+"33","6.847919474443346E-5","2098.909090909091","69264","3749","[1365, 1365, 1369, 1480, 1483, 1577, 1787, 2269, 3138, 3195, 3693, 3749, 3749, 3749]","1.0349773074826523E-6","0.0","0","0.0","0","0.0","application/vnd.frogans.ltf",,"[{.ltf, 33}]","1","1","1","[{edu, 33}]","{brandeis.edu=33}"
+"32","6.640406763096578E-5","20891.40625","668525","56194","[10658, 10658, 11858, 12447, 12495, 13831, 16853, 26043, 29509, 30869, 44871, 56194, 56194, 56194]","9.989434691684571E-6","0.0","0","0.0","0","0.0","application/vnd.chemdraw+xml",,"[{.cdxml, 32}]","1","1","1","[{de, 32}]","{dkfz.de=32}"
+"31","6.432894051749811E-5","16210.967741935483","502540","47178","[3226, 3226, 6269, 8399, 9201, 9840, 13784, 19312, 25084, 27474, 32017, 47178, 47178, 47178]","7.509203859181278E-6","0.0","0","0.0","0","0.0","chemical/x-cdx",,"[{.cdx, 30}]","2","2","2","[{de, 30}]","{harvard.edu=1, dkfz.de=30}"
+"31","6.432894051749811E-5","54083.903225806454","1676601","582309","[1944, 1944, 1951, 1958, 1991, 2006, 2109, 2162, 94789, 165873, 523309, 582309, 582309, 582309]","2.505261013950569E-5","0.0","0","0.0","0","0.0","application/x-shockwave-flash",,"[{.txt, 8}, {.html, 8}]","4","4","4","[{ch, 24}]","{ti.com=2, hixie.ch=24, fz-juelich.de=4, cancer.gov=1}"
+"31","6.432894051749811E-5","3386.516129032258","104982","24285","[759, 759, 768, 1042, 1063, 1184, 1563, 3625, 6177, 6630, 12931, 24285, 24285, 24285]","1.5686935160277171E-6","0.0","0","0.0","0","0.0","text/x-ada",,"[{.ads, 19}, {.adb, 12}]","1","2","2","[{edu, 31}]","{lowell.edu=23, ucdavis.edu=8}"
+"31","6.432894051749811E-5","83615.7741935484","2592089","482121","[1197, 1197, 1220, 1246, 1256, 1267, 1646, 4286, 332521, 453523, 482107, 482121, 482121, 482121]","3.873228941405926E-5","0.0","0","0.0","0","0.0","text/x-vbasic",,"[{.frm, 24}, {.cls, 7}]","7","8","8","[{ru, 13}, {com, 8}]","{zitogiuseppe.com=8, vldb.org=4, epfl.ch=1, columbia.edu=1, usp.br=1, harvard.edu=1, msu.ru=13, nasa.gov=2}"
+"27","5.602843206362738E-5","582345.7037037037","15723334","1623066","[843, 843, 844, 846, 12092, 145874, 316049, 870992, 1317259, 1623060, 1623063, 1623066, 1623066, 1623066]","2.349459154534887E-4","0.0","0","0.0","0","0.0","application/vnd.yellowriver-custom-menu",,"[{.cdb, 6}, {.hdb, 6}]","1","1","1","[{edu, 27}]","{wisc.edu=27}"
+"27","5.602843206362738E-5","31850.51851851852","859964","250368","[3152, 3152, 5108, 6550, 6649, 7620, 10041, 11549, 11798, 159335, 235382, 250368, 250368, 250368]","1.2850011914587832E-5","0.0","0","0.0","0","0.0","application/x-stata-dta; version=14",,"[{.dta, 24}]","3","5","5","[{edu, 24}]","{columbia.edu=1, jstatsoft.org=2, unav.edu=1, mtu.edu=22, uu.se=1}"
+"27","5.602843206362738E-5","839.4814814814815","22666","844","[836, 836, 836, 837, 838, 839, 839, 840, 841, 842, 842, 844, 844, 844]","3.3868670090381435E-7","0.0","0","0.0","0","0.0","image/jpeg",,"[{.php, 27}]","1","1","1","[{ru, 27}]","{basis-foundation.ru=27}"
+"26","5.39533049501597E-5","4147940.1923076925","107846445","4192660","[4018219, 4018219, 4018267, 4139947, 4140005, 4148314, 4152110, 4164908, 4173980, 4174036, 4192626, 4192660, 4192660, 4192660]","0.0016114954849225564","0.0","0","0.0","0","0.0","chemical/x-xyz",,"[{.xyz, 26}]","1","1","1","[{edu, 26}]","{mit.edu=26}"
+"26","5.39533049501597E-5","1589.4615384615386","41326","10406","[659, 659, 663, 665, 668, 1156, 1196, 1631, 1712, 1769, 1776, 10406, 10406, 10406]","6.175137475315906E-7","0.0","0","0.0","0","0.0","application/relax-ng-compact-syntax",,"[{.rnc, 26}]","3","3","3","[{org, 16}, {pl, 7}]","{uwb.edu.pl=7, ontopia.net=3, openmath.org=16}"
+"25","5.187817783669202E-5","1348.6","33715","3170","[614, 614, 617, 691, 711, 756, 1120, 1637, 2073, 2513, 3138, 3170, 3170, 3170]","5.037863814070458E-7","0.0","0","0.0","0","0.0","text/x-sed",,"[{.sed, 25}]","3","6","6","[{org, 20}]","{zitogiuseppe.com=2, berkeley.edu=1, ioccc.org=2, lowell.edu=1, wisc.edu=1, netlib.org=18}"
+"24","4.980305072322434E-5","1331.5","31956","1577","[1106, 1106, 1115, 1174, 1236, 1303, 1336, 1401, 1409, 1435, 1501, 1577, 1577, 1577]","4.775025242249312E-7","0.0","0","0.0","0","0.0","application/vnd.businessobjects",,"[{.rep, 24}]","1","1","1","[{ru, 24}]","{sao.ru=24}"
+"24","4.980305072322434E-5","1.79910695E7","431785668","26223431","[1485, 1485, 1487, 2242, 2244, 3259328, 26223398, 26223416, 26223420, 26223425, 26223425, 26223431, 26223431, 26223431]","0.006451957266057958","70.83333333333333","428512482","0.006403047684844486","428512482","0.006403047684844486","application/pgp-encrypted","{length=17}","[{.gpg, 23}]","4","5","5","[{ca, 20}]","{uni-wuerzburg.de=1, fernuni-hagen.de=1, gnu.org=1, mit.edu=1, bcgsc.ca=20}"
+"24","4.980305072322434E-5","5309204.916666667","127420918","22416433","[11821, 11821, 13699, 14014, 14408, 15816, 53335, 8346581, 17209647, 17716146, 19581297, 22416433, 22416433, 22416433]","0.0019039870441875696","4.166666666666667","22416433","3.3495754605141593E-4","22416433","3.3495754605141593E-4","application/vnd.ms-excel.sheet.binary.macroenabled.12","{length=1}","[{.xlsb, 16}]","8","11","11","[{uk, 8}, {edu, 7}]","{fhnw.ch=1, defra.gov.uk=5, ntnu.no=3, uvm.dk=2, oeko.de=1, cmu.edu=1, ntnu.edu=3, whoi.edu=3, theccc.org.uk=3, ed.gov=1, ufpel.edu.br=1}"
+"24","4.980305072322434E-5","3894890.2083333335","93477365","10962690","[174064, 174064, 731283, 858255, 949238, 1185718, 4193700, 5988411, 7015941, 8211663, 10367208, 10962690, 10962690, 10962690]","0.0013967855096193279","0.0","0","0.0","0","0.0","application/ogg",,,"1","1","1","[{org, 24}]","{freedesktop.org=24}"
+"23","4.772792360975666E-5","1071770.8260869565","24650729","5168611","[15583, 15583, 73790, 73801, 74285, 76638, 200706, 466686, 5112221, 5112235, 5168597, 5168611, 5168611, 5168611]","3.683435136276353E-4","0.0","0","0.0","0","0.0","application/vnd.ms-htmlhelp",,"[{.chm, 23}]","6","6","6","[{com, 16}]","{uni.lu=1, sao.ru=1, arcgis.com=16, bioconductor.org=1, lemoyne.edu=1, tuwien.ac.at=3}"
+"22","4.565279649628898E-5","153190.04545454544","3370181","1923264","[13965, 13965, 20362, 27551, 35826, 56239, 78340, 92205, 109582, 111125, 112292, 1923264, 1923264, 1923264]","5.035892898344295E-5","0.0","0","0.0","0","0.0","application/vnd.ms-word.document.macroenabled.12",,"[{.docm, 22}]","11","17","17","[]","{scielo.org.mx=1, tu-chemnitz.de=1, icesd.org=1, towson.edu=1, exeter.ac.uk=1, andrews.edu=1, asc-csa.gc.ca=2, uni-due.de=1, weather.gov=1, uni-bremen.de=1, weebly.com=1, uni-tuebingen.de=1, unizg.hr=4, uvm.dk=2, gla.ac.uk=1, bournemouth.ac.uk=1, ufsm.br=1}"
+"22","4.565279649628898E-5","4100.409090909091","90209","12888","[1114, 1114, 1362, 1831, 2225, 2743, 4094, 4404, 5473, 5975, 8383, 12888, 12888, 12888]","1.3479479661974846E-6","0.0","0","0.0","0","0.0","text/x-jsp",,"[{.rsp, 14}]","7","8","8","[{com, 10}]","{ucla.edu=1, rstudio.com=10, wias-berlin.de=1, jku.at=2, r-project.org=4, whoi.edu=2, idm.gov.vn=1, utoronto.ca=1}"
+"21","4.3577669382821294E-5","128206.09523809524","2692328","682369","[10401, 10401, 15146, 16157, 21460, 33752, 92835, 137449, 172124, 270208, 374310, 682369, 682369, 682369]","4.023011065344413E-5","0.0","0","0.0","0","0.0","model/vnd.dwf",,"[{.dwf, 21}]","1","1","1","[{nl, 21}]","{uva.nl=21}"
+"20","4.150254226935362E-5","1070.95","21419","7102","[645, 645, 646, 646, 647, 647, 649, 869, 1250, 1450, 7102, 7102, 7102, 7102]","3.2005340362917143E-7","0.0","0","0.0","0","0.0","text/x-lex",,"[{.l, 20}]","4","5","5","[{org, 15}]","{usp.br=3, wisc.edu=1, utoronto.ca=1, netlib.org=14, euroforth.org=1}"
+"20","4.150254226935362E-5","253121.2","5062424","4352157","[897, 897, 908, 938, 958, 1003, 1047, 42794, 191356, 321777, 4352157, 4352157, 4352157, 4352157]","7.564526970512183E-5","0.0","0","0.0","0","0.0","file/unknown",,"[{.php, 20}]","1","1","1","[{kr, 20}]","{unist.ac.kr=20}"
+"20","4.150254226935362E-5","4172.75","83455","8888","[1992, 1992, 2230, 2476, 2479, 2604, 3629, 5018, 5996, 8828, 8888, 8888, 8888, 8888]","1.2470263224180633E-6","0.0","0","0.0","0","0.0","application/x-endnote-style",,"[{.ens, 20}]","5","8","8","[{no, 6}]","{techscience.com=2, uni-potsdam.de=1, rsc.org=3, mtu.edu=3, hvl.no=6, mapress.com=1, uni-saarland.de=3, kmae-journal.org=1}"
+"20","4.150254226935362E-5","19604.0","392080","246690","[1036, 1036, 1040, 1040, 1041, 1041, 1046, 1050, 29723, 97918, 246690, 246690, 246690, 246690]","5.858655329143542E-6","0.0","0","0.0","0","0.0","chemical/x-pdb",,"[{.pdb, 20}]","3","4","4","[{org, 18}]","{ijcai.org=1, salmobase.org=17, ucdavis.edu=1, utoronto.ca=1}"
+"20","4.150254226935362E-5","4197.75","83955","27402","[1263, 1263, 1490, 1777, 2088, 2255, 3028, 3992, 4556, 6406, 27402, 27402, 27402, 27402]","1.2544975723277038E-6","0.0","0","0.0","0","0.0","application/x-sas",,"[{.sas, 20}]","4","4","4","[{fr, 12}, {edu, 5}]","{uqam.ca=1, bu.edu=5, ipums.org=2, loria.fr=12}"
+"19","3.9427415155885933E-5","5009.578947368421","95182","19685","[965, 965, 965, 1142, 1146, 1496, 3085, 10665, 10867, 12698, 19685, 19685, 19685, 19685]","1.422257017798767E-6","0.0","0","0.0","0","0.0","application/x-elc",,"[]","6","9","9","[{edu, 5}]","{uni-freiburg.de=2, schemers.org=1, uni.lu=4, harvard.edu=1, berkeley.edu=3, lowell.edu=1, ioc.ee=1, franz.com=3, netlib.org=3}"
+"19","3.9427415155885933E-5","58511.36842105263","1111716","137326","[54124, 54124, 54124, 54125, 54126, 54128, 54132, 54140, 54141, 54142, 137326, 137326, 137326, 137326]","1.6611816129091365E-5","0.0","0","0.0","0","0.0","application/vnd.wordperfect",,"[{.WP, 18}]","1","2","2","[{edu, 19}]","{ucla.edu=18, wisc.edu=1}"
+"19","3.9427415155885933E-5","3361002.3684210526","63859045","7257113","[27279, 27279, 27279, 27279, 27281, 27307, 3231935, 6778726, 7115882, 7255230, 7257113, 7257113, 7257113, 7257113]","9.542137683719325E-4","47.36842105263158","60374649","9.021481817716304E-4","60374649","9.021481817716304E-4","application/bizagi-modeler","{length=9}","[{.bpm, 19}]","2","2","2","[{com, 10}, {edu, 9}]","{illumina.com=10, wisc.edu=9}"
+"18","3.735228804241826E-5","63157.166666666664","1136829","318354","[4109, 4109, 4109, 4111, 4127, 4559, 36330, 65904, 191096, 196769, 318354, 318354, 318354, 318354]","1.698706712705296E-5","0.0","0","0.0","0","0.0","application/java-archive",,"[]","2","2","2","[{int, 15}]","{esa.int=15, unizg.hr=3}"
+"18","3.735228804241826E-5","56918.22222222222","1024528","168152","[1064, 1064, 1064, 1066, 1067, 1092, 3853, 164135, 168150, 168151, 168152, 168152, 168152, 168152]","1.5309009454847927E-5","0.0","0","0.0","0","0.0","application/vnd.kde.kpresenter",,"[{.kpt, 18}]","1","1","1","[{edu, 18}]","{wisc.edu=18}"
+"17","3.527716092895057E-5","5648.0","96016","13846","[1485, 1485, 1485, 1519, 1524, 1613, 2133, 11633, 13473, 13845, 13846, 13846, 13846, 13846]","1.4347190626480472E-6","0.0","0","0.0","0","0.0","application/x-ms-wmz",,"[{.wmz, 17}]","4","4","4","[{com, 8}, {org, 5}]","{zitogiuseppe.com=8, python.org=5, utoronto.ca=3, umb.edu=1}"
+"17","3.527716092895057E-5","4995577.882352941","84924824","26069141","[15705, 15705, 15705, 16854, 17463, 23255, 37907, 5241199, 19977382, 22874938, 26069141, 26069141, 26069141, 26069141]","0.0012689891672724378","5.882352941176471","26069141","3.8953813468130076E-4","26069141","3.8953813468130076E-4","application/vnd.realvnc.bed","{length=1}","[{.bb, 5}]","1","1","1","[{ca, 17}]","{bcgsc.ca=17}"
+"17","3.527716092895057E-5","1087.2941176470588","18484","1118","[795, 795, 795, 1097, 1098, 1099, 1102, 1109, 1116, 1116, 1118, 1118, 1118, 1118]","2.761971666595828E-7","0.0","0","0.0","0","0.0","application/vnd.lotus-organizer",,"[{.org, 17}]","2","2","2","[{de, 16}]","{ejce.org=1, uni-tuebingen.de=16}"
+"17","3.527716092895057E-5","1107.4705882352941","18827","1773","[780, 780, 780, 915, 932, 943, 1042, 1146, 1238, 1541, 1773, 1773, 1773, 1773]","2.813224440975961E-7","0.0","0","0.0","0","0.0","text/x-go",,"[{.go, 17}]","2","2","2","[{it, 16}]","{unimi.it=16, mit.edu=1}"
+"17","3.527716092895057E-5","10810.64705882353","183781","31196","[919, 919, 919, 930, 962, 5466, 6420, 11292, 23041, 31147, 31196, 31196, 31196, 31196]","2.74614755928721E-6","0.0","0","0.0","0","0.0","application/x-bplist",,"[{.qtz, 14}]","2","3","3","[{com, 14}]","{mrob.com=14, eiu.edu=1, caltech.edu=2}"
+"16","3.320203381548289E-5","2255.3125","36085","5599","[747, 747, 747, 809, 1066, 1339, 1986, 3322, 4017, 4079, 5599, 5599, 5599, 5599]","5.392001059787409E-7","0.0","0","0.0","0","0.0","text/x-forth",,"[{.4th, 16}]","1","1","1","[{org, 16}]","{euroforth.org=16}"
+"16","3.320203381548289E-5","2.3572357625E7","377157722","25532068","[10082896, 10082896, 10082896, 10082897, 25466040, 25469525, 25504552, 25519262, 25519263, 25532059, 25532068, 25532068, 25532068, 25532068]","0.005635679192825287","87.5","356991929","0.005334351834567137","356991929","0.005334351834567137","model/vnd.mts","{length=14}","[{.MTS, 16}]","2","2","2","[{au, 14}]","{freedesktop.org=2, csiro.au=14}"
+"15","3.112690670201521E-5","344828.06666666665","5172421","3264340","[21887, 21887, 21887, 24764, 27224, 54540, 104331, 242111, 358234, 465288, 3264340, 3264340, 3264340, 3264340]","7.728889985774323E-5","0.0","0","0.0","0","0.0","application/vnd.openxmlformats-officedocument.spreadsheetml.template",,"[{.xltx, 15}]","4","7","7","[{de, 8}]","{unibas.ch=1, nmsu.edu=2, uni-wuerzburg.de=1, uni-due.de=1, uni-bielefeld.de=5, genotoul.fr=4, uni-tuebingen.de=1}"
+"15","3.112690670201521E-5","1529.8","22947","1734","[1050, 1050, 1050, 1052, 1136, 1374, 1723, 1724, 1732, 1734, 1734, 1734, 1734, 1734]","3.428855433530322E-7","0.0","0","0.0","0","0.0","text/x-vhdl",,"[{.vhd, 10}]","2","4","4","[{edu, 14}]","{mpg.de=1, maine.edu=4, wisc.edu=9, hawaii.edu=1}"
+"15","3.112690670201521E-5","1065.6666666666667","15985","3521","[767, 767, 767, 768, 769, 769, 806, 811, 1250, 1752, 3521, 3521, 3521, 3521]","2.388558596112006E-7","0.0","0","0.0","0","0.0","application/rls-services+xml",,"[{.Rs, 13}]","4","4","4","[{org, 12}]","{usyd.edu.au=1, personality-project.org=12, loria.fr=1, ucdavis.edu=1}"
+"15","3.112690670201521E-5","1.50434324E7","225651486","16930582","[7882229, 7882229, 7882229, 7963605, 14555202, 14572094, 16145557, 16700680, 16747664, 16769629, 16930582, 16930582, 16930582, 16930582]","0.0033717972887754013","0.0","0","0.0","0","0.0","application/vnd.sun.xml.draw.template",,"[{.bw, 15}]","1","1","1","[{jp, 15}]","{riken.jp=15}"
+"15","3.112690670201521E-5","1889541.3333333333","28343120","7047969","[91061, 91061, 91061, 400843, 747273, 748246, 1206792, 1816203, 1865895, 6686119, 7047969, 7047969, 7047969, 7047969]","4.2351706547784867E-4","0.0","0","0.0","0","0.0","application/vnd.oasis.opendocument.tika.flat.document",,,"1","1","1","[{org, 15}]","{produccioncientificaluz.org=15}"
+"14","2.9051779588547533E-5","1239.357142857143","17351","4476","[650, 650, 650, 665, 665, 667, 765, 1336, 1750, 2380, 4476, 4476, 4476, 4476]","2.592673143643379E-7","0.0","0","0.0","0","0.0","text/css",,"[{.map, 7}]","5","5","5","[{ir, 7}]","{iasbs.ac.ir=7, osaka-u.ac.jp=1, hixie.ch=3, ox.ac.uk=1, dbaron.org=2}"
+"14","2.9051779588547533E-5","2.0215679785714287E7","283019517","21861042","[15205886, 15205886, 15205886, 17463225, 18517540, 19410000, 21256394, 21600521, 21688584, 21718150, 21861042, 21861042, 21861042, 21861042]","0.004229019081625386","85.71428571428571","246146292","0.0036780409237265927","246146292","0.0036780409237265927","application/x-download","{length=12}","[{.cub, 14}]","1","1","1","[{com, 14}]","{im-ldi.com=14}"
+"14","2.9051779588547533E-5","111410.57142857143","1559748","479389","[1751, 1751, 1751, 5565, 12878, 14423, 33934, 144752, 147623, 479384, 479389, 479389, 479389, 479389]","2.3306534208123294E-5","0.0","0","0.0","0","0.0","application/x-spss-sav",,"[{.sav, 10}]","8","9","9","[]","{upjs.sk=1, utwente.nl=1, unizg.hr=4, columbia.edu=3, soton.ac.uk=1, su.se=1, amstat.org=1, jstatsoft.org=1, uwaterloo.ca=1}"
+"14","2.9051779588547533E-5","3782686.214285714","52957607","4489670","[3124, 3124, 3124, 23755, 4365671, 4365740, 4395912, 4404451, 4460149, 4489641, 4489670, 4489670, 4489670, 4489670]","7.913190330270337E-4","0.0","0","0.0","0","0.0","application/wasm",,"[{.wasm, 14}]","2","3","3","[{fr, 13}]","{univ-grenoble-alpes.fr=2, ioccc.org=1, u-bordeaux.fr=11}"
+"14","2.9051779588547533E-5","1015556.6428571428","14217793","7042546","[793, 793, 793, 979, 986, 1008, 1114, 21914, 90877, 7042538, 7042546, 7042546, 7042546, 7042546]","2.1244936933306914E-4","0.0","0","0.0","0","0.0","application/x-stata-do",,"[{.do, 14}]","2","3","4","[{kr, 13}]","{yonsei.ac.kr=4, unist.ac.kr=9, ejce.org=1}"
+"14","2.9051779588547533E-5","964.6428571428571","13505","1394","[642, 642, 642, 658, 725, 761, 907, 1101, 1393, 1393, 1394, 1394, 1394, 1394]","2.0179846005938465E-7","0.0","0","0.0","0","0.0","text/x-robots",,"[{.txt, 14}]","6","11","11","[{org, 5}]","{utwente.nl=2, lowell.edu=1, esf.edu=1, davidar.io=1, gnu.org=3, griffithlab.org=1, ruhr-uni-bochum.de=1, wisc.edu=1, github.io=1, uzh.ch=1, mackelab.org=1}"
+"14","2.9051779588547533E-5","1417.0","19838","5097","[1020, 1020, 1020, 1057, 1074, 1101, 1137, 1150, 1247, 1320, 5097, 5097, 5097, 5097]","2.9642931141488876E-7","0.0","0","0.0","0","0.0","text/x-research-info-systems",,"[{.php, 14}]","1","1","1","[{org, 14}]","{marinespecies.org=14}"
+"13","2.697665247507985E-5","990.9230769230769","12882","2610","[680, 680, 680, 692, 692, 726, 796, 828, 1484, 1484, 2610, 2610, 2610, 2610]","1.9248928267197282E-7","0.0","0","0.0","0","0.0","text/x-awk",,"[{.awk, 13}]","4","6","6","[{org, 10}]","{isthe.com=1, euroforth.org=8, bcgsc.ca=1, caltech.edu=1, netlib.org=1, squid-cache.org=1}"
+"13","2.697665247507985E-5","677.6923076923077","8810","684","[675, 675, 675, 675, 675, 676, 677, 679, 680, 680, 684, 684, 684, 684]","1.3164342340786218E-7","0.0","0","0.0","0","0.0","application/x-research-info-systems",,"[{.ris, 13}]","1","1","1","[{pl, 13}]","{rcin.org.pl=13}"
+"13","2.697665247507985E-5","48271.46153846154","627529","63548","[42784, 42784, 42784, 42825, 42825, 42976, 44699, 53565, 60536, 60536, 63548, 63548, 63548, 63548]","9.376851969093344E-6","0.0","0","0.0","0","0.0","text/directory",,"[{.vct, 13}]","1","1","1","[{pl, 13}]","{uwb.edu.pl=13}"
+"13","2.697665247507985E-5","6456136.153846154","83929770","12523999","[472102, 472102, 472102, 621000, 621000, 1932190, 6985450, 10015530, 11698133, 11698133, 12523999, 12523999, 12523999, 12523999]","0.0012541205730572634","23.076923076923077","28623809","4.2771126080962276E-4","10015530","1.4965705521499954E-4","application/x-adobe-indesign","{length=1, disconnect=2}","[{.indd, 11}]","5","7","7","[{no, 6}]","{unibas.ch=1, wcrp-climate.org=2, aber.ac.uk=1, uni-jena.de=1, tu-braunschweig.de=1, uit.no=6, cern.ch=1}"
+"13","2.697665247507985E-5","1.8315863076923076E7","238106220","26061120","[46758, 46758, 46758, 8572443, 8572443, 15352219, 17186819, 26056245, 26058647, 26058647, 26061120, 26061120, 26061120, 26061120]","0.0035579021493195896","92.3076923076923","238059462","0.0035572034679130396","238059462","0.0035572034679130396","application/vnd.xara","{length=12}",,"2","2","2","[{cern, 12}]","{root.cern=12, bibsonomy.org=1}"
+"13","2.697665247507985E-5","1275.1538461538462","16577","2008","[1205, 1205, 1205, 1205, 1205, 1210, 1211, 1214, 1253, 1253, 2008, 2008, 2008, 2008]","2.477018195042147E-7","0.0","0","0.0","0","0.0","application/x-x509-key; format=pem",,"[{.key, 11}]","3","4","4","[{edu, 11}]","{tau.ac.il=1, berkeley.edu=10, kwarc.info=1, caltech.edu=1}"
+"13","2.697665247507985E-5","1.1956417846153846E7","155433432","12483875","[11437446, 11437446, 11437446, 11593682, 11593682, 11646093, 11829668, 12338189, 12408080, 12408080, 12483875, 12483875, 12483875, 12483875]","0.0023225640295701654","100.0","155433432","0.0023225640295701654","155433432","0.0023225640295701654","application/vnd.snesdev-page-table","{length=13}","[{.bw, 13}]","1","1","1","[{ca, 13}]","{bcgsc.ca=13}"
+"13","2.697665247507985E-5","2782864.4615384615","36177238","6726628","[711, 711, 711, 20194, 20194, 29658, 2471629, 5150413, 6672847, 6672847, 6726628, 6726628, 6726628, 6726628]","5.405783722770717E-4","15.384615384615385","11877041","1.7747268299608842E-4","11877041","1.7747268299608842E-4","application/marc","{length=2}","[{.mrc, 12}]","3","5","5","[{edu, 9}]","{ucsf.edu=7, wisc.edu=1, ems.press=1, aip.org=3, bcm.edu=1}"
+"13","2.697665247507985E-5","464586.76923076925","6039628","2443832","[14756, 14756, 14756, 28473, 28473, 52887, 57079, 434369, 1502865, 1502865, 2443832, 2443832, 2443832, 2443832]","9.024714029852213E-5","0.0","0","0.0","0","0.0","text/x-uuencode",,"[{.uu, 13}]","3","3","3","[{org, 9}]","{nist.gov=1, wisc.edu=3, netlib.org=9}"
+"12","2.490152536161217E-5","1704.0833333333333","20449","5026","[825, 825, 825, 825, 825, 830, 874, 2644, 3642, 3642, 5026, 5026, 5026, 5026]","3.055591788044692E-7","0.0","0","0.0","0","0.0","text/x-java-properties",,,"5","5","5","[{cz, 6}]","{zitogiuseppe.com=2, sandia.gov=1, gobiernodecanarias.org=2, muni.cz=6, bcgsc.ca=1}"
+"12","2.490152536161217E-5","1386321.4166666667","16635857","2637739","[129784, 129784, 129784, 647997, 647997, 1028542, 1367375, 1924514, 2088171, 2088171, 2637739, 2637739, 2637739, 2637739]","2.485812902160781E-4","0.0","0","0.0","0","0.0","application/vnd.ufdl",,"[{.bb, 12}]","1","1","1","[{ca, 12}]","{bcgsc.ca=12}"
+"12","2.490152536161217E-5","4767570.916666667","57210851","21133114","[730350, 730350, 730350, 733099, 733099, 915472, 1053142, 7017829, 20705523, 20705523, 21133114, 21133114, 21133114, 21133114]","8.548731307283899E-4","0.0","0","0.0","0","0.0","application/envi.hdr",,"[{.hdr, 12}]","2","2","2","[{edu, 10}]","{stanford.edu=10, epfl.ch=2}"
+"12","2.490152536161217E-5","73776.5","885318","326001","[8183, 8183, 8183, 11149, 11149, 17517, 39109, 79883, 228824, 228824, 326001, 326001, 326001, 326001]","1.3228864055005872E-5","0.0","0","0.0","0","0.0","application/vnd.sun.xml.writer",,"[{.sxw, 12}]","4","4","4","[{ru, 8}]","{eurasip.org=1, ruhr-uni-bochum.de=1, unipi.it=2, sao.ru=8}"
+"11","2.282639824814449E-5","2870.181818181818","31572","3747","[2347, 2347, 2347, 2452, 2452, 2473, 2733, 3139, 3535, 3535, 3747, 3747, 3747, 3747]","4.7176460429432744E-7","0.0","0","0.0","0","0.0","application/vnd.ms-tnef",,"[{.dat, 8}]","2","3","3","[{edu, 7}]","{centos.org=4, mit.edu=3, isi.edu=4}"
+"11","2.282639824814449E-5","6464.818181818182","71113","19407","[2111, 2111, 2111, 2527, 2527, 2531, 3304, 6640, 19403, 19403, 19407, 19407, 19407, 19407]","1.0626059896485022E-6","0.0","0","0.0","0","0.0","application/x-sfdu",,"[{.sfdu, 11}]","1","1","1","[{edu, 11}]","{ucla.edu=11}"
+"11","2.282639824814449E-5","168317.0909090909","1851488","298764","[58349, 58349, 58349, 60287, 60287, 69522, 157559, 298734, 298734, 298734, 298764, 298764, 298764, 298764]","2.766585910540022E-5","0.0","0","0.0","0","0.0","application/vnd.oasis.opendocument.text-template",,"[{.ott, 11}]","3","3","3","[{org, 6}]","{kek.jp=2, gobiernodecanarias.org=6, cern.ch=3}"
+"10","2.075127113467681E-5","879898.2","8798982","3395397","[1912, 1912, 1912, 2157, 2157, 2313, 3757, 1781889, 3395386, 3395397, 3395397, 3395397, 3395397, 3395397]","1.3147878694485336E-4","0.0","0","0.0","0","0.0","application/x-director",,"[{.dxr, 6}]","1","4","5","[{edu, 10}]","{lemoyne.edu=6, harvard.edu=2, si.edu=1, toronto.edu=1}"
+"10","2.075127113467681E-5","32037.1","320371","289147","[2035, 2035, 2035, 2036, 2036, 3305, 4140, 4145, 4549, 289147, 289147, 289147, 289147, 289147]","4.787143609602749E-6","0.0","0","0.0","0","0.0","application/x-dbf",,"[{.p7s, 9}]","3","3","3","[{org, 7}]","{philo.at=2, centos.org=7, unige.it=1}"
+"10","2.075127113467681E-5","6213487.0","62134870","25588823","[669606, 669606, 669606, 2222263, 2222263, 3122474, 4778645, 6018647, 6468256, 25588823, 25588823, 25588823, 25588823, 25588823]","9.284502837460243E-4","30.0","36480800","5.451143474072121E-4","25588823","3.8236098305310354E-4","application/vnd.rn-realmedia","{length=1, disconnect=2}","[]","4","4","4","[{int, 5}]","{duke.edu=1, ntu.edu.tw=1, esa.int=5, fz-juelich.de=3}"
+"10","2.075127113467681E-5","2073.8","20738","3490","[1032, 1032, 1032, 1073, 1073, 1119, 1973, 3066, 3107, 3490, 3490, 3490, 3490, 3490]","3.0987756125224134E-7","0.0","0","0.0","0","0.0","text/x-java-source",,"[]","2","2","2","[{edu, 8}]","{uva.nl=2, brown.edu=8}"
+"10","2.075127113467681E-5","1.27184487E7","127184487","18897994","[7894954, 7894954, 7894954, 8205454, 8205454, 8286703, 14307468, 16074775, 17296128, 18897994, 18897994, 18897994, 18897994, 18897994]","0.0019004541740127972","0.0","0","0.0","0","0.0","application/vnd.ms-powerpoint.slideshow.macroenabled.12",,"[{.ppsm, 10}]","2","2","2","[{sk, 9}]","{upjs.sk=9, uni-potsdam.de=1}"
diff --git a/data/warc-truncation-domains-CC-SUPPLEMENTAL-2026-22.csv b/data/warc-truncation-domains-CC-SUPPLEMENTAL-2026-22.csv
new file mode 100644
index 0000000..af1b125
--- /dev/null
+++ b/data/warc-truncation-domains-CC-SUPPLEMENTAL-2026-22.csv
@@ -0,0 +1,38 @@
+"n_pages","content_mime_detected","top_tlds","top_domains"
+"31373","application/pdf","[{edu, 5576}, {org, 5140}, {ca, 2730}, {de, 2435}, {jp, 2003}, {gov, 1932}, {com, 1252}, {cc, 1016}, {nl, 906}, {ch, 879}, {uk, 723}, {fr, 669}, {ru, 635}, {pl, 515}, {int, 492}, {at, 363}, {es, 291}, {dk, 277}, {cz, 273}, {no, 251}, {in, 248}, {it, 243}, {eu, 185}, {hr, 175}, {net, 146}]","{epfl.ch=172, berkeley.edu=201, tuwien.ac.at=148, mit.edu=193, uni-bayreuth.de=474, columbia.edu=606, copernicus.org=376, ku.dk=208, onf.fr=164, hawaii.edu=397, harvard.edu=212, gva.es=237, nips.cc=328, mapress.com=196, ms.gov=173, usgs.gov=166, umk.pl=428, jaea.go.jp=611, nrao.edu=157, nmt.edu=278, iodp.org=425, bournemouth.ac.uk=281, dagstuhl.de=174, tudelft.nl=201, u-tokyo.ac.jp=303, nasa.gov=485, noaa.gov=277, jma.go.jp=143, uqam.ca=1265, mpg.de=357, tamu.edu=154, kek.jp=296, arlis.org=417, thecvf.com=299, eartharxiv.org=624, admin.ch=232, bayern.de=465, ntnu.no=172, epizodyspace.ru=259, caltech.edu=595, utoronto.ca=128, cern.ch=135, ipums.org=936, esa.int=333, auburn.edu=408, cwi.nl=414, scholaris.ca=1113, neurips.cc=623, idahogeology.org=251, gsj.jp=188}"
+"12744","application/octet-stream","[{gov, 4294}, {edu, 4107}, {ca, 2139}, {org, 1653}, {ch, 449}, {ee, 50}, {jp, 16}, {com, 12}, {cern, 6}]","{ceos.org=6, si.edu=37, illumina.com=8, freedesktop.org=1, 1001genomes.org=351, nasa.gov=4294, ucsf.edu=2, wisc.edu=717, ethz.ch=1, tuwien.ac.at=1, mit.edu=1623, uwaterloo.ca=1, rochester.edu=1, tamu.edu=7, openrepository.com=1, imgt.org=2, cvut.cz=4, cassavabase.org=21, riken.jp=16, uzh.ch=38, uva.nl=2, scfbio-iitd.res.in=1, ucsc.edu=1502, uj.edu.pl=2, maaamet.ee=50, ucla.edu=45, uni-freiburg.de=2, harvard.edu=66, ihmc.us=1, esahubble.org=467, lowell.edu=4, salmobase.org=798, colorado.edu=31, washington.edu=33, caltech.edu=22, cern.ch=410, ubc.ca=1, nvidia.com=3, root.cern=6, gu.se=4, ntu.edu.tw=1, bcgsc.ca=2137, gnu.org=3, ucf.edu=17, iodp.org=2, personality-project.org=2}"
+"4923","text/plain","[{edu, 2153}, {org, 1444}, {ca, 334}, {gov, 277}, {es, 206}, {cn, 116}, {com, 112}, {jp, 87}, {uk, 74}, {fr, 69}, {de, 13}, {in, 9}, {se, 6}, {pl, 5}]","{obspm.fr=62, stanford.edu=12, si.edu=117, freedesktop.org=4, 1001genomes.org=820, szbl.ac.cn=110, lncipedia.org=57, wisc.edu=148, mit.edu=283, uni-bayreuth.de=4, riken.jp=65, ucsc.edu=330, icrisat.org=22, im-ldi.com=16, harvard.edu=226, nmsu.edu=335, salmobase.org=206, washington.edu=95, generegulation.org=16, peptideatlas.org=154, gu.se=6, bcgsc.ca=332, caida.org=10, cnsgenomics.com=8, iodp.org=8, nyu.edu=8, earthbyte.org=14, lirmm.fr=6, kazusa.or.jp=21, ebi.ac.uk=65, illumina.com=32, unc.edu=8, oecd-nea.org=11, fishtedb.com=54, uni-jena.de=5, metoffice.gov.uk=6, nasa.gov=179, noaa.gov=91, princeton.edu=5, cnio.es=205, proteinatlas.org=11, cassavabase.org=38, cncb.ac.cn=5, scfbio-iitd.res.in=9, ucla.edu=316, stsci.edu=7, lowell.edu=83, colorado.edu=6, caltech.edu=162, ico2s.org=40}"
+"3293","model/vnd.valve.source.compiled-map","[{gov, 3291}]","{nmsu.edu=2, nasa.gov=3291}"
+"255","text/html","[{org, 67}, {edu, 53}, {com, 50}, {jp, 34}, {pl, 18}, {io, 5}]","{imdpune.gov.in=1, expasy.org=2, mit.edu=1, proconsortium.org=2, copernicus.org=1, birmingham.ac.uk=1, produccioncientificaluz.org=6, uni-bielefeld.de=2, logarithmic.net=1, ucsc.edu=1, nmsu.edu=48, sgu.ru=2, rochesterastronomy.org=7, jaxa.jp=34, bcgsc.ca=2, readthedocs.io=1, satijalab.org=1, umaryland.edu=1, osubmi.org=1, nih.gov=2, r-project.org=7, rfam.org=6, noaa.gov=1, immunedynamics.io=4, vedantu.com=3, usda.gov=1, astrouw.edu.pl=18, rstudio.com=3, mirbase.org=1, kcl.ac.uk=2, planet4589.org=3, ksu.edu.kz=1, brenda-enzymes.org=19, iitr.ac.in=2, tommycohn.com=27, hixie.ch=1, purdue.edu=1, dir.bg=2, colorado.edu=1, fabiandablander.com=3, weatherbug.com=14, yonsei.ac.kr=2, snolab.ca=1, esa.int=1, professorbray.net=1, reactome.org=1, mitre.org=10, csir.co.za=2}"
+"251","chemical/x-cache","[{org, 251}]","{earthbyte.org=251}"
+"225","chemical/x-cif","[{uk, 225}]","{ebi.ac.uk=225}"
+"178","text/troff","[{edu, 107}, {ca, 62}, {org, 6}]","{nasa.gov=3, peptideatlas.org=6, bcgsc.ca=62, ucsc.edu=107}"
+"161","application/x-sqlite3","[{mil, 117}, {gov, 36}]","{nga.mil=117, bcgsc.ca=1, noaa.gov=36, wisc.edu=3, martinfleischmann.net=2, cern.ch=1, netlib.org=1}"
+"152","application/vnd.ms-pki.stl","[{edu, 141}, {ch, 5}]","{harvard.edu=93, si.edu=47, asteroidmission.org=2, asc-csa.gc.ca=1, nasa.gov=3, ucsf.edu=1, uzh.ch=5}"
+"150","application/xml","[{org, 70}, {com, 63}, {edu, 7}, {gov, 6}]","{earthbyte.org=14, proteinatlas.org=56, maplesoft.com=5, bcgsc.ca=2, nasa.gov=4, arabiaweather.com=58, nist.gov=1, washington.edu=6, caltech.edu=1, uzh.ch=2, usgs.gov=1}"
+"106","application/epub+zip","[{org, 54}, {gov, 18}, {ec, 11}, {com, 9}, {cl, 5}]","{uni-jena.de=1, um.es=1, nasa.gov=4, gene-quantification.de=1, swsc-journal.org=9, ucuenca.edu.ec=11, clemson.edu=3, uautonoma.cl=5, brainfacts.org=1, techscience.com=9, canada.ca=1, globe.gov=14, sissa.it=2, aanda.org=44}"
+"100","application/vnd.openxmlformats-officedocument.presentationml.presentation","[{int, 34}, {nl, 27}, {gov, 16}, {edu, 13}, {no, 5}]","{utwente.nl=27, liu.se=1, ans.org=1, ntnu.no=5, fz-juelich.de=1, ntnu.edu=5, tuwien.at=1, rss.org.uk=1, wisc.edu=1, columbia.edu=1, esa.int=34, globe.gov=16, duke.edu=1, nrao.edu=5}"
+"94","text/tab-separated-values","[{org, 86}]","{kazusa.or.jp=2, proteinatlas.org=2, bcgsc.ca=4, nasa.gov=1, riken.jp=1, salmobase.org=41, peptideatlas.org=43}"
+"93","application/vnd.apple.keynote","[{edu, 47}, {int, 14}, {gov, 9}, {ca, 9}]","{berkeley.edu=2, nasa.gov=8, noaa.gov=1, csiro.au=1, ox.ac.uk=4, unipi.it=1, hkust.edu.hk=2, columbia.edu=1, uchicago.edu=2, toronto.edu=1, harvard.edu=16, lowell.edu=6, triumf.ca=2, cornell.edu=1, colorado.edu=1, utoronto.ca=5, caltech.edu=2, washington.edu=1, cern.ch=3, ubc.ca=2, ntu.edu.tw=1, esa.int=14, nrao.edu=5, loria.fr=2, brown.edu=9}"
+"93","image/vnd.djvu","[{ru, 58}, {org, 32}]","{numdam.org=32, epizodyspace.ru=58, millsian.com=1, uva.nl=2}"
+"71","application/x-sh","[{uk, 44}, {at, 9}, {ca, 7}, {edu, 6}]","{vcell.org=2, bcgsc.ca=7, tugraz.at=9, noaa.gov=2, wisc.edu=1, hutton.ac.uk=44, caltech.edu=5, uj.edu.pl=1}"
+"59","application/x-idl-save-file","[{gov, 49}, {edu, 10}]","{noaa.gov=47, nasa.gov=2, colorado.edu=1, caltech.edu=9}"
+"53","application/vnd.ms-excel.sheet.macroenabled.12","[{ca, 53}]","{canada.ca=53}"
+"43","application/json","[{at, 23}, {ca, 11}, {org, 5}]","{proteinatlas.org=5, ebi.ac.uk=4, tuwien.ac.at=23, bcgsc.ca=11}"
+"40","application/x-gtar","[{uk, 30}]","{metoffice.gov.uk=30, ico2s.org=3, jhuapl.edu=2, caltech.edu=1, uzh.ch=4}"
+"38","application/illustrator","[{org, 37}]","{iodp.org=37, caltech.edu=1}"
+"30","application/vnd.lotus-wordpro","[{edu, 24}, {ca, 6}]","{bcgsc.ca=6, wisc.edu=24}"
+"25","application/x-tika-msoffice","[{cern, 15}, {kr, 9}]","{root.cern=15, qcenter.kr=2, unist.ac.kr=7, openrepository.com=1}"
+"23","application/x-matlab-data","[{cz, 7}, {pl, 5}, {edu, 5}]","{uni-freiburg.de=2, stanford.edu=2, unica.it=1, cvut.cz=7, nasa.gov=2, noaa.gov=1, nyu.edu=3, uj.edu.pl=5}"
+"22","application/x-tika-ooxml","[{org, 16}]","{yonsei.ac.kr=3, ceos.org=2, cauriensia.es=1, duke.edu=1, eartharxiv.org=14, colorado.edu=1}"
+"19","application/xhtml+xml","[{org, 10}, {ca, 6}]","{dockflow.org=1, esf.edu=1, produccioncientificaluz.org=9, theccc.org.uk=1, clemson.edu=1, uwaterloo.ca=6}"
+"17","application/pgp-encrypted","[{ca, 16}]","{mit.edu=1, bcgsc.ca=16}"
+"15","application/mxf","[{org, 13}]","{nasa.gov=2, freedesktop.org=13}"
+"14","application/vnd.openxmlformats-officedocument.presentationml.slideshow","[]","{mbari.org=1, ceos.org=2, wsl.ch=3, esa.int=2, bcgsc.ca=1, uol.de=1, nasa.gov=2, cec.org=1, uva.nl=1}"
+"14","text/event-stream","[{org, 14}]","{lmfdb.org=14}"
+"14","model/vnd.mts","[{au, 14}]","{csiro.au=14}"
+"13","application/vnd.snesdev-page-table","[{ca, 13}]","{bcgsc.ca=13}"
+"12","application/x-download","[{com, 12}]","{im-ldi.com=12}"
+"12","application/vnd.xara","[{cern, 12}]","{root.cern=12}"
+"11","application/vnd.google-earth.kml+xml","[{gov, 10}]","{richmond.edu=1, noaa.gov=10}"
+"10","application/rdf+xml","[{se, 9}]","{proconsortium.org=1, gu.se=9}"
diff --git a/data/warc-truncation-domains-detailed-CC-SUPPLEMENTAL-2026-22.csv b/data/warc-truncation-domains-detailed-CC-SUPPLEMENTAL-2026-22.csv
new file mode 100644
index 0000000..43c92ec
--- /dev/null
+++ b/data/warc-truncation-domains-detailed-CC-SUPPLEMENTAL-2026-22.csv
@@ -0,0 +1,20 @@
+"count_trunc","count_total","perc_trunc","storage_trunc_gib","content_mime_detected","url_host_registered_domain"
+"4294","25907","16.575","55.71","application/octet-stream","nasa.gov"
+"3291","9311","35.345","50.30","model/vnd.valve.source.compiled-map","nasa.gov"
+"2137","3198","66.823","47.88","application/octet-stream","bcgsc.ca"
+"1502","4298","34.946","30.71","application/octet-stream","ucsc.edu"
+"1265","17132","7.384","27.65","application/pdf","uqam.ca"
+"1113","10964","10.151","25.71","application/pdf","scholaris.ca"
+"936","7582","12.345","22.45","application/pdf","ipums.org"
+"1623","9502","17.081","21.24","application/octet-stream","mit.edu"
+"798","2204","36.207","18.39","application/octet-stream","salmobase.org"
+"717","3282","21.846","16.65","application/octet-stream","wisc.edu"
+"623","48224","1.292","14.68","application/pdf","neurips.cc"
+"624","11354","5.496","14.53","application/pdf","eartharxiv.org"
+"606","26624","2.276","13.85","application/pdf","columbia.edu"
+"595","16008","3.717","13.78","application/pdf","caltech.edu"
+"611","5193","11.766","13.01","application/pdf","jaea.go.jp"
+"465","16693","2.786","11.25","application/pdf","bayern.de"
+"485","21793","2.225","11.12","application/pdf","nasa.gov"
+"474","7418","6.390","10.76","application/pdf","uni-bayreuth.de"
+"428","4980","8.594","10.39","application/pdf","umk.pl"
diff --git a/notebooks/analyze_truncated_content.ipynb b/notebooks/analyze_truncated_content.ipynb
new file mode 100644
index 0000000..e82a812
--- /dev/null
+++ b/notebooks/analyze_truncated_content.ipynb
@@ -0,0 +1,2700 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Truncated Records in Focus Crawl\n",
+ "\n",
+ "This notebook is based on: https://github.com/commoncrawl/cc-notebooks/blob/truncation-metrics-2025/warc-truncation/cc-main-2025-truncation-stats.ipynb\n",
+ "\n",
+ "The focus crawl uses a content limit of 25 MiB (26214400 bytes) as opposed to the 5 MiB (5242880 bytes) limit used by the main crawl. This notebook investigates the impact of this change, i.e., we want to know if we might need to refetch large PDFs similar to what FinePDF did.\n",
+ "\n",
+ "Counts of truncated records are aggregated per MIME type from the [columnar index](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format) using [AWS Athena](https://aws.amazon.com/athena/) and the SQL query shown further below, after the schema notes (cf. [average-warc-record-length-by-mime-type.sql](https://github.com/commoncrawl/cc-index-table/blob/main/src/sql/examples/cc-index/average-warc-record-length-by-mime-type.sql)).\n",
+ "\n",
+ "See also:\n",
+ "\n",
+ "- Schema: https://github.com/commoncrawl/cc-index-table/blob/main/src/main/resources/schema/index-schema-simple.json\n",
+ " - `warc_record_length` (int): Length of the WARC record\n",
+ " - `content_truncated` (str): Non-null if the WARC record payload is truncated. The value then indicates the reason for the truncation, cf. https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warc-truncated\n",
+ "\n",
+ "```sql\n",
+ "SELECT COUNT(*) as n_pages,\n",
+ " COUNT(*) * 100.0 / SUM(COUNT(*)) OVER() as perc_pages,\n",
+ " AVG(warc_record_length) as avg_warc_record_length,\n",
+ " SUM(warc_record_length) as sum_warc_record_length,\n",
+ " MAX(warc_record_length) as max_warc_record_length,\n",
+ " approx_percentile(warc_record_length, ARRAY[.01,.02,.05,.1,.15,.25,.5,.75,.85,.9,.95,.98,.99,.995])\n",
+ " as percentiles_warc_record_length,\n",
+ " SUM(warc_record_length) * 100.0 / SUM(SUM(warc_record_length)) OVER() as perc_warc_storage,\n",
+ " SUM(case when content_truncated is null then 0 else 1 end) * 100.0 / COUNT(*) as perc_truncated,\n",
+ " SUM(case when content_truncated is not null then warc_record_length else 0 end)\n",
+ " as sum_warc_record_length_truncated,\n",
+ " SUM(case when content_truncated is not null then warc_record_length else 0 end)\n",
+ " * 100.0 / SUM(SUM(warc_record_length)) OVER() as perc_warc_storage_truncated,\n",
+ " SUM(case when content_truncated = 'length' then warc_record_length else 0 end)\n",
+ " as sum_warc_record_length_truncated_length,\n",
+ " SUM(case when content_truncated = 'length' then warc_record_length else 0 end)\n",
+ " * 100.0 / SUM(SUM(warc_record_length)) OVER() as perc_warc_storage_truncated_length,\n",
+ " content_mime_detected,\n",
+ " histogram(content_truncated) as reason_truncated,\n",
+ " slice(\n",
+ " array_sort(\n",
+ " map_entries(map_filter(\n",
+ " histogram(regexp_extract(url_path, '\\.[a-zA-Z0-9_-]{1,7}$')),\n",
+ " (k, v) -> v > 4)),\n",
+ " (a, b) -> IF(a[2] < b[2], 1, IF(a[2] = b[2], 0, -1))),\n",
+ " 1, 25) as common_url_path_suffixes,\n",
+ " COUNT(DISTINCT url_host_tld) as uniq_tlds,\n",
+ " approx_distinct(url_host_registered_domain) as uniq_domains,\n",
+ " approx_distinct(url_host_name) as uniq_hosts,\n",
+ " slice(\n",
+ " array_sort(\n",
+ " map_entries(map_filter(histogram(url_host_tld), (k, v) -> v > 4)),\n",
+ " (a, b) -> IF(a[2] < b[2], 1, IF(a[2] = b[2], 0, -1))),\n",
+ " 1, 25) as top_tlds,\n",
+ " approx_most_frequent(25, url_host_registered_domain, 1000) as top_domains\n",
+ "FROM \"ccoaindex\".\"ccoaindex\"\n",
+ "WHERE crawl = 'CC-SUPPLEMENTAL-2026-22'\n",
+ " AND subset = 'warc'\n",
+ "GROUP BY content_mime_detected\n",
+ "HAVING (COUNT(*) >= 10) -- ignore MIME types seen less than 10 times\n",
+ "ORDER BY n_pages DESC;\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " content_mime_detected | \n",
+ " n_pages | \n",
+ " perc_warc_storage | \n",
+ " perc_truncated | \n",
+ " perc_warc_storage_truncated | \n",
+ " reason_truncated | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " text/html | \n",
+ " 40613284 | \n",
+ " 15.245210 | \n",
+ " 0.002076 | \n",
+ " 0.017853 | \n",
+ " {length=255, disconnect=432, time=156} | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " application/pdf | \n",
+ " 3169664 | \n",
+ " 73.501854 | \n",
+ " 1.050616 | \n",
+ " 11.550126 | \n",
+ " {length=31373, disconnect=1713, time=215} | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " application/xhtml+xml | \n",
+ " 2417807 | \n",
+ " 0.541387 | \n",
+ " 0.001324 | \n",
+ " 0.004805 | \n",
+ " {length=19, disconnect=6, time=7} | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " text/plain | \n",
+ " 875344 | \n",
+ " 1.398873 | \n",
+ " 0.570404 | \n",
+ " 0.482635 | \n",
+ " {length=4923, disconnect=54, time=16} | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " application/x-bibtex-text-file | \n",
+ " 147997 | \n",
+ " 0.003215 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " application/xml | \n",
+ " 108643 | \n",
+ " 0.035393 | \n",
+ " 0.138067 | \n",
+ " 0.005305 | \n",
+ " {length=150} | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " application/atom+xml | \n",
+ " 89137 | \n",
+ " 0.003671 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " text/calendar | \n",
+ " 87210 | \n",
+ " 0.002174 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " application/octet-stream | \n",
+ " 81950 | \n",
+ " 6.007911 | \n",
+ " 16.019524 | \n",
+ " 3.543030 | \n",
+ " {length=12744, disconnect=383, time=1} | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " application/x-pds | \n",
+ " 80351 | \n",
+ " 0.012730 | \n",
+ " 0.002489 | \n",
+ " 0.000368 | \n",
+ " {length=2} | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " application/rss+xml | \n",
+ " 68737 | \n",
+ " 0.008902 | \n",
+ " 0.004364 | \n",
+ " 0.000025 | \n",
+ " {length=1, disconnect=2} | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " image/vnd.djvu | \n",
+ " 68071 | \n",
+ " 0.873445 | \n",
+ " 0.138091 | \n",
+ " 0.036390 | \n",
+ " {length=93, time=1} | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " application/json | \n",
+ " 41691 | \n",
+ " 0.012474 | \n",
+ " 0.103140 | \n",
+ " 0.002575 | \n",
+ " {length=43} | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " text/x-vcard | \n",
+ " 30879 | \n",
+ " 0.003287 | \n",
+ " 0.019431 | \n",
+ " 0.000166 | \n",
+ " {length=6} | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " application/x-tex | \n",
+ " 25252 | \n",
+ " 0.001396 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " text/x-sql | \n",
+ " 24388 | \n",
+ " 0.000893 | \n",
+ " 0.020502 | \n",
+ " 0.000388 | \n",
+ " {length=5} | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " text/x-python | \n",
+ " 22962 | \n",
+ " 0.005870 | \n",
+ " 0.008710 | \n",
+ " 0.000332 | \n",
+ " {length=2} | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " text/csv | \n",
+ " 15803 | \n",
+ " 0.004893 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " application/x-tika-msoffice | \n",
+ " 13930 | \n",
+ " 0.056319 | \n",
+ " 0.179469 | \n",
+ " 0.008206 | \n",
+ " {length=25} | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " text/x-rsrc | \n",
+ " 10992 | \n",
+ " 0.000350 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " content_mime_detected n_pages perc_warc_storage \\\n",
+ "0 text/html 40613284 15.245210 \n",
+ "1 application/pdf 3169664 73.501854 \n",
+ "2 application/xhtml+xml 2417807 0.541387 \n",
+ "3 text/plain 875344 1.398873 \n",
+ "4 application/x-bibtex-text-file 147997 0.003215 \n",
+ "5 application/xml 108643 0.035393 \n",
+ "6 application/atom+xml 89137 0.003671 \n",
+ "7 text/calendar 87210 0.002174 \n",
+ "8 application/octet-stream 81950 6.007911 \n",
+ "9 application/x-pds 80351 0.012730 \n",
+ "10 application/rss+xml 68737 0.008902 \n",
+ "11 image/vnd.djvu 68071 0.873445 \n",
+ "12 application/json 41691 0.012474 \n",
+ "13 text/x-vcard 30879 0.003287 \n",
+ "14 application/x-tex 25252 0.001396 \n",
+ "15 text/x-sql 24388 0.000893 \n",
+ "16 text/x-python 22962 0.005870 \n",
+ "17 text/csv 15803 0.004893 \n",
+ "18 application/x-tika-msoffice 13930 0.056319 \n",
+ "19 text/x-rsrc 10992 0.000350 \n",
+ "\n",
+ " perc_truncated perc_warc_storage_truncated \\\n",
+ "0 0.002076 0.017853 \n",
+ "1 1.050616 11.550126 \n",
+ "2 0.001324 0.004805 \n",
+ "3 0.570404 0.482635 \n",
+ "4 0.000000 0.000000 \n",
+ "5 0.138067 0.005305 \n",
+ "6 0.000000 0.000000 \n",
+ "7 0.000000 0.000000 \n",
+ "8 16.019524 3.543030 \n",
+ "9 0.002489 0.000368 \n",
+ "10 0.004364 0.000025 \n",
+ "11 0.138091 0.036390 \n",
+ "12 0.103140 0.002575 \n",
+ "13 0.019431 0.000166 \n",
+ "14 0.000000 0.000000 \n",
+ "15 0.020502 0.000388 \n",
+ "16 0.008710 0.000332 \n",
+ "17 0.000000 0.000000 \n",
+ "18 0.179469 0.008206 \n",
+ "19 0.000000 0.000000 \n",
+ "\n",
+ " reason_truncated \n",
+ "0 {length=255, disconnect=432, time=156} \n",
+ "1 {length=31373, disconnect=1713, time=215} \n",
+ "2 {length=19, disconnect=6, time=7} \n",
+ "3 {length=4923, disconnect=54, time=16} \n",
+ "4 NaN \n",
+ "5 {length=150} \n",
+ "6 NaN \n",
+ "7 NaN \n",
+ "8 {length=12744, disconnect=383, time=1} \n",
+ "9 {length=2} \n",
+ "10 {length=1, disconnect=2} \n",
+ "11 {length=93, time=1} \n",
+ "12 {length=43} \n",
+ "13 {length=6} \n",
+ "14 NaN \n",
+ "15 {length=5} \n",
+ "16 {length=2} \n",
+ "17 NaN \n",
+ "18 {length=25} \n",
+ "19 NaN "
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import json\n",
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "\n",
+ "# Repo-root-aware data path (works both from repo root and from notebooks/).\n",
+ "DATA_DIR = Path(\"data\") if Path(\"data\").exists() else Path(\"..\") / \"data\"\n",
+ "\n",
+ "df = pd.read_csv(DATA_DIR / 'warc-record-size-truncation-by-mime-type-CC-SUPPLEMENTAL-2026-22.csv')\n",
+ "\n",
+ "df[['content_mime_detected', 'n_pages', 'perc_warc_storage',\n",
+ " 'perc_truncated', 'perc_warc_storage_truncated', 'reason_truncated']].head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The aggregations show which MIME types are most affected by truncation.\n",
+ "\n",
+ "Now let's look into the reasons for the truncation and load the histograms with reason counts into columns:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " content_mime_detected | \n",
+ " n_pages | \n",
+ " perc_truncated | \n",
+ " n_pages_truncated | \n",
+ " trunc_reason_length | \n",
+ " trunc_reason_length_perc | \n",
+ " trunc_length_gib | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " application/pdf | \n",
+ " 3169664 | \n",
+ " 1.050616 | \n",
+ " 33301 | \n",
+ " 31373 | \n",
+ " 0.989789 | \n",
+ " 712.754483 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " application/octet-stream | \n",
+ " 81950 | \n",
+ " 16.019524 | \n",
+ " 13128 | \n",
+ " 12744 | \n",
+ " 15.550946 | \n",
+ " 220.814640 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " text/plain | \n",
+ " 875344 | \n",
+ " 0.570404 | \n",
+ " 4993 | \n",
+ " 4923 | \n",
+ " 0.562407 | \n",
+ " 30.020258 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " model/vnd.valve.source.compiled-map | \n",
+ " 9313 | \n",
+ " 35.359175 | \n",
+ " 3293 | \n",
+ " 3293 | \n",
+ " 35.359175 | \n",
+ " 50.344730 | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " text/html | \n",
+ " 40613284 | \n",
+ " 0.002076 | \n",
+ " 843 | \n",
+ " 255 | \n",
+ " 0.000628 | \n",
+ " 1.058045 | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " chemical/x-cache | \n",
+ " 1505 | \n",
+ " 16.677741 | \n",
+ " 251 | \n",
+ " 251 | \n",
+ " 16.677741 | \n",
+ " 0.042768 | \n",
+ "
\n",
+ " \n",
+ " | 95 | \n",
+ " chemical/x-cif | \n",
+ " 256 | \n",
+ " 87.890625 | \n",
+ " 225 | \n",
+ " 225 | \n",
+ " 87.890625 | \n",
+ " 3.743903 | \n",
+ "
\n",
+ " \n",
+ " | 42 | \n",
+ " text/troff | \n",
+ " 2279 | \n",
+ " 7.810443 | \n",
+ " 178 | \n",
+ " 178 | \n",
+ " 7.810443 | \n",
+ " 3.789980 | \n",
+ "
\n",
+ " \n",
+ " | 99 | \n",
+ " application/x-sqlite3 | \n",
+ " 214 | \n",
+ " 76.168224 | \n",
+ " 163 | \n",
+ " 161 | \n",
+ " 75.233645 | \n",
+ " 1.099544 | \n",
+ "
\n",
+ " \n",
+ " | 68 | \n",
+ " application/vnd.ms-pki.stl | \n",
+ " 687 | \n",
+ " 22.125182 | \n",
+ " 152 | \n",
+ " 152 | \n",
+ " 22.125182 | \n",
+ " 1.677405 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " application/xml | \n",
+ " 108643 | \n",
+ " 0.138067 | \n",
+ " 150 | \n",
+ " 150 | \n",
+ " 0.138067 | \n",
+ " 0.330630 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " application/epub+zip | \n",
+ " 6375 | \n",
+ " 1.709804 | \n",
+ " 109 | \n",
+ " 106 | \n",
+ " 1.662745 | \n",
+ " 2.581691 | \n",
+ "
\n",
+ " \n",
+ " | 64 | \n",
+ " application/vnd.openxmlformats-officedocument.... | \n",
+ " 782 | \n",
+ " 13.299233 | \n",
+ " 104 | \n",
+ " 100 | \n",
+ " 12.787724 | \n",
+ " 2.321584 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " text/tab-separated-values | \n",
+ " 2974 | \n",
+ " 3.160726 | \n",
+ " 94 | \n",
+ " 94 | \n",
+ " 3.160726 | \n",
+ " 0.423371 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " image/vnd.djvu | \n",
+ " 68071 | \n",
+ " 0.138091 | \n",
+ " 94 | \n",
+ " 93 | \n",
+ " 0.136622 | \n",
+ " 2.265676 | \n",
+ "
\n",
+ " \n",
+ " | 82 | \n",
+ " application/vnd.apple.keynote | \n",
+ " 338 | \n",
+ " 27.514793 | \n",
+ " 93 | \n",
+ " 93 | \n",
+ " 27.514793 | \n",
+ " 1.691661 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " application/x-sh | \n",
+ " 7594 | \n",
+ " 0.934949 | \n",
+ " 71 | \n",
+ " 71 | \n",
+ " 0.934949 | \n",
+ " 1.720983 | \n",
+ "
\n",
+ " \n",
+ " | 110 | \n",
+ " application/x-idl-save-file | \n",
+ " 169 | \n",
+ " 34.911243 | \n",
+ " 59 | \n",
+ " 59 | \n",
+ " 34.911243 | \n",
+ " 0.190223 | \n",
+ "
\n",
+ " \n",
+ " | 96 | \n",
+ " application/vnd.ms-excel.sheet.macroenabled.12 | \n",
+ " 250 | \n",
+ " 21.200000 | \n",
+ " 53 | \n",
+ " 53 | \n",
+ " 21.200000 | \n",
+ " 1.076201 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " application/json | \n",
+ " 41691 | \n",
+ " 0.103140 | \n",
+ " 43 | \n",
+ " 43 | \n",
+ " 0.103140 | \n",
+ " 0.160521 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " content_mime_detected n_pages \\\n",
+ "1 application/pdf 3169664 \n",
+ "8 application/octet-stream 81950 \n",
+ "3 text/plain 875344 \n",
+ "24 model/vnd.valve.source.compiled-map 9313 \n",
+ "0 text/html 40613284 \n",
+ "55 chemical/x-cache 1505 \n",
+ "95 chemical/x-cif 256 \n",
+ "42 text/troff 2279 \n",
+ "99 application/x-sqlite3 214 \n",
+ "68 application/vnd.ms-pki.stl 687 \n",
+ "5 application/xml 108643 \n",
+ "31 application/epub+zip 6375 \n",
+ "64 application/vnd.openxmlformats-officedocument.... 782 \n",
+ "37 text/tab-separated-values 2974 \n",
+ "11 image/vnd.djvu 68071 \n",
+ "82 application/vnd.apple.keynote 338 \n",
+ "28 application/x-sh 7594 \n",
+ "110 application/x-idl-save-file 169 \n",
+ "96 application/vnd.ms-excel.sheet.macroenabled.12 250 \n",
+ "12 application/json 41691 \n",
+ "\n",
+ " perc_truncated n_pages_truncated trunc_reason_length \\\n",
+ "1 1.050616 33301 31373 \n",
+ "8 16.019524 13128 12744 \n",
+ "3 0.570404 4993 4923 \n",
+ "24 35.359175 3293 3293 \n",
+ "0 0.002076 843 255 \n",
+ "55 16.677741 251 251 \n",
+ "95 87.890625 225 225 \n",
+ "42 7.810443 178 178 \n",
+ "99 76.168224 163 161 \n",
+ "68 22.125182 152 152 \n",
+ "5 0.138067 150 150 \n",
+ "31 1.709804 109 106 \n",
+ "64 13.299233 104 100 \n",
+ "37 3.160726 94 94 \n",
+ "11 0.138091 94 93 \n",
+ "82 27.514793 93 93 \n",
+ "28 0.934949 71 71 \n",
+ "110 34.911243 59 59 \n",
+ "96 21.200000 53 53 \n",
+ "12 0.103140 43 43 \n",
+ "\n",
+ " trunc_reason_length_perc trunc_length_gib \n",
+ "1 0.989789 712.754483 \n",
+ "8 15.550946 220.814640 \n",
+ "3 0.562407 30.020258 \n",
+ "24 35.359175 50.344730 \n",
+ "0 0.000628 1.058045 \n",
+ "55 16.677741 0.042768 \n",
+ "95 87.890625 3.743903 \n",
+ "42 7.810443 3.789980 \n",
+ "99 75.233645 1.099544 \n",
+ "68 22.125182 1.677405 \n",
+ "5 0.138067 0.330630 \n",
+ "31 1.662745 2.581691 \n",
+ "64 12.787724 2.321584 \n",
+ "37 3.160726 0.423371 \n",
+ "11 0.136622 2.265676 \n",
+ "82 27.514793 1.691661 \n",
+ "28 0.934949 1.720983 \n",
+ "110 34.911243 0.190223 \n",
+ "96 21.200000 1.076201 \n",
+ "12 0.103140 0.160521 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# expand embedded Presto/Trino/Athena histogram as columns into data frame\n",
+ "def unroll_histograms(df):\n",
+ " # - transform to valid JSON\n",
+ " df['reasons_truncation'] = df['reason_truncated'].str.replace('(\\\\w+)=', '\"\\\\1\":', regex=True)\n",
+ " df['top_domains'] = df['top_domains'].str.replace('([a-z0-9.-]+)=', '\"\\\\1\":', regex=True)\n",
+ "\n",
+ " # - load columns in data frame\n",
+ " truncation_reason = df['reasons_truncation'].apply(\n",
+ " lambda x: json.loads(x) if type(x) == str else {}\n",
+ " ).apply(pd.Series).apply(lambda s: s.fillna(0).astype(int)).add_prefix('trunc_reason_')\n",
+ "\n",
+ " # - join with original data\n",
+ " df = df.join(truncation_reason)\n",
+ "\n",
+ " df['n_pages_truncated'] \\\n",
+ " = df['trunc_reason_length'] + df['trunc_reason_time'] + df['trunc_reason_disconnect']\n",
+ " df['trunc_reason_length_perc'] = 100.0 * df['trunc_reason_length'] / df['n_pages']\n",
+ " df['trunc_length_gib'] = df['sum_warc_record_length_truncated_length'] / 2**30\n",
+ "\n",
+ " return df\n",
+ "\n",
+ "df = unroll_histograms(df)\n",
+ "df[['content_mime_detected', 'n_pages', 'perc_truncated', 'n_pages_truncated',\n",
+ " 'trunc_reason_length', 'trunc_reason_length_perc', 'trunc_length_gib']\n",
+ " ].sort_values(by=['trunc_reason_length'], ascending=False).head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## TLDs and Domains with Truncated Content\n",
+ "\n",
+ "These statistics try to uncover any oddities in the distributions of truncated and overlong pages/documents over top-level and pay-level domains.\n",
+ "\n",
+ "```sql\n",
+ "SELECT COUNT(*) as n_pages,\n",
+ " content_mime_detected,\n",
+ " slice(\n",
+ " array_sort(\n",
+ " map_entries(map_filter(histogram(url_host_tld), (k, v) -> v > 4)),\n",
+ " (a, b) -> IF(a[2] < b[2], 1, IF(a[2] = b[2], 0, -1))),\n",
+ " 1, 25) as top_tlds,\n",
+ " approx_most_frequent(50, url_host_registered_domain, 2000) as top_domains\n",
+ "FROM \"ccoaindex\".\"ccoaindex\"\n",
+ "WHERE crawl = 'CC-SUPPLEMENTAL-2026-22'\n",
+ " AND subset = 'warc'\n",
+ " AND content_truncated = 'length'\n",
+ "GROUP BY content_mime_detected\n",
+ "HAVING (COUNT(*) >= 10) -- ignore MIME types seen less than 10 times\n",
+ "ORDER BY n_pages DESC;\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "text/html\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " domain | \n",
+ " count_trunc | \n",
+ " count_all | \n",
+ " % | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " apache.org | \n",
+ " 0 | \n",
+ " 132279 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " arcgis.com | \n",
+ " 0 | \n",
+ " 152254 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " astrouw.edu.pl | \n",
+ " 18 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " bcgsc.ca | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " berkeley.edu | \n",
+ " 0 | \n",
+ " 138929 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " birmingham.ac.uk | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " brenda-enzymes.org | \n",
+ " 19 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " caltech.edu | \n",
+ " 0 | \n",
+ " 190863 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " canada.ca | \n",
+ " 0 | \n",
+ " 138984 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " cern.ch | \n",
+ " 0 | \n",
+ " 148213 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " colorado.edu | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " copernicus.org | \n",
+ " 1 | \n",
+ " 671032 | \n",
+ " 0.000149 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " csir.co.za | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " dir.bg | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " duke.edu | \n",
+ " 0 | \n",
+ " 154950 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " epfl.ch | \n",
+ " 0 | \n",
+ " 158864 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " esa.int | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " expasy.org | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " fabiandablander.com | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " harvard.edu | \n",
+ " 0 | \n",
+ " 251278 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " hixie.ch | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " iitr.ac.in | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " imdpune.gov.in | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " immunedynamics.io | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " jaxa.jp | \n",
+ " 34 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " domain count_trunc count_all %\n",
+ "0 apache.org 0 132279 0.000000\n",
+ "1 arcgis.com 0 152254 0.000000\n",
+ "2 astrouw.edu.pl 18 0 inf\n",
+ "3 bcgsc.ca 2 0 inf\n",
+ "4 berkeley.edu 0 138929 0.000000\n",
+ "5 birmingham.ac.uk 1 0 inf\n",
+ "6 brenda-enzymes.org 19 0 inf\n",
+ "7 caltech.edu 0 190863 0.000000\n",
+ "8 canada.ca 0 138984 0.000000\n",
+ "9 cern.ch 0 148213 0.000000\n",
+ "10 colorado.edu 1 0 inf\n",
+ "11 copernicus.org 1 671032 0.000149\n",
+ "12 csir.co.za 2 0 inf\n",
+ "13 dir.bg 2 0 inf\n",
+ "14 duke.edu 0 154950 0.000000\n",
+ "15 epfl.ch 0 158864 0.000000\n",
+ "16 esa.int 1 0 inf\n",
+ "17 expasy.org 2 0 inf\n",
+ "18 fabiandablander.com 3 0 inf\n",
+ "19 harvard.edu 0 251278 0.000000\n",
+ "20 hixie.ch 1 0 inf\n",
+ "21 iitr.ac.in 2 0 inf\n",
+ "22 imdpune.gov.in 1 0 inf\n",
+ "23 immunedynamics.io 4 0 inf\n",
+ "24 jaxa.jp 34 0 inf"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from collections import Counter\n",
+ "\n",
+ "def compare_top_domains(df_all, df_trunc, mime='text/html', n=25):\n",
+ " da = pd.DataFrame(\n",
+ " Counter(df_all.loc[df_all['content_mime_detected'] == mime,\n",
+ " 'top_domains'].apply(json.loads).values[0]).most_common(),\n",
+ " columns = ['domain', 'count_all']\n",
+ " )\n",
+ " dt = pd.DataFrame(\n",
+ " Counter(df_trunc.loc[df_trunc['content_mime_detected'] == mime,\n",
+ " 'top_domains'].apply(json.loads).values[0]).most_common(),\n",
+ " columns = ['domain', 'count_trunc']\n",
+ " )\n",
+ " d = dt.merge(da, how = 'outer').fillna(0).astype({'count_all': int, 'count_trunc': int})\n",
+ " d['%'] = 100.0 * d['count_trunc'] / d['count_all']\n",
+ " print(mime)\n",
+ " return d.head(n)\n",
+ "\n",
+ "\n",
+ "df_trunc = pd.read_csv(DATA_DIR / 'warc-truncation-domains-CC-SUPPLEMENTAL-2026-22.csv')\n",
+ "df_trunc['top_domains'] = df_trunc['top_domains'].str.replace('([a-z0-9.-]+)=', '\"\\\\1\":', regex=True)\n",
+ "\n",
+ "\n",
+ "compare_top_domains(df, df_trunc, mime='text/html')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Note**: for the MIME type \"text/html\" there are certain domains which might require a closer look because the rate of truncated docs is high."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "application/xhtml+xml\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " domain | \n",
+ " count_trunc | \n",
+ " count_all | \n",
+ " % | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " arlis.org | \n",
+ " 0 | \n",
+ " 69653 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " bgs.ac.uk | \n",
+ " 0 | \n",
+ " 79214 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " blogspot.com | \n",
+ " 0 | \n",
+ " 93519 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " clemson.edu | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " dockflow.org | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " edge.org | \n",
+ " 0 | \n",
+ " 30826 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " esf.edu | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " harvard.edu | \n",
+ " 0 | \n",
+ " 36310 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " ikonet.com | \n",
+ " 0 | \n",
+ " 43195 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " jmp.com | \n",
+ " 0 | \n",
+ " 58606 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " kyushu-u.ac.jp | \n",
+ " 0 | \n",
+ " 54801 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " lancs.ac.uk | \n",
+ " 0 | \n",
+ " 56328 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " mapsales.com | \n",
+ " 0 | \n",
+ " 53898 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " marchionnilab.org | \n",
+ " 0 | \n",
+ " 81381 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " nasa.gov | \n",
+ " 0 | \n",
+ " 34022 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " naturalsciences.be | \n",
+ " 0 | \n",
+ " 46448 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " ncl.ac.uk | \n",
+ " 0 | \n",
+ " 32527 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " nvidia.com | \n",
+ " 0 | \n",
+ " 45877 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " postgis.net | \n",
+ " 0 | \n",
+ " 34186 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " produccioncientificaluz.org | \n",
+ " 9 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " root.cern | \n",
+ " 0 | \n",
+ " 51238 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " sketchport.com | \n",
+ " 0 | \n",
+ " 35521 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " soton.ac.uk | \n",
+ " 0 | \n",
+ " 68546 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " stanford.edu | \n",
+ " 0 | \n",
+ " 46586 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " techscience.com | \n",
+ " 0 | \n",
+ " 57809 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " domain count_trunc count_all %\n",
+ "0 arlis.org 0 69653 0.0\n",
+ "1 bgs.ac.uk 0 79214 0.0\n",
+ "2 blogspot.com 0 93519 0.0\n",
+ "3 clemson.edu 1 0 inf\n",
+ "4 dockflow.org 1 0 inf\n",
+ "5 edge.org 0 30826 0.0\n",
+ "6 esf.edu 1 0 inf\n",
+ "7 harvard.edu 0 36310 0.0\n",
+ "8 ikonet.com 0 43195 0.0\n",
+ "9 jmp.com 0 58606 0.0\n",
+ "10 kyushu-u.ac.jp 0 54801 0.0\n",
+ "11 lancs.ac.uk 0 56328 0.0\n",
+ "12 mapsales.com 0 53898 0.0\n",
+ "13 marchionnilab.org 0 81381 0.0\n",
+ "14 nasa.gov 0 34022 0.0\n",
+ "15 naturalsciences.be 0 46448 0.0\n",
+ "16 ncl.ac.uk 0 32527 0.0\n",
+ "17 nvidia.com 0 45877 0.0\n",
+ "18 postgis.net 0 34186 0.0\n",
+ "19 produccioncientificaluz.org 9 0 inf\n",
+ "20 root.cern 0 51238 0.0\n",
+ "21 sketchport.com 0 35521 0.0\n",
+ "22 soton.ac.uk 0 68546 0.0\n",
+ "23 stanford.edu 0 46586 0.0\n",
+ "24 techscience.com 0 57809 0.0"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "compare_top_domains(df, df_trunc, mime='application/xhtml+xml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "application/pdf\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " domain | \n",
+ " count_trunc | \n",
+ " count_all | \n",
+ " % | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " academie-sciences.fr | \n",
+ " 0 | \n",
+ " 19277 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " admin.ch | \n",
+ " 232 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " arlis.org | \n",
+ " 417 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " auburn.edu | \n",
+ " 408 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " bayern.de | \n",
+ " 465 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " berkeley.edu | \n",
+ " 201 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " bom.gov.au | \n",
+ " 0 | \n",
+ " 26922 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " bournemouth.ac.uk | \n",
+ " 281 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " caltech.edu | \n",
+ " 595 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " cbd.int | \n",
+ " 0 | \n",
+ " 19612 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " cern.ch | \n",
+ " 135 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " columbia.edu | \n",
+ " 606 | \n",
+ " 26624 | \n",
+ " 2.276142 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " copernicus.org | \n",
+ " 376 | \n",
+ " 41100 | \n",
+ " 0.914842 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " cwi.nl | \n",
+ " 414 | \n",
+ " 18513 | \n",
+ " 2.236266 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " dagstuhl.de | \n",
+ " 174 | \n",
+ " 23780 | \n",
+ " 0.731707 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " eartharxiv.org | \n",
+ " 624 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " ems.press | \n",
+ " 0 | \n",
+ " 39803 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " epfl.ch | \n",
+ " 172 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " epizodyspace.ru | \n",
+ " 259 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " epj-conferences.org | \n",
+ " 0 | \n",
+ " 20771 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " esa.int | \n",
+ " 333 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " gsj.jp | \n",
+ " 188 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " gva.es | \n",
+ " 237 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " harvard.edu | \n",
+ " 212 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " hawaii.edu | \n",
+ " 397 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " domain count_trunc count_all %\n",
+ "0 academie-sciences.fr 0 19277 0.000000\n",
+ "1 admin.ch 232 0 inf\n",
+ "2 arlis.org 417 0 inf\n",
+ "3 auburn.edu 408 0 inf\n",
+ "4 bayern.de 465 0 inf\n",
+ "5 berkeley.edu 201 0 inf\n",
+ "6 bom.gov.au 0 26922 0.000000\n",
+ "7 bournemouth.ac.uk 281 0 inf\n",
+ "8 caltech.edu 595 0 inf\n",
+ "9 cbd.int 0 19612 0.000000\n",
+ "10 cern.ch 135 0 inf\n",
+ "11 columbia.edu 606 26624 2.276142\n",
+ "12 copernicus.org 376 41100 0.914842\n",
+ "13 cwi.nl 414 18513 2.236266\n",
+ "14 dagstuhl.de 174 23780 0.731707\n",
+ "15 eartharxiv.org 624 0 inf\n",
+ "16 ems.press 0 39803 0.000000\n",
+ "17 epfl.ch 172 0 inf\n",
+ "18 epizodyspace.ru 259 0 inf\n",
+ "19 epj-conferences.org 0 20771 0.000000\n",
+ "20 esa.int 333 0 inf\n",
+ "21 gsj.jp 188 0 inf\n",
+ "22 gva.es 237 0 inf\n",
+ "23 harvard.edu 212 0 inf\n",
+ "24 hawaii.edu 397 0 inf"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "compare_top_domains(df, df_trunc, mime='application/pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "text/html\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tld | \n",
+ " count_trunc | \n",
+ " count_all | \n",
+ " % | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " at | \n",
+ " 0 | \n",
+ " 305854 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " au | \n",
+ " 0 | \n",
+ " 472364 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " br | \n",
+ " 0 | \n",
+ " 279125 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " ca | \n",
+ " 0 | \n",
+ " 754304 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " ch | \n",
+ " 0 | \n",
+ " 537122 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " cn | \n",
+ " 0 | \n",
+ " 277346 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " com | \n",
+ " 50 | \n",
+ " 8204849 | \n",
+ " 0.000609 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " de | \n",
+ " 0 | \n",
+ " 2390003 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " edu | \n",
+ " 53 | \n",
+ " 4364031 | \n",
+ " 0.001214 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " es | \n",
+ " 0 | \n",
+ " 490055 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " eu | \n",
+ " 0 | \n",
+ " 369866 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " fi | \n",
+ " 0 | \n",
+ " 275921 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " fr | \n",
+ " 0 | \n",
+ " 738358 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " gov | \n",
+ " 0 | \n",
+ " 2110414 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " io | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " it | \n",
+ " 0 | \n",
+ " 496428 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " jp | \n",
+ " 34 | \n",
+ " 899297 | \n",
+ " 0.003781 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " kr | \n",
+ " 0 | \n",
+ " 295846 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " net | \n",
+ " 0 | \n",
+ " 620669 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " nl | \n",
+ " 0 | \n",
+ " 748835 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " no | \n",
+ " 0 | \n",
+ " 292552 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " org | \n",
+ " 67 | \n",
+ " 7988496 | \n",
+ " 0.000839 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " pl | \n",
+ " 18 | \n",
+ " 464163 | \n",
+ " 0.003878 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " ru | \n",
+ " 0 | \n",
+ " 714493 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " se | \n",
+ " 0 | \n",
+ " 705555 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tld count_trunc count_all %\n",
+ "0 at 0 305854 0.000000\n",
+ "1 au 0 472364 0.000000\n",
+ "2 br 0 279125 0.000000\n",
+ "3 ca 0 754304 0.000000\n",
+ "4 ch 0 537122 0.000000\n",
+ "5 cn 0 277346 0.000000\n",
+ "6 com 50 8204849 0.000609\n",
+ "7 de 0 2390003 0.000000\n",
+ "8 edu 53 4364031 0.001214\n",
+ "9 es 0 490055 0.000000\n",
+ "10 eu 0 369866 0.000000\n",
+ "11 fi 0 275921 0.000000\n",
+ "12 fr 0 738358 0.000000\n",
+ "13 gov 0 2110414 0.000000\n",
+ "14 io 5 0 inf\n",
+ "15 it 0 496428 0.000000\n",
+ "16 jp 34 899297 0.003781\n",
+ "17 kr 0 295846 0.000000\n",
+ "18 net 0 620669 0.000000\n",
+ "19 nl 0 748835 0.000000\n",
+ "20 no 0 292552 0.000000\n",
+ "21 org 67 7988496 0.000839\n",
+ "22 pl 18 464163 0.003878\n",
+ "23 ru 0 714493 0.000000\n",
+ "24 se 0 705555 0.000000"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def compare_top_tlds(df_all, df_trunc, mime='text/html', n=25):\n",
+ " da = dict()\n",
+ " [da.update(i) for i in df_all.loc[df_all['content_mime_detected'] == mime,\n",
+ " 'top_tlds'].apply(json.loads).values[0]]\n",
+ " da = pd.DataFrame(da.items(), columns = ['tld', 'count_all'])\n",
+ " dt = dict()\n",
+ " [dt.update(i) for i in df_trunc.loc[df_trunc['content_mime_detected'] == mime,\n",
+ " 'top_tlds'].apply(json.loads).values[0]]\n",
+ " dt = pd.DataFrame(dt.items(), columns = ['tld', 'count_trunc'])\n",
+ " d = dt.merge(da, how = 'outer').fillna(0).astype({'count_all': int, 'count_trunc': int})\n",
+ " d['%'] = 100.0 * d['count_trunc'] / d['count_all']\n",
+ " print(mime)\n",
+ " return d.head(n)\n",
+ "\n",
+ "df['top_tlds'] = df['top_tlds'].str.replace('([a-z0-9-]+),', '\"\\\\1\":', regex=True)\n",
+ "df_trunc['top_tlds'] = df_trunc['top_tlds'].str.replace('([a-z0-9-]+),', '\"\\\\1\":', regex=True)\n",
+ "\n",
+ "compare_top_tlds(df, df_trunc, mime='text/html')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "application/xhtml+xml\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tld | \n",
+ " count_trunc | \n",
+ " count_all | \n",
+ " % | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " ar | \n",
+ " 0 | \n",
+ " 22286 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " at | \n",
+ " 0 | \n",
+ " 8763 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " au | \n",
+ " 0 | \n",
+ " 12842 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " be | \n",
+ " 0 | \n",
+ " 74858 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " br | \n",
+ " 0 | \n",
+ " 48024 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " ca | \n",
+ " 6 | \n",
+ " 63671 | \n",
+ " 0.009423 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " cern | \n",
+ " 0 | \n",
+ " 51238 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " ch | \n",
+ " 0 | \n",
+ " 61890 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " cn | \n",
+ " 0 | \n",
+ " 26630 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " com | \n",
+ " 0 | \n",
+ " 529973 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " de | \n",
+ " 0 | \n",
+ " 58806 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " dk | \n",
+ " 0 | \n",
+ " 8341 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " edu | \n",
+ " 0 | \n",
+ " 230617 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " gov | \n",
+ " 0 | \n",
+ " 67496 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " in | \n",
+ " 0 | \n",
+ " 7264 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " is | \n",
+ " 0 | \n",
+ " 6505 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " it | \n",
+ " 0 | \n",
+ " 9296 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " jp | \n",
+ " 0 | \n",
+ " 124401 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " kr | \n",
+ " 0 | \n",
+ " 5661 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " net | \n",
+ " 0 | \n",
+ " 35349 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " nl | \n",
+ " 0 | \n",
+ " 7652 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " org | \n",
+ " 10 | \n",
+ " 566035 | \n",
+ " 0.001767 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " pl | \n",
+ " 0 | \n",
+ " 12258 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " ru | \n",
+ " 0 | \n",
+ " 7177 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " uk | \n",
+ " 0 | \n",
+ " 336407 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tld count_trunc count_all %\n",
+ "0 ar 0 22286 0.000000\n",
+ "1 at 0 8763 0.000000\n",
+ "2 au 0 12842 0.000000\n",
+ "3 be 0 74858 0.000000\n",
+ "4 br 0 48024 0.000000\n",
+ "5 ca 6 63671 0.009423\n",
+ "6 cern 0 51238 0.000000\n",
+ "7 ch 0 61890 0.000000\n",
+ "8 cn 0 26630 0.000000\n",
+ "9 com 0 529973 0.000000\n",
+ "10 de 0 58806 0.000000\n",
+ "11 dk 0 8341 0.000000\n",
+ "12 edu 0 230617 0.000000\n",
+ "13 gov 0 67496 0.000000\n",
+ "14 in 0 7264 0.000000\n",
+ "15 is 0 6505 0.000000\n",
+ "16 it 0 9296 0.000000\n",
+ "17 jp 0 124401 0.000000\n",
+ "18 kr 0 5661 0.000000\n",
+ "19 net 0 35349 0.000000\n",
+ "20 nl 0 7652 0.000000\n",
+ "21 org 10 566035 0.001767\n",
+ "22 pl 0 12258 0.000000\n",
+ "23 ru 0 7177 0.000000\n",
+ "24 uk 0 336407 0.000000"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "compare_top_tlds(df, df_trunc, mime='application/xhtml+xml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "application/pdf\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " tld | \n",
+ " count_trunc | \n",
+ " count_all | \n",
+ " % | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " at | \n",
+ " 363 | \n",
+ " 34008 | \n",
+ " 1.067396 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " au | \n",
+ " 0 | \n",
+ " 41305 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " br | \n",
+ " 0 | \n",
+ " 45249 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " ca | \n",
+ " 2730 | \n",
+ " 77882 | \n",
+ " 3.505303 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " cc | \n",
+ " 1016 | \n",
+ " 81456 | \n",
+ " 1.247299 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " ch | \n",
+ " 879 | \n",
+ " 72648 | \n",
+ " 1.209944 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " com | \n",
+ " 1252 | \n",
+ " 176972 | \n",
+ " 0.707457 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " cz | \n",
+ " 273 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " de | \n",
+ " 2435 | \n",
+ " 329161 | \n",
+ " 0.739760 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " dk | \n",
+ " 277 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " edu | \n",
+ " 5576 | \n",
+ " 388561 | \n",
+ " 1.435039 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " es | \n",
+ " 291 | \n",
+ " 58806 | \n",
+ " 0.494847 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " eu | \n",
+ " 185 | \n",
+ " 25081 | \n",
+ " 0.737610 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " fr | \n",
+ " 669 | \n",
+ " 80750 | \n",
+ " 0.828483 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " gov | \n",
+ " 1932 | \n",
+ " 147427 | \n",
+ " 1.310479 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " hr | \n",
+ " 175 | \n",
+ " 28334 | \n",
+ " 0.617633 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " il | \n",
+ " 0 | \n",
+ " 25061 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " in | \n",
+ " 248 | \n",
+ " 38181 | \n",
+ " 0.649538 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " int | \n",
+ " 492 | \n",
+ " 41210 | \n",
+ " 1.193885 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " it | \n",
+ " 243 | \n",
+ " 88637 | \n",
+ " 0.274152 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " jp | \n",
+ " 2003 | \n",
+ " 173769 | \n",
+ " 1.152680 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " net | \n",
+ " 146 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " nl | \n",
+ " 906 | \n",
+ " 49031 | \n",
+ " 1.847811 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " no | \n",
+ " 251 | \n",
+ " 0 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " org | \n",
+ " 5140 | \n",
+ " 593956 | \n",
+ " 0.865384 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " tld count_trunc count_all %\n",
+ "0 at 363 34008 1.067396\n",
+ "1 au 0 41305 0.000000\n",
+ "2 br 0 45249 0.000000\n",
+ "3 ca 2730 77882 3.505303\n",
+ "4 cc 1016 81456 1.247299\n",
+ "5 ch 879 72648 1.209944\n",
+ "6 com 1252 176972 0.707457\n",
+ "7 cz 273 0 inf\n",
+ "8 de 2435 329161 0.739760\n",
+ "9 dk 277 0 inf\n",
+ "10 edu 5576 388561 1.435039\n",
+ "11 es 291 58806 0.494847\n",
+ "12 eu 185 25081 0.737610\n",
+ "13 fr 669 80750 0.828483\n",
+ "14 gov 1932 147427 1.310479\n",
+ "15 hr 175 28334 0.617633\n",
+ "16 il 0 25061 0.000000\n",
+ "17 in 248 38181 0.649538\n",
+ "18 int 492 41210 1.193885\n",
+ "19 it 243 88637 0.274152\n",
+ "20 jp 2003 173769 1.152680\n",
+ "21 net 146 0 inf\n",
+ "22 nl 906 49031 1.847811\n",
+ "23 no 251 0 inf\n",
+ "24 org 5140 593956 0.865384"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "compare_top_tlds(df, df_trunc, mime='application/pdf')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### A Closer Look at Domains With Large Amounts of Truncated Content\n",
+ "\n",
+ "The aim is to identify domains which would be \"profiteers\" if the content limit threshold were increased - domains which already (with the current 25 MiB limit) have many truncated captures or occupy a large amount of WARC storage.\n",
+ "\n",
+ "Get reliable metrics at the level of registered (\"pay-level\") domains:\n",
+ "- number of truncated pages\n",
+ "- ratio of truncated pages to all page captures\n",
+ "- GiB of truncated content in WARC files\n",
+ "\n",
+ "```sql\n",
+ "with tmp1 as (select\n",
+ " count(*) as count,\n",
+ " content_mime_detected,\n",
+ " url_host_registered_domain\n",
+ "from \"ccoaindex\".\"ccoaindex\"\n",
+ "where crawl = 'CC-SUPPLEMENTAL-2026-22'\n",
+ " and subset = 'warc'\n",
+ "group by content_mime_detected,\n",
+ " url_host_registered_domain),\n",
+ "tmp2 as (select\n",
+ " count(*) as count,\n",
+ " sum(warc_record_length) as sum_warc_record_length,\n",
+ " content_mime_detected,\n",
+ " url_host_registered_domain\n",
+ "from \"ccoaindex\".\"ccoaindex\"\n",
+ "where crawl = 'CC-SUPPLEMENTAL-2026-22'\n",
+ " and subset = 'warc'\n",
+ " and content_truncated = 'length'\n",
+ "group by content_mime_detected,\n",
+ " url_host_registered_domain\n",
+ "having count(*) >= 10000\n",
+ " or sum(warc_record_length) > 10*1024*1024*cast(1024 as bigint))\n",
+ "select trunc.count as count_trunc,\n",
+ " all.count as count_total,\n",
+ " format('%.3f', 100.0 * trunc.count / all.count) as perc_trunc,\n",
+ " format('%,.2f', trunc.sum_warc_record_length / (1024.0*1024*1024)) storage_trunc_gib,\n",
+ " all.content_mime_detected as content_mime_detected,\n",
+ " all.url_host_registered_domain as url_host_registered_domain\n",
+ "from tmp1 as all\n",
+ " right outer join tmp2 as trunc\n",
+ " on all.content_mime_detected = trunc.content_mime_detected\n",
+ "and all.url_host_registered_domain = trunc.url_host_registered_domain\n",
+ "order by trunc.sum_warc_record_length desc;\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " count_trunc | \n",
+ " count_total | \n",
+ " perc_trunc | \n",
+ " storage_trunc_gib | \n",
+ " content_mime_detected | \n",
+ " url_host_registered_domain | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 4294 | \n",
+ " 25907 | \n",
+ " 16.575 | \n",
+ " 55.71 | \n",
+ " application/octet-stream | \n",
+ " nasa.gov | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 3291 | \n",
+ " 9311 | \n",
+ " 35.345 | \n",
+ " 50.30 | \n",
+ " model/vnd.valve.source.compiled-map | \n",
+ " nasa.gov | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2137 | \n",
+ " 3198 | \n",
+ " 66.823 | \n",
+ " 47.88 | \n",
+ " application/octet-stream | \n",
+ " bcgsc.ca | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1502 | \n",
+ " 4298 | \n",
+ " 34.946 | \n",
+ " 30.71 | \n",
+ " application/octet-stream | \n",
+ " ucsc.edu | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1265 | \n",
+ " 17132 | \n",
+ " 7.384 | \n",
+ " 27.65 | \n",
+ " application/pdf | \n",
+ " uqam.ca | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1113 | \n",
+ " 10964 | \n",
+ " 10.151 | \n",
+ " 25.71 | \n",
+ " application/pdf | \n",
+ " scholaris.ca | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 936 | \n",
+ " 7582 | \n",
+ " 12.345 | \n",
+ " 22.45 | \n",
+ " application/pdf | \n",
+ " ipums.org | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 1623 | \n",
+ " 9502 | \n",
+ " 17.081 | \n",
+ " 21.24 | \n",
+ " application/octet-stream | \n",
+ " mit.edu | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 798 | \n",
+ " 2204 | \n",
+ " 36.207 | \n",
+ " 18.39 | \n",
+ " application/octet-stream | \n",
+ " salmobase.org | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 717 | \n",
+ " 3282 | \n",
+ " 21.846 | \n",
+ " 16.65 | \n",
+ " application/octet-stream | \n",
+ " wisc.edu | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 623 | \n",
+ " 48224 | \n",
+ " 1.292 | \n",
+ " 14.68 | \n",
+ " application/pdf | \n",
+ " neurips.cc | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 624 | \n",
+ " 11354 | \n",
+ " 5.496 | \n",
+ " 14.53 | \n",
+ " application/pdf | \n",
+ " eartharxiv.org | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 606 | \n",
+ " 26624 | \n",
+ " 2.276 | \n",
+ " 13.85 | \n",
+ " application/pdf | \n",
+ " columbia.edu | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 595 | \n",
+ " 16008 | \n",
+ " 3.717 | \n",
+ " 13.78 | \n",
+ " application/pdf | \n",
+ " caltech.edu | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 611 | \n",
+ " 5193 | \n",
+ " 11.766 | \n",
+ " 13.01 | \n",
+ " application/pdf | \n",
+ " jaea.go.jp | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 465 | \n",
+ " 16693 | \n",
+ " 2.786 | \n",
+ " 11.25 | \n",
+ " application/pdf | \n",
+ " bayern.de | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 485 | \n",
+ " 21793 | \n",
+ " 2.225 | \n",
+ " 11.12 | \n",
+ " application/pdf | \n",
+ " nasa.gov | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 474 | \n",
+ " 7418 | \n",
+ " 6.390 | \n",
+ " 10.76 | \n",
+ " application/pdf | \n",
+ " uni-bayreuth.de | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 428 | \n",
+ " 4980 | \n",
+ " 8.594 | \n",
+ " 10.39 | \n",
+ " application/pdf | \n",
+ " umk.pl | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " count_trunc count_total perc_trunc storage_trunc_gib \\\n",
+ "0 4294 25907 16.575 55.71 \n",
+ "1 3291 9311 35.345 50.30 \n",
+ "2 2137 3198 66.823 47.88 \n",
+ "3 1502 4298 34.946 30.71 \n",
+ "4 1265 17132 7.384 27.65 \n",
+ "5 1113 10964 10.151 25.71 \n",
+ "6 936 7582 12.345 22.45 \n",
+ "7 1623 9502 17.081 21.24 \n",
+ "8 798 2204 36.207 18.39 \n",
+ "9 717 3282 21.846 16.65 \n",
+ "10 623 48224 1.292 14.68 \n",
+ "11 624 11354 5.496 14.53 \n",
+ "12 606 26624 2.276 13.85 \n",
+ "13 595 16008 3.717 13.78 \n",
+ "14 611 5193 11.766 13.01 \n",
+ "15 465 16693 2.786 11.25 \n",
+ "16 485 21793 2.225 11.12 \n",
+ "17 474 7418 6.390 10.76 \n",
+ "18 428 4980 8.594 10.39 \n",
+ "\n",
+ " content_mime_detected url_host_registered_domain \n",
+ "0 application/octet-stream nasa.gov \n",
+ "1 model/vnd.valve.source.compiled-map nasa.gov \n",
+ "2 application/octet-stream bcgsc.ca \n",
+ "3 application/octet-stream ucsc.edu \n",
+ "4 application/pdf uqam.ca \n",
+ "5 application/pdf scholaris.ca \n",
+ "6 application/pdf ipums.org \n",
+ "7 application/octet-stream mit.edu \n",
+ "8 application/octet-stream salmobase.org \n",
+ "9 application/octet-stream wisc.edu \n",
+ "10 application/pdf neurips.cc \n",
+ "11 application/pdf eartharxiv.org \n",
+ "12 application/pdf columbia.edu \n",
+ "13 application/pdf caltech.edu \n",
+ "14 application/pdf jaea.go.jp \n",
+ "15 application/pdf bayern.de \n",
+ "16 application/pdf nasa.gov \n",
+ "17 application/pdf uni-bayreuth.de \n",
+ "18 application/pdf umk.pl "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "domains = pd.read_csv(DATA_DIR / 'warc-truncation-domains-detailed-CC-SUPPLEMENTAL-2026-22.csv')\n",
+ "domains.head(50)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "Headline: PDFs dominate everything.\n",
+ "\n",
+ "- application/pdf is only ~6.6 % of records by count (3.17 M of ~48 M) but 73.5 % of total WARC storage. Truncation analysis is effectively a PDF question.\n",
+ "- ~1.05 % of PDFs are truncated (33,301 records), and 11.55 % of all WARC storage is truncated PDFs — about 712.75 GiB of payload that hit the cap. Of those 33,301 truncated PDFs, 31,373 are length-truncated\n",
+ "(i.e. the 25 MiB limit, not a connection drop or timeout). So nearly all PDF truncation is \"the document was bigger than 25 MiB.\"\n",
+ "\n",
+ "Other MIMEs worth noting.\n",
+ "\n",
+ "- application/octet-stream: 16 % truncation rate, ~221 GiB length-truncated. Heavily driven by nasa.gov, bcgsc.ca, ucsc.edu, mit.edu, salmobase.org, wisc.edu — looks like bioinformatics/science data dumps.\n",
+ "- model/vnd.valve.source.compiled-map: 35 % truncated, 50 GiB, all from nasa.gov. Niche but striking.\n",
+ "- text/html: essentially unaffected (0.002 % truncated, 843 records total, mostly disconnects/timeouts, only 255 by length, ~1 GiB). HTML fits in 25 MiB just fine.\n",
+ "- Long tail of high-rate-but-low-volume types: chemical/x-cif (88 %), application/x-sqlite3 (76 %), application/vnd.apple.keynote (28 %) — interesting curiosities, not material to total losses.\n",
+ "\n",
+ "Where the loss is concentrated. The detailed per-domain table makes the targeting obvious. PDF length-truncations cluster on academic / government / research repositories:\n",
+ "\n",
+ "- uqam.ca 27.6 GiB, scholaris.ca 25.7 GiB, ipums.org 22.5 GiB, neurips.cc 14.7 GiB, eartharxiv.org 14.5 GiB, columbia.edu, caltech.edu, jaea.go.jp, bayern.de, nasa.gov, uni-bayreuth.de, umk.pl.\n",
+ "- TLD breakdown for PDFs confirms it: .ca 3.5 %, .nl 1.85 %, .edu 1.43 %, .gov 1.31 %, .jp 1.15 % — all well above the .com baseline of 0.7 %. So the truncated content is dominated by long research PDFs, theses,\n",
+ "datasets, conference proceedings."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "cc-focus",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}