cc-citations/bib/cc2015.bib at main · commoncrawl/cc-citations · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
@Article{cc:MeuselVignaLehmbergBizer:2015:web-graph-structure-aggregation-levels,
  author       = "Robert Meusel and Sebastiano Vigna and Oliver Lehmberg and Christian Bizer",
  title        = "The Graph Structure in the Web – Analyzed on Different Aggregation Levels",
  journal      = "The Journal of Web Science",
  volume       = "1",
  number       = "1",
  pages        = "33--47",
  year         = "2015",
  doi          = "10.1561/106.00000003",
  URL          = "https://pdfs.semanticscholar.org/b5d5/88298e6845b4bfd40ea779ce21e628239ef3.pdf",
  cc-author-affiliation = "University of Mannheim, Germany; Università degli Studi di Milano, Italy",
  cc-class     = "web-science/hyperlinkgraph",
}

@InProceedings{cc:StolzHepp:2015:crawling-for-structured-data,
  author       = "Alex Stolz and Martin Hepp",
  title        = "Towards Crawling the Web for Structured Data: Pitfalls of {Common Crawl} for {E}-Commerce",
  editor       = "Olaf Hartig and Juan Sequeda and Aidan Hogan",
  booktitle    = "Proceedings of the 6th International Workshop on Consuming Linked Data co-located with 14th
                 International Semantic Web Conference ({ISWC} 2015), Bethlehem, Pennsylvania, US, October 12th, 2015.",
  series       = "{CEUR} Workshop Proceedings",
  volume       = "1426",
  publisher    = "CEUR-WS.org",
  year         = "2015",
  URL          = "http://ceur-ws.org/Vol-1426/paper-04.pdf",
  cc-author-affiliation = "Universitaet der Bundeswehr Munich, Germany",
  cc-class     = "nlp/corpus-representativeness, semantic web, microdata, e-commerce",
}

@InProceedings{cc:EberiusThieleBraunschweigLehner:2015:top-k-entity-augmentation,
  author       = "Julian Eberius and Maik Thiele and Katrin Braunschweig and Wolfgang Lehner",
  title        = "Top-k Entity Augmentation Using Consistent Set Covering",
  booktitle    = "Proceedings of the 27th International Conference on Scientific and Statistical Database
                 Management",
  series       = "SSDBM '15",
  publisher    = "ACM",
  year         = "2015",
  doi          = "10.1145/2791347.2791353",
  URL          = "https://www.semanticscholar.org/paper/Top-k-entity-augmentation-using-consistent-set-Eberius-Thiele/a554fe7c49837e2d2d995e00fd3b62a6ca5650f2",
  pdf          = "https://www.researchgate.net/profile/Julian_Eberius/publication/280658741_Top-k_Entity_Augmentation_Using_Consistent_Set_Covering/links/55e978f408ae21d099c2eeb5/Top-k-Entity-Augmentation-Using-Consistent-Set-Covering.pdf",
  cc-derived-dataset-about = "{DresdenWebTableCorpus}",
  cc-author-affiliation = "Technische Universität Dresden, Germany",
  cc-class     = "semantic web, web tables, web mining",
  cc-snippet   = "To enable repeatability we publish the implementation², but also include the web table corpus used
                 for the evaluation³. This corpus contains 100M Web tables extracted from a publicly available Web
                 crawl⁴ [4: http://commoncrawl.org]",
}

@InProceedings{cc:MalensekPallickaraPallickara:2015:Disk-IO-contention,
  author       = "Matthew Malensek and Sangmi Lee Pallickara and Shrideep Pallickara",
  title        = "Alleviation of Disk {I}/{O} Contention in Virtualized Settings for Data-Intensive Computing",
  booktitle    = "Proceedings of the 2nd {IEEE}/{ACM} International Symposium on Big Data Computing ({BDC} 2015)",
  year         = "2015",
  URL          = "http://galileo.cs.colostate.edu/papers/DiskInterference-BDC.pdf",
  cc-author-affiliation = "Colorado State University",
}

@InProceedings{cc:BarikLubickSmithSlankasEtAl:2015:FUSE-spreadsheet-corpus,
  author       = "Titus Barik and Kevin Lubick and Justin Smith and John Slankas and Emerson Murphy-Hill",
  title        = "{FUSE}: {A} Reproducible, Extendable, Internet-scale Corpus of Spreadsheets",
  booktitle    = "Proceedings of the 12th Working Conference on Mining Software Repositories ({MSR} 2015)",
  year         = "2015",
  URL          = "http://kjlubick.github.io/pubs/MSR2015-Fuse_spreadsheet_corpus.pdf",
  cc-author-affiliation = "ABB Corporate Research and North Carolina State University",
}

@InProceedings{cc:DaiberQuirozWechslerFrank:2015:Splitting-compounds,
  author       = "Joachim Daiber and Lautaro Quiroz and Roger Wechsler and Stella Frank",
  title        = "Splitting Compounds by Semantic Analogy",
  booktitle    = "Proceedings of the 1st Deep Machine Translation Workshop ({DMTW} 2015)",
  year         = "2015",
  URL          = "https://ufal.mff.cuni.cz/~rosa/2015/docs/dmtw2015.pdf#page=26",
  cc-author-affiliation = "University of Amsterdam",
}

@Misc{cc:GalkinMouromtsevAuer:2015:Identifying-web-tables,
  author       = "Mikhail Galkin and Dmitry Mouromtsev and Sören Auer",
  title        = "Identifying Web Tables – Supporting a Neglected Type of Content on the Web",
  year         = "2015",
  eprint       = "1503.06598",
  archiveprefix = "arXiv",
  URL          = "http://arxiv.org/pdf/1503.06598.pdf",
  cc-author-affiliation = "ITMO University, St. Petersburg, Russia; University of Bonn, Germany",
}

@InProceedings{cc:Juba:2015:Sampling-for-anomaly-detection,
  author       = "Brendan Juba and Christopher Musco and Fan Long and Stelios Sidiroglou-Douskos and Martin Rinard",
  title        = "Principled Sampling for Anomaly Detection",
  booktitle    = "Proceedings of the 22nd Annual Network and Distributed System Security Symposium ({NDSS} 2015)",
  publisher    = "The Internet Society",
  year         = "2015",
  URL          = "http://www.cse.wustl.edu/~bjuba/papers/anomaly_detection.pdf",
  cc-author-affiliation = "Washington University in St. Louis",
}

@InProceedings{cc:EwaPotoniecŁawrynowicz:2015:Extracting-usage-patterns-of-ontologies,
  author       = "Ewa Kowalczuk and Jedrzej Potoniec and Agnieszka Ławrynowicz",
  title        = "Extracting Usage Patterns of Ontologies on the Web: a Case Study on {GoodRelations} Vocabulary in
                 {RDFa}",
  booktitle    = "Proceedings of the 11th International Workshop on {OWL}: Experiences and Directions ({OWLED}
                 2014)",
  series       = "{CEUR} Workshop Proceedings",
  volume       = "1265",
  publisher    = "CEUR-WS.org",
  year         = "2015",
  URL          = "http://ceur-ws.org/Vol-1265/owled2014_submission_14.pdf",
  cc-author-affiliation = "Institute of Computing Science, Poznan University of Technology, Poland",
}

@InProceedings{cc:GuoLiuHanMaletti:2015:Tunable-language-model-for-SMT,
  author       = "Junfei Guo and Juan Liu and Qi Han and Andreas Maletti",
  title        = "A Tunable Language Model for Statistical Machine Translation",
  booktitle    = "Proceedings of the 11th Conference of the Association for Machine Translation in the Americas
                 ({AMTA} 2014)",
  year         = "2015",
  URL          = "http://www.ims.uni-stuttgart.de/institut/mitarbeiter/maletti/pub/guoliuhanmal14.pdf",
  cc-author-affiliation = "School of Computer, Wuhan University, China; Institute for Natural Language Processing,
                 University of Stuttgart, Germany; Institute for Visualization and Interactive Systems, University of
                 Stuttgart, Germany; Institute of Computer Science, University of Leipzig, Germany",
}

@InProceedings{cc:OusterhoutRastiRatnasamyShenkerEtAl:2015:Performance-in-data-analytics-frameworks,
  author       = "Kay Ousterhout and Ryan Rasti and Sylvia Ratnasamy and Scott Shenker and Byung-Gon Chun",
  title        = "Making Sense of Performance in Data Analytics Frameworks",
  booktitle    = "Proceedings of the 12th {USENIX} Symposium on Networked Systems Design and Implementation
                 ({NSDI} 15)",
  publisher    = "USENIX Association",
  year         = "2015",
  URL          = "http://www.eecs.berkeley.edu/~keo/publications/nsdi15-final147.pdf",
  cc-author-affiliation = "UC Berkeley; ICSI; VMware; Seoul National University",
}

@InProceedings{cc:JaffeJinKingSchinjdel:2015:Azmat-sentence-similarity,
  author       = "Evan Jaffe and Lifeng Jin and David King and Marten van Schijndel",
  title        = "{Azmat}: Sentence Similarity using Associative Matrices",
  booktitle    = "Proceedings of the 9th International Workshop on Semantic Evaluation ({SemEval} 2015)",
  publisher    = "Association for Computational Linguistics",
  year         = "2015",
  URL          = "http://www.ling.ohio-state.edu/~vanschm/resources/uploads/jaffe_etal-2015-semeval.pdf",
  cc-author-affiliation = "Dept. of Linguistics, Ohio State University",
}

@Misc{cc:AlemiGinsparg:2015:Text-segmentation-based-on-word-embeddings,
  author       = "Alexander A. Alemi and Paul Ginsparg",
  title        = "Text Segmentation based on Semantic Word Embeddings",
  year         = "2015",
  eprint       = "1503.05543",
  archiveprefix = "arXiv",
  URL          = "http://arxiv.org/pdf/1503.05543.pdf",
  cc-author-affiliation = "Dept. of Physics, Cornell University; Dept. of Physics and Information Science, Cornell
                 University",
}

@Article{cc:Hayes:2015:Crawling-toward-wiser-web,
  author       = "Hayes, Brian",
  title        = "Crawling toward a {Wiser} {Web}",
  journal      = "American Scientist",
  volume       = "103",
  number       = "3",
  pages        = "184",
  year         = "2015",
  ISSN         = "0003-0996, 1545-2786",
  doi          = "10.1511/2015.114.184",
  URL          = "https://www.americanscientist.org/article/crawling-toward-a-wiser-web",
  urldate      = "2025-02-28",
  cc-author-affiliation = "",
  cc-class     = "web-crawling, dataset-creation",
  cc-snippet   = "What if the public had direct access to the entire crawl, and everyone were welcome to write and run
                 their own programs for analyzing the data? The Common Crawl offers just such an opportunity. [...] The
                 Common Crawl is overseen by a nonprofit corporation, the Common Crawl Foundation, established in 2007
                 by Gil Elbaz, a software entrepreneur. [...] Work on the crawl is done by a small staff in San
                 Francisco, and by volunteers. [...] Stephen Merity, data scientist for the Common Crawl, explains that
                 the group’s crawling strategy has evolved away from pure reliance on discovering and following links.
                 The crawl’s to-do list now begins with a large set of URLs donated by the search engine Blekko.
                 During the crawl the list of discovered links is pruned to reduce the volume of spam and escape spider
                 traps. The aim is to give precedence to pages of interest to human readers.",
}

@misc{cc:McSherry:2015:scalability-at-what-cost,
  author       = {Frank McSherry},
  title        = {Scalability! {But} at what {COST}?},
  year         = {2015},
  URL          = {https://www.frankmcsherry.org/graph/scalability/cost/2015/01/15/COST.html},
  url2         = {https://github.com/frankmcsherry/blog/blob/master/posts/2015-01-15.md},
  cc-class     = {web-science/hyperlinkgraph, big data},
  cc-author-affiliation = {ETH Zurich, Switzerland},
}

@misc{cc:McSherry:2015:bigger-data-same-laptop,
  author       = {Frank McSherry},
  title        = {Bigger data; same laptop},
  year         = {2015},
  URL          = {https://www.frankmcsherry.org/graph/scalability/cost/2015/02/04/COST2.html},
  url2         = {https://github.com/frankmcsherry/blog/blob/master/posts/2015-02-04.md},
  cc-class     = {web-science/hyperlinkgraph, big data},
  cc-derived-dataset-used = {WDC-hyperlinkgraph},
  cc-author-affiliation = {ETH Zurich, Switzerland},
}