cc-citations/bib/cc2019.bib at main · commoncrawl/cc-citations · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
@Book{cc:BrueggerMiligan:2018:SAGE-handbook-web-history,
  title        = "The {SAGE} Handbook of Web History",
  editor       = "Brügger, Niels and Milligan, Ian",
  year         = "2019",
  URL          = "https://us.sagepub.com/en-us/nam/the-sage-handbook-of-web-history/book252251",
  publisher    = "SAGE Publications Limited",
  cc-author-affiliation = "Aarhus University, Denmark; University of Waterloo, Canada",
  cc-class     = "web-science, web history",
}

@Article{cc:WenzekLachauxConneauChaudharyEtAl:2019:CCNet,
  author       = "Guillaume Wenzek and Marie-Anne Lachaux and Alexis Conneau and Vishrav Chaudhary and Francisco Guzmán
                 and Armand Joulin and Edouard Grave",
  title        = "{CCNet}: Extracting high quality monolingual datasets from web crawl data",
  journal      = "CoRR",
  year         = "2019",
  URL          = "https://arxiv.org/abs/1911.00359",
  eprint       = "1911.00359",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  cc-author-affiliation = "Facebook AI",
  cc-derived-dataset-about = "CCNet",
  abstract     = "Pre-training text representations have led to significant improvements in many areas of natural
                 language processing. The quality of these models benefits greatly from the size of the pretraining
                 corpora as long as its quality is preserved. In this paper, we describe an automatic pipeline to
                 extract massive high-quality monolingual datasets from Common Crawl for a variety of languages. Our
                 pipeline follows the data processing introduced in fastText (Mikolov et al., 2017; Grave et al., 2018),
                 that deduplicates documents and identifies their language. We augment this pipeline with a filtering
                 step to select documents that are close to high quality corpora like Wikipedia.",
  cc-snippet   = "[about https://github.com/facebookresearch/cc_net] In this paper, we present a data collection
                 pipeline that allows to gather massive monolingual corpora of high quality in a variety of languages,
                 including many low-resource ones. The principles of our pipeline are general and we show the results of
                 its application to data collected by the Common Crawl project.¹ Common Crawl is a massive non-curated
                 dataset of webpages in many languages, mixed together in temporal snapshots of the web.",
  cc-class     = "nlp/corpus-construction, nlp/web-as-corpus, nlp/low-resource-language",
}

@TechReport{cc:RadfordWuChildLuanEtAl:2019:language-models,
  title        = "Language models are unsupervised multitask learners",
  author       = "Alec Radford and Jeffrey Wu and Rewon Child and David Luan and Dario Amodei and Ilya Sutskever",
  institution  = "OpenAI",
  year         = "2019",
  URL          = "https://www.semanticscholar.org/paper/Language-Models-are-Unsupervised-Multitask-Learners-Radford-Wu/9405cc0d6169988371b2755e573cc28650d14dfe",
  cc-snippet   = "A promising source of diverse and nearly unlimited text is web scrapes such as Common Crawl. While
                 these archives are many orders of magnitude larger than current language modeling datasets, they have
                 significant data quality issues. Trinh & Le (2018) used Common Crawl in their work on commonsense
                 reasoning but noted a large amount of documents “whose content are mostly unintelligible”. We
                 observed similar data issues in our initial experiments with Common Crawl. Trinh & Le (2018)’s best
                 results were achieved using a small subsample of Common Crawl which included only documents most
                 similar to their target dataset, the Winograd Schema Challenge. While this is a pragmatic approach to
                 improve performance on a specific task, we want to avoid making assumptions about the tasks to be
                 performed ahead of time. Instead, we created a new web scrape which emphasizes document quality. To do
                 this we only scraped web pages which have been curated/filtered by humans. Manually filtering a full
                 web scrape would be exceptionally expensive so as a starting point, we scraped all outbound links from
                 Reddit, a social media platform, which received at least 3 karma. This can be thought of as a heuristic
                 indicator for whether other users found the link interesting, educational, or just funny. The resulting
                 dataset, WebText, contains the text subset of these 45 million links.",
  cc-class     = "cc-cited-not-used",
  cc-author-affiliation = "OpenAI, San Francisco, California, United States",
}

@InProceedings{cc:Ortiz-SuarezSagotRomary:2019:processing-huge-corpora,
  title        = "Asynchronous pipeline for processing huge corpora on medium to low resource infrastructures",
  author       = "Ortiz Suárez, Pedro Javier and Sagot, Benoît and Romary, Laurent",
  URL          = "https://hal.inria.fr/hal-02148693",
  booktitle    = "7th Workshop on the Challenges in the Management of Large Corpora (CMLC-7)",
  address      = "Cardiff, United Kingdom",
  editor       = "Piotr Bański and Adrien Barbaresi and Hanno Biber and Evelyn Breiteneder and Simon Clematide and Marc
                 Kupietz and Harald Lüngen and Caroline Iliadi",
  publisher    = "{Leibniz-Institut für Deutsche Sprache}",
  year         = "2019",
  doi          = "10.14618/IDS-PUB-9021",
  pdf          = "https://hal.inria.fr/hal-02148693/file/Asynchronous_Pipeline_for_Processing_Huge_Corpora_on_Medium_to_Low_Resource_Infrastructures.pdf",
  cc-author-affiliation = "Inria, Paris, France; Sorbonne University, Paris, France",
  cc-class     = "nlp/corpus-construction",
  cc-derived-dataset-about = "OSCAR",
  cc-dataset-used = "CC-MAIN-2018-47 (WET)",
  cc-snippet   = "We use the November 2018 snapshot which surpasses 20TB of uncompressed data and contains more than 50
                 thousand plain text files where each file consists of the plain text from multiple websites along its
                 metadata header. From now on, when we mention the “Common Crawl” corpus, we refer to this
                 particular November 2018 snapshot.",
}

@MastersThesis{cc:Mottl:2019:branchenklassifikation,
  author       = "Dominik Mottl",
  title        = "Multi-Label Branchenklassifikation von Web-Texten",
  school       = "Hochschule Darmstadt",
  year         = "2019",
  pdf          = "https://fbmn.h-da.de/uploads/Themen/WS18_thesis_mottl.pdf",
  cc-snippet   = "NER of company names and linking to DBpedia performed on English texts in 712 WET files of November
                 2018 crawl (CC-MAIN-2018-47) using cc-pyspark.",
  cc-author-affiliation = "Hochschule Darmstadt, Germany",
  cc-class     = "nlp/NER, entity-linking",
}

@Misc{cc:Nagel:2019:accessing-warc-files-via-sql,
  title        = "Accessing {WARC} files via {SQL}",
  author       = "Nagel, Sebastian",
  howpublished = "Poster at IIPC Web Archiving Conference, 6–7 June 2019, Zagreb, Croatia",
  year         = "2019",
  URL          = "https://digital.library.unt.edu/ark:/67531/metadc1608961/",
  pdf          = "https://netpreserve.org/ga2019/wp-content/uploads/2019/07/IIPCWAC2019-SEBASTIAN_NAGEL-Accessing_WARC_files_via_SQL-poster.pdf",
  cc-class     = "web-archiving, SQL, Parquet",
  cc-dataset-used = "cc-index-table",
  cc-author-affiliation = "Common Crawl, USA",
}

@Misc{cc:LiuOttGoyalDuEtAl:2019:RoBERTa-Robustly-Optimized-BERT,
  title        = "{RoBERTa}: {A} Robustly Optimized {BERT} Pretraining Approach",
  author       = "Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy
                 and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov",
  year         = "2019",
  eprint       = "1907.11692",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  URL          = "https://arxiv.org/abs/1907.11692",
  pdf          = "https://arxiv.org/pdf/1907.11692.pdf",
  cc-author-affiliation = "Paul G. Allen School of Computer Science & Engineering, University of Washington, Seattle,
                 WA, USA; Facebook AI",
  cc-class     = "nlp/corpus-construction, nlp/language-model",
  cc-dataset-used = "CC-NEWS",
  cc-derived-dataset-about = "CC-NEWS-RoBERTa",
  cc-snippet   = "We find that BERT was significantly undertrained and propose an improved recipe for training BERT
                 models, which we call RoBERTa, that can match or exceed the performance of all of the post-BERT
                 methods. Our modifications are simple, they include: (1) training the model longer, with bigger
                 batches, over more data; (2) removing the next sentence prediction objective; (3) training on longer
                 sequences; and (4) dynamically changing the masking pattern applied to the training data. We also
                 collect a large new dataset (CC-NEWS) of comparable size to other privately used datasets, to better
                 control for training set size effects. [...] CC-NEWS, which we collected from the English portion of
                 the CommonCrawl News dataset (Nagel, 2016). The data contains 63 million English news articles crawled
                 between September 2016 and February 2019. (76GB after filtering).⁴ [⁴ We use news-please (Hamborg
                 et al., 2017) to collect and extract CC-NEWS. CC-NEWS is similar to the REALNEWS dataset described in
                 Zellers et al. (2019).]",
}

@InCollection{cc:ZellersHoltzmanRashkinBiskEtAl:2019:defending-against-neural-fake-news,
  author       = "Zellers, Rowan and Holtzman, Ari and Rashkin, Hannah and Bisk, Yonatan and Farhadi, Ali and Roesner,
                 Franziska and Choi, Yejin",
  title        = "Defending against neural fake news",
  editor       = "H. Wallach and H. Larochelle and A. Beygelzimer and F. d'Alché-Buc and E. Fox and R. Garnett",
  booktitle    = "Advances in Neural Information Processing Systems 32",
  publisher    = "Curran Associates, Inc.",
  year         = "2019",
  pages        = "9054--9065",
  URL          = "http://papers.nips.cc/paper/9106-defending-against-neural-fake-news.pdf",
  cc-author-affiliation = "University of Washington, USA; Allen Institute for Artificial Intelligence, USA",
  cc-class     = "nlp/language-model, nlp/fake-news-detection, nlp/text-classification, misinformation, disinformation",
  cc-derived-dataset-about = "Grover-RealNews",
  cc-snippet   = "Dataset. We present RealNews, a large corpus of news articles from Common Crawl. Training Grover
                 requires a large corpus of news articles with metadata, but none currently exists. Thus, we construct
                 one by scraping dumps from Common Crawl, limiting ourselves to the 5000 news domains indexed by Google
                 News. We used the Newspaper Python library to extract the body and meta-data from each article. News
                 from Common Crawl dumps from December 2016 through March 2019 were used as training data; articles
                 published in April 2019 from the April 2019 dump were used for evaluation. After deduplication,
                 RealNews is 120 gigabytes without compression. [...] Obtaining the data required through Common Crawl
                 cost \$10k in AWS credits and can be massively parallelized over many CPUs. [...]",
}

@InProceedings{cc:PibiriPetriMoffat:2019:dictionary-based-inverted-index-compression,
  title        = "Fast Dictionary-Based Compression for Inverted Indexes",
  author       = "Pibiri, Giulio Ermanno and Petri, Matthias and Moffat, Alistair",
  booktitle    = "Proceedings of the Twelfth {ACM} International Conference on Web Search and Data Mining",
  year         = "2019",
  URL          = "https://dl.acm.org/citation.cfm?id=3290962",
  pdf          = "http://pages.di.unipi.it/pibiri/papers/WSDM19.pdf",
  doi          = "10.1145/3289600.3290962",
  abstract     = "Dictionary-based compression schemes provide fast decoding operation, typically at the expense of
                 reduced compression effectiveness compared to statistical or probability-based approaches. In this
                 work, we apply dictionary-based techniques to the compression of inverted lists, showing that the high
                 degree of regularity that these integer sequences exhibit is a good match for certain types of
                 dictionary methods, and that an important new trade-off balance between compression effectiveness and
                 compression efficiency can be achieved. Our observations are supported by experiments using the
                 document-level inverted index data for two large text collections, and a wide range of other index
                 compression implementations as reference points. Those experiments demonstrate that the gap between
                 efficiency and effectiveness can be substantially narrowed.",
  cc-snippet   = "We use the standard Gov2 collection containing 426 GiB of text; and CCNEWS, an English subset of the
                 freely available NEWS subset of the CommonCrawl¹
                 [¹http://commoncrawl.org/2016/10/news-dataset-available/], consisting of news articles in the period
                 09/01/16 to 30/03/18, following the methodology of Petri and Moffat [28].",
  cc-dataset-used = "CC-NEWS",
  cc-dataset-used-subset = "2016/09/01 - 2018/03/30",
  cc-class     = "information-retrieval/search-engine, information-retrieval/inverted-index",
  cc-author-affiliation = "University of Melbourne, Australia; University of Pisa, Italy; ISTI-CNR, Pisa, Italy",
}

@Misc{cc:SchwenkWenzekEdunovGraveEtAl:2019:CCMatrix-high-quality-parallel-sentences,
  doi          = "10.48550/ARXIV.1911.04944",
  URL          = "https://arxiv.org/abs/1911.04944",
  author       = "Schwenk, Holger and Wenzek, Guillaume and Edunov, Sergey and Grave, Edouard and Joulin, Armand",
  keywords     = "Computation and Language (cs.CL), FOS: Computer and information sciences",
  title        = "{CCMatrix}: Mining Billions of High-Quality Parallel Sentences on the {WEB}",
  publisher    = "arXiv",
  year         = "2019",
  abstract     = "We show that margin-based bitext mining in a multilingual sentence space can be applied to monolingual
                 corpora of billions of sentences. We are using ten snapshots of a curated common crawl corpus (Wenzek
                 et al., 2019), totalling 32.7 billion unique sentences. Using one unified approach for 38 languages, we
                 were able to mine 4.5 billions parallel sentences, out of which 661 million are aligned with English.
                 20 language pairs have more then 30 million parallel sentences, 112 more then 10 million, and most more
                 than one million, including direct alignments between many European or Asian languages.",
  cc-snippet   = "The curated Common Crawl corpus¶ In this work, we propose to mine parallel sentences from the Web, by
                 using the data released by the Common Crawl project.[⁵https://commoncrawl.org/] Each month, a
                 snapshot of the Web containing terabytes of web pages in various languages is obtained by randomly
                 exploring URLs. We start by applying some preprocessing steps to the raw text data, following the
                 pipeline introduced by Wenzek et al. (2019) and leading to the CCNet dataset. The first step is to
                 deduplicate the data at the paragraph level, as the original crawls contain up to 70\% of duplicated
                 data. This preprocessing removes low quality content, such as boilerplate, navigation menus or cookie
                 warnings. The second step of the pipeline is to identify the language of each document, using fastText⁶
                 (Grave et al., 2018). This language identifier uses a linear classifier with character n-gram features,
                 and can recognize up to 176 languages. Finally, the last step of the preprocessing is to filter low
                 quality content by training a language model on Wikipedia, and only keeping documents with a low
                 perplexity score. We refer the reader to Wenzek et al. (2019) for more details about this
                 pre-processing pipeline. In Figure 1, we report the number of unique sentences obtained after
                 preprocessing ten snapshots from Common Crawl. We currently process 38 languages. The English Web
                 content is abundant and we used only one snapshot.",
  cc-author-affiliation = "Facebook AI",
  cc-class     = "nlp/corpus-construction, nlp/parallel-corpus, nlp/machine-translation",
  cc-derived-dataset-about = "CCMatrix",
}

@Misc{cc:BakhtinGrossOttDengEtAl:2019:real-or-fake,
  doi          = "10.48550/ARXIV.1906.03351",
  URL          = "https://arxiv.org/abs/1906.03351",
  author       = "Bakhtin, Anton and Gross, Sam and Ott, Myle and Deng, Yuntian and Ranzato, Marc'Aurelio and Szlam,
                 Arthur",
  title        = "Real or Fake? Learning to Discriminate Machine from Human Generated Text",
  publisher    = "arXiv",
  year         = "2019",
  cc-snippet   = "CCNews: We collect a de-duplicated subset of the English portion of the CommonCrawl news dataset
                 (Nagel, 2016) [Sebastian Nagel. Cc-news.
                 http://web.archive.org/save/http://commoncrawl.org/2016/10/news-dataset-available/, 2016.], which
                 totals around 16 Billion words.",
  cc-dataset-used = "CC-NEWS",
  cc-dataset-used-subset = "2016 - 2018",
  cc-derived-dataset-about = "CCNews (Bakhtin, et al. 2019)",
  cc-author-affiliation = "Facebook AI Research; Harvard University, USA",
  cc-class     = "nlp/text-classification",
}

@Misc{cc:YangDaiYangCarbonellEtAl:2019:XLNet,
  URL          = "https://arxiv.org/abs/1906.08237",
  author       = "Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Ruslan and Le,
                 Quoc V.",
  title        = "{XLNet}: Generalized Autoregressive Pretraining for Language Understanding",
  year         = "2019",
  eprint       = "1906.08237",
  archiveprefix = "arXiv",
  cc-snippet   = "Following BERT [10], we use the BooksCorpus [40] and English Wikipedia as part of our pretraining
                 data, which have 13GB plain text combined. In addition, we include Giga5 (16GB text) [26], ClueWeb
                 2012-B (extended from [5]), and Common Crawl [6] for pretraining. We use heuristics to aggressively
                 filter out short or low-quality articles for ClueWeb 2012-B and Common Crawl, which results in 19GB and
                 110GB text respectively.",
  cc-author-affiliation = "Carnegie Mellon University, USA; Google AI Brain Team",
  cc-class     = "nlp/transformer-language-model",
}

@Misc{cc:ConneauKhandelwalGoyalChaudharyEtAl:2019:XML-R,
  doi          = "10.48550/ARXIV.1911.02116",
  URL          = "https://arxiv.org/abs/1911.02116",
  author       = "Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume
                 and Guzmán, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin",
  keywords     = "Computation and Language (cs.CL), FOS: Computer and information sciences",
  title        = "Unsupervised Cross-lingual Representation Learning at Scale",
  publisher    = "arXiv",
  year         = "2019",
  pdf          = "https://arxiv.org/pdf/1911.02116.pdf",
  abstract     = "This paper shows that pretraining multilingual language models at scale leads to significant
                 performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based
                 masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl
                 data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of
                 cross-lingual benchmarks, including +14.6\% average accuracy on XNLI, +13\% average F1 score on MLQA,
                 and +2.4\% F1 score on NER. XLM-R performs particularly well on low-resource languages, improving
                 15.7\% in XNLI accuracy for Swahili and 11.4\% for Urdu over previous XLM models. We also present a
                 detailed empirical analysis of the key factors that are required to achieve these gains, including the
                 trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and
                 low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual
                 modeling without sacrificing per-language performance; XLM-R is very competitive with strong
                 monolingual models on the GLUE and XNLI benchmarks. We will make our code, data and models publicly
                 available.",
  cc-author-affiliation = "Facebook AI",
  cc-class     = "nlp/corpus-construction, nlp/web-as-corpus, nlp/language-model",
  cc-derived-dataset-about = "CC-100",
  cc-derived-dataset-used = "CCNet",
  cc-snippet   = "Following Wenzek et al. (2019)², we build a clean CommonCrawl Corpus in 100 languages. [...] In this
                 work, we introduced XLM-R, our new state of the art multilingual masked language model trained on 2.5
                 TB of newly created clean CommonCrawl data in 100 languages.",
}