-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathcc2019.bib
More file actions
280 lines (268 loc) · 20.9 KB
/
Copy pathcc2019.bib
File metadata and controls
280 lines (268 loc) · 20.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
% Handbook on web history. The cc-* fields are non-standard annotations that standard BibTeX styles ignore.
% NOTE(review): first given name corrected from Nils to Niels; the citation key is left untouched so that
% existing citations keep resolving.
@Book{cc:BrueggerMiligan:2018:SAGE-handbook-web-history,
  title = "The {SAGE} Handbook of Web History",
  author = "Brügger, Niels and Milligan, Ian",
  year = "2019",
  URL = "https://us.sagepub.com/en-us/nam/the-sage-handbook-of-web-history/book252251",
  publisher = "SAGE Publications Limited",
  cc-author-affiliation = "Aarhus University, Denmark; University of Waterloo, Canada",
  cc-class = "web-science, web history",
}
% CCNet paper. The eprint id is an arXiv identifier (see URL), so archiveprefix is stated explicitly,
% matching the other arXiv entries of this file that carry eprint metadata.
@Article{cc:WenzekLachauxConneauChaudharyEtAl:2019:CCNet,
  author = "Guillaume Wenzek and Marie-Anne Lachaux and Alexis Conneau and Vishrav Chaudhary and Francisco Guzmán
    and Armand Joulin and Edouard Grave",
  title = "{CCN}et: Extracting high quality monolingual datasets from web crawl data",
  journal = "CoRR",
  year = "2019",
  URL = "https://arxiv.org/abs/1911.00359",
  eprint = "1911.00359",
  archiveprefix = "arXiv",
  cc-author-affiliation = "Facebook AI",
  cc-derived-dataset-about = "CCNet",
  abstract = "Pre-training text representations have led to significant improvements in many areas of natural
    language processing. The quality of these models benefits greatly from the size of the pretraining
    corpora as long as its quality is preserved. In this paper, we describe an automatic pipeline to
    extract massive high-quality monolingual datasets from Common Crawl for a variety of languages. Our
    pipeline follows the data processing introduced in fastText (Mikolov et al., 2017; Grave et al., 2018),
    that deduplicates documents and identifies their language. We augment this pipeline with a filtering
    step to select documents that are close to high quality corpora like Wikipedia.",
  cc-snippet = "[about https://github.com/facebookresearch/cc_net] In this paper, we present a data collection
    pipeline that allows to gather massive monolingual corpora of high quality in a variety of languages,
    including many low-resource ones. The principles of our pipeline are general and we show the results of
    its application to data collected by the Common Crawl project.¹ Common Crawl is a massive non-curated
    dataset of webpages in many languages, mixed together in temporal snapshots of the web.",
  cc-class = "nlp/corpus-construction, nlp/web-as-corpus, nlp/low-resource-language",
}
% No proceedings title is recorded for this work, so it is typed as a technical report of the authors'
% institution (see cc-author-affiliation) instead of a proceedings paper with a missing booktitle.
% Given names are spelled out and text-extraction artifacts in the snippet are repaired.
@TechReport{cc:RadfordWuChildLuanEtAl:2019:language-models,
  title = "Language models are unsupervised multitask learners",
  author = "Alec Radford and Jeffrey Wu and Rewon Child and David Luan and Dario Amodei and Ilya Sutskever",
  institution = "OpenAI",
  year = "2019",
  URL = "https://www.semanticscholar.org/paper/Language-Models-are-Unsupervised-Multitask-Learners-Radford-Wu/9405cc0d6169988371b2755e573cc28650d14dfe",
  cc-snippet = "A promising source of diverse and nearly unlimited text is web scrapes such as Common Crawl. While
    these archives are many orders of magnitude larger than current language modeling datasets, they have
    significant data quality issues. Trinh & Le (2018) used Common Crawl in their work on commonsense
    reasoning but noted a large amount of documents “whose content are mostly unintelligible”. We
    observed similar data issues in our initial experiments with Common Crawl. Trinh & Le (2018)’s best
    results were achieved using a small subsample of Common Crawl which included only documents most
    similar to their target dataset, the Winograd Schema Challenge. While this is a pragmatic approach to
    improve performance on a specific task, we want to avoid making assumptions about the tasks to be
    performed ahead of time. Instead, we created a new web scrape which emphasizes document quality. To do
    this we only scraped web pages which have been curated/filtered by humans. Manually filtering a full
    web scrape would be exceptionally expensive so as a starting point, we scraped all outbound links from
    Reddit, a social media platform, which received at least 3 karma. This can be thought of as a heuristic
    indicator for whether other users found the link interesting, educational, or just funny. The resulting
    dataset, WebText, contains the text subset of these 45 million links.",
  cc-class = "cc-cited-not-used",
  cc-author-affiliation = "OpenAI, San Francisco, California, United States",
}
% Workshop paper describing the pipeline behind the derived dataset named in cc-derived-dataset-about.
% The title is no longer wrapped in an extra brace pair, so styles can apply their own casing; its
% title-case spelling follows the pdf file name.
% NOTE(review): address presumably holds the workshop venue rather than the publisher's city - confirm.
@InProceedings{cc:Ortiz-SuarezSagotRomary:2019:processing-huge-corpora,
  title = "Asynchronous Pipeline for Processing Huge Corpora on Medium to Low Resource Infrastructures",
  author = "Ortiz Suárez, Pedro Javier and Sagot, Benoît and Romary, Laurent",
  URL = "https://hal.inria.fr/hal-02148693",
  booktitle = "7th Workshop on the Challenges in the Management of Large Corpora (CMLC-7)",
  address = "Cardiff, United Kingdom",
  editor = "Piotr Bański and Adrien Barbaresi and Hanno Biber and Evelyn Breiteneder and Simon Clematide and Marc
    Kupietz and Harald Lüngen and Caroline Iliadi",
  publisher = "{Leibniz-Institut für Deutsche Sprache}",
  year = "2019",
  doi = "10.14618/IDS-PUB-9021",
  pdf = "https://hal.inria.fr/hal-02148693/file/Asynchronous_Pipeline_for_Processing_Huge_Corpora_on_Medium_to_Low_Resource_Infrastructures.pdf",
  cc-author-affiliation = "Inria, Paris, France; Sorbonne University, Paris, France",
  cc-class = "nlp/corpus-construction",
  cc-derived-dataset-about = "OSCAR",
  cc-dataset-used = "CC-MAIN-2018-47 (WET)",
  cc-snippet = "We use the November 2018 snapshot which surpasses 20TB of uncompressed data and contains more than 50
    thousand plain text files where each file consists of the plain text from multiple websites along its
    metadata header. From now on, when we mention the “Common Crawl” corpus, we refer to this
    particular November 2018 snapshot.",
}
% Master's thesis. The required school field is filled from cc-author-affiliation; the German nouns in
% the title are brace-protected so sentence-casing styles do not lowercase them.
@MastersThesis{cc:Mottl:2019:branchenklassifikation,
  author = "Dominik Mottl",
  title = "{Multi-Label} {Branchenklassifikation} von {Web-Texten}",
  school = "Hochschule Darmstadt",
  year = "2019",
  pdf = "https://fbmn.h-da.de/uploads/Themen/WS18_thesis_mottl.pdf",
  cc-snippet = "NER of company names and linking to DBpedia performed on English texts in 712 WET files of November
    2018 crawl (CC-MAIN-2018-47) using cc-pyspark.",
  cc-author-affiliation = "Hochschule Darmstadt, Germany",
  cc-class = "nlp/NER, entity-linking",
}
% Conference poster (see howpublished); URL is the archived record, pdf the poster file itself.
@misc{cc:Nagel:2019:accessing-warc-files-via-sql,
  author                = {Nagel, Sebastian},
  title                 = {Accessing {WARC} files via {SQL}},
  howpublished          = {Poster at IIPC Web Archiving Conference, 6–7 June 2019, Zagreb, Croatia},
  year                  = {2019},
  URL                   = {https://digital.library.unt.edu/ark:/67531/metadc1608961/},
  pdf                   = {https://netpreserve.org/ga2019/wp-content/uploads/2019/07/IIPCWAC2019-SEBASTIAN_NAGEL-Accessing_WARC_files_via_SQL-poster.pdf},
  cc-class              = {web-archiving, SQL, Parquet},
  cc-dataset-used       = {cc-index-table},
  cc-author-affiliation = {Common Crawl, USA},
}
% RoBERTa preprint on arXiv; the cc-* fields record which Common Crawl dataset was used and which
% dataset was derived from it.
@misc{cc:LiuOttGoyalDuEtAl:2019:RoBERTa-Robustly-Optimized-BERT,
  author                   = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy
    and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov},
  title                    = {Ro{BERT}a: {A} Robustly Optimized {BERT} Pretraining Approach},
  year                     = {2019},
  eprint                   = {1907.11692},
  archiveprefix            = {arXiv},
  primaryclass             = {cs.CL},
  URL                      = {https://arxiv.org/abs/1907.11692},
  pdf                      = {https://arxiv.org/pdf/1907.11692.pdf},
  cc-class                 = {nlp/corpus-construction, nlp/language-model},
  cc-dataset-used          = {CC-NEWS},
  cc-derived-dataset-about = {CC-NEWS-RoBERTa},
  cc-author-affiliation    = {Paul G. Allen School of Computer Science & Engineering, University of Washington, Seattle,
    WA, USA; Facebook AI},
  cc-snippet               = {We find that BERT was significantly undertrained and propose an improved recipe for training BERT
    models, which we call RoBERTa, that can match or exceed the performance of all of the post-BERT
    methods. Our modifications are simple, they include: (1) training the model longer, with bigger
    batches, over more data; (2) removing the next sentence prediction objective; (3) training on longer
    sequences; and (4) dynamically changing the masking pattern applied to the training data. We also
    collect a large new dataset (CC-NEWS) of comparable size to other privately used datasets, to better
    control for training set size effects. [...] CC-NEWS, which we collected from the English portion of
    the CommonCrawl News dataset (Nagel, 2016). The data contains 63 million English news articles crawled
    between September 2016 and February 2019. (76GB after filtering).⁴ [⁴ We use news-please (Hamborg
    et al.,2017) to collect and extract CC-NEWS. CC-NEWS is similar to the REALNEWS dataset described in
    Zellers et al. (2019).]},
}
% Paper published in a conference proceedings volume (see booktitle), hence typed as a proceedings
% paper rather than as a chapter of an edited collection. All field values are unchanged.
@InProceedings{cc:ZellersHoltzmanRashkinBiskEtAl:2019:defending-against-neural-fake-news,
  title = "Defending against neural fake news",
  author = "Zellers, Rowan and Holtzman, Ari and Rashkin, Hannah and Bisk, Yonatan and Farhadi, Ali and Roesner,
    Franziska and Choi, Yejin",
  booktitle = "Advances in Neural Information Processing Systems 32",
  editor = "H. Wallach and H. Larochelle and A. Beygelzimer and F. d'Alché-Buc and E. Fox and R. Garnett",
  pages = "9054--9065",
  year = "2019",
  publisher = "Curran Associates, Inc.",
  URL = "http://papers.nips.cc/paper/9106-defending-against-neural-fake-news.pdf",
  cc-author-affiliation = "University of Washington, USA; Allen Institute for Artificial Intelligence, USA",
  cc-class = "nlp/language-model, nlp/fake-news-detection, nlp/text-classification, misinformation, disinformation",
  cc-snippet = "Dataset. We present RealNews, a large corpus of news articles from Common Crawl. Training Grover
    requires a large corpus of news articles with metadata, but none currently exists. Thus, we construct
    one by scraping dumps from Common Crawl, limiting ourselves to the 5000 news domains indexed by Google
    News. We used the Newspaper Python library to extract the body and meta-data from each article. News
    from Common Crawl dumps from December 2016 through March 2019 were used as training data; articles
    published in April 2019 from the April 2019 dump were used for evaluation. After deduplication,
    RealNews is 120 gigabytes without compression. [...] Obtaining the data required through Common Crawl
    cost \$10k in AWS credits and can be massively parallelized over many CPUs. [...]",
  cc-derived-dataset-about = "Grover-RealNews",
}
% Conference paper (the pdf file name ends in WSDM19): typed as a proceedings paper with a booktitle
% instead of an article without a journal. The doi is stored bare, without the resolver prefix.
@InProceedings{cc:PibiriPetriMoffat:2019:dictionary-based-inverted-index-compression,
  title = "Fast Dictionary-Based Compression for Inverted Indexes",
  author = "Pibiri, Giulio Ermanno and Petri, Matthias and Moffat, Alistair",
  booktitle = "Proceedings of the Twelfth {ACM} International Conference on Web Search and Data Mining",
  year = "2019",
  URL = "https://dl.acm.org/citation.cfm?id=3290962",
  pdf = "http://pages.di.unipi.it/pibiri/papers/WSDM19.pdf",
  doi = "10.1145/3289600.3290962",
  abstract = "Dictionary-based compression schemes provide fast decoding operation, typically at the expense of
    reduced compression effectiveness compared to statistical or probability-based approaches. In this
    work, we apply dictionary-based techniques to the compression of inverted lists, showing that the high
    degree of regularity that these integer sequences exhibit is a good match for certain types of
    dictionary methods, and that an important new trade-off balance between compression effectiveness and
    compression efficiency can be achieved. Our observations are supported by experiments using the
    document-level inverted index data for two large text collections, and a wide range of other index
    compression implementations as reference points. Those experiments demonstrate that the gap between
    efficiency and effectiveness can be substantially narrowed.",
  cc-snippet = "We use the standard Gov2 collection containing 426 GiB of text; and CCNEWS, an English subset of the
    freely available NEWS subset of the CommonCrawl¹
    [¹http://commoncrawl.org/2016/10/news-dataset-available/], consisting of news articles in the period
    09/01/16 to 30/03/18, following the methodology of Petri and Moffat [28].",
  cc-dataset-used = "CC-NEWS",
  cc-dataset-used-subset = "2016/09/01 - 2018/03/30",
  cc-class = "information-retrieval/search-engine, information-retrieval/inverted-index",
  cc-author-affiliation = "University of Melbourne, Australia; University of Pisa, Italy; ISTI-CNR, Pisa, Italy",
}
% CCMatrix arXiv preprint. The eprint fields are derived from the URL and from the cs.CL keyword; the
% keyword that was listed twice is kept once; the line-break artifact "pre- processing" in the snippet
% is rejoined.
@Misc{cc:SchwenkWenzekEdunovGraveEtAl:2019:CCMatrix-high-quality-parallel-sentences,
  doi = "10.48550/ARXIV.1911.04944",
  URL = "https://arxiv.org/abs/1911.04944",
  eprint = "1911.04944",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  author = "Schwenk, Holger and Wenzek, Guillaume and Edunov, Sergey and Grave, Edouard and Joulin, Armand",
  keywords = "Computation and Language (cs.CL), FOS: Computer and information sciences",
  title = "{CCM}atrix: Mining Billions of High-Quality Parallel Sentences on the {WEB}",
  publisher = "arXiv",
  year = "2019",
  abstract = "We show that margin-based bitext mining in a multilingual sentence space can be applied to monolingual
    corpora of billions of sentences. We are using ten snapshots of a curated common crawl corpus (Wenzek
    et al., 2019), totalling 32.7 billion unique sentences. Using one unified approach for 38 languages, we
    were able to mine 4.5 billions parallel sentences, out of which 661 million are aligned with English.
    20 language pairs have more then 30 million parallel sentences, 112 more then 10 million, and most more
    than one million, including direct alignments between many European or Asian languages.",
  cc-snippet = "The curated Common Crawl corpus¶ In this work, we propose to mine parallel sentences from the Web, by
    using the data released by the Common Crawl project.[⁵https://commoncrawl.org/] Each month, a
    snapshot of the Web containing terabytes of web pages in various languages is obtained by randomly
    exploring URLs. We start by applying some preprocessing steps to the raw text data, following the
    pipeline introduced by Wenzek et al. (2019) and leading to the CCNet dataset. The first step is to
    deduplicate the data at the paragraph level, as the original crawls contain up to 70\% of duplicated
    data. This preprocessing removes low quality content, such as boilerplate, navigation menus or cookie
    warnings. The second step of the pipeline is to identify the language of each document, using fastText6
    (Grave et al., 2018). This language identifier uses a linear classifier with character n-gram features,
    and can recognize up to 176 languages. Finally, the last step of the preprocessing is to filter low
    quality content by training a language model on Wikipedia, and only keeping documents with a low
    perplexity score. We refer the reader to Wenzek et al. (2019) for more details about this
    pre-processing pipeline. In Figure 1, we report the number of unique sentences obtained after preprocessing
    ten snapshots from Common Crawl. We currently process 38 languages. The English Web content is abundant
    and we used only one snapshot.",
  cc-author-affiliation = "Facebook AI",
  cc-class = "nlp/corpus-construction, nlp/parallel-corpus, nlp/machine-translation",
  cc-derived-dataset-about = "CCMatrix",
}
% arXiv preprint. The eprint id is taken from the URL; the web address quoted inside the snippet had
% been split by a line break ("commoncrawl. org") and is rejoined.
@Misc{cc:BakhtinGrossOttDengEtAl:2019:real-or-fake,
  doi = "10.48550/ARXIV.1906.03351",
  URL = "https://arxiv.org/abs/1906.03351",
  eprint = "1906.03351",
  archiveprefix = "arXiv",
  author = "Bakhtin, Anton and Gross, Sam and Ott, Myle and Deng, Yuntian and Ranzato, Marc'Aurelio and Szlam,
    Arthur",
  title = "Real or Fake? Learning to Discriminate Machine from Human Generated Text",
  publisher = "arXiv",
  year = "2019",
  cc-snippet = "CCNews: We collect a de-duplicated subset of the English portion of the CommonCrawl news dataset
    (Nagel, 2016) [Sebastian Nagel. Cc-news.
    http://web.archive.org/save/http://commoncrawl.org/2016/10/news-dataset-available/, 2016.], which
    totals around 16 Billion words.",
  cc-dataset-used = "CC-NEWS",
  cc-dataset-used-subset = "2016 - 2018",
  cc-derived-dataset-about = "CCNews (Bakhtin, et al. 2019)",
  cc-author-affiliation = "Facebook AI Research; Harvard University, USA",
  cc-class = "nlp/text-classification",
}
% XLNet arXiv preprint. The eprint id is taken from the URL. The two affiliations are separated by a
% semicolon, as in the other entries of this file, and the citation bracket "[5]" in the snippet is
% rebalanced.
@Misc{cc:YangDaiYangCarbonellEtAl:2019:XLNet,
  URL = "https://arxiv.org/abs/1906.08237",
  eprint = "1906.08237",
  archiveprefix = "arXiv",
  author = "Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Ruslan and Le,
    Quoc V.",
  title = "{XLN}et: Generalized Autoregressive Pretraining for Language Understanding",
  year = "2019",
  cc-snippet = "Following BERT [10], we use the BooksCorpus [40] and English Wikipedia as part of our pretraining
    data, which have 13GB plain text combined. In addition, we include Giga5 (16GB text) [26], ClueWeb
    2012-B (extended from [5]), and Common Crawl [6] for pretraining. We use heuristics to aggressively
    filter out short or low-quality articles for ClueWeb 2012-B and Common Crawl, which results in 19GB and
    110GB text respectively.",
  cc-author-affiliation = "Carnegie Mellon University; Google AI Brain Team",
  cc-class = "nlp/transformer-language-model",
}
% XLM-R arXiv preprint. The eprint fields are derived from the URL and from the cs.CL keyword; the
% keyword that was listed twice is kept once; words that text extraction had split at line-end hyphens
% in the abstract are rejoined. The citation key is left untouched so existing citations keep resolving.
@Misc{cc:ConneauKhandelwalGoyalChaudharyEtAl:2019:XML-R,
  doi = "10.48550/ARXIV.1911.02116",
  URL = "https://arxiv.org/abs/1911.02116",
  eprint = "1911.02116",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  author = "Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume
    and Guzmán, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin",
  keywords = "Computation and Language (cs.CL), FOS: Computer and information sciences",
  title = "Unsupervised Cross-lingual Representation Learning at Scale",
  publisher = "arXiv",
  year = "2019",
  pdf = "https://arxiv.org/pdf/1911.02116.pdf",
  abstract = "This paper shows that pretraining multilingual language models at scale leads to significant
    performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based
    masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl
    data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of
    cross-lingual benchmarks, including +14.6\% average accuracy on XNLI, +13\% average F1 score on MLQA,
    and +2.4\% F1 score on NER. XLM-R performs particularly well on low-resource languages, improving
    15.7\% in XNLI accuracy for Swahili and 11.4\% for Urdu over previous XLM models. We also present a
    detailed empirical analysis of the key factors that are required to achieve these gains, including the
    trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and
    low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual
    modeling without sacrificing per-language performance; XLM-R is very competitive with strong
    monolingual models on the GLUE and XNLI benchmarks. We will make our code, data and models publicly
    available.",
  cc-author-affiliation = "Facebook AI",
  cc-class = "nlp/corpus-construction, nlp/web-as-corpus, nlp/language-model",
  cc-derived-dataset-about = "CC-100",
  cc-derived-dataset-used = "CCNet",
  cc-snippet = "Following Wenzek et al. (2019) 2, we build a clean CommonCrawl Corpus in 100 languages. [...] In this
    work, we introduced XLM-R, our new state of the art multilingual masked language model trained on 2.5
    TB of newly created clean CommonCrawl data in 100 languages.",
}