-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathcc2022.bib
More file actions
971 lines (934 loc) · 80.9 KB
/
Copy pathcc2022.bib
File metadata and controls
971 lines (934 loc) · 80.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
@Misc{cc:SnæbjarnarsonSímonarsonRagnarssonIngólfsdóttirEtAl:2022:warm-start-and-clean-crawled-corpus,
  title = "A Warm Start and a Clean Crawled Corpus -- {A} Recipe for Good Language Models",
  author = "Vésteinn Snæbjarnarson and Haukur Barri Símonarson and Pétur Orri Ragnarsson and Svanhvít Lilja
    Ingólfsdóttir and Haukur Páll Jónsson and Vilhjálmur Þorsteinsson and Hafsteinn Einarsson",
  year = "2022",
  eprint = "2201.05601",
  archivePrefix = "arXiv",
  pdf = "https://arxiv.org/pdf/2201.05601.pdf",
  URL = "https://arxiv.org/abs/2201.05601",
  abstract = "We train several language models for Icelandic, including IceBERT, that achieve state-of-the-art
    performance in a variety of downstream tasks, including part-of-speech tagging, named entity
    recognition, grammatical error detection and constituency parsing. To train the models we introduce a
    new corpus of Icelandic text, the Icelandic Common Crawl Corpus (IC3), a collection of high quality
    texts found online by targeting the Icelandic top-level-domain (TLD). Several other public data sources
    are also collected for a total of 16GB of Icelandic text. To enhance the evaluation of model
    performance and to raise the bar in baselines for Icelandic, we translate and adapt the WinoGrande
    dataset for co-reference resolution. Through these efforts we demonstrate that a properly cleaned
    crawled corpus is sufficient to achieve state-of-the-art results in NLP applications for low to medium
    resource languages, by comparison with models trained on a curated corpus. We further show that
    initializing models using existing multilingual models can lead to state-of-the-art results for some
    downstream tasks.",
  cc-author-affiliation = "Miðeind ehf., Iceland; University of Iceland, Iceland",
  cc-class = "nlp/corpus-construction, nlp/language-model",
  cc-dataset-used = "CDX, WARC, ARC 2008 – March 2020",
  cc-snippet = "3.1. The Icelandic Common Crawl Corpus¶ The Common Crawl Foundation is a non-profit organization that
    scrapes large semi-random subsets of the internet regularly and hosts timestamped and compressed dumps
    of the web online¹⁰ [¹⁰https://commoncrawl.org/the-data/get-started/]. Each dump contains
    billions of web pages occupying hundreds of terabytes. Parsing these files directly requires storage
    and computing power not directly available to most and can come at a significant financial cost. The
    foundation also hosts indices of URIs and their locations within the large zipped dump files. While
    these indices are also large, their processing is feasible with a few terabytes of storage.¶ 3.1.1.
    Extracting Icelandic Common Crawl data¶ The Common Crawl indices, which contain URI and byte offsets
    within the compressed dumps, are used to reduce the search space when looking for Icelandic texts. The
    Common Crawl Index Server has a public API¹¹ [¹¹https://index.commoncrawl.org/] where URIs can be
    queried based on attributes such as date, MIME-type and substring. Using the API eliminates the need to
    fetch the massive index files. To extract Icelandic, the .is pattern is targeted to match the Icelandic
    top level domain (TLD), resulting in 63.5 million retrieved pages with URIs and byte locations within
    the compressed Common Crawl dumps. The computational efficiency of our method can be attributed to
    these steps. Given the predominant use of the .is TLD for Icelandic web content, we assume that other
    TLDs have a much lower proportion of Icelandic content. That said, a nontrivial amount of text in
    Icelandic is still likely to be found outside the .is domain and could be extracted by, e.g., parsing
    the whole Common Crawl, albeit at a much higher computational cost.¶ By targeting only the
    byte-offsets corresponding to the Icelandic TLD we extract candidate websites that have a high
    proportion of Icelandic content. In total, the compressed content is 687GiB on disk. All dumps since
    the start of the Common Crawl in 2008 until March 2020 were included.¶ Plain text was extracted from
    the collected WARC (Web Archive format) files using jusText (Pomikálek, 2011)¹² to remove boilerplate
    content and HTML tags.",
}
@Misc{cc:ArtetxeAldabeAgerriPerez-de-ViñaspreEtAl:2022:corpus-quality-low-resource-languages,
  DOI = "10.48550/ARXIV.2203.08111",
  URL = "https://arxiv.org/abs/2203.08111",
  eprint = "2203.08111",
  archivePrefix = "arXiv",
  author = "Artetxe, Mikel and Aldabe, Itziar and Agerri, Rodrigo and Perez-de-Viñaspre, Olatz and Soroa, Aitor",
  keywords = "Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS:
    Computer and information sciences",
  title = "Does Corpus Quality Really Matter for Low-Resource Languages?",
  publisher = "arXiv",
  year = "2022",
  abstract = "The vast majority of non-English corpora are derived from automatically filtered versions of
    CommonCrawl. While prior work has identified major issues on the quality of these datasets (Kreutzer et
    al., 2021), it is not clear how this impacts downstream performance. Taking Basque as a case study, we
    explore tailored crawling (manually identifying and scraping websites with high-quality content) as an
    alternative to filtering CommonCrawl. Our new corpus, called EusCrawl, is similar in size to the Basque
    portion of popular multilingual corpora like CC100 and mC4, yet it has a much higher quality according
    to native annotators. For instance, 66\% of documents are rated as high-quality for EusCrawl, in
    contrast with <33\% for both mC4 and CC100. Nevertheless, we obtain similar results on downstream tasks
    regardless of the corpus used for pre-training. Our work suggests that NLU performance in low-resource
    languages is primarily constrained by the quantity rather than the quality of the data, prompting for
    methods to exploit more diverse data sources.",
  cc-snippet = "In this paper, we explore tailored crawling (i.e., manually identifying and scraping websites with
    high-quality content) as an alternative to filtering CommonCrawl. Taking Basque as a case study, we
    collect 12.5M documents from 33 websites with Creative Commons content. The resulting corpus, called
    EusCrawl, is similar in size to the Basque portion of CC100 and mC4, but it has substantially less
    issues and a higher perceived quality according to our blind audit with native annotators. However, we
    find that this improvement does not carry over to downstream tasks, as masked language models
    pre-trained on either corpora obtain similar results on 5 NLU benchmarks. Our results suggests that
    data quantity and domain play a more important role, prompting for methods to exploit more diverse
    sources of data in low-resource languages.",
  cc-author-affiliation = "Meta AI; HiTZ Center - Ixa, University of the Basque Country UPV/EHU",
  cc-class = "nlp/corpus-construction, nlp/corpus-representativeness, nlp/corpus-quality, nlp/language-model,
    nlp/low-resource-languages",
}
@Misc{cc:BidermanBichenoGao:2022:datasheet-for-Pile,
  title = "Datasheet for the {Pile}",
  author = "Stella Biderman and Kieran Bicheno and Leo Gao",
  year = "2022",
  eprint = "2201.07311",
  archivePrefix = "arXiv",
  pdf = "https://arxiv.org/pdf/2201.07311.pdf",
  URL = "https://arxiv.org/abs/2201.07311",
  abstract = "This datasheet describes the Pile, a 825 GiB dataset of human-authored text compiled by EleutherAI for
    use in large-scale language modeling. The Pile is comprised of 22 different text sources, ranging from
    original scrapes done for this project, to text data made available by the data owners, to third-party
    scrapes available online.",
  cc-author-affiliation = "EleutherAI",
  cc-class = "nlp/corpus-construction, nlp/corpus-datasheet, nlp/corpus-representativeness",
  cc-derived-dataset-about = "The-Pile-English",
  cc-snippet = "Pile-CC: The Pile-CC dataset is a sample from the Common Crawl WARCs that has been converted to text
    using jusText [Endrédy and Novák, 2013].¶ [...] Pile-CC: The Pile-CC dataset was created to be
    included in the Pile. The underlying data comes from the Common Crawl, which was created to give people
    access to the wealth of information contained in the internet. Its creators were concerned that only
    data mining companies would be able to collect this data, and has the explicit aim of democratizing
    technology.¶ [...] Pile-CC: The data is sourced from Common Crawl, a non-profit 501(c)(3) organization
    founded by Gil Elbaz. The data from Common Crawl was processed by EleutherAI into Pile-CC.¶ [...]
    Pile-CC: Instances are webpages.¶ [...] Pile-CC: 54, 953, 117 documents, totaling 227.12 GiB.¶ [...]
    Pile-CC: A tiny fraction of the entire Common Crawl was included, chosen arbitrarily and heavily
    filtered as detailed in Gao et al. [2020].¶ [...] Pile-CC: Data in the Pile-CC dataset were scraped
    from websites by the Common Craw and then downloaded directly from the Common Craw by EleutherAI.¶
    [...] Pile-CC: The earliest date of contents in Pile-CC is unknown.¶",
}
@TechReport{cc:Andersson-Schwarz:2022:quantification-online-linguistic-data,
  title = "The hitchhiker's guide Method handbook for quantification of online linguistic data in a
    country-specific context. Official research report, Linguistic Explorations of Societies (Work Package
    1)",
  author = "Andersson Schwarz, Jonas",
  institution = "Göteborgs Universitet",
  year = "2022",
  pdf = "https://gupea.ub.gu.se/bitstream/handle/2077/70890/2022_1_Andersson%20Schwarz.pdf",
  cc-snippet = "Central actors (in no particular order)¶ CommonCrawl. California-based non-profit organization that
    makes monthly crawls of the openly available Web and provides datasets and metadata to the public
    freely. The CommonCrawl corpus contains petabytes of data including raw web page data, metadata data
    and text data collected since 2011. Since 2012, CommonCrawl’s archive is hosted by Amazon Web
    Services as part of its Public Data Sets program. Every crawl contains around 300 terabytes of data and
    roughly 3 billion pages. In 2020, a filtered version of this CommonCrawl archive was used to train
    OpenAI’s GPT-3 language model.¶ [...] Similarly, CommonCrawl (2021) provides an aggregate listing
    the percentages of their database covered by each language – measured as the primary language of each
    html document, as identified by the Compact Language Detector 2 (CLD2) algorithm. This was included as
    a good benchmark to compare with.¶ [...] In comparison, when plotting the currently stated language
    distribution of CommonCrawl (2021) in relation to the same population numbers of L1 and L2 speakers,
    the CommonCrawl distribution displays a similarly low kurtosis and skewness.",
  cc-author-affiliation = "Göteborgs Universitet, Sweden",
  cc-class = "nlp/corpus-construction, nlp/corpus-representativeness",
}
@Misc{cc:MorishitaChousaSuzukiNagata:2022:JParaCrawl,
  DOI = "10.48550/ARXIV.2202.12607",
  URL = "https://arxiv.org/abs/2202.12607",
  pdf = "https://arxiv.org/pdf/2202.12607.pdf",
  eprint = "2202.12607",
  archivePrefix = "arXiv",
  author = "Morishita, Makoto and Chousa, Katsuki and Suzuki, Jun and Nagata, Masaaki",
  title = "{JParaCrawl} v3.0: A Large-scale {English-Japanese} Parallel Corpus",
  year = "2022",
  abstract = "Most current machine translation models are mainly trained with parallel corpora, and their
    translation accuracy largely depends on the quality and quantity of the corpora. Although there are
    billions of parallel sentences for a few language pairs, effectively dealing with most language pairs
    is difficult due to a lack of publicly available parallel corpora. This paper creates a large parallel
    corpus for English-Japanese, a language pair for which only limited resources are available, compared
    to such resource-rich languages as English-German. It introduces a new web-based English-Japanese
    parallel corpus named JParaCrawl v3.0. Our new corpus contains more than 21 million unique parallel
    sentence pairs, which is more than twice as many as the previous JParaCrawl v2.0 corpus. Through
    experiments, we empirically show how our new corpus boosts the accuracy of machine translation models
    on various domains. The JParaCrawl v3.0 corpus will eventually be publicly available online for
    research purposes.",
  cc-snippet = "Our method extracts parallel sentences from the web. Thus, the first step is finding a website that
    has parallel sentences. This method is based on the hypothesis that websites containing the same
    English and Japanese sentences might have parallel texts. To list such parallel websites, we analyzed
    all the Common Crawl text archive data released from March 2019 to August 2021³. [³During this
    period, the Common Crawl project released 25 archives, and their text size was about 212 TB.] We
    identified the language in the archive by CLD2⁴ [⁴ https://github.com/CLD2Owners/cld2] and listed
    100,000 large websites that roughly have the same size of English and Japanese texts. For this step, we
    used extractor⁵ [⁵ https://github.com/paracrawl/extractor] that was provided by the ParaCrawl
    project.",
  cc-author-affiliation = "NTT Communication Science Laboratories, NTT Corporation, Japan",
  cc-class = "nlp/machine-translation, nlp/parallel-corpus, nlp/corpus-construction",
}
@InProceedings{cc:LAKIMAlmazroueiAlhaolDebbahEtAl:2022:carbon-footprint,
  title = "A Holistic Assessment of the Carbon Footprint of {Noor}, a Very Large {Arabic} Language Model",
  author = "Imad LAKIM and Ebtesam Almazrouei and Ibrahim Abu Alhaol and Merouane Debbah and Julien Launay",
  booktitle = "Challenges {\&} Perspectives in Creating Large Language Models",
  year = "2022",
  URL = "https://openreview.net/forum?id=B-lS3zH8Zq",
  abstract = "As ever larger language models grow more ubiquitous, it is crucial to consider their environmental
    impact. Characterised by extreme size and resource use, recent generations of models have been
    criticised for their voracious appetite for compute, and thus significant carbon footprint. Although
    reporting of carbon impact has grown more common in machine learning papers, this reporting is usually
    limited to compute resources used strictly for training. In this work, we propose a holistic assessment
    of the footprint of an extreme-scale language model, Noor. Noor is an ongoing project aiming to develop
    the largest multi-task Arabic language models--with up to 13B parameters--leveraging zero-shot
    generalisation to enable a wide range of downstream tasks via natural language instructions. We assess
    the total carbon bill of the entire project: starting with data collection and storage costs, including
    research and development budgets, pretraining costs, future serving estimates, and other exogenous
    costs necessary for this international cooperation. Notably, we find that inference costs and exogenous
    factors can have a significant impact on total budget. Finally, we discuss pathways to reduce the
    carbon footprint of extreme-scale models.",
  cc-author-affiliation = "TII, Abu Dhabi, United Arab Emirates; LightOn, Paris, France",
  cc-class = "nlp/language-model, nlp/transformer-language-model, carbon-footprint",
  cc-snippet = "We use Common Crawl (CC) for acquiring large amounts of web data. Each CC dump is on average around
    10TB, and we discard it immediately after processing it. On average, it takes 24 hours to fully process
    a dump: we used 21 dumps from CC, meaning we stored 210TB of data for 24hours, equivalent to 57 kWh of
    energy consumption. After processing the dumps, we got on average 1.2TB of data per dump, thus 25TB in
    total. Considering that this data will be stored for 6 months, we end up with 1.3 MWh of energy
    consumption for the bulk data. Note that we keep the processed data in all languages (not just Modern
    Standard Arabic).",
}
@Article{cc:Gutiérrez-FandiñoPérez-FernándezArmengol-EstapéGriolEtAl:2022:esCorpius,
  author = "Gutiérrez-Fandiño, Asier and Pérez-Fernández, David and Armengol-Estapé, Jordi and Griol, David
    and Callejas, Zoraida",
  title = "{esCorpius}: A Massive {Spanish} Crawling Corpus",
  journal = "arXiv e-prints",
  keywords = "Computer Science - Computation and Language, Computer Science - Artificial Intelligence",
  year = "2022",
  eid = "arXiv:2206.15147",
  eprint = "2206.15147",
  archivePrefix = "arXiv",
  URL = "https://ui.adsabs.harvard.edu/abs/2022arXiv220615147G",
  pdf = "https://arxiv.org/pdf/2206.15147.pdf",
  adsnote = "Provided by the SAO/NASA Astrophysics Data System",
  cc-snippet = "[…] In this paper, we introduce esCorpius, a Spanish crawling corpus obtained from near 1 Pb of
    Common Crawl data. It is the most extensive corpus in Spanish with this level of quality in the
    extraction, purification and deduplication of web textual content […] A total of 39,502 compressed
    WARC (Web Archive) from Common Crawl files were processed (see section 3.3 for more details). The
    compressed information occupied about 180 TB and the size of the processed decompressed information is
    estimated to be more than 0.8 PB. Prior to content deduplication, the downloaded corpus was composed of
    106.768.594.753 words, 3.129.248.875 lines and 163.518.405 web pages. The deduplicated and cleaned
    corpus size is 346.262.072.705 bytes (322.5 GB), with 104.073.706 total number of lines, 50.040.055.322
    tokens, 1.125.798.968 paragraphs and 2.421.598.201 sentences.",
  cc-author-affiliation = "LHF Labs; Universidad Autónoma de Madrid, Spain; University of Edinburgh, United Kingdom;
    Universidad de Granada, Spain",
  cc-class = "nlp/corpus-construction, nlp/text-corpora",
}
@InProceedings{cc:OverwijkXiongCallan:2022:ClueWeb22,
  author = "Overwijk, Arnold and Xiong, Chenyan and Callan, Jamie",
  title = "{ClueWeb22}: 10 Billion Web Documents with Rich Information",
  year = "2022",
  ISBN = "978-1-4503-8732-3",
  publisher = "Association for Computing Machinery",
  address = "New York, NY, USA",
  URL = "https://doi.org/10.1145/3477495.3536321",
  pdf = "https://dl.acm.org/doi/pdf/10.1145/3477495.3536321",
  doi = "10.1145/3477495.3536321",
  abstract = "ClueWeb22, the newest iteration of the ClueWeb line of datasets, is the result of more than a year of
    collaboration between industry and academia. Its design is influenced by the research needs of the
    academic community and the real-world needs of large-scale industry systems. Compared with earlier
    ClueWeb datasets, the ClueWeb22 corpus is larger, more varied, and has higher-quality documents. Its
    core is raw HTML, but it includes clean text versions of documents to lower the barrier to entry.
    Several aspects of ClueWeb22 are available to the research community for the first time at this scale,
    for example, visual representations of rendered web pages, parsed structured information from the HTML
    document, and the alignment of document distributions (domains, languages, and topics) to commercial
    web search. This talk shares the design and construction of ClueWeb22, and discusses its new features.
    We believe this newer, larger, and richer ClueWeb corpus will enable and support a broad range of
    research in IR, NLP, and deep learning.",
  booktitle = "Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information
    Retrieval",
  pages = "3360--3362",
  numpages = "3",
  keywords = "clueweb, web corpus, dataset",
  location = "Madrid, Spain",
  series = "SIGIR '22",
  cc-snippet = "One approach is to sift CommonCrawl data, eg, the C4 dataset used to pretrain T5 [10], which provides
    sufficient quantity, but the quality quickly becomes a concern. For example, the cleaned CommonCrawl
    reflects a quite weird distribution of the web [5]. Language models pretrained on C4 often perform
    worse than models pretrained on higher quality corpora at the same scale. With ClueWeb22, we aim to
    provide the web corpus for research in the near future. The design of ClueWeb22 emphasizes on these
    goals: 1) to reflect the distribution of the web in real scenarios; 2) to provide web pages at large
    quantity and also with high quality; 3) to enable new research directions by including information
    important in industry but previously not publicly available.",
  cc-author-affiliation = "Microsoft; Carnegie Mellon University",
  cc-class = "cc-cited-not-used, nlp/corpus-construction, nlp/text-corpora, information-retrieval",
}
@Misc{cc:ZhangRollerGoyalArtetxeEtAl:2022:OPT-open-pretrained-transformer,
  doi = "10.48550/ARXIV.2205.01068",
  URL = "https://arxiv.org/abs/2205.01068",
  pdf = "https://arxiv.org/pdf/2205.01068.pdf",
  eprint = "2205.01068",
  archivePrefix = "arXiv",
  cc-snippet = "All corpora were previously collected or filtered to contain predominantly English text, but a small
    amount of non-English data is still present within the corpus via CommonCrawl. We removed duplicated
    documents across all datasets by filtering …",
  cc-author-affiliation = "Meta AI",
  author = "Zhang, Susan and Roller, Stephen and Goyal, Naman and Artetxe, Mikel and Chen, Moya and Chen, Shuohui
    and Dewan, Christopher and Diab, Mona and Li, Xian and Lin, Xi Victoria and Mihaylov, Todor and Ott,
    Myle and Shleifer, Sam and Shuster, Kurt and Simig, Daniel and Koura, Punit Singh and Sridhar, Anjali
    and Wang, Tianlu and Zettlemoyer, Luke",
  title = "{OPT}: Open Pre-trained Transformer Language Models",
  publisher = "arXiv",
  year = "2022",
  cc-derived-dataset-used = "CC-Stories, Pile-CC, CC-NEWS-RoBERTa-v2",
  cc-class = "nlp/language-model, nlp/transformer-language-model, nlp/corpus-construction",
}
@InProceedings{cc:LugeonPiccardiWest:2022:Homepage2Vec,
  title = "{Homepage2Vec}: Language-Agnostic Website Embedding and Classification",
  author = "Lugeon, Sylvain and Piccardi, Tiziano and West, Robert",
  booktitle = "Proceedings of the International AAAI Conference on Web and Social Media",
  volume = "16",
  pages = "1285--1291",
  year = "2022",
  URL = "https://ojs.aaai.org/index.php/ICWSM/article/download/19380/19152",
  cc-snippet = "Top-level domain. Some top-level domains (TLD) such as .edu or .biz can offer a good hint about the
    website's content. For example, a typical use case for .edu is university websites, whereas .biz is
    commonly associated with business activities. Following this intuition, we collected from Common
    Crawl,⁵ a large-scale sample of the Web, the 19 most frequent TLDs: .com, .org, .net, .info, .xyz,
    .club, .biz, .top, .edu, .online, .pro, .site, .vip, .icu, .buzz, .app, .asia, .gov, .space, excluding
    the country code TLD (ccTLD) because they indicate geographic origin, not website content. We represent
    this feature with a one-hot encoding vector of 19 dimensions.",
  cc-author-affiliation = "EPFL, Switzerland",
  cc-class = "nlp/text-classification, web-site-classification",
}
@inproceedings{cc:ZirngiblDeuschSattlerAulbachEtAl:2022:domain-parking,
  author = {Zirngibl, Johannes and Deusch, Steffen and Sattler, Patrick and Aulbach, Juliane and Carle, Georg and
    Jonker, Mattijs},
  title = {Domain Parking: Largely Present, Rarely Considered!},
  booktitle = {Proc. Network Traffic Measurement and Analysis Conference (TMA) 2022},
  year = {2022},
  url = {https://mediatum.ub.tum.de/1661842},
  pdf = {https://www.net.in.tum.de/fileadmin/bibtex/publications/papers/zirngibl2022prevalenceofparking.pdf},
  abstract = {Domain parking typically involves leveraging advertisements to generate revenue on otherwise inactive
    domain names. Their content is rarely of real value to users and tends to be highly similar across
    parked domains. They have commonalities beyond content alone: parked domains can share hosting and DNS
    infrastructure. Parking rarely receives special treatment in existing studies (e.g., content analyses
    or infrastructure concentration studies). While the presence and possible bias introduced by parked
    pages is sometimes acknowledged in studies, the studies still treat parked domains as any other, either
    because differentiation is infeasible, or because doing so is considered out-of-scope. We argue that
    the impact of parked domains on analyses regarding the current state and future development of the
    Internet should not be overlooked. In this paper, we motivate this argument through quantification, and
    take steps towards helping other researchers identify parked domains. We systematically collect a list
    of 82 parking services and develop DNS-based indicators to help identify parked domains. We next
    quantify the presence of parked domains, using large-scale DNS data containing hundreds of millions of
    registered domain names, representative for a significant part of the global DNS namespace. Overall, we
    pinpoint 60 M parked domains, which is a significant percentage of all names under consideration (23\%)
    and identify up to 4\% of domains from top lists to be parked. These findings demonstrate that the
    effect of parked pages is potentially pronounced. We also break down into the various parking services
    and DNS zones. This helps us demonstrate and further discuss the effect that domain parking can have on
    research and Internet consolidation.},
  cc-author-affiliation = {Technical University of Munich, Germany; University of Twente, The Netherlands},
  cc-class = {web-science, internet/DNS, internet/domain-parking},
  cc-snippet = {Common Crawl While visual identification allowed us to validate the inferences to a reasonable extent,
    we wanted to upscale validation. Therefore, we consider Common Crawl (CC) data [21] [C. Crawl. (2022)
    The Common Crawl Corpus. [Online]. Available: https://commoncrawl.org/] and calculate the similarity of
    pages. Common Crawl is an open repository of web crawl data, collected at monthly intervals, accounting
    for hundreds of millions of unique domain names, and many more URLs. We consider CC data for Jan 2022
    and the ∼60 M parked domains that we identify on Jan 28th, 2022. We extract the HTML content of
    parked pages from CC data, only considering URLs that contain exactly the registered domain.
    Furthermore, we require the crawl target to have been the landing page (i.e., the path of the URL is /)
    and also to have resulted in a useful response (i.e., HTTP status code of 200). Given these filters,
    ∼1.29 M HTML rich responses can be obtained. We extract visible text and tokenize it into words,
    remove stop words, apply lemmatization, and create a vector for the most-frequently used words for each
    page.},
}
@InProceedings{cc:LuccioniCorrySridharanAnannyEtAl:2022:deprecating-datasets,
  author = "Luccioni, Alexandra Sasha and Corry, Frances and Sridharan, Hamsini and Ananny, Mike and Schultz,
    Jason and Crawford, Kate",
  title = "A Framework for Deprecating Datasets: Standardizing Documentation, Identification, and Communication",
  year = "2022",
  ISBN = "978-1-4503-9352-2",
  publisher = "Association for Computing Machinery",
  address = "New York, NY, USA",
  URL = "https://doi.org/10.1145/3531146.3533086",
  pdf = "https://facctconference.org/static/pdfs_2022/facct22-17.pdf",
  doi = "10.1145/3531146.3533086",
  abstract = "Datasets are central to training machine learning (ML) models. The ML community has recently made
    significant improvements to data stewardship and documentation practices across the model development
    life cycle. However, the act of deprecating, or deleting, datasets has been largely overlooked, and
    there are currently no standardized approaches for structuring this stage of the dataset life cycle. In
    this paper, we study the practice of dataset deprecation in ML, identify several cases of datasets that
    continued to circulate despite having been deprecated, and describe the different technical, legal,
    ethical, and organizational issues raised by such continuations. We then propose a Dataset Deprecation
    Framework that includes considerations of risk, mitigation of impact, appeal mechanisms, timeline,
    post-deprecation protocols, and publication checks that can be adapted and implemented by the ML
    community. Finally, we propose creating a centralized, sustainable repository system for archiving
    datasets, tracking dataset modifications or deprecations, and facilitating practices of care and
    stewardship that can be integrated into research and publication processes.",
  booktitle = "2022 ACM Conference on Fairness, Accountability, and Transparency",
  pages = "199--212",
  numpages = "14",
  keywords = "datasets, data stewardship, data management, dataset deprecation",
  location = "Seoul, Republic of Korea",
  series = "FAccT '22",
  cc-snippet = "When it comes to filtering large text datasets scraped from the Web, given their sheer size (C4
    represents 2.3 TB of data, whereas the Common Crawl has 139TB), filtering them is complex and
    time-consuming, although approaches have been proposed for reducing duplicates and train-test overlap
    [53]. [...] In practice, documenting and deprecating these datasets is akin to a game of whack-a-mole,
    since new versions of the Common Crawl come out every few months. Analyzing what they contain and their
    degrees of contamination through common evaluation tasks would take significant effort.",
  cc-author-affiliation = "Hugging Face; University of Southern California, USA; New York University, USA; Microsoft
    Research, USA",
  cc-class = "ai/ethics-of-machine-learning, nlp/text-corpora, nlp/corpus-construction, cc-cited-not-used",
}
@Article{cc:KreutzerCaswellWangWahabEtAl:2022:audit-web-multilingual-datasets,
author = "Kreutzer, Julia and Caswell, Isaac and Wang, Lisa and Wahab, Ahsan and van Esch, Daan and
Ulzii-Orshikh, Nasanbayar and Tapo, Allahsera and Subramani, Nishant and Sokolov, Artem and Sikasote,
Claytone and Setyawan, Monang and Sarin, Supheakmungkol and Samb, Sokhar and Sagot, Benoît and Rivera,
Clara and Rios, Annette and Papadimitriou, Isabel and Osei, Salomey and Ortiz Suarez, Pedro and Orife,
Iroro and Ogueji, Kelechi and Rubungo, Andre Niyongabo and Nguyen, Toan Q. and Müller, Mathias and
Müller, André and Muhammad, Shamsuddeen Hassan and Muhammad, Nanda and Mnyakeni, Ayanda and
Mirzakhalov, Jamshidbek and Matangira, Tapiwanashe and Leong, Colin and Lawson, Nze and Kudugunta,
Sneha and Jernite, Yacine and Jenny, Mathias and Firat, Orhan and Dossou, Bonaventure F. P. and
Dlamini, Sakhile and de Silva, Nisansa and Çabuk Ballı, Sakine and Biderman, Stella and Battisti,
Alessia and Baruwa, Ahmed and Bapna, Ankur and Baljekar, Pallavi and Azime, Israel Abebe and Awokoya,
Ayodele and Ataman, Duygu and Ahia, Orevaoghene and Ahia, Oghenefego and Agrawal, Sweta and Adeyemi,
Mofetoluwa",
title = "Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets",
journal = "Transactions of the Association for Computational Linguistics",
volume = "10",
pages = "50--72",
year = "2022",
month = jan,
abstract = "With the success of large-scale pre-training and multilingual modeling in Natural Language Processing
(NLP), recent years have seen a proliferation of large, Web-mined text datasets covering hundreds of
languages. We manually audit the quality of 205 language-specific corpora released with five major
public datasets (CCAligned, ParaCrawl, WikiMatrix, OSCAR, mC4). Lower-resource corpora have systematic
issues: At least 15 corpora have no usable text, and a significant fraction contains less than 50\%
sentences of acceptable quality. In addition, many are mislabeled or use nonstandard/ambiguous language
codes. We demonstrate that these issues are easy to detect even for non-proficient speakers, and
supplement the human audit with automatic analyses. Finally, we recommend techniques to evaluate and
improve multilingual corpora and discuss potential risks that come with low-quality data releases.",
ISSN = "2307-387X",
doi = "10.1162/tacl_a_00447",
URL = "https://doi.org/10.1162/tacl_a_00447",
pdf = "https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00447/1986585/tacl_a_00447.pdf",
cc-class = "nlp/corpus-construction, nlp/web-as-corpus, nlp/parallel-corpus, nlp/low-resource-language",
cc-derived-dataset-about = "CCAligned-2020, Tensorflow-C4-Multilingual, OSCAR",
cc-snippet = "We selected the corpora for their multilinguality and the inclusion of understudied languages in NLP.
With the exception of WikiMatrix and Paracrawl, all corpora are derived from CommonCrawl, and
distinguish themselves by the choice of filtering methods, LangID and automatic alignment technology.",
cc-author-affiliation = "Google Research; Masakhane NLP; Turkic Interlingua; Haverford College; RobotsMali; Intel
Labs; University of Zambia; Google; AIMS-AMMI; Inria; University of Zurich; Stanford University; Kwame
Nkrumah University of Science and Technology; Sorbonne Université; Niger-Volta LTI; University of
Waterloo; University of Electronic Science and Technology of China; University of Notre Dame; Bayero
University Kano; University of South Florida; Hugging Face; Jacobs University Bremen; University of
Moratuwa; EleutherAI; Obafemi Awolowo University; University of Ibadan; Instadeep; University of
Maryland; Defence Space Administration Abuja",
}
@Misc{cc:AbadjiSuarezRomarySagot:2022:cleaner-document-oriented-multilingual-crawled-corpus,
doi = "10.48550/ARXIV.2201.06642",
URL = "https://arxiv.org/abs/2201.06642",
pdf = "https://arxiv.org/pdf/2201.06642.pdf",
author = "Abadji, Julien and Ortiz Suarez, Pedro and Romary, Laurent and Sagot, Benoît",
title = "Towards a Cleaner Document-Oriented Multilingual Crawled Corpus",
publisher = "arXiv",
year = "2022",
abstract = "The need for raw large raw corpora has dramatically increased in recent years with the introduction of
transfer learning and semi-supervised learning methods to Natural Language Processing. And while there
have been some recent attempts to manually curate the amount of data necessary to train large language
models, the main way to obtain this data is still through automatic web crawling. In this paper we take
the existing multilingual web corpus OSCAR and its pipeline Ungoliant that extracts and classifies data
from Common Crawl at the line level, and propose a set of improvements and automatic annotations in
order to produce a new document-oriented version of OSCAR that could prove more suitable to pre-train
large generative language models as well as hopefully other applications in Natural Language Processing
and Digital Humanities.",
cc-derived-dataset-about = "OSCAR",
cc-author-affiliation = "Inria, France; Sorbonne Université, France",
cc-class = "nlp/corpus-construction, nlp/web-as-corpus",
}
@Misc{cc:TongjingYinBaoMeijers:2022:intercity-relationships,
title = "Dataset of intercity relationships between 293 {Chinese} cities extracted and classified on the basis of
toponym co-occurrences on {Common Crawl}",
year = "2022",
author = "Wang, Tongjing and Yin, Zhao and Bao, Ziyu and Meijers, Evert",
URL = "https://www.researchgate.net/profile/Evert-Meijers/publication/362952059_Dataset_of_intercity_relationships_between_293_Chinese_cities_extracted_and_classified_on_the_basis_of_toponym_co-occurrences_on_Common_Crawl/links/6308bfc25eed5e4bd11f7938/Dataset-of-intercity-relationships-between-293-Chinese-cities-extracted-and-classified-on-the-basis-of-toponym-co-occurrences-on-Common-Crawl.pdf",
pdf = "https://www.researchgate.net/profile/Evert-Meijers/publication/362952059_Dataset_of_intercity_relationships_between_293_Chinese_cities_extracted_and_classified_on_the_basis_of_toponym_co-occurrences_on_Common_Crawl/links/6308bfc25eed5e4bd11f7938/Dataset-of-intercity-relationships-between-293-Chinese-cities-extracted-and-classified-on-the-basis-of-toponym-co-occurrences-on-Common-Crawl.pdf",
abstract = "Although the importance of intercity relationships is theoretically acknowledged for cities’
socioeconomic development, the availability of such relational data often limits relevant urban
studies. One of the new approaches of collecting city relational data is to extract the co-appearance
of their place names from web texts. However, dealing with a gigantic web corpus is difficult for
domain researchers given the complexities of processing terabytes of raw data. This paper develops an
efficient and easy-to-follow method to extract a dataset of intercity relationships between 293 large
Chinese cities applying the toponym co-occurrence method to a web archive. Our method successfully
filters a 6.98 TB CC data set into a 202 GB single language text corpus. A highly-scalable
Hadoop-based framework processes the full CC corpus utilizing a 1080 CPU cluster on the Amazon Elastic
Map/Reduce infrastructure. To reveal more details of the intercity relationships, the intercity
relationships are further classified into six categories: industry, information technology (IT),
finance, research, culture, and government.",
keywords = "city networks, toponym co-occurrence, city relationship, geographical information retrieval",
cc-author-affiliation = "Utrecht University, The Netherlands; Delft University of Technology, The Netherlands",
cc-class = "information retrieval, toponymy, dataset-creation",
cc-snippet = "The data was retrieved from a Common Crawl raw corpus through a series of data processing. The web
pages in this corpus that do not contain Chinese characteristics or Chinese placenames were filtered
out based on keyword selection. The filtered Chinese corpus was 202 GB and the filtered Chinese corpus
with placenames was about 139.5GB. Then we count the number of web pages where two city names
co-appear. These intercity relationships were further classified into six categories using a
lexicon-based classification method.",
cc-dataset-used = "CC-MAIN-2019-18 (WET)",
}
@InProceedings{cc:KummervoldWetjenRosa:2022:Norwegian-Colossal-Corpus,
title = "The {Norwegian} Colossal Corpus: {A} Text Corpus for Training Large {Norwegian} Language Models",
year = "2022",
author = "Kummervold, Per E. and Wetjen, Freddy and de la Rosa, Javier",
booktitle = "The 13th International Conference on Language Resources and Evaluation (LREC 2022)",
pdf = "http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.410.pdf",
cc-snippet = "Common Crawl (2022) is a non-profit organization that has been collecting data from the web and
providing these archives to the public since 2011. Common Crawl-based datasets are popular for training
transformer models and are the basis for the enormous 800GB Pile dataset (Gao, 2020), among others.
There are extracted Norwegian datasets that are also based on Common Crawl. The Open Super-large
Crawled Aggregated coRpus (OSCAR) (Suárez et al., 2019) contains 4.7GB (800M words) of Norwegian
Bokmål and 54MB (9M words) of Norwegian Nynorsk. Using a cleaned version of Common Crawl, Google
compiled a multilingual version of their English colossal corpus, called MC4 (2022), for training their
mT5 model (Xue et al., 2020). The Norwegian part of that dataset is roughly 94GB (14B words). Both
OSCAR and the MC4 datasets have been made available on Hugging Face (2022). Unfortunately, their
respective licenses do not allow for redistribution within the NCC. To overcome this limitation, we are
releasing scripts for the preparation, cleaning, deduplication, and formatting of these datasets, so
they can be interleaved with the NCC. By combining NCC with OSCAR and MC4, it should be possible to
create a deduplicated Norwegian corpus with over 100GB of text (15B words).",
cc-derived-dataset-used = "OSCAR",
URL = "http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.410.pdf",
cc-author-affiliation = "National Library of Norway (NLN), Norway",
cc-class = "nlp/corpus-construction",
}
@Misc{cc:LiVincent:2022:rethinking-data-governance,
title = "Rethinking Data Governance: {A} Labor-Oriented Approach",
author = "Li, Hanlin and Vincent, Nicholas",
year = "2022",
pdf = "https://criticalautomation.org/wp-content/uploads/2022/03/li-vincent-data-governance.pdf",
abstract = "The current data governance paradigm in which technology companies solely decide how user data is
collected and used has introduced many issues to the tech sector. Prominent examples include
information asymmetry about user data’s value, monopolistic practices enabled by data’s network
effects, and power imbalance with respect to data aggregation and analysis. This work explicates how
viewing users’ data-generating activities through a labor lens can help to mitigate these issues and
provides corresponding design and research directions.",
cc-snippet = "2.1 Information asymmetry about user data's value¶ The lack of transparency about user data's value
helps make it possible for operators of for-profit computing systems to monetize user data and reap the
bulk of its financial benefits. Currently, there exists a substantial gap between what data-driven
technology companies know about user data's value and what users themselves do. For example, while
social media platforms are well aware of the amount of financial benefits of user engagement, users do
not have a window into how their collective attention and knowledge powers such businesses. This
information asymmetry is further exacerbated by the fact that the vast majority of data that users
produce during their interaction with modern technologies is rarely visible to themselves and is used
downstream without their awareness and consent. For instance, the rise of AI technologies is possible
largely due to the abundance of data unwittingly generated by the public for purposes other than
enabling AI models. Prominent examples include Flickr photos [12], Wikipedia articles [14], and the
Common Crawl dataset consisting of publicly available webpages [11]. In many of such cases, users
produce data without being aware of its value and potential, giving technology companies the
opportunity to extract an enormous amount of revenue from such data.",
URL = "https://criticalautomation.org/wp-content/uploads/2022/03/li-vincent-data-governance.pdf",
cc-author-affiliation = "Northwestern University, USA",
cc-class = "dataset-creation, data governance, user-generated content, artificial intelligence, machine learning,
cc-cited-not-used",
}
@Misc{cc:PuSarwarAbdullahRehmanEtAl:2022:deepfake-text-detection,
title = "Deepfake Text Detection: Limitations and Opportunities",
year = "2022",
author = "Pu, Jiameng and Sarwar, Zain and Abdullah, Sifat Muhammad and Rehman, Abdullah and Kim, Yoonjin and
Bhattacharya, Parantapa and Javed, Mobin and Viswanath, Bimal",
URL = "https://jmpu.github.io/files/Deepfake%20Text%20Detection%20Limitations%20and%20Opportunities_CR.pdf",
cc-derived-dataset-used = "Grover-RealNews",
cc-author-affiliation = "Virginia Tech, USA; University of Chicago, USA; LUMS, Pakistan; University of Virginia, USA",
cc-class = "nlp/text-classification, deep-fake-detection, misinformation, disinformation",
}
@Misc{cc:HantkeStock:2022:HTML-violations,
title = "{HTML} Violations and Where to Find Them: {A} Longitudinal Analysis of Specification Violations in
{HTML}",
author = "Hantke, Florian and Stock, Ben",
year = "2022",
URL = "https://swag.cispa.saarland/papers/hantke2022violations.pdf",
cc-snippet = "[...] we leveraged Common Crawl [22] to analyze more than 23K popular domains over the course of eight
years. [...] the crawler framework first collects meta information for each of the listed domains using
Common Crawl [22] as a basis for the following analyses (1). This Common Crawl approach makes it
possible to take a look into the past and analyze old versions of websites as well as current
snapshots. Unlike similar crawling studies before using the Internet Archive[32], with Common Crawl, we
are not limited by rate limit issues as we can request the database and S3 bucket directly. This makes
the process fast and enables to analyze nearly a thousand pages per minute from one IP address over
multiple days. The meta information that the framework collects contains details on where an HTML
document can be found in the Common Crawl’s dumps. For each domain, the framework collects meta
information from up to 100 pages and hands them to the crawler.",
cc-author-affiliation = "CISPA Helmholtz Center for Information Security, Germany",
cc-class = "web-science, internet-security",
}
@Misc{cc:MarkovZhangAgarwalEloundouEtAl:2022:undesired-content-detection,
author = "Markov, Todor and Zhang, Chong and Agarwal, Sandhini and Eloundou, Tyna and Lee, Teddy and
Adler, Steven and Jiang, Angela and Weng, Lilian",
title = "A Holistic Approach to Undesired Content Detection in the Real World",
publisher = "arXiv",
year = "2022",
doi = "10.48550/ARXIV.2208.03274",
URL = "https://arxiv.org/abs/2208.03274",
pdf = "https://arxiv.org/pdf/2208.03274.pdf",
code-repository = "https://github.com/openai/moderation-api-release",
cc-class = "nlp/text-classification, nlp/corpus-construction, toxic content, hate speech",
cc-author-affiliation = "OpenAI",
}
@InProceedings{cc:ReynoldsBatesBailey:2022:URL-parser-implementations,
title = "Equivocal {URL}s: Understanding the Fragmented Space of {URL} Parser Implementations",
author = "Reynolds, Joshua and Bates, Adam and Bailey, Michael",
booktitle = "European Symposium on Research in Computer Security",
pages = "166--185",
year = "2022",
publisher = "Springer",
cc-snippet = "We also surveyed ∼350 million URLs sampled uniformly and randomly from the approximately 3 billion
URLs in Common Crawl's January 2022 URL Index [35]. [35 Kreymer, I., Chuang, G.: Announcing the common
crawl index! (2015)]",
doi = "10.1007/978-3-031-17143-7_9",
URL = "https://link.springer.com/chapter/10.1007/978-3-031-17143-7_9",
pdf = "https://adambates.org/documents/Reynolds_Esorics22.pdf",
cc-author-affiliation = "New Mexico State University, USA; University of Illinois at Urbana-Champaign, USA; Georgia
Institute of Technology, USA",
cc-class = "computer-security/internet-security, web-security, URL parsing",
}
@Article{cc:KorkmazKoçyiğitŞahingözDiri:2022:hybrid-phishing-detection-system,
year = "2022",
title = "A Hybrid Phishing Detection System Using Deep Learning-based {URL} and Content Analysis",
author = "Korkmaz, Mehmet and Koçyiğit, Emre and Şahingöz, Özgür and Diri, Banu",
journal = "Elektronika ir Elektrotechnika",
volume = "28",
number = "5",
cc-snippet = "… With a new dataset (1 million URLs, half of which was obtained from PhishTank and the rest from
the CommonCrawl database, and the dataset contains 10,000 images) which researchers used in [21], CNN
and LSTM were tested in Intelligent …",
URL = "https://www.eejournal.ktu.lt/index.php/elt/article/download/31197/15556",
cc-author-affiliation = "Yildiz Technical University, Istanbul, Turkey; Biruni University, Istanbul, Turkey",
cc-class = "computer-security/internet-security",
}
@InProceedings{cc:Ab-RazakJayaErnawanFirdausEtAl:2022:ML-classifiers-phishing-detection,
year = "2022",
title = "Comparative Analysis of Machine Learning Classifiers for Phishing Detection",
author = "Ab Razak, Mohd Faizal and Jaya, Mohd Izham and Ernawan, Ferda and Firdaus, Ahmad and Nugroho, Fajar
Agung",
booktitle = "2022 6th International Conference on Informatics and Computational Sciences (ICICoS)",
pages = "84--88",
cc-snippet = "… The source for this dataset is from the University Malaysia of Sarawak, compiled from PhishTank,
OpenPhish, Alexa and Common Crawl. One method for detecting new phishing websites is to utilize
heuristics such as the URL and CSS detection …",
URL = "https://ieeexplore.ieee.org/abstract/document/9930531/",
cc-author-affiliation = "Universitas Dian Nuswantoro, Semarang, Indonesia",
cc-class = "computer-security/internet-security",
}
@Misc{cc:L-Ranaldi:2022:C-OSINT-COVID-19-Open-Source,
year = "2022",
title = "{C}-{OSINT}: {COVID}-19 Open Source artificial {INT}elligence framework",
author = "Ranaldi, Leonardo and Nourbakhsh, Aria and Fallucchi, Francesca and Zanzotto, Fabio Massimo",
abstract = "With the emergence of COVID-19 disease worldwide, a market of the products related to this disease
formed across the Internet. By the time these goods were in short supply, many uncontrolled Dark Web
Marketplaces (DWM) were active in selling these products. At the same time, Dark Web Forums (DWF)
became proxies for spreading false ideas, fake news about COVID-19, and advertising products sold in
DWMs. This study investigates the activities entertained in the DWMs and DWFs to propose a
learning-based model to distinguish them from their related counterparts on the surface web. To this
end, we propose a COVID-19 Open Source artificial INTelligence framework (C-OSINT) to automatically
collect and classify the activities done in DWMs and DWFs. Moreover, we corporate linguistic and
stylistic solutions to leverage the classification performance between the content found in DWMs and
DWFs and two surface web sources. Our results show that using syntactic and stylistic representation
outperforms the Transformer based results over these domains.",
URL = "https://ceur-ws.org/Vol-3260/paper16.pdf",
cc-author-affiliation = "Guglielmo Marconi University, Roma, Italy; University of Rome Tor Vergata, Roma, Italy",
cc-class = "nlp/transformer-language-model, web-science/dark-web",
}
@Misc{cc:LiuRitter:2022:CoNLL-2003-named-entity-taggers-still-work,
doi = "10.48550/ARXIV.2212.09747",
URL = "https://arxiv.org/abs/2212.09747",
author = "Liu, Shuheng and Ritter, Alan",
title = "Do {CoNLL}-2003 Named Entity Taggers Still Work Well in 2023?",
publisher = "arXiv",
year = "2022",
cc-snippet = "Our dataset follows this distribution to collect Reuters news articles published between December 5th
and 7th, 2020, collected from the Common Crawl Foundation³. [³http://commoncrawl.org/]",
pdf = "https://arxiv.org/pdf/2212.09747.pdf",
cc-author-affiliation = "Georgia Institute of Technology, USA",
cc-class = "nlp/named-entity-recognition, dataset-creation",
}
@Misc{cc:BoháčekBravanskýTrhlíkMoravec:2022:Czech-news-article-dataset,
doi = "10.48550/ARXIV.2212.08550",
URL = "https://arxiv.org/abs/2212.08550",
author = "Boháček, Matyáš and Bravanský, Michal and Trhlík, Filip and Moravec, Václav",
title = "Fine-grained {Czech} News Article Dataset: An Interdisciplinary Approach to Trustworthiness Analysis",
year = "2022",
pdf = "https://arxiv.org/pdf/2212.08550.pdf",
cc-snippet = "Initially, we assembled a collection of almost 94,000 articles by scraping URLs of 45 Czech news
sources obtained from Common Crawl² [²https://commoncrawl.org/]. These sources included mainstream
journalistic websites, tabloids, independent news outlets, and websites that are part of the
disinformation ecosystem [26], capturing the full scope of journalistic content in the Czech
Republic. [...] We applied multiple filters and balancing mechanisms to mitigate deficiencies caused by
inherent flaws in Common Crawl, which reduced the dataset’s size from 94,000 to 10,000 items. This
way, we also ensured that the data is as representative of the Czech news ecosystem and as diverse as
possible.",
cc-author-affiliation = "Charles University, Prague, Czech Republic; Gymnasium of Johannes Kepler, Prague, Czech
Republic; University College London, United Kingdom",
cc-class = "nlp/fake-news-detection, dataset-creation",
}
@Misc{cc:KhanHanna:2022:AI-dataset-accountability,
author = "Khan, Mehtab and Hanna, Alex",
title = "The Subjects and Stages of {AI} Dataset Development: {A} Framework for Dataset Accountability",
year = "2022",
URL = "https://ssrn.com/abstract=4217148",
doi = "10.2139/ssrn.4217148",
abstract = "There has been increased attention toward the datasets that are used to train and build AI
technologies from the computer science and social science research communities, but less from legal
scholarship. Both Large-Scale Language Datasets (LSLDs) and Large-Scale Computer Vision Datasets
(LSCVDs) have been at the forefront of such discussions, due to recent controversies involving the use
of facial recognition technologies, and the discussion of the use of publicly-available text for the
training of massive models which generate human-like text. Many of these datasets serve as
“benchmarks” to develop models that are used both in academic and industry research, while others
are used solely for training models. The process of developing LSLDs and LSCVDs is complex and
contextual, involving dozens of decisions about what kinds of data to collect, label, and train a model
on, as well as how to make the data available to other researchers. However, little attention has been
paid to mapping and consolidating the legal issues that arise at different stages of this process: when
the data is being collected, after the data is used to build and evaluate models and applications, and
how that data is distributed more widely. In this article, we offer four main contributions. First, we
describe what kinds of objects these datasets are, how many different kinds exist, what types of
modalities they encompass, and why they are important. Second, we provide more clarity about the stages
of dataset development – a process that has thus far been subsumed within broader discussions about
bias and discrimination – and the subjects who may be susceptible to harms at each point of
development. Third, we provide a matrix of both the stages of dataset development and the subjects of
dataset development, which traces the connections between stages and subjects. Fourth, we use this
analysis to identify some basic legal issues that arise at the various stages in order to foster a
better understanding of the dilemmas and tensions that arise at every stage. We situate our discussion
within wider discussion of current debates and proposals related to algorithmic accountability. This
paper fulfills an essential gap when it comes to comprehending the complicated landscape of legal
issues connected to datasets and the gigantic AI models trained on them.",
cc-snippet = "D. Common Crawl: Archiving the Whole Web The Common Crawl (CC) dataset is one of the most popular
datasets used in the training of what have typically been called large language models. [...]",
cc-author-affiliation = "Yale Law School, USA; Distributed AI Research Institute",
cc-class = "nlp/corpus-construction, dataset-creation, data-governance, privacy, legal/copyright",
}
@Misc{cc:SchuhmannBeaumontVencuGordonEtAl:2022:LAION-5B,
author = "Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and
Cherti, Mehdi and Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and
Schramowski, Patrick and Kundurthy, Srivatsa and Crowson, Katherine and Schmidt, Ludwig and
Kaczmarczyk, Robert and Jitsev, Jenia",
title = "{LAION}-5{B}: An open large-scale dataset for training next generation image-text models",
publisher = "arXiv",
year = "2022",
doi = "10.48550/ARXIV.2210.08402",
URL = "https://arxiv.org/abs/2210.08402",
cc-class = "nlp/corpus-construction, nlp/multimodal-corpora",
cc-author-affiliation = "LAION; UC Berkeley, USA; Gentec Data; TU Darmstadt, Germany; Hessian.AI; University of
Washington, Seattle, USA; Technical University of Munich, Germany; Stability AI; EleutherAI; Juelich
Supercomputing Center (JSC), Germany; Research Center Juelich (FZJ), Germany",
cc-derived-dataset-about = "LAION-5B",
cc-snippet = "By starting from Common Crawl [1] and filtering this data source with an existing CLIP model, we
derive a dataset consisting of three parts: 2.32 billion English image-text examples, 2.26 billion
multilingual examples, and 1.27 billion examples that are not specific to a particular language (e.g.,
places, products, etc.). [...] To extract image-text pairs from Common Crawl, we parse the HTML IMG
(image) tags from Common Crawl’s WAT metadata files.⁴ [⁴See
https://commoncrawl.org/the-data/get-started/ for details of the metadata format.] Specifically, we
focus on images with an alt-text so we can create image-text pair.",
}
@Misc{cc:NLLB-TeamCosta-jussàCrossEtAl:2022:No-Language-Left-Behind,
doi = "10.48550/ARXIV.2207.04672",
URL = "https://arxiv.org/abs/2207.04672",
author = "{NLLB Team} and Costa-jussà, Marta R. and Cross, James and Çelebi, Onur and Elbayad, Maha and
Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and
Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi
and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett,
Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews,
Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia
and Goswami, Vedanuj and Guzmán, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers,
Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff",
title = "No Language Left Behind: Scaling Human-Centered Machine Translation",
publisher = "arXiv",
year = "2022",
abstract = "Driven by the goal of eradicating language barriers on a global scale, machine translation has
solidified itself as a key focus of artificial intelligence research today. However, such efforts have
coalesced around a small subset of languages, leaving behind the vast majority of mostly low-resource
languages. What does it take to break the 200 language barrier while ensuring safe, high quality
results, all while keeping ethical considerations in mind? In No Language Left Behind, we took on this
challenge by first contextualizing the need for low-resource language translation support through
exploratory interviews with native speakers. Then, we created datasets and models aimed at narrowing
the performance gap between low and high-resource languages. More specifically, we developed a
conditional compute model based on Sparsely Gated Mixture of Experts that is trained on data obtained
with novel and effective data mining techniques tailored for low-resource languages. We propose
multiple architectural and training improvements to counteract overfitting while training on thousands
of tasks. Critically, we evaluated the performance of over 40,000 different translation directions
using a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity
benchmark covering all languages in Flores-200 to assess translation safety. Our model achieves an
improvement of 44\% BLEU relative to the previous state-of-the-art, laying important groundwork towards
realizing a universal translation system. Finally, we open source all contributions described in this
work, accessible at https://github.com/facebookresearch/fairseq/tree/nllb.",
cc-derived-dataset-about = "NLLB",
cc-author-affiliation = "Meta AI; UC Berkeley, USA; Johns Hopkins University, USA",
cc-class = "nlp/corpus-construction, nlp/parallel-corpus, nlp/low-resource-language, nlp/language-identification",
cc-snippet = "We begin with web data as our starting point, provided by CommonCrawl (CC)¹⁸ and ParaCrawl (Bañón et
al., 2020).",
}
@Misc{cc:SmithPatwaryNorickLeGresleyEtAl:2022:Megatron-Turing-NLG-530B,
doi = "10.48550/ARXIV.2201.11990",
URL = "https://arxiv.org/abs/2201.11990",
author = "Smith, Shaden and Patwary, Mostofa and Norick, Brandon and LeGresley, Patrick and Rajbhandari, Samyam
and Casper, Jared and Liu, Zhun and Prabhumoye, Shrimai and Zerveas, George and Korthikanti, Vijay and
Zhang, Elton and Child, Rewon and Aminabadi, Reza Yazdani and Bernauer, Julie and Song, Xia and
Shoeybi, Mohammad and He, Yuxiong and Houston, Michael and Tiwary, Saurabh and Catanzaro, Bryan",
keywords = "Computation and Language (cs.CL), FOS: Computer and information sciences",
title = "Using {DeepSpeed} and {Megatron} to Train {Megatron-Turing} {NLG} 530{B}, {A} Large-Scale Generative
Language Model",
publisher = "arXiv",
year = "2022",
cc-author-affiliation = "Microsoft; NVIDIA",
cc-class = "nlp/language-model",
cc-snippet = "Resources such as Common Crawl (CC) provide snapshots of the web which can be utilized as a source of
language data. While these data sources contain an enormous amount of language data, they also require
carefully designed preprocessing steps in order to select data which is of reasonable quality. As prior
work has found (e.g., [9]), the quality of unfiltered Common Crawl data is lower than that of curated
datasets and steps should be taken to increase the average quality of data selected from Common Crawl
for LM pretraining. [...] Common Crawl: As mentioned previously, Common Crawl comprises an immense
amount of data. We chose to process two snapshots, 2020-50 and 2021-04, with the aim of acquiring
around 150B tokens of training data. The first step of this process is language detection [11] and text
extraction from the raw HTML included in the Common Crawl WARC files¹. Following the rationale
presented in [11], we used the pycld2² and jusText³ libraries for these tasks. [...] In addition to
Common Crawl data, we leveraged a number of other previously generated datasets. From The Pile, we
selected Books3, OpenWebText2, Stack Exchange, PubMed Abstracts, Wikipedia, Gutenberg (PG-19),
BookCorpus2, NIH ExPorter, and Pile-CC datasets. We also included the CC-Stories and RealNews datasets
used to train Megatron [63].",
}
@InProceedings{cc:AlbyJäschke:2022:top-websites,
author = "Alby, Tom and Jäschke, Robert",
editor = "Silvello, Gianmaria and Corcho, Oscar and Manghi, Paolo and Di Nunzio, Giorgio Maria and Golub,
Koraljka and Ferro, Nicola and Poggi, Antonella",
title = "Analyzing the Web: Are Top Websites Lists a Good Choice for Research?",
booktitle = "Linking Theory and Practice of Digital Libraries",
year = "2022",
publisher = "Springer International Publishing",
address = "Cham",
pages = "11--25",
abstract = "The web has been a subject of research since its beginning, but it is difficult if not impossible to
analyze the whole web, even if a database of all URLs would be freely accessible. Hundreds of studies
have used commercial top websites lists as a shortcut, in particular the Alexa One Million Top Sites
list. However, apart from the fact that Amazon decided to terminate Alexa, we question the usefulness
of such lists for research as they have several shortcomings. Our analysis shows that top sites lists
miss frequently visited websites and offer only little value for language-specific research. We present
a heuristic-driven alternative based on the Common Crawl host-level web graph while also taking
language-specific requirements into account.",
ISBN = "978-3-031-16802-4",
doi = "10.1007/978-3-031-16802-4_2",
URL = "https://link.springer.com/chapter/10.1007/978-3-031-16802-4_2",
cc-author-affiliation = "Humboldt-Universität zu Berlin, Berlin, Germany",
cc-dataset-used = "hyperlinkgraph/cc-main-2021-feb-apr-may/hostgraph",
cc-class = "web-science, domain-ranking",
}
@Article{cc:Belz:2022:schema-org-in-e-commerce,
title = "Use of schema.org micro-markup in e-commerce projects",
year = "2022",
journal = "Three Seas Economic Journal",
doi = "10.30525/2661-5150/2022-4-1",
author = "Belz, Olexandra",
URL = "http://baltijapublishing.lv/index.php/threeseas/article/view/1964/1973",
cc-author-affiliation = "Ivan Franko National University of Lviv, Ukraine",
abstract = "The purpose of the article is to identify the most effective schema.org micro-markup schemes used in
e-commerce projects. Methodology. The research included competitive intelligence among the leading
online platforms operating in Europe in general and in Ukraine in particular. The study involved TOP-8
e-commerce projects in Ukraine and TOP-9 global cross-border marketplaces operating in Europe. The
service validator.schema.org was chosen as the research tool. Results. The study showed that the most
popular schema.org micro-markup format is JSON-LD. In general, 82.4\% of the surveyed sites use JSON-LD
microdata format. Some sites use two microdata formats: JSON-LD and Microdata. But none of the top
online marketplaces use the RDFa micro-markup format. Popular marketplaces operating in Ukraine and
Europe often use the same types of schema.org vocabulary. However, the frequency of using micro-markup
by top marketplaces operating in Ukraine is much higher than the frequency of using micro-markup by top
marketplaces operating in Europe. In addition, Ukrainian marketplaces use a much wider list of
schema.org micro-markup properties than marketplaces operating in Europe. However, no online store has
implemented the properties of advantages and disadvantages of goods recommended by Google in the
scheme. Practical implications. The study suggests schema.org micro-markup schemes for homepage,
category page, product page, about page, payment and delivery page, warranty and returns page, contact
page and blog. The proposed templates of micro-markup schemes were validated using the
validator.schema.org service. The study recommends using the JSON-LD format for semantic markup of
website content. Value/originality. Implementation of effective semantic markup of site content will
allow search engines to more accurately identify the information presented on the site. This, in turn,
will improve the visibility of the online marketplace in the Search Engine Results Page of Google,
Bing, Yahoo! etc.",
cc-class = "e-commerce, online marketplaces, linked data, schema.org annotations, SEO",
cc-derived-dataset-used = "WebDataCommons",
cc-snippet = "Since 2008, the Common Crawl project has been crawling websites to collect web page data (extracting
metadata and web page text). At the time of writing, the latest scan took place from November 26 to
December 10, 2022. As a result of this scan, 3.35 billion web pages were processed and 420 petabytes of
content were removed (Common Crawl, 2022). Both scientists and practitioners are working with the
obtained data sets of the Common Crawl project.¶ On September 22, 2022, the Web Data Commons (WDC)
project released the Schema.org Table Annotation Benchmark (SOTAB) for public download (Web Data
Commons, 2022).",
}
@Misc{cc:Minwoo-ByeonKim:2022:COYO-700m-image-text-pair-dataset,
author = "Minwoo Byeon and Beomhee Park and Haecheon Kim and Sungjun Lee and Woonhyuk Baek and Saehoon Kim",
title = "{COYO-700M}: Image-Text Pair Dataset",
year = "2022",
cc-derived-dataset-about = "COYO-700M",
URL = "https://github.com/kakaobrain/coyo-dataset",
url-2nd = "https://kakaobrain.com/contents?contentId=7eca73e3-3089-43cb-b701-332e8a1743fd",
cc-author-affiliation = "Kakao Brain, South Korea",
cc-class = "nlp/multimodal-corpora",
cc-dataset-used = "Common Crawl, Oct. 2020 to Aug. 2021",
abstract = "We collected about 10 billion pairs of alt-text and image source in HTML documents in Common Crawl
from Oct. 2020 to Aug. 2021. and eliminated uninformative pairs through the image and text level
filtering process with minimal cost. The following figure outlines our data collection procedure.",
}
@Misc{cc:ThoppilanFreitasHallShazeerEtAl:2022:LaMDA,
title = "{LaMDA}: Language Models for Dialog Applications",
author = "Romal Thoppilan and De Freitas, Daniel and Jamie Hall and Noam Shazeer and Apoorv Kulshreshtha and
Heng-Tze Cheng and Alicia Jin and Taylor Bos and Leslie Baker and Yu Du and YaGuang Li and Hongrae Lee
and Huaixiu Steven Zheng and Amin Ghafouri and Marcelo Menegali and Yanping Huang and Maxim Krikun and
Dmitry Lepikhin and James Qin and Dehao Chen and Yuanzhong Xu and Zhifeng Chen and Adam Roberts and
Maarten Bosma and Vincent Zhao and Yanqi Zhou and Chung-Ching Chang and Igor Krivokon and Will Rusch
and Marc Pickett and Pranesh Srinivasan and Laichee Man and Kathleen Meier-Hellstern and Meredith
Ringel Morris and Tulsee Doshi and Delos Santos, Renelito and Toju Duke and Johnny Soraker and Ben
Zevenbergen and Vinodkumar Prabhakaran and Mark Diaz and Ben Hutchinson and Kristen Olson and Alejandra
Molina and Erin Hoffman-John and Josh Lee and Lora Aroyo and Ravi Rajakumar and Alena Butryna and
Matthew Lamm and Viktoriya Kuzmina and Joe Fenton and Aaron Cohen and Rachel Bernstein and Ray Kurzweil
and Blaise Aguera-Arcas and Claire Cui and Marian Croak and Ed Chi and Quoc Le",
year = "2022",
eprint = "2201.08239",
archiveprefix = "arXiv",
primaryclass = "cs.CL",
URL = "https://arxiv.org/abs/2201.08239",
cc-derived-dataset-used = "Tensorflow-C4",
cc-snippet = "E Pre-training data composition¶ The pre-training data, called Infiniset, is a combination of dialog
data from public dialog data and other public web documents. It consists of 2.97B documents and 1.12B
dialogs with 13.39B utterances. The composition of the data is as follows: 50\% dialogs data from
public forums; 12.5\% C4 data [11]; 12.5\% code documents from sites related to programming like Q&A
sites, tutorials, etc; 12.5\% Wikipedia (English); 6.25\% English web documents; and 6.25\% Non-English
web documents. The total number of words in the dataset is 1.56T. Note that this composition was chosen
to achieve a more robust performance on dialog tasks (Section 4) while still keeping its ability to
perform other tasks like code generation. As future work, we can study how the choice of this
composition may affect the quality of some of the other NLP tasks performed by the model.",
cc-author-affiliation = "Google",
cc-class = "nlp/language-model, nlp/transformer-language-model",
}
@article{cc:PhillipsAlam:2022:EOT-cloud,
  author                = {Phillips, Mark Edward and Alam, Sawood},
  title                 = {Moving the End of Term Web Archive to the Cloud to Encourage Research Use and Reuse},
  year                  = {2022},
  url                   = {https://digital.library.unt.edu/ark:/67531/metadc1998717/m2/1/high_res_d/EOT_WADL_2022.pdf},
  abstract              = {The End of Term Web (EOT) Archive is a collaborative project with a goal of
                           collecting the United States federal web, loosely defined as .gov and .mil,
                           every four years coinciding with presidential elections and often a transition
                           in the Executive Branch of the government. In 2021 the End of Term team began
                           to process the longitudinal web archive for EOT-2008, EOT-2012, EOT-2016, and
                           EOT-2020 to move into the Amazon S3 storage service as part of the Amazon Open
                           Data Program. This effort adopted tools, structures, and documentation
                           developed by Common Crawl in an effort to maximize potential research access
                           and reuse of existing tools and documentation. This paper presents the process
                           of organizing, staging, processing, and moving these collections into the
                           Amazon cloud.},
  cc-author-affiliation = {University of North Texas, USA; Internet Archive, USA},
  cc-class              = {web archive},
}
@TechReport{cc:AddaBraffortVasilescuYvon:2022:report-French-language,
author = "Adda, Gilles and Braffort, Annelies and Vasilescu, Ioana and Yvon, François",
year = "2022",
title = "Deliverable {D1}.14 Report on the {French} Language. European Language Equality ({ELE}); {EU} project
no. {LC}-01641480 – 101018166",
institution = "European Language Equality ({ELE})",
URL = "https://european-language-equality.eu/wp-content/uploads/2022/03/ELE___Deliverable_D1_14__Language_Report_French_.pdf",
cc-snippet = "The CommonCrawl project³⁷ [³⁷https://commoncrawl.org/] aggregates Web crawled data that is
orders or magnitude larger than these resources for many languages; furthermore this corpus is being
updated on a regular basis. By using parts of the French subset of CommonCrawl, possibly conjoined with
the more curated corpora alluded to above has enabled to train large-scale BERT-style Language Models
(LMs) – FlauBERT (Le et al., 2020) is built with a corpus containing about 12B running words,
CamemBERT (Martin et al., 2020) uses the 22B words OSCAR, and these numbers continue to grow, albeit at
a much slower pace than the corresponding English corpora.",
cc-author-affiliation = "Université Paris-Saclay, CNRS, LISN, Paris, France",
cc-class = "nlp/resources, French, nlp/language-models, nlp/text-corpora",
}
@Misc{cc:Nagel:2022:10-years-in-the-cloud,
type = "Presentation",
title = "{Common} {Crawl} – Experiences From 10 Years in the Cloud",
copyright = "Public",
URL = "https://digital.library.unt.edu/ark:/67531/metadc1983147/",
abstract = "Presentation for the IIPC General Assembly and Web Archiving Conference virtually held on May 23-25,
2022. This presentation gives an overview of how the Common Crawl web data is used in and outside the
cloud over the past ten years that the dataset has been hosted as part of Amazon Web Services’ Open
Data Sponsorships program.",
language = "English",
urldate = "2025-11-21",
howpublished = "2022 International Internet Preservation Consortium (IIPC) Web Archiving Conference, May 23-25,
2022.",
author = "Nagel, Sebastian",
month = may,
year = "2022",
cc-author-affiliation = "Common Crawl Foundation, USA",
cc-class = "web-archiving/storage, web-archiving/open-dataset",
}