cc-citations/bib/cc2022.bib at main · commoncrawl/cc-citations · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
@Misc{cc:SnæbjarnarsonSímonarsonRagnarssonIngólfsdóttirEtAl:2022:warm-start-and-clean-crawled-corpus,
  title        = "A Warm Start and a Clean Crawled Corpus -- {A} Recipe for Good Language Models",
  author       = "Vésteinn Snæbjarnarson and Haukur Barri Símonarson and Pétur Orri Ragnarsson and Svanhvít Lilja
                 Ingólfsdóttir and Haukur Páll Jónsson and Vilhjálmur Þorsteinsson and Hafsteinn Einarsson",
  year         = "2022",
  pdf          = "https://arxiv.org/pdf/2201.05601.pdf",
  URL          = "https://arxiv.org/abs/2201.05601",
  abstract     = "We train several language models for Icelandic, including IceBERT, that achieve state-of-the-art
                 performance in a variety of downstream tasks, including part-of-speech tagging, named entity
                 recognition, grammatical error detection and constituency parsing. To train the models we introduce a
                 new corpus of Icelandic text, the Icelandic Common Crawl Corpus (IC3), a collection of high quality
                 texts found online by targeting the Icelandic top-level-domain (TLD). Several other public data sources
                 are also collected for a total of 16GB of Icelandic text. To enhance the evaluation of model
                 performance and to raise the bar in baselines for Icelandic, we translate and adapt the WinoGrande
                 dataset for co-reference resolution. Through these efforts we demonstrate that a properly cleaned
                 crawled corpus is sufficient to achieve state-of-the-art results in NLP applications for low to medium
                 resource languages, by comparison with models trained on a curated corpus. We further show that
                 initializing models using existing multilingual models can lead to state-of-the-art results for some
                 downstream tasks.",
  cc-author-affiliation = "Miðeind ehf., Iceland; University of Iceland, Iceland",
  cc-class     = "nlp/corpus-construction, nlp/language-model",
  cc-dataset-used = "CDX, WARC, ARC 2008 – March 2020",
  cc-snippet   = "3.1. The Icelandic Common Crawl Corpus¶ The Common Crawl Foundation is a non-profit organization that
                 scrapes large semi-random subsets of the internet regularly and hosts timestamped and compressed dumps
                 of the web online¹⁰ [¹⁰https://commoncrawl.org/the-data/get-started/]. Each dump contains
                 billions of web pages occupying hundreds of terabytes. Parsing these files directly requires storage
                 and computing power not directly available to most and can come at a significant financial cost. The
                 foundation also hosts indices of URIs and their locations within the large zipped dump files. While
                 these indices are also large, their processing is feasible with a few terabytes of storage.¶ 3.1.1.
                 Extracting Icelandic Common Crawl data¶ The Common Crawl indices, which contain URI and byte offsets
                 within the compressed dumps, are used to reduce the search space when looking for Icelandic texts. The
                 Common Crawl Index Server has a public API¹¹ [¹¹https://index.commoncrawl.org/] where URIs can be
                 queried based on attributes such as date, MIME-type and substring. Using the API eliminates the need to
                 fetch the massive index files. To extract Icelandic, the .is pattern is targeted to match the Icelandic
                 top level domain (TLD), resulting in 63.5 million retrieved pages with URIs and byte locations within
                 the compressed Common Crawl dumps. The computational efficiency of our method can be attributed to
                 these steps. Given the predominant use of the .is TLD for Icelandic web content, we assume that other
                 TLDs have a much lower proportion of Icelandic content. That said, a nontrivial amount of text in
                 Icelandic is still likely to be found outside the .is domain and could be extracted by, e.g., parsing
                 the whole Common Crawl, albeit at a much higher computational cost.¶ By targeting only the
                 byte-offsets corresponding to the Icelandic TLD we extract candidate websites that have a high
                 proportion of Icelandic content. In total, the compressed content is 687GiB on disk. All dumps since
                 the start of the Common Crawl in 2008 until March 2020 were included.¶ Plain text was extracted from
                 the collected WARC (Web Archive format) files using jusText (Pomikálek, 2011)¹² to remove boilerplate
                 content and HTML tags.",
}

@Misc{cc:ArtetxeAldabeAgerriPerez-de-ViñaspreEtAl:2022:corpus-quality-low-resource-languages,
  DOI          = "10.48550/ARXIV.2203.08111",
  URL          = "https://arxiv.org/abs/2203.08111",
  author       = "Artetxe, Mikel and Aldabe, Itziar and Agerri, Rodrigo and Perez-de-Viñaspre, Olatz and Soroa, Aitor",
  keywords     = "Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS:
                 Computer and information sciences",
  title        = "Does Corpus Quality Really Matter for Low-Resource Languages?",
  publisher    = "arXiv",
  year         = "2022",
  abstract     = "The vast majority of non-English corpora are derived from automatically filtered versions of
                 CommonCrawl. While prior work has identified major issues on the quality of these datasets (Kreutzer et
                 al., 2021), it is not clear how this impacts downstream performance. Taking Basque as a case study, we
                 explore tailored crawling (manually identifying and scraping websites with high-quality content) as an
                 alternative to filtering CommonCrawl. Our new corpus, called EusCrawl, is similar in size to the Basque
                 portion of popular multilingual corpora like CC100 and mC4, yet it has a much higher quality according
                 to native annotators. For instance, 66\% of documents are rated as high-quality for EusCrawl, in
                 contrast with <33\% for both mC4 and CC100. Nevertheless, we obtain similar results on downstream tasks
                 regardless of the corpus used for pre-training. Our work suggests that NLU performance in low-resource
                 languages is primarily constrained by the quantity rather than the quality of the data, prompting for
                 methods to exploit more diverse data sources.",
  cc-snippet   = "In this paper, we explore tailored crawling (i.e., manually identifying and scraping websites with
                 high-quality content) as an alternative to filtering CommonCrawl. Taking Basque as a case study, we
                 collect 12.5M documents from 33 websites with Creative Commons content. The resulting corpus, called
                 EusCrawl, is similar in size to the Basque portion of CC100 and mC4, but it has substantially less
                 issues and a higher perceived quality according to our blind audit with native annotators. However, we
                 find that this improvement does not carry over to downstream tasks, as masked language models
                 pre-trained on either corpora obtain similar results on 5 NLU benchmarks. Our results suggests that
                 data quantity and domain play a more important role, prompting for methods to exploit more diverse
                 sources of data in low-resource languages.",
  cc-author-affiliation = "Meta AI; HiTZ Center - Ixa, University of the Basque Country UPV/EHU",
  cc-class     = "nlp/corpus-construction, nlp/corpus-representativeness, nlp/corpus-quality, nlp/language-model,
                 nlp/low-resource-languages",
}

@Misc{cc:BidermanBichenoGao:2022:datasheet-for-Pile,
  title        = "Datasheet for the {Pile}",
  author       = "Stella Biderman and Kieran Bicheno and Leo Gao",
  year         = "2022",
  pdf          = "https://arxiv.org/pdf/2201.07311.pdf",
  URL          = "https://arxiv.org/abs/2201.07311",
  abstract     = "This datasheet describes the Pile, a 825 GiB dataset of human-authored text compiled by EleutherAI for
                 use in large-scale language modeling. The Pile is comprised of 22 different text sources, ranging from
                 original scrapes done for this project, to text data made available by the data owners, to third-party
                 scrapes available online.",
  cc-author-affiliation = "EleutherAI",
  cc-class     = "nlp/corpus-construction, nlp/corpus-datasheet, nlp/corpus-representativeness",
  cc-derived-dataset-about = "The-Pile-English",
  cc-snippet   = "Pile-CC: The Pile-CC dataset is a sample from the Common Crawl WARCs that has been converted to text
                 using jusText [Endrédy and Novák, 2013].¶ [...] Pile-CC: The Pile-CC dataset was created to be
                 included in the Pile. The underlying data comes from the Common Crawl, which was created to give people
                 access to the wealth of information contained in the internet. Its creators were concerned that only
                 data mining companies would be able to collect this data, and has the explicit aim of democratizing
                 technology.¶ [...] Pile-CC: The data is sourced from Common Crawl, a non-profit 501(c)(3) organization
                 founded by Gil Elbaz. The data from Common Crawl was processed by EleutherAI into Pile-CC.¶ [...]
                 Pile-CC: Instances are webpages.¶ [...] Pile-CC: 54, 953, 117 documents, totaling 227.12 GiB.¶ [...]
                 Pile-CC: A tiny fraction of the entire Common Crawl was included, chosen arbitrarily and heavily
                 filtered as detailed in Gao et al. [2020].¶ [...] Pile-CC: Data in the Pile-CC dataset were scraped
                 from websites by the Common Craw and then downloaded directly from the Common Craw by EleutherAI.¶
                 [...] Pile-CC: The earliest date of contents in Pile-CC is unknown.¶",
}

@TechReport{cc:Andersson-Schwarz:2022:quantification-online-linguistic-data,
  title        = "The hitchhiker's guide: Method handbook for quantification of online linguistic data in a
                 country-specific context. Official research report, Linguistic Explorations of Societies (Work Package
                 1)",
  author       = "Andersson Schwarz, Jonas",
  institution  = "Göteborgs Universitet",
  year         = "2022",
  pdf          = "https://gupea.ub.gu.se/bitstream/handle/2077/70890/2022_1_Andersson%20Schwarz.pdf",
  cc-snippet   = "Central actors (in no particular order)¶ CommonCrawl. California-based non-profit organization that
                 makes monthly crawls of the openly available Web and provides datasets and metadata to the public
                 freely. The CommonCrawl corpus contains petabytes of data including raw web page data, metadata data
                 and text data collected since 2011. Since 2012, CommonCrawl’s archive is hosted by Amazon Web
                 Services as part of its Public Data Sets program. Every crawl contains around 300 terabytes of data and
                 roughly 3 billion pages. In 2020, a filtered version of this CommonCrawl archive was used to train
                 OpenAI’s GPT-3 language model.¶ [...] Similarly, CommonCrawl (2021) provides an aggregate listing
                 the percentages of their database covered by each language – measured as the primary language of each
                 html document, as identified by the Compact Language Detector 2 (CLD2) algorithm. This was included as
                 a good benchmark to compare with.¶ [...] In comparison, when plotting the currently stated language
                 distribution of CommonCrawl (2021) in relation to the same population numbers of L1 and L2 speakers,
                 the CommonCrawl distribution displays a similarly low kurtosis and skewness.",
  cc-author-affiliation = "Göteborgs Universitet, Sweden",
  cc-class     = "nlp/corpus-construction, nlp/corpus-representativeness",
}

@Misc{cc:MorishitaChousaSuzukiNagata:2022:JParaCrawl,
  DOI          = "10.48550/ARXIV.2202.12607",
  URL          = "https://arxiv.org/abs/2202.12607",
  pdf          = "https://arxiv.org/pdf/2202.12607.pdf",
  author       = "Morishita, Makoto and Chousa, Katsuki and Suzuki, Jun and Nagata, Masaaki",
  title        = "{JParaCrawl} v3.0: A Large-scale {English-Japanese} Parallel Corpus",
  year         = "2022",
  abstract     = "Most current machine translation models are mainly trained with parallel corpora, and their
                 translation accuracy largely depends on the quality and quantity of the corpora. Although there are
                 billions of parallel sentences for a few language pairs, effectively dealing with most language pairs
                 is difficult due to a lack of publicly available parallel corpora. This paper creates a large parallel
                 corpus for English-Japanese, a language pair for which only limited resources are available, compared
                 to such resource-rich languages as English-German. It introduces a new web-based English-Japanese
                 parallel corpus named JParaCrawl v3.0. Our new corpus contains more than 21 million unique parallel
                 sentence pairs, which is more than twice as many as the previous JParaCrawl v2.0 corpus. Through
                 experiments, we empirically show how our new corpus boosts the accuracy of machine translation models
                 on various domains. The JParaCrawl v3.0 corpus will eventually be publicly available online for
                 research purposes.",
  cc-snippet   = "Our method extracts parallel sentences from the web. Thus, the first step is finding a website that
                 has parallel sentences. This method is based on the hypothesis that websites containing the same
                 English and Japanese sentences might have parallel texts. To list such parallel websites, we analyzed
                 all the Common Crawl text archive data released from March 2019 to August 2021³. [³During this
                 period, the Common Crawl project released 25 archives, and their text size was about 212 TB.] We
                 identified the language in the archive by CLD2⁴ [⁴ https://github.com/CLD2Owners/cld2] and listed
                 100,000 large websites that roughly have the same size of English and Japanese texts. For this step, we
                 used extractor⁵ [⁵ https://github.com/paracrawl/extractor] that was provided by the ParaCrawl
                 project.",
  cc-author-affiliation = "NTT Communication Science Laboratories, NTT Corporation, Japan",
  cc-class     = "nlp/machine-translation, nlp/parallel-corpus, nlp/corpus-construction",
}

@InProceedings{cc:LAKIMAlmazroueiAlhaolDebbahEtAl:2022:carbon-footprint,
  title        = "A Holistic Assessment of the Carbon Footprint of {Noor}, a Very Large {Arabic} Language Model",
  author       = "Imad LAKIM and Ebtesam Almazrouei and Ibrahim Abu Alhaol and Merouane Debbah and Julien Launay",
  booktitle    = "Challenges {\&} Perspectives in Creating Large Language Models",
  year         = "2022",
  URL          = "https://openreview.net/forum?id=B-lS3zH8Zq",
  abstract     = "As ever larger language models grow more ubiquitous, it is crucial to consider their environmental
                 impact. Characterised by extreme size and resource use, recent generations of models have been
                 criticised for their voracious appetite for compute, and thus significant carbon footprint. Although
                 reporting of carbon impact has grown more common in machine learning papers, this reporting is usually
                 limited to compute resources used strictly for training. In this work, we propose a holistic assessment
                 of the footprint of an extreme-scale language model, Noor. Noor is an ongoing project aiming to develop
                 the largest multi-task Arabic language models--with up to 13B parameters--leveraging zero-shot
                 generalisation to enable a wide range of downstream tasks via natural language instructions. We assess
                 the total carbon bill of the entire project: starting with data collection and storage costs, including
                 research and development budgets, pretraining costs, future serving estimates, and other exogenous
                 costs necessary for this international cooperation. Notably, we find that inference costs and exogenous
                 factors can have a significant impact on total budget. Finally, we discuss pathways to reduce the
                 carbon footprint of extreme-scale models.",
  cc-author-affiliation = "TII, Abu Dhabi, United Arab Emirates; LightOn, Paris, France",
  cc-class     = "nlp/language-model, nlp/transformer-language-model, carbon-footprint",
  cc-snippet   = "We use Common Crawl (CC) for acquiring large amounts of web data. Each CC dump is on average around
                 10TB, and we discard it immediately after processing it. On average, it takes 24 hours to fully process
                 a dump: we used 21 dumps from CC, meaning we stored 210TB of data for 24hours, equivalent to 57 kWh of
                 energy consumption. After processing the dumps, we got on average 1.2TB of data per dump, thus 25TB in
                 total. Considering that this data will be stored for 6 months, we end up with 1.3 MWh of energy
                 consumption for the bulk data. Note that we keep the processed data in all languages (not just Modern
                 Standard Arabic).",
}

@Article{cc:Gutiérrez-FandiñoPérez-FernándezArmengol-EstapéGriolEtAl:2022:esCorpius,
  author       = "Gutiérrez-Fandiño, Asier and Pérez-Fernández, David and Armengol-Estapé, Jordi and Griol, David
                 and Callejas, Zoraida",
  title        = "{esCorpius}: A Massive {Spanish} Crawling Corpus",
  journal      = "arXiv e-prints",
  keywords     = "Computer Science - Computation and Language, Computer Science - Artificial Intelligence",
  year         = "2022",
  eid          = "arXiv:2206.15147",
  eprint       = "2206.15147",
  archiveprefix = "arXiv",
  URL          = "https://ui.adsabs.harvard.edu/abs/2022arXiv220615147G",
  pdf          = "https://arxiv.org/pdf/2206.15147.pdf",
  adsnote      = "Provided by the SAO/NASA Astrophysics Data System",
  cc-snippet   = "[…] In this paper, we introduce esCorpius, a Spanish crawling corpus obtained from near 1 Pb of
                 Common Crawl data. It is the most extensive corpus in Spanish with this level of quality in the
                 extraction, purification and deduplication of web textual content […] A total of 39,502 compressed
                 WARC (Web Archive) from Common Crawl files were processed (see section 3.3 for more details). The
                 compressed information occupied about 180 TB and the size of the processed decompressed information is
                 estimated to be more than 0.8 PB. Prior to content deduplication, the downloaded corpus was composed of
                 106.768.594.753 words, 3.129.248.875 lines and 163.518.405 web pages. The deduplicated and cleaned
                 corpus size is 346.262.072.705 bytes (322.5 GB), with 104.073.706 total number of lines, 50.040.055.322
                 tokens, 1.125.798.968 paragraphs and 2.421.598.201 sentences.",
  cc-author-affiliation = "LHF Labs; Universidad Autónoma de Madrid, Spain; University of Edinburgh, United Kingdom;
                 Universidad de Granada, Spain",
  cc-class     = "nlp/corpus-construction, nlp/text-corpora",
}

@InProceedings{cc:OverwijkXiongCallan:2022:ClueWeb22,
  author       = "Overwijk, Arnold and Xiong, Chenyan and Callan, Jamie",
  title        = "{ClueWeb22}: 10 Billion Web Documents with Rich Information",
  year         = "2022",
  ISBN         = "978-1-4503-8732-3",
  publisher    = "Association for Computing Machinery",
  address      = "New York, NY, USA",
  URL          = "https://doi.org/10.1145/3477495.3536321",
  pdf          = "https://dl.acm.org/doi/pdf/10.1145/3477495.3536321",
  doi          = "10.1145/3477495.3536321",
  abstract     = "ClueWeb22, the newest iteration of the ClueWeb line of datasets, is the result of more than a year of
                 collaboration between industry and academia. Its design is influenced by the research needs of the
                 academic community and the real-world needs of large-scale industry systems. Compared with earlier
                 ClueWeb datasets, the ClueWeb22 corpus is larger, more varied, and has higher-quality documents. Its
                 core is raw HTML, but it includes clean text versions of documents to lower the barrier to entry.
                 Several aspects of ClueWeb22 are available to the research community for the first time at this scale,
                 for example, visual representations of rendered web pages, parsed structured information from the HTML
                 document, and the alignment of document distributions (domains, languages, and topics) to commercial
                 web search. This talk shares the design and construction of ClueWeb22, and discusses its new features.
                 We believe this newer, larger, and richer ClueWeb corpus will enable and support a broad range of
                 research in IR, NLP, and deep learning.",
  booktitle    = "Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information
                 Retrieval",
  pages        = "3360--3362",
  numpages     = "3",
  keywords     = "clueweb, web corpus, dataset",
  location     = "Madrid, Spain",
  series       = "SIGIR '22",
  cc-snippet   = "One approach is to sift CommonCrawl data, eg, the C4 dataset used to pretrain T5 [10], which provides
                 sufficient quantity, but the quality quickly becomes a concern. For example, the cleaned CommonCrawl
                 reflects a quite weird distribution of the web [5]. Language models pretrained on C4 often perform
                 worse than models pretrained on higher quality corpora at the same scale. With ClueWeb22, we aim to
                 provide the web corpus for research in the near future. The design of ClueWeb22 emphasizes on these
                 goals: 1) to reflect the distribution of the web in real scenarios; 2) to provide web pages at large
                 quantity and also with high quality; 3) to enable new research directions by including information
                 important in industry but previously not publicly available.",
  cc-author-affiliation = "Microsoft; Carnegie Mellon University",
  cc-class     = "cc-cited-not-used, nlp/corpus-construction, nlp/text-corpora, information-retrieval",
}

@Misc{cc:ZhangRollerGoyalArtetxeEtAl:2022:OPT-open-pretrained-transformer,
  doi          = "10.48550/ARXIV.2205.01068",
  URL          = "https://arxiv.org/abs/2205.01068",
  pdf          = "https://arxiv.org/pdf/2205.01068.pdf",
  cc-snippet   = "All corpora were previously collected or filtered to contain predominantly English text, but a small
                 amount of non-English data is still present within the corpus via CommonCrawl. We removed duplicated
                 documents across all datasets by filtering …",
  cc-author-affiliation = "Meta AI",
  author       = "Zhang, Susan and Roller, Stephen and Goyal, Naman and Artetxe, Mikel and Chen, Moya and Chen, Shuohui
                 and Dewan, Christopher and Diab, Mona and Li, Xian and Lin, Xi Victoria and Mihaylov, Todor and Ott,
                 Myle and Shleifer, Sam and Shuster, Kurt and Simig, Daniel and Koura, Punit Singh and Sridhar, Anjali
                 and Wang, Tianlu and Zettlemoyer, Luke",
  title        = "{OPT}: Open Pre-trained Transformer Language Models",
  publisher    = "arXiv",
  year         = "2022",
  cc-derived-dataset-used = "CC-Stories, Pile-CC, CC-NEWS-RoBERTa-v2",
  cc-class     = "nlp/language-model, nlp/transformer-language-model, nlp/corpus-construction",
}

@InProceedings{cc:LugeonPiccardiWest:2022:Homepage2Vec,
  title        = "{Homepage2Vec}: Language-Agnostic Website Embedding and Classification",
  author       = "Lugeon, Sylvain and Piccardi, Tiziano and West, Robert",
  booktitle    = "Proceedings of the International AAAI Conference on Web and Social Media",
  volume       = "16",
  pages        = "1285--1291",
  year         = "2022",
  URL          = "https://ojs.aaai.org/index.php/ICWSM/article/download/19380/19152",
  cc-snippet   = "Top-level domain. Some top-level domains (TLD) such as .edu or .biz can offer a good hint about the
                 website's content. For example, a typical use case for .edu is university websites, whereas .biz is
                 commonly associated with business activities. Following this intuition, we collected from Common
                 Crawl,⁵ a large-scale sample of the Web, the 19 most frequent TLDs: .com, .org, .net, .info, .xyz,
                 .club, .biz, .top, .edu, .online, .pro, .site, .vip, .icu, .buzz, .app, .asia, .gov, .space, excluding
                 the country code TLD (ccTLD) because they indicate geographic origin, not website content. We represent
                 this feature with a one-hot encoding vector of 19 dimensions.",
  cc-author-affiliation = "EPFL, Switzerland",
  cc-class     = "nlp/text-classification, web-site-classification",
}

@InProceedings{cc:ZirngiblDeuschSattlerAulbachEtAl:2022:domain-parking,
  author       = "Zirngibl, Johannes and Deusch, Steffen and Sattler, Patrick and Aulbach, Juliane and Carle, Georg and
                 Jonker, Mattijs",
  title        = "Domain Parking: Largely Present, Rarely Considered!",
  booktitle    = "Proc. Network Traffic Measurement and Analysis Conference (TMA) 2022",
  year         = "2022",
  URL          = "https://mediatum.ub.tum.de/1661842",
  pdf          = "https://www.net.in.tum.de/fileadmin/bibtex/publications/papers/zirngibl2022prevalenceofparking.pdf",
  abstract     = "Domain parking typically involves leveraging advertisements to generate revenue on otherwise inactive
                 domain names. Their content is rarely of real value to users and tends to be highly similar across
                 parked domains. They have commonalities beyond content alone: parked domains can share hosting and DNS
                 infrastructure. Parking rarely receives special treatment in existing studies (e.g., content analyses
                 or infrastructure concentration studies). While the presence and possible bias introduced by parked
                 pages is sometimes acknowledged in studies, the studies still treat parked domains as any other, either
                 because differentiation is infeasible, or because doing so is considered out-of-scope. We argue that
                 the impact of parked domains on analyses regarding the current state and future development of the
                 Internet should not be overlooked. In this paper, we motivate this argument through quantification, and
                 take steps towards helping other researchers identify parked domains. We systematically collect a list
                 of 82 parking services and develop DNS-based indicators to help identify parked domains. We next
                 quantify the presence of parked domains, using large-scale DNS data containing hundreds of millions of
                 registered domain names, representative for a significant part of the global DNS namespace. Overall, we
                 pinpoint 60 M parked domains, which is a significant percentage of all names under consideration (23\%)
                 and identify up to 4\% of domains from top lists to be parked. These findings demonstrate that the
                 effect of parked pages is potentially pronounced. We also break down into the various parking services
                 and DNS zones. This helps us demonstrate and further discuss the effect that domain parking can have on
                 research and Internet consolidation.",
  cc-author-affiliation = "Technical University of Munich, Germany; University of Twente, The Netherlands",
  cc-class     = "web-science, internet/DNS, internet/domain-parking",
  cc-snippet   = "Common Crawl While visual identification allowed us to validate the inferences to a reasonable extent,
                 we wanted to upscale validation. Therefore, we consider Common Crawl (CC) data [21] [C. Crawl. (2022)
                 The Common Crawl Corpus. [Online]. Available: https://commoncrawl.org/] and calculate the similarity of
                 pages. Common Crawl is an open repository of web crawl data, collected at monthly intervals, accounting
                 for hundreds of millions of unique domain names, and many more URLs. We consider CC data for Jan 2022
                 and the ∼60 M parked domains that we identify on Jan 28th, 2022. We extract the HTML content of
                 parked pages from CC data, only considering URLs that contain exactly the registered domain.
                 Furthermore, we require the crawl target to have been the landing page (i.e., the path of the URL is /)
                 and also to have resulted in a useful response (i.e., HTTP status code of 200). Given these filters,
                 ∼1.29 M HTML rich responses can be obtained. We extract visible text and tokenize it into words,
                 remove stop words, apply lemmatization, and create a vector for the most-frequently used words for each
                 page.",
}

@InProceedings{cc:LuccioniCorrySridharanAnannyEtAl:2022:deprecating-datasets,
  author       = "Luccioni, Alexandra Sasha and Corry, Frances and Sridharan, Hamsini and Ananny, Mike and Schultz,
                 Jason and Crawford, Kate",
  title        = "A Framework for Deprecating Datasets: Standardizing Documentation, Identification, and Communication",
  year         = "2022",
  ISBN         = "978-1-4503-9352-2",
  publisher    = "Association for Computing Machinery",
  address      = "New York, NY, USA",
  URL          = "https://doi.org/10.1145/3531146.3533086",
  pdf          = "https://facctconference.org/static/pdfs_2022/facct22-17.pdf",
  doi          = "10.1145/3531146.3533086",
  abstract     = "Datasets are central to training machine learning (ML) models. The ML community has recently made
                 significant improvements to data stewardship and documentation practices across the model development
                 life cycle. However, the act of deprecating, or deleting, datasets has been largely overlooked, and
                 there are currently no standardized approaches for structuring this stage of the dataset life cycle. In
                 this paper, we study the practice of dataset deprecation in ML, identify several cases of datasets that
                 continued to circulate despite having been deprecated, and describe the different technical, legal,
                 ethical, and organizational issues raised by such continuations. We then propose a Dataset Deprecation
                 Framework that includes considerations of risk, mitigation of impact, appeal mechanisms, timeline,
                 post-deprecation protocols, and publication checks that can be adapted and implemented by the ML
                 community. Finally, we propose creating a centralized, sustainable repository system for archiving
                 datasets, tracking dataset modifications or deprecations, and facilitating practices of care and
                 stewardship that can be integrated into research and publication processes.",
  booktitle    = "2022 ACM Conference on Fairness, Accountability, and Transparency",
  pages        = "199--212",
  numpages     = "14",
  keywords     = "datasets, data stewardship, data management, dataset deprecation",
  location     = "Seoul, Republic of Korea",
  series       = "FAccT '22",
  cc-snippet   = "When it comes to filtering large text datasets scraped from the Web, given their sheer size (C4
                 represents 2.3 TB of data, whereas the Common Crawl has 139TB), filtering them is complex and
                 time-consuming, although approaches have been proposed for reducing duplicates and train-test overlap
                 [53]. [...] In practice, documenting and deprecating these datasets is akin to a game of whack-a-mole,
                 since new versions of the Common Crawl come out every few months. Analyzing what they contain and their
                 degrees of contamination through common evaluation tasks would take significant effort.",
  cc-author-affiliation = "Hugging Face; University of Southern California, USA; New York University, USA; Microsoft
                 Research, USA",
  cc-class     = "ai/ethics-of-machine-learning, nlp/text-corpora, nlp/corpus-construction, cc-cited-not-used",
}

@Article{cc:KreutzerCaswellWangWahabEtAl:2022:audit-web-multilingual-datasets,
  author       = "Kreutzer, Julia and Caswell, Isaac and Wang, Lisa and Wahab, Ahsan and van Esch, Daan and
                 Ulzii-Orshikh, Nasanbayar and Tapo, Allahsera and Subramani, Nishant and Sokolov, Artem and Sikasote,
                 Claytone and Setyawan, Monang and Sarin, Supheakmungkol and Samb, Sokhar and Sagot, Benoît and Rivera,
                 Clara and Rios, Annette and Papadimitriou, Isabel and Osei, Salomey and Suarez, Pedro Ortiz and Orife,
                 Iroro and Ogueji, Kelechi and Rubungo, Andre Niyongabo and Nguyen, Toan Q. and Müller, Mathias and
                 Müller, André and Muhammad, Shamsuddeen Hassan and Muhammad, Nanda and Mnyakeni, Ayanda and
                 Mirzakhalov, Jamshidbek and Matangira, Tapiwanashe and Leong, Colin and Lawson, Nze and Kudugunta,
                 Sneha and Jernite, Yacine and Jenny, Mathias and Firat, Orhan and Dossou, Bonaventure F. P. and
                 Dlamini, Sakhile and de Silva, Nisansa and Çabuk Ballı, Sakine and Biderman, Stella and Battisti,
                 Alessia and Baruwa, Ahmed and Bapna, Ankur and Baljekar, Pallavi and Azime, Israel Abebe and Awokoya,
                 Ayodele and Ataman, Duygu and Ahia, Orevaoghene and Ahia, Oghenefego and Agrawal, Sweta and Adeyemi,
                 Mofetoluwa",
  title        = "Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets",
  journal      = "Transactions of the Association for Computational Linguistics",
  volume       = "10",
  pages        = "50--72",
  year         = "2022",
  month        = jan,
  abstract     = "With the success of large-scale pre-training and multilingual modeling in Natural Language Processing
                 (NLP), recent years have seen a proliferation of large, Web-mined text datasets covering hundreds of
                 languages. We manually audit the quality of 205 language-specific corpora released with five major
                 public datasets (CCAligned, ParaCrawl, WikiMatrix, OSCAR, mC4). Lower-resource corpora have systematic
                 issues: At least 15 corpora have no usable text, and a significant fraction contains less than 50\%
                 sentences of acceptable quality. In addition, many are mislabeled or use nonstandard/ambiguous language
                 codes. We demonstrate that these issues are easy to detect even for non-proficient speakers, and
                 supplement the human audit with automatic analyses. Finally, we recommend techniques to evaluate and
                 improve multilingual corpora and discuss potential risks that come with low-quality data releases.",
  ISSN         = "2307-387X",
  doi          = "10.1162/tacl_a_00447",
  URL          = "https://doi.org/10.1162/tacl_a_00447",
  pdf          = "https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00447/1986585/tacl_a_00447.pdf",
  cc-class     = "nlp/corpus-construction, nlp/web-as-corpus, nlp/parallel-corpus, nlp/low-resource-language",
  cc-derived-dataset-about = "CCAligned-2020, Tensorflow-C4-Multilingual, OSCAR",
  cc-snippet   = "We selected the corpora for their multilinguality and the inclusion of understudied languages in NLP.
                 With the exception of WikiMatrix and Paracrawl, all corpora are derived from CommonCrawl, and
                 distinguish themselves by the choice of filtering methods, LangID and automatic alignment technology.",
  cc-author-affiliation = "Google Research; Masakhane NLP; Turkic Interlingua; Haverford College; RobotsMali; Intel
                 Labs; University of Zambia; Google; AIMS-AMMI; Inria; University of Zurich; Stanford University; Kwame
                 Nkrumah University of Science and Technology; Sorbonne Université; Niger-Volta LTI; University of
                 Waterloo; University of Electronic Science and Technology of China; University of Notre Dame; Bayero
                 University Kano; University of South Florida; Hugging Face; Jacobs University Bremen; University of
                 Moratuwa; EleutherAI; Obafemi Awolowo University; University of Ibadan; Instadeep; University of
                 Maryland; Defence Space Administration Abuja",
}

@Misc{cc:AbadjiSuarezRomarySagot:2022:cleaner-document-oriented-multilingual-crawled-corpus,
  author       = "Abadji, Julien and Suarez, Pedro Ortiz and Romary, Laurent and Sagot, Benoît",
  title        = "Towards a Cleaner Document-Oriented Multilingual Crawled Corpus",
  year         = "2022",
  publisher    = "arXiv",
  doi          = "10.48550/ARXIV.2201.06642",
  URL          = "https://arxiv.org/abs/2201.06642",
  pdf          = "https://arxiv.org/pdf/2201.06642.pdf",
  abstract     = "The need for raw large raw corpora has dramatically increased in recent years with the introduction of
                 transfer learning and semi-supervised learning methods to Natural Language Processing. And while there
                 have been some recent attempts to manually curate the amount of data necessary to train large language
                 models, the main way to obtain this data is still through automatic web crawling. In this paper we take
                 the existing multilingual web corpus OSCAR and its pipeline Ungoliant that extracts and classifies data
                 from Common Crawl at the line level, and propose a set of improvements and automatic annotations in
                 order to produce a new document-oriented version of OSCAR that could prove more suitable to pre-train
                 large generative language models as well as hopefully other applications in Natural Language Processing
                 and Digital Humanities.",
  cc-author-affiliation = "Inria, France; Sorbonne Université, France",
  cc-class     = "nlp/corpus-construction, nlp/web-as-corpus",
  cc-derived-dataset-about = "OSCAR",
}

@Misc{cc:TongjingYinBaoMeijers:2022:intercity-relationships,
  title        = "Dataset of intercity relationships between 293 Chinese cities extracted and classified on the basis of
                 toponym co-occurrences on Common Crawl",
  year         = "2022",
  author       = "Tongjing, Wang and Yin, Zhao and Bao, Ziyu and Meijers, Evert",
  URL          = "https://www.researchgate.net/profile/Evert-Meijers/publication/362952059_Dataset_of_intercity_relationships_between_293_Chinese_cities_extracted_and_classified_on_the_basis_of_toponym_co-occurrences_on_Common_Crawl/links/6308bfc25eed5e4bd11f7938/Dataset-of-intercity-relationships-between-293-Chinese-cities-extracted-and-classified-on-the-basis-of-toponym-co-occurrences-on-Common-Crawl.pdf",
  pdf          = "https://www.researchgate.net/profile/Evert-Meijers/publication/362952059_Dataset_of_intercity_relationships_between_293_Chinese_cities_extracted_and_classified_on_the_basis_of_toponym_co-occurrences_on_Common_Crawl/links/6308bfc25eed5e4bd11f7938/Dataset-of-intercity-relationships-between-293-Chinese-cities-extracted-and-classified-on-the-basis-of-toponym-co-occurrences-on-Common-Crawl.pdf",
  abstract     = "Although the importance of intercity relationships is theoretically acknowledged for cities’
                 socioeconomic development, the availability of such relational data often limits relevant urban
                 studies. One of the new approaches of collecting city relational data is to extract the co-appearance
                 of their place names from web texts. However, dealing with a gigantic web corpus is difficult for
                 domain researchers given the complexities of processing terabytes of raw data. This paper develops an
                 efficient and easy-to-follow method to extract a dataset of intercity relationships between 293 large
                 Chinese cities applying the toponym co-occurrence method to a web archive. Our method successfully
                 filters a 6.98 TB CC data set into a 202 GB single language text corpus. A highly-scalable
                 Hadoop-based framework processes the full CC corpus utilizing a 1080 CPU cluster on the Amazon Elastic
                 Map/Reduce infrastructure. To reveal more details of the intercity relationships, the intercity
                 relationships are further classified into six categories: industry, information technology (IT),
                 finance, research, culture, and government.",
  keywords     = "city networks, toponym co-occurrence, city relationship, geographical information retrieval",
  cc-author-affiliation = "Utrecht University, The Netherlands; Delft University of Technology, The Netherlands",
  cc-class     = "information retrieval, toponymy, dataset-creation",
  cc-snippet   = "The data was retrieved from a Common Crawl raw corpus through a series of data processing. The web
                 pages in this corpus that do not contain Chinese characteristics or Chinese placenames were filtered
                 out based on keyword selection. The filtered Chinese corpus was 202 GB and the filtered Chinese corpus
                 with placenames was about 139.5GB. Then we count the number of web pages where two city names
                 co-appear. These intercity relationships were further classified into six categories using a
                 lexicon-based classification method.",
  cc-dataset-used = "CC-MAIN-2019-18 (WET)",
}

@InProceedings{cc:KummervoldWetjenRosa:2022:Norwegian-Colossal-Corpus,
  title        = "The Norwegian Colossal Corpus: {A} Text Corpus for Training Large Norwegian Language Models",
  year         = "2022",
  author       = "Kummervold, Per E. and Wetjen, Freddy and de la Rosa, Javier",
  booktitle    = "The 13th International Conference on Language Resources and Evaluation (LREC 2022)",
  pdf          = "http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.410.pdf",
  cc-snippet   = "Common Crawl (2022) is a non-profit organization that has been collecting data from the web and
                 providing these archives to the public since 2011. Common Crawl-based datasets are popular for training
                 transformer models and are the basis for the enormous 800GB Pile dataset (Gao, 2020), among others.
                 There are extracted Norwegian datasets that are also based on Common Crawl. The Open Super-large
                 Crawled Aggregated coRpus (OSCAR) (Suárez et al., 2019) contains 4.7GB (800M words) of Norwegian
                 Bokmål and 54MB (9M words) of Norwegian Nynorsk. Using a cleaned version of Common Crawl, Google
                 compiled a multilingual version of their English colossal corpus, called MC4 (2022), for training their
                 mT5 model (Xue et al., 2020). The Norwegian part of that dataset is roughly 94GB (14B words). Both
                 OSCAR and the MC4 datasets have been made available on Hugging Face (2022). Unfortunately, their
                 respective licenses do not allow for redistribution within the NCC. To overcome this limitation, we are
                 releasing scripts for the preparation, cleaning, deduplication, and formatting of these datasets, so
                 they can be interleaved with the NCC. By combining NCC with OSCAR and MC4, it should be possible to
                 create a deduplicated Norwegian corpus with over 100GB of text (15B words).",
  cc-derived-dataset-used = "OSCAR",
  URL          = "http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.410.pdf",
  cc-author-affiliation = "National Library of Norway (NLN), Norway",
  cc-class     = "nlp/corpus-construction",
}

@Article{cc:LiVincent:2022:rethinking-data-governance,
  title        = "Rethinking Data Governance: {A} Labor-Oriented Approach",
  author       = "Li, Hanlin and Vincent, Nicholas",
  year         = "2022",
  pdf          = "https://criticalautomation.org/wp-content/uploads/2022/03/li-vincent-data-governance.pdf",
  abstract     = "The current data governance paradigm in which technology companies solely decide how user data is
                 collected and used has introduced many issues to the tech sector. Prominent examples include
                 information asymmetry about user data’s value, monopolistic practices enabled by data’s network
                 effects, and power imbalance with respect to data aggregation and analysis. This work explicates how
                 viewing users’ data-generating activities through a labor lens can help to mitigate these issues and
                 provides corresponding design and research directions.",
  cc-snippet   = "2.1 Information asymmetry about user data's value The lack of transparency about user data's value
                 helps make it possible for operators of for-profit computing systems to monetize user data and reap the
                 bulk of its financial benefits. Currently, there exists a substantial gap between what data-driven
                 technology companies know about user data's value and what users themselves do. For example, while
                 social media platforms are well aware of the amount of financial benefits of user engagement, users do
                 not have a window into how their collective attention and knowledge powers such businesses. This
                 information asymmetry is further exacerbated by the fact that the vast majority of data that users
                 produce during their interaction with modern technologies is rarely visible to themselves and is used
                 downstream without their awareness and consent. For instance, the rise of AI technologies is possible
                 largely due to the abundance of data unwittingly generated by the public for purposes other than
                 enabling AI models. Prominent examples include Flickr photos [12], Wikipedia articles [14], and the
                 Common Crawl dataset consisting of publicly available webpages [11]. In many of such cases, users
                 produce data without being aware of its value and potential, giving technology companies the
                 opportunity to extract an enormous amount of revenue from such data.",
  URL          = "https://criticalautomation.org/wp-content/uploads/2022/03/li-vincent-data-governance.pdf",
  cc-author-affiliation = "Northwestern University, USA",
  cc-class     = "dataset-creation, data governance, user-generated content, artificial intelligence, machine learning,
                 cc-cited-not-used",
}

@Article{cc:PuSarwarAbdullahRehmanEtAl:2022:deepfake-text-detection,
  title        = "Deepfake Text Detection: Limitations and Opportunities",
  year         = "2022",
  author       = "Pu, Jiameng and Sarwar, Zain and Abdullah, Sifat Muhammad and Rehman, Abdullah and Kim, Yoonjin and
                 Bhattacharya, Parantapa and Javed, Mobin and Viswanath, Bimal",
  URL          = "https://jmpu.github.io/files/Deepfake%20Text%20Detection%20Limitations%20and%20Opportunities_CR.pdf",
  cc-derived-dataset-used = "Grover-RealNews",
  cc-author-affiliation = "Virginia Tech, USA; University of Chicago, USA; LUMS, Pakistan; University of Virginia, USA",
  cc-class     = "nlp/text-classification, deep-fake-detection, misinformation, disinformation",
}

@Article{cc:HantkeStock:2022:HTML-violations,
  author       = "Hantke, Florian and Stock, Ben",
  title        = "{HTML} Violations and Where to Find Them: {A} Longitudinal Analysis of Specification Violations in
                 {HTML}",
  year         = "2022",
  URL          = "https://swag.cispa.saarland/papers/hantke2022violations.pdf",
  cc-author-affiliation = "CISPA Helmholtz Center for Information Security, Germany",
  cc-class     = "web-science, internet-security",
  cc-snippet   = "[...] we leveraged Common Crawl [22] to analyze more than 23K popular domains over the course of eight
                 years. [...] the crawler framework first collects meta information for each of the listed domains using
                 Common Crawl [22] as a basis for the following analyses (1). This Common Crawl approach makes it
                 possible to take a look into the past and analyze old versions of websites as well as current
                 snapshots. Unlike similar crawling studies before using the Internet Archive[32], with Common Crawl, we
                 are not limited by rate limit issues as we can request the database and S3 bucket directly. This makes
                 the process fast and enables to analyze nearly a thousand pages per minute from one IP address over
                 multiple days. The meta information that the framework collects contains details on where an HTML
                 document can be found in the Common Crawl’s dumps. For each domain, the framework collects meta
                 information from up to 100 pages and hands them to the crawler.",
}

@Misc{cc:MarkovZhangAgarwalEloundouEtAl:2022:undesired-content-detection,
  author       = "Markov, Todor and Zhang, Chong and Agarwal, Sandhini and Eloundou, Tyna and Lee, Teddy and Adler,
                 Steven and Jiang, Angela and Weng, Lilian",
  title        = "A Holistic Approach to Undesired Content Detection in the Real World",
  year         = "2022",
  publisher    = "arXiv",
  doi          = "10.48550/ARXIV.2208.03274",
  URL          = "https://arxiv.org/abs/2208.03274",
  pdf          = "https://arxiv.org/pdf/2208.03274.pdf",
  code-repository = "https://github.com/openai/moderation-api-release",
  cc-author-affiliation = "OpenAI",
  cc-class     = "nlp/text-classification, nlp/corpus-construction, toxic content, hate speech",
}

@InProceedings{cc:ReynoldsBatesBailey:2022:URL-parser-implementations,
  author       = "Reynolds, Joshua and Bates, Adam and Bailey, Michael",
  title        = "Equivocal {URL}s: Understanding the Fragmented Space of {URL} Parser Implementations",
  booktitle    = "European Symposium on Research in Computer Security",
  pages        = "166--185",
  year         = "2022",
  publisher    = "Springer",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-031-17143-7_9",
  pdf          = "https://adambates.org/documents/Reynolds_Esorics22.pdf",
  cc-author-affiliation = "New Mexico State University, USA; University of Illinois at Urbana-Champaign, USA; Georgia
                 Institute of Technology, USA",
  cc-class     = "computer-security/internet-security, web-security, URL parsing",
  cc-snippet   = "We also surveyed ∼350 million URLs sampled uniformly and randomly from the approximately 3 billion
                 URLs in Common Crawl's January 2022 URL Index [35]. [35 Kreymer, I., Chuang, G.: Announcing the common
                 crawl index! (2015)]",
}

@Article{cc:KorkmazKoçyiğitŞahingözDiri:2022:hybrid-phishing-detection-system,
  year         = "2022",
  title        = "A Hybrid Phishing Detection System Using Deep Learning-based {URL} and Content Analysis",
  author       = "Korkmaz, Mehmet and Koçyiğit, Emre and Şahingöz, Özgür and Diri, Banu",
  journal      = "Elektronika ir Elektrotechnika",
  volume       = "28",
  number       = "5",
  cc-snippet   = "… With a new dataset (1 million URLs, half of which was obtained from PhishTank and the rest from
                 the CommonCrawl database, and the dataset contains 10,000 images) which researchers used in [21], CNN
                 and LSTM were tested in Intelligent …",
  URL          = "https://www.eejournal.ktu.lt/index.php/elt/article/download/31197/15556",
  cc-author-affiliation = "Yildiz Technical University, Istanbul, Turkey; Biruni University, Istanbul, Turkey",
  cc-class     = "computer-security/internet-security",
}

@InProceedings{cc:Ab-RazakJayaErnawanFirdausEtAl:2022:ML-classifiers-phishing-detection,
  year         = "2022",
  title        = "Comparative Analysis of Machine Learning Classifiers for Phishing Detection",
  author       = "Ab Razak, Mohd Faizal and Jaya, Mohd Izham and Ernawan, Ferda and Firdaus, Ahmad and Nugroho, Fajar
                 Agung",
  booktitle    = "2022 6th International Conference on Informatics and Computational Sciences (ICICoS)",
  pages        = "84--88",
  cc-snippet   = "… The source for this dataset is from the University Malaysia of Sarawak, compiled from PhishTank,
                 OpenPhish, Alexa and Common Crawl. One method for detecting new phishing websites is to utilize
                 heuristics such as the URL and CSS detection …",
  URL          = "https://ieeexplore.ieee.org/abstract/document/9930531/",
  cc-author-affiliation = "Universitas Dian Nuswantoro, Semarang, Indonesia",
  cc-class     = "computer-security/internet-security",
}

@Misc{cc:L-Ranaldi:2022:C-OSINT-COVID-19-Open-Source,
  year         = "2022",
  title        = "{C}-{OSINT}: {COVID}-19 Open Source artificial {INT}elligence framework",
  author       = "Ranaldi, L. and Nourbakhsh, A. and Fallucchi, F. and Zanzotto, F. M.",
  abstract     = "With the emergence of COVID-19 disease worldwide, a market of the products related to this disease
                 formed across the Internet. By the time these goods were in short supply, many uncontrolled Dark Web
                 Marketplaces (DWM) were active in selling these products. At the same time, Dark Web Forums (DWF)
                 became proxies for spreading false ideas, fake news about COVID-19, and advertising products sold in
                 DWMs. This study investigates the activities entertained in the DWMs and DWFs to propose a
                 learning-based model to distinguish them from their related counterparts on the surface web. To this
                 end, we propose a COVID-19 Open Source artificial INTelligence framework (C-OSINT) to automatically
                 collect and classify the activities done in DWMs and DWFs. Moreover, we corporate linguistic and
                 stylistic solutions to leverage the classification performance between the content found in DWMs and
                 DWFs and two surface web sources. Our results show that using syntactic and stylistic representation
                 outperforms the Transformer based results over these domains.",
  URL          = "https://ceur-ws.org/Vol-3260/paper16.pdf",
  cc-author-affiliation = "Guglielmo Marconi University, Roma, Italy; University of Rome Tor Vergata, Roma, Italy",
  cc-class     = "nlp/transformer-language-model, web-science/dark-web",
}

@Misc{cc:LiuRitter:2022:CoNLL-2003-named-entity-taggers-still-work,
  author       = "Liu, Shuheng and Ritter, Alan",
  title        = "Do Co{NLL}-2003 Named Entity Taggers Still Work Well in 2023?",
  year         = "2022",
  doi          = "10.48550/ARXIV.2212.09747",
  URL          = "https://arxiv.org/abs/2212.09747",
  pdf          = "https://arxiv.org/pdf/2212.09747.pdf",
  cc-author-affiliation = "Georgia Institute of Technology",
  cc-class     = "nlp/named-entity-recognition, dataset-creation",
  cc-snippet   = "Our dataset follows this distribution to collect Reuters news articles published between December 5th
                 and 7th, 2020, collected from the Common Crawl Foundation³. [³http://commoncrawl.org/]",
}

@Misc{cc:BoháčekBravanskýTrhlíkMoravec:2022:Czech-news-article-dataset,
  author       = "Boháček, Matyáš and Bravanský, Michal and Trhlík, Filip and Moravec, Václav",
  title        = "Fine-grained Czech News Article Dataset: An Interdisciplinary Approach to Trustworthiness Analysis",
  year         = "2022",
  doi          = "10.48550/ARXIV.2212.08550",
  URL          = "https://arxiv.org/abs/2212.08550",
  pdf          = "https://arxiv.org/pdf/2212.08550.pdf",
  cc-author-affiliation = "Charles University, Prague, Czech Republic; Gymnasium of Johannes Kepler, Prague, Czech
                 Republic; University College London, United Kingdom",
  cc-class     = "nlp/fake-news-detection, dataset-creation",
  cc-snippet   = "Initially, we assembled a collection of almost 94, 000 articles by scraping URLs of 45 Czech news
                 sources obtained from Common Crawl² [²https://commoncrawl.org/]. These sources included mainstream
                 journalistic websites, tabloids, independent news outlets, and websites that are part of the
                 disinformation ecosystem [ 26 ], capturing the full scope of journalistic content in the Czech
                 Republic. [...] We applied multiple filters and balancing mechanisms to mitigate deficiencies caused by
                 inherent flaws in Common Crawl, which reduced the dataset’s size from 94, 000 to 10, 000 items. This
                 way, we also ensured that the data is as representative of the Czech news ecosystem and as diverse as
                 possible.",
}

@Misc{cc:KhanHanna:2022:AI-dataset-accountability,
  author       = "Khan, Mehtab and Hanna, Alex",
  title        = "The Subjects and Stages of {AI} Dataset Development: {A} Framework for Dataset Accountability",
  year         = "2022",
  URL          = "https://ssrn.com/abstract=4217148",
  doi          = "10.2139/ssrn.4217148",
  abstract     = "There has been increased attention toward the datasets that are used to train and build AI
                 technologies from the computer science and social science research communities, but less from legal
                 scholarship. Both Large-Scale Language Datasets (LSLDs) and Large-Scale Computer Vision Datasets
                 (LSCVDs) have been at the forefront of such discussions, due to recent controversies involving the use
                 of facial recognition technologies, and the discussion of the use of publicly-available text for the
                 training of massive models which generate human-like text. Many of these datasets serve as
                 “benchmarks” to develop models that are used both in academic and industry research, while others
                 are used solely for training models. The process of developing LSLDs and LSCVDs is complex and
                 contextual, involving dozens of decisions about what kinds of data to collect, label, and train a model
                 on, as well as how to make the data available to other researchers. However, little attention has been
                 paid to mapping and consolidating the legal issues that arise at different stages of this process: when
                 the data is being collected, after the data is used to build and evaluate models and applications, and
                 how that data is distributed more widely. In this article, we offer four main contributions. First, we
                 describe what kinds of objects these datasets are, how many different kinds exist, what types of
                 modalities they encompass, and why they are important. Second, we provide more clarity about the stages
                 of dataset development – a process that has thus far been subsumed within broader discussions about
                 bias and discrimination – and the subjects who may be susceptible to harms at each point of
                 development. Third, we provide a matrix of both the stages of dataset development and the subjects of
                 dataset development, which traces the connections between stages and subjects. Fourth, we use this
                 analysis to identify some basic legal issues that arise at the various stages in order to foster a
                 better understanding of the dilemmas and tensions that arise at every stage. We situate our discussion
                 within wider discussion of current debates and proposals related to algorithmic accountability. This
                 paper fulfills an essential gap when it comes to comprehending the complicated landscape of legal
                 issues connected to datasets and the gigantic AI models trained on them.",
  cc-snippet   = "D. Common Crawl: Archiving the Whole Web The Common Crawl (CC) dataset is one of the most popular
                 datasets used in the training of what have typically been called large language models. [...]",
  cc-author-affiliation = "Yale Law School, USA; Distributed AI Research Institute",
  cc-class     = "nlp/corpus-construction, dataset-creation, data-governance, privacy, legal/copyright",
}

@Misc{cc:SchuhmannBeaumontVencuGordonEtAl:2022:LAION-5B,
  author       = "Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and
                 Cherti, Mehdi and Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and
                 Schramowski, Patrick and Kundurthy, Srivatsa and Crowson, Katherine and Schmidt, Ludwig and
                 Kaczmarczyk, Robert and Jitsev, Jenia",
  title        = "{LAION}-5{B}: An open large-scale dataset for training next generation image-text models",
  year         = "2022",
  publisher    = "arXiv",
  doi          = "10.48550/ARXIV.2210.08402",
  URL          = "https://arxiv.org/abs/2210.08402",
  cc-snippet   = "By starting from Common Crawl [1] and filtering this data source with an existing CLIP model, we
                 derive a dataset consisting of three parts: 2.32 billion English image-text examples, 2.26 billion
                 multilingual examples, and 1.27 billion examples that are not specific to a particular language (e.g.,
                 places, products, etc.). [...] To extract image-text pairs from Common Crawl, we parse the HTML IMG
                 (image) tags from Common Crawl’s WAT metadata files.⁴ [⁴See
                 https://commoncrawl.org/the-data/get-started/ for details of the metadata format.] Specifically, we
                 focus on images with an alt-text so we can create image-text pair.",
  cc-author-affiliation = "LAION; UC Berkeley, USA; Gentec Data; TU Darmstadt, Germany; Hessian.AI; University of
                 Washington, Seattle, USA; Technical University of Munich, Germany; Stability AI; EleutherAI; Juelich
                 Supercomputing Center (JSC), Germany; Research Center Juelich (FZJ), Germany",
  cc-class     = "nlp/corpus-construction, nlp/multimodal-corpora",
  cc-derived-dataset-about = "LAION-5B",
}

@Misc{cc:NLLB-TeamCosta-jussàCrossEtAl:2022:No-Language-Left-Behind,
  doi          = "10.48550/ARXIV.2207.04672",
  URL          = "https://arxiv.org/abs/2207.04672",
  author       = "{NLLB Team} and Costa-jussà, Marta R. and Cross, James and Çelebi, Onur and Elbayad, Maha and
                 Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and
                 Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi
                 and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett,
                 Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews,
                 Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia
                 and Goswami, Vedanuj and Guzmán, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers,
                 Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff",
  title        = "No Language Left Behind: Scaling Human-Centered Machine Translation",
  publisher    = "arXiv",
  year         = "2022",
  abstract     = "Driven by the goal of eradicating language barriers on a global scale, machine translation has
                 solidified itself as a key focus of artificial intelligence research today. However, such efforts have
                 coalesced around a small subset of languages, leaving behind the vast majority of mostly low-resource
                 languages. What does it take to break the 200 language barrier while ensuring safe, high quality
                 results, all while keeping ethical considerations in mind? In No Language Left Behind, we took on this
                 challenge by first contextualizing the need for low-resource language translation support through
                 exploratory interviews with native speakers. Then, we created datasets and models aimed at narrowing
                 the performance gap between low and high-resource languages. More specifically, we developed a
                 conditional compute model based on Sparsely Gated Mixture of Experts that is trained on data obtained
                 with novel and effective data mining techniques tailored for low-resource languages. We propose
                 multiple architectural and training improvements to counteract overfitting while training on thousands
                 of tasks. Critically, we evaluated the performance of over 40,000 different translation directions
                 using a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity
                 benchmark covering all languages in Flores-200 to assess translation safety. Our model achieves an
                 improvement of 44\% BLEU relative to the previous state-of-the-art, laying important groundwork towards
                 realizing a universal translation system. Finally, we open source all contributions described in this
                 work, accessible at https://github.com/facebookresearch/fairseq/tree/nllb.",
  cc-derived-dataset-about = "NLLB",
  cc-author-affiliation = "Meta AI; UC Berkeley, USA; Johns Hopkins University, USA",
  cc-class     = "nlp/corpus-construction, nlp/parallel-corpus, nlp/low-resource-language, nlp/language-identification",
  cc-snippet   = "We begin with web data as our starting point, provided by CommonCrawl (CC)¹⁸ and ParaCrawl (Bañón et
                 al., 2020).",
}

@Misc{cc:SmithPatwaryNorickLeGresleyEtAl:2022:Megatron-Turing-NLG-530B,
  doi          = "10.48550/ARXIV.2201.11990",
  URL          = "https://arxiv.org/abs/2201.11990",
  author       = "Smith, Shaden and Patwary, Mostofa and Norick, Brandon and LeGresley, Patrick and Rajbhandari, Samyam
                 and Casper, Jared and Liu, Zhun and Prabhumoye, Shrimai and Zerveas, George and Korthikanti, Vijay and
                 Zhang, Elton and Child, Rewon and Aminabadi, Reza Yazdani and Bernauer, Julie and Song, Xia and
                 Shoeybi, Mohammad and He, Yuxiong and Houston, Michael and Tiwary, Saurabh and Catanzaro, Bryan",
  keywords     = "Computation and Language (cs.CL), FOS: Computer and information sciences",
  title        = "Using DeepSpeed and Megatron to Train Megatron-Turing {NLG} 530{B}, {A} Large-Scale Generative
                 Language Model",
  publisher    = "arXiv",
  year         = "2022",
  cc-author-affiliation = "Microsoft; NVIDIA",
  cc-class     = "nlp/language-model",
  cc-snippet   = "Resources such as Common Crawl (CC) provide snapshots of the web which can be utilized as a source of
                 language data. While these data sources contain an enormous amount of language data, they also require
                 carefully designed preprocessing steps in order to select data which is of reasonable quality. As prior
                 work has found (e.g., [9]), the quality of unfiltered Common Crawl data is lower than that of curated
                 datasets and steps should be taken to increase the average quality of data selected from Common Crawl
                 for LM pretraining. [...] Common Crawl: As mentioned previously, Common Crawl comprises an immense
                 amount of data. We chose to process two snapshots, 2020-50 and 2021-04, with the aim of acquiring
                 around 150B tokens of training data. The first step of this process is language detection [11] and text
                 extraction from the raw HTML included in the Common Crawl WARC files¹. Following the rationale
                 presented in [11], we used the pycld2² and jusText³ libraries for these tasks. [...] In addition to
                 Common Crawl data, we leveraged a number of other previously generated datasets. From The Pile, we
                 selected Books3, OpenWebText2, Stack Exchange, PubMed Abstracts, Wikipedia, Gutenberg (PG-19),
                 BookCorpus2, NIH ExPorter, and Pile-CC datasets. We also included the CC-Stories and RealNews datasets
                 used to train Megatron [63].",
}

@InProceedings{cc:AlbyJäschke:2022:top-websites,
  author       = "Alby, Tom and Jäschke, Robert",
  editor       = "Silvello, Gianmaria and Corcho, Oscar and Manghi, Paolo and Di Nunzio, Giorgio Maria and Golub,
                 Koraljka and Ferro, Nicola and Poggi, Antonella",
  title        = "Analyzing the Web: Are Top Websites Lists a Good Choice for Research?",
  booktitle    = "Linking Theory and Practice of Digital Libraries",
  year         = "2022",
  publisher    = "Springer International Publishing",
  address      = "Cham",
  pages        = "11--25",
  abstract     = "The web has been a subject of research since its beginning, but it is difficult if not impossible to
                 analyze the whole web, even if a database of all URLs would be freely accessible. Hundreds of studies
                 have used commercial top websites lists as a shortcut, in particular the Alexa One Million Top Sites
                 list. However, apart from the fact that Amazon decided to terminate Alexa, we question the usefulness
                 of such lists for research as they have several shortcomings. Our analysis shows that top sites lists
                 miss frequently visited websites and offer only little value for language-specific research. We present
                 a heuristic-driven alternative based on the Common Crawl host-level web graph while also taking
                 language-specific requirements into account.",
  ISBN         = "978-3-031-16802-4",
  doi          = "10.1007/978-3-031-16802-4_2",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-031-16802-4_2",
  cc-author-affiliation = "Humboldt-Universität zu Berlin, Berlin, Germany",
  cc-dataset-used = "hyperlinkgraph/cc-main-2021-feb-apr-may/hostgraph",
  cc-class     = "web-science, domain-ranking",
}

@Article{cc:Belz:2022:schema-org-in-e-commerce,
  title        = "Use of schema.org micro-markup in e-commerce projects",
  year         = "2022",
  doi          = "10.30525/2661-5150/2022-4-1",
  author       = "Olexandra Belz",
  URL          = "http://baltijapublishing.lv/index.php/threeseas/article/view/1964/1973",
  cc-author-affiliation = "Ivan Franko National University of Lviv, Ukraine",
  abstract     = "The purpose of the article is to identify the most effective schema.org micro-markup schemes used in
                 e-commerce projects. Methodology. The research included competitive intelligence among the leading
                 online platforms operating in Europe in general and in Ukraine in particular. The study involved TOP-8
                 e-commerce projects in Ukraine and TOP-9 global cross-border marketplaces operating in Europe. The
                 service validator.schema.org was chosen as the research tool. Results. The study showed that the most
                 popular schema.org micro-markup format is JSON-LD. In general, 82.4\% of the surveyed sites use JSON-LD
                 microdata format. Some sites use two microdata formats: JSON-LD and Microdata. But none of the top
                 online marketplaces use the RDFa micro-markup format. Popular marketplaces operating in Ukraine and
                 Europe often use the same types of schema.org vocabulary. However, the frequency of using micro-markup
                 by top marketplaces operating in Ukraine is much higher than the frequency of using micro-markup by top
                 marketplaces operating in Europe. In addition, Ukrainian marketplaces use a much wider list of
                 schema.org micro-markup properties than marketplaces operating in Europe. However, no online store has
                 implemented the properties of advantages and disadvantages of goods recommended by Google in the
                 scheme. Practical implications. The study suggests schema.org micro-markup schemes for homepage,
                 category page, product page, about page, payment and delivery page, warranty and returns page, contact
                 page and blog. The proposed templates of micro-markup schemes were validated using the
                 validator.schema.org service. The study recommends using the JSON-LD format for semantic markup of
                 website content. Value/originality. Implementation of effective semantic markup of site content will
                 allow search engines to more accurately identify the information presented on the site. This, in turn,
                 will improve the visibility of the online marketplace in the Search Engine Results Page of Google,
                 Bing, Yahoo! etc.",
  cc-class     = "e-commerce, online marketplaces, linked data, schema.org annotations, SEO",
  cc-derived-dataset-used = "WebDataCommons",
  cc-snippet   = "Since 2008, the Common Crawl project has been crawling websites to collect web page data (extracting
                 metadata and web page text). At the time of writing, the latest scan took place from November 26 to
                 December 10, 2022. As a result of this scan, 3.35 billion web pages were processed and 420 petabytes of
                 content were removed (Common Crawl, 2022). Both scientists and practitioners are working with the
                 obtained data sets of the Common Crawl project.¶ On September 22, 2022, the Web Data Commons (WDC)
                 project released the Schema.org Table Annotation Benchmark (SOTAB) for public download (Web Data
                 Commons, 2022).",
}

@Misc{cc:Minwoo-ByeonKim:2022:COYO-700m-image-text-pair-dataset,
  author       = "Minwoo Byeon and Beomhee Park and Haecheon Kim and Sungjun Lee and Woonhyuk Baek and Saehoon Kim",
  title        = "{COYO}-700{M}: Image-Text Pair Dataset",
  year         = "2022",
  cc-derived-dataset-about = "COYO-700M",
  URL          = "https://github.com/kakaobrain/coyo-dataset",
  url-2nd      = "https://kakaobrain.com/contents?contentId=7eca73e3-3089-43cb-b701-332e8a1743fd",
  cc-author-affiliation = "Kakao Brain, South Korea",
  cc-class     = "nlp/multimodal-corpora",
  cc-dataset-used = "Common Crawl from Oct. 2020 to Aug. 2021",
  abstract     = "We collected about 10 billion pairs of alt-text and image source in HTML documents in Common Crawl
                 from Oct. 2020 to Aug. 2021. and eliminated uninformative pairs through the image and text level
                 filtering process with minimal cost. The following figure outlines our data collection procedure.",
}

@Misc{cc:ThoppilanFreitasHallShazeerEtAl:2022:LaMDA,
  title        = "La{MDA}: Language Models for Dialog Applications",
  author       = "Romal Thoppilan and De Freitas, Daniel and Jamie Hall and Noam Shazeer and Apoorv Kulshreshtha and
                 Heng-Tze Cheng and Alicia Jin and Taylor Bos and Leslie Baker and Yu Du and YaGuang Li and Hongrae Lee
                 and Huaixiu Steven Zheng and Amin Ghafouri and Marcelo Menegali and Yanping Huang and Maxim Krikun and
                 Dmitry Lepikhin and James Qin and Dehao Chen and Yuanzhong Xu and Zhifeng Chen and Adam Roberts and
                 Maarten Bosma and Vincent Zhao and Yanqi Zhou and Chung-Ching Chang and Igor Krivokon and Will Rusch
                 and Marc Pickett and Pranesh Srinivasan and Laichee Man and Kathleen Meier-Hellstern and Meredith
                 Ringel Morris and Tulsee Doshi and Delos Santos, Renelito and Toju Duke and Johnny Soraker and Ben
                 Zevenbergen and Vinodkumar Prabhakaran and Mark Diaz and Ben Hutchinson and Kristen Olson and Alejandra
                 Molina and Erin Hoffman-John and Josh Lee and Lora Aroyo and Ravi Rajakumar and Alena Butryna and
                 Matthew Lamm and Viktoriya Kuzmina and Joe Fenton and Aaron Cohen and Rachel Bernstein and Ray Kurzweil
                 and Blaise Aguera-Arcas and Claire Cui and Marian Croak and Ed Chi and Quoc Le",
  year         = "2022",
  eprint       = "2201.08239",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  URL          = "https://arxiv.org/abs/2201.08239",
  cc-derived-dataset-used = "Tensorflow-C4",
  cc-snippet   = "E Pre-training data composition¶ The pre-training data, called Infiniset, is a combination of dialog
                 data from public dialog data and other public web documents. It consists of 2.97B documents and 1.12B
                 dialogs with 13.39B utterances. The composition of the data is as follows: 50\% dialogs data from
                 public forums; 12.5\% C4 data [11]; 12.5\% code documents from sites related to programming like Q&A
                 sites, tutorials, etc; 12.5\% Wikipedia (English); 6.25\% English web documents; and 6.25\% Non-English
                 web documents. The total number of words in the dataset is 1.56T. Note that this composition was chosen
                 to achieve a more robust performance on dialog tasks (Section 4) while still keeping its ability to
                 perform other tasks like code generation. As future work, we can study how the choice of this
                 composition may affect the quality of some of the other NLP tasks performed by the model.",
  cc-author-affiliation = "Google",
  cc-class     = "nlp/language-model, nlp/transformer-language-model",
}

@Article{cc:PhillipsAlam:2022:EOT-cloud,
  author       = "Phillips, Mark Edward and Alam, Sawood",
  title        = "Moving the End of Term Web Archive to the Cloud to Encourage Research Use and Reuse",
  year         = "2022",
  URL          = "https://digital.library.unt.edu/ark:/67531/metadc1998717/m2/1/high_res_d/EOT_WADL_2022.pdf",
  abstract     = "The End of Term Web (EOT) Archive is a collaborative project with a goal of collecting the United
                 States federal web, loosely defined as .gov and .mil, every four years coinciding with presidential
                 elections and often a transition in the Executive Branch of the government. In 2021 the End of Term
                 team began to process the longitudinal web archive for EOT-2008, EOT-2012, EOT-2016, and EOT-2020 to
                 move into the Amazon S3 storage service as part of the Amazon Open Data Program. This effort adopted
                 tools, structures, and documentation developed by Common Crawl in an effort to maximize potential
                 research access and reuse of existing tools and documentation. This paper presents the process of
                 organizing, staging, processing, and moving these collections into the Amazon cloud.",
  cc-author-affiliation = "University of North Texas, USA; Internet Archive, USA",
  cc-class     = "web archive",
}

@TechReport{cc:AddaBraffortVasilescuYvon:2022:report-French-language,
  author       = "Adda, Gilles and Braffort, Annelies and Vasilescu, Ioana and Yvon, François",
  year         = "2022",
  title        = "Deliverable {D1}.14 Report on the French Language. European Language Equality ({ELE}); {EU} project
                 no. {LC}-01641480 – 101018166",
  URL          = "https://european-language-equality.eu/wp-content/uploads/2022/03/ELE___Deliverable_D1_14__Language_Report_French_.pdf",
  cc-snippet   = "The CommonCrawl project³⁷ [³⁷https://commoncrawl.org/] aggregates Web crawled data that is
                 orders or magnitude larger than these resources for many languages; furthermore this corpus is being
                 updated on a regular basis. By using parts of the French subset of CommonCrawl, possibly conjoined with
                 the more curated corpora alluded to above has enabled to train large-scale BERT-style Language Models
                 (LMs) – FlauBERT (Le et al., 2020) is built with a corpus containing about 12B running words,
                 CamemBERT (Martin et al., 2020) uses the 22B words OSCAR, and these numbers continue to grow, albeit at
                 a much slower pace than the corresponding English corpora.",
  cc-author-affiliation = "Université Paris-Saclay, CNRS, LISN, Paris, France",
  cc-class     = "nlp/resources, French, nlp/language-model, nlp/text-corpora",
}

@Misc{cc:Nagel:2022:10-years-in-the-cloud,
  type         = "Presentation",
  title        = "{Common} {Crawl} – Experiences From 10 Years in the Cloud",
  copyright    = "Public",
  URL          = "https://digital.library.unt.edu/ark:/67531/metadc1983147/",
  abstract     = "Presentation for the IIPC General Assembly and Web Archiving Conference virtually held on May 23-25,
                 2022. This presentation gives an overview of how the Common Crawl web data is used in and outside the
                 cloud over the past ten years that the dataset has been hosted as part of Amazon Web Services’ Open
                 Data Sponsorships program.",
  language     = "English",
  urldate      = "2025-11-21",
  howpublished = "2022 International Internet Preservation Consortium (IIPC) Web Archiving Conference, May 23-25,
                 2022",
  author       = "Nagel, Sebastian",
  month        = may,
  year         = "2022",
  cc-author-affiliation = "Common Crawl Foundation, USA",
  cc-class     = "web-archiving/storage, web-archiving/open-dataset",
}