% bib/cc2023.bib -- from the commoncrawl/cc-citations repository on GitHub

@Article{cc:SafiSingh:2023:phishing-website-detection,
  title        = "A Systematic Literature Review on Phishing Website Detection Techniques",
  journal      = "Journal of King Saud University - Computer and Information Sciences",
  year         = "2023",
  ISSN         = "1319-1578",
  DOI          = "10.1016/j.jksuci.2023.01.004",
  URL          = "https://www.sciencedirect.com/science/article/pii/S1319157823000034",
  author       = "Safi, Asadullah and Singh, Satwinder",
  keywords     = "Phishing, Phishing Detection, Deep Learning, Cyber Security, Machine Learning",
  abstract     = "Phishing is a fraud attempt in which an attacker acts as a trusted person or entity to obtain
                 sensitive information from an internet user. In this Systematic Literature Survey (SLR), different
                 phishing detection approaches, namely Lists Based, Visual Similarity, Heuristic, Machine Learning, and
                 Deep Learning based techniques, are studied and compared. For this purpose, several algorithms, data
                 sets, and techniques for phishing website detection are revealed with the proposed research questions.
                 A systematic Literature survey was conducted on 80 scientific papers published in the last five years
                 in research journals, conferences, leading workshops, the thesis of researchers, book chapters, and
                 from high-rank websites. The work carried out in this study is an update in the previous systematic
                 literature surveys with more focus on the latest trends in phishing detection techniques. This study
                 enhances readers' understanding of different types of phishing website detection techniques, the data
                 sets used, and the comparative performance of algorithms used. Machine Learning techniques have been
                 applied the most, i.e., 57 as per studies, according to the SLR. In addition, the survey revealed that
                 while gathering the data sets, researchers primarily accessed two sources: 53 studies accessed the
                 PhishTank website (53 for the phishing data set) and 29 studies used Alexa's website for downloading
                 legitimate data sets. Also, as per the literature survey, most studies used Machine Learning
                 techniques; 31 used Random Forest Classifier. Finally, as per different studies, Convolution Neural
                 Network (CNN) achieved the highest Accuracy, 99.98\%, for detecting phishing websites.",
  cc-snippet   = "[phishing website detection research relying] Common Crawl (Rao et al., 2019); (Rashid et al., 2020) ;
                 (Geyik et al., 2021) ; (Korkmaz and Sahingoz, 2020) ; (Chiew et al., 2019) ; (Feng and Yue, 2020) ;
                 (Wei et al., 2020)",
  cc-author-affiliation = "Nangarhar University, Afghanistan; Central University of Punjab, Bathinda, Punjab, India",
  cc-class     = "computer-security/internet-security, web-security",
}

@Misc{cc:GoldsteinSastryMusserDiRestaEtAl:2023:generative-language-models-threads-and-mitigations,
  DOI          = "10.48550/ARXIV.2301.04246",
  URL          = "https://arxiv.org/abs/2301.04246",
  author       = "Goldstein, Josh A. and Sastry, Girish and Musser, Micah and DiResta, Renee and Gentzel, Matthew and
                 Sedova, Katerina",
  keywords     = "Computers and Society (cs.CY), FOS: Computer and information sciences",
  title        = "Generative Language Models and Automated Influence Operations: Emerging Threats and Potential
                 Mitigations",
  publisher    = "arXiv",
  year         = "2023",
  cc-author-affiliation = "Georgetown University’s Center for Security and Emerging Technology, USA; OpenAI; Stanford
                 Internet Observatory, USA",
  cc-class     = "nlp/generative-language-models, ai/ethics-of-machine-learning, cc-cited-not-used",
  cc-snippet   = "While some of this data is typically taken from relatively structured sources such as Wikipedia, a
                 large majority of data usually comes from tools like Common Crawl that scrape the web for publicly
                 available text.¹⁴⁷ [147. CommonCrawl freely publishes its archives of web data. See “So you’re
                 ready to get started.,” Common Crawl, accessed June 27, 2022,
                 https://commoncrawl.org/the-data/get-started/. But anyone can build their own software for web scraping
                 or use other tools to extract data from websites.]",
}

@PhdThesis{cc:Wang:2023:large-web-archive-collection,
  author       = "Wang, Xinyue",
  title        = "Large Web Archive Collection Infrastructure and Services",
  school       = "Virginia Tech",
  year         = "2023",
  URL          = "http://hdl.handle.net/10919/113345",
  pdf          = "https://vtechworks.lib.vt.edu/bitstream/handle/10919/113345/Wang_X_D_2023.pdf",
  abstract     = "The web has evolved to be the primary carrier of human knowledge during the information age. The
                 ephemeral nature of much web content makes web knowledge preservation vital in preserving human
                 knowledge and memories. Web archives are created to preserve the current web and make it available for
                 future reuse. In addition to its preservation purpose, web archive data is also used as a source for
                 research and for lost information discovery. However, the reuse of web archive data is inherently
                 challenging because of the scale of data size and requirements of big data tools to serve and analyze
                 web archive data efficiently. In this research, we propose to build a web archive big data processing
                 infrastructure that can support efficient and scalable web archive reuse like quantitative data
                 analysis and browsing services. We adopt industry frameworks and tools to establish a platform that can
                 provide high-performance computation for web archive initiatives and users. We propose to convert the
                 standard web archive data file format to a columnar data format for efficient future reuse. Our
                 experiments show that our proposed design can significantly improve quantitative data analysis tasks
                 for common web archive data usage. Our design can also serve an efficient web browsing service without
                 adopting a sophisticated web hosting architecture. In addition to the standard web archive data, we
                 also integrate Twitter data into our design as a unique web archive resource. Twitter is a prominent
                 source of data for researchers in a variety of fields and an integral element of the web's history. We
                 aggregate the Twitter data from different sources and integrate it into the suggested design for reuse.
                 We are able to greatly increase the processing performance of workloads around social media data by
                 overcoming the data loading bottleneck with a web-archive-like Parquet data format.",
  cc-author-affiliation = "Virginia Tech, USA",
  cc-class     = "web-archiving, data formats, big data, data processing, WARC, Parquet, CDX",
  cc-snippet   = "We use Common Crawl’s web archiving data crawled from May 20 to 23, 2018. The data set consists of
                 1219 Gzip compressed WARC files totaling 0.98 TB, and contains 53,324,440 records. The WARC files are
                 organized by crawling time, each containing records crawled from a mutually exclusive time span. We
                 then reformat the WARC files to yield the following five datasets for comparison: 1) the original WARC
                 files; 2) case 1 plus CDX index files built against all the original WARC files; 3) Parquet files
                 containing the same information as case 1, with most columns in String type; 4) the same as case 3 but
                 the Timestamp column in INT64 Timestamp type; 5) Avro, [...]",
}

@Misc{cc:Terzis:2023:programmable-commons,
  title        = "Building Programmable Commons",
  author       = "Terzis, Petros",
  year         = "2023",
  publisher    = "SocArXiv",
  URL          = "https://osf.io/preprints/socarxiv/yuef5/",
  DOI          = "10.31235/osf.io/yuef5",
  cc-author-affiliation = "University College London, United Kingdom",
  cc-class     = "digital-commons, public-commons, cc-cited-not-used",
  cc-snippet   = "Programmable commons and the public value of programmability are thus introduced as parts of a broader
                 political project that aspires to democratise access to, and management of these resources. By drawing
                 on the history of a family of commons -namely intellectual commons, infrastructure commons, and global
                 commons-, this paper explores the material form and impact of infocomputational technologies and
                 presents a blend of bottom-up and top-down initiatives for their commons-based organisation and
                 governance.",
}

@Misc{cc:HanleyKumarDurumeric:2023:conspiracy-theories,
  author       = "Hanley, Hans W. A. and Kumar, Deepak and Durumeric, Zakir",
  title        = "A Golden Age: Conspiracy Theories' Relationship with Misinformation Outlets, News Media, and the Wider
                 Internet",
  publisher    = "arXiv",
  year         = "2023",
  DOI          = "10.48550/ARXIV.2301.10880",
  URL          = "https://arxiv.org/abs/2301.10880",
  abstract     = "Do we live in a {"}Golden Age of Conspiracy Theories?{"} In the last few decades, conspiracy theories
                 have proliferated on the Internet with some having dangerous real-world consequences. A large
                 contingent of those who participated in the January 6th attack on the US Capitol believed fervently in
                 the QAnon conspiracy theory. In this work, we study the relationships amongst five prominent conspiracy
                 theories (QAnon, COVID, UFO/Aliens, 9-11, and Flat-Earth) and each of their respective relationships to
                 the news media, both mainstream and fringe. Identifying and publishing a set of 755 different
                 conspiracy theory websites dedicated to our five conspiracy theories, we find that each set often
                 hyperlinks to the same external domains, with COVID and QAnon conspiracy theory websites largest amount
                 of shared connections. Examining the role of news media, we further find that not only do outlets known
                 for spreading misinformation hyperlink to our set of conspiracy theory websites more often than
                 mainstream websites but this hyperlinking has increased dramatically between 2018 and 2021, with the
                 advent of QAnon and the start of COVID-19 pandemic. Using partial Granger-causality, we uncover several
                 positive correlative relationships between the hyperlinks from misinformation websites and the
                 popularity of conspiracy theory websites, suggesting the prominent role that misinformation news
                 outlets play in popularizing many conspiracy theories.",
  cc-author-affiliation = "Stanford University, USA",
  cc-class     = "nlp/fake-news-detection, misinformation, disinformation, conspiracy theories,
                 web-science/hyperlinkgraph",
  cc-snippet   = "Using our own web scrapes and pages historically scraped by Common Crawl,¹
                 [¹https://commoncrawl.org/] we then document the state and the changing behaviors of the conspiracy
                 theory ecosystem and their relationship to a separate set of 530 known misinformation outlets, 565
                 authentic news websites, and 528 non-news websites. [...] Utilizing the Common Crawl harmonic and
                 PageRank centrality measures that measure a website’s centrality across all of the crawled Internet,
                 we then find many of the websites in our dataset have relatively high network centrality, suggesting
                 that many of them are not peripheral on the Internet but actually near the Internet’s core/are
                 mainstream. Indeed examining, the hyperlink connections between news media and these conspiracy
                 theories, we find that many of them rely heavily on mainstream as well as misinformation outlets
                 (compared to non-news websites) for their information, with many popular misinformation outlets also
                 hyperlinking back to many of these conspiracy theory websites. [...] 4.1 Common Crawl Page Retrieval
                 and Website Crawling To gather the set of hyperlinks between our websites, we utilize Common Crawl data
                 [92]—widely considered the most complete publicly available source of web crawl data—and our own
                 website crawls. For each website in our dataset, we collect all the domain’s HTML pages that were
                 indexed by Common Crawl before August 2021. In addition to Common Crawl data, we further utilize our
                 own website scrapes. We utilize our own crawls, in addition to Common Crawl, due to noisiness, missing
                 pages, and missing domains within the Common Crawl dataset [85]. For example, 309 particularly small
                 conspiracy theory domains were not contained within the Common Crawl dataset (i.e. these websites often
                 only contained a few dozen pages). Thus for each website in our dataset, we further gather all the HTML
                 pages 10 hops from each website’s homepage (i.e., we collect all URLs linked from the homepage (1st
                 hop), then all URLs linked from the pages that were linked by the homepage (2nd hop), and so forth).
                 For each HTML page from our scrapes and Common Crawl, we parse the HTML, detect the date that page was
                 published, and collect hyperlinks to other pages (i.e., HTML <a> tags). Altogether we gather the
                 available Common Crawl pages and scrape the HTML for our 755 conspiracy theory, 530 misinformation, 565
                 authentic news, and 528 non-news websites. [...] Utilizing Common Crawl network data [ 61] over the
                 indexed Internet (87.7 million websites), we thus determine the network centrality of our set of
                 conspiracy-focused websites to understand if each conspiracy theory website category is “core”
                 (regularly utilized on the Internet) or “peripheral”. We utilize centralities across Common
                 Crawl’s dataset rather than our partial one in order to get a sense of each conspiracy theory’s
                 centrality on the entire Internet. While only 446 of our conspiracy theory websites are within the
                 Common Crawl dataset, this analysis allows us to fully understand the relative roles that each
                 conspiracy theory website group in our dataset plays on the wider Internet.",
}

@Misc{cc:PeetersDerBizer:2023:WDC-products,
  author       = "Peeters, Ralph and Der, Reng Chiz and Bizer, Christian",
  title        = "{WDC} Products: {A} Multi-Dimensional Entity Matching Benchmark",
  publisher    = "arXiv",
  year         = "2023",
  DOI          = "10.48550/ARXIV.2301.09521",
  URL          = "https://arxiv.org/abs/2301.09521",
  cc-author-affiliation = "University of Mannheim, Germany",
  cc-class     = "semantic-web, semantic-web/microformats, e-commerce, linked data, schema.org annotations",
  cc-dataset-used = "CC-MAIN-2020-40",
  cc-snippet   = "The first step of the pipeline is the extraction of large amounts of product offers from the Common
                 Crawl⁴ [⁴https://commoncrawl.org/] using schema.org annotations. Some product offers contain
                 product identifiers like MPNs and GTINs which allow us to group offers into [...] The Web Data Commons6
                 project regularly extracts schema.org annotations from the Common Crawl, the largest web corpus
                 available to the public, in order to monitor the adoption of semantic annotations on the Web and to
                 provide the extracted data for public download. The WDC Products benchmark uses product offers from the
                 WDC Product Data Corpus V2020 (PDC2020)7. The corpus was created by extracting schema.org product data
                 from the September 2020 version of the Common Crawl. The extracted data goes through a pipeline of
                 cleansing steps such as removing offers from listing pages as well as advertisements that are contained
                 in a page in addition to the main offer [31]. The resulting PDC2020 corpus consists of ∼98 million
                 product offers originating from 603,000 websites.",
}

@Misc{cc:Amatriain:2023:transformer-models-catalog,
  author       = "Xavier Amatriain",
  title        = "Transformer models: an introduction and catalog",
  publisher    = "arXiv",
  year         = "2023",
  DOI          = "10.48550/ARXIV.2302.07730",
  URL          = "https://arxiv.org/abs/2302.07730",
  pdf          = "https://arxiv.org/pdf/2302.07730.pdf",
  cc-author-affiliation = "amatriain.net",
  cc-class     = "nlp/language-model, nlp/transformer-language-model, nlp/multi-modal-language-model",
}

@Misc{cc:CarliniJagielskiChoquette-ChooPalekaEtAl:2023:poisoning-web-scale-training-datasets,
  DOI          = "10.48550/ARXIV.2302.10149",
  URL          = "https://arxiv.org/abs/2302.10149",
  pdf          = "https://arxiv.org/pdf/2302.10149.pdf",
  author       = "Carlini, Nicholas and Jagielski, Matthew and Choquette-Choo, Christopher A. and Paleka, Daniel and
                 Pearce, Will and Anderson, Hyrum and Terzis, Andreas and Thomas, Kurt and Tramèr, Florian",
  keywords     = "Cryptography and Security (cs.CR), Machine Learning (cs.LG), FOS: Computer and information sciences",
  title        = "Poisoning Web-Scale Training Datasets is Practical",
  publisher    = "arXiv",
  year         = "2023",
  cc-author-affiliation = "Google; ETH Zurich, Switzerland; NVIDIA; Robust Intelligence",
  cc-class     = "nlp/corpus-construction, computer-security, nlp/language-model, nlp/transformer-language-model,
                 nlp/multi-modal-language-model",
  abstract     = "Deep learning models are often trained on distributed, webscale datasets crawled from the internet. In
                 this paper, we introduce two new dataset poisoning attacks that intentionally introduce malicious
                 examples to a model's performance. Our attacks are immediately practical and could, today, poison 10
                 popular datasets. Our first attack, split-view poisoning, exploits the mutable nature of internet
                 content to ensure a dataset annotator's initial view of the dataset differs from the view downloaded by
                 subsequent clients. By exploiting specific invalid trust assumptions, we show how we could have
                 poisoned 0.01\% of the LAION-400M or COYO-700M datasets for just \$60 USD. Our second attack,
                 frontrunning poisoning, targets web-scale datasets that periodically snapshot crowd-sourced content --
                 such as Wikipedia -- where an attacker only needs a time-limited window to inject malicious examples.
                 In light of both attacks, we notify the maintainers of each affected dataset and recommended several
                 low-overhead defenses.",
  cc-snippet   = "B.3 Common Crawl Common Crawl is a petabyte-scale corpus of web crawl data that is repeatedly captured
                 on a roughly monthly basis. Each archive is a complete re-crawl of the internet that records the full
                 activity, including all requests of the crawler and the host responses—with both HTTP headers and
                 content. As such, each archive contains a static snapshot of all crawled pages at the time of visit.
                 This may include new page content not seen during a previous crawl, and may exclude content that has
                 become stale since the previous crawl. For example, data crawled during September 24 through October 8,
                 2022 contains 3.15 billion web pages with 380 TiB of uncompressed content from 34 million registered
                 domains—1.3 billion URLs were not visited in any of the prior crawls.¹⁴ The Common Crawl dataset
                 is vulnerable to an attack which is similar to both our frontrunning and split-view poisoning attacks.
                 The adversary can purchase an expired domain which was previously contained in the Common Crawl, and it
                 will be re-crawled with the adversary’s choice of content, which will then appear in subsequent
                 Common Crawl snap- shots. Notice that, differently from the snapshot-poisoning attack on Wikipedia,
                 there is no content moderation here and so the adversary simply needs to continue to control the domain
                 to poison all future Common Crawl snapshots. Buying recently-expired domains that existed in previous
                 Common Crawl snapshots allows a stronger form of attack where the attack can inject entirely new links
                 into the crawl. This can be accomplished by adding links or subdomains to poisoned domains, and
                 allowing the crawler to discover the new poisoned domains. Thus, an adversary may inject arbitrarily
                 many pages into the Common Crawl dataset, not only from the originally expired subset. We do not
                 implement this attack following our ethics statements outlined earlier. Since Common Crawl WARC files
                 have been hosted by Amazon on a AWS Athena (serverless service)¹⁵, domain reconnaissance work to
                 analyze URLs is inexpensive. Scanning through 10 years of Common Crawl data to analyze domains from
                 popular TLDs and high number of Common Crawl entries cost us USD\$ 0.84. While additional analysis might
                 somewhat increase this cost, it remains an inexpensive way to search for vulnerable domains. Buying
                 recently expired domains, or domains that have a dangling DNS record with an active IP address is
                 preferred, as domains that failed to return a 200-OK status in consecutive crawls seem to be moved to a
                 lower priority. For example, among expired domains we purchased, just one domain accounts for more than
                 90\% of all status codes among the purchased domains, while other domains we purchased as early as
                 12/20/2020 have seen relatively less scraping traffic across a 3 year period.16 Because Common Crawl is
                 enormous and uncurated (to accurately reflect the content of the internet) poisoning all of Common
                 Crawl is impractical due to size. Additionally, it is not always apparent how consumers of this data
                 are process- ing it for downstream machine learning tasks. However, there exist many derivative
                 datasets which are constructed by curating a relevant subset of the Common Crawl. This includes the
                 LAION-5B image dataset [57], the text dataset known as the Pile [23], the multilingual text dataset
                 CC-100 [78], and the CCMatrix dataset [61], a translation dataset of pairs of translated sentences.
                 Such curation actually amplifies the power of an attack: an attack which adds 1MB of text to the Common
                 Crawl would be poisoning a 2.5 · 10−9 fraction of the Common Crawl, but if this text bypasses the
                 curation done for the CC-100 dataset, it could instead poison a 1.2 · 10−5 fraction of the English
                 corpus, or even a full 9.1\% of the Oromo corpus.",
}

@Misc{cc:HuangDongWangHaoEtAl:2023:language-is-not-all-you-need,
  author       = "Huang, Shaohan and Dong, Li and Wang, Wenhui and Hao, Yaru and Singhal, Saksham and Ma, Shuming and
                 Lv, Tengchao and Cui, Lei and Mohammed, Owais Khan and Liu, Qiang and Aggarwal, Kriti and Chi, Zewen
                 and Bjorck, Johan and Chaudhary, Vishrav and Som, Subhojit and Song, Xia and Wei, Furu",
  title        = "Language Is Not All You Need: Aligning Perception with Language Models",
  publisher    = "arXiv",
  year         = "2023",
  DOI          = "10.48550/ARXIV.2302.14045",
  URL          = "https://arxiv.org/abs/2302.14045",
  cc-snippet   = "Text Corpora We train our model with The Pile [GBB+20] and Common Crawl (CC). The Pile is a massive
                 English text dataset built for training large-scale language models, which is produced from a variety
                 of data sources. We exclude data splits from GitHub, arXiv, Stack Exchange, and PubMed Central. We also
                 include the Common Crawl snapshots (2020-50 and 2021-04) datasets, CC-Stories, and RealNews datasets
                 [SPP+19 , SPN+22]. The entire datasets have been purged of duplicate and near-duplicate documents, as
                 well as filtered to exclude downstream task data. Refer to Appendix B.1.1 for detailed descriptions of
                 training text corpora.¶ Image-Caption Pairs The image-caption pairs are constructed from several
                 datasets, including English LAION-2B [ SBV+22 ], LAION-400M [ SVB+21], COYO-700M [BPK+22 ], and
                 Conceptual Captions [ SDGS18, CSDS21]. English LAION-2B, LAION-400M, and COYO-700M are collected from
                 web pages of the Common Crawl web data by extracting image sources and the corresponding alt-text.
                 Conceptual Captions are also from internet web pages. More details can be found in Appendix B.1.2. ¶
                 Interleaved Image-Text Data We collect interleaved multimodal data from the Common Crawl snapshot,
                 which is a publicly available archive of web pages. We use a filtering process to select about 71M web
                 pages from the original 2B web pages in the snapshot. We then extract the text and images from the HTML
                 of each selected web page. For each document, we limit the number of images to five to reduce noise and
                 redundancy. We also randomly discard half of the documents that only have one image to increase the
                 diversity. We provide more details about the data collection process in Appendix B.1.3. By using this
                 corpus, we enable KOSMOS-1 to handle interleaved text and image and improve its few-shot ability.",
  cc-author-affiliation = "Microsoft",
  cc-class     = "nlp/language-model, nlp/transformer-language-model, nlp/multi-modal-language-model",
  cc-dataset-used = "CC-MAIN-2020-50, CC-MAIN-2021-04",
  cc-derived-dataset-used = "The-Pile-English, CC-Stories, RealNews, LAION-400M, LAION-2B, COYO-700M",
}

@Misc{cc:TouvronLavrilIzacardMartinetEtAl:2023:LLaMA,
  DOI          = "10.48550/ARXIV.2302.13971",
  URL          = "https://arxiv.org/abs/2302.13971",
  pdf          = "https://arxiv.org/pdf/2302.13971.pdf",
  author       = "Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne
                 and Lacroix, Timothée and Rozière, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and
                 Rodriguez, Aurelien and Joulin, Armand and Grave, Edouard and Lample, Guillaume",
  keywords     = "Computation and Language (cs.CL), FOS: Computer and information sciences",
  title        = "{LL}a{MA}: Open and Efficient Foundation Language Models",
  publisher    = "arXiv",
  year         = "2023",
  abstract     = "We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We
                 train our models on trillions of tokens, and show that it is possible to train state-of-the-art models
                 using publicly available datasets exclusively, without resorting to proprietary and inaccessible
                 datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is
                 competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the
                 research community.",
  cc-author-affiliation = "Meta AI",
  cc-class     = "nlp/language-model, nlp/transformer-language-model",
  cc-dataset-used = "five CommonCrawl dumps, ranging from 2017 to 2020",
  cc-derived-dataset-used = "Tensorflow-C4",
  cc-snippet   = "English CommonCrawl [67\%]. We preprocess five CommonCrawl dumps, ranging from 2017 to 2020, with the
                 CCNet pipeline (Wenzek et al., 2020). This process deduplicates the data at the line level, performs
                 language identification with a fastText linear classifier to remove non-English pages and filters low
                 quality content with an n-gram language model. In addition, we trained a linear model to classify pages
                 used as references in Wikipedia v.s. randomly sampled pages, and discarded pages not classified as
                 references.",
}

@PhdThesis{cc:Ammar:2023:dynamic-graph-processing,
  year         = "2023",
  title        = "Systems and Algorithms for Dynamic Graph Processing",
  author       = "Khaled Ammar",
  school       = "University of Waterloo",
  URL          = "https://uwspace.uwaterloo.ca/bitstream/handle/10012/19195/Ammar_Khaled.pdf",
  cc-derived-dataset-used = "WDC-hyperlinkgraph, WDC-hyperlinkgraph (2014)",
  cc-snippet   = "Common Crawl experiments. Sixteen machines load 64 billion edges, index them, and track motifs in 20
                 batches of 10K random edge changes.",
  cc-author-affiliation = "University of Waterloo, Ontario, Canada",
  cc-class     = "graph-processing, web-science/hyperlinkgraph",
}

@Misc{cc:HuangSiddarth:2023:generative-AI-and-digital-commons,
  title        = "Generative {AI} and the Digital Commons",
  author       = "Saffron Huang and Divya Siddarth",
  year         = "2023",
  eprint       = "2303.11074",
  archiveprefix = "arXiv",
  primaryclass = "cs.CY",
  URL          = "https://arxiv.org/abs/2303.11074",
  pdf          = "https://arxiv.org/pdf/2303.11074.pdf",
  cc-author-affiliation = "Collective Intelligence Project (cip.org)",
  cc-class     = "digital-commons, public-commons, nlp/corpus-construction, nlp/language-models,
                 nlp/generative-language-models, cc-cited-not-used",
  cc-snippet   = "GFMs are trained on the digital commons. Generative foundation models leverage large databases of
                 scraped information (text, code, images) from the internet to train highly capable models. This depends
                 on the availability of public, scrapable data and leverages the “collective intelligence” of
                 humanity, including the painstakingly edited Wikipedia, millennia’s worth of books, billions of
                 Reddit comments, hundreds of terabytes’ worth of images, and more³ [³LAION-5B, which Stable
                 Diffusion is trained on, has 5 billion text-image pairs (Schuhmann et al., 2022). The Pile has 100+GB
                 of books (Gao et al., 2020)]. They also rely on non-profits like Common Crawl (which build and maintain
                 open repositories of web crawl data), Creative Commons (for open licenses for the data used), open
                 source libraries, and other digital infrastructure. They also take advantage of aggregated user
                 preferences; e.g. the WebText dataset underlying the GPT family of models uses Reddit “karma
                 scores” to select content for inclusion. All of this is common digital information and infrastructure
                 that many people contribute to.",
}

@Misc{cc:ChanBradleyRajkumar:2023:reclaiming-digital-commons,
  author       = "Alan Chan and Herbie Bradley and Nitarshan Rajkumar",
  title        = "Reclaiming the Digital Commons: {A} Public Data Trust for Training Data",
  year         = "2023",
  eprint       = "2303.09001",
  archiveprefix = "arXiv",
  primaryclass = "cs.CY",
  URL          = "https://arxiv.org/abs/2303.09001",
  pdf          = "https://arxiv.org/pdf/2303.09001.pdf",
  abstract     = "Democratization of AI means not only that people can freely use AI, but also that people can
                 collectively decide how AI is to be used. In particular, collective decision-making power is required
                 to redress the negative externalities from the development of increasingly advanced AI systems,
                 including degradation of the digital commons and unemployment from automation. The rapid pace of AI
                 development and deployment currently leaves little room for this power. Monopolized in the hands of
                 private corporations, the development of the most capable foundation models has proceeded largely
                 without public input. There is currently no implemented mechanism for ensuring that the economic value
                 generated by such models is redistributed to account for their negative externalities. The citizens
                 that have generated the data necessary to train models do not have input on how their data are to be
                 used. In this work, we propose that a public data trust assert control over training data for
                 foundation models. In particular, this trust should scrape the internet as a digital commons, to
                 license to commercial model developers for a percentage cut of revenues from deployment. First, we
                 argue in detail for the existence of such a trust. We also discuss feasibility and potential risks.
                 Second, we detail a number of ways for a data trust to incentivize model developers to use training
                 data only from the trust. We propose a mix of verification mechanisms, potential regulatory action, and
                 positive incentives. We conclude by highlighting other potential benefits of our proposed data trust
                 and connecting our work to ongoing efforts in data and compute governance.",
  cc-author-affiliation = "University of Cambridge, United Kingdom; Mila, Université de Montréal, Canada; EleutherAI",
  cc-class     = "digital-commons, public-commons, nlp/corpus-construction, nlp/language-models,
                 nlp/generative-language-models, cc-cited-not-used",
  cc-snippet   = "The data trust could also start from existing efforts, such as the Common Crawl.",
}

@Misc{cc:TurskiStanisławekKaczmarekDydaEtAl:2023:CCpdf,
  title        = "{CC}pdf: Building a High Quality Corpus for Visually Rich Documents from Web Crawl Data",
  author       = "Michał Turski and Tomasz Stanisławek and Karol Kaczmarek and Paweł Dyda and Filip Graliński",
  year         = "2023",
  eprint       = "2304.14953",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  URL          = "https://arxiv.org/abs/2304.14953",
  pdf          = "https://arxiv.org/pdf/2304.14953.pdf",
  cc-author-affiliation = "Snowflake; Adam Mickiewicz University, Poznań, Poland",
  cc-class     = "nlp/language-models, nlp/corpus-construction, document understanding, PDF",
  abstract     = "In recent years, the field of document understanding has progressed a lot. A significant part of this
                 progress has been possible thanks to the use of language models pretrained on large amounts of
                 documents. However, pretraining corpora used in the domain of document understanding are single domain,
                 monolingual, or nonpublic. Our goal in this paper is to propose an efficient pipeline for creating a
                 big-scale, diverse, multilingual corpus of PDF files from all over the Internet using Common Crawl, as
                 PDF files are the most canonical types of documents as considered in document understanding. We
                 analysed extensively all of the steps of the pipeline and proposed a solution which is a trade-off
                 between data quality and processing time. We also share a CCpdf corpus in a form or an index of PDF
                 files along with a script for downloading them, which produces a collection useful for language model
                 pretraining. The dataset and tools published with this paper offer researchers the opportunity to
                 develop even better multilingual language models.",
  cc-snippet   = "As our input we used web indexes created by Common Crawl. [...] They crawl webpages and save them into
                 crawls dumps. A crawl dump contains billions of webpages (hundreds of terabytes of uncompressed data)
                 and a new dump has been published nearly every month since March 2014. Some earlier, more irregular
                 dumps starting from 2008 are also available.¹¹ Each dump also contains an index of the crawled pages.
                 We decided to simply use the latest (and the largest) dump available at the time of writing this paper
                 — the May 2022 dump.¹² [¹²https://commoncrawl.org/2022/06/may-2022-crawl-archive-now-available/]
                 It contains 3.45 billion web pages, which amounts to 462 TB of uncompressed content. It would obviously
                 be possible to apply the extraction procedure described in this paper to all crawls to obtain an even
                 larger collection of PDFs, which would also allow for a diachronic analysis, but we wanted to focus on
                 the most recent documents. Note that dumps contain only files considered as text files by the Common
                 Crawl web robot. Mostly these are web pages in the HTML format, but, fortunately, PDFs are also treated
                 as text files, being derivative of the PostScript page description language. This is not the case with,
                 for instance, images, Excel files, DOCX files. Consequently, such files cannot be amassed using the
                 methods described in the aforementioned papers.¶ 3.2 PDF links extraction¶ We experimented with two
                 methods for extracting links to PDF files (step 1 in Figure 1):¶ 1. using CDX files, i.e., index
                 server files provided by Common Crawl;¶ 2. looking for links to PDF files in WARC, i.e., raw crawl
                 data files.¶ The first method is simpler, as CDX files are easy to download and take up only 225 GB in
                 total. The second method might yield more links to PDF files, but:¶ – it is impossible for us to
                 download all WARCs. Only a limited number of them can be processed, though still a significant number
                 of PDF links can be added even if a small percentage of all WARC files are processed,¶ – there is
                 lower probability that the file linked is available at all, be it in the crawl dump or simply at the
                 original address.¶ In CDX files, the MIME type of a captured file is specified, and we limited
                 ourselves to the application/pdf type.¶ Hence, in this paper, we focus on the first method, which
                 allows to speed up the whole processing pipeline.",
  cc-dataset-used = "CC-MAIN-2022-21 (CDX)",
}

@InProceedings{cc:NourinTranJiangBockEtAl:2023:Measuring-turkmenistans-internet-censorship,
  author       = "Nourin, Sadia and Tran, Van and Jiang, Xi and Bock, Kevin and Feamster, Nick and Hoang, Nguyen Phong
                 and Levin, Dave",
  title        = "Measuring and Evading Turkmenistan’s Internet Censorship: {A} Case Study in Large-Scale Measurements
                 of a Low-Penetration Country",
  year         = "2023",
  ISBN         = "978-1-4503-9416-1",
  publisher    = "Association for Computing Machinery",
  address      = "New York, NY, USA",
  URL          = "https://doi.org/10.1145/3543507.3583189",
  pdf          = "https://dl.acm.org/doi/pdf/10.1145/3543507.3583189",
  DOI          = "10.1145/3543507.3583189",
  abstract     = "Since 2006, Turkmenistan has been listed as one of the few Internet enemies by Reporters without
                 Borders due to its extensively censored Internet and strictly regulated information control policies.
                 Existing reports of filtering in Turkmenistan rely on a handful of vantage points or test a small
                 number of websites. Yet, the country’s poor Internet adoption rates and small population can make
                 more comprehensive measurement challenging. With a population of only six million people and an
                 Internet penetration rate of only 38\%, it is challenging to either recruit in-country volunteers or
                 obtain vantage points to conduct remote network measurements at scale. We present the largest
                 measurement study to date of Turkmenistan’s Web censorship. To do so, we developed TMC, which tests
                 the blocking status of millions of domains across the three foundational protocols of the Web (DNS,
                 HTTP, and HTTPS). Importantly, TMC does not require access to vantage points in the country. We apply
                 TMC to 15.5M domains, our results reveal that Turkmenistan censors more than 122K domains, using
                 different blocklists for each protocol. We also reverse-engineer these censored domains, identifying 6K
                 over-blocking rules causing incidental filtering of more than 5.4M domains. Finally, we use Geneva, an
                 open-source censorship evasion tool, to discover five new censorship evasion strategies that can defeat
                 Turkmenistan’s censorship at both transport and application layers. We will publicly release both the
                 data collected by TMC and the code for censorship evasion.",
  booktitle    = "Proceedings of the {ACM} Web Conference 2023",
  pages        = "1969--1979",
  numpages     = "11",
  keywords     = "Censorship Measurement, Web Filtering, Turkmenistan",
  location     = "Austin, TX, USA",
  series       = "WWW '23",
  cc-snippet   = "[...] the payload of our probes contains domains curated from the Citizen Lab lists [5], the full
                 Tranco list [42], and Common Crawl Project [8]. Due to limited resources of our VPS, we opt to probe
                 the first 10M FQDNs ranked by the Common Crawl Project instead of the full list of almost 400M FQDNs.
                 [...] We scanned all regular expressions that TMC discovered against all FQDNs that we could obtain
                 from DNS zone files provided via ICANN’s Centralized Zone Data Service [6] and the full host list
                 from the Common Crawl Project [8], totaling 718M FQDNs.",
  cc-author-affiliation = "University of Maryland, USA; University of Chicago, USA",
  cc-dataset-used = "hyperlinkgraph",
  cc-class     = "web-filtering, internet-censorship",
}

@Misc{cc:ZhuHesselAwadallaGadreEtAl:2023:Multimodal-C4,
  title        = "Multimodal {C4}: An Open, Billion-scale Corpus of Images Interleaved With Text",
  author       = "Wanrong Zhu and Jack Hessel and Anas Awadalla and Samir Yitzhak Gadre and Jesse Dodge and Alex Fang
                 and Youngjae Yu and Ludwig Schmidt and William Yang Wang and Yejin Choi",
  year         = "2023",
  eprint       = "2304.06939",
  archiveprefix = "arXiv",
  primaryclass = "cs.CV",
  URL          = "https://arxiv.org/abs/2304.06939",
  pdf          = "https://arxiv.org/pdf/2304.06939.pdf",
  cc-author-affiliation = "Allen Institute for Artificial Intelligence, USA; University of California, Santa Barbara,
                 USA; Paul G. Allen School of Computer Science, University of Washington, USA; Columbia University, USA;
                 Yonsei University, South Korea; LAION",
  cc-dataset-used = "CC-MAIN-2019-18 (WET)",
  cc-derived-dataset-about = "Allenai-multimodal-c4 (mmc4)",
  cc-class     = "nlp/corpus-construction, nlp/multimodal-corpora, ai/image-text-alignment",
  cc-snippet   = "Multimodal C4 is an expansion of the text-only c4 dataset [21], which was created by taking the April
                 2019 snapshot from Common Crawl⁴ and applying several filters with the intention of retaining
                 high-quality, natural English text. Each document in c4 consists of the text scraped from one URL.
                 [...] We built the mmc4 dataset on top of c4 because: 1) c4 is a web-scale dataset widely adopted as a
                 pre-training corpus [21, 25, 9, 29, 27]; 2) c4 is constructed from web pages, which frequently
                 contain multimedia content like images: a multimodal sequence version is a natural extension; and 3)
                 c4-en,⁵ the specific underlying subset from which we construct mmc4 has already been processed with
                 several data-cleaning steps (including English-language identification by langdetect⁶ with at least
                 0.99 confidence; text deduplication removing duplicate three-sentence spans + placeholder text like
                 “lorem ipsum”; and removal of any document containing any word on the “List of Dirty, Naughty,
                 Obscene or Otherwise Bad Words”).⁷ See [21] for more information about the text-only c4.
                 Importantly, by building on the popular text-only c4, prior text-only documentation efforts [11] can
                 provide insight about potential biases and risks that could arise when training on our multimodal
                 extension. We use the NLTK [4] sentence tokenizer to chunk each c4 document into a list of sentences.¶
                 Gathering images. We first retrieve the original webpages for each document in the c4-en dataset from
                 the Common Crawl version 2019-18, which is the default version for c4. Next, we extract the URLs for
                 downloadable images from the raw WAT files. We restrict the image extension to either png/jpeg/jpg, and
                 exclude image URLs that contain the following tokens: logo, button, icon, plugin, widget. We attempt
                 to download from these URLs, and resize images to a maximum dimension of 800px. We eliminate any c4
                 documents that do not contain valid, downloadable images at the time of collection (mid-to-late 2022).
                 The starting point after this step is 115M documents and 1.37B images.",
}

@MastersThesis{cc:Jørgensen:2023:BacklinkDB,
  title        = "Backlink{DB}: {A} Purpose-Built Backlink Database Management System",
  author       = "Jørgensen, Marius Løvold",
  year         = "2023",
  school       = "UiT Norges arktiske universitet",
  URL          = "https://munin.uit.no/handle/10037/28861",
  pdf          = "https://munin.uit.no/bitstream/handle/10037/28861/thesis.pdf",
  abstract     = "In order to compile a list of all the backlinks for a given webpage, we need knowledge about all the
                 outgoing links on the web. Traversing the web and storing all the backlink data in a database allows us
                 to efficiently retrieve the list of backlinks for a web page on demand. However, the web consists of
                 billions of backlinks which translates to terabytes of data. As the web is continuously evolving, the
                 database needs to be rebuilt periodically in order for it to closely resemble the current state of the
                 web. This thesis presents BacklinkDB, a purpose-built database management system designed for managing
                 a backlink database. Using a series of in-memory hash indices allows for high insert throughput when
                 building the database. The backlink data for a given domain is stored together in sections throughout
                 the database file. This allows for the requested backlink data to be easily located. With a simple
                 sql-inspired query language, the users can both insert and retrieve backlink data. The evaluation shows
                 that building a purpose-built database management system allows us to make the trade-offs between
                 which performance metrics that is important. In this thesis, we will focus on creating a scalable
                 backlink database management system with high insert performance.",
  cc-snippet   = "5.1.3 Data¶ The link data used in the experiments is downloaded from the Common Crawls website¹.
                 Common Crawl is a non-profit organization that periodically crawls the web and publicizes data. For the
                 experiments described in this chapter, data from the August 2022 crawl²
                 [²https://commoncrawl.org/2022/08/august-2022-crawl-archive-now-available/] is used.¶ Data
                 prepossessing¶ Common Crawl provides data on all the indexable webpages. This data is provided in a
                 series of warc files found in their public repository. Common Crawl also provide WAT files which are
                 produced by processing the warc files and extracting the metadata for each webpage. The WAT files
                 contain a list of all the outgoing links for each of the webpages.¶ All external links from the WAT
                 file are extracted to their own link file so that they can be directly inserted into a database. Each
                 link is stored on a separate line in the file using spaces to separate the source domain, source path,
                 destination domain, and destination path. All the backlinks containing urls longer than 2048 characters
                 are discarded. A link file is created for each of the WAT files. These link files contain all the
                 information needed to build a backlink database.",
  cc-dataset-used = "CC-MAIN-2022-33 (WAT)",
  cc-author-affiliation = "UiT, The Arctic University of Norway, Norway",
  cc-class     = "web-science/hyperlinkgraph, ir/backlinkdb",
}

@Misc{cc:CalzavaraHantkeWilhelmRabittiEtAl:2023:web-archives-for-web-security-measurements,
  title        = "You Call This Archaeology? Evaluating Web Archives for Reproducible Web Security Measurements",
  author       = "Calzavara, Stefano and Hantke, Florian and Wilhelm, Moritz and Rabitti, Alvise and Stock, Ben",
  year         = "2023",
  URL          = "https://swag.cispa.saarland/papers/calzavara2023archaeology.pdf",
  abstract     = "Given the dynamic nature of the Web, security measurements on it suffer from reproducibility issues.
                 In this paper we take a systematic look into the potential of using web archives for web security
                 measurements. We first evaluate an extensive set of web archives as potential sources of archival data,
                 showing the superiority of the Internet Archive with respect to its competitors. We then assess the
                 appropriateness of the Internet Archive for historical web security measurements, detecting subtleties
                 and possible pitfalls in its adoption. Finally, we investigate the feasibility of using the Internet
                 Archive to simulate live security measurements, using recent archival data in place of live data. Our
                 analysis shows that archive-based security measurements are a promising alternative to traditional live
                 security measurements, which is reproducible by design; nevertheless, it also shows potential pitfalls
                 and shortcomings of archive-based measurements. As an important contribution, we use the collected
                 knowledge to identify insights and best practices for future archive-based security measurements.",
  cc-snippet   = "Besides Memento-based archives, we also consider Common Crawl as a possible alternative source of
                 archival data. Common Crawl archives parts of the Web once a month and stores the content as one
                 snapshot. The reason why we use Common Crawl is that it contains a massive amount of data: its October
                 2022 snapshot includes more than 2.55 billion pages, with its index alone being larger than 2TB;
                 moreover, Common Crawl was already used in a previous web security measurement [15, 36]. The content
                 archived on Common Crawl is stored in form of large compressed files consisting of lists of WARC files.
                 These WARC files hold meta information such as the requested datetime, content type, or content size,
                 followed by the archived content.",
  cc-author-affiliation = "CISPA Helmholtz Center for Information Security, Germany; Università Ca’ Foscari, Venezia,
                 Italy",
  cc-class     = "computer-security/internet-security, web-science",
}

@Misc{cc:HendersonLiJurafskyHashimotoEtAl:2023:Foundation-models-and-fair-use,
  title        = "Foundation Models and Fair Use",
  author       = "Peter Henderson and Xuechen Li and Dan Jurafsky and Tatsunori Hashimoto and Mark A. Lemley and Percy
                 Liang",
  year         = "2023",
  eprint       = "2303.15715",
  archiveprefix = "arXiv",
  primaryclass = "cs.CY",
  URL          = "https://arxiv.org/abs/2303.15715",
  pdf          = "https://arxiv.org/pdf/2303.15715.pdf",
  abstract     = "Existing foundation models are trained on copyrighted material. Deploying these models can pose both
                 legal and ethical risks when data creators fail to receive appropriate attribution or compensation. In
                 the United States and several other countries, copyrighted content may be used to build foundation
                 models without incurring liability due to the fair use doctrine. However, there is a caveat: If the
                 model produces output that is similar to copyrighted data, particularly in scenarios that affect the
                 market of that data, fair use may no longer apply to the output of the model. In this work, we
                 emphasize that fair use is not guaranteed, and additional work may be necessary to keep model
                 development and deployment squarely in the realm of fair use. First, we survey the potential risks of
                 developing and deploying foundation models based on copyrighted content. We review relevant U.S. case
                 law, drawing parallels to existing and potential applications for generating text, source code, and
                 visual art. Experiments confirm that popular foundation models can generate content considerably
                 similar to copyrighted material. Second, we discuss technical mitigations that can help foundation
                 models stay in line with fair use. We argue that more research is needed to align mitigation strategies
                 with the current state of the law. Lastly, we suggest that the law and technical mitigations should
                 co-evolve. For example, coupled with other policy mechanisms, the law could more explicitly consider
                 safe harbors when strong technical tools are used to mitigate infringement harms. This co-evolution may
                 help strike a balance between intellectual property and innovation, which speaks to the original goal
                 of fair use. But we emphasize that the strategies we describe here are not a panacea and more work is
                 needed to develop policies that address the potential harms of foundation models.",
  cc-snippet   = "Implied Licenses and Common Crawl. On the other hand, many creators voluntarily post their works on
                 the internet with permissions for web crawling. It is well-established that merely posting something on
                 the internet does not waive the intellectual property interest in the work, but many data creators use
                 an industry-standard “robots.txt” file to affirmatively to include their website and data in caches
                 and search indexes. In Field v. Google, Inc. (D. Nev. 2006) a district court held that Google could
                 cache web content that did not disallow scraping via robots.txt, suggesting that there was an implied
                 license and thus the use was not infringement. This license only extended to caching in that case,
                 which does not necessarily reflect the uses of foundation models we discuss throughout this work, so it
                 is unlikely to cover all the use cases we describe here. And the bounds of the uses covered by the
                 robots.txt file are untested in court.²¹ While the issue of whether the implied license extends to
                 foundation model training has not been resolved in litigation, it is possible that an outcome like
                 Field v. Google, Inc. (D. Nev. 2006) would extend to some foundation model uses—in particular, for
                 building a cached dataset and training a model.¶ It is worth noting that the use of a robots.txt
                 header or other opt-out mechanism has implications for fair use also. Datasets and models like C4
                 (Raffel et al., 2019) and LAION-400M (Schuhmann, 2021), rely on CommonCrawl data which is crawled only
                 if users explicitly allow it through their robots.txt file. CommonCrawl is able to host a snapshot of
                 the internet largely because of fair use arguments. As the organization’s director argues, there is a
                 transformation into a different—not easily human-readable—format, the organization does not take a
                 snapshot of entire webpages, and the use itself is transformative (from actively presenting content to
                 caching content) and for the public benefit (Leetaru, 2017). In Field v. Google, Inc. (D. Nev. 2006),
                 respect for the robots.txt file also was considered in the fair use assessment with the court noting
                 that Google in good faith followed industry standards that would prevent caching (respecting
                 disallowing crawling via a robots.txt). It is possible, then, that providing an opt-out mechanism for
                 data creators and respecting the robots.txt opt-out mechanism will be taken into account in assessing a
                 fair use argument, as it was in Field v. Google, Inc. (D. Nev. 2006).²²¶ [...] Furthermore, if
                 web-crawled data is used, restricting it to data that respects robots.txt opt-outs can make a fair use
                 argument more tractable, though not guaranteed. As we noted before, in Field v. Google, Inc. (D. Nev.
                 2006), respect for the robots.txt file was considered in the fair use assessment with the court because
                 it gave the plaintiff opportunity to opt out. This is likely why many webcrawl-based models rely on the
                 CommonCrawl dataset as a source. Its webcrawl automatically respects robots.txt opt-outs and does not
                 crawl every webpage in full. It is possible then that future fair use assessments could consider
                 respecting the robots.txt opt-out—or implementing other opt-out mechanisms—favorably, as was the
                 case in Field v. Google, Inc. (D. Nev. 2006). Conversely, ignoring a robots.txt opt-out could
                 negatively impact a fair use assessment. However, Kapoor \& Narayanan (2023) have argued that there are
                 structural critiques of opt-out mechanisms beyond the current state of the law.¶",
  cc-author-affiliation = "Stanford University, USA",
  cc-class     = "legal/copyright, legal/fair-use, nlp/language-model, ai/foundation-model, web-crawling, robots.txt",
}

@Misc{cc:ZhaoZhouLiTangEtAl:2023:survey-of-LLMs,
  title        = "A Survey of Large Language Models",
  author       = "Wayne Xin Zhao and Kun Zhou and Junyi Li and Tianyi Tang and Xiaolei Wang and Yupeng Hou and Yingqian
                 Min and Beichen Zhang and Junjie Zhang and Zican Dong and Yifan Du and Chen Yang and Yushuo Chen and
                 Zhipeng Chen and Jinhao Jiang and Ruiyang Ren and Yifan Li and Xinyu Tang and Zikang Liu and Peiyu Liu
                 and Jian-Yun Nie and Ji-Rong Wen",
  year         = "2023",
  eprint       = "2303.18223",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  URL          = "https://arxiv.org/abs/2303.18223",
  abstract     = "Language is essentially a complex, intricate system of human expressions governed by grammatical
                 rules. It poses a significant challenge to develop capable AI algorithms for comprehending and grasping
                 a language. As a major approach, language modeling has been widely studied for language understanding
                 and generation in the past two decades, evolving from statistical language models to neural language
                 models. Recently, pre-trained language models (PLMs) have been proposed by pre-training Transformer
                 models over large-scale corpora, showing strong capabilities in solving various NLP tasks. Since
                 researchers have found that model scaling can lead to performance improvement, they further study the
                 scaling effect by increasing the model size to an even larger size. Interestingly, when the parameter
                 scale exceeds a certain level, these enlarged language models not only achieve a significant
                 performance improvement but also show some special abilities that are not present in small-scale
                 language models. To discriminate the difference in parameter scale, the research community has coined
                 the term large language models (LLM) for the PLMs of significant size. Recently, the research on LLMs
                 has been largely advanced by both academia and industry, and a remarkable progress is the launch of
                 ChatGPT, which has attracted widespread attention from society. The technical evolution of LLMs has
                 been making an important impact on the entire AI community, which would revolutionize the way how we
                 develop and use AI algorithms. In this survey, we review the recent advances of LLMs by introducing the
                 background, key findings, and mainstream techniques. In particular, we focus on four major aspects of
                 LLMs, namely pre-training, adaptation tuning, utilization, and capacity evaluation. Besides, we also
                 summarize the available resources for developing LLMs and discuss the remaining issues for future
                 directions.",
  cc-snippet   = "CommonCrawl. CommonCrawl [132] is one of the largest open-source web crawling databases, containing a
                 petabyte-scale data volume, which has been widely used as training data for existing LLMs. As the whole
                 dataset is very large, existing studies mainly extract subsets of web pages from it within a specific
                 period. However, due to the widespread existence of noisy and low-quality information in web data, it
                 is necessary to perform data preprocessing before usage. Based on CommonCrawl, there are four filtered
                 datasets that are commonly used in existing work: C4 [73], CC-Stories [124], CC-News [27], and RealNews
                 [125]. The Colossal Clean Crawled Corpus (C4) includes five variants¹⁸,
                 [¹⁸https://www.tensorflow.org/datasets/catalog/c4] namely en (806G), en.noclean (6T), realnewslike
                 (36G), webtextlike (17G), and multilingual (38T). The en version has been utilized for pre-training T5
                 [73], LaMDA [63], Gopher [59], and UL2 [80]. The multilingual C4, also called mC4, has been used in mT5
                 [74]. CC-Stories (31G) is composed of a subset of CommonCrawl data, in which the contents are made in a
                 story-like way. While, the original source of CC-Stories is not available now, so a reproduction
                 version, CC-Stories-R [133], has been included in Table 2. Moreover, two news corpora extracted from
                 CommonCrawl, i.e., REALNEWS (120G) and CC-News (76G), are also commonly used as the pre-training
                 data.",
  cc-author-affiliation = "Gaoling School of Artificial Intelligence and School of Information, Renmin University of
                 China, Beijing, China; DIRO, Université de Montréal, Canada",
  cc-class     = "nlp/language-models, nlp/large-language-models, nlp/text-corpora",
}

@Misc{cc:WuIrsoyLuDabravolskiEtAl:2023:BloombergGPT,
  author       = {Shijie Wu and Ozan Irsoy and Steven Lu and Vadim Dabravolski and Mark Dredze and Sebastian Gehrmann
                 and Prabhanjan Kambadur and David Rosenberg and Gideon Mann},
  title        = {Bloomberg{GPT}: {A} Large Language Model for Finance},
  year         = {2023},
  eprint       = {2303.17564},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  URL          = {https://arxiv.org/abs/2303.17564},
  abstract     = {The use of NLP in the realm of financial technology is broad and complex, with applications ranging
                 from sentiment analysis and named entity recognition to question answering. Large Language Models
                 (LLMs) have been shown to be effective on a variety of tasks; however, no LLM specialized for the
                 financial domain has been reported in literature. In this work, we present BloombergGPT, a 50 billion
                 parameter language model that is trained on a wide range of financial data. We construct a 363 billion
                 token dataset based on Bloomberg's extensive data sources, perhaps the largest domain-specific dataset
                 yet, augmented with 345 billion tokens from general purpose datasets. We validate BloombergGPT on
                 standard LLM benchmarks, open financial benchmarks, and a suite of internal benchmarks that most
                 accurately reflect our intended usage. Our mixed dataset training leads to a model that outperforms
                 existing models on financial tasks by significant margins without sacrificing performance on general
                 LLM benchmarks. Additionally, we explain our modeling choices, training process, and evaluation
                 methodology. We release Training Chronicles (Appendix C) detailing our experience in training
                 BloombergGPT.},
  cc-author-affiliation = {Bloomberg, New York, NY, USA; Bloomberg, Toronto, ON, Canada; Computer Science, Johns Hopkins
                 University, Baltimore, MD, USA},
  cc-class     = {nlp/language-models, nlp/large-language-models, nlp/dataset-creation, financial markets,
                 cc-cited-not-used},
}

@Misc{cc:ÖhmanVerlindenEkgrenGyllenstenEtAl:2023:Nordic-Pile,
  title        = "The {Nordic Pile}: {A} {1.2TB} {Nordic} Dataset for Language Modeling",
  author       = "Joey Öhman and Severine Verlinden and Ariel Ekgren and Amaru Cuba Gyllensten and Tim Isbister and
                 Evangelia Gogoulou and Fredrik Carlsson and Magnus Sahlgren",
  year         = "2023",
  eprint       = "2303.17183",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  URL          = "https://arxiv.org/abs/2303.17183",
  abstract     = "Pre-training Large Language Models (LLMs) require massive amounts of text data, and the performance of
                 the LLMs typically correlates with the scale and quality of the datasets. This means that it may be
                 challenging to build LLMs for smaller languages such as Nordic ones, where the availability of text
                 corpora is limited. In order to facilitate the development of the LLMS in the Nordic languages, we
                 curate a high-quality dataset consisting of 1.2TB of text, in all of the major North Germanic languages
                 (Danish, Icelandic, Norwegian, and Swedish), as well as some high-quality English data. This paper
                 details our considerations and processes for collecting, cleaning, and filtering the dataset.",
  cc-snippet   = "Therefore, The Nordic Pile is composed mostly of existing sources, with a large portion of these
                 originating from derivatives of Common Crawl data, such as OSCAR (Suárez et al., 2019; Ortiz Suárez
                 et al., 2020) and Multilingual C4 (mC4) (Xue et al., 2021), which is a language-filtered version of C4
                 (Raffel et al., 2020).¶ [...] Web CC: Web data derived from Common Crawl¶ Similarly, Web CC is the
                 most prominent of our categories.",
  cc-author-affiliation = "AI Sweden, Sweden; RISE, Sweden",
  cc-class     = "nlp/corpus-construction, nlp/text-corpora, nlp/language-model",
}

@Misc{cc:Zhang:2023:ChatGPT-and-Bard-share-revenue,
  title        = "Should {ChatGPT} and {Bard} Share Revenue with Their Data Providers? {A} New Business Model for the {AI}
                 Era",
  author       = "Dong Zhang",
  year         = "2023",
  eprint       = "2305.02555",
  archiveprefix = "arXiv",
  primaryclass = "cs.LG",
  URL          = "https://arxiv.org/abs/2305.02555",
  abstract     = "With various AI tools such as ChatGPT becoming increasingly popular, we are entering a true AI era. We
                 can foresee that exceptional AI tools will soon reap considerable profits. A crucial question arise:
                 should AI tools share revenue with their training data providers in additional to traditional
                 stakeholders and shareholders? The answer is Yes. Large AI tools, such as large language models, always
                 require more and better quality data to continuously improve, but current copyright laws limit their
                 access to various types of data. Sharing revenue between AI tools and their data providers could
                 transform the current hostile zero-sum game relationship between AI tools and a majority of copyrighted
                 data owners into a collaborative and mutually beneficial one, which is necessary to facilitate the
                 development of a virtuous cycle among AI tools, their users and data providers that drives forward AI
                 technology and builds a healthy AI ecosystem. However, current revenue-sharing business models do not
                 work for AI tools in the forthcoming AI era, since the most widely used metrics for website-based
                 traffic and action, such as clicks, will be replaced by new metrics such as prompts and cost per prompt
                 for generative AI tools. A completely new revenue-sharing business model, which must be almost
                 independent of AI tools and be easily explained to data providers, needs to establish a prompt-based
                 scoring system to measure data engagement of each data provider. This paper systematically discusses
                 how to build such a scoring system for all data providers for AI tools based on classification and
                 content similarity models, and outlines the requirements for AI tools or third parties to build it.
                 Sharing revenue with data providers using such a scoring system would encourage more data owners to
                 participate in the revenue-sharing program. This will be a utilitarian AI era where all parties
                 benefit.",
  cc-author-affiliation = "",
  cc-class     = "legal/copyright, legal/fair-use, nlp/language-model, ai/foundation-model, economic aspects of large
                 language models, monetization of training data",
}

@Misc{cc:HuangGuptaZhongLiEtAl:2023:privacy-implications-of-retrieval-based-language-models,
  author       = {Yangsibo Huang and Samyak Gupta and Zexuan Zhong and Kai Li and Danqi Chen},
  title        = {Privacy Implications of Retrieval-Based Language Models},
  year         = {2023},
  eprint       = {2305.14888},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  URL          = {https://arxiv.org/abs/2305.14888},
  abstract     = {Retrieval-based language models (LMs) have demonstrated improved interpretability, factuality, and
                 adaptability compared to their parametric counterparts, by incorporating retrieved text from external
                 datastores. While it is well known that parametric models are prone to leaking private data, it remains
                 unclear how the addition of a retrieval datastore impacts model privacy. In this work, we present the
                 first study of privacy risks in retrieval-based LMs, particularly kNN-LMs. Our goal is to explore the
                 optimal design and training procedure in domains where privacy is of concern, aiming to strike a
                 balance between utility and privacy. Crucially, we find that kNN-LMs are more susceptible to leaking
                 private information from their private datastore than parametric models. We further explore mitigations
                 of privacy risks. When privacy information is targeted and readily detected in the text, we find that a
                 simple sanitization step would completely eliminate the risks, while decoupling query and key encoders
                 achieves an even better utility-privacy trade-off. Otherwise, we consider strategies of mixing public
                 and private data in both datastore and encoder training. While these methods offer modest improvements,
                 they leave considerable room for future work. Together, our findings provide insights for practitioners
                 to better understand and mitigate privacy risks in retrieval-based LMs. Our code is available at:
                 [https://github.com/Princeton-SysML/kNNLM_privacy].},
  cc-author-affiliation = {},
  cc-class     = {},
}

@Misc{cc:LongpreYauneyReifLeeEtAl:2023:pretrainers-guide-to-training-data,
  title        = "A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, \&
                 Toxicity",
  author       = "Shayne Longpre and Gregory Yauney and Emily Reif and Katherine Lee and Adam Roberts and Barret Zoph
                 and Denny Zhou and Jason Wei and Kevin Robinson and David Mimno and Daphne Ippolito",
  year         = "2023",
  eprint       = "2305.13169",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  URL          = "https://arxiv.org/abs/2305.13169",
  abstract     = "Pretraining is the preliminary and fundamental step in developing capable language models (LM).
                 Despite this, pretraining data design is critically under-documented and often guided by empirically
                 unsupported intuitions. To address this, we pretrain 28 1.5B parameter decoder-only models, training on
                 data curated (1) at different times, (2) with varying toxicity and quality filters, and (3) with
                 different domain compositions. First, we quantify the effect of pretraining data age. A temporal shift
                 between evaluation data and pretraining data leads to performance degradation, which is not overcome by
                 finetuning. Second, we explore the effect of quality and toxicity filters, showing a trade-off between
                 performance on standard benchmarks and risk of toxic generations. Our findings indicate there does not
                 exist a one-size-fits-all solution to filtering training data. We also find that the effects of
                 different types of filtering are not predictable from text domain characteristics. Lastly, we
                 empirically validate that the inclusion of heterogeneous data sources, like books and web, is broadly
                 beneficial and warrants greater prioritization. These findings constitute the largest set of
                 experiments to validate, quantify, and expose many undocumented intuitions about text pretraining,
                 which we hope will help support more informed data-centric decisions in LM development.",
  cc-author-affiliation = "",
  cc-class     = "",
}

@Misc{cc:PratapTjandraShiTomaselloEtAl:2023:scaling-speech-technology-to-1000-languages,
  title        = "Scaling Speech Technology to 1,000+ Languages",
  author       = "Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and
                 Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi
                 and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli",
  year         = "2023",
  eprint       = "2305.13516",
  archiveprefix = "arXiv",
  primaryclass = "cs.CL",
  URL          = "https://arxiv.org/abs/2305.13516",
  abstract     = "Expanding the language coverage of speech technology has the potential to improve access to
                 information for many more people. However, current speech technology is restricted to about one hundred
                 languages which is a small fraction of the over 7,000 languages spoken around the world. The Massively
                 Multilingual Speech (MMS) project increases the number of supported languages by 10-40x, depending on
                 the task. The main ingredients are a new dataset based on readings of publicly available religious
                 texts and effectively leveraging self-supervised learning. We built pre-trained wav2vec 2.0 models
                 covering 1,406 languages, a single multilingual automatic speech recognition model for 1,107 languages,
                 speech synthesis models for the same number of languages, as well as a language identification model
                 for 4,017 languages. Experiments show that our multilingual speech recognition model more than halves
                 the word error rate of Whisper on 54 languages of the FLEURS benchmark while being trained on a small
                 fraction of the labeled data.",
  cc-snippet   = "We evaluate this single model on FLEURS, CommonVoice, VoxPopuli and MLS. [...] During inference, we
                 use n-gram models trained on CommonCrawl data. [...] ¶ Identifying Biased Words. We were not able to
                 find speakers for most of the considered languages of this study and therefore use the following
                 automatic procedure to determine religious words: for each word that occurs in the training data of
                 MMS-lab, we compare the relative token frequency, that is, the rate at which the word type occurs in
                 the MMS-lab data, to the relative token frequency in a general domain corpus; we use Common Crawl
                 [Conneau et al., 2020b] as a general domain corpus. If the relative word frequency is at least twice as
                 high in MMS-lab compared to Common Crawl, then we add it to the subset of words we include in our
                 study. This enables us to evaluate on 51 languages of the FLEURS corpus since not all languages are
                 covered by MMS-lab and we also need to find data in Common Crawl for each language. The automatic
                 procedure has the added benefit of avoiding any potential biases introduced by human annotators.
                 [...]¶ B n-gram Language Models¶ We train 5-gram language models on Common Crawl data using KenLM
                 [Heafield, 2011] for each language in FLEURS. For languages that do not use spaces to separate words,
                 we train 20-gram character-level language models. These languages are Mandarin Chinese (cmn), Cantonese
                 Chinese (yue), Japanese (jpn), Thai (tha), Lao (lao), Burmese (mya) and Khmer (khm). The text is
                 pre-processed following § 3.1.2 and we also remove emojis.³³",
  cc-author-affiliation = "Meta AI; Hebrew University of Jerusalem, Israel",
  cc-class     = "nlp/speech-recognition, nlp/language-model",
}

@Article{cc:SakaiTaoChenLiEtAl:2023:On-Ordering-of-Pooled,
  author       = {Sakai, Tetsuya and Tao, Sijie and Chen, Nuo and Li, Yujing and Maistro, Maria and Chu, Zhumin and
                 Ferro, Nicola},
  title        = {On the Ordering of Pooled Web Pages, Gold Assessments, and Bronze Assessments},
  journal      = {ACM Trans. Inf. Syst.},
  year         = {2023},
  month        = may,
  note         = {Just Accepted},
  publisher    = {Association for Computing Machinery},
  address      = {New York, NY, USA},
  ISSN         = {1046-8188},
  DOI          = {10.1145/3600227},
  URL          = {https://doi.org/10.1145/3600227},
  keywords     = {web search, relevance assessments, pooling, test collections, information retrieval},
  abstract     = {The present study leverages a recent opportunity we had to create a new English web search test
                 collection for the NTCIR-16 We Want Web (WWW-4) task, which concluded in June 2022. More specifically,
                 through the test collection construction effort, we examined two factors that may affect the relevance
                 assessments of depth-k pools, which in turn may affect the relative evaluation of different IR systems.
                 The first factor is the document ordering strategy for the assessors, namely, prioritisation (PRI) and
                 randomisation (RND). PRI is a method that has been used in NTCIR tasks for over a decade; it ranks the
                 pooled documents by a kind of pseudorelevance for the assessors. The second factor is assessor type,
                 i.e., Gold or Bronze. Gold assessors are the topic creators and therefore they “know” which
                 documents are (highly) relevant and which are not; Bronze assessors are not the topic creators and may
                 lack sufficient knowledge about the topics. We believe that our study is unique in that the authors of
                 this paper served as the Gold assessors when creating the WWW-4 test collection, which enabled us to
                 closely examine why Bronze assessments differ from the Gold ones. Our research questions examine
                 assessor efficiency (RQ1), inter-assessor agreement (RQ2), system ranking similarity with different
                 qrels files (RQ3), system ranking robustness to the choice of test topics (RQ4), and the reasons why
                 Bronze assessors tend to be more liberal than Gold assessors (RQ5). The most remarkable of our results
                 are as follows. Firstly, in the comparisons for RQ1 through RQ4, it turned out that what may matter
                 more than the document ordering strategy (PRI vs. RND) and the assessor type (Gold vs. Bronze) is how
                 well-motivated and/or well-trained the Bronze assessors are. Secondly, regarding RQ5, of the documents
                 originally judged nonrelevant by the Gold assessors contrary to the Bronze assessors in our
                 experiments, almost one half were truly relevant according to the Gold assessors’ own
                 reconsiderations. This result suggests that even Gold assessors are far from perfect; budget
                 permitting, it may be beneficial to hire highly-motivated Bronze assessors in addition to Gold
                 assessors so that they can complement each other.},
  cc-snippet   = {The WWW-4 task introduced a new English web corpus called Chuweb21, which was constructed based on the
                 April 2021 block of Common Crawl dataset.⁹ [⁹
                 https://commoncrawl.org/2021/04/april-2021-crawl-archive-now-available/] Details of the corpus
                 construction process can be found in the WWW-4 overview paper [38]. Chuweb21 contains 82,451,337 HTMLs
                 or 1.69 TiB of compressed content; it is publicly available.¹⁰},
  cc-derived-dataset-about = {Chuweb21},
  cc-author-affiliation = {Waseda University, Japan; University of Copenhagen, Denmark; Tsinghua University, P. R. C.;
                 University of Padua, Italy},
  cc-class     = {ir/test-collection, ir/web-search, ir/search-engine-evaluation, nlp/corpus-construction},
}

@Misc{cc:LiVincentChancellorHecht:2023:dimensions-of-data-labor,
  title        = "The Dimensions of Data Labor: {A} Road Map for Researchers, Activists, and Policymakers to Empower
                 Data Producers",
  author       = "Li, Hanlin and Vincent, Nicholas and Chancellor, Stevie and Hecht, Brent",
  year         = "2023",
  eprint       = "2305.13238",
  archiveprefix = "arXiv",
  URL          = "https://arxiv.org/abs/2305.13238",
  abstract     = "Many recent technological advances (e.g. ChatGPT and search engines) are possible only because of
                 massive amounts of user-generated data produced through user interactions with computing systems or
                 scraped from the web (e.g. behavior logs, user-generated content, and artwork). However, data producers
                 have little say in what data is captured, how it is used, or who it benefits. Organizations with the
                 ability to access and process this data, e.g. OpenAI and Google, possess immense power in shaping the
                 technology landscape. By synthesizing related literature that reconceptualizes the production of data
                 for computing as ``data labor'', we outline opportunities for researchers, policymakers, and activists
                 to empower data producers in their relationship with tech companies, e.g advocating for transparency
                 about data reuse, creating feedback channels between data producers and companies, and potentially
                 developing mechanisms to share data's revenue more broadly. In doing so, we characterize data labor
                 with six important dimensions - legibility, end-use awareness, collaboration requirement, openness,
                 replaceability, and livelihood overlap - based on the parallels between data labor and various other
                 types of labor in the computing literature.",
  cc-snippet   = "For example, publicly available texts and artwork enabled the creation of generative AI models like
                 ChatGPT and Dall-E because model developers were able to scrape and process data from billions of web
                 pages¹. [¹https://commoncrawl.org/2022/10/sep-oct-2022-crawl-archive-now-available/]",
  cc-author-affiliation = "University of California, Berkeley, USA; University of California, Davis, USA; University of
                 Minnesota, Minneapolis, USA; Northwestern University, Evanston, USA",
  cc-class     = "legal/copyright, cc-cited-not-used, user-generated data, empowerment, data leverage",
}

@Article{cc:KanfoudBouramoul:2023:Tackling-multilingual-and-heterogeneous,
  author       = "Mohamed Raouf Kanfoud and Abdelkrim Bouramoul",
  title        = "Tackling the multilingual and heterogeneous documents with the pre-trained language identifiers",
  journal      = "International Journal of Computers and Applications",
  volume       = "0",
  number       = "0",
  pages        = "1--12",
  year         = "2023",
  publisher    = "Taylor & Francis",
  DOI          = "10.1080/1206212X.2023.2218236",
  URL          = "https://doi.org/10.1080/1206212X.2023.2218236",
  abstract     = "The Web has become one of the most important data sources, and the content shared is most often
                 multilingual, as users belong to different cultures and speak different languages. Multilingual content
                 (document) is not suitable for many people who only need content in one language. Furthermore, dividing
                 a multilingual document into monolingual documents helps researchers extract only the text of the
                 desired language to use in different tasks such as training or model testing. Therefore, it is
                 challenging to clean and divide the raw content manually. This paper presents an automatic approach to
                 dividing a multilingual document and reassembling it into monolingual documents by examining three
                 existing state-of-the-art tools for Language Identification (LI). We prepared different corpora with
                 different heterogeneity characteristics for the evaluation and evaluated their code-switching pattern
                 using three different code-switching metrics. The proposed approach reached 99\% as the best accuracy
                 result for the long segment (long text) and 90\% for the mixed segment. In addition, a good correlation
                 was found between the I-Index and accuracy with Pearson’s r = −0.998.",
  cc-snippet   = "The authors collected data from a non-profit foundation, Common Crawl, which explores the Web and
                 provides data freely to the public. The collected datasets are heterogeneous and multilingual.",
  cc-author-affiliation = "University of Constantine 2 – Abdelhamid Mehri, El Khroub, Algeria",
  cc-class     = "nlp/language-identification, nlp/corpus-construction, multi-lingual documents",