% cc-citations/bib/cc2026.bib (commoncrawl/cc-citations, branch main)

@Article{cc:PeiskerHoffmannMuttarak:2026:Climate-news,
  author       = "Peisker, Jonas and Hoffmann, Roman and Muttarak, Raya",
  title        = "Climate news mediates extreme weather effects on climate change concern",
  journal      = "Climate Risk Management",
  volume       = "52",
  pages        = "100806",
  year         = "2026",
  ISSN         = "2212-0963",
  DOI          = "10.1016/j.crm.2026.100806",
  URL          = "https://www.sciencedirect.com/science/article/pii/S2212096326000197",
  keywords     = "Climate change concern, Extreme weather, News media, Issue attention, Mediation",
  abstract     = "As the severe impacts of climate change become increasingly apparent, concerns about climate-related
                 issues have grown in recent years. The news media plays an important role in disseminating information
                 about climate change and its consequences to the wider public and thus can influence public climate
                 concern. Here, we investigate how extreme weather affects issue attention to climate change in the
                 European online news media and how extreme weather and news coverage jointly shape changes in climate
                 change concern. For the analysis, we combine 12 harmonized Eurobarometer survey waves, measuring public
                 concerns about climate issues, with meteorological data and indices of environmental news coverage
                 based on publications from 2481 media outlets in 200 regions of 22 European countries. Using fixed
                 effects panel models, we estimate effects of temperature anomalies on climate news and climate concern
                 and explore the role of the news media in explaining changes in concerns in response to temperature
                 anomalies. The results indicate that unusually high temperatures exhibit a robust positive effect on
                 media attention, especially when they overlap with other events that draw attention to the climate
                 topic, such as major climate change conferences. We furthermore find evidence that the climate news in
                 national outlets increases public concern about climate change and show that reporting by such outlets
                 is likely to partly explain the effects of temperature anomalies on concerns. We do not find any
                 significant effects of climate reporting in regional news outlets on climate concern. Our results
                 suggest that the national news media partly mediates the effects of extreme weather on public climate
                 change concern. The findings also highlight that focusing events strongly influence issue attention of
                 the media, providing windows of opportunity to raise awareness about climate issues, while pointing to
                 challenges in sustaining attention to related topics beyond short-lived news cycles.",
  cc-author-affiliation = "International Institute for Applied Systems Analysis, Population and Just Societies,
                 Laxenburg, Austria; University of Bologna, Italy",
  cc-class     = "climate change, climate risk management, news, websiteranking, domain-ranking, hyperlinkgraph",
  cc-derived-dataset-used = "OpenPageRank",
  cc-snippet   = "The weights are provided by the Open Page Rank that is based on Common Crawl project, an open source
                 database of web crawl data (DomCop, 2022). The Open Page Rank is scaled to a range from 0 to 10. Fig. 1
                 shows the distribution of page ranks over the included articles.",
}

@Article{cc:JumeletWeissweilerNivreBisazza:2026:MultiBLiMP-1.0,
  author       = "Jumelet, Jaap and Weissweiler, Leonie and Nivre, Joakim and Bisazza, Arianna",
  title        = "{MultiBLiMP 1.0}: A Massively Multilingual Benchmark of Linguistic Minimal Pairs",
  journal      = "Transactions of the Association for Computational Linguistics",
  volume       = "14",
  pages        = "193--216",
  year         = "2026",
  month        = "01",
  abstract     = "We introduce MultiBLiMP 1.0, a massively multilingual benchmark of linguistic minimal pairs, covering
                 101 languages and 2 types of subject-verb agreement, containing more than 128,000 minimal pairs. Our
                 minimal pairs are created using a fully automated pipeline, leveraging the large-scale linguistic
                 resources of Universal Dependencies and UniMorph. MultiBLiMP 1.0 evaluates abilities of LLMs at an
                 unprecedented multilingual scale, and highlights the shortcomings of the current state-of-the-art in
                 modelling low-resource languages.",
  ISSN         = "2307-387X",
  DOI          = "10.1162/TACL.a.600",
  URL          = "https://doi.org/10.1162/TACL.a.600",
  eprint       = "https://direct.mit.edu/tacl/article-pdf/doi/10.1162/TACL.a.600/2577913/tacl.a.600.pdf",
  cc-author-affiliation = "University of Groningen, The Netherlands; Uppsala University, Sweden",
  cc-class     = "linguistic minimal pairs, language frequency, nlp/corpus-construction, nlp/multi-lingual-corpus",
  cc-derived-dataset-used = "GlotCC",
  cc-snippet   = "Since the training corpora of most LLMs are not publicly available, we estimate this distribution
                 based on the language frequencies of Kargaran et al. (2024), which were computed on a 3.9T token split
                 of the Common Crawl corpus. Common Crawl provides a good reflection of the language distribution of the
                 web-scraped data that is at the core of many LLM training corpora. [...] We also report results for
                 language subgroups split based on the Common Crawl language frequencies: [...]",
}

@Article{cc:HanleyLuPan:2025:Across-the-firewall,
  author       = "Hanley, Hans W. A. and Lu, Yingdan and Pan, Jennifer",
  title        = "Across the firewall: Foreign media’s role in shaping {Chinese} social media narratives on the
                 {Russo-Ukrainian War}",
  journal      = "Proceedings of the National Academy of Sciences",
  volume       = "122",
  number       = "1",
  pages        = "e2420607122",
  year         = "2025",
  DOI          = "10.1073/pnas.2420607122",
  URL          = "https://www.pnas.org/doi/abs/10.1073/pnas.2420607122",
  eprint       = "https://www.pnas.org/doi/pdf/10.1073/pnas.2420607122",
  abstract     = "There is a widespread perception that China’s digital censorship distances its people from the
                 global internet, and the Chinese Communist Party, through state-controlled media, is the main
                 gatekeeper of information about foreign affairs. Our analysis of narratives about the Russo-Ukrainian
                 War circulating on the Chinese social media platform Weibo challenges this view. Comparing narratives
                 on Weibo with 8.26 million unique news articles from 2,500 of some of the most trafficked websites in
                 China, Russia, Ukraine, and the United States (totaling 10,000 sites), we find that Russian news
                 websites published more articles matching narratives found on Weibo than news websites from China,
                 Ukraine, or the United States. Similarly, a plurality of Weibo narratives were most associated with
                 narratives found on Russian news websites while less than ten percent were most associated with
                 narratives from Chinese news sites. Narratives later appearing on Weibo were more likely to first
                 appear on Russian rather than Chinese, Ukrainian, or US news websites, and Russian websites were highly
                 influential for narratives appearing on Weibo. Altogether, these results show that Chinese state media
                 was not the main gatekeeper of information about Russia’s invasion of Ukraine for Weibo users.",
  cc-author-affiliation = "Stanford University, Stanford, CA, USA; Northwestern University, Evanston, IL, USA",
  cc-class     = "political science, news, news narratives, hyperlinkgraph, domain-ranks",
  cc-dataset-used = "hyperlinkgraph, CC-MAIN-2023-06, CC-MAIN-2022-49, CC-MAIN-2022-40, CC-MAIN-2022-33,
                 CC-MAIN-2022-27, CC-MAIN-2022-21, CC-MAIN-2022-05",
  cc-snippet   = "News websites were identified using Amazon Alexa, Common Crawl, and Cloudflare data (SI Appendix,
                 section S2). [...] Namely, we collected the set of most popular websites ranked in Amazon Alexa’s top
                 one million websites and Common Crawl’s Domain Rank datasets from April 2022, which utilize the
                 top-level domain of each country we were interested in (i.e., .cn, .ua, and .ru).ˢ² [ˢ² We utilized
                 both Common Crawl and Amazon Alexa due to the paucity of Chinese, Ukrainian, and Russian domains in the
                 US-dominated Amazon Alexa list.] [...] From each news website in our dataset, we collected news
                 articles published between January 1, 2022, and June 1, 2022. To gather this data, we took two main
                 approaches: (1) gathering available web crawls from Common Crawl [22], and (2) extensively crawling
                 each website retrospectively between November 2022 and March 2023. Common Crawl is widely considered
                 the most complete public source of web crawl data. For each website, we downloaded Common Crawl indexed
                 pagesˢ³ from between January 1, 2022, and January 1, 2023 (CC-MAIN-2023-06, CC-MAIN-2022-49,
                 CC-MAIN-2022-40 CC-MAIN-2022-33, CC-MAIN-2022-27, CC-MAIN-2022-21, CC-MAIN-2022-05), identified the
                 publication date using the Python htmldate library, and included HTML pages published on their websites
                 between January 1, 2022, and June 1, 2022. To further expand the Common Crawl dataset, we performed a
                 breadth-first crawl (15 hops from the homepage) of each website to gather the set of HTML pages that
                 are missing from Common Crawl.",
}

@TechReport{cc:Di-PaoloLiberatiRubeo:2026:GreenWashing-climate-information-and-banking-policies,
  author       = "Di Paolo, Simone and Liberati, Danilo and Rubeo, Lorenzo",
  title        = "{(Green)Washing} the Trust: Climate Information and Banking Policies",
  type         = "Temi di discussione (Working Papers)",
  number       = "1514",
  institution  = "Banca d'Italia",
  year         = "2026",
  URL          = "https://www.bancaditalia.it/pubblicazioni/temi-discussione/2026/2026-1514/en_tema_1514.pdf",
  abstract     = "Greenwashing, that is, the deceptive self-portrayal of companies as sustainable and environmentally
                 friendly, is an increasingly relevant issue in finance. Identifying greenwashers is not a trivial task,
                 given the difficulty of assessing firms’ true environmental profiles, especially when relying on
                 traditional data sources that generally overlook communication strategies and mass perceptions. Using
                 granular credit data from the euro area banking system, we show that during the period 2019-2023,
                 greenwashers, initially identified by combining information on firms’ carbon emissions with an
                 assessment of the reliability of their reporting, were able to borrow at lower interest rates than
                 other companies. We then assess companies’ environmental profiles by extracting textual information
                 from newspapers and the internet. We find that sentiment scores based on firms’ own websites are
                 generally higher than those derived from newspapers, suggesting that companies use their communication
                 channels to place greater emphasis on their sustainable image than is reflected in external sources. By
                 integrating this textual metric with our initial proxy, we construct an alternative definition of
                 greenwashing. Based on a sample of Italian firms, results obtained from this combined proxy are
                 consistent with those derived from structured data alone. Finally, by introducing an unexpected
                 contractionary monetary policy shock into our framework, we confirm the operation of the credit risk
                 channel of monetary policy and find evidence of a reduction in the pricing benefits previously enjoyed
                 by greenwashers.",
  cc-author-affiliation = "Banca d'Italia, Italy",
  cc-class     = "climate change, company websites, banking, policies",
  cc-dataset-used = "CC-MAIN-2024-10",
  cc-snippet   = "We then developed a second Python script using the BeautifulSoup library to crawl these websites and
                 download their complete HTML content, following internal links up to three levels deep within the same
                 domain. When this automated approach failed (mostly for technical reasons, e.g. in the case of
                 single-page applications), we turned to Common Crawl, an open-access repository that regularly archives
                 vast portions of the internet. Common Crawl stores petabytes of raw web data, including HTML pages,
                 metadata, and text extracts, collected through periodic crawls of publicly accessible websites. Its
                 datasets are freely available and widely used in research for tasks such as text mining, search engine
                 development, and large-scale content analysis. However, Common Crawl is less suited for projects
                 focused on a restricted number of websites, since accessing a few domains of interest requires
                 downloading and processing very large amounts of data, often including irrelevant content. Moreover,
                 the temporal granularity of the snapshots and the potential incompleteness of some archived websites
                 may limit its reliability for capturing the most up-to-date corporate information. In our case, we
                 therefore relied on Common Crawl only as a complementary source, to obtain the HTML content for those
                 sites that our script was unable to fetch.¹⁸ [¹⁸We used the snapshot CC-MAIN-2024-10, the 10th
                 main crawl of 2024.]",
}