Skip to content

Commit 51b1323

Browse files
Merge pull request #11 from commoncrawl/update-citations-2025-2026-dev-jan-feb
Update citations 2025/2026 December - March
2 parents d430412 + f8ba214 commit 51b1323

6 files changed

Lines changed: 965 additions & 30 deletions

File tree

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ clean:
6262

6363
# Google Scholar Alerts
6464
gscholar_alerts/extracted_citations.jsonl: gscholar_alerts/eml/
65-
python3 gscholar_alerts/parse_scholar_alert_eml.py $< | LC_ALL=C sort >$@
65+
python3 gscholar_alerts/parse_scholar_alert_eml.py $< gscholar_alerts/citations.jsonl | LC_ALL=C sort >$@
6666

6767
gscholar_alerts/citations.jsonl: gscholar_alerts/extracted_citations.jsonl
6868
jq -c 'select(.title != null and .authors != null) | del(.idx, .date, .data, .ref, .link)' $< >$@

bib/cc2017.bib

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
@Article{Schaefer:2017:boilerplate-detection,
22
author = "Schäfer, Roland",
33
title = "Accurate and Efficient General-purpose Boilerplate Detection for Crawled Web Corpora",
4-
journal = "Lang. Resour. Eval.",
4+
journal = "Language Resources and Evaluation",
55
issue_date = "September 2017",
66
volume = "51",
77
number = "3",

bib/cc2025.bib

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,3 +1047,32 @@ @InProceedings{cc:Ligeti-NagyHéjaBánfiFöldesiEtAl:2025:Expanding-Hungarian-Gi
10471047
cc-author-affiliation = "ELTE Research Centre for Linguistics, Budapest, Hungary",
10481048
cc-class = "nlp/corpus-construction, nlp/text-corpora, language-specific corpus",
10491049
}
1050+
1051+
@Misc{cc:Mahara:2025:Cybersecurity-data-extraction,
1052+
title = "Cybersecurity Data Extraction from Common Crawl",
1053+
author = "Ashim Mahara",
1054+
year = "2025",
1055+
eprint = "2602.22218",
1056+
archiveprefix = "arXiv",
1057+
primaryclass = "cs.CR",
1058+
URL = "https://arxiv.org/abs/2602.22218",
1059+
abstract = "Alpha-Root is a cybersecurity-focused dataset collected in a single shot from the Common Crawl web
1060+
graph using community detection. Unlike iterative content-scoring approaches like DeepSeekMath, we mine
1061+
quality domains directly from the web graph, starting from just 20 trusted seed domains.",
1062+
cc-author-affiliation = "Rochester Institute of Technology, USA",
1063+
cc-class = "computer-security/internet-security, web-science/hyperlinkgraph",
1064+
cc-dataset-used = "hyperlinkgraph",
1065+
cc-snippet = "Thus, we introduce Alpha-Root as a new large text corpus from Common Crawl⁵ [⁵commoncrawl.org] —
1066+
intended to be used as a pre-training dataset — by extracting cybersecurity-focused domains from the
1067+
webgraph provided by Common Crawl. We utilize the Leiden [42] algorithm for community detection to mine
1068+
the web graph for domains that represent the cybersecurity community in the webgraph. We then extract
1069+
texts from webpages of the members of the community that are also present in the FineWeb-Edu [29]
1070+
dataset. [...] CommonCrawl release web graphs for each of their crawls. The cc-graph is a graph of
1071+
nodes (domains) and edges between the nodes when the nodes are hyperlinked through a webpage.
1072+
Essentially, there exists an edge between two domains if there is at-least one hyperlink between
1073+
webpages of the domains. The cc-graph includes more than 100 million nodes and more than 1.8 billion
1074+
edges. The processing of such a large graph presents several challenges on its own since commoncrawl
1075+
only provides the data in an edge-list format while a lot of existing software libraries require an
1076+
adjacency list to work on the graph.",
1077+
}
1078+

bib/cc2026.bib

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
@Article{cc:PeiskerHoffmannMuttarak:2026:Climate-news,
2+
title = "Climate news mediates extreme weather effects on climate change concern",
3+
journal = "Climate Risk Management",
4+
volume = "52",
5+
pages = "100806",
6+
year = "2026",
7+
ISSN = "2212-0963",
8+
DOI = "10.1016/j.crm.2026.100806",
9+
URL = "https://www.sciencedirect.com/science/article/pii/S2212096326000197",
10+
author = "Jonas Peisker and Roman Hoffmann and Raya Muttarak",
11+
keywords = "Climate change concern, Extreme weather, News media, Issue attention, Mediation",
12+
abstract = "As the severe impacts of climate change become increasingly apparent, concerns about climate-related
13+
issues have grown in recent years. The news media plays an important role in disseminating information
14+
about climate change and its consequences to the wider public and thus can influence public climate
15+
concern. Here, we investigate how extreme weather affects issue attention to climate change in the
16+
European online news media and how extreme weather and news coverage jointly shape changes in climate
17+
change concern. For the analysis, we combine 12 harmonized Eurobarometer survey waves, measuring public
18+
concerns about climate issues, with meteorological data and indices of environmental news coverage
19+
based on publications from 2481 media outlets in 200 regions of 22 European countries. Using fixed
20+
effects panel models, we estimate effects of temperature anomalies on climate news and climate concern
21+
and explore the role of the news media in explaining changes in concerns in response to temperature
22+
anomalies. The results indicate that unusually high temperatures exhibit a robust positive effect on
23+
media attention, especially when they overlap with other events that draw attention to the climate
24+
topic, such as major climate change conferences. We furthermore find evidence that the climate news in
25+
national outlets increases public concern about climate change and show that reporting by such outlets
26+
is likely to partly explain the effects of temperature anomalies on concerns. We do not find any
27+
significant effects of climate reporting in regional news outlets on climate concern. Our results
28+
suggest that the national news media partly mediates the effects of extreme weather on public climate
29+
change concern. The findings also highlight that focusing events strongly influence issue attention of
30+
the media, providing windows of opportunity to raise awareness about climate issues, while pointing to
31+
challenges in sustaining attention to related topics beyond short-lived news cycles.",
32+
cc-author-affiliation = "International Institute for Applied Systems Analysis, Population and Just Societies,
33+
Laxenburg, Austria; University of Bologna, Italy",
34+
cc-class = "climate change, climate risk management, news, websiteranking, domain-ranking, hyperlinkgraph",
35+
cc-derived-dataset-used = "OpenPageRank",
36+
cc-snippet = "The weights are provided by the Open Page Rank that is based on Common Crawl project, an open source
37+
database of web crawl data (DomCop, 2022). The Open Page Rank is scaled to a range from 0 to 10. Fig. 1
38+
shows the distribution of page ranks over the included articles.",
39+
}
40+
41+
@Article{cc:JumeletWeissweilerNivreBisazza:2026:MultiBLiMP-1.0,
42+
author = "Jumelet, Jaap and Weissweiler, Leonie and Nivre, Joakim and Bisazza, Arianna",
43+
title = "{MultiBLiMP 1.0}: A Massively Multilingual Benchmark of Linguistic Minimal Pairs",
44+
journal = "Transactions of the Association for Computational Linguistics",
45+
volume = "14",
46+
pages = "193--216",
47+
year = "2026",
48+
month = "01",
49+
abstract = "We introduce MultiBLiMP 1.0, a massively multilingual benchmark of linguistic minimal pairs, covering
50+
101 languages and 2 types of subject-verb agreement, containing more than 128,000 minimal pairs. Our
51+
minimal pairs are created using a fully automated pipeline, leveraging the large-scale linguistic
52+
resources of Universal Dependencies and UniMorph. MultiBLiMP 1.0 evaluates abilities of LLMs at an
53+
unprecedented multilingual scale, and highlights the shortcomings of the current state-of-the-art in
54+
modelling low-resource languages.",
55+
ISSN = "2307-387X",
56+
DOI = "10.1162/TACL.a.600",
57+
URL = "https://doi.org/10.1162/TACL.a.600",
58+
eprint = "https://direct.mit.edu/tacl/article-pdf/doi/10.1162/TACL.a.600/2577913/tacl.a.600.pdf",
59+
cc-author-affiliation = "University of Groningen, The Netherlands; Uppsala University, Sweden",
60+
cc-class = "linguistic minimal pairs, language frequency, nlp/corpus-construction, nlp/multi-lingual-corpus",
61+
cc-derived-dataset-used = "GlotCC",
62+
cc-snippet = "Since the training corpora of most LLMs are not publicly available, we estimate this distribution
63+
based on the language frequencies of Kargaran et al. (2024), which were computed on a 3.9T token split
64+
of the Common Crawl corpus. Common Crawl provides a good reflection of the language distribution of the
65+
web-scraped data that is at the core of many LLM training corpora. [...] We also report results for
66+
language subgroups split based on the Common Crawl language frequencies: [...]",
67+
}
68+
69+
@Article{cc:HanleyLuPan:2025:Across-the-firewall,
70+
author = "Hans W. A. Hanley and Yingdan Lu and Jennifer Pan",
71+
title = "Across the firewall: Foreign media’s role in shaping Chinese social media narratives on the
72+
Russo-Ukrainian War",
73+
journal = "Proceedings of the National Academy of Sciences",
74+
volume = "122",
75+
number = "1",
76+
pages = "e2420607122",
77+
year = "2025",
78+
DOI = "10.1073/pnas.2420607122",
79+
URL = "https://www.pnas.org/doi/abs/10.1073/pnas.2420607122",
80+
eprint = "https://www.pnas.org/doi/pdf/10.1073/pnas.2420607122",
81+
abstract = "There is a widespread perception that China’s digital censorship distances its people from the
82+
global internet, and the Chinese Communist Party, through state-controlled media, is the main
83+
gatekeeper of information about foreign affairs. Our analysis of narratives about the Russo-Ukrainian
84+
War circulating on the Chinese social media platform Weibo challenges this view. Comparing narratives
85+
on Weibo with 8.26 million unique news articles from 2,500 of some of the most trafficked websites in
86+
China, Russia, Ukraine, and the United States (totaling 10,000 sites), we find that Russian news
87+
websites published more articles matching narratives found on Weibo than news websites from China,
88+
Ukraine, or the United States. Similarly, a plurality of Weibo narratives were most associated with
89+
narratives found on Russian news websites while less than ten percent were most associated with
90+
narratives from Chinese news sites. Narratives later appearing on Weibo were more likely to first
91+
appear on Russian rather than Chinese, Ukrainian, or US news websites, and Russian websites were highly
92+
influential for narratives appearing on Weibo. Altogether, these results show that Chinese state media
93+
was not the main gatekeeper of information about Russia’s invasion of Ukraine for Weibo users.",
94+
cc-author-affiliation = "Stanford University, Stanford, CA, USA; Northwestern University, Evanston, IL, USA",
95+
cc-class = "political science, news, news narratives, hyperlinkgraph, domain-ranks",
96+
cc-dataset-used = "hyperlinkgraph, CC-MAIN-2023-06, CC-MAIN-2022-49, CC-MAIN-2022-40, CC-MAIN-2022-33, CC-MAIN-2022-27,
97+
CC-MAIN-2022-21, CC-MAIN-2022-05",
98+
cc-snippet = "News websites were identified using Amazon Alexa, Common Crawl, and Cloudflare data (SI Appendix,
99+
section S2). [...] Namely, we collected the set of most popular websites ranked in Amazon Alexa’s top
100+
one million websites and Common Crawl’s Domain Rank datasets from April 2022, which utilize the
101+
top-level domain of each country we were interested in (i.e., .cn, .ua, and .ru).ˢ² [ˢ² We utilized
102+
both Common Crawl and Amazon Alexa due to the paucity of Chinese, Ukrainian, and Russian domains in the
103+
US-dominated Amazon Alexa list.] [...] From each news website in our dataset, we collected news
104+
articles published between January 1, 2022, and June 1, 2022. To gather this data, we took two main
105+
approaches: (1) gathering available web crawls from Common Crawl [22], and (2) extensively crawling
106+
each website retrospectively between November 2022 and March 2023. Common Crawl is widely considered
107+
the most complete public source of web crawl data. For each website, we downloaded Common Crawl indexed
108+
pagesˢ³ from between January 1, 2022, and January 1, 2023 (CC-MAIN-2023-06, CC-MAIN-2022-49,
109+
CC-MAIN-2022-40 CC-MAIN-2022-33, CC-MAIN-2022-27, CC-MAIN-2022-21, CC-MAIN-2022-05), identified the
110+
publication date using the Python htmldate library, and included HTML pages published on their websites
111+
between January 1, 2022, and June 1, 2022. To further expand the Common Crawl dataset, we performed a
112+
breadth-first crawl (15 hops from the homepage) of each website to gather the set of HTML pages that
113+
are missing from Common Crawl.",
114+
}
115+
116+
@TechReport{cc:Di-PaoloLiberatiRubeo:2026:GreenWashing-climate-information-and-banking-policies,
117+
year = "2026",
118+
title = "{(Green)Washing} the Trust: Climate Information and Banking Policies",
119+
author = "Di Paolo, Simone and Liberati, Danilo and Rubeo, Lorenzo",
120+
URL = "https://www.bancaditalia.it/pubblicazioni/temi-discussione/2026/2026-1514/en_tema_1514.pdf",
121+
institution = "Banca d'Italia",
type = "Temi di discussione (Working Papers)",
122+
number = "1514",
123+
abstract = "Greenwashing, that is, the deceptive self-portrayal of companies as sustainable and environmentally
124+
friendly, is an increasingly relevant issue in finance. Identifying greenwashers is not a trivial task,
125+
given the difficulty of assessing firms’ true environmental profiles, especially when relying on
126+
traditional data sources that generally overlook communication strategies and mass perceptions. Using
127+
granular credit data from the euro area banking system, we show that during the period 2019-2023,
128+
greenwashers, initially identified by combining information on firms’ carbon emissions with an
129+
assessment of the reliability of their reporting, were able to borrow at lower interest rates than
130+
other companies. We then assess companies’ environmental profiles by extracting textual information
131+
from newspapers and the internet. We find that sentiment scores based on firms’ own websites are
132+
generally higher than those derived from newspapers, suggesting that companies use their communication
133+
channels to place greater emphasis on their sustainable image than is reflected in external sources. By
134+
integrating this textual metric with our initial proxy, we construct an alternative definition of
135+
greenwashing. Based on a sample of Italian firms, results obtained from this combined proxy are
136+
consistent with those derived from structured data alone. Finally, by introducing an unexpected
137+
contractionary monetary policy shock into our framework, we confirm the operation of the credit risk
138+
channel of monetary policy and find evidence of a reduction in the pricing benefits previously enjoyed
139+
by greenwashers.",
140+
cc-author-affiliation = "Banca d'Italia, Italy",
141+
cc-class = "climate change, company websites, banking, policies",
142+
cc-dataset-used = "CC-MAIN-2024-10",
143+
cc-snippet = "We then developed a second Python script using the BeautifulSoup library to crawl these websites and
144+
download their complete HTML content, following internal links up to three levels deep within the same
145+
domain. When this automated approach failed (mostly for technical reasons, e.g. in the case of
146+
single-page applications), we turned to Common Crawl, an open-access repository that regularly archives
147+
vast portions of the internet. Common Crawl stores petabytes of raw web data, including HTML pages,
148+
metadata, and text extracts, collected through periodic crawls of publicly accessible websites. Its
149+
datasets are freely available and widely used in research for tasks such as text mining, search engine
150+
development, and large-scale content analysis. However, Common Crawl is less suited for projects
151+
focused on a restricted number of websites, since accessing a few domains of interest requires
152+
downloading and processing very large amounts of data, often including irrelevant content. Moreover,
153+
the temporal granularity of the snapshots and the potential incompleteness of some archived websites
154+
may limit its reliability for capturing the most up-to-date corporate information. In our case, we
155+
therefore relied on Common Crawl only as a complementary source, to obtain the HTML content for those
156+
sites that our script was unable to fetch.¹⁸ [¹⁸We used the snapshot CC-MAIN-2024-10, the 10th
157+
main crawl of 2024.]",
158+
}
159+

0 commit comments

Comments
 (0)