Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ clean:

# Google Scholar Alerts
gscholar_alerts/extracted_citations.jsonl: gscholar_alerts/eml/
python3 gscholar_alerts/parse_scholar_alert_eml.py $< | LC_ALL=C sort >$@
python3 gscholar_alerts/parse_scholar_alert_eml.py $< gscholar_alerts/citations.jsonl | LC_ALL=C sort >$@

gscholar_alerts/citations.jsonl: gscholar_alerts/extracted_citations.jsonl
jq -c 'select(.title != null and .authors != null) | del(.idx, .date, .data, .ref, .link)' $< >$@
Expand Down
2 changes: 1 addition & 1 deletion bib/cc2017.bib
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
@Article{Schaefer:2017:boilerplate-detection,
author = "Schäfer, Roland",
title = "Accurate and Efficient General-purpose Boilerplate Detection for Crawled Web Corpora",
journal = "Lang. Resour. Eval.",
journal = "Language Resources and Evaluation",
issue_date = "September 2017",
volume = "51",
number = "3",
Expand Down
29 changes: 29 additions & 0 deletions bib/cc2025.bib
Original file line number Diff line number Diff line change
Expand Up @@ -1047,3 +1047,32 @@ @InProceedings{cc:Ligeti-NagyHéjaBánfiFöldesiEtAl:2025:Expanding-Hungarian-Gi
cc-author-affiliation = "ELTE Research Centre for Linguistics, Budapest, Hungary",
cc-class = "nlp/corpus-construction, nlp/text-corpora, language-specific corpus",
}

% arXiv preprint describing Alpha-Root, a cybersecurity dataset mined from the Common Crawl web graph.
% NOTE(review): the arXiv identifier prefix "2602" normally denotes a February 2026 submission,
% whereas the year field and the citation key say 2025 -- TODO confirm the publication year.
@Misc{cc:Mahara:2025:Cybersecurity-data-extraction,
author = "Ashim Mahara",
title = "Cybersecurity Data Extraction from Common Crawl",
year = "2025",
archiveprefix = "arXiv",
eprint = "2602.22218",
primaryclass = "cs.CR",
URL = "https://arxiv.org/abs/2602.22218",
abstract = "Alpha-Root is a cybersecurity-focused dataset collected in a single shot from the Common Crawl web
graph using community detection. Unlike iterative content-scoring approaches like DeepSeekMath, we mine
quality domains directly from the web graph, starting from just 20 trusted seed domains.",
cc-author-affiliation = "Rochester Institute of Technology, USA",
cc-class = "computer-security/internet-security, web-science/hyperlinkgraph",
cc-dataset-used = "hyperlinkgraph",
cc-snippet = "Thus, we introduce Alpha-Root as a new large text corpus from Common Crawl⁵ [⁵commoncrawl.org] —
intended to be used as a pre-training dataset — by extracting cybersecurity-focused domains from the
webgraph provided by Common Crawl. We utilize the Leiden [42] algorithm for community detection to mine
the web graph for domains that represent the cybersecurity community in the webgraph. We then extract
texts from webpages of the members of the community that are also present in the FineWeb-Edu [29]
dataset. [...] CommonCrawl release web graphs for each of their crawls. The cc-graph is a graph of
nodes (domains) and edges between the nodes when the nodes are hyperlinked through a webpage.
Essentially, there exists an edge between two domains if there is at-least one hyperlink between
webpages of the domains. The cc-graph includes more than 100 million nodes and more than 1.8 billion
edges. The processing of such a large graph presents several challenges on its own since commoncrawl
only provides the data in an edge-list format while a lot of existing software libraries require an
adjacency list to work on the graph.",
}

159 changes: 159 additions & 0 deletions bib/cc2026.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
% Journal article. The DOI field holds the bare identifier (no resolver prefix), matching the
% other entries in this file; the clickable landing page stays in URL.
@Article{cc:PeiskerHoffmannMuttarak:2026:Climate-news,
title = "Climate news mediates extreme weather effects on climate change concern",
journal = "Climate Risk Management",
volume = "52",
pages = "100806",
year = "2026",
ISSN = "2212-0963",
DOI = "10.1016/j.crm.2026.100806",
URL = "https://www.sciencedirect.com/science/article/pii/S2212096326000197",
author = "Jonas Peisker and Roman Hoffmann and Raya Muttarak",
keywords = "Climate change concern, Extreme weather, News media, Issue attention, Mediation",
abstract = "As the severe impacts of climate change become increasingly apparent, concerns about climate-related
issues have grown in recent years. The news media plays an important role in disseminating information
about climate change and its consequences to the wider public and thus can influence public climate
concern. Here, we investigate how extreme weather affects issue attention to climate change in the
European online news media and how extreme weather and news coverage jointly shape changes in climate
change concern. For the analysis, we combine 12 harmonized Eurobarometer survey waves, measuring public
concerns about climate issues, with meteorological data and indices of environmental news coverage
based on publications from 2481 media outlets in 200 regions of 22 European countries. Using fixed
effects panel models, we estimate effects of temperature anomalies on climate news and climate concern
and explore the role of the news media in explaining changes in concerns in response to temperature
anomalies. The results indicate that unusually high temperatures exhibit a robust positive effect on
media attention, especially when they overlap with other events that draw attention to the climate
topic, such as major climate change conferences. We furthermore find evidence that the climate news in
national outlets increases public concern about climate change and show that reporting by such outlets
is likely to partly explain the effects of temperature anomalies on concerns. We do not find any
significant effects of climate reporting in regional news outlets on climate concern. Our results
suggest that the national news media partly mediates the effects of extreme weather on public climate
change concern. The findings also highlight that focusing events strongly influence issue attention of
the media, providing windows of opportunity to raise awareness about climate issues, while pointing to
challenges in sustaining attention to related topics beyond short-lived news cycles.",
cc-author-affiliation = "International Institute for Applied Systems Analysis, Population and Just Societies,
Laxenburg, Austria; University of Bologna, Italy",
cc-class = "climate change, climate risk management, news, websiteranking, domain-ranking, hyperlinkgraph",
cc-derived-dataset-used = "OpenPageRank",
cc-snippet = "The weights are provided by the Open Page Rank that is based on Common Crawl project, an open source
database of web crawl data (DomCop, 2022). The Open Page Rank is scaled to a range from 0 to 10. Fig. 1
shows the distribution of page ranks over the included articles.",
}

% Journal article (TACL). The month uses the standard three-letter macro so styles can format it,
% and the abstract no longer ends in the footnote marker left over from the publisher export.
@Article{cc:JumeletWeissweilerNivreBisazza:2026:MultiBLiMP-1.0,
author = "Jumelet, Jaap and Weissweiler, Leonie and Nivre, Joakim and Bisazza, Arianna",
title = "{MultiBLiMP 1.0}: A Massively Multilingual Benchmark of Linguistic Minimal Pairs",
journal = "Transactions of the Association for Computational Linguistics",
volume = "14",
pages = "193--216",
year = "2026",
month = jan,
abstract = "We introduce MultiBLiMP 1.0, a massively multilingual benchmark of linguistic minimal pairs, covering
101 languages and 2 types of subject-verb agreement, containing more than 128,000 minimal pairs. Our
minimal pairs are created using a fully automated pipeline, leveraging the large-scale linguistic
resources of Universal Dependencies and UniMorph. MultiBLiMP 1.0 evaluates abilities of LLMs at an
unprecedented multilingual scale, and highlights the shortcomings of the current state-of-the-art in
modelling low-resource languages.",
ISSN = "2307-387X",
DOI = "10.1162/TACL.a.600",
URL = "https://doi.org/10.1162/TACL.a.600",
eprint = "https://direct.mit.edu/tacl/article-pdf/doi/10.1162/TACL.a.600/2577913/tacl.a.600.pdf",
cc-author-affiliation = "University of Groningen, The Netherlands; Uppsala University, Sweden",
cc-class = "linguistic minimal pairs, language frequency, nlp/corpus-construction, nlp/multi-lingual-corpus",
cc-derived-dataset-used = "GlotCC",
cc-snippet = "Since the training corpora of most LLMs are not publicly available, we estimate this distribution
based on the language frequencies of Kargaran et al. (2024), which were computed on a 3.9T token split
of the Common Crawl corpus. Common Crawl provides a good reflection of the language distribution of the
web-scraped data that is at the core of many LLM training corpora. [...] We also report results for
language subgroups split based on the Common Crawl language frequencies: [...]",
}

% Journal article (PNAS). cc-class and cc-dataset-used are comma-separated lists: every item is
% separated by exactly one comma and there is no trailing comma. The cc-snippet is a verbatim
% quotation and is therefore left exactly as quoted.
@Article{cc:HanleyLuPan:2025:Across-the-firewall,
author = "Hans W. A. Hanley and Yingdan Lu and Jennifer Pan",
title = "Across the firewall: Foreign media’s role in shaping {Chinese} social media narratives on the
{Russo-Ukrainian War}",
journal = "Proceedings of the National Academy of Sciences",
volume = "122",
number = "1",
pages = "e2420607122",
year = "2025",
DOI = "10.1073/pnas.2420607122",
URL = "https://www.pnas.org/doi/abs/10.1073/pnas.2420607122",
eprint = "https://www.pnas.org/doi/pdf/10.1073/pnas.2420607122",
abstract = "There is a widespread perception that China’s digital censorship distances its people from the
global internet, and the Chinese Communist Party, through state-controlled media, is the main
gatekeeper of information about foreign affairs. Our analysis of narratives about the Russo-Ukrainian
War circulating on the Chinese social media platform Weibo challenges this view. Comparing narratives
on Weibo with 8.26 million unique news articles from 2,500 of some of the most trafficked websites in
China, Russia, Ukraine, and the United States (totaling 10,000 sites), we find that Russian news
websites published more articles matching narratives found on Weibo than news websites from China,
Ukraine, or the United States. Similarly, a plurality of Weibo narratives were most associated with
narratives found on Russian news websites while less than ten percent were most associated with
narratives from Chinese news sites. Narratives later appearing on Weibo were more likely to first
appear on Russian rather than Chinese, Ukrainian, or US news websites, and Russian websites were highly
influential for narratives appearing on Weibo. Altogether, these results show that Chinese state media
was not the main gatekeeper of information about Russia’s invasion of Ukraine for Weibo users.",
cc-author-affiliation = "Stanford University, Stanford, CA, USA; Northwestern University, Evanston, IL, USA",
cc-class = "political science, news, news narratives, hyperlinkgraph, domain-ranks",
cc-dataset-used = "hyperlinkgraph, CC-MAIN-2023-06, CC-MAIN-2022-49, CC-MAIN-2022-40, CC-MAIN-2022-33, CC-MAIN-2022-27,
CC-MAIN-2022-21, CC-MAIN-2022-05",
cc-snippet = "News websites were identified using Amazon Alexa, Common Crawl, and Cloudflare data (SI Appendix,
section S2). [...] Namely, we collected the set of most popular websites ranked in Amazon Alexa’s top
one million websites and Common Crawl’s Domain Rank datasets from April 2022, which utilize the
top-level domain of each country we were interested in (i.e., .cn, .ua, and .ru).ˢ² [ˢ² We utilized
both Common Crawl and Amazon Alexa due to the paucity of Chinese, Ukrainian, and Russian domains in the
US-dominated Amazon Alexa list.] [...] From each news website in our dataset, we collected news
articles published between January 1, 2022, and June 1, 2022. To gather this data, we took two main
approaches: (1) gathering available web crawls from Common Crawl [22], and (2) extensively crawling
each website retrospectively between November 2022 and March 2023. Common Crawl is widely considered
the most complete public source of web crawl data. For each website, we downloaded Common Crawl indexed
pagesˢ³ from between January 1, 2022, and January 1, 2023 (CC-MAIN-2023-06, CC-MAIN-2022-49,
CC-MAIN-2022-40 CC-MAIN-2022-33, CC-MAIN-2022-27, CC-MAIN-2022-21, CC-MAIN-2022-05), identified the
publication date using the Python htmldate library, and included HTML pages published on their websites
between January 1, 2022, and June 1, 2022. To further expand the Common Crawl dataset, we performed a
breadth-first crawl (15 hops from the homepage) of each website to gather the set of HTML pages that
are missing from Common Crawl.",
}

% Working paper. Standard techreport styles require an institution and ignore a journal field, so
% the working-paper series name is stored in type and the issuing body in institution.
@TechReport{cc:Di-PaoloLiberatiRubeo:2026:GreenWashing-climate-information-and-banking-policies,
author = "Di Paolo, Simone and Liberati, Danilo and Rubeo, Lorenzo",
title = "{(Green)Washing} the Trust: Climate Information and Banking Policies",
institution = "Banca d'Italia",
type = "Temi di discussione (Working Papers)",
number = "1514",
year = "2026",
URL = "https://www.bancaditalia.it/pubblicazioni/temi-discussione/2026/2026-1514/en_tema_1514.pdf",
abstract = "Greenwashing, that is, the deceptive self-portrayal of companies as sustainable and environmentally
friendly, is an increasingly relevant issue in finance. Identifying greenwashers is not a trivial task,
given the difficulty of assessing firms’ true environmental profiles, especially when relying on
traditional data sources that generally overlook communication strategies and mass perceptions. Using
granular credit data from the euro area banking system, we show that during the period 2019-2023,
greenwashers, initially identified by combining information on firms’ carbon emissions with an
assessment of the reliability of their reporting, were able to borrow at lower interest rates than
other companies. We then assess companies’ environmental profiles by extracting textual information
from newspapers and the internet. We find that sentiment scores based on firms’ own websites are
generally higher than those derived from newspapers, suggesting that companies use their communication
channels to place greater emphasis on their sustainable image than is reflected in external sources. By
integrating this textual metric with our initial proxy, we construct an alternative definition of
greenwashing. Based on a sample of Italian firms, results obtained from this combined proxy are
consistent with those derived from structured data alone. Finally, by introducing an unexpected
contractionary monetary policy shock into our framework, we confirm the operation of the credit risk
channel of monetary policy and find evidence of a reduction in the pricing benefits previously enjoyed
by greenwashers.",
cc-author-affiliation = "Banca d'Italia, Italy",
cc-class = "climate change, company websites, banking, policies",
cc-dataset-used = "CC-MAIN-2024-10",
cc-snippet = "We then developed a second Python script using the BeautifulSoup library to crawl these websites and
download their complete HTML content, following internal links up to three levels deep within the same
domain. When this automated approach failed (mostly for technical reasons, e.g. in the case of
single-page applications), we turned to Common Crawl, an open-access repository that regularly archives
vast portions of the internet. Common Crawl stores petabytes of raw web data, including HTML pages,
metadata, and text extracts, collected through periodic crawls of publicly accessible websites. Its
datasets are freely available and widely used in research for tasks such as text mining, search engine
development, and large-scale content analysis. However, Common Crawl is less suited for projects
focused on a restricted number of websites, since accessing a few domains of interest requires
downloading and processing very large amounts of data, often including irrelevant content. Moreover,
the temporal granularity of the snapshots and the potential incompleteness of some archived websites
may limit its reliability for capturing the most up-to-date corporate information. In our case, we
therefore relied on Common Crawl only as a complementary source, to obtain the HTML content for those
sites that our script was unable to fetch.¹⁸ [¹⁸We used the snapshot CC-MAIN-2024-10, the 10th
main crawl of 2024.]",
}

Loading