Merge pull request #11 from commoncrawl/update-citations-2025-2026-dev-jan-feb

sebastian-nagel · web-flow · commit 51b1323129dd · 2026-03-19T08:28:34.000+01:00
Update citations 2025/2026 December - March
diff --git a/Makefile b/Makefile
@@ -62,7 +62,7 @@ clean:
 
 # Google Scholar Alerts
 gscholar_alerts/extracted_citations.jsonl: gscholar_alerts/eml/
-	python3 gscholar_alerts/parse_scholar_alert_eml.py $< | LC_ALL=C sort >$@
+	python3 gscholar_alerts/parse_scholar_alert_eml.py $< gscholar_alerts/citations.jsonl | LC_ALL=C sort >$@
 
 gscholar_alerts/citations.jsonl: gscholar_alerts/extracted_citations.jsonl
 	jq -c 'select(.title != null and .authors != null) | del(.idx, .date, .data, .ref, .link)' $< >$@
diff --git a/bib/cc2017.bib b/bib/cc2017.bib
@@ -1,7 +1,7 @@
 @Article{Schaefer:2017:boilerplate-detection,
   author       = "Schäfer, Roland",
   title        = "Accurate and Efficient General-purpose Boilerplate Detection for Crawled Web Corpora",
-  journal      = "Lang. Resour. Eval.",
+  journal      = "Language Resources and Evaluation",
   issue_date   = "September 2017",
   volume       = "51",
   number       = "3",
diff --git a/bib/cc2025.bib b/bib/cc2025.bib
@@ -1047,3 +1047,32 @@ @InProceedings{cc:Ligeti-NagyHéjaBánfiFöldesiEtAl:2025:Expanding-Hungarian-Gi
   cc-author-affiliation = "ELTE Research Centre for Linguistics, Budapest, Hungary",
   cc-class     = "nlp/corpus-construction, nlp/text-corpora, language-specific corpus",
 }
+
+@Misc{cc:Mahara:2025:Cybersecurity-data-extraction,
+  title        = "Cybersecurity Data Extraction from Common Crawl",
+  author       = "Ashim Mahara",
+  year         = "2025",
+  eprint       = "2602.22218",
+  archiveprefix = "arXiv",
+  primaryclass = "cs.CR",
+  URL          = "https://arxiv.org/abs/2602.22218",
+  abstract     = "Alpha-Root is a cybersecurity-focused dataset collected in a single shot from the Common Crawl web
+                 graph using community detection. Unlike iterative content-scoring approaches like DeepSeekMath, we mine
+                 quality domains directly from the web graph, starting from just 20 trusted seed domains.",
+  cc-author-affiliation = "Rochester Institute of Technology, USA",
+  cc-class     = "computer-security/internet-security, web-science/hyperlinkgraph",
+  cc-dataset-used = "hyperlinkgraph",
+  cc-snippet   = "Thus, we introduce Alpha-Root as a new large text corpus from Common Crawl⁵ [⁵commoncrawl.org] —
+                 intended to be used as a pre-training dataset — by extracting cybersecurity-focused domains from the
+                 webgraph provided by Common Crawl. We utilize the Leiden [42] algorithm for community detection to mine
+                 the web graph for domains that represent the cybersecurity community in the webgraph. We then extract
+                 texts from webpages of the members of the community that are also present in the FineWeb-Edu [29]
+                 dataset. [...] CommonCrawl release web graphs for each of their crawls. The cc-graph is a graph of
+                 nodes (domains) and edges between the nodes when the nodes are hyperlinked through a webpage.
+                 Essentially, there exists an edge between two domains if there is at-least one hyperlink between
+                 webpages of the domains. The cc-graph includes more than 100 million nodes and more than 1.8 billion
+                 edges. The processing of such a large graph presents several challenges on its own since commoncrawl
+                 only provides the data in an edge-list format while a lot of existing software libraries require an
+                 adjacency list to work on the graph.",
+}
+
diff --git a/bib/cc2026.bib b/bib/cc2026.bib
@@ -0,0 +1,159 @@
+@Article{cc:PeiskerHoffmannMuttarak:2026:Climate-news,
+  title        = "Climate news mediates extreme weather effects on climate change concern",
+  journal      = "Climate Risk Management",
+  volume       = "52",
+  pages        = "100806",
+  year         = "2026",
+  ISSN         = "2212-0963",
+  DOI          = "10.1016/j.crm.2026.100806",
+  URL          = "https://www.sciencedirect.com/science/article/pii/S2212096326000197",
+  author       = "Jonas Peisker and Roman Hoffmann and Raya Muttarak",
+  keywords     = "Climate change concern, Extreme weather, News media, Issue attention, Mediation",
+  abstract     = "As the severe impacts of climate change become increasingly apparent, concerns about climate-related
+                 issues have grown in recent years. The news media plays an important role in disseminating information
+                 about climate change and its consequences to the wider public and thus can influence public climate
+                 concern. Here, we investigate how extreme weather affects issue attention to climate change in the
+                 European online news media and how extreme weather and news coverage jointly shape changes in climate
+                 change concern. For the analysis, we combine 12 harmonized Eurobarometer survey waves, measuring public
+                 concerns about climate issues, with meteorological data and indices of environmental news coverage
+                 based on publications from 2481 media outlets in 200 regions of 22 European countries. Using fixed
+                 effects panel models, we estimate effects of temperature anomalies on climate news and climate concern
+                 and explore the role of the news media in explaining changes in concerns in response to temperature
+                 anomalies. The results indicate that unusually high temperatures exhibit a robust positive effect on
+                 media attention, especially when they overlap with other events that draw attention to the climate
+                 topic, such as major climate change conferences. We furthermore find evidence that the climate news in
+                 national outlets increases public concern about climate change and show that reporting by such outlets
+                 is likely to partly explain the effects of temperature anomalies on concerns. We do not find any
+                 significant effects of climate reporting in regional news outlets on climate concern. Our results
+                 suggest that the national news media partly mediates the effects of extreme weather on public climate
+                 change concern. The findings also highlight that focusing events strongly influence issue attention of
+                 the media, providing windows of opportunity to raise awareness about climate issues, while pointing to
+                 challenges in sustaining attention to related topics beyond short-lived news cycles.",
+  cc-author-affiliation = "International Institute for Applied Systems Analysis, Population and Just Societies,
+                 Laxenburg, Austria; University of Bologna, Italy",
+  cc-class     = "climate change, climate risk management, news, websiteranking, domain-ranking, hyperlinkgraph",
+  cc-derived-dataset-used = "OpenPageRank",
+  cc-snippet   = "The weights are provided by the Open Page Rank that is based on Common Crawl project, an open source
+                 database of web crawl data (DomCop, 2022). The Open Page Rank is scaled to a range from 0 to 10. Fig. 1
+                 shows the distribution of page ranks over the included articles.",
+}
+
+@Article{cc:JumeletWeissweilerNivreBisazza:2026:MultiBLiMP-1.0,
+  author       = "Jumelet, Jaap and Weissweiler, Leonie and Nivre, Joakim and Bisazza, Arianna",
+  title        = "{MultiBLiMP 1.0}: A Massively Multilingual Benchmark of Linguistic Minimal Pairs",
+  journal      = "Transactions of the Association for Computational Linguistics",
+  volume       = "14",
+  pages        = "193--216",
+  year         = "2026",
+  month        = "01",
+  abstract     = "We introduce MultiBLiMP 1.0, a massively multilingual benchmark of linguistic minimal pairs, covering
+                 101 languages and 2 types of subject-verb agreement, containing more than 128,000 minimal pairs. Our
+                 minimal pairs are created using a fully automated pipeline, leveraging the large-scale linguistic
+                 resources of Universal Dependencies and UniMorph. MultiBLiMP 1.0 evaluates abilities of LLMs at an
+                 unprecedented multilingual scale, and highlights the shortcomings of the current state-of-the-art in
+                 modelling low-resource languages.",
+  ISSN         = "2307-387X",
+  DOI          = "10.1162/TACL.a.600",
+  URL          = "https://doi.org/10.1162/TACL.a.600",
+  eprint       = "https://direct.mit.edu/tacl/article-pdf/doi/10.1162/TACL.a.600/2577913/tacl.a.600.pdf",
+  cc-author-affiliation = "University of Groningen, The Netherlands; Uppsala University, Sweden",
+  cc-class     = "linguistic minimal pairs, language frequency, nlp/corpus-construction, nlp/multi-lingual-corpus",
+  cc-derived-dataset-used = "GlotCC",
+  cc-snippet   = "Since the training corpora of most LLMs are not publicly available, we estimate this distribution
+                 based on the language frequencies of Kargaran et al. (2024), which were computed on a 3.9T token split
+                 of the Common Crawl corpus. Common Crawl provides a good reflection of the language distribution of the
+                 web-scraped data that is at the core of many LLM training corpora. [...] We also report results for
+                 language subgroups split based on the Common Crawl language frequencies: [...]",
+}
+
+@Article{cc:HanleyLuPan:2025:Across-the-firewall,
+  author       = "Hans W. A. Hanley and Yingdan Lu and Jennifer Pan",
+  title        = "Across the firewall: Foreign media’s role in shaping Chinese social media narratives on the
+                 Russo-Ukrainian War",
+  journal      = "Proceedings of the National Academy of Sciences",
+  volume       = "122",
+  number       = "1",
+  pages        = "e2420607122",
+  year         = "2025",
+  DOI          = "10.1073/pnas.2420607122",
+  URL          = "https://www.pnas.org/doi/abs/10.1073/pnas.2420607122",
+  eprint       = "https://www.pnas.org/doi/pdf/10.1073/pnas.2420607122",
+  abstract     = "There is a widespread perception that China’s digital censorship distances its people from the
+                 global internet, and the Chinese Communist Party, through state-controlled media, is the main
+                 gatekeeper of information about foreign affairs. Our analysis of narratives about the Russo-Ukrainian
+                 War circulating on the Chinese social media platform Weibo challenges this view. Comparing narratives
+                 on Weibo with 8.26 million unique news articles from 2,500 of some of the most trafficked websites in
+                 China, Russia, Ukraine, and the United States (totaling 10,000 sites), we find that Russian news
+                 websites published more articles matching narratives found on Weibo than news websites from China,
+                 Ukraine, or the United States. Similarly, a plurality of Weibo narratives were most associated with
+                 narratives found on Russian news websites while less than ten percent were most associated with
+                 narratives from Chinese news sites. Narratives later appearing on Weibo were more likely to first
+                 appear on Russian rather than Chinese, Ukrainian, or US news websites, and Russian websites were highly
+                 influential for narratives appearing on Weibo. Altogether, these results show that Chinese state media
+                 was not the main gatekeeper of information about Russia’s invasion of Ukraine for Weibo users.",
+  cc-author-affiliation = "Stanford University, Stanford, CA, USA; Northwestern University, Evanston, IL, USA",
+  cc-class     = "political science, news, news narratives, hyperlinkgraph, domain-ranks",
+  cc-dataset-used = "hyperlinkgraph, CC-MAIN-2023-06, CC-MAIN-2022-49, CC-MAIN-2022-40, CC-MAIN-2022-33, CC-MAIN-2022-27,
+                 CC-MAIN-2022-21, CC-MAIN-2022-05",
+  cc-snippet   = "News websites were identified using Amazon Alexa, Common Crawl, and Cloudflare data (SI Appendix,
+                 section S2). [...] Namely, we collected the set of most popular websites ranked in Amazon Alexa’s top
+                 one million websites and Common Crawl’s Domain Rank datasets from April 2022, which utilize the
+                 top-level domain of each country we were interested in (i.e., .cn, .ua, and .ru).ˢ² [ˢ² We utilized
+                 both Common Crawl and Amazon Alexa due to the paucity of Chinese, Ukrainian, and Russian domains in the
+                 US-dominated Amazon Alexa list.] [...] From each news website in our dataset, we collected news
+                 articles published between January 1, 2022, and June 1, 2022. To gather this data, we took two main
+                 approaches: (1) gathering available web crawls from Common Crawl [22], and (2) extensively crawling
+                 each website retrospectively between November 2022 and March 2023. Common Crawl is widely considered
+                 the most complete public source of web crawl data. For each website, we downloaded Common Crawl indexed
+                 pagesˢ³ from between January 1, 2022, and January 1, 2023 (CC-MAIN-2023-06, CC-MAIN-2022-49,
+                 CC-MAIN-2022-40 CC-MAIN-2022-33, CC-MAIN-2022-27, CC-MAIN-2022-21, CC-MAIN-2022-05), identified the
+                 publication date using the Python htmldate library, and included HTML pages published on their websites
+                 between January 1, 2022, and June 1, 2022. To further expand the Common Crawl dataset, we performed a
+                 breadth-first crawl (15 hops from the homepage) of each website to gather the set of HTML pages that
+                 are missing from Common Crawl.",
+}
+
+@TechReport{cc:Di-PaoloLiberatiRubeo:2026:GreenWashing-climate-information-and-banking-policies,
+  year         = "2026",
+  title        = "{(Green)Washing} the Trust: Climate Information and Banking Policies",
+  author       = "Di Paolo, Simone and Liberati, Danilo and Rubeo, Lorenzo",
+  URL          = "https://www.bancaditalia.it/pubblicazioni/temi-discussione/2026/2026-1514/en_tema_1514.pdf",
+  journal      = "Temi di discussione (Working Papers)",
+  number       = "1514",
+  abstract     = "Greenwashing, that is, the deceptive self-portrayal of companies as sustainable and environmentally
+                 friendly, is an increasingly relevant issue in finance. Identifying greenwashers is not a trivial task,
+                 given the difficulty of assessing firms’ true environmental profiles, especially when relying on
+                 traditional data sources that generally overlook communication strategies and mass perceptions. Using
+                 granular credit data from the euro area banking system, we show that during the period 2019-2023,
+                 greenwashers, initially identified by combining information on firms’ carbon emissions with an
+                 assessment of the reliability of their reporting, were able to borrow at lower interest rates than
+                 other companies. We then assess companies’ environmental profiles by extracting textual information
+                 from newspapers and the internet. We find that sentiment scores based on firms’ own websites are
+                 generally higher than those derived from newspapers, suggesting that companies use their communication
+                 channels to place greater emphasis on their sustainable image than is reflected in external sources. By
+                 integrating this textual metric with our initial proxy, we construct an alternative definition of
+                 greenwashing. Based on a sample of Italian firms, results obtained from this combined proxy are
+                 consistent with those derived from structured data alone. Finally, by introducing an unexpected
+                 contractionary monetary policy shock into our framework, we confirm the operation of the credit risk
+                 channel of monetary policy and find evidence of a reduction in the pricing benefits previously enjoyed
+                 by greenwashers.",
+  cc-author-affiliation = "Banca d'Italia, Italy",
+  cc-class     = "climate change, company websites, banking, policies",
+  cc-dataset-used = "CC-MAIN-2024-10",
+  cc-snippet   = "We then developed a second Python script using the BeautifulSoup library to crawl these websites and
+                 download their complete HTML content, following internal links up to three levels deep within the same
+                 domain. When this automated approach failed (mostly for technical reasons, e.g. in the case of
+                 single-page applications), we turned to Common Crawl, an open-access repository that regularly archives
+                 vast portions of the internet. Common Crawl stores petabytes of raw web data, including HTML pages,
+                 metadata, and text extracts, collected through periodic crawls of publicly accessible websites. Its
+                 datasets are freely available and widely used in research for tasks such as text mining, search engine
+                 development, and large-scale content analysis. However, Common Crawl is less suited for projects
+                 focused on a restricted number of websites, since accessing a few domains of interest requires
+                 downloading and processing very large amounts of data, often including irrelevant content. Moreover,
+                 the temporal granularity of the snapshots and the potential incompleteness of some archived websites
+                 may limit its reliability for capturing the most up-to-date corporate information. In our case, we
+                 therefore relied on Common Crawl only as a complementary source, to obtain the HTML content for those
+                 sites that our script was unable to fetch.¹⁸ [¹⁸We used the snapshot CC-MAIN-2024-10, the 10th
+                 main crawl of 2024.]",
+}
+
diff --git a/gscholar_alerts/citations.jsonl b/gscholar_alerts/citations.jsonl
diff --git a/gscholar_alerts/parse_scholar_alert_eml.py b/gscholar_alerts/parse_scholar_alert_eml.py