fix: format errors in BibTeX citations

sebastian-nagel · sebastian-nagel · commit 219f352aa40b · 2025-07-08T18:59:39.000+02:00
diff --git a/bib/cc2023.bib b/bib/cc2023.bib
@@ -1326,8 +1326,8 @@ @Misc{cc:Tenis:2023:Efficient-URL-phishing-detection
   volume       = "47",
   number       = "2",
   year         = "2023",
-  title        = "Modelling an Efficient {URL} Phishing Detection Approach Based on a Dense Network Model. Computer
-                 Systems Science & Engineering . 2023, Vol. 47 Issue 2, p2625-2641. 17p.",
+  title        = "Modelling an Efficient {URL} Phishing Detection Approach Based on a Dense Network Model",
+  pages        = "2625--2641",
   author       = "Tenis, A. Aldo; Santhosh, R.",
   URL          = "https://web.s.ebscohost.com/abstract?direct=true&profile=ehost&scope=site&authtype=crawler&jrnl=02676192&AN=169779920&h=WGjAKpK7ACB1ZcUfp8Ikhm9IcDPjsbjptgyhA5ityW47Z2oYK4JmZTEMhj6t1UhLOFgbraBWyMgS1NID6mz%2bcA%3d%3d&crl=c&resultNs=AdminWebAuth&resultLocal=ErrCrlNotAuth&crlhashurl=login.aspx%3fdirect%3dtrue%26profile%3dehost%26scope%3dsite%26authtype%3dcrawler%26jrnl%3d02676192%26AN%3d169779920",
   abstract     = "The social engineering cyber-attack is where culprits mislead the users by getting the login details
@@ -1386,7 +1386,7 @@ @InProceedings{cc:TirumalaSimigAghajanyanMorcos:2023:Improving-LLM-Pretraining
 }
 
 @Article{cc:WangKimMittalRexford:2023:RAVEN-Stateless-Rapid-IP-Address-Variation,
-  title        = "{RAVEN}: Stateless Rapid {IP} Address Variation for Enterprise Networks.",
+  title        = "{RAVEN}: Stateless Rapid {IP} Address Variation for Enterprise Networks",
   author       = "Wang, Liang and Kim, Hyojoon and Mittal, Prateek and Rexford, Jennifer",
   journal      = "Proc. Priv. Enhancing Technol.",
   volume       = "2023",
diff --git a/bib/cc2024.bib b/bib/cc2024.bib
@@ -142,7 +142,16 @@ @InProceedings{cc:Thompson:2024:Longitudinal-web-analytics
 }
 
 @InProceedings{cc:ElOuadi:2024:Comparison-CC-News-GDELT,
+  author       = "El Ouadi, Ameir and Beskow, David",
   title        = "Comparison of Common Crawl News \& {GDELT}",
+  year         = "2024",
+  month        = apr,
+  language     = "English",
+  booktitle    = "2024 {IEEE} International Systems Conference {(SysCon)}",
+  publisher    = "IEEE",
+  pages        = "1--3",
+  URL          = "https://ieeexplore.ieee.org/document/10553540",
+  DOI          = "10.1109/SysCon61195.2024.10553540",
   abstract     = "The corpus of worldwide news is important for natural language processing, knowledge graphs, large
                  language models, and other technical efforts. Additionally, this corpus is important for understanding
                  the people, places, organizations, and events that interact in real-time every day. This paper compares
@@ -152,14 +161,10 @@ @InProceedings{cc:ElOuadi:2024:Comparison-CC-News-GDELT
                  from across the globe, Common Crawl focuses on news sites from around the world gathered through web
                  crawling. Our analysis revealed considerable differences in where the two datasets gather their news
                  sources.",
-  author       = "El Ouadi, Ameir and Beskow, David",
-  year         = "2024",
-  month        = apr,
-  language     = "English",
-  booktitle    = "2024 {IEEE} International Systems Conference {(SysCon)}",
-  publisher    = "IEEE",
-  pages        = "1--3",
-  URL          = "https://doi.org/10.1109/SysCon61195.2024.10553540",
+  keywords     = "Databases, Organizations, Knowledge graphs, Real-time systems, Natural language processing, Task
+                 analysis, Open Source Data, News Data, NLP, LLMs",
+  cc-author-affiliation = "United States Military Academy, West Point, NY, USA",
+  cc-class     = "news-corpus, knowledge-graph",
 }
 
 @Misc{cc:LiuLuoShanVoelkerEtAl:2024:Somesite-I-used-to-crawl,
@@ -311,7 +316,7 @@ @Misc{cc:RiveroGonzalo:2024:When-online-content-disappears
   language     = "en-US",
   urldate      = "2025-01-30",
   journal      = "Pew Research Center",
-  author       = "Rivero, Samuel Bestvater, Emma Remy {and} Gonzalo, Athena Chapekis",
+  author       = "Athena Chapekis and Samuel Bestvater and Emma Remy and Gonzalo Rivero",
   month        = may,
   year         = "2024",
   cc-snippet   = "To conduct this part of our analysis, we collected a random sample of just under 1 million webpages
@@ -368,6 +373,7 @@ @Misc{cc:Amarikwa:2024:Internet-Openness-at-Risk
 @Misc{Blagojevic:2024:CC-news-dataset,
   author       = "Vladimir Blagojevic",
   year         = "2024",
+  title        = "vblagoje/cc\_news",
   URL          = "https://huggingface.co/datasets/vblagoje/cc_news",
   cc-derived-dataset-about = "vblagoje-cc-news",
   cc-author-affiliation = "deepset AI, Germany",
@@ -409,7 +415,7 @@ @Article{cc:KrieschLosacker:2024:Bioeconomy-firms
   author       = "Kriesch, Lukas and Losacker, Sebastian",
   year         = "2024",
   month        = apr,
-  pages        = "55–78",
+  pages        = "55--78",
   cc-author-affiliation = "Justus-Liebig-University Giessen, Germany",
   cc-class     = "economic geography, web-mining, regional science, regional economics",
   cc-snippet   = "The dataset is based on a novel web-mining approach developed by Kriesch (2023). This dataset uses the
diff --git a/bib/cc2025.bib b/bib/cc2025.bib
@@ -586,6 +586,23 @@ @InProceedings{cc:AnsarSperottoHolz:2025:Web-crawl-refusals
   URL          = "https://link.springer.com/chapter/10.1007/978-3-031-85960-1_9",
   cc-author-affiliation = "University of Twente, Enschede, The Netherlands; University of Münster, Germany",
   cc-class     = "web-crawling; web-crawling/server-side-blocking",
+  cc-snippet   = "CC is a non-profit initiative. The crawler is designed [18] to be “polite” and comply with the
+                 Robots Exclusion Protocol, using e.g. five seconds wait between requests to the same host and
+                 employing an exponential back-off strategy on error. It identifies itself with a User-Agent header
+                 that is specific to CC. It does not process content that is dynamically generated on client-side, e.g.
+                 using JavaScript. According to CC’s documentation, the fetching process happens over 14 days and is
+                 handled by 20 AWS EC2 spot nodes in N. Virginia. The documentation does not provide the public IP
+                 addresses. [...] Dataset. We use the CC-MAIN-2023-50 snapshot, which was compiled between 2023-11-28
+                 and 2023-12-12 [7]. We use two parts of this snapshot: the general index file Columnar URL index and
+                 the archive non-200 responses. The non-200 responses archive contains all fetching attempts that did
+                 not receive a status code 200 (“200 OK”) as a reply (see the end of this section for limitations of
+                 this approach). It consists of 3.43 TB of compressed data. We extract timestamps, server IPs, URIs,
+                 Fully Qualified Domain Name (FQDNs), registered domains, HTTP status codes and headers, and page
+                 textual contents. We use pyasn [5] to map IPs to AS numbers. We use the index to obtain records with
+                 status code 200 for FQDNs of interest.¶ Data Pruning. We obtain 561 × 10⁶ records from the non-200
+                 responses, summarized in Table 1. We prune all responses with status codes for redirections (3xx;
+                 these are rescheduled by CC for later crawls) as well as those where the content could not be found
+                 (404). This leaves us with 21.7 × 10⁶ records.",
 }
 
 @Article{cc:MousaHassanRashidAl-Saady:2025:Safeguarding-patient-Data,
@@ -613,7 +630,7 @@ @Article{cc:MousaHassanRashidAl-Saady:2025:Safeguarding-patient-Data
                  Moumal",
   year         = "2025",
   month        = may,
-  pages        = "47–60",
+  pages        = "47--60",
   keywords     = "Phishing URL detection; machine learning; healthcare cybersecurity; multilayer perceptron; decision
                  tree; Naive Bayes; patient data security",
   cc-snippet   = "For validation, we constructed a more extensive and realistic dataset by combining the
@@ -646,8 +663,8 @@ @Misc{cc:EhrmanntrautWunderlePfisterJannidisEtAl:2025:ModernGBERT
   cc-author-affiliation = "Julius-Maximilians-Universität Würzburg, Germany",
   cc-class     = "nlp/language-model, German",
   cc-snippet   = "We pre-trained ModernGBERT on the same data as LLäMmlein decoder models (Pfister et al., 2024), using
-                 the open-source RedPajamaV2 dataset (Weber et al., 2024).⁶ This dataset comprises German
-                 CommonCrawl snapshots from 2014–2023.",
+                 the open-source RedPajamaV2 dataset (Weber et al., 2024).⁶ This dataset comprises German CommonCrawl
+                 snapshots from 2014–2023.",
 }
 
 @Misc{cc:VeselovskyPiccardiAndersonWestEtAl:2025:Web2Wiki,