Skip to content

Commit 219f352

Browse files
fix: format errors in BibTeX citations
1 parent b90b953 commit 219f352

3 files changed

Lines changed: 39 additions & 16 deletions

File tree

bib/cc2023.bib

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1326,8 +1326,8 @@ @Misc{cc:Tenis:2023:Efficient-URL-phishing-detection
13261326
volume = "47",
13271327
number = "2",
13281328
year = "2023",
1329-
title = "Modelling an Efficient {URL} Phishing Detection Approach Based on a Dense Network Model. Computer
1330-
Systems Science & Engineering . 2023, Vol. 47 Issue 2, p2625-2641. 17p.",
1329+
title = "Modelling an Efficient {URL} Phishing Detection Approach Based on a Dense Network Model",
1330+
pages = "2625--2641",
13311331
author = "Tenis, A. Aldo; Santhosh, R.",
13321332
URL = "https://web.s.ebscohost.com/abstract?direct=true&profile=ehost&scope=site&authtype=crawler&jrnl=02676192&AN=169779920&h=WGjAKpK7ACB1ZcUfp8Ikhm9IcDPjsbjptgyhA5ityW47Z2oYK4JmZTEMhj6t1UhLOFgbraBWyMgS1NID6mz%2bcA%3d%3d&crl=c&resultNs=AdminWebAuth&resultLocal=ErrCrlNotAuth&crlhashurl=login.aspx%3fdirect%3dtrue%26profile%3dehost%26scope%3dsite%26authtype%3dcrawler%26jrnl%3d02676192%26AN%3d169779920",
13331333
abstract = "The social engineering cyber-attack is where culprits mislead the users by getting the login details
@@ -1386,7 +1386,7 @@ @InProceedings{cc:TirumalaSimigAghajanyanMorcos:2023:Improving-LLM-Pretraining
13861386
}
13871387

13881388
@Article{cc:WangKimMittalRexford:2023:RAVEN-Stateless-Rapid-IP-Address-Variation,
1389-
title = "{RAVEN}: Stateless Rapid {IP} Address Variation for Enterprise Networks.",
1389+
title = "{RAVEN}: Stateless Rapid {IP} Address Variation for Enterprise Networks",
13901390
author = "Wang, Liang and Kim, Hyojoon and Mittal, Prateek and Rexford, Jennifer",
13911391
journal = "Proc. Priv. Enhancing Technol.",
13921392
volume = "2023",

bib/cc2024.bib

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,16 @@ @InProceedings{cc:Thompson:2024:Longitudinal-web-analytics
142142
}
143143

144144
@InProceedings{cc:ElOuadi:2024:Comparison-CC-News-GDELT,
145+
author = "El Ouadi, Ameir and Beskow, David",
145146
title = "Comparison of Common Crawl News \& {GDELT}",
147+
year = "2024",
148+
month = apr,
149+
language = "English",
150+
booktitle = "2024 {IEEE} International Systems Conference {(SysCon)}",
151+
publisher = "IEEE",
152+
pages = "1--3",
153+
URL = "https://ieeexplore.ieee.org/document/10553540",
154+
DOI = "10.1109/SysCon61195.2024.10553540",
146155
abstract = "The corpus of worldwide news is important for natural language processing, knowledge graphs, large
147156
language models, and other technical efforts. Additionally, this corpus is important for understanding
148157
the people, places, organizations, and events that interact in real-time every day. This paper compares
@@ -152,14 +161,10 @@ @InProceedings{cc:ElOuadi:2024:Comparison-CC-News-GDELT
152161
from across the globe, Common Crawl focuses on news sites from around the world gathered through web
153162
crawling. Our analysis revealed considerable differences in where the two datasets gather their news
154163
sources.",
155-
author = "El Ouadi, Ameir and Beskow, David",
156-
year = "2024",
157-
month = apr,
158-
language = "English",
159-
booktitle = "2024 {IEEE} International Systems Conference {(SysCon)}",
160-
publisher = "IEEE",
161-
pages = "1--3",
162-
URL = "https://doi.org/10.1109/SysCon61195.2024.10553540",
164+
keywords = "Databases, Organizations, Knowledge graphs, Real-time systems, Natural language processing, Task
165+
analysis, Open Source Data, News Data, NLP, LLMs",
166+
cc-author-affiliation = "United States Military Academy, West Point, NY, USA",
167+
cc-class = "news-corpus, knowledge-graph",
163168
}
164169

165170
@Misc{cc:LiuLuoShanVoelkerEtAl:2024:Somesite-I-used-to-crawl,
@@ -311,7 +316,7 @@ @Misc{cc:RiveroGonzalo:2024:When-online-content-disappears
311316
language = "en-US",
312317
urldate = "2025-01-30",
313318
journal = "Pew Research Center",
314-
author = "Rivero, Samuel Bestvater, Emma Remy {and} Gonzalo, Athena Chapekis",
319+
author = "Athena Chapekis and Samuel Bestvater and Emma Remy and Gonzalo Rivero",
315320
month = may,
316321
year = "2024",
317322
cc-snippet = "To conduct this part of our analysis, we collected a random sample of just under 1 million webpages
@@ -368,6 +373,7 @@ @Misc{cc:Amarikwa:2024:Internet-Openness-at-Risk
368373
@Misc{Blagojevic:2024:CC-news-dataset,
369374
author = "Vladimir Blagojevic",
370375
year = "2024",
376+
title = "vblagoje/cc_news",
371377
URL = "https://huggingface.co/datasets/vblagoje/cc_news",
372378
cc-derived-dataset-about = "vblagoje-cc-news",
373379
cc-author-affiliation = "deepset AI, Germany",
@@ -409,7 +415,7 @@ @Article{cc:KrieschLosacker:2024:Bioeconomy-firms
409415
author = "Kriesch, Lukas and Losacker, Sebastian",
410416
year = "2024",
411417
month = apr,
412-
pages = "5578",
418+
pages = "55--78",
413419
cc-author-affiliation = "Justus-Liebig-University Giessen, Germany",
414420
cc-class = "economic geography, web-mining, regional science, regional economics",
415421
cc-snippet = "The dataset is based on a novel web-mining approach developed by Kriesch (2023). This dataset uses the

bib/cc2025.bib

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,23 @@ @InProceedings{cc:AnsarSperottoHolz:2025:Web-crawl-refusals
586586
URL = "https://link.springer.com/chapter/10.1007/978-3-031-85960-1_9",
587587
cc-author-affiliation = "University of Twente, Enschede, The Netherlands; University of Münster, Germany",
588588
cc-class = "web-crawling; web-crawling/server-side-blocking",
589+
cc-snippet = "CC is a non-profit initiative. The crawler is designed [18] to be “polite” and comply with the
590+
Robots Exclusion Protocol, using e.g. five seconds wait between requests to the same host and
591+
employing an exponential back-off strategy on error. It identifies itself with a User-Agent header
592+
that is specific to CC. It does not process content that is dynamically generated on client-side, e.g.
593+
using JavaScript. According to CC’s documentation, the fetching process happens over 14 days and is
594+
handled by 20 AWS EC2 spot nodes in N. Virginia. The documentation does not provide the public IP
595+
addresses. [...] Dataset. We use the CC-MAIN-2023-50 snapshot, which was compiled between 2023-11-28
596+
and 2023-12-12 [7]. We use two parts of this snapshot: the general index file Columnar URL index and
597+
the archive non-200 responses. The non-200 responses archive contains all fetching attempts that did
598+
not receive a status code 200 (“200 OK”) as a reply (see the end of this section for limitations of
599+
this approach). It consists of 3.43 TB of compressed data. We extract timestamps, server IPs, URIs,
600+
Fully Qualified Domain Name (FQDNs), registered domains, HTTP status codes and headers, and page
601+
textual contents. We use pyasn [5] to map IPs to AS numbers. We use the index to obtain records with
602+
status code 200 for FQDNs of interest.¶ Data Pruning. We obtain 561 × 10⁶ records from the non-200
603+
responses, summarized in Table 1. We prune all responses with status codes for redirections (3xx;
604+
these are rescheduled by CC for later crawls) as well as those where the content could not be found
605+
(404). This leaves us with 21.7 × 10⁶ records.",
589606
}
590607

591608
@Article{cc:MousaHassanRashidAl-Saady:2025:Safeguarding-patient-Data,
@@ -613,7 +630,7 @@ @Article{cc:MousaHassanRashidAl-Saady:2025:Safeguarding-patient-Data
613630
Moumal",
614631
year = "2025",
615632
month = may,
616-
pages = "4760",
633+
pages = "47--60",
617634
keywords = "Phishing URL detection; machine learning; healthcare cybersecurity; multilayer perceptron; decision
618635
tree; Naive Bayes; patient data security",
619636
cc-snippet = "For validation, we constructed a more extensive and realistic dataset by combining the
@@ -646,8 +663,8 @@ @Misc{cc:EhrmanntrautWunderlePfisterJannidisEtAl:2025:ModernGBERT
646663
cc-author-affiliation = "Julius-Maximilians-Universität Würzburg, Germany",
647664
cc-class = "nlp/language-model, German",
648665
cc-snippet = "We pre-trained ModernGBERT on the same data as LLäMmlein decoder models (Pfister et al., 2024), using
649-
the open-source RedPajamaV2 dataset (Weber et al., 2024).⁶ This dataset comprises German
650-
CommonCrawl snapshots from 2014–2023.",
666+
the open-source RedPajamaV2 dataset (Weber et al., 2024).⁶ This dataset comprises German CommonCrawl
667+
snapshots from 2014–2023.",
651668
}
652669

653670
@Misc{cc:VeselovskyPiccardiAndersonWestEtAl:2025:Web2Wiki,

0 commit comments

Comments
 (0)