@@ -142,7 +142,16 @@ @InProceedings{cc:Thompson:2024:Longitudinal-web-analytics
142142}
143143
144144@InProceedings {cc:ElOuadi:2024:Comparison-CC-News-GDELT ,
145+ author = " El Ouadi, Ameir and Beskow, David" ,
145146 title = " Comparison of Common Crawl News \& {GDELT}" ,
147+ year = " 2024" ,
148+ month = apr,
149+ language = " English" ,
150+ booktitle = " 2024 {IEEE} International Systems Conference {(SysCon)}" ,
151+ publisher = " IEEE" ,
152+ pages = " 1--3" ,
153+ URL = " https://ieeexplore.ieee.org/document/10553540" ,
154+ DOI = " 10.1109/SysCon61195.2024.10553540" ,
146155 abstract = " The corpus of worldwide news is important for natural language processing, knowledge graphs, large
147156 language models, and other technical efforts. Additionally, this corpus is important for understanding
148157 the people, places, organizations, and events that interact in real-time every day. This paper compares
@@ -152,14 +161,10 @@ @InProceedings{cc:ElOuadi:2024:Comparison-CC-News-GDELT
152161 from across the globe, Common Crawl focuses on news sites from around the world gathered through web
153162 crawling. Our analysis revealed considerable differences in where the two datasets gather their news
154163 sources." ,
155- author = " El Ouadi, Ameir and Beskow, David" ,
156- year = " 2024" ,
157- month = apr,
158- language = " English" ,
159- booktitle = " 2024 {IEEE} International Systems Conference {(SysCon)}" ,
160- publisher = " IEEE" ,
161- pages = " 1--3" ,
162- URL = " https://doi.org/10.1109/SysCon61195.2024.10553540" ,
164+ keywords = " Databases, Organizations, Knowledge graphs, Real-time systems, Natural language processing, Task
165+ analysis, Open Source Data, News Data, NLP, LLMs" ,
166+ cc-author-affiliation = " United States Military Academy, West Point, NY, USA" ,
167+ cc-class = " news-corpus, knowledge-graph" ,
163168}
164169
165170@Misc {cc:LiuLuoShanVoelkerEtAl:2024:Somesite-I-used-to-crawl ,
@@ -311,7 +316,7 @@ @Misc{cc:RiveroGonzalo:2024:When-online-content-disappears
311316 language = " en-US" ,
312317 urldate = " 2025-01-30" ,
313318 journal = " Pew Research Center" ,
314- author = " Rivero, Samuel Bestvater, Emma Remy { and} Gonzalo, Athena Chapekis " ,
319+ author = " Chapekis, Athena and Bestvater, Samuel and Remy, Emma and Rivero, Gonzalo" ,
315320 month = may,
316321 year = " 2024" ,
317322 cc-snippet = " To conduct this part of our analysis, we collected a random sample of just under 1 million webpages
@@ -368,6 +373,7 @@ @Misc{cc:Amarikwa:2024:Internet-Openness-at-Risk
368373@Misc {Blagojevic:2024:CC-news-dataset ,
369374 author = " Vladimir Blagojevic" ,
370375 year = " 2024" ,
376+ title = " vblagoje/cc\_news" ,
371377 URL = " https://huggingface.co/datasets/vblagoje/cc_news" ,
372378 cc-derived-dataset-about = " vblagoje-cc-news" ,
373379 cc-author-affiliation = " deepset AI, Germany" ,
@@ -409,7 +415,7 @@ @Article{cc:KrieschLosacker:2024:Bioeconomy-firms
409415 author = " Kriesch, Lukas and Losacker, Sebastian" ,
410416 year = " 2024" ,
411417 month = apr,
412- pages = " 55– 78" ,
418+ pages = " 55--78" ,
413419 cc-author-affiliation = " Justus-Liebig-University Giessen, Germany" ,
414420 cc-class = " economic geography, web-mining, regional science, regional economics" ,
415421 cc-snippet = " The dataset is based on a novel web-mining approach developed by Kriesch (2023). This dataset uses the
0 commit comments