Skip to content

Commit e5323c0

Browse files
- add citations 2023
1 parent d26b17e commit e5323c0

2 files changed

Lines changed: 106 additions & 3 deletions

File tree

bib/cc2023.bib

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ @Misc{cc:GoldsteinSastryMusserDiRestaEtAl:2023:generative-language-models-thread
5454
or use other tools to extract data from websites.]",
5555
}
5656

57-
@PhdThesis{cc:Wang:2023:Large-Web-Archive-Collection,
57+
@PhdThesis{cc:Wang:2023:large-web-archive-collection,
5858
title = "Large Web Archive Collection Infrastructure and Services",
5959
author = "Wang, Xinyue",
6060
year = "2023",
@@ -91,7 +91,7 @@ @PhdThesis{cc:Wang:2023:Large-Web-Archive-Collection
9191
the Timestamp column in INT64 Timestamp type; 5) Avro, [...]",
9292
}
9393

94-
@Article{cc:Terzis:2023:Building-Programmable-Commons,
94+
@Article{cc:Terzis:2023:programmable-commons,
9595
title = "Building Programmable Commons",
9696
author = "Terzis, Petros",
9797
year = "2023",
@@ -107,3 +107,85 @@ @Article{cc:Terzis:2023:Building-Programmable-Commons
107107
presents a blend of bottom-up and top-down initiatives for their commons-based organisation and
108108
governance.",
109109
}
110+
111+
@Misc{cc:HanleyKumarDurumeric:2023:conspiracy-theories,
  doi = "10.48550/ARXIV.2301.10880",
  URL = "https://arxiv.org/abs/2301.10880",
  eprint = "2301.10880",
  eprinttype = "arXiv",
  author = "Hanley, Hans W. A. and Kumar, Deepak and Durumeric, Zakir",
  title = "A Golden Age: Conspiracy Theories' Relationship with Misinformation Outlets, News Media, and the Wider
    Internet",
  publisher = "arXiv",
  year = "2023",
  abstract = "Do we live in a {"}Golden Age of Conspiracy Theories?{"} In the last few decades, conspiracy theories
    have proliferated on the Internet with some having dangerous real-world consequences. A large
    contingent of those who participated in the January 6th attack on the US Capitol believed fervently in
    the QAnon conspiracy theory. In this work, we study the relationships amongst five prominent conspiracy
    theories (QAnon, COVID, UFO/Aliens, 9-11, and Flat-Earth) and each of their respective relationships to
    the news media, both mainstream and fringe. Identifying and publishing a set of 755 different
    conspiracy theory websites dedicated to our five conspiracy theories, we find that each set often
    hyperlinks to the same external domains, with COVID and QAnon conspiracy theory websites largest amount
    of shared connections. Examining the role of news media, we further find that not only do outlets known
    for spreading misinformation hyperlink to our set of conspiracy theory websites more often than
    mainstream websites but this hyperlinking has increased dramatically between 2018 and 2021, with the
    advent of QAnon and the start of COVID-19 pandemic. Using partial Granger-causality, we uncover several
    positive correlative relationships between the hyperlinks from misinformation websites and the
    popularity of conspiracy theory websites, suggesting the prominent role that misinformation news
    outlets play in popularizing many conspiracy theories.",
  cc-snippet = "Using our own web scrapes and pages historically scraped by Common Crawl,¹
    [¹https://commoncrawl.org/] we then document the state and the changing behaviors of the conspiracy
    theory ecosystem and their relationship to a separate set of 530 known misinformation outlets, 565
    authentic news websites, and 528 non-news websites. [...] Utilizing the Common Crawl harmonic and
    PageRank centrality measures that measure a website’s centrality across all of the crawled Internet,
    we then find many of the websites in our dataset have relatively high network centrality, suggesting
    that many of them are not peripheral on the Internet but actually near the Internet’s core/are
    mainstream. Indeed examining, the hyperlink connections between news media and these conspiracy
    theories, we find that many of them rely heavily on mainstream as well as misinformation outlets
    (compared to non-news websites) for their information, with many popular misinformation outlets also
    hyperlinking back to many of these conspiracy theory websites. [...] 4.1 Common Crawl Page Retrieval
    and Website Crawling To gather the set of hyperlinks between our websites, we utilize Common Crawl data
    [92]—widely considered the most complete publicly available source of web crawl data—and our own
    website crawls. For each website in our dataset, we collect all the domain’s HTML pages that were
    indexed by Common Crawl before August 2021. In addition to Common Crawl data, we further utilize our
    own website scrapes. We utilize our own crawls, in addition to Common Crawl, due to noisiness, missing
    pages, and missing domains within the Common Crawl dataset [85]. For example, 309 particularly small
    conspiracy theory domains were not contained within the Common Crawl dataset (i.e. these websites often
    only contained a few dozen pages). Thus for each website in our dataset, we further gather all the HTML
    pages 10 hops from each website’s homepage (i.e., we collect all URLs linked from the homepage (1st
    hop), then all URLs linked from the pages that were linked by the homepage (2nd hop), and so forth).
    For each HTML page from our scrapes and Common Crawl, we parse the HTML, detect the date that page was
    published, and collect hyperlinks to other pages (i.e., HTML <a> tags). Altogether we gather the
    available Common Crawl pages and scrape the HTML for our 755 conspiracy theory, 530 misinformation, 565
    authentic news, and 528 non-news websites. [...] Utilizing Common Crawl network data [61] over the
    indexed Internet (87.7 million websites), we thus determine the network centrality of our set of
    conspiracy-focused websites to understand if each conspiracy theory website category is “core”
    (regularly utilized on the Internet) or “peripheral”. We utilize centralities across Common
    Crawl’s dataset rather than our partial one in order to get a sense of each conspiracy theory’s
    centrality on the entire Internet. While only 446 of our conspiracy theory websites are within the
    Common Crawl dataset, this analysis allows us to fully understand the relative roles that each
    conspiracy theory website group in our dataset plays on the wider Internet.",
  cc-author-affiliation = "Stanford University, USA",
  cc-class = "nlp/fake-news-detection, misinformation, disinformation, conspiracy theories, hyperlink-graph",
}
169+
170+
@Misc{cc:PeetersDerBizer:2023:WDC-products,
  doi = "10.48550/ARXIV.2301.09521",
  URL = "https://arxiv.org/abs/2301.09521",
  eprint = "2301.09521",
  eprinttype = "arXiv",
  author = "Peeters, Ralph and Der, Reng Chiz and Bizer, Christian",
  title = "{WDC} Products: {A} Multi-Dimensional Entity Matching Benchmark",
  publisher = "arXiv",
  year = "2023",
  cc-snippet = "The first step of the pipeline is the extraction of large amounts of product offers from the Common
    Crawl⁴ [⁴https://commoncrawl.org/] using schema.org annotations. Some product offers contain
    product identifiers like MPNs and GTINs which allow us to group offers into [...] The Web Data Commons⁶
    project regularly extracts schema.org annotations from the Common Crawl, the largest web corpus
    available to the public, in order to monitor the adoption of semantic annotations on the Web and to
    provide the extracted data for public download. The WDC Products benchmark uses product offers from the
    WDC Product Data Corpus V2020 (PDC2020)⁷. The corpus was created by extracting schema.org product data
    from the September 2020 version of the Common Crawl. The extracted data goes through a pipeline of
    cleansing steps such as removing offers from listing pages as well as advertisements that are contained
    in a page in addition to the main offer [31]. The resulting PDC2020 corpus consists of ∼98 million
    product offers originating from 603,000 websites.",
  cc-dataset-used = "CC-MAIN-2020-40",
  cc-author-affiliation = "University of Mannheim, Germany",
  cc-class = "semantic-web, semantic-web/microformats, e-commerce, linked data, schema.org annotations",
}

0 commit comments

Comments
 (0)