@@ -54,7 +54,7 @@ @Misc{cc:GoldsteinSastryMusserDiRestaEtAl:2023:generative-language-models-thread
5454 or use other tools to extract data from websites.]" ,
5555}
5656
57- @PhdThesis {cc:Wang:2023:Large-Web-Archive-Collection ,
57+ @PhdThesis {cc:Wang:2023:large-web-archive-collection ,
5858 title = " Large Web Archive Collection Infrastructure and Services" ,
5959 author = " Wang, Xinyue" ,
6060 year = " 2023" ,
@@ -91,7 +91,7 @@ @PhdThesis{cc:Wang:2023:Large-Web-Archive-Collection
9191 the Timestamp column in INT64 Timestamp type; 5) Avro, [...]" ,
9292}
9393
94- @Article {cc:Terzis:2023:Building-Programmable-Commons ,
94+ @Article {cc:Terzis:2023:programmable-commons ,
9595 title = " Building Programmable Commons" ,
9696 author = " Terzis, Petros" ,
9797 year = " 2023" ,
@@ -107,3 +107,85 @@ @Article{cc:Terzis:2023:Building-Programmable-Commons
107107 presents a blend of bottom-up and top-down initiatives for their commons-based organisation and
108108 governance." ,
109109}
110+
111+ @Misc {cc:HanleyKumarDurumeric:2023:conspiracy-theories ,
112+ doi = " 10.48550/ARXIV.2301.10880" ,
113+ URL = " https://arxiv.org/abs/2301.10880" ,
114+ author = " Hanley, Hans W. A. and Kumar, Deepak and Durumeric, Zakir" ,
115+ title = " A Golden Age: Conspiracy Theories' Relationship with Misinformation Outlets, News Media, and the Wider
116+ Internet" ,
117+ publisher = " arXiv" ,
118+ year = " 2023" ,
119+ abstract = " Do we live in a {"}Golden Age of Conspiracy Theories?{"} In the last few decades, conspiracy theories
120+ have proliferated on the Internet with some having dangerous real-world consequences. A large
121+ contingent of those who participated in the January 6th attack on the US Capitol believed fervently in
122+ the QAnon conspiracy theory. In this work, we study the relationships amongst five prominent conspiracy
123+ theories (QAnon, COVID, UFO/Aliens, 9-11, and Flat-Earth) and each of their respective relationships to
124+ the news media, both mainstream and fringe. Identifying and publishing a set of 755 different
125+ conspiracy theory websites dedicated to our five conspiracy theories, we find that each set often
126+ hyperlinks to the same external domains, with COVID and QAnon conspiracy theory websites largest amount
127+ of shared connections. Examining the role of news media, we further find that not only do outlets known
128+ for spreading misinformation hyperlink to our set of conspiracy theory websites more often than
129+ mainstream websites but this hyperlinking has increased dramatically between 2018 and 2021, with the
130+ advent of QAnon and the start of COVID-19 pandemic. Using partial Granger-causality, we uncover several
131+ positive correlative relationships between the hyperlinks from misinformation websites and the
132+ popularity of conspiracy theory websites, suggesting the prominent role that misinformation news
133+ outlets play in popularizing many conspiracy theories." ,
134+ cc-snippet = " Using our own web scrapes and pages historically scraped by Common Crawl,¹
135+ [¹https://commoncrawl.org/] we then document the state and the changing behaviors of the conspiracy
136+ theory ecosystem and their relationship to a separate set of 530 known misinformation outlets, 565
137+ authentic news websites, and 528 non-news websites. [...] Utilizing the Common Crawl harmonic and
138+ PageRank centrality measures that measure a website’s centrality across all of the crawled Internet,
139+ we then find many of the websites in our dataset have relatively high network centrality, suggesting
140+ that many of them are not peripheral on the Internet but actually near the Internet’s core/are
141+ mainstream. Indeed examining, the hyperlink connections between news media and these conspiracy
142+ theories, we find that many of them rely heavily on mainstream as well as misinformation outlets
143+ (compared to non-news websites) for their information, with many popular misinformation outlets also
144+ hyperlinking back to many of these conspiracy theory websites. [...] 4.1 Common Crawl Page Retrieval
145+ and Website Crawling To gather the set of hyperlinks between our websites, we utilize Common Crawl data
146+ [92]—widely considered the most complete publicly available source of web crawl data—and our own
147+ website crawls. For each website in our dataset, we collect all the domain’s HTML pages that were
148+ indexed by Common Crawl before August 2021. In addition to Common Crawl data, we further utilize our
149+ own website scrapes. We utilize our own crawls, in addition to Common Crawl, due to noisiness, missing
150+ pages, and missing domains within the Common Crawl dataset [85]. For example, 309 particularly small
151+ conspiracy theory domains were not contained within the Common Crawl dataset (i.e. these websites often
152+ only contained a few dozen pages). Thus for each website in our dataset, we further gather all the HTML
153+ pages 10 hops from each website’s homepage (i.e., we collect all URLs linked from the homepage (1st
154+ hop), then all URLs linked from the pages that were linked by the homepage (2nd hop), and so forth).
155+ For each HTML page from our scrapes and Common Crawl, we parse the HTML, detect the date that page was
156+ published, and collect hyperlinks to other pages (i.e., HTML <a> tags). Altogether we gather the
157+ available Common Crawl pages and scrape the HTML for our 755 conspiracy theory, 530 misinformation, 565
158+ authentic news, and 528 non-news websites. [...] Utilizing Common Crawl network data [61] over the
159+ indexed Internet (87.7 million websites), we thus determine the network centrality of our set of
160+ conspiracy-focused websites to understand if each conspiracy theory website category is “core”
161+ (regularly utilized on the Internet) or “peripheral”. We utilize centralities across Common
162+ Crawl’s dataset rather than our partial one in order to get a sense of each conspiracy theory’s
163+ centrality on the entire Internet. While only 446 of our conspiracy theory websites are within the
164+ Common Crawl dataset, this analysis allows us to fully understand the relative roles that each
165+ conspiracy theory website group in our dataset plays on the wider Internet." ,
166+ cc-author-affiliation = " Stanford University, USA" ,
167+ cc-class = " nlp/fake-news-detection, misinformation, disinformation, conspiracy theories, hyperlink-graph" ,
168+ }
169+
170+ @Misc {cc:PeetersDerBizer:2023:WDC-products ,
171+ doi = " 10.48550/ARXIV.2301.09521" ,
172+ URL = " https://arxiv.org/abs/2301.09521" ,
173+ author = " Peeters, Ralph and Der, Reng Chiz and Bizer, Christian" ,
174+ title = " {WDC} Products: {A} Multi-Dimensional Entity Matching Benchmark" ,
175+ publisher = " arXiv" ,
176+ year = " 2023" ,
177+ cc-snippet = " The first step of the pipeline is the extraction of large amounts of product offers from the Common
178+ Crawl⁴ [⁴https://commoncrawl.org/] using schema.org annotations. Some product offers contain
179+ product identifiers like MPNs and GTINs which allow us to group offers into [...] The Web Data Commons⁶
180+ project regularly extracts schema.org annotations from the Common Crawl, the largest web corpus
181+ available to the public, in order to monitor the adoption of semantic annotations on the Web and to
182+ provide the extracted data for public download. The WDC Products benchmark uses product offers from the
183+ WDC Product Data Corpus V2020 (PDC2020)⁷. The corpus was created by extracting schema.org product data
184+ from the September 2020 version of the Common Crawl. The extracted data goes through a pipeline of
185+ cleansing steps such as removing offers from listing pages as well as advertisements that are contained
186+ in a page in addition to the main offer [31]. The resulting PDC2020 corpus consists of ∼98 million
187+ product offers originating from 603,000 websites." ,
188+ cc-dataset-used = " CC-MAIN-2020-40" ,
189+ cc-author-affiliation = " University of Mannheim, Germany" ,
190+ cc-class = " semantic-web, semantic-web/microformats, e-commerce, linked data, schema.org annotations" ,
191+ }
0 commit comments