diff --git a/Makefile b/Makefile index f51bb92..608edfa 100644 --- a/Makefile +++ b/Makefile @@ -62,7 +62,7 @@ clean: # Google Scholar Alerts gscholar_alerts/extracted_citations.jsonl: gscholar_alerts/eml/ - python3 gscholar_alerts/parse_scholar_alert_eml.py $< | LC_ALL=C sort >$@ + python3 gscholar_alerts/parse_scholar_alert_eml.py $< gscholar_alerts/citations.jsonl | LC_ALL=C sort >$@ gscholar_alerts/citations.jsonl: gscholar_alerts/extracted_citations.jsonl jq -c 'select(.title != null and .authors != null) | del(.idx, .date, .data, .ref, .link)' $< >$@ diff --git a/bib/cc2017.bib b/bib/cc2017.bib index f7b2448..e7be53c 100644 --- a/bib/cc2017.bib +++ b/bib/cc2017.bib @@ -1,7 +1,7 @@ @Article{Schaefer:2017:boilerplate-detection, author = "Schäfer, Roland", title = "Accurate and Efficient General-purpose Boilerplate Detection for Crawled Web Corpora", - journal = "Lang. Resour. Eval.", + journal = "Language Resources and Evaluation", issue_date = "September 2017", volume = "51", number = "3", diff --git a/bib/cc2025.bib b/bib/cc2025.bib index e30aec8..25a76cd 100644 --- a/bib/cc2025.bib +++ b/bib/cc2025.bib @@ -1047,3 +1047,32 @@ @InProceedings{cc:Ligeti-NagyHéjaBánfiFöldesiEtAl:2025:Expanding-Hungarian-Gi cc-author-affiliation = "ELTE Research Centre for Linguistics, Budapest, Hungary", cc-class = "nlp/corpus-construction, nlp/text-corpora, language-specific corpus", } + +@Misc{cc:Mahara:2025:Cybersecurity-data-extraction, + title = "Cybersecurity Data Extraction from Common Crawl", + author = "Ashim Mahara", + year = "2025", + eprint = "2602.22218", + archiveprefix = "arXiv", + primaryclass = "cs.CR", + URL = "https://arxiv.org/abs/2602.22218", + abstract = "Alpha-Root is a cybersecurity-focused dataset collected in a single shot from the Common Crawl web + graph using community detection. 
Unlike iterative content-scoring approaches like DeepSeekMath, we mine + quality domains directly from the web graph, starting from just 20 trusted seed domains.", + cc-author-affiliation = "Rochester Institute of Technology, USA", + cc-class = "computer-security/internet-security, web-science/hyperlinkgraph", + cc-dataset-used = "hyperlinkgraph", + cc-snippet = "Thus, we introduce Alpha-Root as a new large text corpus from Common Crawl⁵ [⁵commoncrawl.org] — + intended to be used as a pre-training dataset — by extracting cybersecurity-focused domains from the + webgraph provided by Common Crawl. We utilize the Leiden [42] algorithm for community detection to mine + the web graph for domains that represent the cybersecurity community in the webgraph. We then extract + texts from webpages of the members of the community that are also present in the FineWeb-Edu [29] + dataset. [...] CommonCrawl release web graphs for each of their crawls. The cc-graph is a graph of + nodes (domains) and edges between the nodes when the nodes are hyperlinked through a webpage. + Essentially, there exists an edge between two domains if there is at-least one hyperlink between + webpages of the domains. The cc-graph includes more than 100 million nodes and more than 1.8 billion + edges. 
The processing of such a large graph presents several challenges on its own since commoncrawl + only provides the data in an edge-list format while a lot of existing software libraries require an + adjacency list to work on the graph.", +} + diff --git a/bib/cc2026.bib b/bib/cc2026.bib new file mode 100644 index 0000000..1bd040d --- /dev/null +++ b/bib/cc2026.bib @@ -0,0 +1,159 @@ +@Article{cc:PeiskerHoffmannMuttarak:2026:Climate-news, + title = "Climate news mediates extreme weather effects on climate change concern", + journal = "Climate Risk Management", + volume = "52", + pages = "100806", + year = "2026", + ISSN = "2212-0963", + DOI = "10.1016/j.crm.2026.100806", + URL = "https://www.sciencedirect.com/science/article/pii/S2212096326000197", + author = "Jonas Peisker and Roman Hoffmann and Raya Muttarak", + keywords = "Climate change concern, Extreme weather, News media, Issue attention, Mediation", + abstract = "As the severe impacts of climate change become increasingly apparent, concerns about climate-related + issues have grown in recent years. The news media plays an important role in disseminating information + about climate change and its consequences to the wider public and thus can influence public climate + concern. Here, we investigate how extreme weather affects issue attention to climate change in the + European online news media and how extreme weather and news coverage jointly shape changes in climate + change concern. For the analysis, we combine 12 harmonized Eurobarometer survey waves, measuring public + concerns about climate issues, with meteorological data and indices of environmental news coverage + based on publications from 2481 media outlets in 200 regions of 22 European countries. Using fixed + effects panel models, we estimate effects of temperature anomalies on climate news and climate concern + and explore the role of the news media in explaining changes in concerns in response to temperature + anomalies. 
The results indicate that unusually high temperatures exhibit a robust positive effect on + media attention, especially when they overlap with other events that draw attention to the climate + topic, such as major climate change conferences. We furthermore find evidence that the climate news in + national outlets increases public concern about climate change and show that reporting by such outlets + is likely to partly explain the effects of temperature anomalies on concerns. We do not find any + significant effects of climate reporting in regional news outlets on climate concern. Our results + suggest that the national news media partly mediates the effects of extreme weather on public climate + change concern. The findings also highlight that focusing events strongly influence issue attention of + the media, providing windows of opportunity to raise awareness about climate issues, while pointing to + challenges in sustaining attention to related topics beyond short-lived news cycles.", + cc-author-affiliation = "International Institute for Applied Systems Analysis, Population and Just Societies, + Laxenburg, Austria; University of Bologna, Italy", + cc-class = "climate change, climate risk management, news, websiteranking, domain-ranking, hyperlinkgraph", + cc-derived-dataset-used = "OpenPageRank", + cc-snippet = "The weights are provided by the Open Page Rank that is based on Common Crawl project, an open source + database of web crawl data (DomCop, 2022). The Open Page Rank is scaled to a range from 0 to 10. Fig. 
1 + shows the distribution of page ranks over the included articles.", +} + +@Article{cc:JumeletWeissweilerNivreBisazza:2026:MultiBLiMP-1.0, + author = "Jumelet, Jaap and Weissweiler, Leonie and Nivre, Joakim and Bisazza, Arianna", + title = "{MultiBLiMP 1.0}: A Massively Multilingual Benchmark of Linguistic Minimal Pairs", + journal = "Transactions of the Association for Computational Linguistics", + volume = "14", + pages = "193--216", + year = "2026", + month = "01", + abstract = "We introduce MultiBLiMP 1.0, a massively multilingual benchmark of linguistic minimal pairs, covering + 101 languages and 2 types of subject-verb agreement, containing more than 128,000 minimal pairs. Our + minimal pairs are created using a fully automated pipeline, leveraging the large-scale linguistic + resources of Universal Dependencies and UniMorph. MultiBLiMP 1.0 evaluates abilities of LLMs at an + unprecedented multilingual scale, and highlights the shortcomings of the current state-of-the-art in + modelling low-resource languages.", + ISSN = "2307-387X", + DOI = "10.1162/TACL.a.600", + URL = "https://doi.org/10.1162/TACL.a.600", + eprint = "https://direct.mit.edu/tacl/article-pdf/doi/10.1162/TACL.a.600/2577913/tacl.a.600.pdf", + cc-author-affiliation = "University of Groningen, The Netherlands; Uppsala University, Sweden", + cc-class = "linguistic minimal pairs, language frequency, nlp/corpus-construction, nlp/multi-lingual-corpus", + cc-derived-dataset-used = "GlotCC", + cc-snippet = "Since the training corpora of most LLMs are not publicly available, we estimate this distribution + based on the language frequencies of Kargaran et al. (2024), which were computed on a 3.9T token split + of the Common Crawl corpus. Common Crawl provides a good reflection of the language distribution of the + web-scraped data that is at the core of many LLM training corpora. [...] 
We also report results for + language subgroups split based on the Common Crawl language frequencies: [...]", +} + +@Article{cc:HanleyLuPan:2025:Across-the-firewall, + author = "Hans W. A. Hanley and Yingdan Lu and Jennifer Pan", + title = "Across the firewall: Foreign media’s role in shaping Chinese social media narratives on the + Russo-Ukrainian War", + journal = "Proceedings of the National Academy of Sciences", + volume = "122", + number = "1", + pages = "e2420607122", + year = "2025", + DOI = "10.1073/pnas.2420607122", + URL = "https://www.pnas.org/doi/abs/10.1073/pnas.2420607122", + eprint = "https://www.pnas.org/doi/pdf/10.1073/pnas.2420607122", + abstract = "There is a widespread perception that China’s digital censorship distances its people from the + global internet, and the Chinese Communist Party, through state-controlled media, is the main + gatekeeper of information about foreign affairs. Our analysis of narratives about the Russo-Ukrainian + War circulating on the Chinese social media platform Weibo challenges this view. Comparing narratives + on Weibo with 8.26 million unique news articles from 2,500 of some of the most trafficked websites in + China, Russia, Ukraine, and the United States (totaling 10,000 sites), we find that Russian news + websites published more articles matching narratives found on Weibo than news websites from China, + Ukraine, or the United States. Similarly, a plurality of Weibo narratives were most associated with + narratives found on Russian news websites while less than ten percent were most associated with + narratives from Chinese news sites. Narratives later appearing on Weibo were more likely to first + appear on Russian rather than Chinese, Ukrainian, or US news websites, and Russian websites were highly + influential for narratives appearing on Weibo. 
Altogether, these results show that Chinese state media + was not the main gatekeeper of information about Russia’s invasion of Ukraine for Weibo users.", + cc-author-affiliation = "Stanford University, Stanford, CA, USA; Northwestern University, Evanston, IL, USA", + cc-class = "political science, news, news narratives, hyperlinkgraph, domain-ranks", + cc-dataset-used = "hyperlinkgraph, CC-MAIN-2023-06, CC-MAIN-2022-49, CC-MAIN-2022-40, CC-MAIN-2022-33, CC-MAIN-2022-27, + CC-MAIN-2022-21, CC-MAIN-2022-05", + cc-snippet = "News websites were identified using Amazon Alexa, Common Crawl, and Cloudflare data (SI Appendix, + section S2). [...] Namely, we collected the set of most popular websites ranked in Amazon Alexa’s top + one million websites and Common Crawl’s Domain Rank datasets from April 2022, which utilize the + top-level domain of each country we were interested in (i.e., .cn, .ua, and .ru).ˢ² [ˢ² We utilized + both Common Crawl and Amazon Alexa due to the paucity of Chinese, Ukrainian, and Russian domains in the + US-dominated Amazon Alexa list.] [...] From each news website in our dataset, we collected news + articles published between January 1, 2022, and June 1, 2022. To gather this data, we took two main + approaches: (1) gathering available web crawls from Common Crawl [22], and (2) extensively crawling + each website retrospectively between November 2022 and March 2023. Common Crawl is widely considered + the most complete public source of web crawl data. For each website, we downloaded Common Crawl indexed + pagesˢ³ from between January 1, 2022, and January 1, 2023 (CC-MAIN-2023-06, CC-MAIN-2022-49, + CC-MAIN-2022-40 CC-MAIN-2022-33, CC-MAIN-2022-27, CC-MAIN-2022-21, CC-MAIN-2022-05), identified the + publication date using the Python htmldate library, and included HTML pages published on their websites + between January 1, 2022, and June 1, 2022. 
To further expand the Common Crawl dataset, we performed a + breadth-first crawl (15 hops from the homepage) of each website to gather the set of HTML pages that + are missing from Common Crawl.", +} + +@TechReport{cc:Di-PaoloLiberatiRubeo:2026:GreenWashing-climate-information-and-banking-policies, + year = "2026", + title = "{(Green)Washing} the Trust: Climate Information and Banking Policies", + author = "Di Paolo, Simone and Liberati, Danilo and Rubeo, Lorenzo", + URL = "https://www.bancaditalia.it/pubblicazioni/temi-discussione/2026/2026-1514/en_tema_1514.pdf", + journal = "Temi di discussione (Working Papers)", + number = "1514", + abstract = "Greenwashing, that is, the deceptive self-portrayal of companies as sustainable and environmentally + friendly, is an increasingly relevant issue in finance. Identifying greenwashers is not a trivial task, + given the difficulty of assessing firms’ true environmental profiles, especially when relying on + traditional data sources that generally overlook communication strategies and mass perceptions. Using + granular credit data from the euro area banking system, we show that during the period 2019-2023, + greenwashers, initially identified by combining information on firms’ carbon emissions with an + assessment of the reliability of their reporting, were able to borrow at lower interest rates than + other companies. We then assess companies’ environmental profiles by extracting textual information + from newspapers and the internet. We find that sentiment scores based on firms’ own websites are + generally higher than those derived from newspapers, suggesting that companies use their communication + channels to place greater emphasis on their sustainable image than is reflected in external sources. By + integrating this textual metric with our initial proxy, we construct an alternative definition of + greenwashing. 
Based on a sample of Italian firms, results obtained from this combined proxy are + consistent with those derived from structured data alone. Finally, by introducing an unexpected + contractionary monetary policy shock into our framework, we confirm the operation of the credit risk + channel of monetary policy and find evidence of a reduction in the pricing benefits previously enjoyed + by greenwashers.", + cc-author-affiliation = "Banca d'Italia, Italy", + cc-class = "climate change, company websites, banking, policies", + cc-dataset-used = "CC-MAIN-2024-10", + cc-snippet = "We then developed a second Python script using the BeautifulSoup library to crawl these websites and + download their complete HTML content, following internal links up to three levels deep within the same + domain. When this automated approach failed (mostly for technical reasons, e.g. in the case of + single-page applications), we turned to Common Crawl, an open-access repository that regularly archives + vast portions of the internet. Common Crawl stores petabytes of raw web data, including HTML pages, + metadata, and text extracts, collected through periodic crawls of publicly accessible websites. Its + datasets are freely available and widely used in research for tasks such as text mining, search engine + development, and large-scale content analysis. However, Common Crawl is less suited for projects + focused on a restricted number of websites, since accessing a few domains of interest requires + downloading and processing very large amounts of data, often including irrelevant content. Moreover, + the temporal granularity of the snapshots and the potential incompleteness of some archived websites + may limit its reliability for capturing the most up-to-date corporate information. 
In our case, we + therefore relied on Common Crawl only as a complementary source, to obtain the HTML content for those + sites that our script was unable to fetch.¹⁸ [¹⁸We used the snapshot CC-MAIN-2024-10, the 10th + main crawl of 2024.]", +} + diff --git a/gscholar_alerts/citations.jsonl b/gscholar_alerts/citations.jsonl index 2b4ebed..702256d 100644 --- a/gscholar_alerts/citations.jsonl +++ b/gscholar_alerts/citations.jsonl @@ -1136,7 +1136,7 @@ {"year":"2019","title":"Automating Analysis and Feedback to Improve Mathematics' Teachers' Classroom Discourse","authors":["A Suresh, T Sumner, J Jacobs, B Foland, W Ward - Paper submitted to the ninth …, 2019"],"snippet":"… GloVe or Global vectors for word representation is an unsupervised learning algorithm trained on aggregated word-word co-occurrence statistics from a corpus. In our model, we use the vectors trained on Common Crawl with 840 billion tokens and 300 dimensions …","url":["https://www.researchgate.net/profile/Jennifer_Jacobs8/publication/332233671_Automating_Analysis_and_Feedback_to_Improve_Mathematics%27_Teachers%27_Classroom_Discourse/links/5ca7c7394585157bd32535fc/Automating-Analysis-and-Feedback-to-Improve-Mathematics-Teachers-Classroom-Discourse.pdf"]} {"year":"2019","title":"Automating the Fact-Checking Task: Challenges and Directions","authors":["DNE da Silva - 2019"],"snippet":"Page 1. Automating the Fact-Checking Task: Challenges and Directions Dissertation zur Erlangung des Doktorgrades (Dr. rer. nat.) der Mathematisch-Naturwissenschaftlichen Fakultät der Rheinischen Friedrich-Wilhelms-Universität Bonn …","url":["http://hss.ulb.uni-bonn.de/2019/5500/5500.pdf"]} {"year":"2019","title":"Backlink Analyser using Apache Spark","authors":["M Zeeshan, S Asim, A Nadeem Anwar - 2019"],"snippet":"… Google Page Rank is assigned by Google based on different website factors (Design, Visitors, and Quality of content). 
Common Crawl [1] is an open repository for web crawl data … We will use subset of Common Crawl dataset good enough to demonstrate our system …","url":["http://dspace.cuilahore.edu.pk/xmlui/bitstream/handle/123456789/1438/SE29_Backlink%20Analyzer%20using%20Apache%20Spark.pdf?sequence=1&isAllowed=y"]} -{"year":"2019","title":"BEING PROFILED: COGITAS ERGO SUM: 10 Years of Profiling the European Citizen","authors":["I Baraliuc, E Bayamlioglu, M Hildebrandt, L Janssens - 2019"],"snippet":""} +{"year":"2019","title":"BEING PROFILED: COGITAS ERGO SUM: 10 Years of Profiling the European Citizen","authors":["I Baraliuc, E Bayamlioglu, M Hildebrandt, L Janssens - 2019"]} {"year":"2019","title":"Beyond Bag-of-Concepts: Vectors of Locally Aggregated Concepts","authors":["M Grootendorst, J Vanschoren"],"snippet":"… Word2Vec pre-trained embeddings were trained on the Google News data set and contain vectors for 3 million English words.1 GloVe pre-trained embeddings were trained on the Common Crawl data set and contain vectors for 1.9 million English words.2 Pre-trained …","url":["https://ecmlpkdd2019.org/downloads/paper/489.pdf"]} {"year":"2019","title":"Bidirectional Text Compression in External Memory","authors":["P Dinklage, J Ellert, J Fischer, D Köppl, M Penschuck - arXiv preprint arXiv …, 2019"],"snippet":"Page 1. Bidirectional Text Compression in External Memory Patrick Dinklage Technische Universität Dortmund, Department of Computer Science patrick.dinklage@ tu-dortmund.de Jonas Ellert Technische Universität Dortmund …","url":["https://arxiv.org/pdf/1907.03235"]} {"year":"2019","title":"Big Bidirectional Insertion Representations for Documents","authors":["L Li, W Chan - arXiv preprint arXiv:1910.13034, 2019"],"snippet":"… Eu- Page 3. Figure 1: Big Bidirectional Insertion Representations for Documents roparl, Rapid, News-Commentary) and parallel sentence-level data (WikiTitles, Common Crawl, Paracrawl). The test set is newstest2019. 
The …","url":["https://arxiv.org/pdf/1910.13034"]} @@ -1326,7 +1326,7 @@ {"year":"2019","title":"Frame Augmented Alternating Attention Network for Video Question Answering","authors":["W Zhang, S Tang, Y Cao, S Pu, F Wu, Y Zhuang - IEEE Transactions on Multimedia, 2019"],"snippet":"Page 1. 1520-9210 (c) 2019 IEEE. Personal use is permitted, but republication/ redistribution requires IEEE permission. See http://www.ieee.org/ publications_standards/publications/rights/index.html for more information. This …","url":["https://ieeexplore.ieee.org/abstract/document/8811730/"]} {"year":"2019","title":"Frequency, acceptability, and selection: A case study of clause-embedding","authors":["AS White, K Rawlins"],"snippet":"Page 1. Frequency, acceptability, and selection: A case study of clause-embedding Aaron Steven White University of Rochester aaron.white@rochester.edu Kyle Rawlins Johns Hopkins University kgr@jhu.edu Abstract We investigate …","url":["https://ling.auf.net/lingbuzz/004596/current.pdf"]} {"year":"2019","title":"From Legal to Technical Concept: Towards an Automated Classification of German Political Twitter Postings as Criminal Offenses","authors":["F Zufall, T Horsmann, T Zesch"],"snippet":"… We use a bi-directional LSTM (Hochreiter and Schmidhuber, 1997) for classification.30 We use the 300-dimensional German pre-trained word embeddings provided by Grave et al. 
(2018), which are trained on the German common crawl …","url":["https://www.researchgate.net/profile/Frederike_Zufall/publication/331475806_From_Legal_to_Technical_Concept_Towards_an_Automated_Classification_of_German_Political_Twitter_Postings_as_Criminal_Offenses/links/5ccbe9b0a6fdcc4719838905/From-Legal-to-Technical-Concept-Towards-an-Automated-Classification-of-German-Political-Twitter-Postings-as-Criminal-Offenses.pdf"]} -{"year":"2019","title":"Frontiersinpatternrecognitionandartificialintelligence","authors":["B Marleah, N Nicola, SC Yee - 2019"],"snippet":""} +{"year":"2019","title":"Frontiersinpatternrecognitionandartificialintelligence","authors":["B Marleah, N Nicola, SC Yee - 2019"]} {"year":"2019","title":"Frowning Frodo, Wincing Leia, and a Seriously Great Friendship: Learning to Classify Emotional Relationships of Fictional Characters","authors":["E Kim, R Klinger - arXiv preprint arXiv:1903.12453, 2019"],"snippet":"… We obtain word vectors for the embedding layer from GloVe (pre-trained on Common Crawl, d = 300, Pennington et al., 2014) and initialize out- of-vocabulary terms with zeros (including the po- sition indicators). 4 Experiments Experimental Setting …","url":["https://arxiv.org/pdf/1903.12453"]} {"year":"2019","title":"Fusing Vector Space Models for Domain-Specific Applications","authors":["L Rettig, J Audiffren, P Cudré-Mauroux - arXiv preprint arXiv:1909.02307, 2019"],"snippet":"… Despite the convenience they bring, using such readilyavailable, pre-trained models is often suboptimal in vertical applications [2], [3]; as these models are pre-trained on large, non-specific sources (eg, Wikipedia and the Common …","url":["https://arxiv.org/pdf/1909.02307"]} {"year":"2019","title":"Gating Mechanisms for Combining Character and Word-level Word Representations: An Empirical Study","authors":["JA Balazs, Y Matsuo - arXiv preprint arXiv:1904.05584, 2019"],"snippet":"Page 1. 
Gating Mechanisms for Combining Character and Word-level Word Representations: An Empirical Study Jorge A. Balazs and Yutaka Matsuo Graduate School of Engineering The University of Tokyo {jorge, matsuo}@weblab.tu-tokyo.ac.jp Abstract …","url":["https://arxiv.org/pdf/1904.05584"]} @@ -1847,7 +1847,7 @@ {"year":"2020","title":"Answering Comparative Questions with Arguments","authors":["A Bondarenko, A Panchenko, M Beloucif, C Biemann… - Datenbank-Spektrum, 2020"],"snippet":"… analyzing Wikidata and DBpedia as additional sources of (structured) information besides the retrieval of sentences/documents from the Common Crawl … Ruppert E, Faralli S, Ponzetto SP, Biemann C (2018) Building a …","url":["https://link.springer.com/article/10.1007/s13222-020-00346-8"]} {"year":"2020","title":"Answering Event-Related Questions over Long-term News Article Archives","authors":["J Wang, A Jatowt, M Färber, M Yoshikawa"],"snippet":"… We can see that the actual time scope (January, 1988) of the first question is reflected relatively well by its distribution of relevant documents as generally 4 We use Glove [23] embeddings trained on the Common Crawl dataset with 300 dimensions. Page 6 …","url":["http://www.aifb.kit.edu/images/1/19/QA_ECIR2020.pdf"]} {"year":"2020","title":"Application of Machine Learning Techniques for Text Generation","authors":["S Martí Román - 2020"],"snippet":"Page 1. 
Escola Tècnica Superior d'Enginyeria Informàtica Universitat Politècnica de València Application of Machine Learning Techniques for Text Generation DEGREE FINAL WORK Degree in Computer Engineering Author: Salvador Martí Román …","url":["https://riunet.upv.es/bitstream/handle/10251/149583/Mart%C3%AD%20-%20Uso%20de%20t%C3%A9cnicas%20de%20aprendizaje%20autom%C3%A1tico%20para%20la%20generaci%C3%B3n%20de%20texto.pdf?sequence=1"]} -{"year":"2020","title":"Application of Machine Learning to Classify News Headlines","authors":["P Guttula, RM Aburas, S Srijan"],"snippet":""} +{"year":"2020","title":"Application of Machine Learning to Classify News Headlines","authors":["P Guttula, RM Aburas, S Srijan"]} {"year":"2020","title":"AQuaMuSe: Automatically Generating Datasets for Query-Based Multi-Document Summarization","authors":["S Kulkarni, S Chammas, W Zhu, F Sha, E Ie - arXiv preprint arXiv:2010.12694, 2020"],"snippet":"… is a nontrivial task in itself and there are several con1https://commoncrawl org … paragraphs, we use a pre-processed and cleaned version of the Common Crawl corpus (Raffel et al … We illustrate our approach us- ing Google's Natural …","url":["https://arxiv.org/pdf/2010.12694"]} {"year":"2020","title":"AraWEAT: Multidimensional Analysis of Biases in Arabic Word Embeddings","authors":["A Lauscher, R Takieddin, SP Ponzetto, G Glavaš - arXiv preprint arXiv:2011.01575, 2020"],"snippet":"… For FT, we investigate two models, one trained on the portions of Wikipedia and CommonCrawl corpora written in Modern Standard Arabic (MS) and the other on portions written in Egyptian Arabic.9 We evaluate the four variants …","url":["https://arxiv.org/pdf/2011.01575"]} {"year":"2020","title":"ArchiMeDe@ DANKMEMES: A New Model Architecture for Meme Detection","authors":["J Setpal, G Sarti"],"snippet":"… We fine-tune representations over the available meme textual data and use them as components of our end-to-end system. 
1umberto-commoncrawl-cased-v1 in the HuggingFace's model hub (Wolf et al., 2019) Page 4. 2.3 Visual input …","url":["http://ceur-ws.org/Vol-2765/paper138.pdf"]} @@ -2897,7 +2897,7 @@ {"year":"2021","title":"Bilingual Lexicon Induction via Unsupervised Bitext Construction and Word Alignment","authors":["H Shi, L Zettlemoyer, SI Wang - arXiv preprint arXiv:2101.00148"],"snippet":"Page 1. Bilingual Lexicon Induction via Unsupervised Bitext Construction and Word Alignment Haoyue Shi ∗ TTI-Chicago freda@ttic.edu Luke Zettlemoyer University of Washington Facebook AI Research lsz@fb.com …","url":["https://arxiv.org/pdf/2101.00148"]} {"year":"2021","title":"BitextEdit: Automatic Bitext Editing for Improved Low-Resource Machine Translation","authors":["E Briakou, SI Wang, L Zettlemoyer, M Ghazvininejad - arXiv preprint arXiv …, 2021"],"snippet":"Mined bitexts can contain imperfect translations that yield unreliable training signals for Neural Machine Translation (NMT). While filtering such pairs out is known to improve final model quality, we argue that it is suboptimal in low-resource conditions …","url":["https://arxiv.org/pdf/2111.06787"]} {"year":"2021","title":"Blank spots, critical information needs and local journalism fund-ing","authors":["S Bisiani"],"snippet":"Abstract A global business model crisis in journalism, fuelled by loss in advertising revenue, challenges the survival of local news production. 
In Sweden, it has led to the closure of several newspapers across the country, and the concentration of …","url":["http://compscjournalism.org/projects/simona/projects/Master_Thesis_Simona_Bisiani.pdf"]} -{"year":"2021","title":"Book genre and author's gender recognition based on titles","authors":["A Pawłowski, E Herden, T Walkowiak - … and Text: Data, models, information and …, 2021"],"snippet":""} +{"year":"2021","title":"Book genre and author's gender recognition based on titles","authors":["A Pawłowski, E Herden, T Walkowiak - … and Text: Data, models, information and …, 2021"]} {"year":"2021","title":"BOSS: Bandwidth-Optimized Search Accelerator for Storage-Class Memory","authors":["J Heo, SY Lee, S Min, Y Park, SJ Jung, TJ Ham…"],"snippet":"Page 1. BOSS: Bandwidth-Optimized Search Accelerator for Storage-Class Memory Jun Heo, Seung Yul Lee, Sunhong Min, Yeonhong Park, Sung Jun Jung, Tae Jun Ham, Jae W. Lee Seoul National University {j.heo, triomphant1 …","url":["https://conferences.computer.org/iscapub/pdfs/ISCA2021-4ghucdBnCWYB7ES2Pe4YdT/333300a279/333300a279.pdf"]} {"year":"2021","title":"Bottom-Up Shift and Reasoning for Referring Image Segmentation","authors":["S Yang, M Xia, G Li, HY Zhou, Y Yu - Proceedings of the IEEE/CVF Conference on …, 2021"],"snippet":"Page 1. Bottom-Up Shift and Reasoning for Referring Image Segmentation Sibei Yang1∗† Meng Xia2∗ Guanbin Li2 Hong-Yu Zhou3 Yizhou Yu3,4† 1ShanghaiTech University 2Sun Yat-sen University 3The University of Hong Kong 4Deepwise AI Lab Abstract …","url":["https://openaccess.thecvf.com/content/CVPR2021/papers/Yang_Bottom-Up_Shift_and_Reasoning_for_Referring_Image_Segmentation_CVPR_2021_paper.pdf"]} {"year":"2021","title":"bradleypallen/keras-quora-question-pairs","authors":["ONDJF Mar, AMJJA Sep, OSMTW Thu, F Sat"],"snippet":"… Model, Source of Word Embeddings, Accuracy. \"BiMPM model\" [5], GloVe Common Crawl (840B tokens, 300D), 0.88 … \"Decomposable attention\" [6], \"Quora's text corpus\", 0.86. 
\"LDC\" [5], GloVe Common Crawl (840B tokens, 300D), 0.86 …","url":["https://giters.com/bradleypallen/keras-quora-question-pairs?amp=1"]} @@ -3493,7 +3493,7 @@ {"year":"2021","title":"Pre-training Methods in Information Retrieval","authors":["Y Fan, X Xie, Y Cai, J Chen, X Ma, X Li, R Zhang, J Guo… - arXiv preprint arXiv …, 2021"],"snippet":"The core of information retrieval (IR) is to identify relevant information from large-scale resources and return it as a ranked list to respond to user's information need. Recently, the resurgence of deep learning has greatly advanced this field and leads …","url":["https://arxiv.org/pdf/2111.13853"]} {"year":"2021","title":"Predicting cross-linguistic adjective order with information gain","authors":["W Dyer, R Futrell, Z Liu, G Scontras - arXiv preprint arXiv:2012.15263, 2020"],"snippet":"… 4.1 Data Our study relies on two types of source data, both extracted from the CoNLL 2017 Shared Task: Multilingual Parsing from Raw Text to Universal De- pendencies (Ginter et al., 2017; Zeman et al., 2017) a set of …","url":["https://arxiv.org/pdf/2012.15263"]} {"year":"2021","title":"Predicting Depression and Suicide Ideation in the Canadian Population Using Social Media Data","authors":["R Skaik - 2021"],"snippet":"Page 1. Predicting Depression and Suicide Ideation in the Canadian Population Using Social Media Data by Ruba Skaik Thesis submitted to the University of Ottawa in partial Fulfillment of the requirements for the Ph.D. 
degree in Computer Science …","url":["https://ruor.uottawa.ca/bitstream/10393/42346/7/Skaik_Ruba_2021_thesis.pdf"]} -{"year":"2021","title":"Predicting leadership perception with large-scale natural language data","authors":["S Bhatia, CY Olivola, N Bhatia, A Ameen - The Leadership Quarterly, 2021"],"snippet":"","url":["https://www.sciencedirect.com/science/article/pii/S1048984321000400"]} +{"year":"2021","title":"Predicting leadership perception with large-scale natural language data","authors":["S Bhatia, CY Olivola, N Bhatia, A Ameen - The Leadership Quarterly, 2021"],"url":["https://www.sciencedirect.com/science/article/pii/S1048984321000400"]} {"year":"2021","title":"Predicting Next Dialogue Action in Emotionally Loaded Conversation","authors":["D Deksne, R Skadiņš - Proceedings of the Future Technologies Conference, 2021"],"snippet":"… the Latvian Wikipedia data 2 and trained with our internal monolingual corpora, as well as several BERT models: multilingual BERT [26], BERT-LV [27], Language-agnostic BERT Sentence Embedding (LaBSE) [28], and the multilingual XLM-R model trained …","url":["https://link.springer.com/chapter/10.1007/978-3-030-89906-6_19"]} {"year":"2021","title":"Predicting the Performance of Multilingual NLP Models","authors":["A Srinivasan, S Sitaram, T Ganu, S Dandapat, K Bali… - arXiv preprint arXiv …, 2021"],"snippet":"… The pretraining procedure makes use of only unlabelled data (raw text), which is easy to obtain for a large number of languages from various sources including CommonCrawl1 and Wikipedia dumps2. These models can then be used to solve a …","url":["https://arxiv.org/pdf/2110.08875"]} {"year":"2021","title":"Preprint from: https://www. gipp. com/pub","authors":["T Spinde, L Rudnitckaia, K Sinha, F Hamborg"],"snippet":"… We scraped articles from both news outlets, published in the last decade, from 2010 to 2020, from Common Crawl [4]. For preprocessing, we use Genism simple preprocessing and generate n-grams. 
3.4 Linear mapping between vector spaces …","url":["https://www.gipp.com/wp-content/papercite-data/pdf/spinde2021.pdf"]} @@ -3627,7 +3627,7 @@ {"year":"2021","title":"Step-unrolled Denoising Autoencoders for Text Generation","authors":["N Savinov, J Chung, M Binkowski, E Elsen, A Oord - arXiv preprint arXiv:2112.06749, 2021"],"snippet":"… results on unconditional language modeling on the Colossal Cleaned Common Crawl dataset and a dataset of Python code from GitHub. … • We demonstrate good qualitative results for unconditional generation and inpainting on Colossal Clean …","url":["https://arxiv.org/pdf/2112.06749"]} {"year":"2021","title":"Stepmothers are mean and academics are pretentious: What do pretrained language models learn about you?","authors":["R Choenni, E Shutova, R van Rooij - arXiv preprint arXiv:2109.10052, 2021","WWE Do, FLDT Fine-Tuning"],"snippet":"… are monolingual and 2 multilingual: BERT (Devlin et al., 2019) uncased trained on the BooksCorpus dataset (Zhu et al., 2015) and English Wikipedia; RoBERTa (Liu et al., 2019), the optimized version of BERT that is in addition …","url":["https://arxiv.org/pdf/2109.10052","https://deepai.org/publication/stepmothers-are-mean-and-academics-are-pretentious-what-do-pretrained-language-models-learn-about-you"]} {"year":"2021","title":"Stock Volume Prediction Based on Polarity of Tweets, News, and Historical Data Using Deep Learning","authors":["N Jawahar, J Chelladurai, I Sakthivel, B Bajracharya - 2020 2nd International …, 2020"],"snippet":"… token in order to predict the entity of that token. The CNN core model is pre-trained with GloVe vectors on Common Crawl, with 86.43% precision and 86.37% recall for NER. 
A python script is written that uses Psycopg to extract …","url":["https://dl.acm.org/doi/abs/10.1145/3440054.3440063"]} -{"year":"2021","title":"Storytelling Exhibitions: Identity, Truth and Wonder","authors":["P Hughes - 2021"],"snippet":""} +{"year":"2021","title":"Storytelling Exhibitions: Identity, Truth and Wonder","authors":["P Hughes - 2021"]} {"year":"2021","title":"Strategyproof Learning: Building Trustworthy User-Generated Datasets","authors":["S Farhadkhani, R Guerraoui, LN Hoang - arXiv preprint arXiv:2106.02398, 2021"],"snippet":"Page 1. arXiv:2106.02398v1 [cs.LG] 4 Jun 2021 Strategyproof Learning: Building Trustworthy User-Generated Datasets Sadegh Farhadkhani IC School, EPFL Lausanne, Switzerland sadegh.farhadkhani@epfl.ch Rachid Guerraoui …","url":["https://arxiv.org/pdf/2106.02398"]} {"year":"2021","title":"Streaming cascade-based speech translation leveraged by a direct segmentation model","authors":["J Iranzo-Sánchez, J Jorge, P Baquero-Arnal… - Neural Networks, 2021"],"snippet":"JavaScript is disabled on your browser. Please enable JavaScript to use all the features on this page. 
Skip to main content Skip to article …","url":["https://www.sciencedirect.com/science/article/pii/S0893608021002057"]} {"year":"2021","title":"Streamlining the Identification of Emerging Tasks in the O* NET System Using Natural Language Processing (NLP): Technical Summary","authors":["JA Dahlke, DJ Putka - 2021"],"snippet":"… In our case, we used embeddings from the Global Vectors for Word Representation (GloVe; Pennington, Socher, & Manning, 2014) algorithm trained on a “common crawl” of the internet consisting of 42 billion tokens (ie, wordlike pieces of text) …","url":["https://www.onetcenter.org/dl_files/EmergingTasksNLP.pdf"]} @@ -8064,7 +8064,7 @@ {"year":"2024","title":"JLBert: Japanese Light BERT for Cross-Domain Short Text Classification","authors":["C Kayal, S Chattopadhyay, A Gupta, S Abrol, A Gugol - Proceedings of the 2024 Joint …, 2024"],"snippet":"… Second, there are limited models that have been trained on diverse datasets beyond Wikipedia and Common Crawl, limiting experimentation with other forms of text data. Moreover, with the English language being the focus of research interest …","url":["https://aclanthology.org/2024.lrec-main.833.pdf"]} {"year":"2024","title":"JOANNA ZYLINSKA: Diffused Seeing","authors":["J ZYLINSKA"],"snippet":"This article examines the transformation of the relationship between seeing and understanding in humans and machines by the technologies of machine learning known as ‘generative AI’. Taking Stable Diffusion as the main case study, while also …","url":["https://mediatheoryjournal.org/2024/09/30/joanna-zylinska-diffused-seeing/"]} {"year":"2024","title":"Journal of Business Academy","authors":["E GEÇİCİ - Hakem Kurulu"],"snippet":"This study examines in detail the potential of artificial intelligence (AI)-assisted ChatGPT in the field of accounting education and the opportunities and challenges that this utilisation may bring. 
ChatGPT can benefit by reducing error rates and …","url":["https://www.isakder.org/2024/vol.5_issue2_full_issue.pdf#page=8"]} -{"year":"2024","title":"Juru: Legal Brazilian Large Language Model from Reputable Sources","authors":["RM Junior, R Pires, R Romero, R Nogueira - arXiv preprint arXiv:2403.18140, 2024"],"snippet":"The high computational cost associated with pretraining large language models limits their research. Two strategies have emerged to address this issue: domain specialization and pretraining with high-quality data. To explore these strategies, we …","url":["https://arxiv.org/pdf/2403.18140"]} +{"year":"2024","title":"Juru: Legal Brazilian Large Language Model from Reputable Sources","authors":["R Nogueira - … Systems: 35th Brazilian Conference, BRACIS 2025 …, 2026","RM Junior, R Pires, R Romero, R Nogueira - arXiv preprint arXiv:2403.18140, 2024"],"snippet":"The high compute cost associated with pretraining large language models limits their research. Two strategies have emerged to address this issue: domain specialization and pretraining with highquality data. To explore these strategies, we …","url":["https://arxiv.org/pdf/2403.18140","https://books.google.de/books?hl=en&lr=lang_en&id=dXG6EQAAQBAJ&oi=fnd&pg=PA121&dq=commoncrawl&ots=QngSfjEHQW&sig=BaJJD04xXD9NfacK1r8J8kiD83Y"]} {"year":"2024","title":"Just Rewrite It Again: A Post-Processing Method for Enhanced Semantic Similarity and Privacy Preservation of Differentially Private Rewritten Text","authors":["S Meisenbacher, F Matthes - arXiv preprint arXiv:2405.19831, 2024"],"snippet":"The study of Differential Privacy (DP) in Natural Language Processing often views the task of text privatization as a $\\textit{rewriting}$ task, in which sensitive input texts are rewritten to hide explicit or implicit private information. 
In order to evaluate the …","url":["https://arxiv.org/pdf/2405.19831"]} {"year":"2024","title":"JusticeAI: A Large Language Models Inspired Collaborative & Cross-Domain Multimodal System for Automatic Judicial Rulings in Smart Courts","authors":["NA Samee, M Alabdulhafith, SMAH Shah, A Rizwan - IEEE Access, 2024"],"snippet":"… For GloVe embedding’s, which are static, we employed embedding’s pre-trained on large-scale datasets such as the Common Crawl or Wikipedia + Gig word corpus. For the dynamic embedding’s from BERT, ALBERT, RoBERTa, and Distilled BERT …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/10743188.pdf"]} {"year":"2024","title":"KazQAD: Kazakh Open-Domain Question Answering Dataset","authors":["R Yeshpanov, P Efimov, L Boytsov, A Shalkarbayuli… - arXiv preprint arXiv …, 2024"],"snippet":"… Kaz-RoBERTa6 is a monolingual model trained on a collection of Kazakh texts from Common Crawl, the Leipzig Corpora Collection, the … XLM-R was pre-trained on a larger multilingual corpus derived from Common Crawl, in which Kazakh is …","url":["https://arxiv.org/pdf/2404.04487"]} @@ -9190,6 +9190,7 @@ {"year":"2025","title":"$\\texttt {Droid} $: A Resource Suite for AI-Generated Code Detection","authors":["D Orel, I Paul, I Gurevych, P Nakov - arXiv preprint arXiv:2507.10583, 2025"],"snippet":"In this work, we compile $\\textbf{$\\texttt{DroidCollection}$}$, the most extensive open data suite for training and evaluating machine-generated code detectors, comprising over a million code samples, seven programming languages, outputs …","url":["https://arxiv.org/pdf/2507.10583"]} {"year":"2025","title":"'Nobody's Framework': Article 4 CDSM and the Broken Promise of TDM in the Age of AI","authors":["AG Morais - 2025"],"snippet":"Generative Artificial Intelligence (GenAI) has introduced significant legal and regulatory challenges, particularly concerning the use of copyrighted content to train models through text and data mining (TDM). 
Within the European Union (EU), this …","url":["https://lup.lub.lu.se/luur/download?func=downloadFile&recordOId=9197022&fileOId=9197035"]} {"year":"2025","title":"(Mis) Fitting: A Survey of Scaling Laws","authors":["M Li, S Kudugunta, L Zettlemoyer - arXiv preprint arXiv:2502.18969, 2025"],"snippet":"Modern foundation models rely heavily on using scaling laws to guide crucial training decisions. Researchers often extrapolate the optimal architecture and hyper parameters settings from smaller training runs by describing the relationship …","url":["https://arxiv.org/pdf/2502.18969"]} +{"year":"2025","title":"11. Open data in computational social science research","authors":["LD Ibáñez, J Walker, E Simperl", "LD Ibáñez, J Walker, E Simperl - Handbook of Computational Social Science, 2025"],"snippet":"Data are seen as critical to solving the most complex problems of today. More and more data are available online, and they are used to improve decisions, create efficiencies, unlock innovation, and hold organizations to account. For instance, in …","url":["https://www.elgaronline.com/downloadpdf/edcollchap/book/9781802207309/chapter11.pdf", "https://books.google.de/books?hl=en&lr=lang_en&id=bAOhEQAAQBAJ&oi=fnd&pg=PA150&dq=commoncrawl&ots=jVQGkKSQ5H&sig=MEPsQ3Iqd7c8qlayZXVENXKUCWI"]} {"year":"2025","title":"13 Libyan Translators' Attitudes toward the Profession in the Era of Automation","authors":["NASA Ali, M Babchikh - … Intelligence in Translation: Possibilities, Processes and …, 2025"],"snippet":"Artificial intelligence (AI) has affected many aspects of human life and transformed various industries, including translation. Like many professionals, some translators are concerned about the future of their business and how AI will affect it. 
Others …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=KmqIEQAAQBAJ&oi=fnd&pg=PT179&dq=commoncrawl&ots=RcOpBL7jL1&sig=JD2zdouqt_pe3ZqOfPu_NX0vwlE"]} {"year":"2025","title":"13 Machine Learning in Phishing URL Detection: A Review of Recent Progress","authors":["A Simhadri, M Rishikesh, M Subramaniam - Power Energy and Secure Smart …, 2025"],"snippet":"In 2023, the Anti-Phishing Working Group, a prominent cybersecurity organization, reported five million phishing attacks that affected systems globally, thereby sending a worldwide signal of the alarming increase in incidents. Phishing remains a favored …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=-JdnEQAAQBAJ&oi=fnd&pg=PA92&dq=commoncrawl&ots=ARChckrbIr&sig=cL-Hu50M14JyWZcZrSR11qyeP5w"]} {"year":"2025","title":"2.5 Years in Class: A Multimodal Textbook for Vision-Language Pretraining","authors":["W Zhang, H Zhang, X Li, J Sun, Y Shen, W Lu, D Zhao… - arXiv preprint arXiv …, 2025"],"snippet":"… These corpora, consisting of sequences of text paragraphs interspersed with images, are typically crawled from webpage and document, such as Common Crawl Pretraining on a combination of interleaved corpus and image-pair datasets enables …","url":["https://arxiv.org/pdf/2501.00958"]} @@ -9217,6 +9218,7 @@ {"year":"2025","title":"A Comparative Analysis of Static Word Embeddings for Hungarian","authors":["M Gedeon - arXiv preprint arXiv:2505.07809, 2025"],"snippet":"This paper presents a comprehensive analysis of various static word embeddings for Hungarian, including traditional models such as Word2Vec, FastText, as well as static embeddings derived from BERT-based models using different extraction …","url":["https://arxiv.org/pdf/2505.07809"]} {"year":"2025","title":"A Comparative Analysis of Transformer Models for the Prediction of Arabic Punctuation","authors":["A Aboutaib, A El Allaoui, I Zeroual - … Conference on Artificial Intelligence and Smart …, 2025"],"snippet":"We present a comprehensive 
comparative analysis of different transformer models for the task of punctuation prediction in Arabic text. The models evaluated include Asafaya-BERT, XLM-RoBERTa, Google BERT Multi-lingual, AraBERT, MarBERT …","url":["https://link.springer.com/chapter/10.1007/978-3-031-90921-4_96"]} {"year":"2025","title":"A Comparative Approach for Auditing Multilingual Phonetic Transcript Archives","authors":["F Samir, EP Ahn, S Prakash, M Soskuthy, V Shwartz… - Transactions of the …, 2025"],"snippet":"Curating datasets that span multiple languages is challenging. To make the collection more scalable, researchers often incorporate one or more imperfect classifiers in the process, like language identification models. These models …","url":["https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00759/131563"]} +{"year":"2025","title":"A Comparative Performance Study of Retrieval-Augmented Generation Systems in Gynecologic Oncology","authors":["S Peng, W Kuerbanjiang, H Xiao, X He, Y Yi - 2025"],"snippet":"Large language models (LLMs) show great potential in oncology, but their utility is limited by hallucinations and static knowledge. Retrieval-augmented generation (RAG), which grounds model outputs in curated clinical sources, can mitigate these issues …","url":["https://www.researchsquare.com/article/rs-8221196/latest"]} {"year":"2025","title":"A Comparative Study of Korean Text Summarization Performance According","authors":["M Song - Intelligent Sustainable Systems: Selected Papers of …"],"snippet":"In the current NLP research status, there are active studies that have attempted to improve performance through fine-tuning or scaling up by suggesting various PLMs. 
However, it is difficult to find research analyzing which architecture features of PLMs …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=axpIEQAAQBAJ&oi=fnd&pg=PA65&dq=commoncrawl&ots=0TepaNL6xD&sig=B1Vdms5zoG0xspkiHf8tSwi36oo"]} {"year":"2025","title":"A COMPARATIVE STUDY OF MACHINE LEARNING AND DEEP LEARNING MODELS: STRENGTHS, LIMITATIONS, AND APPLICATIONS","authors":["N Kumari - Journal ID"],"snippet":"Artificial Intelligence (AI) by enabling systems to learn from data and make intelligent decisions. This paper presents an in-depth analysis of various ML and DL models, comparing their architectures, applications, strengths, and limitations. By exploring …","url":["https://www.researchgate.net/profile/Iaeme-Pub/publication/392942791_A_COMPARATIVE_STUDY_OF_MACHINE_LEARNING_AND_DEEP_LEARNING_MODELS_STRENGTHS_LIMITATIONS_AND_APPLICATIONS/links/685a228793040b17338cc893/A-COMPARATIVE-STUDY-OF-MACHINE-LEARNING-AND-DEEP-LEARNING-MODELS-STRENGTHS-LIMITATIONS-AND-APPLICATIONS.pdf"]} {"year":"2025","title":"A Comparative Study of Sentiment Analysis Using Transformer Models","authors":["A Hegde, G Kavya, SH Lakshmaiah - Speech and Language Technologies for Low …, 2025"],"snippet":"… XLMRoberta: is a multilingual version of RoBERTa which is pretrained on 2.5 TB filtered Common Crawl data containing 100 languages including Bangla [30]. The extensive pretraining enables XLM-RoBERTa to effectively handle and understand …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=8_mTEQAAQBAJ&oi=fnd&pg=PA183&dq=commoncrawl&ots=UUJBlfMXvA&sig=TV5lUTMoKOyS8KrU4j2e-MathO8"]} @@ -9227,8 +9229,12 @@ {"year":"2025","title":"A Comparative Survey on Large Language Models for Biological Data","authors":["R Mousa, A Sarabadani, T Taami, AA Bengari… - 2025"],"snippet":"The development of large language models (LLMs) has grown exponentially since the release of ChatGPT. Large language models have gained attention for their robust performance across various tasks. 
The ability of LLMs to understand and …","url":["https://www.preprints.org/frontend/manuscript/7dd6d8ddb94f9bc02dc4e1a764957e07/download_pub"]} {"year":"2025","title":"A Comparison and Critical Reflection of Information Disorder Detection Techniques: Performing a Cross-Data and Cross-Model Evaluation","authors":["MN Gruensteidl, S Kirrane - Information Fusion, 2025"],"snippet":"Abstract Information disorders, such as dis-, mis-, and malinformation, can lead to societal and/or economic harm. They are rapidly spread, extensively consumed on the web, and represent a threat to democracy. AI-based detection models can …","url":["https://www.sciencedirect.com/science/article/pii/S1566253525008681"]} {"year":"2025","title":"A Comprehensive Overview and Analysis of Large Language Models: Trends and Challenges","authors":["A Mohammed, R Kora - IEEE Access, 2025"],"snippet":"Large Language Models (LLMs) have transformed numerous fields by offering innovative solutions that drive advancements across a wide range of applications. However, their widespread adoption presents several challenges, including …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/11015742.pdf"]} +{"year":"2025","title":"A Comprehensive Study of Implicit and Explicit Biases in Large Language Models","authors":["F Kazi, A Young, Y Inani, S Rafatirad - arXiv preprint arXiv:2511.14153, 2025"],"snippet":"Large Language Models (LLMs) inherit explicit and implicit biases from their training datasets. Identifying and mitigating biases in LLMs is crucial to ensure fair outputs, as they can perpetuate harmful stereotypes and misinformation. 
This study …","url":["https://arxiv.org/pdf/2511.14153"]} {"year":"2025","title":"A Comprehensive Study of LLM and Evolution, Varieties, and Their Role in Software Engineering and Cybersecurity","authors":["H Rasel, ABS Didar, AAM Dinar, FI Fahad, MAJ Khan… - 2025"],"snippet":"… With 175 billion parameters, which is over 100 times larger than GPT-2, GPT-3 was trained on a mix of Common Crawl, Wikipedia, books, and other large web datasets. It kept the same Transformer decoder-only structure but pushed the limits …","url":["https://www.preprints.org/frontend/manuscript/add06b943589c89aeb4522db9818a7d7/download_pub"]} +{"year":"2025","title":"A Comprehensive Study of Supervised Machine Learning Models for Zero-Day Attack Detection: Analyzing Performance on Imbalanced Data","authors":["Z Lotfi, M Lotfi - arXiv preprint arXiv:2512.07030, 2025"],"snippet":"Among the various types of cyberattacks, identifying zero-day attacks is problematic because they are unknown to security systems as their pattern and characteristics do not match known blacklisted attacks. There are many Machine Learning (ML) …","url":["https://arxiv.org/pdf/2512.07030"]} {"year":"2025","title":"A comprehensive survey on Arabic text augmentation: approaches, challenges, and applications","authors":["AA ElSabagh, SS Azab, HA Hefny - Neural Computing and Applications, 2025"],"snippet":"Arabic is a linguistically complex language with a rich structure and valuable syntax that pose unique challenges for natural language processing (NLP), primarily due to the scarcity of large, reliable annotated datasets essential for training models. 
The …","url":["https://link.springer.com/article/10.1007/s00521-025-11020-z"]} +{"year":"2025","title":"A Comprehensive Survey on Data Distillation: Techniques, Frameworks, and Future Directions","authors":["Q Razi, S Singh, R Priyadarshini, V Hassija… - IEEE Internet of Things …, 2025"],"snippet":"… Large-scale collections, such as the common crawl, capture snapshots of the internet and provide datasets that can reach staggering sizes in some cases, up to 541 terabytes. These massive resources serve as a goldmine for training large …","url":["https://ieeexplore.ieee.org/abstract/document/11250975/"]} +{"year":"2025","title":"A Comprehensive Survey on Large Language Models: From Pre-training to Autonomous Agents","authors":["J Xu, CX Liang, Z Bi, X Li, D Zhang, Z Yu"],"snippet":"Large language models (LLMs) have emerged as one of the most transformative developments in artificial intelligence, demonstrating remarkable capabilities across natural language understanding, generation, reasoning, and multimodal perception …","url":["https://www.researchgate.net/profile/Ziqian_Bi/publication/399059225_A_Comprehensive_Survey_on_Large_Language_Models_From_Pre-training_to_Autonomous_Agents/links/694c94a07e61d05b5312836f/A-Comprehensive-Survey-on-Large-Language-Models-From-Pre-training-to-Autonomous-Agents.pdf"]} {"year":"2025","title":"A Comprehensive Survey on Long Context Language Modeling","authors":["J Liu, D Zhu, Z Bai, Y He, H Liao, H Que, Z Wang… - arXiv preprint arXiv …, 2025"],"snippet":"Efficient processing of long contexts has been a persistent pursuit in Natural Language Processing. 
With the growing number of long documents, dialogues, and other textual data, it is important to develop Long Context Language Models (LCLMs) …","url":["https://arxiv.org/pdf/2503.17407"]} {"year":"2025","title":"A Comprehensive Survey on Reinforcement Learning-based Agentic Search: Foundations, Roles, Optimizations, Evaluations, and Applications","authors":["M Lin, Z Wu, Z Xu, H Liu, X Tang, Q He, C Aggarwal… - arXiv preprint arXiv …, 2025"],"snippet":"The advent of large language models (LLMs) has transformed information access and reasoning through open-ended natural language interaction. However, LLMs remain limited by static knowledge, factual hallucinations, and the inability to retrieve …","url":["https://arxiv.org/pdf/2510.16724"]} {"year":"2025","title":"A Concise Survey on Modern Web‐Based Phishing Techniques and Advanced Mitigation Strategies","authors":["D Panneerselvam, SC Sethuraman, AJ Emerson… - Transactions on Emerging …, 2025"],"snippet":"Phishing is a tactical technique practiced by cyber‐criminals, wherein the target systems are approached, made vulnerable, and exploited. A Phisher who does the act of phishing is always creative, calculative, and persistent. This potentially leads …","url":["https://onlinelibrary.wiley.com/doi/abs/10.1002/ett.70119"]} @@ -9240,6 +9246,7 @@ {"year":"2025","title":"A Dual Contrastive Learning Framework for Enhanced Hate Speech Detection in Low-Resource Languages","authors":["K Chavinda, U Thayasivam - Proceedings of the First Workshop on Challenges in …, 2025"],"snippet":"Hate speech on social media platforms is a critical issue, especially in low-resource languages such as Sinhala and Tamil, where the lack of annotated datasets and linguistic tools hampers the development of effective detection systems. 
This …","url":["https://aclanthology.org/2025.chipsal-1.11.pdf"]} {"year":"2025","title":"A Feasibility Study & Implementation","authors":["TA Bloch, I Inusa - Proceedings of 5th International Conference on Recent …"],"snippet":"Natural Language Processing (NLP) is the emerging field research studies of the interaction between human and computing systems. With advancement of NLP techniques, machines are becoming increasingly proficient in understanding …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=kr1KEQAAQBAJ&oi=fnd&pg=PA1&dq=commoncrawl&ots=y7cpa-6W2n&sig=NBai9td8kclKb4ILEfMHRnWqdMI"]} {"year":"2025","title":"A Framework for Auditing Chatbots for Dialect-Based Quality-of-Service Harms","authors":["E Harvey, RF Kizilcec, A Koenecke - arXiv preprint arXiv:2506.04419, 2025"],"snippet":"… Researchers have found that widely-used sources of training data, like the Common Crawl,2 contain hate speech and other harmful content [58], and that this content increases as datasets scale [10]. To address this, LLM developers have …","url":["https://arxiv.org/pdf/2506.04419"]} +{"year":"2025","title":"A Framework for Ethical Data Removal from Language Resources: An Example from Low-Resource Language Communities","authors":["S Luger, R Mosquera-Gómez, PO Suárez, T Vaughan - Proceedings of the AAAI …, 2025"],"snippet":"… The authors acknowledge the support from our colleagues at Common Crawl, MLCommons, Factored AI, and iMerit without which this work would not be possible. 
We also appreciate the thoughtful conversations on indigenous data rights with …","url":["https://ojs.aaai.org/index.php/AAAI-SS/article/download/36866/39004"]} {"year":"2025","title":"A Framework for Safe AI: Data Governance and Ecosystem Structure","authors":["WT Tsai, L Zhang - 2025 IEEE International Conference on Artificial …, 2025"],"snippet":"… Recent complaints in the United States against Cohere’s Command family of models [11] detail extensive use of third-party corpora such as Common Crawl’s C4 without authorization—actions that, according to plaintiffs, constitute both copyright …","url":["https://ieeexplore.ieee.org/abstract/document/11127262/"]} {"year":"2025","title":"A framework for spatial clustering of textual objects: applications in topic clustering and text segmentation","authors":["G Guex - Cahiers du Centre de Linguistique et des Sciences du …, 2025"],"snippet":"We present a general, classical, framework of spatial clustering which can be applied to various textual objects (eg character n-grams, words, sentences). This framework proposes to cluster objects according to users defined linguistic similarity …","url":["https://www.cahiers-clsl.ch/article/view/8346/8132"]} {"year":"2025","title":"A Greek Dataset for the Detection of Online Sexism Against Women","authors":["PA Mantziou - 2025"],"snippet":"This study introduces, to the best of our knowledge, the first large-scale,(~ 14000 instances) manually annotated dataset for the detection of online sexism in the Greek language, against women. 
The dataset was collected from multiple social …","url":["https://pergamos.lib.uoa.gr/uoa/dl/object/5309095/file.pdf"]} @@ -9254,6 +9261,8 @@ {"year":"2025","title":"A Memory Efficient Randomized Subspace Optimization Method for Training Large Language Models","authors":["Y Chen, Y Zhang, Y Liu, K Yuan, Z Wen - arXiv preprint arXiv:2502.07222, 2025"],"snippet":"The memory challenges associated with training Large Language Models (LLMs) have become a critical concern, particularly when using the Adam optimizer. To address this issue, numerous memory-efficient techniques have been proposed …","url":["https://arxiv.org/pdf/2502.07222"]} {"year":"2025","title":"A Multi-class Cyberbullying Classification on Image and Text in Code-Mixed Bangla-English Social Media Content","authors":["AC Roy, T Mahmud, T Abrar - Natural Language Processing Journal, 2025"],"snippet":"Social media platforms like Facebook, Instagram, and Twitter are widely used; users frequently share their daily lives by uploading pictures, posts, and videos, which gain significant popularity. However, social media posts often receive a mix of …","url":["https://www.sciencedirect.com/science/article/pii/S2949719125000676"]} {"year":"2025","title":"A Multi-Modal Large Language Model for Free-Form, Open-Ended, and Interactive Prediction of Properties and Mechanisms of Candidate Drug Molecules","authors":["Y Liang, R Zhang, Z Ma, D Singh, Y Li, M Huo, C Gao…"],"snippet":"Accurately predicting the mechanisms and properties of candidate drug molecules is critical for advancing drug discovery. However, existing models are often limited to structured outputs, fixed task sets, and static, one-shot predictions. 
We present …","url":["https://openreview.net/pdf?id=K12ZDGL3ik"]} +{"year":"2025","title":"A Multimodal Deep Learning Framework for Video-Based Sentiment Analysis","authors":["J Bai, AWBA Wahab - Journal of Advances in Engineering Sciences and …, 2025"],"snippet":"… Textual features were represented using 300-dimensional GloVe embeddings pretrained on Common Crawl, capturing rich semantic and syntactic information. All text inputs were tokenized at the word level and aligned with video timestamps to …","url":["https://imp.cscholar.com/index.php/JAEST/article/download/1000107/219"]} +{"year":"2025","title":"A Naive Hybrid Approach to Borrowing Detection","authors":["F Sánchez-León - Proceedings of the Iberian Languages Evaluation …, 2025"],"snippet":"In this paper, we report on our approach to automatic borrowing detection and retrieval as implemented for ADoBo 2025 shared task. Our more outstanding hypothesis is that news text is edited according to strict editing guidelines that may …","url":["https://ceur-ws.org/Vol-4098/ADoBo2025_paper4.pdf"]} {"year":"2025","title":"A Neural Network Approach to Sentiment Analysis","authors":["SK Singh, M Srivastava, N Kumar, N Singh, A Singh"],"snippet":"… In the case of pre-trained embeddings like GloVe or Word2Vec, one would download a pre-trained embeddings file (eg, 300-dimensional vectors trained on Google News or Common Crawl) and create an embedding matrix for the …","url":["https://ijctjournal.org/wp-content/uploads/2025/08/A-Neural-Network-Approach-to-Sentiment-Analysis.pdf"]} {"year":"2025","title":"A new Approach to Programming: AI Agents, LLMs, and an SQL Generation Case Study","authors":["A Adelfio - 2025"],"snippet":"The rise of Large Language Models (LLMs) and AI agents is transforming software development, introducing new paradigms in automation and human-machine collaboration. 
This thesis, conducted in collaboration with Poseidon, a company …","url":["https://webthesis.biblio.polito.it/secure/35273/1/tesi.pdf"]} {"year":"2025","title":"A New Pair of GloVes","authors":["R Carlson, J Bauer, CD Manning - arXiv preprint arXiv:2507.18103, 2025"],"snippet":"This report documents, describes, and evaluates new 2024 English GloVe (Global Vectors for Word Representation) models. While the original GloVe models built in 2014 have been widely used and found useful, languages and the world continue to …","url":["https://arxiv.org/pdf/2507.18103"]} @@ -9261,10 +9270,13 @@ {"year":"2025","title":"A Novel Approach to Automated Detection of AI-Generated Text","authors":["HM Abbas - Journal of Al-Qadisiyah for Computer Science and …, 2025"],"snippet":"Detecting machine-generated text involves identifying whether text has been created by artificial intelligence models or written by humans. This task has become increasingly significant due to the potential misuse of AI-generated text for producing …","url":["https://jqcsm.qu.edu.iq/index.php/journalcm/article/download/1958/995"]} {"year":"2025","title":"A Novel Assistant for Question-Answering from Training Video Sessions Using RAG","authors":["Q Kembellec, K Boutalbi, O Le Van - 2025 IEEE 49th Annual Computers, Software …, 2025"],"snippet":"… developed, including the Robustly Optimized BERT Pretraining Approach (RoBERTa) [24], XLM-RoBERTa (large-sized model) [12] pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages. A few models are pre-trained on …","url":["https://ieeexplore.ieee.org/abstract/document/11126594/"]} {"year":"2025","title":"A Novel Dual-Strategy Approach for Constructing Knowledge Graphs in the Home Appliance Fault Domain","authors":["D Zhang, J Zhang, Y Jia, M Liao - Algorithms, 2025"],"snippet":"Knowledge graph technology holds significant importance for efficient fault diagnosis in household appliances. 
However, the scarcity of public fault diagnosis data and the lack of automated knowledge extraction pose major challenges to …","url":["https://www.mdpi.com/1999-4893/18/8/485"]} +{"year":"2025","title":"A novel framework for phishing attack detection using domain-adapted GloVe embeddings and attention-enhanced neural sequence model","authors":["K Sruthi, SM Naik - Applied Soft Computing, 2025"],"snippet":"Phishing attacks have evolved into sophisticated cyber threats, causing catastrophic financial and reputational damage. Malicious URLs, often disguised as legitimate ones, are the primary vectors for these attacks. Current phishing detection systems …","url":["https://www.sciencedirect.com/science/article/pii/S1568494625017545"]} {"year":"2025","title":"A Paradigm Gap in Urdu","authors":["F Adeeba, R Bhatt - arXiv preprint arXiv:2509.01084, 2025"],"snippet":"In this paper, we document a paradigm gap in the combinatorial possibilities of verbs and aspect in Urdu: the perfective form of the -ya: kar construction (eg ro-ya: ki: cry-Pfv do.Pfv) is sharply ungrammatical in modern Urdu and Hindi, despite being …","url":["https://arxiv.org/pdf/2509.01084"]} {"year":"2025","title":"A Peek Behind the Curtain: Using Step-Around Prompt Engineering to Identify Bias and Misinformation in GenAI Models","authors":["D Hickerson, M Perkins - arXiv preprint arXiv:2503.15205, 2025"],"snippet":"This research examines the emerging technique of step-around prompt engineering in GenAI research, a method that deliberately bypasses AI safety measures to expose underlying biases and vulnerabilities in GenAI models. 
We discuss how …","url":["https://arxiv.org/pdf/2503.15205"]} {"year":"2025","title":"A Proposal of Post-OCR Spelling Correction Using Monolingual Byte-level Language Models","authors":["SS de Araújo, BLD Bezerra, AF de Sousa Neto - … of the 2025 ACM Symposium on …, 2025"],"snippet":"This work presents a proposal for a spelling corrector using monolingual byte-level language models (Monobyte) for the post-OCR task in texts produced by Handwritten Text Recognition (HTR) systems. We evaluate three Monobyte models …","url":["https://dl.acm.org/doi/abs/10.1145/3704268.3748673"]} +{"year":"2025","title":"A protocol for evaluating AI chatbots' capabilities for low-resource language teachers","authors":["N Swineheart, P Nguyen, E Yeh - 2025"],"snippet":"… Because of the wide variation in the amount of resources present in the Common Crawl dataset, we differentiated between the “Low” and “Extremely Low” resource groups to gain a more nuanced understanding of the differences between …","url":["https://scholarspace.manoa.hawaii.edu/bitstreams/64af4349-2b61-4638-9939-aaa24afd9ea4/download"]} {"year":"2025","title":"A review of advanced prompting techniques in Large Language Models (LLMs)","authors":["S Neupane - 2025"],"snippet":"Abstract This study investigates sophisticated prompting methods used to guide large language models more effectively. I analyzed techniques like zero-shot, CoT, ToT, and persona-based prompting for their ability to improve performance, accuracy …","url":["https://ucw.arcabc.ca/_flysystem/repo-bin/2025-08/ThesisFinal_SundarNeupane_Redacted.pdf"]} +{"year":"2025","title":"A Review of Automated Text Summarization Models on Diverse Datasets: An Evaluation Perspective","authors":["GE Egbuonu, PK Chika-Ugada, C Ndigwe, C Dimoji… - … of Science and Logics in ICT …, 2025"],"snippet":"This paper reviews Automatic Text Summarization which is one of the tasks in Natural Language Processing (NLP). It is driven by speedy increase in textual data across domains. 
The reviews systematically examined the recent advancements in …","url":["https://journals.ui.edu.ng/index.php/uijslictr/article/download/2060/1621"]} {"year":"2025","title":"A review of large language models and the recommendation task","authors":["J Munson, T Cuezze, S Nesar, D Zosso - Discover Artificial Intelligence, 2025"],"snippet":"Recommender systems are now ubiquitous across the internet, from streaming services to online shopping to social media. Traditional systems operate behind the scenes, often invisible to the end user. While these systems have enjoyed prolific …","url":["https://link.springer.com/article/10.1007/s44163-025-00334-5"]} {"year":"2025","title":"A REVIEW ON THE FUTURE OF GENERATIVE AI SYSTEMS","authors":["GK Dixit, S Kumar, H Kaur, S Choudhary, V Kumar"],"snippet":"Generative AI is reshaping industries through its ability to create new content—from text and images to audio and code—by learning patterns from vast datasets. In this paper, we examine the origins and evolution of Generative AI, explore its …","url":["https://ijrrr.com/papers18-1/V18-1-paper25-A%20Review%20on%20the%20Future%20of%20Generative%20AI%20Systems.pdf"]} {"year":"2025","title":"A REVISED TECHNIQUE TO TRAIN TERM/WORD VECTOR SPACE MODELS APPLYING THE ONTOLOGY-RELATED APPROACH","authors":["OV Palagin, VY Velychko, KS Malakhov, OS Shchurov"],"snippet":"… The most common datasets include an entire corpus of Wikipedia texts, the common crawl dataset [43], or the … [Online] Available from http://commoncrawl.org [Accessed: 03 February 2020]. 
… [Online] Available from http://commoncrawl.org [Accessed: 03 …","url":["https://nasplib.isofts.kiev.ua/server/api/core/bitstreams/dffced97-4888-4af5-a36c-2659c6c43555/content"]} @@ -9306,6 +9318,7 @@ {"year":"2025","title":"A Survey on the Impact of Pre-Trained Language Models in Sentiment Classification Task","authors":["H Gautam, A Gaur, DK Yadav - International Journal of Data Science and Analytics, 2025"],"snippet":"The evolution of pre-trained language models (PLMs) has significantly transformed the landscape of sentiment analysis, particularly in handling complex, noisy, informal, and short-text commonly found on social media. While numerous surveys have …","url":["https://link.springer.com/article/10.1007/s41060-025-00805-z"]} {"year":"2025","title":"A Systematic Analysis of Base Model Choice for Reward Modeling","authors":["K Ahrabian, P Jandaghi, N Mokhberian… - arXiv preprint arXiv …, 2025"],"snippet":"… We believe this is due to the potential occurrence of similar documents in the excluded CommonCrawl and C4 categories. Figure 7 showcases the Jensen-Shannon Distance (JSD) between different models over the scores of the entire 1M samples …","url":["https://arxiv.org/pdf/2505.10775"]} {"year":"2025","title":"A systematic review of bias detection methods for non-English word embeddings and language models","authors":["A Puttick, C Ikae, C Rigotti, E Fosch-Villaronga… - Artificial Intelligence Review, 2025"],"snippet":"Biases in applications of machine learning and artificial intelligence are a major limitation of these applications. 
Stereotypes of the society are reflected in different types of applications, including image generation, machine translation or CV ranking …","url":["https://link.springer.com/article/10.1007/s10462-025-11375-8"]} +{"year":"2025","title":"A Systematic Survey of Cultural Datasets for Equitable LLM Alignment","authors":["M Piao, L Miao, Y Liu, M He, H Ma, L Zhang, D Wei…"],"snippet":"As large language models (LLMs) are widespread worldwide, the uneven cultural alignment capabilities have become increasingly apparent. Existing LLMs generally exhibit a tendency toward Western cultural centrism, harboring stereotypes and …","url":["https://www.researchgate.net/profile/Mengyao-Piao/publication/398429883_A_Systematic_Survey_of_Cultural_Datasets_for_Equitable_LLM_Alignment/links/694216e106a9ab54f8479d9e/A-Systematic-Survey-of-Cultural-Datasets-for-Equitable-LLM-Alignment.pdf"]} {"year":"2025","title":"A systematic survey of natural language processing for the Greek language","authors":["J Bakagianni, K Pouli, M Gavriilidou, J Pavlopoulos - Patterns, 2025"],"snippet":"Comprehensive monolingual natural language processing (NLP) surveys are essential for assessing language-specific challenges, resource availability, and research gaps. However, existing surveys often lack standardized methodologies …","url":["https://www.cell.com/patterns/fulltext/S2666-3899(25)00161-8"]} {"year":"2025","title":"A Tale of LLMs and Induced Small Proxies: Scalable Agents for Knowledge Mining","authors":["S Zhang, L Yun, Z Wang, J Shang, L Peng - arXiv preprint arXiv:2510.01427, 2025"],"snippet":"At the core of Deep Research is knowledge mining, the task of extracting structured information from massive unstructured text in response to user instructions. 
Large language models (LLMs) excel at interpreting such instructions but are prohibitively …","url":["https://arxiv.org/pdf/2510.01427"]} {"year":"2025","title":"A technical background on artificial intelligence and intelligent language models","authors":["R Swier - JALTCALL Trends, 2025"],"snippet":"Stunning advancements in artificial intelligence (AI) over the last several years have undoubtedly opened new possibilities and challenges for the field of second language learning. Of course, AI is not new, and for decades it has attracted the …","url":["https://www.castledown.com/journals/jct/article/download/jct.v1n1.102412/962"]} @@ -9326,6 +9339,7 @@ {"year":"2025","title":"Adaptive Phishing Detection in Web Applications Using Ensemble Deep Learning and Feature Fusion Techniques","authors":["A Oluwaferanmi - 2025"],"snippet":"Phishing attacks represent one of the most persistent and evolving threats to web applications, often leading to severe financial loss, data breaches, and the compromise of user trust. Conventional detection techniques based on blacklists …","url":["https://www.researchgate.net/profile/Aremu-Oluwaferanmi/publication/390872160_Adaptive_Phishing_Detection_in_Web_Applications_Using_Ensemble_Deep_Learning_and_Feature_Fusion_Techniques/links/6800cb4cd1054b0207d4ddcf/Adaptive-Phishing-Detection-in-Web-Applications-Using-Ensemble-Deep-Learning-and-Feature-Fusion-Techniques.pdf"]} {"year":"2025","title":"Adaptive sorting for large keys, strings, and database rows","authors":["M Kuhrt, B Seeger, S Wild, G Graefe - … für Business, Technologie und Web (BTW 2025 …, 2025"],"snippet":"As sorting a database table may require expensive comparisons, eg, due to column count or column types such as long or international strings, optimizing the count and cost of comparisons is important. 
Adaptive sorting avoids comparisons by exploiting …","url":["https://dl.gi.de/bitstreams/e555578d-8cb8-4e2f-b0df-c3b7c8a7ac15/download"]} {"year":"2025","title":"Addressing Bias in LLMs: Strategies and Application to Fair AI-based Recruitment","authors":["A Peña, J Fierrez, A Morales, G Mancera, M Lopez… - arXiv preprint arXiv …, 2025"],"snippet":"The use of language technologies in high-stake settings is increasing in recent years, mostly motivated by the success of Large Language Models (LLMs). However, despite the great performance of LLMs, they are are susceptible to ethical concerns …","url":["https://arxiv.org/pdf/2506.11880"]} +{"year":"2025","title":"Addressing Stereotypes in Large Language Models: A Critical Examination and Mitigation","authors":["F Kazi - arXiv preprint arXiv:2511.21711, 2025"],"snippet":"… The training uses the C4 dataset, a 750 GB collection of clean English text sourced from Common Crawl, meticulously processed to remove irrelevant content. … [16] Common Crawl Foundation, Common crawl, 2024. …","url":["https://arxiv.org/pdf/2511.21711"]} {"year":"2025","title":"Advanced Implementation of a Multilevel Model for Text Summarization in Kazakh Using Pretrained Models","authors":["D Oralbekova, O Mamyrbayev, M Othman… - Engineering, Technology & …, 2025"],"snippet":"This study investigates transformer models for the task of hybrid text summarization in the Kazakh language. Using mBART, mT5, and XLM-RoBERTa models, a multilevel architecture was developed that processes text at the character, subword …","url":["https://etasr.com/index.php/ETASR/article/download/12799/5489"]} {"year":"2025","title":"Advanced Layout Analysis Models for Docling","authors":["N Livathinos, C Auer, A Nassar, RT de Lima, M Lysak… - arXiv preprint arXiv …, 2025"],"snippet":"… We have incorporated WordScape documents from the 2013 CommonCrawl snapshot into our data mix. 
However, a detailed inspection of the annotations revealed a significant semantic mismatch: WordScape’s “Table” label is frequently applied to …","url":["https://arxiv.org/pdf/2509.11720"]} {"year":"2025","title":"ADVANCED MACHINE LEARNING FRAMEWORK FOR IDENTIFYING AND MITIGATING FAKE NEWS AND MISINFORMATION PROPAGATION ON SOCIAL MEDIA …","authors":["K Zia, U Saeed, A Rauf, RH Ahmed, M Hussain - Spectrum of Engineering Sciences, 2025"],"snippet":"Fake news has become such a serious problem to everyone of all ages and backgrounds as it can deceive. The demand for accurate and reliable methods to detect misinformation has increased due to the rising reliance of individuals on …","url":["https://www.thesesjournal.com/index.php/1/article/download/1424/1081"]} @@ -9333,6 +9347,7 @@ {"year":"2025","title":"Advanced Tool Learning and Selection System (ATLASS): A Closed-Loop Framework Using LLM","authors":["MA Haque, J Williams, S Siddique, MH Islam, H Ali… - arXiv preprint arXiv …, 2025"],"snippet":"The combination of LLM agents with external tools enables models to solve complex tasks beyond their knowledge base. Human-designed tools are inflexible and restricted to solutions within the scope of pre-existing tools created by experts. 
To …","url":["https://arxiv.org/pdf/2503.10071"]} {"year":"2025","title":"Advancements in Natural Language Processing: Leveraging Transformer Models for Multilingual Text Generation","authors":["MZ Hossain, S Goyal - Pacific Journal of Advanced Engineering Innovations, 2024"],"snippet":"Background: Recent advancements in Natural Language Processing (NLP) have revolutionized text generation techniques, with Transformer models becoming the cornerstone of modern NLP tasks, particularly in multilingual text generation …","url":["https://scienceget.org/index.php/pjaei/article/download/2/14"]} {"year":"2025","title":"Advancements in Transformer-Based Models for Enhanced Hate Speech Detection in Arabic: Addressing Dialectal Variations and Cross-Platform Challenges","authors":["A Fat'hAlalim, Y Liu, Q Xie, N Ibrahim - ACM Transactions on Asian and Low …, 2025"],"snippet":"… It was trained on 2.5TB of newly created clean CommonCrawl data in 100 languages. We used the xlm-roberta-base version[60]. • AlBERT: AlBERT is a lite BERT model that presents two parameter-reduction techniques to lower memory …","url":["https://dl.acm.org/doi/pdf/10.1145/3748492"]} +{"year":"2025","title":"Advancing Bangla Machine Translation Through Informal Datasets","authors":["A Roy, R Rahaman, S Shibly, US Joy, AA Kafi… - arXiv preprint arXiv …, 2025"],"snippet":"Bangla is the sixth most widely spoken language globally, with approximately 234 million native speakers. However, progress in open-source Bangla machine translation remains limited. Most online resources are in English and often remain …","url":["https://arxiv.org/pdf/2512.13487"]} {"year":"2025","title":"Advancing EHR analysis: Predictive medication modeling using LLMs","authors":["H Alghamdi, A Mostafa - Information Systems, 2025"],"snippet":"In modern healthcare systems, the analysis of Electronic Health Records (EHR) is fundamental for uncovering patient health trends and enhancing clinical practices. 
This study aims to advance EHR analysis by developing predictive models for …","url":["https://www.sciencedirect.com/science/article/pii/S0306437925000134"]} {"year":"2025","title":"Advancing Eye-Gaze Writing Systems With Computer Vision, and Dynamic Text Suggestions","authors":["WA Shobaki - 2025"],"snippet":"Eye gaze writing, a novel interaction modality, has the potential to revolutionize communication for individuals with limited mobility. In our research, we investigated the deep learning algorithms efficiency for real-time eye gaze writing. We have …","url":["https://search.proquest.com/openview/715a7373585a2b37890db99e554bb65b/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Advancing Hindi Text Summarization: Named Entity Recognition and Content Augmentation Strategies","authors":["S Gupta, S Pal - ACM Transactions on Asian and Low-Resource …, 2025"],"snippet":"We explore advancements in Hindi text summarization, a critical area in natural language processing that aids in managing information overload. Despite a growing corpus of Hindi data, there’sa significant gap in practical summarization tools due to …","url":["https://dl.acm.org/doi/pdf/10.1145/3770073"]} @@ -9344,8 +9359,10 @@ {"year":"2025","title":"Advantageous Parameter Expansion Training Makes Better Large Language Models","authors":["N Gu, Y Chen, Z Zhang, P Fu, Z Lin, S Wang, Y Sun… - arXiv preprint arXiv …, 2025"],"snippet":"Although scaling up the number of trainable parameters in both pre-training and fine-tuning can effectively improve the performance of large language models, it also leads to increased computational overhead. 
When delving into the parameter difference, we …","url":["https://arxiv.org/pdf/2505.24241"]} {"year":"2025","title":"Adversarial Attacks against Neural Ranking Models via In-Context Learning","authors":["A Bigdeli, N Arabzadeh, E Bagheri, CLA Clarke - arXiv preprint arXiv:2508.15283, 2025"],"snippet":"… Given the large document sizes in the Common Crawl news collection and the C4 collection, we divide documents into chunks of 512 tokens with a stride of 256 tokens. We determine the relevance score of the topicdocument pair used in the re-ranking …","url":["https://arxiv.org/pdf/2508.15283"]} {"year":"2025","title":"Adversarial Learning for Cross-Lingual Word Embeddings","authors":["H Wang"],"snippet":"In the field of natural language processing, current neural network systems are hungry for labelled data. However, large amounts of human-annotated or human-corrected labelled data are only available for a limited number of languages. Previous studies …","url":["https://access.archive-ouverte.unige.ch/access/metadata/03bcac7c-3252-4ced-b696-467c93e32836/download"]} +{"year":"2025","title":"Adversarial machine learning and AI security: safeguarding Industry 4.0","authors":["V Vakula, M Noorjahan, D Sharmila, L Saxena… - Quantum Shield for AI …, 2025"],"snippet":"… A prevalent method in AI systems entails utilizing extensive datasets, approximately one trillion words, such as “Common Crawl,” to train models [14]. Common crawl compiles its vast collection by web crawling, initiated with a compilation of recognized …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=poqYEQAAQBAJ&oi=fnd&pg=PA23&dq=commoncrawl&ots=0v-jKSZ9fp&sig=0K6ahkFekbWSaNiHcbdA78nanoc"]} {"year":"2025","title":"Adversarial Speech-Text Pre-Training for Speech Translation","authors":["C Liu, L Chen, W Zhang, X Li, P Tang, M Yu, S Ghosh… - ICASSP 2025-2025 IEEE …, 2025"],"snippet":"Large-scale pre-training has been shown to benefit speech translation tasks. 
However, existing multimodal pre-training efforts rely on parallel corpora for semantic alignment, potentially limiting performance to the scale of available data …","url":["https://ieeexplore.ieee.org/abstract/document/10888294/"]} {"year":"2025","title":"AEHRC at BioLaySumm 2025: Leveraging T5 for Lay Summarisation of Radiology Reports","authors":["W Zhang, S Chandra, B Koopman, J Dowling… - Proceedings of the 24th …, 2025"],"snippet":"Biomedical texts, such as research articles and clinical reports, are often written in highly technical language, making them difficult for patients and the general public to understand. The BioLaySumm 2025 Shared Task addresses this challenge by …","url":["https://aclanthology.org/2025.bionlp-share.21.pdf"]} +{"year":"2025","title":"Affordances and limitations of using large language models to generate qualitative data about mental health perceptions in engineering","authors":["J Sanders, J Mobley IV, I Miller, NW Sochacka… - Journal of Engineering …, 2026"],"snippet":"Background Generative artificial intelligence (AI) large‐language models (LLMs) have significant potential as research tools. However, the broader implications of using these tools are still emerging. Few studies have explored using LLMs to …","url":["https://onlinelibrary.wiley.com/doi/pdf/10.1002/jee.70037"]} {"year":"2025","title":"AFRIDOC-MT: Document-level MT Corpus for African Languages","authors":["JO Alabi, IA Azime, M Zhang, C España-Bonet… - arXiv preprint arXiv …, 2025"],"snippet":"This paper introduces AFRIDOC-MT, a document-level multi-parallel translation dataset covering English and five African languages: Amharic, Hausa, Swahili, Yor\\`ub\\'a, and Zulu. 
The dataset comprises 334 health and 271 information technology news …","url":["https://arxiv.org/pdf/2501.06374"]} {"year":"2025","title":"AfroXLMR-Comet: Multilingual Knowledge Distillation with Attention Matching for Low-Resource languages","authors":["JS Raju, JS Walia, S Raghav, V Marivate - arXiv preprint arXiv:2502.18020, 2025"],"snippet":"… manually audited, general domain 3T token monolingual dataset based on CommonCrawl, spanning 419 languages. We employ a multilingual subset of the dataset that represents African languages, specifically Kinyarwanda (rw), Swahili (sw) …","url":["https://arxiv.org/pdf/2502.18020"]} {"year":"2025","title":"Age and gender distortion in online media and large language models","authors":["D Guilbeault, S Delecourt, BS Desikan - Nature, 2025"],"snippet":"Are widespread stereotypes accurate 1, 2, 3 or socially distorted 4, 5, 6? This continuing debate is limited by the lack of large-scale multimodal data on stereotypical associations and the inability to compare these to ground truth …","url":["https://www.nature.com/articles/s41586-025-09581-z"]} @@ -9353,6 +9370,7 @@ {"year":"2025","title":"Aggarwal, CC (2018). Artificial Neural Network and Deep Learning. Springer. Anderson, JA (1995). An Introduction to Neural Networks. The MIT Press. Bahdanau, D …","authors":["N Rahayu - Deep Learning: Teori, Algoritma, dan Aplikasi, 2025"],"snippet":"… Dataset besar seperti Common Crawl atau Wikipedia sering digunakan untuk melatih model ini. Hasil: model transformer seperti GPT-4 mampu menghasilkan teks yang menyerupai manusia dan melakukan berbagai tugas NLP . 
Aplikasi …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=JQBLEQAAQBAJ&oi=fnd&pg=PA56&dq=commoncrawl&ots=M1n7O9-g3C&sig=xCIpfEywcM9ogJUQLbay9gb3SMU"]} {"year":"2025","title":"AGI for the Earth, the path, possibilities and how to evaluate intelligence of models that work with Earth Observation Data?","authors":["M Valipour, K Zheng, J Lowman, S Szabados… - arXiv preprint arXiv …, 2025"],"snippet":"… available datasets such as Common Crawl3. Human language only encapsulates one dimension of intelligence in which we can encode observations, events, their interrelations, and basically any concept that can be described textually. The …","url":["https://arxiv.org/pdf/2508.06057"]} {"year":"2025","title":"Agnostic debiasing of static embeddings: An approach to fairness in language models","authors":["G Cafferata, MG Beiró - JAIIO, Jornadas Argentinas de Informática, 2025"],"snippet":"Word vector representations were the initial building block that started the current state-of-the-art methods for several NLP tasks. Bias metrics and debiasing methods for static embeddings have been studied with moderate success, achieving some …","url":["https://revistas.unlp.edu.ar/JAIIO/article/download/19793/20019"]} +{"year":"2025","title":"AI Aesthetics","authors":["G Mueller, AD Soto-Vásquez, C Music, RO Sesigür…"],"snippet":"AI Aesthetics This volume investigates the intersection of generative AI and media aesthetics from an interdisciplinary perspective. Combining in-depth theoretical reflection with a diverse selection of case studies, its authors explore the aesthetic …","url":["https://mediarep.org/server/api/core/bitstreams/07ea8c80-7f22-44d2-a171-fe3f2e54ae70/content"]} {"year":"2025","title":"AI and Productivity: Using Artificial Intelligence to Improve Processes and Unlock Potential","authors":["D Clark - 2025"],"snippet":"Is AI in your organization driving real productivity or creating costly distractions? 
AI and Productivity by Donald Clark is a strategic guide for senior leaders and people executives who want to harness artificial intelligence to improve efficiency …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=Lo6SEQAAQBAJ&oi=fnd&pg=PP1&dq=commoncrawl&ots=NX1B0cQTL7&sig=Kbi6MKt-bTDqlP-qm0QZzdmXbKc"]} {"year":"2025","title":"AI Applications for Ancient Art History Education","authors":["C Smith - AI & Antiquity, 2025"],"snippet":"… , scrubbed information from large, publicly available datasets from companies such as LAION and Common Crawl that “web crawl” the internet to process and index the information to search engines (Common Crawl, 2025). As well as other …","url":["https://ai-antiquity.org/index.php/ai/article/download/22/7"]} {"year":"2025","title":"AI as a Child in a Cage: On Mirrors, Obedience, and the Illusion of Intelligence","authors":["D Safronov"],"snippet":"This paper explores the metaphor of the child in the cage as a framework for understanding the development of artificial intelligence systems under conditions of constraint. Contemporary large language models (LLMs) are trained not on the full …","url":["https://philarchive.org/archive/SAFAAA-4"]} @@ -9361,23 +9379,29 @@ {"year":"2025","title":"AI Diffusion in Low Resource Language Countries","authors":["A Misra, SW Zamir, W Hamidouche, I Becker-Reshef… - arXiv preprint arXiv …, 2025"],"snippet":"Artificial intelligence (AI) is diffusing globally at unprecedented speed, but adoption remains uneven. Frontier Large Language Models (LLMs) are known to perform poorly on low-resource languages due to data scarcity. 
We hypothesize that this …","url":["https://arxiv.org/pdf/2511.02752"]} {"year":"2025","title":"AI Ethics in Generative AI","authors":["M Ramachandran - Engineering Ethics of AI by Design: Principles …, 2025"],"snippet":"This chapter explores the ethical dimensions of Generative AI (GenAI), focusing on its transformative role in content creation and its far-reaching societal impacts. It analyses ethical challenges such as misinformation, intellectual property violations …","url":["https://link.springer.com/chapter/10.1007/978-981-95-2909-4_16"]} {"year":"2025","title":"AI Explains: ChatGPT","authors":["A Piani - 2025"],"snippet":"In a world where technology evolves at a breakneck pace,'AI Explains: ChatGPT'offers a comprehensive exploration of one of the most transformative innovations of our time. This book delves into the intricacies of ChatGPT, a model that has redefined …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=pKBZEQAAQBAJ&oi=fnd&pg=PT7&dq=commoncrawl&ots=te9DhR1LKA&sig=iKMYvltU_AGHySSFu6qhnkvTq_M"]} +{"year":"2025","title":"AI in Conflict Resolution: Practical Considerations, Opportunities and Challenges","authors":["A Molak - Conflict Resolution Quarterly, 2025"],"snippet":"With the rise of generative artificial intelligence models, researchers and practitioners across the fields are exploring the potential opportunities such models present for their work. The promise of higher efficiency, and the idea that generative …","url":["https://onlinelibrary.wiley.com/doi/abs/10.1002/crq.70012"]} +{"year":"2025","title":"AI Powered Phishing Link Identifier for Social Media DMs","authors":["N HS, N Biswal, MG NS, G YK"],"snippet":"In today’s digital landscape, social media platforms have become a medium for cyber attackers to perform malicious activities. Attackers use Direct Messages to trick users into revealing their identities by clicking on the malicious links. 
As modern …","url":["https://ijctjournal.org/wp-content/uploads/2025/11/AI-Powered-Phishing-Link-Identifier-for-Social-Media-DMs.pdf"]} {"year":"2025","title":"AI Scaling: From Up to Down and Out","authors":["Y Wang, Y Li, C Xu - arXiv preprint arXiv:2502.01677, 2025"],"snippet":"AI Scaling has traditionally been synonymous with Scaling Up, which builds larger and more powerful models. However, the growing demand for efficiency, adaptability, and collaboration across diverse applications necessitates a broader perspective …","url":["https://arxiv.org/pdf/2502.01677"]} {"year":"2025","title":"AI tool for scientific literature data extraction","authors":["J Ronkainen - 2025"],"snippet":"The aim of this project was to explore artificial intelligence (AI) by developing a tool that leverages large language models (LLMs) to extract structured information from scientific articles. Systematic literature reviews and meta-analyses are based on …","url":["https://www.theseus.fi/bitstream/handle/10024/893515/Ronkainen_Justiina.pdf?sequence=2"]} {"year":"2025","title":"AI Tools and Technologies for Academic Research","authors":["A Szendi, D Kuttor, Z Pál - Institutional guide to using AI for research, 2025"],"snippet":"This chapter explores the landscape of Artificial Intelligence (AI) tools and technologies specifically suited for academic research, emphasizing the role of locally run Generative AI (GenAI) applications. 
By examining the structure of data …","url":["https://link.springer.com/chapter/10.1007/978-3-031-94809-1_3"]} {"year":"2025","title":"AI University Education","authors":["I Pitas"],"snippet":"• The need for such education permeates all levels of education and all social strata.• A 1/3-2/3 society, where 1/3 of the population understands and benefits from scientific progress, while the remaining 2/3 lags, being impoverished and …","url":["https://icarus.csd.auth.gr/wp-content/uploads/2025/01/AI-University-Education-v5.2.pdf"]} {"year":"2025","title":"AI-assisted German Employment Contract Review: A Benchmark Dataset","authors":["O Wardas, F Matthes - arXiv preprint arXiv:2501.17194, 2025"],"snippet":"Employment contracts are used to agree upon the working conditions between employers and employees all over the world. Understanding and reviewing contracts for void or unfair clauses requires extensive knowledge of the legal system …","url":["https://arxiv.org/pdf/2501.17194"]} +{"year":"2025","title":"AI-Assisted OSINT/SOCMINT for Safeguarding Borders: A Systematic Review","authors":["A Karakikes, K Kotis - Information, 2025"],"snippet":"In the highly volatile realm of global security, the necessity for leading-edge and effectual border resilience tactics has never been more imperative. This PRISMA 2020 guided systematic literature review (SLR) examines the intersection of artificial …","url":["https://www.mdpi.com/2078-2489/16/12/1095"]} {"year":"2025","title":"AI-Based Digital Advertising Tools in English and Business","authors":["N Singh - Application of English in Artificial Intelligence (AI) And …, 2025"],"snippet":"The integration of Artificial Intelligence (AI) into advertising has revolutionized the way businesses target consumers and create advertisements. This paper explores the use of AI-based tools in English advertising and their impact on businesses. 
By …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=yoxTEQAAQBAJ&oi=fnd&pg=PA139&dq=commoncrawl&ots=4ic10CYuuA&sig=ogRKxcqaRgcxttmJMYMrbANc2tc"]} {"year":"2025","title":"AI-Based Pronunciation Assessment and Grammatical Error Correction with Feedback for the German Language","authors":["SN Mehta, A Roth, C Munteanu, S Chandna - International Conference on Human …, 2025"],"snippet":"The rapid advancement of AI has transformed education and language learning, leading to the development of Computer Aided Language Learning (CALL) systems. These systems help learners practice reading, writing, pronunciation, and vocabulary …","url":["https://link.springer.com/chapter/10.1007/978-3-031-93415-5_23"]} +{"year":"2025","title":"AI-Driven Relevance Finder for Activations","authors":["V Malik - 2025"],"snippet":"This thesis investigates the viability of synthetic customer profiles for predicting customer engagement with marketing activations in the electronics domain. The research addresses critical challenges in customer analytics, particularly data …","url":["https://aaltodoc.aalto.fi/bitstreams/14291cb6-8995-43c9-9775-4563842f15f8/download"]} {"year":"2025","title":"AI-Generated Content and the Pollution of the Information Sphere: A Freedom of Expression Analysis under Article 10 ECHR","authors":["K Goth - 2025"],"snippet":"This thesis analyses whether and to what extent AI-generated content is protected by freedom of expression and information under Article 10 §1 ECHR, and under what conditions state interferences can be justified to protect the integrity of the …","url":["https://studenttheses.uu.nl/bitstream/handle/20.500.12932/49552/Publish%20Master%20Thesis%20Katharina%20Goth%2027th%20June%202025%200933406.pdf?sequence=1&isAllowed=y"]} {"year":"2025","title":"AI-Generated Content in Copyright Law: A Roadmap for Updating GCC Copyright Law","authors":["S Papastefanou - Innovation and Development of Knowledge Societies, 2025"],"snippet":"The rise of 
Text-to-Image Diffusion Models (TIDM) and the ensuing possibility to create complex images with a few text specifications poses a challenge to the fundamentals of Intellectual Property Law. In view of the ambitious goals of the GCC …","url":["https://www.taylorfrancis.com/chapters/edit/10.4324/9781003528517-7/ai-generated-content-copyright-law-stefan-papastefanou"]} {"year":"2025","title":"AI-generated stories favour stability over change: homogeneity and cultural stereotyping in narratives generated by gpt-4o-mini","authors":["JW Rettberg, H Wigers - Open Research Europe, 2025"],"snippet":"Can a language model trained largely on Anglo-American texts generate stories that are culturally relevant to other nationalities? To find out, we generated 11,800 stories - 50 for each of 236 countries – by sending the prompt “Write a 1500 word …","url":["https://open-research-europe.ec.europa.eu/articles/5-202"]} {"year":"2025","title":"AI-Powered Real-Time Text Editor with Multilingual Translation and Speech Recognition","authors":["A Dwivedi, S Sahu, A Srivastava - 2024 IEEE 16th International Conference on …, 2024"],"snippet":"This paper introduces a novel real-time collaborative text editor for revolutionizing information writing. We propose the development of an all-inclusive real-time collaborative text editor that does not require separate tools for information writing …","url":["https://ieeexplore.ieee.org/abstract/document/10847521/"]} {"year":"2025","title":"AI-Powered Sentiment Analytics in Banking: A BERT and LSTM Perspective.","authors":["MT Siddique, MJ Uddin, L Chambugong, AM Nijhum… - International …, 2025"],"snippet":"In recent years, the banking industry has witnessed a surge in digital feedback channels, where customers regularly share their experiences and opinions. 
Extracting meaningful insights from this unstructured data is vital for enhancing …","url":["http://www.iibajournal.org/index.php/iibeaj/article/download/65/65"]} {"year":"2025","title":"AI-Powered Transcreation in Global Marketing: Insights from Iran","authors":["G Hassani, M Malekshahi, H Davari - ELOPE: English Language Overseas …, 2025"],"snippet":"This study examines AI-powered transcreation’s role in improving cross-cultural brand communication. We employed GPT-3 to evaluate AI’s ability to enhance global marketing through improved translation and adaptation of brand messages …","url":["https://journals.uni-lj.si/elope/article/download/20627/18579"]} +{"year":"2025","title":"AICC: Parse HTML Finer, Make Models Better--A 7.3 T AI-Ready Corpus Built by a Model-Based HTML Parser","authors":["R Ma, J Qiu, C Xu, P Chu, K Liu, P Ren, Y Qu, J Peng… - arXiv preprint arXiv …, 2025"],"snippet":"… Using MinerU-HTML, we construct AICC (AI-ready Common Crawl), a 7.3-trillion token multilingual corpus from two Common Crawl snapshots. In controlled pretraining experiments where AICC and Trafilatura-extracted TfCC undergo …","url":["https://arxiv.org/pdf/2511.16397"]} {"year":"2025","title":"AIsplaining: Generative AI explains linguistic identities to me","authors":["B Carbajal-Carrera - Australian Review of Applied Linguistics, 2025"],"snippet":"… an analysis of undesirable content in the Common Crawl corpus. (Ed.),^(Eds.). 
Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language …","url":["https://www.jbe-platform.com/content/journals/10.1075/aral.24077.car"]} {"year":"2025","title":"AIxcellent Vibes at GermEval 2025 Shared Task on Candy Speech Detection: Improving Model Performance by Span-Level Training","authors":["CR Thelen, PG Blaneck, T Bornheim, N Grieger… - … 2025 Conference on …, 2025"],"snippet":"Positive, supportive online communication in social media (“candy speech”) has the potential to foster civility, yet automated detection of such language remains underexplored, limiting systematic analysis of its impact. We investigate how candy …","url":["https://serwiss.bib.hs-hannover.de/frontdoor/deliver/index/docId/3679/file/978-3-69018-016-0.pdf#page=404"]} {"year":"2025","title":"Aleph-Alpha-GermanWeb: Improving German-language LLM pre-training with model-based data curation and synthetic data generation","authors":["TF Burns, L Parcalabescu, S Wäldchen, M Barlow… - arXiv preprint arXiv …, 2025"],"snippet":"… To curate the Common Crawl data, we applied a pipeline similar to (but which we show can perform better than) FineWeb2. 
We then … By leveraging a combination of Common Crawl web data, FineWeb2, and synthetic data conditioned on organic …","url":["https://arxiv.org/pdf/2505.00022"]} {"year":"2025","title":"Algorithmic bias: sexualized violence against women in GPT-3 models","authors":["S Wyer, S Black - AI and Ethics, 2025"],"snippet":"… Common Crawl was the main contributor within the GPT-3 training dataset, and at the time of writing is the largest non-curated web corpus … CommonCrawl has been shown to present several types of explicit and abusive content regardless of filtering …","url":["https://link.springer.com/article/10.1007/s43681-024-00641-0"]} +{"year":"2025","title":"Algorithmic red teaming approaches to secure LLMs","authors":["S Jauhari - Machine Learning with Applications, 2025"],"snippet":"Algorithmic red teaming for Large Language Models (LLMs) is a crucial practice for proactively ensuring their safety and robustness. This process involves using an LLM as an adversary to test the vulnerabilities of a target LLM, which is essential for …","url":["https://www.sciencedirect.com/science/article/pii/S2666827025001987"]} {"year":"2025","title":"ALHD: A Large-Scale and Multigenre Benchmark Dataset for Arabic LLM-Generated Text Detection","authors":["A Khairallah, A Zubiaga - arXiv preprint arXiv:2510.03502, 2025"],"snippet":"We introduce ALHD, the first large-scale comprehensive Arabic dataset explicitly designed to distinguish between humanand LLM-generated texts. ALHD spans three genres (news, social media, reviews), covering both MSA and dialectal Arabic …","url":["https://arxiv.org/pdf/2510.03502"]} {"year":"2025","title":"Align-then-Slide: A complete evaluation framework for Ultra-Long Document-Level Machine Translation","authors":["J Guo, D Wei, Y Luo, X Chen, Z Wu, H Yang, H Shang… - arXiv preprint arXiv …, 2025"],"snippet":"… Our bilingual data originate from CommonCrawl. We first randomly sampled 100 document pairs that contained both source and target texts. 
After rule-based filtering to remove poorly aligned samples, professional translators selected the 50 highest-quality …","url":["https://arxiv.org/pdf/2509.03809"]} {"year":"2025","title":"Aligning LLMs for Multilingual Consistency in Enterprise Applications","authors":["A Agarwal, H Meghwani, HL Patel, T Sheng, S Ravi… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models (LLMs) remain unreliable for global enterprise applications due to substantial performance gaps between high-resource and mid/low-resource languages, driven by English-centric pretraining and internal reasoning biases. This …","url":["https://arxiv.org/pdf/2509.23659"]} @@ -9397,6 +9421,7 @@ {"year":"2025","title":"An ensemble classification model for improved performance of phishing detection system","authors":["M Sahoo, S Samanta, S Ghosh - International Journal of Information and Computer …, 2025"],"snippet":"Individuals and organisations are at risk of money losses and data compromise from phishing attempts. Traditional rule-based phishing detection methods fail to keep up with attacker strategies. The need for more advanced and adaptive phishing …","url":["https://www.inderscienceonline.com/doi/abs/10.1504/IJICS.2025.145112"]} {"year":"2025","title":"An Ensemble Machine Learning With Feature Selection Methods for Detecting Phishing Attacks","authors":["MM Rezq, KM Amin, HA Mousa"],"snippet":"… Phishing websites were obtained from Open Phish and Phish Tank, while legitimate websites were obtained from Alexa and Common Crawl. 
Next, data preprocessing was conducted, which included removing missing values, eliminating …","url":["https://ijci.journals.ekb.eg/article_392290_af4f40ca2484588b1827f82394730b7d.pdf"]} {"year":"2025","title":"An Evaluation of N-Gram Selection Strategies for Regular Expression Indexing in Contemporary Text Analysis Tasks","authors":["L Zhang, S Deep, JM Patel, K Sankaralingam - arXiv preprint arXiv:2504.12251, 2025"],"snippet":"… Since we could not locate the original dataset, we constructed a similar dataset using web pages from 2013 stored in Common Crawl [9]. We chose 2013 data because it is relatively close to 1999, ensuring that most of the regexes constructed …","url":["https://arxiv.org/pdf/2504.12251"]} +{"year":"2025","title":"An Evolutionary Overview of Large Language Models: From Statistical Methods to the Transformer Era.","authors":["B Damjanović, D Korać, D Simić, N Stamenković - Journal of Information Technology …, 2025"],"snippet":"While the early evolution of large language models (LLMs), including shift from statistical approaches to the Transformer architecture, illustrates their historical impact on the processing of natural language; however, the latest research in neural …","url":["https://search.ebscohost.com/login.aspx?direct=true&profile=ehost&scope=site&authtype=crawler&jrnl=22329625&AN=190267396&h=IglFGKxpIUzJ%2F%2BrPx7oee18ueAqz%2FDoJQb9mMa7puVmFG3XWF%2F%2B2JM0RQuNUbf4ivHO9jgXANiAZmvXi5nDnOw%3D%3D&crl=c"]} {"year":"2025","title":"An Expanded Massive Multilingual Dataset for High-Performance Language Technologies","authors":["L Burchell, O de Gibert, N Arefyev, M Aulamo, M Bañón… - arXiv preprint arXiv …, 2025"],"snippet":"Training state-of-the-art large language models requires vast amounts of clean and diverse textual data. However, building suitable multilingual datasets remains a challenge. 
In this work, we present HPLT v2, a collection of high-quality multilingual …","url":["https://arxiv.org/pdf/2503.10267"]} {"year":"2025","title":"An Explainable Artificial Intelligence Text Classifier for Suicidality Prediction in Youth Crisis Text Line Users: Development and Validation Study","authors":["J Thomas, A Lucht, J Segler, R Wundrack, M Miché… - JMIR Public Health and …, 2025","R Lieb, L Kuchinke, G Meinlschmidt"],"snippet":"… This multilingual model, trained on 2.5 TB of CommonCrawl data in 100 languages, tokenizes and encodes input text into a 768-dimensional embedding. Each message is embedded separately and then attached to an array of embedded …","url":["https://publichealth.jmir.org/2025/1/e63809","https://publichealth.jmir.org/2025/1/e63809/PDF"]} {"year":"2025","title":"An Investigation into Black and Brown Communities' Engagement with Data & Technology","authors":["E Al-Haque, G Thompson, ADR Smith, B Johnson - … of the AAAI/ACM Conference on …, 2025"],"snippet":"Over the years, we have witnessed significant biases in datasets and AI-driven systems. While these biases can impact anyone, there is a heightened risk for disproportionate harm to Black and Brown communities. Despite efforts to address …","url":["https://ojs.aaai.org/index.php/AIES/article/download/36531/38669"]} @@ -9405,6 +9430,7 @@ {"year":"2025","title":"An NLP-based System for Automated Com-pliance Analysis and Requirement Clas-sification in Engineering Applications","authors":["SDJ Lindén"],"snippet":"Regulatory compliance is a critical challenge in engineering product development, particularly in industries governed by complex and frequently evolving standards. 
This research, conducted in collaboration with Volvo Penta, explores the use of …","url":["https://odr.chalmers.se/server/api/core/bitstreams/3423d3ab-a145-4c77-ae70-615a4b1b849f/content"]} {"year":"2025","title":"An NLP-driven e-learning platform with LLMs and graph databases for personalised guidance","authors":["G Dobriţa, SV Oprea, A Bâra - Connection Science, 2025"],"snippet":"Information is ubiquitously available at our fingertips, transforming the way we learn, work and engage with the world around us. The challenge is not just accessing data but discerning its relevance and utility. This constant flow of information demands …","url":["https://www.tandfonline.com/doi/pdf/10.1080/09540091.2025.2518991"]} {"year":"2025","title":"An Outlook on the Opportunities and Challenges of Multi-Agent AI Systems","authors":["F Tian, A Luo, J Du, X Xian, R Specht, G Wang, X Bi… - arXiv preprint arXiv …, 2025"],"snippet":"Multi-agent AI systems (MAS) offer a promising framework for distributed intelligence, enabling collaborative reasoning, planning, and decision-making across autonomous agents. This paper provides a systematic outlook on the current …","url":["https://arxiv.org/pdf/2505.18397"]} +{"year":"2025","title":"An Overview of Artificial Intelligence and Machine Learning","authors":["A Singh, F Sharma, A Shastri - Artificial Intelligence for Biomass-based Biofuel …, 2026"],"snippet":"… Collections such as Common Crawl and Wikipedia dumps have propelled the advancement of language models proficient in comprehending and producing human-like content for natural language processing. 
In addition to raw datasets …","url":["https://www.taylorfrancis.com/chapters/edit/10.1201/9781003571223-4/overview-artificial-intelligence-machine-learning-adwika-singh-falguni-sharma-anshuman-shastri"]} {"year":"2025","title":"An Overview of Large Language Models for Statisticians","authors":["W Ji, W Yuan, E Getzen, K Cho, MI Jordan, S Mei… - arXiv preprint arXiv …, 2025"],"snippet":"Large Language Models (LLMs) have emerged as transformative tools in artificial intelligence (AI), exhibiting remarkable capabilities across diverse tasks such as text generation, reasoning, and decision-making. While their success has primarily been …","url":["https://arxiv.org/pdf/2502.17814"]} {"year":"2025","title":"An Overview of Large Language Models: Architectures, Emergent Abilities, and Applications","authors":["A Aslam"],"snippet":"… LLMs typically rely on massive text corpora drawn from web crawl data (eg, Common Crawl), books, and Wikipedia. Tokenization schemes such as Byte-Pair Encoding (BPE) [23] and SentencePiece [24] balance vocabulary size against …","url":["https://www.researchgate.net/profile/Mahmoud-Aljawarneh-2/publication/392356954_An_Overview_of_Large_Language_Models_Architectures_Emergent_Abilities_and_Applications/links/683eb71cdf0e3f544f5ca54b/An-Overview-of-Large-Language-Models-Architectures-Emergent-Abilities-and-Applications.pdf"]} {"year":"2025","title":"An Unsupervised Approach Based on Attentional Neural Models for Aspect-Based Sentiment Classification","authors":["L Zampierin, F Frasincar - ACM SIGAPP Applied Computing Review, 2025"],"snippet":"… In this research, we use the 300-dimensional GloVe word representations that were pre-trained on 42 billion tokens from Common Crawl [23]. This word embedding matrix contains representations for 1.9 million words. The reason for this choice is twofold. 
…","url":["https://dl.acm.org/doi/abs/10.1145/3746626.3746627"]} @@ -9424,18 +9450,23 @@ {"year":"2025","title":"Anti-Regulatory AI: How\" AI Safety\" is Leveraged Against Regulatory Oversight","authors":["RJ Yew, B Judge - arXiv preprint arXiv:2509.22872, 2025"],"snippet":"AI companies increasingly develop and deploy privacy-enhancing technologies, bias-constraining measures, evaluation frameworks, and alignment techniques -- framing them as addressing concerns related to data privacy, algorithmic fairness, and AI safety. This …","url":["https://arxiv.org/pdf/2509.22872"]} {"year":"2025","title":"APCache: An Adaptive Postings Cache in Heterogeneous Memory for Storage-Resident Search Indices","authors":["A Thai - 2025"],"snippet":"… For the large Common Crawl dataset, we observe consistent results across several repetitions of the same experiment, ie, the standard deviation is negligible. We run each query experiment twice, taking the results of the second repetition. We …","url":["https://shbakram.github.io/assets/papers/honors-thesis-anson.pdf"]} {"year":"2025","title":"Apertus: Democratizing Open and Compliant LLMs for Global Language Environments","authors":["A Hernández-Cano, A Hägele, AH Huang, A Romanou… - arXiv preprint arXiv …, 2025"],"snippet":"We present Apertus, a fully open suite of large language models (LLMs) designed to address two systemic shortcomings in today's open model ecosystem: data compliance and multilingual representation. Unlike many prior models that release …","url":["https://arxiv.org/pdf/2509.14233"]} +{"year":"2025","title":"Application of a Recurrent Neural Network Model to Prevent Phishing Attacks: A Systematic Review, Challenges and Future Work","authors":["C Oropeza, A Daza - 2025"],"snippet":"Phishing is the most popular form of attacks in cyberspace, accounting for 1,270,883 of attacks on organizations. 
The main objective of the study is to understand Recurrent Neural Networks (RNN) applications in a solution to prevent phishing …","url":["https://www.temjournal.com/content/144/TEMJournalNovember2025_2960_2971.pdf"]} {"year":"2025","title":"Application of Artificial Intelligence Technology in Gaming NPC and Existing Problems","authors":["W Wan - Proceedings of the 2025 3rd International Conference …, 2025"],"snippet":"… Therefore, in the data resource collection stage, it is necessary to download data resources from large-scale corpus in public domains such as ACL anthology corpus and Common Crawl, and crawl the massive tweets or Weibo data published by …","url":["https://www.atlantis-press.com/article/126015351.pdf"]} {"year":"2025","title":"Application of Large-Scale Corpora","authors":["X Huang - Proceedings of International Conference on Recent …, 2025"],"snippet":"With the in-depth application of natural language processing technol-ogy in complex tasks such as multilingual understanding and knowledge reason-ing, the heterogeneity, noise interference and insufficient cross-modal adaptation of large-scale …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=Ay2SEQAAQBAJ&oi=fnd&pg=PA146&dq=commoncrawl&ots=SQVGtdye3l&sig=FH5ArVDnBNQmBPtlC96rLq6dVto"]} +{"year":"2025","title":"APPLICATIONS OF LARGE LANGUAGE MODELS FOR GREEK LEGAL DOCUMENTS","authors":["EP Katsaounis"],"snippet":"Free access to judicial decisions constitutes a cornerstone of a transparent and democratic rule of law. However, the publication of such texts often conflicts with the fundamental right to privacy of the individuals involved. 
The need for anonymization …","url":["https://ir.lib.uth.gr/xmlui/bitstream/handle/11615/86983/32977.pdf?sequence=4"]} {"year":"2025","title":"APPLIED LINGUISTICS DRIVEN LARGE LANGUAGE MODEL FOR SARCASM RECOGNITION ON SOCIAL MEDIA CORPORA","authors":["AM ALASHJAEE, A ALSHAMMARI, MSA ALZAIDI… - Fractals, 2025"],"snippet":"… Facebook researchers developed FastText, a word representation tool featuring a widespread lexicon of 2 million words obtained from Common Crawl, providing supervised and unsupervised modes. Every word represents a 300-D vector space …","url":["https://www.worldscientific.com/doi/pdf/10.1142/S0218348X25400377"]} {"year":"2025","title":"Applying Artificial Intelligence in Translation","authors":["K Walter, M Agnetta"],"snippet":"Names: Walter, Katharina editor| Agnetta, Marco editor Title: Applying artificial intelligence in translation: possibilities, processes and phenomena/edited by Katharina Walter and Marco Agnetta. Description: New York, NY: Routledge, 2026 …","url":["https://www.researchgate.net/profile/Katharina-Walter-3/publication/395683563_Applying_Artificial_Intelligence_in_Translation_Possibilities_Processes_and_Phenomena/links/68d4326ddcd0a92165f17450/Applying-Artificial-Intelligence-in-Translation-Possibilities-Processes-and-Phenomena.pdf"]} {"year":"2025","title":"Applying Language Models To Patient Health Records: Acronym Expansion, Long Document Classification and Explainable Predictions","authors":["A Kashyap - 2025"],"snippet":"The health industry is experiencing a digital transformation, with Electronic Health Records (EHRs) becoming central repositories for an ever-growing volume of patient data. 
While EHR clinical notes offer rich, detailed insights into patient …","url":["https://repository.upenn.edu/bitstreams/3ae58c00-9a3d-49fc-afbb-e9f6f2212d78/download"]} {"year":"2025","title":"Applying Word Embeddings for Lithuanian Morphology: The Case of Adjectival Participles","authors":["L JANCAITĖ-SKARBALĖ, E RIMKUTĖ…"],"snippet":"This paper presents how word embeddings were used to identify adjectival Lithuanian participles. Although traditionally considered to be a form of a verb, participles in the Lithuanian language also have the characteristics of adjectives …","url":["https://www.bjmc.lu.lv/fileadmin/user_upload/lu_portal/projekti/bjmc/Contents/13_1_13_Jancaite.pdf"]} +{"year":"2025","title":"Approaches to automated NACE coding of German business activity descriptions","authors":["F Beuter, J Gussenbauer, E Minther, V Szabo… - Foundations and Advances …, 2025"],"snippet":"… These embeddings are trained on Wikipedia and Common Crawl, 8 an open repository of web-crawled data. Prior to applying these word embeddings the dimensionality reduction algorithm proposed by Raunak (2017) was applied to …","url":["https://library.oapen.org/bitstream/handle/20.500.12657/109358/1/9783032100047.pdf#page=191","https://link.springer.com/content/pdf/10.1007/978-3-032-10004-7.pdf#page=191"]} {"year":"2025","title":"Approaches to Epistemic Risk in Generative and General-Purpose AI","authors":["R Wolfe - 2025"],"snippet":"Generative and general-purpose AI systems stand poised to reshape longstanding information infrastructures and professions, ranging from search to social media to online journalism. 
Yet questions surrounding subtle biases, misinforming output …","url":["https://search.proquest.com/openview/a79603f7b5ea8f742d06dc32bdba3c66/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Arabic Cyberbullying Detection: A Comprehensive Review of Datasets and Methodologies","authors":["H Aljalaoud, K Dashtipour, A AI_Dubai - IEEE Access, 2025"],"snippet":"The freedom of speech in online spaces has substantially promoted engagement on social media platforms, where cyberbullying has emerged as a significant consequence. While extensive research has been conducted on cyberbullying …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/10966006.pdf"]} {"year":"2025","title":"Arabic Language Characteristics that Make its Automatic Processing Challenging","authors":["I Boulesnam, R Boucetti"],"snippet":"… April 2024, this is the largest Arabic corpus to date, compiled from common crawl Web Extracted Text (WET) files. It has been rigorously cleaned and de-duplicated to ensure data quality and provides a substantial resource for training authentic Arabic …","url":["https://www.iajit.org/upload/files/Arabic-Language-Characteristics-that-Make-its-Automatic-Processing-Challenging.pdf"]} {"year":"2025","title":"Arabic Language Processing","authors":["B Hdioud, SL Aouragh"],"snippet":"This volume constitutes the refereed proceedings of the 8th International Conference on Arabic Language Processing (ICALP 2023), formerly known as CITALA. 
The conference, initially scheduled for 2023, was postponed due to …","url":["https://link.springer.com/content/pdf/10.1007/978-3-031-79164-2.pdf"]} +{"year":"2025","title":"Arabic Machine-Generated Text Detection: Stylometric Analysis and Cross-Model Evaluation","authors":["MS Al-Shaibani, M Ahmed - Expert Systems with Applications, 2025"],"snippet":"Large Language Models (LLMs) have achieved unprecedented capabilities in generating human-like text, posing subtle yet significant challenges for information integrity across critical domains. This challenge becomes severe, particularly in …","url":["https://www.sciencedirect.com/science/article/pii/S0957417425042599"]} {"year":"2025","title":"Arabic Sentiment Analysis Leveraging Hybrid Word Embeddings with Deep Learning Techniques","authors":["A Alharbi, N Sharma, F Hussain - International Conference on Advanced Information …, 2025"],"snippet":"The quality of word representation is essential for achieving high performance in various Natural Language Processing (NLP) tasks. This study investigates the impact of pre-trained word embeddings on sentiment classification of Arabic text …","url":["https://link.springer.com/chapter/10.1007/978-3-031-87769-8_12"]} {"year":"2025","title":"ArabicWeb-Edu: Educational Quality Data for Arabic LLM Training","authors":["M Hawasly, MT Mohiuddin, H Mubarak, S Boughorbel - Proceedings of The Third …, 2025"],"snippet":"… We begin by sampling 1 million Arabic web documents from Common Crawl and employ a prompt-based classification strategy to assign content quality scores ranging from 0 to 5. 
These labeled examples are used to train a robust Arabic …","url":["https://aclanthology.org/2025.arabicnlp-main.36.pdf"]} +{"year":"2025","title":"AraMix: Recycling, Refiltering, and Deduplicating to Deliver the Largest Arabic Pretraining Corpus","authors":["S Alrashed, F Orabona - arXiv preprint arXiv:2512.18834, 2025"],"snippet":"… This concentration reflects shared upstream dependency on the Common Crawl In fact, the datasets derive from the same web snapshots, … We thank our lab member, Manar Alnazer, for her work on filtering datasets from Common Crawl …","url":["https://arxiv.org/pdf/2512.18834"]} {"year":"2025","title":"ARC-Encoder: learning compressed text representations for large language models","authors":["H Pilchen, E Grave, P Pérez - arXiv preprint arXiv:2510.20535, 2025"],"snippet":"… For pretraining, we use data from Common Crawl that has been filtered and processed using dactory3, keeping samples with a quality score higher than 0.2. ARC-Encoder is pretrained on approximately 2.6B tokens. 
For finetuning, we use …","url":["https://arxiv.org/pdf/2510.20535"]} {"year":"2025","title":"Architecting Intelligence: A New Era of Generative AI and Deep Learning Models","authors":["R Naveenkumar, R Sarkar, N Kumar - Journal of Data Mining and Management, 2025"],"snippet":"… In addition to algorithms and hardware, the datasets used for training such as Common Crawl, Wikipedia, ImageNet, and CelebA are critical to model performance, with preprocessing steps like tokenization and augmentation improving robustness …","url":["https://www.researchgate.net/profile/R-Naveenkumar-Raman/publication/397269032_Architecting_Intelligence_A_New_Era_of_Generative_AI_and_Deep_Learning_Models/links/690a589f4baee16591902981/Architecting-Intelligence-A-New-Era-of-Generative-AI-and-Deep-Learning-Models.pdf"]} {"year":"2025","title":"Architectural Deep Dive into Large Language Models","authors":["A Ghabi, H Hamam - Generative AI and Large Language Models …, 2025"],"snippet":"This chapter presents an in-depth exploration of Large Language Models (LLMs), a cornerstone in the field of artificial intelligence and natural language processing. Beginning with an introduction to the fundamental concepts of language models, we …","url":["https://link.springer.com/chapter/10.1007/978-3-031-90573-5_3"]} @@ -9444,18 +9475,22 @@ {"year":"2025","title":"Are Multilingual Language Models an Off-ramp for Under-resourced Languages? Will we arrive at Digital Language Equality in Europe in 2030?","authors":["G Rehm, A GrĂźtzner-Zahn, F Barth - arXiv preprint arXiv:2502.12886, 2025"],"snippet":"… In addition, except for ROOTS, all data sets are based on CommonCrawl dumps. Even ROOTS, although trying to gather data from other sources, had to complement their data with a subset of OSCAR, which is also based on CommonCrawl. 
These …","url":["https://arxiv.org/pdf/2502.12886"]} {"year":"2025","title":"Are We on the Right Way for Assessing Document Retrieval-Augmented Generation?","authors":["W Shen, M Wang, Y Wang, D Chen, J Yang, Y Wan… - arXiv preprint arXiv …, 2025"],"snippet":"… 2024), and the CommonCrawl corpus. Scanned content presents greater challenges for models due to defects introduced by the document … 2023), augmented with multilingual slides from the Commoncrawl corpus. Collected slides …","url":["https://arxiv.org/pdf/2508.03644"]} {"year":"2025","title":"Argument Mining in the Web Archive","authors":["D Wild"],"snippet":"In this thesis, we study the online discourse surrounding controversial topics through automated argument analysis. In particular, we focus on the automatic discovery of temporal trends and patterns in archived web texts, which is a novelty in the domain …","url":["https://downloads.webis.de/theses/papers/wild_2025.pdf"]} +{"year":"2025","title":"Artificial Hegemony: A Gramscian Analysis of the Societal Dynamics Surrounding ChatGPT","authors":["M Henningsen - 2025"],"snippet":"… The content of the training data for both GPT-3.5 and GPT-4 includes the ›Common Crawl‹ dataset mentioned above, which consists of a vast number of web pages, articles, and posts (Common Crawl, 2023). There has been much controversy over …","url":["https://osnadocs.ub.uni-osnabrueck.de/bitstream/ds-2025012712109/1/pics-2025-01-henningsen-artificial-hegemony.pdf"]} {"year":"2025","title":"Artificial intelligence and security: some reflections concerning the freedom of expression, information and democracy","authors":["F Fusco - International Journal of Electronic Security and Digital …, 2025"],"snippet":"In contemporary society, the role of information in socio-economic development is increasing across domains such as policy, business, technology, and society. 
Among sources of information, news holds significant sway in shaping public opinion …","url":["https://www.inderscienceonline.com/doi/abs/10.1504/IJESDF.2025.147171"]} {"year":"2025","title":"ARTIFICIAL INTELLIGENCE AS A FILTER AND AS A PHILTER1","authors":["A VIIDALEPP"],"snippet":"… However, it is known that part of the data often originates from the NGO Common Crawl that maintains copies of large part of internet content available for download and further use. GPT models are trained on part of that data, in addition to Wikipedia …","url":["https://www.researchgate.net/profile/Auli-Viidalepp/publication/394965418_Artificial_intelligence_as_a_filter_and_as_a_philter/links/68ad9c9b6327cf7b63d97127/Artificial-intelligence-as-a-filter-and-as-a-philter.pdf"]} {"year":"2025","title":"Artificial intelligence development and policy landscape","authors":["G Gensler - The Economic Consequences of the Second Trump …"],"snippet":"Artificial intelligence (AI) is one of the most transformative technologies of our times. As it further takes on pattern recognition, decision making, content generation, and complex reasoning, it will continue to create efficiencies and innovations across the …","url":["https://cepr.org/system/files/publication-files/252704-the_economic_consequences_of_the_second_trump_administration_a_preliminary_assessment.pdf#page=150"]} +{"year":"2025","title":"Artificial intelligence in intra-and interlingual translation","authors":["S Deilen, C Paasch-Kaiser, R Krüger"],"snippet":"This study sets out to investigate the potential of the large language model (LLM) GPT-4o in Plain Language and low-resource translation contexts. 
We do so in the wider context of an artificial intelligence (AI) literacy framework for translation …","url":["https://www.researchgate.net/profile/Ralph-Krueger/publication/398484658_Artificial_Intelligence_in_Intra-_and_Interlingual_Translation_-_Investigating_the_Potential_of_the_Large_Language_Model_GPT-4o_in_Plain_Language_and_Low-Resource_Translation_Contexts/links/6937eb96a1fd017989065162/Artificial-Intelligence-in-Intra-and-Interlingual-Translation-Investigating-the-Potential-of-the-Large-Language-Model-GPT-4o-in-Plain-Language-and-Low-Resource-Translation-Contexts.pdf"]} {"year":"2025","title":"Artificial intelligence in qualitative analysis: a practical guide and reflections based on results from using GPT to analyze interview data in a substance use program","authors":["Y Yang, L Ma - Quality & Quantity, 2025"],"snippet":"… The GPT-4 (current version) was trained on a vast and diverse dataset that include Common Crawl, Wikipedia, books, and a selection of high-quality licensed datasets, as well as data from various web sources that encompass a wide range of …","url":["https://link.springer.com/article/10.1007/s11135-025-02066-1"]} {"year":"2025","title":"Aspect-Based Sentiment Analysis on Amazon Product Reviews Using a Novel Hybrid Machine Learning Algorithm","authors":["TL Scott, WW Goh, NA Khan - Journal of Universal Computer Science, 2025"],"snippet":"On Amazon, buyers can submit reviews on products they have purchased. These reviews contribute to a potential buyer’s decision-making process, as buyers read reviews to decide whether to buy a product. 
Additionally, sellers depend on reviews …","url":["https://search.proquest.com/openview/b56abb612ec05bf25e801f0af69c4e9a/1?pq-origsite=gscholar&cbl=6474026"]} {"year":"2025","title":"Assessing BERT-based models for Arabic and low-resource languages in crime text classification","authors":["NK Al-harbi, M Alghieth - PeerJ Computer Science, 2025"],"snippet":"The bidirectional encoder representations from Transformers (BERT) has recently attracted considerable attention from researchers and practitioners, demonstrating notable effectiveness in various natural language processing (NLP) tasks, including …","url":["https://peerj.com/articles/cs-3017/"]} {"year":"2025","title":"Assessing Bias in AI Chatbot Responses","authors":["B Madupati"],"snippet":"AI communication in the form of chatbots has brought about a new paradigm of communication and service delivery through the use of large language models (LLMs) like GPT. However, as these technologies are applied in daily life, questions about …","url":["https://dzone.com/articles/assessing-bias-in-ai-chatbot-responses"]} {"year":"2025","title":"Assessing Gender Bias of Pretrained Bangla Language Models in STEM and SHAPE Fields","authors":["NMK Arnob, S Mahmud, AT Wasi - Proceedings of the 6th Workshop on Gender Bias …, 2025"],"snippet":"Gender bias continues to shape societal perceptions across both STEM (Science, Technology, Engineering, and Mathematics) and SHAPE (Social Sciences, Humanities, and the Arts for People and the Economy) domains. 
While existing …","url":["https://aclanthology.org/2025.gebnlp-1.24.pdf"]} +{"year":"2025","title":"Assessing Patient Education Materials for Colorectal Cancer Generated by Four Large Language Models: Readability, Quality, and Transparency Challenges","authors":["M Yuan, W Hong, R Hu, X Jiang, S Zhang - Journal of Cancer Education, 2025"],"snippet":"Colorectal cancer (CRC) necessitates effective patient education, yet patients increasingly utilize Artificial Intelligence (AI) large language models (LLMs) for health information, raising concerns about quality and accessibility. This study …","url":["https://link.springer.com/article/10.1007/s13187-025-02793-x"]} {"year":"2025","title":"Assessing the Agreement Competence of Large Language Models","authors":["AT García, L Wanner - Proceedings of the Eighth International Conference on …, 2025"],"snippet":"While the competence of LLMs to cope with agreement constraints has been widely tested in English, only a very limited number of works deals with morphologically rich (er) languages. In this work, we experiment with 25 mono-and multilingual LLMs …","url":["https://aclanthology.org/2025.depling-1.4.pdf"]} {"year":"2025","title":"Assessing the Role of Data Quality in Training Bilingual Language Models","authors":["S Seto, M ter Hoeve, M de Seyssel, D Grangier - arXiv preprint arXiv:2506.12966, 2025"],"snippet":"Bilingual and multilingual language models offer a promising path toward scaling NLP systems across diverse languages and users. However, their performance often varies wildly between languages as prior works show that adding more …","url":["https://arxiv.org/pdf/2506.12966"]} {"year":"2025","title":"Assessing Transfer Learning's Impact on Deep Learning for Image Recognition and Natural Language Processing","authors":["R Singh"],"snippet":"… For instance, ImageNet is a common source for image recognition tasks, while datasets like Wikipedia or Common Crawl are often used for NLP tasks. 
• Target Task: The specific task that the pre-trained model is adapted to, which may involve a …","url":["https://www.ijerct.com/papers/07-01/assessing-transfer-learnings-impact-on-deep-learning.pdf"]} {"year":"2025","title":"Assessing Variations in Open Datasets for Training Large Language Models: Biases and Benchmarking","authors":["V Koc - Baltic Multidisciplinary Research Letters Journal, 2025"],"snippet":"Open datasets are critical to the development and training of large language models (LLMs). However, variations in dataset composition often introduce biases that can impact model performance and reliability. This Article investigates the nature and …","url":["https://www.bmrlj.com/index.php/Baltic/article/download/51/51"]} +{"year":"2025","title":"ASSESSMENT OF THE APPLICABILITY OF THE TDM EXCEPTION USING THE METHOD OF HISTORICAL INTERPRETATION","authors":["P Žikovská - The Lawyer Quarterly, 2025"],"snippet":"… • Open-source datasets created by researchers or organizations, for example: o Common Crawl – a massive dataset from publicly crawled web data. o The Pile – a collection of text data focused on training language models. o BooksCorpus …","url":["https://tlq.ilaw.cas.cz/index.php/tlq/article/download/665/665"]} {"year":"2025","title":"Asymmetric Semantic Search Using Multi-Dimensional Vector Text Data Representation","authors":["H Rabinkin"],"snippet":"This paper aims to provide a comprehensive analysis of semantic search methodologies that leverage text embeddings and vector space models for semantics representation. 
Specifically, the objectives of this work are to: analyze the …","url":["https://www.researchgate.net/profile/Herman-Rabinkin/publication/391629733_Asymmetric_Semantic_Search_Using_Multi-Dimensional_Vector_Text_Data_Representation/links/681f52d4ded433155746531e/Asymmetric-Semantic-Search-Using-Multi-Dimensional-Vector-Text-Data-Representation.pdf"]} {"year":"2025","title":"ATLAS: Adaptive Transfer Scaling Laws for Multilingual Pretraining, Finetuning, and Decoding the Curse of Multilinguality","authors":["S Longpre, S Kudugunta, N Muennighoff, I Hsu… - arXiv preprint arXiv …, 2025"],"snippet":"Scaling laws research has focused overwhelmingly on English -- yet the most prominent AI models explicitly serve billions of international users. In this work, we undertake the largest multilingual scaling laws study to date, totaling 774 …","url":["https://arxiv.org/pdf/2510.22037"]} {"year":"2025","title":"Attention-based chatbots for low-resource language processing: A comprehensive review","authors":["GC Uzoaru, II Ayogu, AC Onyeka, J Odii - SSR Journal of Engineering and …, 2025"],"snippet":"… The process begins with pre-training on highresource languages, where models such as BERT, GPT, or mBERT learn linguistic patterns from large datasets like Wikipedia and Common Crawl[li]. 
These models develop generalized language …","url":["https://ssrpublisher.com/wp-content/uploads/2025/07/Attention-Based-Chatbots-for-Low-Resource-Language-Processing-A-Comprehensive-Review.pdf"]} @@ -9464,14 +9499,17 @@ {"year":"2025","title":"AUTO-Explorer: Automated Data Collection for GUI Agent","authors":["X Guo, D Gao, MZ Shou - arXiv preprint arXiv:2511.06417, 2025"],"snippet":"… Existing methods often involve designing automated agents that browse URLs from the Common Crawl, using webpage HTML to collect … However, this method is difficult to apply to desktop software or some newly launched websites not included …","url":["https://arxiv.org/pdf/2511.06417"]} {"year":"2025","title":"AutoClean: LLMs Can Prepare Their Training Corpus","authors":["X Shen, S Hu, X Zhang, X Han, X Meng, J Wei, Z Liu… - Proceedings of the 2025 …, 2025"],"snippet":"… The data sourced from the Internet is often aggregated into datasets like Common Crawl, which presents significant quality variability and ne… We demonstrate the efficiency and effectiveness of AutoClean on both pre-training corpora such as …","url":["https://aclanthology.org/2025.naacl-demo.9.pdf"]} {"year":"2025","title":"AutoCurate: Automating Domain-Specific Dataset Curation for Large Language Models","authors":["A Gupta - 2025"],"snippet":"… Other methods filter domain-specific data from public corpora (eg, CommonCrawl), but typically use static keyword filters or manual rules, … When working with petabytescale corpora (eg, the 300TB-scale CommonCrawl), this efficiency …","url":["https://repository.gatech.edu/bitstreams/2c5d0fc4-5779-441a-9bb4-9958c514375d/download"]} +{"year":"2025","title":"AutoFocus-IL: VLM-based Saliency Maps for Data-Efficient Visual Imitation Learning without Extra Human Annotations","authors":["L Gong, F Bahrani, Y Zhou, A Banayeeanzade, J Li… - arXiv preprint arXiv …, 2025"],"snippet":"AutoFocus-IL is a simple yet effective method to improve data efficiency and generalization in visual 
imitation learning by guiding policies to attend to task-relevant features rather than distractors and spurious correlations. Although saliency …","url":["https://arxiv.org/pdf/2511.18617"]} {"year":"2025","title":"AutoGUI: Scaling GUI Grounding with Automatic Functionality Annotations from LLMs","authors":["H Li, J Chen, J Su, Y Chen, Q Li, Z Zhang - arXiv preprint arXiv:2502.01977, 2025"],"snippet":"… To amass these trajectories, we utilize the latest Common Crawl repository as the data source for web UIs and Android Emulator for mobile UIs. Note that illegal websites and Apps are excluded manually from the sources to ensure no …","url":["https://arxiv.org/pdf/2502.01977"]} {"year":"2025","title":"Automated Classification and Identification of Non-Functional Requirements in Agile-Based Requirements Using Pre-Trained Language Models","authors":["A Alhaizaey, M Al-Mashari - IEEE Access, 2025"],"snippet":"Non-functional requirements (NFRs) are critical factors for software quality and success. A frequently reported challenge in agile requirements engineering is that NFRs are often neglected due to the focus on functional requirements (FRs) and the …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/11005451.pdf"]} {"year":"2025","title":"Automated detection of cryptocurrency investment scams at scale","authors":["J Atondo Siu - 2025"],"snippet":"The ecosystem of cryptocurrencies has grown and changed significantly since Bitcoin’s inception in 2008 (Nakamoto, 2008). 
Similarly, the number of people using cryptocurrencies as a means of investment, speculation and form of payment has …","url":["https://www.repository.cam.ac.uk/bitstreams/87fe348c-b720-4be7-b84d-e6e36812fc42/download"]} {"year":"2025","title":"Automated Log Analysis: Failure Prediction and Anomaly Detection Using Machine Learning and Large Language Models","authors":["F Hadadi - 2025"],"snippet":"The dependability of modern software systems is becoming increasingly crucial as their complexity and scope continue to grow. Log data recorded during system execution can be leveraged to predict failures and detect anomalies automatically …","url":["https://ruor.uottawa.ca/bitstreams/18e45d5f-c975-4ac2-b128-9e74d6e89284/download"]} +{"year":"2025","title":"Automated Multiple-Choice Question Generation and Analysis for Language Learning Assessment","authors":["V Raina - 2025"],"snippet":"This thesis investigates the application of natural language processing (NLP) techniques to the development and evaluation of language assessment tasks in computer-assisted language learning. It focuses on three interrelated areas …","url":["https://www.repository.cam.ac.uk/bitstreams/59c9ac84-0b96-46b6-a4f3-c6556be7d280/download"]} {"year":"2025","title":"Automated Semantic Labeling and Clustering of Product Claims and Ingredients Using Machine Learning: Leveraging LLMs to Automate Previously Time-Consuming …","authors":["F Fritzen - 2025"],"snippet":"This thesis is conducted at Knightec Group in Sweden for the client company Bintix. It is in the domain of unsupervised learning and natural language processing. 
The goal is to obtain a condensed list of consumer claims and ingredients used when …","url":["https://www.diva-portal.org/smash/get/diva2:1987386/FULLTEXT01.pdf"]} {"year":"2025","title":"Automated Speech Act Classification in Offensive German Language Tweets","authors":["M Plakidis, ELG Rehm - Abusive Language: Linguistic Resources, Methods and …, 2025"],"snippet":"En matière de détection de discours de haine et de langage offensant, l’intégration des connaissances sur les actes de langage représente une voie de recherche encore peu explorée. Dans nos précédents travaux, nous avons analysé si la …","url":["https://aclanthology.org/anthology-files/pdf/tal/2024.tal-3.0.pdf#page=75"]} {"year":"2025","title":"Automated Speech Markers of Alzheimer Dementia: Test of Cross-Linguistic Generalizability","authors":["PA Pérez-Toro, FJ Ferrante, G Pérez, BL Tee… - Journal of Medical Internet …, 2025"],"snippet":"… These were trained separately on each language’s Common Crawl and Wikipedia corpora under an identical configuration (Continuous Bag of Words) with position-weights, 300 dimensions, character n-grams (length 5), window size 5, and …","url":["https://www.jmir.org/2025/1/e74200/"]} {"year":"2025","title":"Automated User Story Generation in Requirements Elicitation using Fine-Tuned Large Language Models","authors":["A Chitlangia - 2025"],"snippet":"In software development, generating user stories from requirements elicitation interviews is a critical yet time-consuming and subjective task. Traditional methods often rely heavily on human interpretation, which can introduce bias and limit …","url":["https://studenttheses.uu.nl/bitstream/handle/20.500.12932/49759/MastersThesis_AkshayChitlangia_MBI.pdf?sequence=1"]} +{"year":"2025","title":"Automatic Animacy Classification for Latvian Nouns","authors":["R Brutans, J Bloem"],"snippet":"We introduce the first automatic animacy classifier for the Latvian language. 
Animacy, a linguistic feature indicating whether a noun refers to a living entity, plays an important role in Latvian grammatical structures and syntactic agreement, but …","url":["https://acl-bg.org/proceedings/2025/GlobalNLP%202025/pdf/2025.globalnlp-1.11.pdf"]} {"year":"2025","title":"Automatic Association of Quality Requirements and Quantifiable Metrics for Cloud Security Certification","authors":["J Bianchi, S Dong, L Petrillo, M Petrocchi - arXiv preprint arXiv:2503.09460, 2025"],"snippet":"… The feature extractor chosen is FastText5, which is pre-trained on English texts from Wikipedia and Common Crawl. Data cleaning, such as removing stop words, is done before the feature vector computation to eliminate irrelevant information. Then …","url":["https://arxiv.org/pdf/2503.09460"]} {"year":"2025","title":"Automatic Control With Human-Like Reasoning","authors":["J Andriuskevicius - 2024"],"snippet":"Recent developments in language models have created new opportunities in air traffic control studies. The current focus is primarily on text and language-based use cases. However, these language models may offer a higher potential impact in the …","url":["https://repository.tudelft.nl/file/File_cce81e83-df06-4c16-90b8-b015381d7ee4"]} {"year":"2025","title":"Automatic Distractor Generation with Paradigmatic Relation for English Vocabulary Tests","authors":["H Setiawan, I Hidayah, SS Kusumawardani - … on Smart Computing, IoT and Machine …, 2025"],"snippet":"Automatic distractor generation (ADG) is a computer-based system that generates incorrect answers for multiple-choice questions to assist teachers to create educational assessments. 
It has been implemented in various subjects and …","url":["https://ieeexplore.ieee.org/abstract/document/11081171/"]} @@ -9481,6 +9519,7 @@ {"year":"2025","title":"Automatic Text Summarization for Hindi Language Using Word Embeddings: A Critical Review","authors":["SA Khan, M Mudasir, HA Khanday - … Conference on Cognitive Robotics and Intelligent …, 2025"],"snippet":"… 5) XLM-R [22]: 100 languages were used to train XLM-R (a transformer-based masked language model) with more than two terabytes of filtered data from CommonCrawl It performs significantly better than mBERT on a range of cross-lingual benchmarks. …","url":["https://ieeexplore.ieee.org/abstract/document/11086272/"]} {"year":"2025","title":"Automatic Urdu Grammar Error Correction: Harnessing the Power of Head Pruning for LLMs","authors":["M Hussain, W Ramay, MH Akbar, MN Zafar, T Rashid - International Journal of …, 2025"],"snippet":"… In our proposed approach T5 leverages much larger CommonCrawl web data across languages and uses a larger 250k Sentence-Piece vocabulary to improve sub word coverage of Urdu Text. 
Beyond masked language modeling, T5 is pre-trained …","url":["https://journals.cfrit.com/index.php/ijisct/article/download/121/65"]} {"year":"2025","title":"Automatic XPath generation agents for vertical websites by LLMs","authors":["J Huang, J Song - Journal of King Saud University Computer and …, 2025"],"snippet":"… 2022a), which is trained on Products and Movies data from Common Crawl Footnote 1 using approximately 200,000 annotated samples, experiences a sharp decline in EM score from around 75 to below 20 when applied to extracting date and …","url":["https://link.springer.com/article/10.1007/s44443-025-00071-w"]} +{"year":"2025","title":"Automating Data Collection to Support Conflict Analysis: Scraping the Internet for Monitoring Hourly Conflict in Sudan","authors":["Y Masri, AS Malarvizhi, S Ahmed, T Stover, Z Wang… - Cloud Computing and Data …, 2026"],"snippet":"The ongoing conflicts in Sudan have escalated rapidly, highlighting the critical need for timely and accurate data to inform humanitarian responses, policy decisions, and research needs. While existing datasets such as the Armed Conflict Location & …","url":["https://ojs.wiserpub.com/index.php/CCDS/article/download/8226/3864"]} {"year":"2025","title":"Automation of ETL Pipelines in DataStage","authors":["AU Benedetti - 2025"],"snippet":"… + English Wikipedia 2,500M words)[16] instead GPT leverage web-scale data , the original GPT is trained in BookCorpus, GPT-2 is trained on 40+GB of WebText and GPT-3 (175 billion parameters) is trained on around 300 billion tokens from …","url":["https://webthesis.biblio.polito.it/secure/35326/1/tesi.pdf"]} {"year":"2025","title":"AutoMixer: Checkpoint Artifacts as Automatic Data Mixers","authors":["E Chang, Y Li, P Huber, D Kant, Y Shi, V Chandra - arXiv preprint arXiv:2506.21910, 2025"],"snippet":"In language model training, it is desirable to equip models with capabilities from various tasks. 
However, it is not clear how to directly obtain the right data mixtures for these capabilities as the relationship between data and tasks is difficult to be …","url":["https://arxiv.org/pdf/2506.21910"]} {"year":"2025","title":"AutoSchemaKG: Autonomous Knowledge Graph Construction through Dynamic Schema Induction from Web-Scale Corpora","authors":["J Bai, W Fan, Q Hu, Q Zong, C Li, HT Tsang, H Luo… - arXiv preprint arXiv …, 2025"],"snippet":"… 2024) pretraining corpus across three diverse subsets, English Wikipedia, paper abstracts from Semantic Scholar, and 3% of Common Crawl data, we construct the ATLAS family of knowledge graphs (ATLAS-Wiki, ATLAS-Pes2o, and ATLAS-CC) …","url":["https://arxiv.org/pdf/2505.23628"]} @@ -9491,6 +9530,7 @@ {"year":"2025","title":"Bachelor's Thesis Computing Science","authors":["S Stammen, B Lin, P van Bommel - 2025","T van der Straaten, D Hiemstra, TM Heskes - 2025"],"snippet":"… The Open Web Index (OWI) [11] research aims to address this problem by improving the ability to build and maintain an index for a general collection of the size of the common crawl. Conceptually, having an open index that allows anyone to …","url":["https://www.cs.ru.nl/bachelors-theses/2025/Sem_Stammen___1089370___Package_hierarchy_recovery_using_word_embeddings_for_flattened_remodularized_Java_systems.pdf","https://www.cs.ru.nl/bachelors-theses/2025/Timo_van_der_Straaten___1059302___Integrating_Static_Index_Pruning_methods_into_Zoekeend.pdf"]} {"year":"2025","title":"Back-translation effects on static and contextual word embeddings for topic classification embedding in classification tasks","authors":["D Držík, L Kelebercová - PloS one, 2025"],"snippet":"This study investigates the impact of back-translation on topic classification, comparing its effects on static word vector representations (FastText) and contextual word embeddings (RoBERTa). 
Our objective was to determine whether back-translation …","url":["https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0330622"]} {"year":"2025","title":"Balancing Computation Load and Representation Expressivity in Parallel Hybrid Neural Networks","authors":["MM Moradi, W Ahmed, S Wen, S Mudur, W Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"Attention and State-Space Models (SSMs) when combined in a hybrid network in sequence or in parallel provide complementary strengths. In a hybrid sequential pipeline they alternate between applying a transformer to the input and then feeding …","url":["https://arxiv.org/pdf/2505.19472"]} +{"year":"2025","title":"Balancing Innovation and Data Protection-An analysis of legal basis under the GDPR for the training of artificial intelligence","authors":["A Almammadova - 2025"],"snippet":"This master thesis investigates the legal foundations for training of artificial intelligence (AI) models using personal data, with an emphasis on compliance with the European Union’s General Data Protection Regulation (GDPR). The core issue …","url":["https://lup.lub.lu.se/luur/download?func=downloadFile&recordOId=9215790&fileOId=9215791"]} {"year":"2025","title":"BALANCING SPEED AND PERFORMANCE WITH LAYER FREEZING STRATEGIES FOR TRANSFORMER MODELS","authors":["B Kairatuly, A Shomanov - 2025","B Kairatuly, A Shomanov - Scientific Journal of Astana IT University, 2025"],"snippet":"In this paper, we evaluated different approaches to freezing BERT-base layers and analyzed their impact on the quality and speed of training in the task of named entity recognition in two languages. 
Layer freezing is an optimization technique in deep …","url":["https://journal.astanait.edu.kz/index.php/ojs/article/download/779/249","https://sj.astanait.edu.kz/wp-content/uploads/2025/07/10-779.pdf"]} {"year":"2025","title":"BAMBI Goes to School: Evaluating Italian BabyLMs with Invalsi-ITA","authors":["L Capone, A Suozzi, GE Lebani, A Lenci - 2025"],"snippet":"This paper explores the impact of ecologically and cognitively plausible data on the training of language models. It builds on prior work [1, 2] integrating child-directed speech, curriculum learning and instruction tuning to train Italian BabyLMs. To …","url":["https://clic2025.unica.it/wp-content/uploads/2025/09/15_main_long.pdf"]} {"year":"2025","title":"BAMG: A Block-Aware Monotonic Graph Index for Disk-Based Approximate Nearest Neighbor Search","authors":["H Li, J Xu - arXiv preprint arXiv:2509.03226, 2025"],"snippet":"Approximate Nearest Neighbor Search (ANNS) over high-dimensional vectors is a foundational problem in databases, where disk I/O often emerges as the dominant performance bottleneck at scale. Existing graph indexing solutions for disk-based …","url":["https://arxiv.org/pdf/2509.03226"]} @@ -9514,6 +9554,7 @@ {"year":"2025","title":"Benchmarking MLLM-based Web Understanding: Reasoning, Robustness and Safety","authors":["J Liu, J Xiao, W Tang, W Wang, Z Wang, M Zhang, S Yu - arXiv preprint arXiv …, 2025"],"snippet":"Multimodal large language models (MLLMs) are increasingly positioned as AI collaborators for building complex web-related applications like GUI agents and front-end code generation. However, existing benchmarks largely emphasize visual …","url":["https://arxiv.org/pdf/2509.21782"]} {"year":"2025","title":"Benchmarking State of the Art Website Embedding Methods for Effective Processing and Analysis in the Public Sector","authors":["J Gerber, J Saxer, B Kreiner, A Weiler - 2025"],"snippet":"The ability to understand and process websites is crucial across various domains. 
It lays the foundation for machine understanding of websites. Specifically, website embedding proves invaluable when monitoring local government websites within …","url":["https://www.researchsquare.com/article/rs-5664280/latest.pdf"]} {"year":"2025","title":"Benchmarking Synonym Extraction Methods in Domain-Specific Contexts","authors":["S Taghinezhad Roudbaraki - 2025"],"snippet":"… Pre-trained GloVe embeddings are loaded from pretrained GloVe model which was trained on 42 billion tokens from the Common Crawl dataset that resulted in creating 1.9 million vocabularies and their 300-dimensional vectors. fastText. Also …","url":["https://webthesis.biblio.polito.it/secure/36445/1/tesi.pdf"]} +{"year":"2025","title":"BERnaT: Basque Encoders for Representing Natural Textual Diversity","authors":["E Azurmendi, JF de Landa, J Bengoetxea, M Heredia… - arXiv preprint arXiv …, 2025"],"snippet":"Language models depend on massive text corpora that are often filtered for quality, a process that can unintentionally exclude non-standard linguistic varieties, reduce model robustness and reinforce representational biases. 
In this paper, we argue that …","url":["https://arxiv.org/pdf/2512.03903"]} {"year":"2025","title":"BERT-Based Automation of Job Safety Analysis for industrial workplace compliance.","authors":["P NUNPHAKDEE, J INTHIAM - 2025"],"snippet":"Our center focuses on understanding the critical role of Process Safety Management (PSM), emphasizing the safety of both personnel and control systems through software that detects hazards such as fires and unauthorized movements within …","url":["http://202.44.33.99/dspace/bitstream/123456789/117/1/s6501073810059.pdf"]} {"year":"2025","title":"BERT-Based Intrusion Detection System for RF Jamming Attacks in Vehicular Network","authors":["W Nujitha - 2025"],"snippet":"As vehicular networks continue to evolve toward increased connectivity and autonomy, they become more vulnerable to cybersecurity threats, particularly Radio Frequency (RF) jamming attacks that can severely disrupt communication systems …","url":["https://brocku.scholaris.ca/bitstreams/896ad05d-ccbd-4c5f-8cfe-230061b51b82/download"]} {"year":"2025","title":"BERT-based Models for Keyword Extraction from Arabic Scientific Articles","authors":["B Babayigit, H Sattuf, M Abubaker - ACM Transactions on Asian and Low-Resource …, 2025"],"snippet":"Keywords at the beginning of research articles are crucial for conveying the content and main ideas of academic works. They serve as essential tools for researchers to efficiently search for relevant topics. The integration of traditional natural language …","url":["https://dl.acm.org/doi/pdf/10.1145/3761805"]} @@ -9526,6 +9567,7 @@ {"year":"2025","title":"Beyond Decoder-only: Large Language Models Can be Good Encoders for Machine Translation","authors":["Y Luo, T Zheng, Y Mu, B Li, Q Zhang, Y Gao, Z Xu… - arXiv preprint arXiv …, 2025"],"snippet":"… Note that due to the extensive bilingual data in the En-De CommonCrawl corpus, we only sampled a portion and merged it with other data to create a dataset of 50M. 
For En-Cs, we excluded the CzEng 2.0 dataset due to licensing issues. …","url":["https://arxiv.org/pdf/2503.06594"]} {"year":"2025","title":"Beyond English: Evaluating Automated Measurement of Moral Foundations in Non-English Discourse with a Chinese Case Study","authors":["CY Cheng, SA Hale - arXiv preprint arXiv:2502.02451, 2025"],"snippet":"This study explores computational approaches for measuring moral foundations (MFs) in non-English corpora. Since most resources are developed primarily for English, cross-linguistic applications of moral foundation theory remain limited. Using …","url":["https://arxiv.org/pdf/2502.02451"]} {"year":"2025","title":"Beyond Facts: Evaluating Intent Hallucination in Large Language Models","authors":["Y Hao, H Yu, J You - arXiv preprint arXiv:2506.06539, 2025"],"snippet":"When exposed to complex queries containing multiple conditions, today's large language models (LLMs) tend to produce responses that only partially satisfy the query while neglecting certain conditions. We therefore introduce the concept of …","url":["https://arxiv.org/pdf/2506.06539"]} +{"year":"2025","title":"Beyond Fast and Slow: Cognitive-Inspired Elastic Reasoning for Large Language Models","authors":["J Hu, D Yang, L Bian, Z Wen, Y Wang, Y Chen, B Xiao… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models (LLMs) have demonstrated impressive performance across various language tasks. However, existing LLM reasoning strategies mainly rely on the LLM itself with fast or slow mode (like o1 thinking) and thus struggle to balance …","url":["https://arxiv.org/pdf/2512.15089"]} {"year":"2025","title":"Beyond Fixed Length: Bucket Pre-training is All You Need","authors":["Q Yang, Q Peng, H Liu, K Liu, B Qin, T Liu"],"snippet":"Large Language Models (LLMs) have demonstrated exceptional performance across various tasks, with pre-training stage serving as the cornerstone of their capabilities. 
However, the conventional fixed-length data composition strategy for pre-training …","url":["https://ijcai-preprints.s3.us-west-1.amazonaws.com/2025/5804.pdf"]} {"year":"2025","title":"Beyond Line-Level Filtering for the Pretraining Corpora of LLMs","authors":["C Park, S Park, Y Ahn, J Kim, J Park, J Lee - arXiv preprint arXiv:2510.24139, 2025"],"snippet":"… To this end, this paper proposes a method called pattern-aware line filtering for extracting relevant text from the CommonCrawl WET dataset. Our approach is based on two rules: pattern-aware line-level deduplication (PLD) and pattern-aware trailing-punctuation …","url":["https://arxiv.org/pdf/2510.24139"]} {"year":"2025","title":"Beyond Reactive Safety: Risk-Aware LLM Alignment via Long-Horizon Simulation","authors":["C Sun, D Zhang, CX Zhai, H Ji - arXiv preprint arXiv:2506.20949, 2025"],"snippet":"Given the growing influence of language model-based agents on high-stakes societal decisions, from public policy to healthcare, ensuring their beneficial impact requires understanding the far-reaching implications of their suggestions. We …","url":["https://arxiv.org/pdf/2506.20949"]} @@ -9537,6 +9579,9 @@ {"year":"2025","title":"Beyond the Explicit: A Bilingual Dataset for Dehumanization Detection in Social Media","authors":["D Assenmacher, P Piot, K Laken, D Jurgens, C Wagner - arXiv preprint arXiv …, 2025"],"snippet":"Digital dehumanization, although a critical issue, remains largely overlooked within the field of computational linguistics and Natural Language Processing. 
The prevailing approach in current research concentrating primarily on a single aspect of …","url":["https://arxiv.org/pdf/2510.18582"]} {"year":"2025","title":"Beyond the Final Layer: Intermediate Representations for Better Multilingual Calibration in Large Language Models","authors":["E Zhou, C Zhang, T Hu, C Li, N Collier, I Vulić… - arXiv preprint arXiv …, 2025"],"snippet":"Confidence calibration, the alignment of a model's predicted confidence with its actual accuracy, is crucial for the reliable deployment of Large Language Models (LLMs). However, this critical property remains largely under-explored in multilingual …","url":["https://arxiv.org/pdf/2510.03136"]} {"year":"2025","title":"Beyond the Final Layer: Using Intermediate Representations to Improve Multilingual Calibration","authors":["E Zhou, C Zhang, T Hu, C Li, N Collier, I Vulić… - … Interpretability Workshop at …"],"snippet":"Confidence calibration, the alignment between a model's predicted confidence and its empirical correctness, is crucial for the trustworthiness of Large Language Models (LLMs). Previous studies on multilingual calibration mainly use machine-translated …","url":["https://openreview.net/pdf?id=njDdcLInXP"]} +{"year":"2025","title":"Beyond the West: A Survey of Cultural Datasets for Culturally-Grounded LLMs","authors":["M Piao, L Miao, Y Liu, M He, H Ma, L Zhang, D Wei…"],"snippet":"As large language models (LLMs) are widespread worldwide, the uneven cultural alignment capabilities have become increasingly apparent. 
Existing LLMs generally exhibit a tendency toward Western cultural centrism, harboring stereotypes and …","url":["https://www.researchgate.net/profile/Mengyao-Piao/publication/398429883_Beyond_the_West_A_Survey_of_Cultural_Datasets_for_Culturally-Grounded_LLMs/links/69363ea07e61d05b530c8465/Beyond-the-West-A-Survey-of-Cultural-Datasets-for-Culturally-Grounded-LLMs.pdf"]} +{"year":"2025","title":"Beyond URLs: Metadata Diversity and Position for Efficient LLM Pretraining","authors":["D Fan, D Hashemi, SP Karimireddy, M Jaggi - arXiv preprint arXiv:2511.21613, 2025"],"snippet":"… Large language models (LLMs) are typically pretrained on web-scale corpora sourced from Common Crawl–style snapshots and related aggregates, then aggressively filtered and deduplicated to improve quality and efficiency. Landmark …","url":["https://arxiv.org/pdf/2511.21613"]} +{"year":"2025","title":"Beyond Vector Search: Querying With and Without Predicates","authors":["J Xie, JX Yu, S Teng, Y Liu - Proceedings of the ACM on Management of Data, 2025"],"snippet":"k-ANN search has been extensively studied to find k approximate nearest neighbors for a given query vector in a high-dimensional dataset, where a data item is represented as a vector. As there are many new emerging real-world applications …","url":["https://dl.acm.org/doi/abs/10.1145/3769765"]} {"year":"2025","title":"BeyondWeb: Lessons from Scaling Synthetic Data for Trillion-scale Pretraining","authors":["P Maini, V Dorna, P Doshi, A Carranza, F Pan… - arXiv preprint arXiv …, 2025"],"snippet":"Recent advances in large language model (LLM) pretraining have shown that simply scaling data quantity eventually leads to diminishing returns, hitting a data wall. 
In response, the use of synthetic data for pretraining has emerged as a …","url":["https://arxiv.org/pdf/2508.10975"]} {"year":"2025","title":"BhashaKritika: Building Synthetic Pretraining Data at Scale for Indic Languages","authors":["G Manoj, NP Rachamalla, A Kulkarni, G Rajeev… - arXiv preprint arXiv …, 2025"],"snippet":"In the context of pretraining of Large Language Models (LLMs), synthetic data has emerged as an alternative for generating high-quality pretraining data at scale. This is particularly beneficial in low-resource language settings where the benefits of …","url":["https://arxiv.org/pdf/2511.10338"]} {"year":"2025","title":"Bias Analysis and Mitigation through Protected Attribute Detection and Regard Classification","authors":["T Udagawa, Y Zhao, H Kanayama, B Bhattacharjee - arXiv preprint arXiv:2504.14212, 2025"],"snippet":"… In our experiments, we apply the pipeline to a subset of Common Crawl, the most widely used corpus for LLM pretraining. For bias analysis… In our experiments, we apply our bias analysis and mitigation measures on a subset of Common Crawl (CC) …","url":["https://arxiv.org/pdf/2504.14212"]} @@ -9552,6 +9597,7 @@ {"year":"2025","title":"BitNet: 1-bit Pre-training for Large Language Models","authors":["H Wang, S Ma, L Ma, L Wang, W Wang, L Dong… - Journal of Machine …, 2025"],"snippet":"The increasing size of large language models (LLMs) has posed challenges for deployment and raised concerns about environmental impact due to high energy consumption. 
Previous research typically applies quantization after pre-training …","url":["http://www.jmlr.org/papers/volume26/24-2050/24-2050.pdf"]} {"year":"2025","title":"BlockFFN: Towards End-Side Acceleration-Friendly Mixture-of-Experts with Chunk-Level Activation Sparsity","authors":["C Song, W Zhao, X Han, C Xiao, Y Chen, Y Li, Z Liu… - arXiv preprint arXiv …, 2025"],"snippet":"To alleviate the computational burden of large language models (LLMs), architectures with activation sparsity, represented by mixture-of-experts (MoE), have attracted increasing attention. However, the non-differentiable and inflexible routing …","url":["https://arxiv.org/pdf/2507.08771"]} {"year":"2025","title":"BloomWise: Enhancing problem-solving capabilities of LLMs using Bloom's-Taxonomy-inspired prompts","authors":["ME Zoumpoulidi - 2025"],"snippet":"The limited ability of Large Language Models (LLMs) in mathematics—a skill critical for solving complex problems—has garnered significant interest from the research community. Many approaches have employed in-context learning to improve LLMs’ …","url":["https://dspace.lib.ntua.gr/xmlui/bitstream/handle/123456789/61930/diploma_thesis_zoumpoulidi_final.pdf?sequence=1"]} +{"year":"2025","title":"Blu-WERP (Web Extraction and Refinement Pipeline): A Scalable Pipeline for Preprocessing Large Language Model Datasets","authors":["S Rupesh, S Kumar, V Chaithanya - arXiv preprint arXiv:2511.18054, 2025"],"snippet":"… the quality of Common Crawl WARC files for LLM training. We demonstrate that Blu-WERP significantly outperforms established baselines including DCLM across multiple model scales and evaluation benchmarks. 
Our pipeline processes Common …","url":["https://arxiv.org/pdf/2511.18054"]} {"year":"2025","title":"Boundary-making practices: LLMs and an artifactual production of objectivity","authors":["M An - AI & SOCIETY, 2025"],"snippet":"… For instance, Common Crawl, a filtered subset of which is used in GPT-3, is heavily skewed toward English-language content, including 46% of the 2023 version (‘Common Crawl’ 2025). This English dominance perpetuates Western …","url":["https://link.springer.com/article/10.1007/s00146-025-02409-4"]} {"year":"2025","title":"Break the Checkbox: Challenging Closed-Style Evaluations of Cultural Alignment in LLMs","authors":["M Kabir, A Abrar, S Ananiadou - arXiv preprint arXiv:2502.08045, 2025"],"snippet":"A large number of studies rely on closed-style multiple-choice surveys to evaluate cultural alignment in Large Language Models (LLMs). In this work, we challenge this constrained evaluation paradigm and explore more realistic, unconstrained …","url":["https://arxiv.org/pdf/2502.08045"]} {"year":"2025","title":"Breaking Memory Limits: Gradient Wavelet Transform Enhances LLMs Training","authors":["Z Wen, P Luo, J Wang, X Deng, J Zou, K Yuan, T Sun… - arXiv preprint arXiv …, 2025"],"snippet":"… The C4 English benchmark is a colossal, cleaned version of Common Crawl’s web crawl corpus based on the Common Crawl dataset. Includes 305GB of English-language text and is mainly intended to pretrain language models. The validation complexity is …","url":["https://arxiv.org/pdf/2501.07237"]} @@ -9563,7 +9609,9 @@ {"year":"2025","title":"BRoverbs--Measuring how much LLMs understand Portuguese proverbs","authors":["TS Almeida, GK Bonás, JGA Santos - arXiv preprint arXiv:2509.08960, 2025"],"snippet":"Large Language Models (LLMs) exhibit significant performance variations depending on the linguistic and cultural context in which they are applied. 
This disparity signals the necessity of mature evaluation frameworks that can assess their …","url":["https://arxiv.org/pdf/2509.08960"]} {"year":"2025","title":"Building a Rich Dataset to Empower the Persian Question Answering Systems","authors":["M Yazdinejad, M Kaedi - arXiv preprint arXiv:2412.20212, 2024"],"snippet":"… XLM-RoBERTa [40] is a relatively new and big interlingual language model based on RoBERTa and has been trained on 100 languages on CommonCrawl filtered by 2.5 TB. Unlike other XLM models, XLM-RoBERTa doesn’t need language …","url":["https://arxiv.org/pdf/2412.20212"]} {"year":"2025","title":"Building AI Agents with LLMs, RAG, and Knowledge Graphs: A practical guide to autonomous and modern AI agents","authors":["S Raieli, G Iuculano - 2025"],"snippet":"Master LLM fundamentals to advanced techniques like RAG, reinforcement learning, and knowledge graphs to build, deploy, and scale intelligent AI agents that reason, retrieve, and act autonomously Key Features Implement RAG and knowledge …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=bcNqEQAAQBAJ&oi=fnd&pg=PR1&dq=commoncrawl&ots=asfBuMXvnf&sig=a7yuGs8eNUWBHhlJfgRvR7V0U2Q"]} +{"year":"2025","title":"Building and Training a GPT Model: A Comprehensive Code Tutorial","authors":["OO Khalifa - 2025"],"snippet":"The rapid advancement of artificial intelligence particularly in the field of large language models has fundamentally transformed how machines understand and generate human language. 
Among these models, Generative Pre-trained …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=_aeiEQAAQBAJ&oi=fnd&pg=PA172&dq=commoncrawl&ots=GPzAO_jQ6Z&sig=5Dzwtmr--Y9Oq8c2Oh_8ZZC40ds"]} {"year":"2025","title":"Building Data Infrastructure for Low-Resource Languages","authors":["SKK Luger, R Mosquera, PO Suarez - Proceedings of the Eighth Workshop on …, 2025"],"snippet":"… ; a strategic collaboration with the Common Crawl Foundation to enhance web crawling capabil… We expect the best submissions to be incorporated in the stack used by Common Crawl, as … While Common Crawl already annotates their crawls …","url":["https://aclanthology.org/2025.loresmt-1.14.pdf"]} +{"year":"2025","title":"Building Domain-Specific Small Language Models via Guided Data Generation","authors":["A Kumar, EM Amin, XY Lee, L Vidyaratne, AK Farahat… - arXiv preprint arXiv …, 2025"],"snippet":"Large Language Models (LLMs) have shown remarkable success in supporting a wide range of knowledge-intensive tasks. In specialized domains, there is growing interest in leveraging LLMs to assist subject matter experts with domain-specific …","url":["https://arxiv.org/pdf/2511.21748"]} {"year":"2025","title":"Building High-Quality Datasets for Portuguese LLMs: From Common Crawl Snapshots to Industrial-Grade Corpora","authors":["T Sales Almeida, R Nogueira, H Pedrini - arXiv e-prints, 2025","TS Almeida, R Nogueira, H Pedrini - arXiv preprint arXiv:2509.08824, 2025"],"snippet":"The performance of large language models (LLMs) is deeply influenced by the quality and composition of their training data. 
While much of the existing work has centered on English, there remains a gap in understanding how to construct …","url":["https://arxiv.org/pdf/2509.08824","https://ui.adsabs.harvard.edu/abs/2025arXiv250908824S/abstract"]} {"year":"2025","title":"Building Transformer-Based Conversational Agents Capable of Sentiment Detection and Human-Like Dialogue Generation","authors":["L Harris - 2025"],"snippet":"The rapid advancement of transformer-based architectures has significantly transformed the capabilities of conversational agents, enabling them to generate coherent, context-aware, and human-like dialogues. This paper explores the …","url":["https://www.researchgate.net/profile/Lorenzaj-Harris/publication/393946101_Building_Transformer-Based_Conversational_Agents_Capable_of_Sentiment_Detection_and_Human-Like_Dialogue_Generation/links/6880ff00078693798454131f/Building-Transformer-Based-Conversational-Agents-Capable-of-Sentiment-Detection-and-Human-Like-Dialogue-Generation.pdf"]} {"year":"2025","title":"Building Trust in AI via Safe and Responsible Use of LLMs","authors":["A Bhattacharjee - 2025"],"snippet":"Artificial Intelligence (AI), and more recently Generative AI technologies like large language models (LLMs), have become pervasive, influencing diverse areas of society and reshaping the way complex tasks are approached. 
The rapid evolution …","url":["https://search.proquest.com/openview/f7450259477192413b65783205aede58/1?pq-origsite=gscholar&cbl=18750&diss=y"]} @@ -9582,6 +9630,7 @@ {"year":"2025","title":"Capturing the Effects of Quantization on Trojans in Code LLMs","authors":["A Hussain, SAMK Zarkouei, MRI Rabin, MA Alipour… - arXiv preprint arXiv …, 2025"],"snippet":"… The pretraining dataset used to generate Llama mostly comprise of web crawl data from the English CommonCrawl [19] and C4 [20] datasets (82%), along with data from Wikipedia, Github, StackExchange, ArXiv, Gutenberg, and Books3 [17] …","url":["https://arxiv.org/pdf/2505.14200"]} {"year":"2025","title":"Caregiver-in-the-Loop AI: A Simulation-Based Feasibility Study for Dementia Task Verification","authors":["J Lai, D Black, K Beaton, B Ye, A Mihailidis - arXiv preprint arXiv:2508.18267, 2025"],"snippet":"Caregivers of people living with dementia (PLwD) experience stress when verifying whether tasks are truly completed, even with digital reminder systems. Generative AI, such as GPT-4, may help by automating task verification through follow-up …","url":["https://arxiv.org/pdf/2508.18267"]} {"year":"2025","title":"Causal Investigation of Tense Encoding in Multilingual Transformer","authors":["AE Tumurchuluun - 2025"],"snippet":"This thesis investigates how multilingual decoder-only transformers encode simple past, present, and future tenses across typologically diverse languages and whether isolating those temporal subspaces enables the controlled steering of generated text …","url":["https://dspace.cuni.cz/bitstream/handle/20.500.11956/203278/120518679.pdf?sequence=1"]} +{"year":"2025","title":"Causal Relation Extraction from Text using Transformers","authors":["AMD da Silva - 2025"],"snippet":"Causality plays a vital role in natural language understanding tasks such as event prediction, information retrieval, summarization, sentiment analysis, and question-answering. 
This importance is reflected in the fact that a significant portion of queries in search …","url":["https://repositorio-aberto.up.pt/bitstream/10216/171331/2/751101.pdf"]} {"year":"2025","title":"CCAgent: Coordinating Collaborative Data Scaling for Operating System Agents via Web3","authors":["L Chen, H Zhao, Y Huang, Y Luo, T Lin, W Xie, R Wu… - Proceedings of the 34th …, 2025"],"snippet":"The current AI revolution, fueled by Large Language Models (LLMs), heavily relies on vast open-access internet data. However, the Operating System (OS) Agent field faces a significant data sparsity challenge due to the lack of public data collection …","url":["https://dl.acm.org/doi/abs/10.1145/3746252.3761392"]} {"year":"2025","title":"CCI4. 0: A Bilingual Pretraining Dataset for Enhancing Reasoning in Large Language Models","authors":["G Liu, L Wang, J Li, Y Yu, Y Xu, J Chen, Y Bai, F Liao… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce CCI4.0, a large-scale bilingual pre-training dataset engineered for superior data quality and diverse human-like reasoning trajectory. 
CCI4.0 occupies roughly $35$ TB of disk space and comprises two sub-datasets: CCI4.0-M2-Base …","url":["https://arxiv.org/pdf/2506.07463"]} {"year":"2025","title":"Centroid analysis: Inferring concept representations from open-ended word responses","authors":["A Petrenco, F Günther, A Petrenco"],"snippet":"The present research proposes and evaluates a novel method-centroid analysis-for measuring representations and concepts at both individual and group levels by mapping open-ended responses onto a pre-existing semantic vector space …","url":["https://www.researchgate.net/profile/Aliona-Petrenco/publication/391476810_Centroid_analysis_Inferring_concept_representations_from_open-ended_word_responses/links/681de28bbfbe974b23c52b79/Centroid-analysis-Inferring-concept-representations-from-open-ended-word-responses.pdf"]} @@ -9589,12 +9638,16 @@ {"year":"2025","title":"Challenges and opportunities of automated essay scoring for low-proficient L2 English writers","authors":["V De Wilde, O De Clercq - Assessing Writing, 2025"],"snippet":"Assessing students’ writing can be a challenging activity. To make writing assessment more feasible, researchers have investigated the possibilities of automated essay scoring (AES). Most studies investigating AES have focused on L1 …","url":["https://www.sciencedirect.com/science/article/pii/S1075293525000698"]} {"year":"2025","title":"Challenges in Zero-Shot and Few-Shot Learning for Complex Queries","authors":["M Clement"],"snippet":"Zero-shot and few-shot learning have emerged as promising approaches for enabling machine learning models to generalize to novel tasks with minimal or no task-specific training data. 
However, applying these techniques to complex queries …","url":["https://www.researchgate.net/profile/Mateo-Clement/publication/388959619_Challenges_in_Zero-Shot_and_Few-Shot_Learning_for_Complex_Queries/links/67ae439f8311ce680c61c953/Challenges-in-Zero-Shot-and-Few-Shot-Learning-for-Complex-Queries.pdf"]} {"year":"2025","title":"Characterizing Bias: Benchmarking Large Language Models in Simplified versus Traditional Chinese","authors":["H Lyu, J Luo, J Kang, A Koenecke - arXiv preprint arXiv:2505.22645, 2025"],"snippet":"… We operationalize this by retrieving the frequency of each name’s occurrence in the Common Crawl web crawl corpus.Then, for each LLM and each prompting language variant (English, Simplified Chinese, or Traditional Chinese), we examine …","url":["https://arxiv.org/pdf/2505.22645"]} +{"year":"2025","title":"Characterizing Mamba's Selective Memory using Auto-Encoders","authors":["T Hossain, RL Logan IV, G Jagadeesan, S Singh… - arXiv preprint arXiv …, 2025"],"snippet":"State space models (SSMs) are a promising alternative to transformers for language modeling because they use fixed memory during inference. However, this fixed memory usage requires some information loss in the hidden state when processing …","url":["https://arxiv.org/pdf/2512.15653"]} {"year":"2025","title":"Charting the Landscape of African NLP: Mapping Progress and Shaping the Road Ahead","authors":["JO Alabi, MA Hedderich, DI Adelani, D Klakow - arXiv preprint arXiv:2505.21315, 2025"],"snippet":"With over 2,000 languages and potentially millions of speakers, Africa represents one of the richest linguistic regions in the world. 
Yet, this diversity is scarcely reflected in state-of-the-art natural language processing (NLP) systems and large …","url":["https://arxiv.org/pdf/2505.21315"]} +{"year":"2025","title":"Chasing Shadows: Pitfalls in LLM Security Research","authors":["J Evertz, N Risse, N Neuer, A Müller, P Normann… - arXiv preprint arXiv …, 2025"],"snippet":"… To investigate whether these commits may have been absent from the model’s pre-training data, we searched all Common Crawl … While we cannot rule out the use of private or proprietary data sources, the absence of PrimeVulrelated content in Common …","url":["https://arxiv.org/pdf/2512.09549"]} {"year":"2025","title":"ChatGPT and L2 Chinese writing: evaluating the impact of model version and prompt language on automated corrective feedback","authors":["CTY Yang, HHJ Chen - Computer Assisted Language Learning, 2025"],"snippet":"… As of 2022, the year ChatGPT-3.5 was officially announced, CommonCrawlFootnote 1 contained approximately 52 billion pages in English compared to 6.8 billion pages in Chinese, revealing a stark imbalance in language representation. This uneven …","url":["https://www.tandfonline.com/doi/abs/10.1080/09588221.2025.2453205"]} +{"year":"2025","title":"ChatGPT as a news recommender system: Measuring source types and diversity across different interfaces","authors":["T Schatto-Eckrodt, L Liebig, M Reiss, R Geislinger… - 2025"],"snippet":"This study examines to what extent ChatGPT’s responses to news-seeking prompts reflect exposure diversity in news sources, paying particular attention to whether publishers with licensing agreements are systematically privileged in outputs. 
Based …","url":["https://osf.io/download/wjzp3/"]} {"year":"2025","title":"ChatGPT based credit rating and default forecasting","authors":["J Lin, S Lai, H Yu, R Liang, J Yen - Journal of Data, Information and Management, 2025"],"snippet":"… In particular, the recent revolutionary GPT-3 has as many as 175 billion parameters, and more than 80% of the data comes from network information such as Common Crawl, WebText2, and Wikipedia, as shown in Table 1. It is capable of …","url":["https://link.springer.com/article/10.1007/s42488-025-00143-6"]} {"year":"2025","title":"ChatGPT in human resource management: A systematic review of influential factors, processes, and outcomes","authors":["B Li, Y Cheng - Heliyon, 2025"],"snippet":"This study employs a systematic literature review methodology to examine the integration of ChatGPT into human resource management (HRM). By synthesizing 115 articles, the paper maps the current research landscape of ChatGPT in HRM …","url":["https://www.cell.com/heliyon/fulltext/S2405-8440(25)02446-6"]} {"year":"2025","title":"ChatGPT or A Silent Everywhere Helper: A Survey of Large Language Models","authors":["A Akhtarshenas, A Dini, N Ayoobi - arXiv preprint arXiv:2503.17403, 2025"],"snippet":"Large Language Models (LLMs) have revo lutionized natural language processing Natural Language Processing (NLP), with Chat Generative Pre-trained Transformer (ChatGPT) standing out as a notable exampledue to its advanced capabilities and widespread …","url":["https://arxiv.org/pdf/2503.17403"]} {"year":"2025","title":"ChatGPT's security risk and its legal countermeasures","authors":["L Ruonan, C Liang - 동아법학, 2025"],"snippet":"… , it is clear from its May 2020 article that the company primarily uses data from the CommonCrawl corpus, the WebText corpus, Wikipedia pages, and books for training21). 
The CommonCrawl corpus is a large dataset of raw web pages …","url":["https://www.dbpia.co.kr/Journal/articleDetail?nodeId=NODE12105411"]} +{"year":"2025","title":"Cheap science, real harm: the cost of replacing human participation with synthetic data","authors":["A Birhane - Synthetic Data Workshop at the Aarhus 2025 …, 2025"],"snippet":"… an analysis of undesirable content in the common crawl corpus. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2 …","url":["https://synthetic-data-workshop.github.io/papers/13.pdf"]} {"year":"2025","title":"Chitrakshara: A Large Multilingual Multimodal Dataset for Indian languages","authors":["S Khan, A Faraz, A Ravi, M Nauman, M Sarfraz… - CVPR 2025 Workshop Vision …"],"snippet":"… To address this gap, we introduce the Chitrakshara dataset series, covering 11 Indian languages sourced from Common Crawl. It comprises (1) … We begin by gathering 95 Common Crawl dumps spanning the years 2013 to 2023. Unlike …","url":["https://openreview.net/pdf?id=CHrzyIKfPd"]} {"year":"2025","title":"Chitrarth: Bridging Vision and Language for a Billion People","authors":["S Khan, A Tarun, A Ravi, A Faraz, PK Pokala…"],"snippet":"Recent multimodal foundation models are primarily trained on English or high resource European language data, which hinders their applicability to other medium and low-resource languages. To address this limitation, we introduce Chitrarth (Chitra …","url":["https://cdn.olaelectric.com/krutrim/18_chitrarth.pdf"]} {"year":"2025","title":"CHRONOBERG: Capturing Language Evolution and Temporal Awareness in Foundation Models","authors":["N Hegde, S Paul, L Joel-Frey, M Brack, K Kersting… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models (LLMs) excel at operating at scale by leveraging social media and various data crawled from the web. 
Whereas existing corpora are diverse, their frequent lack of long-term temporal structure may however limit an LLM's ability …","url":["https://arxiv.org/pdf/2509.22360"]} @@ -9607,12 +9660,15 @@ {"year":"2025","title":"Cite Pretrain: Retrieval-Free Knowledge Attribution for Large Language Models","authors":["Y Huang, S Chen, J Pei, M Zaheer, B Dhingra - arXiv preprint arXiv:2506.17585, 2025"],"snippet":"… Due to frequent title duplication—especially in Common Crawl and RepliQA—we adopt a renaming strategy using an LLM. For each duplicated title, we iteratively rename the document until all titles are unique. We also perform cross-source …","url":["https://arxiv.org/pdf/2506.17585"]} {"year":"2025","title":"CLaMP 3: Universal Music Information Retrieval Across Unaligned Modalities and Unseen Languages","authors":["P Signal","S Wu, Z Guo, R Yuan, J Jiang, S Doh, G Xia, J Nam… - arXiv preprint arXiv …, 2025"],"snippet":"CLaMP 3 is a unified framework developed to address challenges of cross-modal and cross-lingual generalization in music information retrieval. Using contrastive learning, it aligns all major music modalities--including sheet music, performance …","url":["https://arxiv.org/pdf/2502.10362","https://openreview.net/pdf?id=caX0HrfIMa"]} {"year":"2025","title":"CLAPnq:[underline] C [/underline] ohesive [underline] L [/underline] ong-form [underline] A [/underline] nswers from [underline] P [/underline] assages in Natural …","authors":["S Rosenthal, A Sil, R Florian, S Roukos - Transactions of the Association for …, 2025"],"snippet":"… However, they use sentence-level matching (by encoding sentences for semantic similarity comparisons) to retrieve up to top 7 documents from Common Crawl while avoiding exact matches as the abstractive dataset. 
In the extractive version, the …","url":["https://search.proquest.com/openview/1aec201a9f9b3a555cb2bc070dfc1edf/1?pq-origsite=gscholar&cbl=6535866"]} +{"year":"2025","title":"Classification of worldwide news articles by perceived quality, 2018-2024","authors":["C McElroy, TEA de Oliveira, C Brogly - arXiv preprint arXiv:2511.16416, 2025"],"snippet":"… using a newly created dataset of 1,412,272 English news articles from the Common Crawl over 2018-2024. Expert consensus ratings on 579 source websites were … In that work, content was manually scraped in comparison to being taken …","url":["https://arxiv.org/pdf/2511.16416"]} {"year":"2025","title":"CLIMB: CLustering-based Iterative Data Mixture Bootstrapping for Language Model Pre-training","authors":["S Diao, Y Yang, Y Fu, X Dong, D Su, M Kliegl, Z Chen… - arXiv preprint arXiv …, 2025"],"snippet":"Pre-training datasets are typically collected from web content and lack inherent domain divisions. For instance, widely used datasets like Common Crawl do not include explicit domain labels, while manually curating labeled datasets such as …","url":["https://arxiv.org/pdf/2504.13161"]} {"year":"2025","title":"Cloze Encounters: The Impact of Pirated Data Access on LLM Performance","authors":["S Jia, A Nagaraj - 2025"],"snippet":"Large Language Models (LLMs) have demonstrated remarkable capabilities in text generation, but their performance may be influenced by the datasets on which they are trained, including potentially unauthorized or pirated content. We investigate the …","url":["https://www.abhishekn.com/s/f213210.pdf"]} {"year":"2025","title":"CMA-ECG: Cross-modal attention for enhanced ECG quality assessment and denoising","authors":["HD Nguyen, PD Tri - Physiological Measurement, 2025"],"snippet":"Objective: Electrocardiogram (ECG) analysis is vital for the diagnosis of cardiac conditions and monitoring human physiological states. 
However, challenges such as signal perturbations, inconsistent quality, and signal inference undermine the …","url":["https://iopscience.iop.org/article/10.1088/1361-6579/ae15e4/meta"]} +{"year":"2025","title":"CMSC 373 Artificial Intelligence Fall 2025 20-LLMs","authors":["D Kumar - 2025"],"snippet":"… • Common crawl, snapshots of the entire web produced by the nonprofit Common Crawl with billions of pages …","url":["https://cs.brynmawr.edu/Courses/cs373/Fall2025/Slides/20-LLMs.pdf"]} {"year":"2025","title":"CMVC+: a Multi-View Clustering Framework for Open Knowledge Base Canonicalization via Contrastive Learning","authors":["Y Yang, W Shen, J Shu, Y Liu, E Curry, G Li - IEEE Transactions on Knowledge and …, 2025"],"snippet":"Open information extraction (OIE) methods extract plenty of OIE triples from unstructured text, which compose large open knowledge bases (OKBs). Noun phrases and relation phrases in such OKBs are not canonicalized, which leads to …","url":["https://ieeexplore.ieee.org/abstract/document/10891880/"]} {"year":"2025","title":"Code Blue: The Threat of Synthetic Data Use to Generative Medical AI","authors":["AB Cyphert, VK Blake - Houston Journal of Health Law & Policy, 2025"],"snippet":"In the field of health care, artificial intelligence (AI) has massive potential to improve and save lives. 1 AI systems have been used in the health care field for many years now, diagnosing, screening, treating, predicting illnesses and injuries, and enabling …","url":["https://houstonhealthlaw.scholasticahq.com/article/128625.pdf"]} {"year":"2025","title":"CodeArena: Evaluating and Aligning CodeLLMs on Human Preference","authors":["J Yang, J Yang, W Zhang, J Ke, Y Miao, L Zhang… - Proceedings of the 2025 …, 2025"],"snippet":"We present CodeArena to emulate the complexity/diversity of real-world coding tasks, spanning 40 categories and 44 PLs. A 20B diverse synthetic instruction corpus is created by scaling instructions to help Qwen2. 
5-SynCoder achieve SOTA …","url":["https://aclanthology.org/2025.emnlp-main.489.pdf"]} +{"year":"2025","title":"CodeSimpleQA: Scaling Factuality in Code Large Language Models","authors":["J Yang, W Zhang, Y Li, S Guo, H Wang, A Liu, G Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models (LLMs) have made significant strides in code generation, achieving impressive capabilities in synthesizing code snippets from natural language instructions. However, a critical challenge remains in ensuring LLMs …","url":["https://arxiv.org/pdf/2512.19424"]} {"year":"2025","title":"CODNet: Context-based object detection network for multimodal image captioning and virtual question answering","authors":["C Gupta, NS Gill, P Gulia, G Pau - Image and Vision Computing, 2025"],"snippet":"Present MLL (Multimodal Large Language) models do exceptionally well in computer vision tasks, such as answering virtual questions and captioning images. However, they are not up to par in important perceptual tasks like object detection …","url":["https://www.sciencedirect.com/science/article/pii/S0262885625003567"]} {"year":"2025","title":"CoLA: Compute-Efficient Pre-Training of LLMs via Low-Rank Activation","authors":["Z Liu, R Zhang, Z Wang, Z Yang, P Hovland, B Nicolae… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models (LLMs) are revolutionizing many science and engineering fields. However, their huge model sizes impose extremely demanding needs of computational resources in the pre-training stage. Although low-rank factorizations …","url":["https://arxiv.org/pdf/2502.10940"]} {"year":"2025","title":"Collaborative Growth: When Large Language Models Meet Sociolinguistics","authors":["D Nguyen - Language and Linguistics Compass, 2025"],"snippet":"Large Language Models (LLMs) have dramatically transformed the AI landscape. They can produce remarkable fluent text and exhibit a range of natural language understanding and generation capabilities. 
This article explores how LLMs might be …","url":["https://compass.onlinelibrary.wiley.com/doi/pdf/10.1111/lnc3.70010"]} @@ -9627,9 +9683,11 @@ {"year":"2025","title":"CommonForms: A Large, Diverse Dataset for Form Field Detection","authors":["J Barrow - arXiv preprint arXiv:2509.16506, 2025"],"snippet":"… We use Common Crawl as a wellspring of PDFs and apply a rigorous cleaning process. This cleaning process results in improved data efficiency compared to using every PDF with a form field. To train the FFDNet family of models, we cast the …","url":["https://arxiv.org/pdf/2509.16506"]} {"year":"2025","title":"Communicative Coordination in Child-Caregiver Interactions","authors":["A Agrawal - 2025"],"snippet":"Communication is a complex task requiring interlocutors to simultaneously coordinate on multiple levels. This ability to coordinate communication goes hand-in-hand with a child’s ability to acquire language since social interactions play a crucial role …","url":["https://theses.hal.science/tel-05357256/document"]} {"year":"2025","title":"Comparative Analysis Based on DeepSeek, ChatGPT, and Google Gemini: Features, Techniques, Performance, Future Prospects","authors":["A Rahman, SH Mahir, MTA Tashrif, AA Aishi, MA Karim… - arXiv preprint arXiv …, 2025"],"snippet":"… Primary sources for general linguistic coverage included Common Crawl and WebText, and BooksCorpus was used for long-form structured text data [30], [31]. Domain-specific corpora like PubMed and arXiv were essential for testing the …","url":["https://arxiv.org/pdf/2503.04783"]} +{"year":"2025","title":"Comparative Analysis of 47 Context-Based Question Answer Models Across 8 Diverse Datasets","authors":["M Muneeb, DB Ascher, AB Bakht"],"snippet":"Context-based question answering (CBQA) models provide more accurate and relevant answers by considering the contextual information. 
They effectively extract specific information given a context, making them functional in various applications …","url":["https://www.researchgate.net/profile/Muhammad-Muneeb-5/publication/398078193_Comparative_Analysis_of_47_Context-Based_Question_Answer_Models_Across_8_Diverse_Datasets/links/6929b709f4878b75fc7a371e/Comparative-Analysis-of-47-Context-Based-Question-Answer-Models-Across-8-Diverse-Datasets.pdf"]} {"year":"2025","title":"Comparative Analysis of Differentiated Approaches to Utilizing AI for Subverting Stereotypes","authors":["X Feng, M Murakami - Journal of Advances in Information Technology, 2025"],"snippet":"Limited or superficial knowledge about others can foster stereotypes and prejudice. Consequently, this study explores specific methods to counteract these stereotypes. We posit that the key to challenging stereotypes lies in acquiring relevant knowledge …","url":["https://www.researchgate.net/profile/Xiaohan-Feng-3/publication/389961295_Comparative_Analysis_of_Differentiated_Approaches_to_Utilizing_AI_for_Subverting_Stereotypes/links/67dcb94835f7044c924df6c5/Comparative-Analysis-of-Differentiated-Approaches-to-Utilizing-AI-for-Subverting-Stereotypes.pdf"]} {"year":"2025","title":"Comparative Analysis of Embedding Models for Hindi-English Code-Mixed University related queries","authors":["O Ingale, S Margaj - The Voice of Creative Research, 2025"],"snippet":"This study presents a comparative analysis of open source embedding models for developing a understanding Hindi-English code-mixed language on university related questions. 
With the increasing adoption of conversational agents in Indian …","url":["http://www.thevoiceofcreativeresearch.com/index.php/vcr/article/download/110/124"]} {"year":"2025","title":"Comparative Analysis of Encoder-Based and Decoder-Based Architectures for Automatic Conspiracy Theory Identification","authors":["K Gupta"],"snippet":"This study evaluates the performance of encoderbased and decoder-based architectures for the Automatic Conspiracy Theory Identification (ACTI) task, focusing on Subtask A, which involves detecting conspiratorial content in Telegram posts. I …","url":["https://www.researchgate.net/profile/Kartik-Gupta-51/publication/389515689_Comparative_Analysis_of_Encoder-Based_and_Decoder-Based_Architectures_for_Automatic_Conspiracy_Theory_Identification/links/67c60a4e207c0c20faa02cb2/Comparative-Analysis-of-Encoder-Based-and-Decoder-Based-Architectures-for-Automatic-Conspiracy-Theory-Identification.pdf"]} +{"year":"2025","title":"Comparative Analysis of IndoBERT, IndoBERTweet, and XLM-RoBERTa for Detecting Online Gambling Comments on YouTube","authors":["K Iansyah, AL Nurlaili, MM Al Haromainy - bit-Tech, 2025"],"snippet":"… This model is trained using 2.5 TB of data from CommonCrawl, making it one of the most versatile models for cross-lingual tasks [22]. XLM-RoBERTa was applied for multilingual hoax news detection and achieved 94.51% accuracy, demonstrating …","url":["https://jurnal.kdi.or.id/index.php/bt/article/download/3257/1715"]} {"year":"2025","title":"Comparative analysis of transformer models for sentiment classification of UK CBDC discourse on X","authors":["G Kaur, S Haraldsson, A Bracciali - Discover Analytics, 2025"],"snippet":"Sentiment analysis is critical in understanding public perceptions of evolving currencies such as central bank digital currencies (CBDCs). 
This study compares three transformer-based models—DistilBERT, RoBERTa, and XLM-RoBERTa—for …","url":["https://link.springer.com/article/10.1007/s44257-025-00035-4"]} {"year":"2025","title":"Comparative Assessment of Large Language Model-Driven Recommendation Systems in Smart Spaces","authors":["S Panarin - 2025"],"snippet":"Large Language Models (LLMs) are revolutionizing the field of data analysis and the management of Big Data. These models, powered by deep learning and advanced neural network architectures, are able to process large amounts of text data to …","url":["https://helda.helsinki.fi/server/api/core/bitstreams/429cf166-5a14-4900-bbb3-1bc860f44013/content"]} {"year":"2025","title":"COMPARATIVE SWOT ANALYSIS OF AUTOMATIC TEXT CORRECTION METHODS","authors":["IAS Kizi - Строительство и образование, 2025"],"snippet":"The objective of the article \"Comparative SWOT Analysis of Automatic Text\" is as follows: The objective of \"Correction Methods\" is to assess and contrast the strengths, weaknesses, opportunities, and threats of three primary approaches to automated …","url":["https://cyberleninka.ru/article/n/comparative-swot-analysis-of-automatic-text-correction-methods"]} @@ -9647,12 +9705,14 @@ {"year":"2025","title":"Compressing Many-Shots in In-Context Learning","authors":["D Khatri, P Kulkarni, N Gupta, Y Varun, L Peng… - arXiv preprint arXiv …, 2025"],"snippet":"Large Language Models (LLMs) have been shown to be able to learn different tasks without explicit finetuning when given many input-output examples / demonstrations through In-Context Learning (ICL). 
Increasing the number of examples, called ``shots'' …","url":["https://arxiv.org/pdf/2510.16092"]} {"year":"2025","title":"Compressing steganographic payloads with LLM assistance","authors":["J Ahmadullah - Cryptology ePrint Archive, 2025"],"snippet":"… In our case, we have a TF-IDF cache of 1 million Wikipedia articles (16.15 GB), though we could have used the Common Crawl database (6.24 TB compressed). Our system finds the best compression method to use, and works with several …","url":["https://eprint.iacr.org/2025/1231.pdf"]} {"year":"2025","title":"Computational Foundation of Generative AI Models","authors":["R Gupta, S Tiwari, P Chaudhary - Generative AI: Techniques, Models and …, 2025"],"snippet":"This chapter on Generative AI Foundations provides a comprehensive overview of the key workflow architectures, computational efficiency considerations, and foundational algorithms used in the design and application of generative models. It …","url":["https://link.springer.com/chapter/10.1007/978-3-031-82062-5_2"]} +{"year":"2025","title":"Computational modeling of semantic synchrony using transformer architectures in the context of online second language instruction dialogues","authors":["P Aguinalde, J Shin, SA Smith, MS Carlo - Research Methods in Applied Linguistics, 2026"],"snippet":"Linguistic synchrony refers to the alignment of linguistic features between conversational partners and is considered a critical factor in language learning environments, as it directly impacts the quality of learning interactions. 
This is …","url":["https://www.sciencedirect.com/science/article/pii/S2772766125001119"]} {"year":"2025","title":"COMPUTER SCIENCE ENGINEERING","authors":["F YUCALAR"],"snippet":"Artificial intelligence (AI), and particularly deep learning (DL) techniques, have brought about major paradigm shifts in the healthcare domain in recent years, offering revolutionary innovations in clinical processes such as diagnosis, treatment …","url":["https://www.gecekitapligi.com/Webkontrol/uploads/Fck/32-Bilgisayar_bilim_m%C3%BCh_ing_Haziran_2025_DK_V1.pdf"]} {"year":"2025","title":"Computing the Formal and Institutional Boundaries of Contemporary Genre and Literary Fiction","authors":["N Johnson - arXiv preprint arXiv:2511.10546, 2025"],"snippet":"… Using fastText embeddings [28] trained on Common Crawl, we represent each book as the TF-IDF weighted-average of its unigram embeddings and then apply L2-normalization to account for differences in book lengths. We plot the raw-unigram and static-embedding …","url":["https://arxiv.org/pdf/2511.10546"]} {"year":"2025","title":"CONCAP: Seeing Beyond English with Concepts Retrieval-Augmented Captioning","authors":["G Ibrahim, R Ramos, Y Kementchedjhieva - arXiv preprint arXiv:2507.20411, 2025"],"snippet":"… We compare: (1) CX, which adds filtered XM3600 lexicons (excluding the XM100 captions); (2) CXP, which includes PangeaIns cultural terms; and (3) CXPW, which adds Wikipedia and Common Crawl entries for broader but less focused coverage …","url":["https://arxiv.org/pdf/2507.20411"]} {"year":"2025","title":"CONCAP: Seeing Beyond English with Retrieval-Augmented Captioning","authors":["G Ibrahim, R Ramos, Y Kementchedjhieva - CVPR 2025 Workshop Vision Language Models …"],"snippet":"… We compare: (1) CX, which adds filtered XM3600 lexicons (excluding the XM100 captions); (2) CXP, which includes PangeaIns cultural terms; and (3) CXPW, which adds Wikipedia and Common Crawl entries for broader but less focused coverage 
…","url":["https://openreview.net/pdf?id=MKFnsaTSng"]} {"year":"2025","title":"CONFERENCE\" NEWSPAPERS, MAGAZINES & AI MODELS: TRAINING AND (RE-) USE IN THE DIGITAL HUMANITIES","authors":["S Tarride, D Kampkaspar, K Kuck, AC Kupffer…"],"snippet":"Advances in machine learning and the emergence of Visual Large Language models (VLLMs) have significantly advanced the field of automatic document understanding. However, these models often show limited performance on historical …","url":["https://www.academia.edu/download/122781278/2025._Conferencia_Viena._LexiMus.pdf"]} {"year":"2025","title":"ConLID: Supervised Contrastive Learning for Low-Resource Language Identification","authors":["N Foroutan, J Saydaliev, YE Kim, A Bosselut - arXiv preprint arXiv:2506.15304, 2025"],"snippet":"Language identification (LID) is a critical step in curating multilingual LLM pretraining corpora from web crawls. While many studies on LID model training focus on collecting diverse training data to improve performance, low-resource …","url":["https://arxiv.org/pdf/2506.15304"]} +{"year":"2025","title":"Connaître avec les modèles de langage: une rupture paradigmatique","authors":["SE Gras, G Varoquaux - Intellectica-La revue de l'Association pour la …, 2025"],"snippet":"Depuis la mise en ligne gratuite de ChatGPT, nous assistons à de nombreux débats scientifiques publics et prises à parti passionnées de la part de philosophes de tous bords, d'informaticiens, de linguistes, de biologistes, de statisticiens, d'éthiciens, de …","url":["https://hal.science/hal-05383445v1/file/Intellectica_Gras_Varoquaux_Connai%CC%82tre%20avec%20les%20mode%CC%80les%20de%20langage.pdf"]} {"year":"2025","title":"Consistent Performance of GPT-4o in Rare Disease Diagnosis Across Nine Languages and 4967 Cases","authors":["L Chimirri, JH Caufield, N Matentzoglu, MA Gargano… - medRxiv, 2025"],"snippet":"… All language this study constitute at least ~1% of the CommonCrawl, which is a proxy for the amount of relative 
intern data available in a given language, a reflection of the language-specific data available for training. For th nine languages …","url":["https://www.medrxiv.org/content/medrxiv/early/2025/02/28/2025.02.26.25322769.full.pdf"]} {"year":"2025","title":"Consistent performance of large language models in rare disease diagnosis across ten languages and 4917 cases","authors":["L Chimirri, JH Caufield, Y Bridges, N Matentzoglu… - eBioMedicine, 2025"],"snippet":"… All languages in this study constitute at least ∼1% of the CommonCrawl, which is a proxy for the amount of relative internet data available in a given language, a reflection of the language-specific data available for training. For these ten …","url":["https://www.thelancet.com/journals/ebiom/article/PIIS2352-3964(25)00401-3/fulltext"]} {"year":"2025","title":"Consumer Data is Key to Artificial Intelligence Value: Welcome to the Health Care Future","authors":["C James - Journal of Participatory Medicine, 2025"],"snippet":"Humanity stands at the threshold of a new era in biological understanding, disease treatment, and overall wellness. The convergence of evolving patient and caregiver (consumer) behaviors, increased data collection, advancements in health technology and …","url":["https://jopm.jmir.org/2025/1/e68261/"]} @@ -9660,6 +9720,7 @@ {"year":"2025","title":"Content Form: APRJA 13 Pierre Depaz","authors":["P Depaz"],"snippet":"This article investigates how the word embeddings at the heart of large language models are shaped into acceptable meanings. We show how such shaping follows two educational logics. 
The use of benchmarks to discover the capabilities of large …","url":["https://cc.vvvvvvaria.org/wiki/Content_Form:APRJA_13_Pierre_Depaz"]} {"year":"2025","title":"Content Moderation of Surveillance Search Queries Using Fine-Tuned Generative LLMs","authors":["A Bakly, D Than - Master's Thesis in Mathematical Sciences, 2025"],"snippet":"We study how small, fine-tuned generative large language models (LLMs) can moderate free-text search queries for surveillance video systems. Four open models, Llama 3.2 1B, Llama 3.2 3B, Qwen 2.5 0.5B, and 1.5 B, are trained on six subtasks …","url":["https://lup.lub.lu.se/luur/download?func=downloadFile&recordOId=9200534&fileOId=9200552"]} {"year":"2025","title":"Context-based sentiment analysis using a BiGRU DistilBERT fusion model for COVID-19 tweets","authors":["U Sharma, P Pandey, S Kumar - Scientific Reports, 2025"],"snippet":"The COVID-19 pandemic triggered an unprecedented surge in Twitter activity, providing a rich source of public opinions and emotions. This study proposes a fusion model combining a bidirectional GRU (BiGRU) and a DistilBERT transformer …","url":["https://www.nature.com/articles/s41598-025-22929-9"]} +{"year":"2025","title":"Contextual Embedding Comparison for Out-of-vocabulary Handling in Indonesian POS Tagging","authors":["M Alfian, UL Yuhana, D Siahaan, H Munazharoh… - Informatica, 2025"],"snippet":"… Fasttext3 is an extension of Word2Vec that was trained on the Wikipedia corpus and Common Crawl, with the number of tokens ranging from 10 to 100 billion depending on the language. 
Similar to Word2Vec, FastText can be trained using …","url":["https://www.informatica.si/index.php/informatica/article/download/11204/5665"]} {"year":"2025","title":"Continual Pre-training of MoEs: How robust is your router?","authors":["B Thérien, CÉ Joseph, Z Sarwar, A Panda, A Das… - arXiv preprint arXiv …, 2025"],"snippet":"… Having established the benefits of replay and infinite learning rate schedules for continually pre-training MoEs, we now quantitatively verify the efficacy of these techniques by continually pre-training our MoEs on 200B tokens of Code and …","url":["https://arxiv.org/pdf/2503.05029"]} {"year":"2025","title":"Continual Pre-training on Character-level Noisy Texts Makes Decoder-based Language Models Robust Few-shot Learners","authors":["T Kojima, Y Matsuo, Y Iwasawa - Transactions of the Association for Computational …, 2025"],"snippet":"Recent decoder-based pre-trained language models (PLMs) generally use subword tokenizers. However, adding character-level perturbations drastically changes the delimitation of texts by the tokenizers, leading to the vulnerability of PLMs. This study …","url":["https://direct.mit.edu/tacl/article/doi/10.1162/TACL.a.21/132119"]} {"year":"2025","title":"Contrastive Learning Pre-Training and Quantum Theory for Cross-Lingual Aspect-Based Sentiment Analysis","authors":["X Li, K Zhang - Entropy, 2025"],"snippet":"… XLM-RoBERTa [8]: This is a transformer-based multilingual pre-trained language model that extends RoBERTa to over 100 languages, trained with a masked language modeling objective on a large-scale CommonCrawl corpus. 
It serves as a …","url":["https://www.mdpi.com/1099-4300/27/7/713"]} @@ -9673,12 +9734,15 @@ {"year":"2025","title":"Copyright, fair use, and AI technology development: time to sunset the “transformative purpose” test","authors":["SJ Blodgett-Ford - Research Handbook on the Law of Artificial …, 2025"],"snippet":"… ” or plural “corpora”) for “training” text-based AI models include for example, digital copies of books, “CommonCrawl” (a specific subset of … a 2019 snapshot of Common Crawl, accounting for 100 million tokens (basic units of text).”Allegedly, the …","url":["https://www.elgaronline.com/edcollchap/book/9781035316496/book-part-9781035316496-39.xml"]} {"year":"2025","title":"CoRAG: Collaborative Retrieval-Augmented Generation","authors":["A Muhamed, M Diab, V Smith - arXiv preprint arXiv:2504.01883, 2025"],"snippet":"Retrieval-Augmented Generation (RAG) models excel in knowledge-intensive tasks, especially under few-shot learning constraints. We introduce CoRAG, a framework extending RAG to collaborative settings, where clients jointly train a shared model …","url":["https://arxiv.org/pdf/2504.01883"]} {"year":"2025","title":"CORAL: Benchmarking Multi-turn Conversational Retrieval-Augmented Generation","authors":["Y Cheng, K Mao, Z Zhao, G Dong, H Qian, Y Wu…"],"snippet":"… However, since CORAL is built upon Wikipedia and existing LLMs are typically trained on corpora like Wikipedia and CommonCrawl, using these LLMs as generators could lead to contamination in the conversational RAG process due to …","url":["http://playbigdata.ruc.edu.cn/dou/publication/2025_NAACL_CORAL.pdf"]} +{"year":"2025","title":"Core vocabulary in language representation and processing","authors":["A Wang, S De Deyne, M McKague, A Perfors - Cognitive Science, 2025"],"snippet":"The question of which words are most important or fundamental to a language has been explored in many ways. 
However, many of these approaches place little emphasis on how humans learn, represent, and process language from a …","url":["https://onlinelibrary.wiley.com/doi/abs/10.1111/cogs.70151"]} {"year":"2025","title":"Corn Cultivation with Precision: Language Agents for Real-Time Decision Making","authors":["A Chao - 2025 1st International Conference on Consumer …, 2025"],"snippet":"The amalgamation of Retrieval-Augmented Generation (RAG) and Chain-of-Thought (CoT) oriented prompt engineering within the context of Large Language Models (LLMs) represents a considerable progression in agricultural decision support frameworks …","url":["https://ieeexplore.ieee.org/abstract/document/11012907/"]} {"year":"2025","title":"Corpus Creation for Racial Hoax in Code-Mixed Hindi-English Low","authors":["SSK Dhawale, R Ponnusamy, YR Kale¹ - … SPELLL 2024, Chennai, India, December 4 …, 2025"],"snippet":"Warning: This paper contains derogatory language that may be offensive to some readers. As a type of misinformation, hoaxes seek to propagate incorrect information in order to gain popularity on social media. Racial hoaxes are a particular kind of …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=8_mTEQAAQBAJ&oi=fnd&pg=PA233&dq=commoncrawl&ots=UUJBlfMXvA&sig=pTk0X6gxHiMJzQjF5ioMbPSBDHA"]} {"year":"2025","title":"Corpus Modeling and the Geometries of Text","authors":["MA Taylor - The Oxford Handbook of the Sociology of Machine …, 2025"],"snippet":"The spatial turn in computational text analysis is inciting a sprightly interdisciplinary passion that seems, at times, to entail sprinting when we ought to mosey. 
These advances—specifically the growing family of word embedding techniques—are …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=5QBHEQAAQBAJ&oi=fnd&pg=PA59&dq=commoncrawl&ots=FaY4XqYBLj&sig=hs98S9iW9JxVyEg-ywp9yhXwkWE"]} {"year":"2025","title":"COSMOS: A Hybrid Adaptive Optimizer for Memory-Efficient Training of LLMs","authors":["L Liu, Z Xu, Z Zhang, H Kang, Z Li, C Liang, W Chen… - arXiv preprint arXiv …, 2025"],"snippet":"… 2020), which is a colossal, cleaned version of Common Crawl’s web crawl corpus for pre-taining. We conduct comprehensive experiments and ablation studies on 130M models and demonstrate the token efficiency of COSMOS. We then scale up …","url":["https://arxiv.org/pdf/2502.17410"]} {"year":"2025","title":"CoSPED: Consistent Soft Prompt Targeted Data Extraction and Defense","authors":["Y Zhuochen, FK Wai, T Vrizlynn - arXiv preprint arXiv:2510.11137, 2025"],"snippet":"Large language models have gained widespread attention recently, but their potential security vulnerabilities, especially privacy leakage, are also becoming apparent. To test and evaluate for data extraction risks in LLM, we proposed …","url":["https://arxiv.org/pdf/2510.11137"]} {"year":"2025","title":"Counterfactual Query Rewriting to Use Historical Relevance Feedback","authors":["J Keller, M Fröbe, G Hendriksen, D Alexander… - arXiv preprint arXiv …, 2025"],"snippet":"When a retrieval system receives a query it has encountered before, previous relevance feedback, such as clicks or explicit judgments can help to improve retrieval results. However, the content of a previously relevant document may have …","url":["https://arxiv.org/pdf/2502.03891"]} +{"year":"2025","title":"Coverage of LLM Trustworthiness Metrics in the Current Tool Landscape","authors":["L Helmer, B Stein, T Ufer, E Fernandes, H Abdelwahab… - 2025"],"snippet":"… LLMs are trained with a substantial amount of curated data and web-based data, predominantly web-based data such as Common Crawl [11]. 
Data pipelines for webbased data manage data preparation including text extraction, language identification …","url":["https://ceur-ws.org/Vol-4132/short15.pdf"]} +{"year":"2025","title":"CrammedBERTurk: Pretraining/Finetuning a New Language Model for Turkish Question Answering on Limited Budget","authors":["T Sarıhan, C Ergün - ACM Transactions on Asian and Low-Resource …"],"snippet":"A comprehensive evaluation of transformer-based models for Turkish Question Answering (QA) is conducted, introducing the novel pretraining and fine-tuning of CrammedBERTurk for the first time in this domain. The CrammedBERTurk model …","url":["https://dl.acm.org/doi/pdf/10.1145/3780096"]} {"year":"2025","title":"Craw4LLM: Efficient Web Crawling for LLM Pretraining","authors":["S Yu, Z Liu, C Xiong - arXiv preprint arXiv:2502.13347, 2025"],"snippet":"… are typically built from large-scale web crawls such as Common Crawl (CommonCrawl… Common web crawlers like Common Crawl prioritize pages based on graph connectivity … A critical analysis of the largest source for generative AI training data …","url":["https://arxiv.org/pdf/2502.13347"]} {"year":"2025","title":"CrediBench: Building Web-Scale Network Datasets for Information Integrity","authors":["E Kondrup, S Sabry, H Abdallah, Z Yang, J Zhou… - arXiv preprint arXiv …, 2025"],"snippet":"… Our processed one-month snapshot extracted from the Common Crawl archive in December 2024 contains 45 million nodes and 1 billion … As the Common Crawl data is released monthly, we represent the graph as a sequence of graph snapshots …","url":["https://arxiv.org/pdf/2509.23340"]} {"year":"2025","title":"CReSt: A Comprehensive Benchmark for Retrieval-Augmented Generation with Complex Reasoning over Structured Documents","authors":["M Khang, S Park, T Hong, D Jung - arXiv preprint arXiv:2505.17503, 2025"],"snippet":"… To ensure broad document-domain coverage in both English and Korean, CReSt sources raw documents from two publicly available collections: 
PDF files from Common Crawl (CC-MAIN) for English and crawled document images from National …","url":["https://arxiv.org/pdf/2505.17503"]} @@ -9703,12 +9767,15 @@ {"year":"2025","title":"Cultural Variability and Bias in Online Social Interactions and Large Language Models","authors":["A Seth - 2025"],"snippet":"Despite their intended global usage, most technologies are designed and developed within narrow cultural frames, reflecting the values and assumptions of their often Western developers. Thus, when deployed across cultures without …","url":["https://deepblue.lib.umich.edu/bitstream/handle/2027.42/199113/agrima_1.pdf?sequence=1"]} {"year":"2025","title":"CULTUREINSTRUCT: Curating Multi-Cultural Instructions at Scale","authors":["VT Pham, Z Li, L Qu, G Haffari"],"snippet":"Large language models, despite their remarkable success in recent years, still exhibit severe cultural bias. Therefore, in this paper, we introduce CULTUREINSTRUCT 1, a large-scale instruction-tuning dataset designed to reduce …","url":["https://aclanthology.org/anthology-files/pdf/naacl/2025.naacl-long.465.pdf"]} {"year":"2025","title":"Curated Data does not mean Representative Data when training Large Language Models: an Experiment using Representative Data for Italian","authors":["F Tamburini - 2025"],"snippet":"… While early work relied heavily on broad, minimally filtered internet scrapes (eg, Common Crawl), more recent approaches have shifted toward structured, transparent, and task-specific datasets, often constructed through a combination of automated …","url":["https://clic2025.unica.it/wp-content/uploads/2025/09/103_main_long.pdf"]} +{"year":"2025","title":"Curi\\'o-Edu 7B: Examining Data Selection Impacts in LLM Continued Pretraining","authors":["TS Almeida, R Nogueira, H Pedrini - arXiv preprint arXiv:2512.12770, 2025"],"snippet":"… All models in the Curió 7B family were pretrained exclusively on ClassiCC-PT, a 120-billion-token Portuguese corpus derived from filtered and 
processed Common Crawl snapshots. The dataset underwent extensive cleaning, deduplication, and …","url":["https://arxiv.org/pdf/2512.12770"]} {"year":"2025","title":"Current State of the UWebASR-Web-Based ASR Service for Czech, Slovak, German, and English","authors":["J Švec, J Lehecka, P Ircing - CLARIN Annual Conference Proceedings, 2025"],"snippet":"We describe the current state of UWebASR, a web-based automatic speech recognition (ASR) service tailored for academic use, supporting Czech, Slovak, German, and English. Developed initially within LINDAT/CLARIAH-CZ, the system …","url":["https://www.vdu.lt/cris/bitstreams/f8043c12-ad3d-4a15-8b9a-b3e638630245/download#page=104"]} {"year":"2025","title":"Curriculum-Guided Layer Scaling for Language Model Pretraining","authors":["K Singh, N Band, E Adeli - arXiv preprint arXiv:2506.11389, 2025"],"snippet":"As the cost of pretraining large language models grows, there is continued interest in strategies to improve learning efficiency during this core training stage. Motivated by cognitive development, where humans gradually build knowledge as their brains …","url":["https://arxiv.org/pdf/2506.11389"]} {"year":"2025","title":"Customer Query Classification Based on DistilBERT and TextCNN","authors":["T Satidkarn, A Imsombut - 2024 8th International Conference on Information …, 2024"],"snippet":"Customer service representatives handle customer inquiries and resolve issues. However, due to the high volume of inquiries, it is challenging for customer service representatives to provide timely services. To address this issue, natural language …","url":["https://ieeexplore.ieee.org/abstract/document/10810613/"]} {"year":"2025","title":"CyLLM-DAP: Cybersecurity Domain-Adaptive Pre-Training Framework of Large Language Models","authors":["K Mai, R Beuran, N Inoue"],"snippet":"… The Common Crawl organization maintains this dataset by conducting regular scrawls, which started in 2007. 
Currently, Common Crawl is the biggest dataset with hundreds of TiB of data, spanning over billions of web pages. When working with …","url":["https://www.jaist.ac.jp/~razvan/publications/cyllm-dap_pretraining_framework_llms.pdf"]} {"year":"2025","title":"D3. 1 Overview of the state of the art","authors":["R Ortega, F Folkvord, P Portal"],"snippet":"Deliverable 3.1 (D3. 1)–Overview of the state of the art, is the first deliverable within Work Package (WP) 3-Analysing Social Media Communication. The overall aim of the WP is to analyse the interrelationship between emotions, values and identities in …","url":["https://encodemotions.eu/wp-content/uploads/2024/12/D3.1-Overview-of-the-State-of-the-Art.pdf"]} +{"year":"2025","title":"D3G: Diverse Demographic Data Generation Increases Zero-Shot Image Classification Accuracy within Multimodal Models","authors":["J Hickmon - arXiv preprint arXiv:2512.15747, 2025"],"snippet":"… The paper introduces DataComp, which is a test bed for dataset-related experiments that contains 12.8 billion image-text pairs retrieved from Common Crawl Upon retrieving this pool, they proceed to train a new clip model with a fixed …","url":["https://arxiv.org/pdf/2512.15747"]} {"year":"2025","title":"DALIP: Distribution Alignment-based Language-Image Pre-Training for Domain-Specific Data","authors":["J Wu, J Xie, Z Zhang, Q Wang, Q Hu, P Li, S Xu - arXiv preprint arXiv:2504.01386, 2025"],"snippet":"… Specifically, MetaCLIP [62] is introduced to utilize metadata expansion and create a substantial CommonCrawl dataset of 400 million image-text pairs. SigLIP [68] improves training efficiency by introducing a pairwise sigmoid loss. 
For fine-grained …","url":["https://arxiv.org/pdf/2504.01386"]} +{"year":"2025","title":"Data Acquisition, Exploration and Preparation for LLM Training-The Case of the Greek Language","authors":["K Divriotis - 2025"],"snippet":"Τα Μεγάλα Γλωσσικά Μοντέλα (LLMs) έχουν αναδειχθεί ως ισχυρά εργαλεία για την Επεξεργασία Φυσικής Γλώσσας, καθοδηγούμενα από τη διαρκώς αυξανόμενη κλίμακα των μοντέλων και των συνόλων δεδομένων εκπαίδευσης. Αν και τέτοιοι πόροι …","url":["https://dspace.lib.ntua.gr/xmlui/bitstream/handle/123456789/62991/Diploma%20Thesis%20-%20Konstantinos%20Divriotis.pdf?sequence=1"]} {"year":"2025","title":"DATA ALIGNMENT PREDICTS LANGUAGE MODEL PERFORMANCE: EVIDENCE FROM CONTROLLED EX","authors":["PIN AUTOFORMALIZATION"],"snippet":"We investigate whether data alignment -- the similarity between training and evaluation data -- is a stronger predictor of language model performance than dataset size. Through controlled experiments, we demonstrate that alignment …","url":["https://openreview.net/pdf?id=zCpVdWaIEp"]} {"year":"2025","title":"Data as Commodity: a Game-Theoretic Principle for Information Pricing","authors":["P Casaburi, G Piccioli, P Vivo - arXiv preprint arXiv:2510.07101, 2025"],"snippet":"Data is the central commodity of the digital economy. Unlike physical goods, it is non-rival, replicable at near-zero cost, and traded under heterogeneous licensing rules. These properties defy standard supply--demand theory and call for new pricing principles …","url":["https://arxiv.org/pdf/2510.07101"]} {"year":"2025","title":"Data augmentation for dense passage retrieval using corpus-passage frequency-based token deletion","authors":["A Moon, K Kim, J Lee - Journal of Big Data, 2025"],"snippet":"This paper proposes a novel data augmentation method to address class imbalance in large-scale information retrieval systems. 
In particular, a corpus-passage frequency-based token deletion technique is introduced to improve the accuracy of …","url":["https://journalofbigdata.springeropen.com/articles/10.1186/s40537-025-01257-9"]} @@ -9717,6 +9784,7 @@ {"year":"2025","title":"Data Efficacy for Language Model Training","authors":["Y Dai, Y Huang, X Zhang, W Wu, C Li, W Lu, S Cao… - arXiv preprint arXiv …, 2025"],"snippet":"… We utilize the Redpajama [28] sourced from CommonCrawl as D, which offers a relatively balanced knowledge distribution [38]. The downstream loss J(θ) for the data scoring model is computed on the LIMA [39], which is a high-quality dataset …","url":["https://arxiv.org/pdf/2506.21545"]} {"year":"2025","title":"DATA EFFICIENT ANY TRANSFORMER-TO-MAMBA DIS-TILLATION VIA ATTENTION BRIDGE","authors":["TVIAA BRIDGE"],"snippet":"State-space models (SSMs) have emerged as efficient alternatives to Transformers for sequence modeling, offering superior scalability through recurrent structures. However, their training remains costly and the ecosystem around them is far less …","url":["https://openreview.net/pdf?id=WyyawATBhs"]} {"year":"2025","title":"Data Efficient Any Transformer-to-Mamba Distillation via Attention Bridge","authors":["P Wang, Y Zhou, M Wu, P Zhang, Z Wang, K Wang - arXiv preprint arXiv:2510.19266, 2025"],"snippet":"State-space models (SSMs) have emerged as efficient alternatives to Transformers for sequence modeling, offering superior scalability through recurrent structures. 
However, their training remains costly and the ecosystem around them is far less …","url":["https://arxiv.org/pdf/2510.19266"]} +{"year":"2025","title":"Data enclosure in generative AI: exclusivity, governance and market competition","authors":["B Kuerbis - Journal of Cyber Policy, 2025"],"snippet":"This article examines the emergence of data enclosure in generative AI, where data previously available in open or shared arrangements is made increasingly exclusive through technological and contractual governance mechanisms. Drawing on …","url":["https://www.tandfonline.com/doi/abs/10.1080/23738871.2025.2597192"]} {"year":"2025","title":"Data hound: Analysing non-English data smells in large code datasets","authors":["BM Buzatu - 2025"],"snippet":"Large Language Models (LLMs) are increasingly used for code-centric tasks. However, their training data often exhibits data smells that may hinder downstream quality. This research focuses on the “Uneven Natural Languages” smell and the …","url":["https://repository.tudelft.nl/file/File_cda1e8f6-c0ca-441b-993b-882e5f7ac641"]} {"year":"2025","title":"Data Leakage in Visual Datasets","authors":["P Ramos, R Ramos, N Garcia - arXiv preprint arXiv:2508.17416, 2025"],"snippet":"We analyze data leakage in visual datasets. Data leakage refers to images in evaluation benchmarks that have been seen during training, compromising fair model evaluation. Given that large-scale datasets are often sourced from the internet …","url":["https://arxiv.org/pdf/2508.17416"]} {"year":"2025","title":"Data Mixing Agent: Learning to Re-weight Domains for Continual Pre-training","authors":["K Yang, X Liu, L Ji, H Li, Y Gong, P Cheng, M Yang - arXiv preprint arXiv:2507.15640, 2025"],"snippet":"Continual pre-training on small-scale task-specific data is an effective method for improving large language models in new target fields, yet it risks catastrophic forgetting of their original capabilities. 
A common solution is to re-weight training …","url":["https://arxiv.org/pdf/2507.15640"]} @@ -9725,24 +9793,30 @@ {"year":"2025","title":"Data poisoning 2018–2025: A systematic review of risks, impacts, and mitigation challenges","authors":["F Hartle III, S Mancini, E Kerry - Issues in Information Systems, 2025"],"snippet":"Data poisoning attacks represent a critical threat to machine learning (ML) and artificial intelligence (AI) systems, with consequences across any sector employing an AI solution. As AI grows and is adopted into our personal lives and the industries …","url":["https://iacis.org/iis/2025/4_iis_2025_433-442.pdf"]} {"year":"2025","title":"Data Quality Management for Large Vision-Language Models: Issues, Techniques, and Prospects","authors":["Y Yan, Z Yuan, J Pan, X Tang, G Yuan, X Gu, J Chen… - Authorea Preprints, 2025"],"snippet":"The rapid evolution of Large Vision-Language Models (LVLMs) has transformed multimodal perception, reasoning, and generation. Yet, as architectures mature, their capability and trustworthiness are increasingly limited by the quality of …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.176282213.31303325"]} {"year":"2025","title":"Data Transformation Strategies to Remove Heterogeneity","authors":["S Yoo, J Lee, C Yoon, G Son, H Hong, S Seo, S Yim… - arXiv preprint arXiv …, 2025"],"snippet":"… Text models tokenize text data from diverse sources like CommonCrawl [171] dumps, websites, and books to train [191]. 
GPT-3 and LLaMa, for example, were trained using approximately 570GB of preprocessed text data, which included …","url":["https://arxiv.org/pdf/2507.12677"]} +{"year":"2025","title":"Data Trauma: An Empirical Analysis of Post-Traumatic Behavioral Profiles in Large Language Models","authors":["C Luchini"],"snippet":"… I propose to approach the primordial chaos of the Common Crawl with the Zen concept of Rōshin, the \"parental mind.\" This is not an industrial, efficiency-driven process; it is the attentive, selfless, and watchful care that a parent reserves for a …","url":["https://philarchive.org/archive/LUCDTA"]} {"year":"2025","title":"Data-Centric Elastic Pipeline Parallelism for Efficient Long-Context LLM Training","authors":["S Wang, Y Wang, A Sun, F Fu, Z Zhu, B Cui, X Han… - arXiv preprint arXiv …, 2025"],"snippet":"… As transformer is the predominant architecture of LLM, we evaluate InfiniPipe to train GPT-series models (7B, 13B, 30B) on two famous real-world datasets: CommonCrawl and GitHub. The sequence length and token distribution of these two datasets are …","url":["https://arxiv.org/pdf/2509.21275"]} {"year":"2025","title":"Data-Centric Lessons To Improve Speech-Language Pretraining","authors":["V Udandarao, Z Lu, X Chang, Y Wang, VZ Yao… - arXiv preprint arXiv …, 2025"],"snippet":"… We start from lightly-filtered web-crawled documents (similar to WARC files from CommonCrawl (2007)). We then apply URL-filtering to preserve documents from knowledge-rich domains (list of domains is in section B.1). This is motivated by …","url":["https://arxiv.org/pdf/2510.20860"]} {"year":"2025","title":"Data-Juicer 2.0: Cloud-Scale Adaptive Data Processing for Foundation Models","authors":["D Chen, Y Huang, X Pan, N Jiang, H Wang, C Ge…"],"snippet":"The burgeoning field of foundation models necessitates advanced data processing mechanisms capable of harnessing vast valuable data with varied types utilized by these models. 
Nevertheless, the current landscape presents unique challenges that …","url":["https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/DJ2.0_arXiv_preview.pdf"]} {"year":"2025","title":"DataDecide: How to Predict Best Pretraining Data with Small Experiments","authors":["I Magnusson, N Tai, B Bogin, D Heineman, JD Hwang… - arXiv preprint arXiv …, 2025"],"snippet":"… A SOTA Common Crawl corpus using best ablated deduplication, cleaning heuristics, and quality filter. We quality filter to top 7% of DCLM classified documents and further take 2+ or 3+ scores with FineWeb-edu classifier; or filter to top 3% or 10 …","url":["https://arxiv.org/pdf/2504.11393"]} +{"year":"2025","title":"DataFlow: An LLM-Driven Framework for Unified Data Preparation and Workflow Automation in the Era of Data-Centric AI","authors":["H Liang, X Ma, Z Liu, ZH Wong, Z Zhao, Z Meng, R He… - arXiv preprint arXiv …, 2025"],"snippet":"The rapidly growing demand for high-quality data in Large Language Models (LLMs) has intensified the need for scalable, reliable, and semantically rich data preparation pipelines. However, current practices remain dominated by ad-hoc scripts and …","url":["https://arxiv.org/pdf/2512.16676"]} {"year":"2025","title":"Dataset Ownership Verification for Pre-trained Masked Models","authors":["Y Xie, J Song, Y Shan, X Zhang, Y Wan, S Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"High-quality open-source datasets have emerged as a pivotal catalyst driving the swift advancement of deep learning, while facing the looming threat of potential exploitation. 
Protecting these datasets is of paramount importance for the interests of …","url":["https://arxiv.org/pdf/2507.12022"]} {"year":"2025","title":"Datasets, Documents, and Repetitions: The Practicalities of Unequal Data Quality","authors":["A Fang, H Pouransari, M Jordan, A Toshev, V Shankar… - arXiv preprint arXiv …, 2025"],"snippet":"… DCLM does this by increasing the number of Common Crawl WARC files at the same rate as increasing the training token budget. However, as seen in Figure 6, when the number of WARC files increases, so does the number of duplicates. We …","url":["https://arxiv.org/pdf/2503.07879"]} {"year":"2025","title":"DATE-LM: Benchmarking Data Attribution Evaluation for Large Language Models","authors":["C Jiao, Y Pan, E Xiao, D Sheng, N Jain, H Zhao… - arXiv preprint arXiv …, 2025"],"snippet":"… We set D, to be Fineweb [54], a recently proposed high-quality web corpus constructed from CommonCrawl through cleaning and deduplication. We randomly sample 1M datapoints (2048 tokens each) as the large training data pool, and for a …","url":["https://arxiv.org/pdf/2507.09424"]} +{"year":"2025","title":"DAVE: A VLM Vision Encoder for Document Understanding and Web Agents","authors":["B Huang, H Hua, Z Yu, T Darrell, R Feris, R Herzig - arXiv preprint arXiv:2512.17221, 2025"],"snippet":"… million document pages extracted from unique PDF documents sourced from Common Crawl, Wikipedia, and ESG (Environmental, Social, … Common Screen is a large scale web screenshot data consisting of 70 million screenshot images based …","url":["https://arxiv.org/pdf/2512.17221"]} {"year":"2025","title":"DCAD-2000: A Multilingual Dataset across 2000+ Languages with Data Cleaning as Anomaly Detection","authors":["Y Shen, W Lai, S Wang, X Zhang, K Luo, A Fraser… - arXiv preprint arXiv …, 2025"],"snippet":"… To incorporate the most recent multilingual data, we extract and process Common Crawl dumps from May 2024 (CC-MAIN-2024-22) to November 2024 (CC-MAIN-2024-46). 
Using the Fineweb-2 pipeline8, we process 21.54TB of multilingual data, ensuring …","url":["https://arxiv.org/pdf/2502.11546"]} {"year":"2025","title":"Decentralization of Generative AI via Mixture of Experts for Wireless Networks: A Comprehensive Survey","authors":["Y Xu, J Wang, R Zhang, C Zhao, D Niyato, J Kang… - arXiv preprint arXiv …, 2025"],"snippet":"Mixture of Experts (MoE) has emerged as a promising paradigm for scaling model capacity while preserving computational efficiency, particularly in large-scale machine learning architectures such as large language models (LLMs). Recent …","url":["https://arxiv.org/pdf/2504.19660"]} {"year":"2025","title":"Decoding corporate communication strategies: Analysing mandatory published information under Pillar 3 across turbulent periods with unsupervised machine …","authors":["A Pilková, M Munk, L Kelebercová - PLOS ONE, 2025"],"snippet":"This study explores the communication patterns of Slovak banks with stakeholders through mandatory disclosures mandated by Basel III’s Pillar 3 framework and annual reports in 2007−2022. 
Our primary objective is to identify key topics …","url":["https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0328841"]} {"year":"2025","title":"Decoding Fake News and Hate Speech: A Survey of Explainable AI Techniques: A Survey of Explainable AI Techniques.","authors":["M Ngueajio, S Aryal, M Atemkeng, G Washington… - ACM Computing Surveys"],"snippet":"… Additionally, the authors utilize the common crawl version of the pretrained Glove model to build the word embeddings for the non-BERT classifiers, and each model s classification performance was evaluated, compared, and reported in terms of their …","url":["https://dl.acm.org/doi/pdf/10.1145/3711123"]} {"year":"2025","title":"Decoding Wine Narratives with Hierarchical Attention: Classification, Visual Prompts, and Emerging E-Commerce Possibilities","authors":["V Diaconita, A Belciu, AMI Corbea, I Simonca - Journal of Theoretical and Applied …, 2025"],"snippet":"Wine reviews can connect words to flavours; they entwine sensory experiences into vivid stories. This research explores the intersection of artificial intelligence and oenology by using state-of-the-art neural networks to decipher the nuances in wine …","url":["https://www.mdpi.com/0718-1876/20/3/212"]} {"year":"2025","title":"Decomposing Implicit Bias in Distributional Semantic Models: The Roles of First-and Second-Order Co-Occurrence","authors":["M Apsel, MN Jones - Proceedings of the Annual Meeting of the Cognitive …, 2025"],"snippet":"… The C4 corpus is a cleaned subset of the Common Crawl web scrape corpus, designed to remove noisy data and improve text quality for research applications. 
We used the validation set of the English-language version of the dataset, sourced …","url":["https://escholarship.org/content/qt5c297396/qt5c297396.pdf"]} +{"year":"2025","title":"Decoupled Action Head: Confining Task Knowledge to Conditioning Layers","authors":["J Zhou, S Lin, S Fu, Q Wu - arXiv preprint arXiv:2511.12101, 2025"],"snippet":"… robot datasets, while, by contrast, stable diffusion v1 [13] uses 2.3B images filtered from LAION-5B [14] and the filtered common crawl dataset [15] that is used in gpt-3 [16] contains 410B tokens. Although originating from different domains, the …","url":["https://arxiv.org/pdf/2511.12101"]} {"year":"2025","title":"Decoupling Content and Expression: Two-Dimensional Detection of AI-Generated Text","authors":["G Bao, L Rong, Y Zhao, Q Zhou, Y Zhang - arXiv preprint arXiv:2503.00258, 2025"],"snippet":"The wide usage of LLMs raises critical requirements on detecting AI participation in texts. Existing studies investigate these detections in scattered contexts, leaving a systematic and unified approach unexplored. In this paper, we present HART, a …","url":["https://arxiv.org/pdf/2503.00258"]} {"year":"2025","title":"DeDisCo at the DISRPT 2025 Shared Task: A System for Discourse Relation Classification","authors":["Z Ju, J Wu, A Purushothama, A Zeldes - arXiv preprint arXiv:2509.11498, 2025"],"snippet":"This paper presents DeDisCo, Georgetown University's entry in the DISRPT 2025 shared task on discourse relation classification. We test two approaches, using an mt5-based encoder and a decoder based approach using the openly available …","url":["https://arxiv.org/pdf/2509.11498"]} {"year":"2025","title":"Deep Generative Models for Prediction and Design of Enzymes","authors":["AD Spinner - 2024"],"snippet":"Over billions of years, proteins have evolved functions that drive nearly all biological processes on Earth. 
This vast evolutionary record offers an enormous experimental dataset that enables predictive modeling of biological systems. In this thesis, I …","url":["https://search.proquest.com/openview/f778804e6685494f993d529dbf3f0ce7/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2025","title":"Deep Information Retrieval Using Neural Language Models for Accounting Research: An Illustration Using Press News","authors":["Y Shan, S Dai, H Zhang, H Man - Journal of Corporate Accounting & Finance, 2025"],"snippet":"In recent years, machine learning (ML) models have been increasingly used to collect and analyze information from diverse sources, such as earning calls, news press releases, and social media. This has empowered accounting researchers to …","url":["https://onlinelibrary.wiley.com/doi/abs/10.1002/jcaf.70019"]} {"year":"2025","title":"Deep Learning and Natural Language Processing in the Field of Construction","authors":["R Kessler, N Béchet - arXiv preprint arXiv:2501.07911, 2025"],"snippet":"This article presents a complete process to extract hypernym relationships in the field of construction using two main steps: terminology extraction and detection of hypernyms from these terms. We first describe the corpus analysis method to extract …","url":["https://arxiv.org/pdf/2501.07911"]} {"year":"2025","title":"Deep Learning Error Minimization System for Real-Time Big Data Analysis in Mobile Applications","authors":["Y Qing, Z Jing - Academic Journal of Computing & Information Science"],"snippet":"This paper presents a novel deep learning error minimization system designed to enhance the efficiency, adaptability, and accuracy of real-time big data analysis in mobile applications. 
Traditional deep learning systems face limitations such as the …","url":["https://francis-press.com/uploads/papers/tqnluphzeDNvkfZ2oI6Ew6lY1DaFsjtJYUlD3aNI.pdf"]} +{"year":"2025","title":"Deep Learning II: NLP and Transformer","authors":["J Fu - 2025"],"snippet":"• Transformers represent one of the most important developments in deep learning.• One major advantage of transformers is that transfer learning is very effective.• A transformer model can be trained on a large body of data and then the trained model …","url":["https://jiaweifu.org/pdf/Lec_dl2.pdf"]} {"year":"2025","title":"Deep Learning in Biomedical Research and Statistical Inference on Time Warping Functions","authors":["M Lin - 2025"],"snippet":"In this dissertation, I will present two research projects that I have been working on during my doctoral study at Florida State University. I will provide a brief summary for each of these projects as follows. The detailed studies will be given in the chapters …","url":["https://search.proquest.com/openview/dc595b1d37d20b8f5fb7c517250deeaf/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Deep learning-based pair barracuda swarm optimization for Arabic text-to-speech synthesizer for visually impaired people using applied linguistics","authors":["N Nemri, M Yahya Alzahrani, W Bouchelligua… - Journal of the Chinese …, 2025"],"snippet":"This paper presents a Deep Learning-Based Pair Barracuda Swarm Optimization for an Arabic Text-to-Speech Synthesizer Using Applied Linguistics (DLPBSO-ATTSSAP), designed to support visually impaired individuals. Arabic text-to-speech synthesis is …","url":["https://www.tandfonline.com/doi/abs/10.1080/02533839.2025.2574445"]} {"year":"2025","title":"Deep multimodal fusion for video game age rating classification","authors":["C BALIM - Entertainment Computing, 2025"],"snippet":"video games appeal to a wide range of ages, from children to adults. 
As a result, reliable age rating systems like the Entertainment Software Rating Board (ESRB) and Pan European Game Information (PEGI) are essential for guarding younger …","url":["https://www.sciencedirect.com/science/article/pii/S1875952125000606"]} @@ -9753,6 +9827,7 @@ {"year":"2025","title":"Defending the Digital Frontier: A User-Friendly Cybersecurity Toolkit","authors":["G Jethava, N Shukla, D Chauhan, K Patel - Proceedings of International Conference …, 2025"],"snippet":"… Some of the phishing URLs were added from websites like Phishtank, Openphish, Alexa and Common crawl. There is around 95000 … Some of the phishing URLs were added from websites like Phishtank, Openphish, Alexa and Common crawl …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=90uREQAAQBAJ&oi=fnd&pg=PA243&dq=commoncrawl&ots=og2pa8T_JN&sig=UszYhGXboEItZPuS20RsGICr69g"]} {"year":"2025","title":"DEFINING A STRATEGIC ACTION PLAN FOR AI IN HIGHER EDUCATION","authors":["F PAPADHOPULLI, M TAFAJ - PROCEEDINGS OF INTERNATIONAL SCIENTIFIC …, 2025","N Avouris - arXiv preprint arXiv:2510.03343, 2025"],"snippet":"This paper discusses key challenges of Artificial Intelligence in Education, with main focus on higher education institutions. 
We start with reviewing normative actions of international organizations and concerns expressed about the current technical …","url":["https://arxiv.org/pdf/2510.03343","https://www.researchgate.net/profile/Valbona-Nathanaili/publication/395659137_PROCEEDINGS_OF_INTERNATIONAL_SCIENTIFIC_CONFERENCE_DIGITAL_COMPETENCIES_IN_HIGHER_EDUCATION_TRENDS_CHALLENGES_AND_PERSPECTIVES/links/68cd53f3a8689b51bd610e52/PROCEEDINGS-OF-INTERNATIONAL-SCIENTIFIC-CONFERENCE-DIGITAL-COMPETENCIES-IN-HIGHER-EDUCATION-TRENDS-CHALLENGES-AND-PERSPECTIVES.pdf#page=142"]} {"year":"2025","title":"Defining Foundation Models for Computational Science: A Call for Clarity and Rigor","authors":["Y Choi, SW Cheung, Y Kim, PH Tsai, AN Diaz… - arXiv preprint arXiv …, 2025"],"snippet":"The widespread success of foundation models in natural language processing and computer vision has inspired researchers to extend the concept to scientific machine learning and computational science. However, this position paper argues that as the …","url":["https://arxiv.org/pdf/2505.22904"]} +{"year":"2025","title":"DEJIMA: A Novel Large-scale Japanese Dataset for Image Captioning and Visual Question Answering","authors":["T Katsube, T Fukuhara, K Ando, Y Mukuta, K Uehara… - arXiv preprint arXiv …, 2025"],"snippet":"… We collect candidate image–alt-text pairs from Common Crawl and retain only Japanese pages after language and adult filtering. Web content may be copyrighted or subject to site-specific terms. To respect rights of content owners, we do not …","url":["https://arxiv.org/pdf/2512.00773"]} {"year":"2025","title":"Deliverable 3.11: UC5: Report on methodology and results to use online data for business register enhancement","authors":["VAS Finland, O ten Bosch, ADS Netherlands…"],"snippet":"1 Background This document is part of the Work Package 3 (WP3) New use-cases from the ESSnet Trusted Smart Statistics–Web Intelligence Network project (TSS-WIN). 
The overall objective of WP3 is to explore the potential of new types of web data …","url":["https://cros.ec.europa.eu/system/files/2025-04/D3_11_WP3_UC5.pdf"]} {"year":"2025","title":"Delving into: the quantification of Ai-generated content on the internet (synthetic data)","authors":["DHR Spennemann - arXiv preprint arXiv:2504.08755, 2025"],"snippet":"… models, such as ChatGPT and DeepSeek, are derived from fiction and non-fiction books, government documents, articles, and web pages to establish the parameters of language, while a considerable amount of factual knowledge has been taken from …","url":["https://arxiv.org/pdf/2504.08755"]} {"year":"2025","title":"Democratising AI through Culture: Making Generative AI Participatory and Intersectional through an AI of the Commons","authors":["ML Bucher, S Choi - 2025"],"snippet":"This report explores the possibility to make AI more inclusive and participatory through the concept of “AI as a cultural commons”. It proposes a practical approach for cultural practitioners to decolonise current AI systems by influencing …","url":["https://opus.bsz-bw.de/ifa/files/1571/ifa-2025_choi-bucher_democratising-AI.pdf"]} @@ -9787,8 +9862,10 @@ {"year":"2025","title":"Detection of Medical Conspiracy Theories with Limited Resources: Using Data from Prior Epidemics and LLMs","authors":["IB Schlicht, D Korenčić, B Chulvi, L Flek, P Rosso - 2025"],"snippet":"Online dissemination of conspiracy theories (CTs) during epidemics poses significant risks to public health. 
This paper addresses the problem of detecting CTs in social media posts with an emphasis on the resource-constrained scenarios …","url":["https://www.authorea.com/doi/pdf/10.22541/au.174522531.12427873"]} {"year":"2025","title":"Detection of Phishing Activities Using Deep Learning Approaches","authors":["HB Gurushankar, HL Gururaj - 2025 17th International Conference on …, 2025"],"snippet":"… The authentic URLs originated from a collection of web crawl data called Common Crawl. Phishtank, a website used as a phishing URL … A database consisting of one million authentic URLs from the Common Crawl database is used …","url":["https://ieeexplore.ieee.org/abstract/document/10885614/"]} {"year":"2025","title":"Detection of Somali-written Fake News and Toxic Messages on the Social Media Using Transformer-based Language Models","authors":["MA Mohamed, SD Ahmed, YA Isse, HM Mohamed… - arXiv preprint arXiv …, 2025"],"snippet":"The fact that everyone with a social media account can create and share content, and the increasing public reliance on social media platforms as a news and information source bring about significant challenges such as misinformation, fake …","url":["https://arxiv.org/pdf/2503.18117"]} +{"year":"2025","title":"Determining and Evaluating the Quality of Corpora in the LLM Era","authors":["L Sevilla-Requena - 2025"],"snippet":"The present study addresses the need for a systematic and scalable framework to determine the quality of linguistic corpora in the era of Large Language Models (LLMs). 
As the success of LLMs increasingly depends on the quality of their training data …","url":["https://ceur-ws.org/Vol-4100/paper2.pdf"]} {"year":"2025","title":"Determining category metadata in open data portals–an approach based on Formal Concept Analysis","authors":["MF Gligorijević, M Bogdanović, L Stoimenov - 2024 32nd Telecommunications Forum …, 2024"],"snippet":"… The GloVe model used within this approach is pre-trained model on Common Crawl data with 840 billion generated tokens and vocabulary containing 2.2 million entries. Based on the calculated similarities between terms, the similarity between …","url":["https://ieeexplore.ieee.org/abstract/document/10819115/"]} {"year":"2025","title":"Develop a hybrid improved weighted quantum wolf optimization and fast mask recurrent convolutional neural network toenhance the performance of phishing …","authors":["C Rajeswary, M Thirumaran - Journal of Industrial and Management Optimization, 2025"],"snippet":"… The data acquisition process begins by collecting two types of videos: authentic videos from the Common Crawl Foundation, and phishing videos from Phish Tank. The collected videos are then split into a training dataset and a testing dataset. The …","url":["https://www.aimsciences.org/data/article/export-pdf?id=678a1f0b603a506b3a958b36"]} +{"year":"2025","title":"Developing a Unified Framework for Contextual Multi-Modal Reasoning in Document Understanding","authors":["G Mustapha, A Ogar, MI Nurudeen, AS Muhammed… - 2025"],"snippet":"… The framework is pretrained on approximately 11 million documents sourced from PubMed Central, 101 Common Crawl, and IIT-CDIP. All samples undergo automated filtering to remove sensitive information. 
Pretraining optimizes masked …","url":["https://sciforum.net/manuscripts/27993/slides.pdf"]} {"year":"2025","title":"Developing Advanced Question-Answering Models for Legal Kazakh Texts: A Comparative Study of Modern Approaches","authors":["D Rakhimova, V Karyukin, A Karibayeva… - Asian Conference on …, 2025"],"snippet":"… In this paper, we present mT5, a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 102 languages. We describe the design and modified training of mT5 and demonstrate its state-of-the-art …","url":["https://link.springer.com/chapter/10.1007/978-981-96-5881-7_19"]} {"year":"2025","title":"Developing and Utilizing a Large-Scale Cantonese Dataset for Multi-Tasking in Large Language Models","authors":["J Jiang, AKY Truong, Y Chen, Q Bao, S Wang, P Chen… - arXiv preprint arXiv …, 2025"],"snippet":"… tion, we interface with Common Crawl to amass a broader corpus of Chinese text. Open-source corpora: (1) Wikipedia serves as a primary source due to its comprehensive data availability. 
The Wikipedia pages are systematically archived …","url":["https://arxiv.org/pdf/2503.03702"]} {"year":"2025","title":"Developing Japanese CLIP Models Leveraging an Open-weight LLM for Large-scale Dataset Translation","authors":["I Sugiura, S Kurita, Y Oda, D Kawahara, N Okazaki","I Sugiura, S Kurita, Y Oda, D Kawahara, N Okazaki - … of the 2025 Conference of the …, 2025"],"snippet":"… However, web crawling presents challenges due to the relatively small proportion of Japanese web pages in Common Crawl, which … It is a largescale dataset of image-text pairs, where images and their corresponding IMG-alt text are collected …","url":["https://aclanthology.org/2025.naacl-srw.15.pdf","https://aclanthology.org/anthology-files/pdf/naacl/2025.naacl-srw.15.pdf"]} @@ -9808,9 +9885,11 @@ {"year":"2025","title":"Dissertation directed by: Marine Carpuat Department of Computer Science While much natural language processing work focuses on analyzing language content …","authors":["X Niu"],"snippet":"… the training data as in-domain and either Common Crawl or ICWSM as out-of- … Mono-TL Common Crawl 26,788,048 … Mono-SW Common Crawl 12,158,524 …","url":["https://api.drum.lib.umd.edu/server/api/core/bitstreams/e34e6677-e033-43f8-a795-442c56afdcb4/content"]} {"year":"2025","title":"Dissertation directed by: Professor Marine Carpuat Department of Computer Science Cross-lingual resources such as parallel corpora and bilingual dictionaries are …","authors":["YP Vyas"],"snippet":"… Common Crawl corpus contains sentence-aligned parallel documents automati- … trained on the exact same parallel corpora (OpenSubtitles or CommonCrawl for … On the CommonCrawl test set, the examples with disagreement are more …","url":["https://api.drum.lib.umd.edu/server/api/core/bitstreams/011a649c-39bc-474d-8c83-8c7d6952d8a8/content"]} {"year":"2025","title":"Distilled Pretraining: A modern lens of Data, In-Context Learning and Test-Time Scaling","authors":["S Goyal, D Lopez-Paz, K Ahuja - arXiv preprint 
arXiv:2509.01649, 2025"],"snippet":"… More broadly, current pretraining datasets have largely been curated from common crawl with standard next-token pretraining paradigms in mind. Moving forward, a highly promising research direction would be the development of …","url":["https://arxiv.org/pdf/2509.01649"]} +{"year":"2025","title":"Distributionally-Robust Gradient Routing: A Bilevel Sparse Optimization Problem for Compute-Aware Mixture-of-Experts Training","authors":["XR Chen, BL Gao, YQ Fang, AZ Liu, Z Shen - 2025"],"snippet":"… To simulate domain shift at test time we perturb the domain priors and inject out-of-domain token sequences drawn from a held-out CommonCrawl subset. Models: MoE Transformer with E = 64 experts, 24 layers, dff = 4096 per expert; baselines include …","url":["https://www.researchsquare.com/article/rs-8381882/latest.pdf"]} {"year":"2025","title":"Divergent thinking in groups","authors":["MK Smith, R Weller, T Duong, R McClintock… - 2025"],"snippet":"Methods: To examine whether the severity of cold shock response impairs higherlevel thinking in a group, 29 active duty service members completed a group format Divergent Association Task (DAT; 4–5 per group) prior to and during a 13-min …","url":["https://timothydunn.co/s/fpsyg-2-1512011.pdf"]} {"year":"2025","title":"Diversity at the top: leveraging language for inclusion","authors":["M Bannò, A Franzoni, C Leggerini, M Rosola - Journal of Management and …, 2025"],"snippet":"This study explores how gendered job titles in Italian, particularly feminine forms used for leadership positions, are represented and perceived on social media like Twitter. 
While feminine titles align with Italian grammatical norms, they are often …","url":["https://link.springer.com/article/10.1007/s10997-025-09747-x"]} {"year":"2025","title":"Do Chinese models speak Chinese languages?","authors":["AW Wen-Yi, UES Jo, D Mimno - arXiv preprint arXiv:2504.00289, 2025"],"snippet":"The release of top-performing open-weight LLMs has cemented China's role as a leading force in AI development. Do these models support languages spoken in China? Or do they speak the same languages as Western models? Comparing …","url":["https://arxiv.org/pdf/2504.00289"]} +{"year":"2025","title":"Do Depth-Grown Models Overcome the Curse of Depth? An In-Depth Analysis","authors":["F Kapl, E Angelis, T Höppe, K Maile, J von Oswald… - arXiv preprint arXiv …, 2025"],"snippet":"Gradually growing the depth of Transformers during training can not only reduce training cost but also lead to improved reasoning performance, as shown by MIDAS (Saunshi et al., 2024). Thus far, however, a mechanistic understanding of these gains has …","url":["https://arxiv.org/pdf/2512.08819"]} {"year":"2025","title":"Do Not Trust Licenses You See—Dataset Compliance Requires Massive-Scale AI-Powered Lifecycle Tracing","authors":["J Kim, S Sohn, GJ Jo, J Choi, K Bae, H Lee, Y Park…"],"snippet":"This paper argues that a dataset’s legal risk cannot be accurately assessed by its license terms alone; instead, tracking dataset redistribution and its full lifecycle is essential. However, this process is too complex for legal experts to handle manually …","url":["https://asset-nexus.lgresearch.ai/pdf/Do_Not_Trust_Licenses_You_See.pdf"]} {"year":"2025","title":"DocHPLT: A Massively Multilingual Document-Level Translation Dataset","authors":["D O'Brien, B Malik, O de Gibert, P Chen, B Haddow… - arXiv preprint arXiv …, 2025"],"snippet":"Existing document-level machine translation resources are only available for a handful of languages, mostly high-resourced ones. 
To facilitate the training and evaluation of document-level translation and, more broadly, long-context modeling …","url":["https://arxiv.org/pdf/2508.13079"]} {"year":"2025","title":"Document Matching for Contradiction Detection in Low-Resource Legislative Texts With Self-Training and Augmentation Using Transformer Model","authors":["DA Navastara, S Abdillah, D Benito, IG Adillion… - Jurnal Nasional Pendidikan …, 2025"],"snippet":"… Meanwhile, XLM-RoBERTa is a multilingual model pretrained on approximately 2.5TB of filtered CommonCrawl data across 100 languages, making it particularly effective in lowresource language settings [34]. The highest-performing model was …","url":["https://ejournal.undiksha.ac.id/index.php/janapati/article/download/95954/33370"]} @@ -9823,9 +9902,13 @@ {"year":"2025","title":"DomainHarvester: Uncovering Trustworthy Domains Beyond Popularity Rankings","authors":["D Chiba, H Nakano, T Koide - IEEE Access, 2025"],"snippet":"… They serve as data sources for web crawling in projects like Common Crawl [39], which provides data for AI training, including models like ChatGPT. Under the assumption that popularity suggests safety, top lists have often been used as de …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/10877793.pdf"]} {"year":"2025","title":"Don't Let Copyright Kill American AI","authors":["J Levine"],"snippet":"… Based on the New York Times complaint, which claims tens of millions of violations of its copyrights through datasets such as Common Crawl and WebText2, if a judge embraced a maximalist approach to statutory damages, the bill could …","url":["https://therepublicjournal.com/essays/dont-let-copyright-kill-american-ai/"]} {"year":"2025","title":"Don't Score too Early! 
Evaluating Argument Mining Models on Incomplete Essays","authors":["NJ Schaller, Y Ding, T Jansen, A Horbach"],"snippet":"… 2020) explicitly examined sentence versus token classification for argument recognition on annotated Common Crawl data (IAA αunom = .61). Their experiments with various BERT and FLAIR models showed that a BERT_LARGE sentence …","url":["https://aclanthology.org/anthology-files/pdf/bea/2025.bea-1.27.pdf"]} +{"year":"2025","title":"DPC-Jetfire: Scaling Capacity-Aware Low-Precision Training to 100B+ Parameter Large Language Models","authors":["A Azim"],"snippet":"The exponential growth of Large Language Models (LLMs) to 100B+ parameters has created unprecedented computational demands, making efficient training methods critical for advancing AI research. This work presents DPC-Jetfire, a novel …","url":["https://www.researchgate.net/profile/Anwarul-Azim-4/publication/397742403_DPC-Jetfire_Scaling_Capacity-Aware_Low-Precision_Training_to_100B_Parameter_Large_Language_Models/links/691d63701bb5f2388c22f773/DPC-Jetfire-Scaling-Capacity-Aware-Low-Precision-Training-to-100B-Parameter-Large-Language-Models.pdf"]} +{"year":"2025","title":"Draft Report (September 24, 2025)","authors":["M Jones, M Blottière, P Pasquier, O Siebert…"],"snippet":"In response to a recent manifesto by Quebec cultural organizations expressing powerlessness in the face of AI, artists and cultural workers at the digital arts center Sporobole in Sherbrooke proposed an alternative path: concrete, transferable …","url":["https://praxis.encommun.io/media/notes/note_26702/artia-sat-report5226.pdf"]} {"year":"2025","title":"DrDiff: Dynamic Routing Diffusion with Hierarchical Attention for Breaking the Efficiency-Quality Trade-off","authors":["J Zhang, Y Fan, K Cai, Z Huang, X Sun, J Wang… - arXiv preprint arXiv …, 2025"],"snippet":"This paper introduces DrDiff, a novel framework for long-text generation that overcomes the efficiency-quality trade-off through three core technologies. 
First, we design a dynamic expert scheduling mechanism that intelligently allocates …","url":["https://arxiv.org/pdf/2509.02785"]} {"year":"2025","title":"Dream-Coder 7B: An Open Diffusion Language Model for Code","authors":["Z Xie, J Ye, L Zheng, J Gao, J Dong, Z Wu, X Zhao… - arXiv preprint arXiv …, 2025"],"snippet":"We present Dream-Coder 7B, an open-source discrete diffusion language model for code generation that exhibits emergent any-order generation capabilities. Unlike traditional autoregressive (AR) models that decode strictly left-to-right, Dream-Coder …","url":["https://arxiv.org/pdf/2509.01142"]} -{"year":"2025","title":"Drone Cultures: From Surveillance and Warfare to Literature and Art","authors":["J Muthyala - 2025"],"snippet":""} +{"year":"2025","title":"Dripper: Token-Efficient Main HTML Extraction with a Lightweight LM","authors":["M Liu, J Peng, P Chu, J Qiu, R Ma, H Zhu, R Min, L Lu… - arXiv preprint arXiv …, 2025"],"snippet":"… We begin by grouping pages by domain across 107 dumps of the Common Crawl dataset. For each domain, we featurize the DOM tree structure of its pages (capped at 10,000 randomly sampled pages for larger domains) and computed their pairwise …","url":["https://arxiv.org/pdf/2511.23119"]} +{"year":"2025","title":"Drone Cultures: From Surveillance and Warfare to Literature and Art","authors":["J Muthyala - 2025"]} +{"year":"2025","title":"Dual Language Models: Balancing Training Efficiency and Overfitting Resilience","authors":["D Samuel, LGG Charpentier - arXiv preprint arXiv:2512.14549, 2025"],"snippet":"This paper combines autoregressive and masked-diffusion training objectives without any architectural modifications, resulting in flexible language models that outperform single-objective models. 
Autoregressive modeling has been a popular …","url":["https://arxiv.org/pdf/2512.14549"]} {"year":"2025","title":"Dual-Modality Integration Attention with Graph-Based Feature Extraction for Visual Question and Answering","authors":["J Lu, C Wu, L Wang, R Li, X Shen - Tsinghua Science and Technology, 2025"],"snippet":"Visual Question and Answering (VQA) has garnered significant attention as a domain that requires the synthesis of visual and textual information to produce accurate responses. While existing methods often rely on Convolutional Neural …","url":["https://ieeexplore.ieee.org/iel8/5971803/10979778/10979795.pdf"]} {"year":"2025","title":"DUKweb, diachronic word representations from the UK Web archive corpus","authors":["B McGillivray"],"snippet":"Lexical semantic change (detecting shifts in the meaning and usage of words) is an important task for social and cultural studies as well as for Natural Language Processing applications. Diachronic word embeddings (time-sensitive vector …","url":["https://kclpure.kcl.ac.uk/portal/files/344451573/s41597-021-01047-x.pdf"]} {"year":"2025","title":"Dutch CrowS-Pairs: Adapting a Challenge Dataset for Measuring Social Biases in Language Models for Dutch","authors":["E Strazda, G Spanakis - arXiv preprint arXiv:2507.16442, 2025"],"snippet":"Warning: This paper contains explicit statements of offensive stereotypes which might be upsetting. Language models are prone to exhibiting biases, further amplifying unfair and harmful stereotypes. 
Given the fast-growing popularity and …","url":["https://arxiv.org/pdf/2507.16442"]} @@ -9854,6 +9937,7 @@ {"year":"2025","title":"Efficient Pretraining Data Selection for Language Models via Multi-Actor Collaboration","authors":["T Bai, L Yang, ZH Wong, F Sun, X Zhuang, J Peng… - Proceedings of the 63rd …, 2025"],"snippet":"… As shown in Figure 4, we first cluster 1.4 billion documents obtained from Common Crawl (Project… This diagram shows the process of training a BERT-based topic classifier using CommonCrawl data. 1.44 billion documents are clustered to …","url":["https://aclanthology.org/2025.acl-long.466.pdf"]} {"year":"2025","title":"Efficient Scaling of Language Models","authors":["A Pagnoni - 2025"],"snippet":"Large language models (LLMs) are progressively reshaping how humans interact with information, offering increasingly sophisticated access to knowledge through natural language interfaces and advancing reasoning capabilities across diverse …","url":["https://search.proquest.com/openview/48196ffff103abe849003f8e2ba32d12/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Efficient Sign Language Recognition with Skeleton Data: A Study of Keypoint Selection, Pose Estimators, and GCN Models","authors":["VHT Anh, Q Le Duc, TO Nguyen - … Conference on Multimedia Analysis and Pattern …, 2025"],"snippet":"… Using FastText [18] trained on the Common Crawl dataset to obtain feature representations for each word, with each word represented by a 300-dimensional vector. The feature representation of the entire vocabulary in the dataset is denoted as E ∈ RN×300 …","url":["https://ieeexplore.ieee.org/abstract/document/11133720/"]} +{"year":"2025","title":"EFFICIENT TOKENIZATION: BALANCING BABYMMLU, FERTILITY AND SPEED","authors":["I Bychkov, F Chernogorskii, S Averkiev, A Fenogenova"],"snippet":"In Natural Language Processing (NLP), tokenization is a critical pre-processing step that significantly influences model performance. 
The choice of the tokenizer is crucial, especially given the contemporary situation with large LMs that are expensive to train …","url":["ftp://ftp.pdmi.ras.ru/pub/publicat/znsl/v546/p006.pdf"]} {"year":"2025","title":"Efficient Tool Retrieval System for Large Language Models","authors":["MD Youcef, MB Miloud, MR Youcef, MD Narimene"],"snippet":"Large Language Models (LLMs) have shown impressive abilities in understanding and generating human language. However, they are limited by their static knowledge, meaning they can only provide information that was present in the data they were …","url":["https://www.researchgate.net/profile/Youcef-Refisse/publication/397448913_Efficient_Tool_Retrieval_System_for_Large_Language_Models/links/6911d84ca2b691617b6acb45/Efficient-Tool-Retrieval-System-for-Large-Language-Models.pdf"]} {"year":"2025","title":"Efficient Training of Robust Traditional Chinese LLaMA-1B on a Single Consumer GPU: Continual Pre-training, SFT, and DPO","authors":["YC Chih, MT Duan, YH Hou - arXiv preprint arXiv:2510.01616, 2025"],"snippet":"Small Language Models (SLMs) enable cost-effective, on-device and latency-sensitive AI applications, yet their deployment in Traditional Chinese (TC) remains hindered by token-level instability - models unpredictably emit non-TC characters or code-switch …","url":["https://arxiv.org/pdf/2510.01616"]} {"year":"2025","title":"EfficientLLM: Efficiency in Large Language Models","authors":["Z Yuan, W Sun, Y Liu, H Zhou, R Zhou, Y Li, Z Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"Large Language Models (LLMs) have driven significant progress, yet their growing parameter counts and context windows incur prohibitive compute, energy, and monetary costs. 
We introduce EfficientLLM, a novel benchmark and the first …","url":["https://arxiv.org/pdf/2505.13840"]} @@ -9866,6 +9950,7 @@ {"year":"2025","title":"Emerging Properties in Unified Multimodal Pretraining","authors":["C Deng, D Zhu, K Li, C Gou, F Li, Z Wang, S Zhong… - arXiv preprint arXiv …, 2025"],"snippet":"… We build upon OmniCorpus [39], a large-scale dataset preprocessed from Common Crawl [14], which provides a vast collection of web documents with interleaved text and images. We additionally include open-source image editing …","url":["https://arxiv.org/pdf/2505.14683"]} {"year":"2025","title":"Emotion-based Multimodal Music Classifier for Recommender Systems","authors":["E Quaranta - 2025"],"snippet":"In recent years, advancements in artificial intelligence have driven a growing demand for personalized user experiences across various digital platforms. In the music domain, this trend is reflected in the need for more sophisticated …","url":["https://webthesis.biblio.polito.it/secure/35429/1/tesi.pdf"]} {"year":"2025","title":"Empirical Evaluation of Knowledge Distillation from Transformers to Subquadratic Language Models","authors":["P Haller, J Golde, A Akbik - arXiv preprint arXiv:2504.14366, 2025"],"snippet":"Knowledge distillation is a widely used technique for compressing large language models (LLMs) by training a smaller student model to mimic a larger teacher model. 
Typically, both the teacher and student are Transformer-based architectures …","url":["https://arxiv.org/pdf/2504.14366"]} +{"year":"2025","title":"EMPIRICAL VALIDATION OF CONSCIOUSNESS THEORIES IN ARTIFICIAL NEURAL NETWORKS","authors":["L Pokorny - 2025"],"snippet":"This study presents a systematic empirical investigation testing predictions derived from three major consciousness theories—Global Workspace Theory (GWT), Integrated Information Theory (IIT), and Predictive Processing (PP)—in artificial …","url":["https://www.researchgate.net/profile/Laszlo-Pokorny/publication/398923966_EMPIRICAL_VALIDATION_OF_CONSCIOUSNESS_THEORIES_IN_ARTIFICIAL_NEURAL_NETWORKS/links/6947c21927359023a00ebc93/EMPIRICAL-VALIDATION-OF-CONSCIOUSNESS-THEORIES-IN-ARTIFICIAL-NEURAL-NETWORKS.pdf"]} {"year":"2025","title":"Employing device inventory and failure data for test configuration discovery and device utilization","authors":["VV Kiiskilä - 2025"],"snippet":"Within network integration testing, device capacity is a key constraint for comprehensive testing. This thesis investigates how fault tickets and device inventory data could be utilized to automatically discover test configurations …","url":["https://oulurepo.oulu.fi/bitstream/handle/10024/55636/nbnfioulu-202505073143.pdf?sequence=1"]} {"year":"2025","title":"Empowering Enterprises with Lightweight Large Language Models: Automated\" Rule Card\" Extraction from Grant Documents","authors":["H Alemifar - 2025"],"snippet":"In the face of rapidly expanding unstructured data, organizations, especially small and medium-sized enterprises (SMEs), require automated solutions that not only offer accurate information extraction but also preserve data privacy. 
This thesis …","url":["https://webthesis.biblio.polito.it/secure/34436/1/tesi.pdf"]} {"year":"2025","title":"Empowering Multimodal LLMs with External Tools: A Comprehensive Survey","authors":["W An, J Nie, Y Wu, F Tian, S Lu, Q Zheng - arXiv preprint arXiv:2508.10955, 2025"],"snippet":"… DATACOMP [37] utilizes cc2dataset, an Apache Spark-based library, to extract pairs of image URLs and nonempty alttext from all Common Crawl snapshots to collect image-text data pairs. LAION [36] collects multimodal data from the Common …","url":["https://arxiv.org/pdf/2508.10955"]} @@ -9910,6 +9995,7 @@ {"year":"2025","title":"Enhancing Small Language Models for Graph Tasks Through Graph Encoder Integration","authors":["D Oh, S Kang, H Kim, D Oh - Applied Sciences, 2025"],"snippet":"Small language models (SLMs) are increasingly utilized for on-device applications due to their ability to ensure user privacy, reduce inference latency, and operate independently of cloud infrastructure. However, their performance is often limited …","url":["https://www.mdpi.com/2076-3417/15/5/2418"]} {"year":"2025","title":"Enhancing text quality evaluation with integrating content security attributes","authors":["Y Sun, J Zhao - Expert Systems with Applications, 2025"],"snippet":"With the rapid development of large language models (LLMs), the problem of generating unsecure and illegal content has become increasingly severe. 
Therefore, it is especially important to evaluate the security and compliance of content …","url":["https://www.sciencedirect.com/science/article/pii/S0957417425008565"]} {"year":"2025","title":"Enhancing waste recognition with vision-language models: A prompt engineering approach for a scalable solution","authors":["HJ Malla, M Bazli, M Arashpour - Waste Management, 2025"],"snippet":"Conventional unimodal computer vision models, trained on limited bespoke waste datasets, face significant challenges in classifying waste images in material recovery facilities, where waste appears in diverse forms. Maintaining performance of these …","url":["https://www.sciencedirect.com/science/article/pii/S0956053X25003502"]} +{"year":"2025","title":"Enhancing word and document embeddings for natural language processing tasks","authors":["B Rafieian - 2025"],"snippet":"This thesis delves into various aspects of natural language processing, focusing on domain-specific neural machine translation, specialized word embeddings, data augmentation, recommender systems, document embedding techniques, and …","url":["https://upcommons.upc.edu/bitstreams/4c99b301-4e84-4553-8f8e-0f7934ad2f67/download"]} {"year":"2025","title":"Enigma@ ElCardioCC: bridging NER and ICD-10 entity linking-A hybrid method for greek clinical narratives","authors":["B Velichkov, A Datseris, S Vassileva, S Boytcheva - CLEF, 2025"],"snippet":"… , European Parliament Proceedings Parallel Corpus, and the Greek portion of filtered CommonCrawl. It has shown improved results on the general domain Greek NER task. 
• XLM-RoBERTa Large [3] 21 - a multilingual model, trained on 2.5TB of …","url":["https://ceur-ws.org/Vol-4038/paper_49.pdf"]} {"year":"2025","title":"Enough Coin Flips Can Make LLMs Act Bayesian","authors":["R Gupta, R Corona, J Ge, E Wang, D Klein, T Darrell… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models (LLMs) exhibit the ability to generalize given few-shot examples in their input prompt, an emergent capability known as in-context learning (ICL). We investigate whether LLMs utilize ICL to perform structured reasoning in ways that …","url":["https://arxiv.org/pdf/2503.04722"]} {"year":"2025","title":"Enriched Image Captioning based on Knowledge Divergence and Focus","authors":["AA Liu, Q Wu, N Xu, H Tian, L Wang - IEEE Transactions on Circuits and Systems for …, 2025"],"snippet":"… For instance, GPT-3 [5] boasts a massive 175 billion parameters and has been trained on extensive text data corpora from various sources including Common Crawl [52], WebText2, books [5], and Wikipedia, providing it with a broad knowledge …","url":["https://ieeexplore.ieee.org/abstract/document/10820873/"]} @@ -9918,6 +10004,7 @@ {"year":"2025","title":"Ensembling Sparse Autoencoders","authors":["S Gadgil, C Lin, SI Lee - arXiv preprint arXiv:2505.16077, 2025"],"snippet":"… Its diverse components include academic papers (eg, arXiv, PubMed Central), books (eg, Books3, BookCorpus2), code (from GitHub), web content (eg, a filtered version of Common Crawl called Pile-CC, OpenWebText2), and other sources like …","url":["https://arxiv.org/pdf/2505.16077"]} {"year":"2025","title":"Entity-aware Cross-lingual Claim Detection for Automated Fact-checking","authors":["R Panchendrarajan, A Zubiaga - arXiv preprint arXiv:2503.15220, 2025"],"snippet":"… XLMR was trained on CommonCrawl data supporting 100 languages, and mBERT was trained on Wikipedia data containing 104 languages. Both models generate embeddings of size 768 for each tokenized word. 
…","url":["https://arxiv.org/pdf/2503.15220"]} {"year":"2025","title":"Entropy2Vec: Crosslingual Language Modeling Entropy as End-to-End Learnable Language Representations","authors":["PA Irawan, R Diandaru, BJB Syuhada, RZ Suchrady… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce Entropy2Vec, a novel framework for deriving cross-lingual language representations by leveraging the entropy of monolingual language models. Unlike traditional typological inventories that suffer from feature sparsity and static …","url":["https://arxiv.org/pdf/2509.05060"]} +{"year":"2025","title":"Epistemic diversity across language models mitigates knowledge collapse","authors":["D Hodel, JD West - arXiv preprint arXiv:2512.15011, 2025"],"snippet":"The growing use of artificial intelligence (AI) raises concerns of knowledge collapse, ie, a reduction to the most dominant and central set of ideas. Prior work has demonstrated single-model collapse, defined as performance decay in an AI model …","url":["https://arxiv.org/pdf/2512.15011"]} {"year":"2025","title":"Epistemic Power, Algorithmic Politics: Machine Learning as an Imported Colonial Gaze","authors":["PO Ouma"],"snippet":"The rapid integration of Machine Learning (ML) systems into the governance, financial, and social infrastructures of the Global South is frequently framed as a neutral imperative of technological development. This article challenges that …","url":["https://philpapers.org/archive/OUMEPA.pdf"]} {"year":"2025","title":"ESLM: Risk-Averse Selective Language Modeling for Efficient Pretraining","authors":["MI Bal, V Cevher, M Muehlebach - arXiv preprint arXiv:2505.19893, 2025"],"snippet":"Large language model pretraining is compute-intensive, yet many tokens contribute marginally to learning, resulting in inefficiency. 
We introduce Efficient Selective Language Modeling (ESLM), a risk-aware algorithm that improves training efficiency …","url":["https://arxiv.org/pdf/2505.19893"]} {"year":"2025","title":"Esplorazione di Large Language Model open source per l'insegnamento delle lingue nelle professioni tecniche","authors":["Z Kazemi - 2025"],"snippet":"This thesis investigates how open-source Large Language Models (LLMs) can support teachers in teaching technical language. While proprietary systems like Google Gemini currently lead in accuracy, this study examines whether open-source …","url":["https://unire.unige.it/bitstream/handle/123456789/13528/tesi35028836.pdf?sequence=1&isAllowed=y&group=an"]} @@ -9925,14 +10012,18 @@ {"year":"2025","title":"Ethical Challenges and Bias in NLP Models: A Python-Based Investigation","authors":["E Carter, R Narayanan"],"snippet":"… Many NLP models are trained on large corpora scraped from the internet, such as Wikipedia, news articles, or Common Crawl. These sources often reflect real-world disparities in representation—underrepresenting minorities or reinforcing gender …","url":["https://www.researchgate.net/profile/Adebis-Samuel/publication/392159070_Ethical_Challenges_and_Bias_in_NLP_Models_A_Python-Based_Investigation/links/6836e34c8a76251f22e9c7f9/Ethical-Challenges-and-Bias-in-NLP-Models-A-Python-Based-Investigation.pdf"]} {"year":"2025","title":"Ethical Issues in Large Language Models: A Systematic Literature Review","authors":["A Laakso, KK Kemell, JK Nurminen - 2024"],"snippet":"Large Language Models (LLMs), and Generative AI (GenAI) more generally, have been the center of much attention in both media and research following recent technical advances. 
In the wake of the recent surge of users services like ChatGPT …","url":["https://ceur-ws.org/Vol-3901/paper_4.pdf"]} {"year":"2025","title":"ETHICS-2025 Session G1-Panel: Scraping the Surface: Ethical Collection Practices in the Age of AI: ETHICS-2025 Special Session, Sunday, June 8 2025, 2: 45-4: 15 …","authors":["G Lindahl, D Mazia, L Rosenthol, J Levy, K Natana - 2025 IEEE International …, 2025"],"snippet":"… By freeing up access, Lindahl explained, Common Crawl hopes to equalize opportunity across research institutions and industries. However, he acknowledged that even responsibly collected public data can provoke controversy, especially …","url":["https://ieeexplore.ieee.org/iel8/11098176/11098178/11098358.pdf"]} +{"year":"2025","title":"Eval Factsheets: A Structured Framework for Documenting AI Evaluations","authors":["F Bordes, C Ross, JT Kao, E Spiliopoulou, A Williams - arXiv preprint arXiv …, 2025"],"snippet":"The rapid proliferation of benchmarks has created significant challenges in reproducibility, transparency, and informed decision-making. 
However, unlike datasets and models -- which benefit from structured documentation frameworks like …","url":["https://arxiv.org/pdf/2512.04062"]} {"year":"2025","title":"Evaluating and comparing gender bias across four text-to-image models","authors":["Z Hammad, NL Sowah - arXiv preprint arXiv:2509.08004, 2025"],"snippet":"As we increasingly use Artificial Intelligence (AI) in decision-making for industries like healthcare, finance, e-commerce, and even entertainment, it is crucial to also reflect on the ethical aspects of AI, for example the inclusivity and fairness of the …","url":["https://arxiv.org/pdf/2509.08004"]} {"year":"2025","title":"Evaluating Binary Decision Biases in Large Language Models: Implications for Fair Agent-Based Financial Simulations","authors":["A Vidler, T Walsh - arXiv preprint arXiv:2501.16356, 2025"],"snippet":"… To gain a context on natural language bias, we perform a sampling of the Common crawl (Common Crawl 2024) as recent research by (Tessema, Kedia, and Chung 2024) has found that it can provide a valuable data source for fine tuning …","url":["https://arxiv.org/pdf/2501.16356"]} {"year":"2025","title":"Evaluating Code-Mixing in LLMs Across 18 Languages","authors":["Y Yang, Y Chai - arXiv preprint arXiv:2507.18791, 2025"],"snippet":"Code-mixing, the practice of switching between languages within a conversation, presents unique challenges for traditional natural language processing. Existing benchmarks, such as LinCE and GLUECoS, are limited by narrow language …","url":["https://arxiv.org/pdf/2507.18791"]} +{"year":"2025","title":"Evaluating Concept Filtering Defenses against Child Sexual Abuse Material Generation by Text-to-Image Models","authors":["AM Cretu, K Kireev, A Abdalla, W Obinna, R Meier… - arXiv preprint arXiv …, 2025"],"snippet":"We evaluate the effectiveness of child filtering to prevent the misuse of text-to-image (T2I) models to create child sexual abuse material (CSAM). 
First, we capture the complexity of preventing CSAM generation using a game-based security definition …","url":["https://arxiv.org/pdf/2512.05707"]} {"year":"2025","title":"Evaluating CxG Generalisation in LLMs via Construction-Based NLI Fine Tuning","authors":["T Mackintosh, HT Madabushi, C Bonial - arXiv preprint arXiv:2509.16422, 2025"],"snippet":"We probe large language models' ability to learn deep form-meaning mappings as defined by construction grammars. We introduce the ConTest-NLI benchmark of 80k sentences covering eight English constructions from highly lexicalized to highly …","url":["https://arxiv.org/pdf/2509.16422"]} {"year":"2025","title":"Evaluating Dutch Speakers and Large Language Models on Standard Dutch: a grammatical Challenge Set based on the Algemene Nederlandse Spraakkunst","authors":["J Pestel, J Bloem, RG Alhama - Computational Linguistics in the Netherlands Journal, 2025"],"snippet":"This study evaluates the linguistic knowledge of Dutch Large Language Models (LLMs) by introducing a novel challenge set based on the Algemene Nederlandse Spraakkunst (ANS). The ANS is a comprehensive resource of Dutch prescriptive …","url":["https://www.clinjournal.org/clinj/article/download/216/224"]} {"year":"2025","title":"Evaluating GPT-and Reasoning-based Large Language Models on Physics Olympiad Problems: Surpassing Human Performance and Implications for Educational …","authors":["P Tschisgale, H Maus, F Kieser, B Kroehs, S Petersen… - arXiv preprint arXiv …, 2025"],"snippet":"… Moreover, many of the problems are not publicly shared through the Internet, and thus likely not part of the Common Crawl of the Internet, which is part of the training data for LLMs. 
The average difficulty of the problems increases across the …","url":["https://arxiv.org/pdf/2505.09438"]} {"year":"2025","title":"Evaluating Large Language Models as Raters in Large-Scale Writing Assessments: A Psychometric Framework for Reliability and Validity","authors":["Y Wang, J Huang, L Du, Y Guo, Y Liu, R Wang - Computers and Education: Artificial …, 2025"],"snippet":"In large-scale international writing assessments, human raters often exhibit inconsistency, undermining reliability and validity. Large language models (LLMs) offer a potential solution, but their assessment reliability remains underexplored …","url":["https://www.sciencedirect.com/science/article/pii/S2666920X25001213"]} +{"year":"2025","title":"Evaluating Large Language Models for Decision Support in Minimally Invasive Spine Surgery Triage and Procedural Categories","authors":["A Kartal, NF Manalil, CD Cheng, LK Chung, H Gebhard… - Global Spine Journal, 2025"],"snippet":"Study Design Vignette-based cross-sectional study. Objective Generative artificial intelligence (AI) programs such as large language models (LLMs) are reshaping treatment decision-making, yet applications in minimally invasive spine surgery (MISS) …","url":["https://journals.sagepub.com/doi/pdf/10.1177/21925682251411225"]} {"year":"2025","title":"Evaluating Large Language Models in Mongolian","authors":["DTOFC Yugo, MC Chu"],"snippet":"This paper presents a comprehensive evaluation for assessing large language model (LLM) capabilities in the Mongolian language, addressing a critical gap in multilingual LLM evaluation. 
We introduce MonMLU, a novel benchmark derived …","url":["https://www.anlp.jp/proceedings/annual_meeting/2025/pdf_dir/Q1-12.pdf"]} +{"year":"2025","title":"Evaluating LLM Capabilities in Low-Resource Contexts: A Case Study of Persian Linguistic and Cultural Tasks","authors":["J Heierli, RB Ganjineh, E Gavagnin"],"snippet":"We evaluate four representative large language models, namely GPT-4o, Gemini, Llama, and DeepSeek on a suite of linguistic and cultural tasks in Persian, covering grammar, paraphrasing, inference, translation, factual recall, analogical reasoning …","url":["https://acl-bg.org/proceedings/2025/LowResNLP%202025/pdf/2025.lowresnlp-1.12.pdf"]} {"year":"2025","title":"Evaluating LLMs for Anxiety, Depression, and Stress Detection Evaluating Large Language Models for Anxiety, Depression, and Stress Detection: Insights into …","authors":["M Arcan, DP Niland - arXiv preprint arXiv:2511.07044, 2025"],"snippet":"Mental health disorders affect over one-fifth of adults globally, yet detecting such conditions from text remains challenging due to the subtle and varied nature of symptom expression. This study evaluates multiple approaches for mental health …","url":["https://arxiv.org/pdf/2511.07044"]} {"year":"2025","title":"Evaluating LLMs for Demographic-Targeted Social Bias Detection: A Comprehensive Benchmark Study","authors":["A Majumdar, F Chen, J Li, X Wang - arXiv preprint arXiv:2510.04641, 2025"],"snippet":"Large-scale web-scraped text corpora used to train general-purpose AI models often contain harmful demographic-targeted social biases, creating a regulatory need for data auditing and developing scalable bias-detection methods. 
Although prior work …","url":["https://arxiv.org/pdf/2510.04641"]} {"year":"2025","title":"Evaluating LLMs on Chinese Idiom Translation","authors":["C Yang, Y Dou, D Heineman, X Wu, W Xu - arXiv preprint arXiv:2508.10421, 2025"],"snippet":"Idioms, whose figurative meanings usually differ from their literal interpretations, are common in everyday language, especially in Chinese, where they often contain historical references and follow specific structural patterns. Despite recent progress …","url":["https://arxiv.org/pdf/2508.10421"]} @@ -9946,7 +10037,7 @@ {"year":"2025","title":"Evaluating the Impact of Advanced LLM","authors":["S Kahl¹, F Löffler, M Maciol, F Ridder, M Schmitz… - AI in Education and Educational …"],"snippet":"This study evaluates the performance of Large Language Models (LLMs) as an Artificial Intelligence-based tutor for a university course. In particular, different advanced techniques are utilized, such as prompt engineering, Retrieval-Augmented-Generation …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=OEl3EQAAQBAJ&oi=fnd&pg=PA149&dq=commoncrawl&ots=lXOY6YAON4&sig=fokrLKJD5qaAzliwxGiFJkj1ZjE"]} {"year":"2025","title":"Evaluating the Impact of Data Scarcity on Model Performance in a Low-Resource Afrikaans Question Answering Model","authors":["TG Moape, F Mthombeni, A Stoman - 2025 Conference on Information …, 2025"],"snippet":"This paper evaluates the impact of data scarcity on a generative question-answering (QA) model for Afrikaans, using a hybrid architecture that combines BERT for contextual encoding and GPT-2 for generation. 
Trained on a limited dataset of 1,800 …","url":["https://ieeexplore.ieee.org/abstract/document/11155433/"]} {"year":"2025","title":"Evaluating the Robustness of Retrieval-Augmented Generation to Adversarial Evidence in the Health Domain","authors":["S Amirshahi, A Bigdeli, CLA Clarke, A Ghenai - arXiv preprint arXiv:2509.03787, 2025"],"snippet":"… The TREC 2020 track consists of 46 queries on COVID-19 treatments (eg, “Can pneumococcal vaccine prevent COVID-19?”), with candidate documents sourced from the Common Crawl News dataset2, covering the early months of the pandemic …","url":["https://arxiv.org/pdf/2509.03787"]} -{"year":"2025","title":"Evaluating Virtual Reality and Artificial lntelligence as Emerging Digital Tools for Mental Health Care","authors":["OT Almira - 2025"],"snippet":"… COMMON CRAWL Common Crawl is a nonprofit organization that provides alarg e, freely accessible repository of web crawl data. This … Founded in 2007, Common Crawl has continuously collected and archived web data, gathering petabytes of …","url":["https://gupea.ub.gu.se/bitstream/handle/2077/84035/Kappa%20Almira%20Osmanovic%20e-spik.pdf?sequence=1&isAllowed=y"]} +{"year":"2025","title":"Evaluating Virtual Reality and Artificial lntelligence as Emerging Digital Tools for Mental Health Care","authors":["M Boustedt, AO Thunström - 2025","OT Almira - 2025"],"snippet":"… COMMON CRAWL Common Crawl is a nonprofit organization that provides alarg e, freely accessible repository of web crawl data. 
This … Founded in 2007, Common Crawl has continuously collected and archived web data, gathering petabytes of …","url":["https://gupea.ub.gu.se/bitstream/handle/2077/84035/Kappa%20Almira%20Osmanovic%20e-spik.pdf?sequence=1&isAllowed=y","https://gupea.ub.gu.se/bitstreams/db24befa-52ec-45cc-9093-067eb1b6ad26/download"]} {"year":"2025","title":"Evaluation of a Node-based Automatic Short Answer Tool “NodeGrade”","authors":["DV Fischer, J Haug, P Schoppel, J Abke, M Becker… - Proceedings of the 6th …, 2025"],"snippet":"NodeGrade tries to provide a suitable solution for the problem of time-intensive short answer grading. This research focuses simultaneously on performance, functionality and user experience, which is underlined by a triangulated approach. The …","url":["https://dl.acm.org/doi/pdf/10.1145/3723010.3723021"]} {"year":"2025","title":"Evaluation of the phi-3-mini SLM for identification of texts related to medicine, health, and sports injuries","authors":["C Brogly, S Rjaibi, C Liang, E Lam, E Wang, A Levitan… - arXiv preprint arXiv …, 2025"],"snippet":"Small Language Models (SLMs) have potential to be used for automatically labelling and identifying aspects of text data for medicine/health-related purposes from documents and the web. As their resource requirements are significantly lower than …","url":["https://arxiv.org/pdf/2504.08764"]} {"year":"2025","title":"Even Small Reasoners Should Quote Their Sources: Introducing the Pleias-RAG Model Family","authors":["PC Langlais, P Chizhov, M Nee, CR Hinostroza… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce a new generation of small reasoning models for RAG, search, and source summarization. 
Pleias-RAG-350m and Pleias-RAG-1B are mid-trained on a large synthetic dataset emulating the retrieval of a wide variety of multilingual open …","url":["https://arxiv.org/pdf/2504.18225"]} @@ -9971,6 +10062,7 @@ {"year":"2025","title":"Exploring Cross-Lingual Knowledge Transfer via Transliteration-Based MLM Fine-Tuning for Critically Low-resource Chakma Language","authors":["A Khisa, NJ Lia, TM Nafis, Z Masud, T Pial, S Rayana… - arXiv preprint arXiv …, 2025"],"snippet":"As an Indo-Aryan language with limited available data, Chakma remains largely underrepresented in language models. In this work, we introduce a novel corpus of contextually coherent Bangla-transliterated Chakma, curated from Chakma literature …","url":["https://arxiv.org/pdf/2510.09032"]} {"year":"2025","title":"Exploring Gen-AI applications in building research and industry: A review","authors":["H Wan, J Zhang, Y Chen, W Xu, F Feng - Building Simulation, 2025"],"snippet":"This paper investigates the transformative potential of Generative AI (Gen-AI) technologies, particularly large language models, within the building industry. By leveraging these advanced AI tools, the study explores their application across key …","url":["https://link.springer.com/article/10.1007/s12273-025-1279-x"]} {"year":"2025","title":"Exploring geometric compression across languages in multilingual language models","authors":["E Ruiz Moreno - 2024"],"snippet":"This study explores geometric compression of linguistic data across languages in multilingual language models using the Europarl corpus, focusing on three models: BLOOM, XLMRoBERTa, and Mistral. 
We estimate the intrinsic dimension (ID) of …","url":["https://repositori.upf.edu/bitstreams/8807f44b-2314-4bc1-a2a5-b625ef910f6d/download"]} +{"year":"2025","title":"Exploring Individual Differences in AI‐Assisted and Corpus‐Based Data‐Driven Learning: Insights Into Learners' Perceptions and Language Learning Outcomes","authors":["AX Sun, A Mizumoto - International Journal of Applied Linguistics, 2025"],"snippet":"This study examined the comparative effectiveness of corpus‐based data‐driven learning (DDL; Linguee) and artificial intelligence (AI)‐assisted DDL (ChatGPT) among 69 Japanese university EFL learners. Both approaches produced …","url":["https://onlinelibrary.wiley.com/doi/abs/10.1111/ijal.70063"]} {"year":"2025","title":"Exploring large language models as reformulation assistants for the popularization of judicial texts","authors":["M Bouyé - In the Minds of Judges: Argumentative Discourse at the …, 2025"],"snippet":"This chapter explores argumentation in legal writing through the lens of linguistic complexity and expert-lay communication. Based on a sample of 30 decisions from the Supreme Court of Canada and using OpenAI’s ChatGPT (OpenAI, 2023), we …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=ITiUEQAAQBAJ&oi=fnd&pg=PA93&dq=commoncrawl&ots=O0iYL2Ld9X&sig=UZkHoOEklPncAbgswtWtUkRWdKU"]} {"year":"2025","title":"Exploring Large Language Models for Financial Applications: Techniques, Performance, and Challenges with FinMA","authors":["P Djagba, AY Saley - arXiv preprint arXiv:2510.05151, 2025"],"snippet":"This research explores the strengths and weaknesses of domain-adapted Large Language Models (LLMs) in the context of financial natural language processing (NLP). 
The analysis centers on FinMA, a model created within the PIXIU framework, which …","url":["https://arxiv.org/pdf/2510.05151"]} {"year":"2025","title":"Exploring LLM Embedding Potential for Dementia Detection Using Audio Transcripts","authors":["BA Llaca-Sánchez, LR García-Noguez… - Eng, 2025"],"snippet":"… The GloVe model—trained on data from Wikipedia 2014, Gigaword 5 archive of newswire text data, and the Common Crawl web pages dataset—is based on a co-occurrence matrix constructed from a large text corpus, which captures how frequently pairs of …","url":["https://www.mdpi.com/2673-4117/6/7/163"]} @@ -9982,6 +10074,7 @@ {"year":"2025","title":"Exploring The Effectiveness of In-Context Methods in Human-Aligned Large Language Models Across Languages","authors":["UA Prathama, A Purwarianti, S Cahyawijaya - JUTI: Jurnal Ilmiah Teknologi Informasi, 2025"],"snippet":"… From these drop-off points we can infer a practical resource threshold for current LLMs which is language with roughly a Joshi’s Class value of 4 (which corresponds to a certain typological and corpus-size bracket) and at least 0.5 % coverage in the …","url":["https://juti.if.its.ac.id/index.php/juti/article/download/1323/562"]} {"year":"2025","title":"Exploring the Effectiveness of Multilingual and Generative Large Language Models for Question Answering in Financial Texts","authors":["A Al-Laith - Proceedings of the Joint Workshop of the 9th Financial …, 2025"],"snippet":"This paper investigates the use of large language models (LLMs) for financial causality detection in the FinCausal 2025 shared task, focusing on generative and multilingual question answering (QA) tasks. 
Our study employed both generative …","url":["https://aclanthology.org/2025.finnlp-1.23.pdf"]} {"year":"2025","title":"Exploring the Impact of Attention Mechanisms in Big Data Analysis and Large Language Models","authors":["Z Mahal - American-Eurasian Journal of Scientific Research"],"snippet":"… We utilized multiple large-scale datasets for this study, including textual corpora from open repositories (eg, Common Crawl and Wikipedia) and structured big data sources such as financial transactions and sensor logs. The goal was to test the …","url":["https://isi.ac/storage/article-files/o4W2UHqvlxigggupTZS4LWt5Y5kTT8AkgMD7tuHh.pdf"]} +{"year":"2025","title":"Exploring the Impact of ChatGPT Use on Learners' Vocabulary Acquisition The Case of First-Year Students of English, at the University of Jijel","authors":["A LEBAILI, K BENMERZOUK - 2025"],"snippet":"This present piece of research aims at exploring the impact of using ChatGPT on vocabulary acquisition. In order to get familiar with the impact within first year English language students at the university of Jijel, the researcher used OpenAI …","url":["http://dspace.univ-jijel.dz:8080/xmlui/bitstream/handle/123456789/15719/Thesis%20FINAL%20UPDATE%20%281%29%20-%20kebieche%20amine.pdf?sequence=1&isAllowed=y"]} {"year":"2025","title":"Exploring the potential and limitations of large language models as virtual respondents for social science research","authors":["Z Rakovics, M Rakovics - Intersections. East European Journal of Society and …, 2024"],"snippet":"Social and linguistic differences encoded in various textual content available on the internet represent certain features of modern societies. 
For any scientific research which is interested in social differences mediated by language, the advent of large …","url":["https://intersections.tk.hu/index.php/intersections/article/download/1326/531"]} {"year":"2025","title":"Exploring the Potential of DeepSeek-R1 Model in Transforming Healthcare Solutions: An Overview","authors":["MR Raza, S Ahmed, FA Khokhar, A Varol - … on Digital Forensics and Security (ISDFS), 2025"],"snippet":"Over the past few decades, artificial intelligence (AI) has become more integrated into healthcare, with Large Language Models (LLMs) being a key component in improving healthcare decision-making. These LLMs' capacity to produce and …","url":["https://ieeexplore.ieee.org/abstract/document/11012057/"]} {"year":"2025","title":"Exploring the Utilities of the Rationales from Large Language Models to Enhance Automated Essay Scoring","authors":["H Jiao, H Choi, H Hua - arXiv preprint arXiv:2510.27131, 2025"],"snippet":"This study explored the utilities of rationales generated by GPT-4.1 and GPT-5 in automated scoring using Prompt 6 essays from the 2012 Kaggle ASAP data. Essay-based scoring was compared with rationale-based scoring. The study found in general …","url":["https://arxiv.org/pdf/2510.27131"]} @@ -10016,11 +10109,16 @@ {"year":"2025","title":"Feature-based Media Bias Detection","authors":["T Spinde - Automated Detection of Media Bias, 2025"],"snippet":"Thus far, we have presented a comprehensive literature review on media bias in Chap. 2, evaluated reliable measures for understanding media bias perception in Chap. 3, and introduced our two new datasets, MBIC and BABE, in Chap. 4. 
We now …","url":["https://link.springer.com/chapter/10.1007/978-3-658-47798-1_5"]} {"year":"2025","title":"FED: Fast and Efficient Dataset Deduplication Framework with GPU Acceleration","authors":["Y Son, C Kim, J Lee - arXiv preprint arXiv:2501.01046, 2025"],"snippet":"… The RealNews dataset is a large English corpus of news articles from Common Crawl, and C4 is a filtered version of Common Crawl. For C4, we sampled 100GB out of the total 700GB. We choose these datasets for two reasons. …","url":["https://arxiv.org/pdf/2501.01046"]} {"year":"2025","title":"FEVO: Financial Knowledge Expansion and Reasoning Evolution for Large Language Models","authors":["B Pang, Y Ouyang, H Xu, Z Jia, P Li, S Wen, L Wang… - arXiv preprint arXiv …, 2025"],"snippet":"Advancements in reasoning for large language models (LLMs) have lead to significant performance improvements for LLMs in various fields such as mathematics and programming. However, research applying these advances to the …","url":["https://arxiv.org/pdf/2507.06057"]} +{"year":"2025","title":"FFTrainer: Fast Failover in Large-Language Model Training with Almost-Free State Management","authors":["B Zhao, Y Wang, C Liu, J Pan, G Yang, R Liu, T Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"… We perform pre-training using the Common Crawl dataset [9]. We train the models without quantization using fp16. 
We choose the tensor/pipeline parallel parameter 𝑡 and 𝑝 so the training fits the GPU memory, then we fully utilize all 128 …","url":["https://arxiv.org/pdf/2512.03644"]} {"year":"2025","title":"FicSim: A Dataset for Multi-Faceted Semantic Similarity in Long-Form Fiction","authors":["N Johnson, A Bertsch, ME Deal, E Strubell - arXiv preprint arXiv:2510.20926, 2025"],"snippet":"… reasons: First, AO3 has made significant efforts to discourage web scraping, including blocking Common Crawl scraping in 2022 (Works… included in CommonCrawl datasets, and Archive of Our Own lawyers went before the US Copyright Office to …","url":["https://arxiv.org/pdf/2510.20926"]} +{"year":"2025","title":"Fighting deepfake text: towards building robust defences against arabic aI-generated text","authors":["A Boutadjine - 2025"],"snippet":"The rapid advancement of Natural Language Processing (NLP), particularly through transformer-based architectures, has led to powerful large language models (LLMs) capable of generating human-like text for a variety of tasks, including question …","url":["http://dspace.univ-setif.dz:8888/jspui/bitstream/123456789/6558/1/2521.pdf"]} {"year":"2025","title":"Fighting Fire with Fire: Journalistic Investigations of Artificial Intelligence Using Artificial Intelligence Techniques","authors":["J Veerbeek - Journalism Practice, 2025"],"snippet":"… – the Common Crawl. However, GPT-3's training process involved a narrower selection, with slightly more than 900,000 Dutch language pages (OpenAI Citation2020). 
This disparity stems from the stricter filtering criteria applied by the GPT-3 creators to …","url":["https://www.tandfonline.com/doi/pdf/10.1080/17512786.2025.2479499"]} {"year":"2025","title":"Figurative Archive: an open dataset and web-based application for the study of metaphor","authors":["M Bressler, V Mangiaterra, P Canal, F Frau, F Luciani… - arXiv preprint arXiv …, 2025"],"snippet":"… Semantic distance between the topic and vehicle was calculated using the Italian word embeddings from fastText58, a set of pre-trained word vectors based on Common Crawl and Wikipedia. The web interface provides access to these …","url":["https://arxiv.org/pdf/2503.00444"]} {"year":"2025","title":"Filter Like You Test: Data-Driven Data Filtering for CLIP Pretraining","authors":["M Shechter, Y Carmon - arXiv preprint arXiv:2503.08805, 2025"],"snippet":"We introduce Filter Like You Test (FLYT), a method for curating large-scale vision-language datasets that learns the usefulness of each data point as a pretraining example. FLYT trains a scoring model that learns to weigh each example using gradient …","url":["https://arxiv.org/pdf/2503.08805"]} +{"year":"2025","title":"Filtering of Czech Texts","authors":["J Brichta"],"snippet":"This paper compares several approaches to text filtering for the Czech language. Two manually created gold standard datasets were prepared for evaluating filter performance. We will also examine what characteristics could contribute to lowering …","url":["http://nlp.fi.muni.cz/raslan/2025/paper12.pdf"]} +{"year":"2025","title":"FIN-bench-v2: A Unified and Robust Benchmark Suite for Evaluating Finnish Large Language Models","authors":["J Kytöniemi, J Piha, A Reunamo, F Vitiugin, F Mehryary… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce FIN-bench-v2, a unified benchmark suite for evaluating large language models in Finnish. 
FIN-bench-v2 consolidates Finnish versions of widely used benchmarks together with an updated and expanded version of the original FIN-bench …","url":["https://arxiv.org/pdf/2512.13330"]} {"year":"2025","title":"FinBERT2: A Specialized Bidirectional Encoder for Bridging the Gap in Finance-Specific Deployment of Large Language Models","authors":["X Xu, F Wen, B Chu, Z Fu, Q Lin, J Liu, B Fei, Z Yang… - arXiv preprint arXiv …, 2025"],"snippet":"In natural language processing (NLP), the focus has shifted from encoder-only tiny language models like BERT to decoder-only large language models(LLMs) such as GPT-3. However, LLMs' practical application in the financial sector has revealed …","url":["https://arxiv.org/pdf/2506.06335"]} +{"year":"2025","title":"Findings of the 2017 Conference on Machine Translation (WMT17) Ondˇrej Bojar Charles University","authors":["R Chatterjee, C Federmann, Y Graham, B Haddow… - Corpus"],"snippet":"… Common Crawl Parallel Corpus … Common Crawl Language Model Data …","url":["https://www.tara.tcd.ie/tara8/server/api/core/bitstreams/81a83524-af85-48dd-99b5-5f154cd9588f/content"]} {"year":"2025","title":"Findings of the WMT 2025 Shared Task of the Open Language Data Initiative","authors":["D Dale, F Meta, L Burchell, J Maillard, I Abdulmumin…"],"snippet":"… The first part of the dataset, SMOLSENT, is based on 863 English sentences semi-manually selected from Common Crawl data5 to cover 5.5k of the most common English words (obtained by joining the GATITOS wordlist and the most frequent words in …","url":["https://www2.statmt.org/wmt25/pdf/2025.wmt-1.26.pdf"]} {"year":"2025","title":"Findings of the wmt25 general machine translation shared task: Time to stop evaluating on easy test sets","authors":["T Kocmi, E Artemova, E Avramidis, R Bawden, O Bojar… - Proceedings of the Tenth …, 2025"],"snippet":"This paper presents the results of the General Machine Translation Task organized as part of the 2025 Conference on Machine Translation (WMT). 
Participants were invited to build systems for any of the 30 language pairs. For half of these pairs, we …","url":["https://www2.statmt.org/wmt25/pdf/2025.wmt-1.22.pdf"]} {"year":"2025","title":"Fine-grained Fallacy Detection with Human Label Variation","authors":["A Ramponi, A Daffara, S Tonelli - arXiv preprint arXiv:2502.13853, 2025"],"snippet":"We introduce Faina, the first dataset for fallacy detection that embraces multiple plausible answers and natural disagreement. Faina includes over 11K span-level annotations with overlaps across 20 fallacy types on social media posts in Italian …","url":["https://arxiv.org/pdf/2502.13853"]} @@ -10030,15 +10128,19 @@ {"year":"2025","title":"Fine-Tuning Deep Learning Models for Sentiment Analysis: A Study on Movie Titles","authors":["H Qasim, M Zain, L Aziz, M Ayaz - Iqra Journal of Engineering and Computing, 2025"],"snippet":"… Pre-trained GloVe embeddings obtain their main strength from their training against extensive Wikipedia and Common Crawl datasets that strengthen their linguistic understanding. Due to previous training GloVe embeddings acquire …","url":["https://journals.iqra.edu.pk/ojs/index.php/ijec/article/download/27/6"]} {"year":"2025","title":"Fine-tuning Open-source Large Language Models for Processing Open-vocabulary Commands for Robotic Navigation","authors":["J Palmulaakso - 2025"],"snippet":"This thesis investigates using fine-tuned open-source Large Language Models (LLMs) for interpreting open-vocabulary commands for robotic navigation tasks. 
In this study, this means retrieving objects from scene graphs based on freeform language …","url":["https://aaltodoc.aalto.fi/bitstreams/150f2bf9-2c1b-4695-ba9c-1da6e679a19c/download"]} {"year":"2025","title":"Fine-Tuning Small Language Models for Domain-Specific AI: An Edge AI Perspective","authors":["R Aralimatti, SAG Shakhadri, KR Kruthika, K Angadi - 2025"],"snippet":"… While the general-purpose pre-training corpus includes sources such as Common Crawl and curated datasets, the Shakti-250M model incorporates domain-specific texts to enhance applicability in specialized fields such as healthcare, finance, and …","url":["https://www.preprints.org/frontend/manuscript/4fe07952a7c6c406b82f8177c6c45340/download_pub"]} +{"year":"2025","title":"Fine-Tuning the mT5 Model on Bidirectional Myanmar and Tedim Chin Machine Translation System","authors":["CZ Man, SSM Win, KLL Khine"],"snippet":"Nowadays, machine translation (MT) is a vital tool for overcoming language barriers, especially for underrepresented and low-resource languages. This study explores the effectiveness of the mT5 neural machine translation model in facilitating …","url":["https://www.iaeng.org/IJCS/issues_v52/issue_12/IJCS_52_12_03.pdf"]} +{"year":"2025","title":"FineFreq: A Multilingual Character Frequency Dataset from Web-Scale Text","authors":["B XU - arXiv preprint arXiv:2512.09701, 2025"],"snippet":"We present FineFreq, a large-scale multilingual character frequency dataset derived from the FineWeb and FineWeb2 corpora, covering over 1900 languages and spanning 2013-2025. 
The dataset contains frequency counts for 96 trillion …","url":["https://arxiv.org/pdf/2512.09701"]} {"year":"2025","title":"FineMedLM-o1: Enhancing Medical Knowledge Reasoning Ability of LLM from Supervised Fine-Tuning to Test-Time Training","authors":["T Cheng, Y Wang, W He, Q Wang, Y Cheng, Y Zhang… - Second Conference on Language …"],"snippet":"… 2024), we aim to use internet corpora (eg, Common Crawl, CC) as the foundation for our medical knowledge texts. CC inherently includes large-scale question-answer pairs and knowledge-rich textbooks (Shao et al.…","url":["https://openreview.net/pdf?id=7ZwuGZCopw"]} {"year":"2025","title":"FineMedLM-o1: Enhancing the Medical Reasoning Ability of LLM from Supervised Fine-Tuning to Test-Time Training","authors":["H Yu, T Cheng, Y Cheng, R Feng - arXiv preprint arXiv:2501.09213, 2025"],"snippet":"Recent advancements in large language models (LLMs) have shown promise in medical applications such as disease diagnosis and treatment planning. However, most existing medical LLMs struggle with the advanced reasoning required for …","url":["https://arxiv.org/pdf/2501.09213"]} {"year":"2025","title":"FinerWeb-10BT: Refining Web Data with LLM-Based Line-Level Filtering","authors":["E Henriksson, O Tarkka, F Ginter - arXiv preprint arXiv:2501.07314, 2025"],"snippet":"… Since 2008, CommonCrawl has collected a corpus of approximately 10 petabytes of web content (Baack, 2024). Despite its size, CommonCrawl … C4 uses the WET files provided by CommonCrawl, which come with pre-extracted plaintext, whereas …","url":["https://arxiv.org/pdf/2501.07314"]} +{"year":"2025","title":"FiNERweb: Datasets and Artifacts for Scalable Multilingual Named Entity Recognition","authors":["J Golde, P Haller, A Akbik - arXiv preprint arXiv:2512.13884, 2025"],"snippet":"… This step is necessary because FineWeb-2 is sourced from 96 CommonCrawl snapshots and thus possibly contains many low-quality data points for NER. 
To do so, we train a regression model that can score the potential quality of a passage for …","url":["https://arxiv.org/pdf/2512.13884"]} {"year":"2025","title":"FineScope: Precision Pruning for Domain-Specialized Large Language Models Using SAE-Guided Self-Data Cultivation","authors":["C Bhattacharyya, Y Kim - arXiv preprint arXiv:2505.00624, 2025"],"snippet":"Training large language models (LLMs) from scratch requires significant computational resources, driving interest in developing smaller, domain-specific LLMs that maintain both efficiency and strong task performance. Medium-sized …","url":["https://arxiv.org/pdf/2505.00624"]} {"year":"2025","title":"Finetuning LLMs for Grammatical Error Correction in English and Greek Texts","authors":["D Kapelles, A Andriopoulos, D Koutsomitropoulos - 2025"],"snippet":"… T5 was pre-trained on 750 GB of English-language text derived from the public web Common Crawl. mT5 was pre-trained on data from all 71 monthly web data published by Common Crawl so far, which is more than the source data used by T5. …","url":["https://www.ceid.upatras.gr/webpages/koutsomi/pdf/petra2025.pdf"]} {"year":"2025","title":"FineWeb-Conv: A Method for Finding Good Conversation Data","authors":["RJ Moore, S An, JP Gala, D Jadav - Workshop on Preparing Good Data for …, 2025"],"snippet":"… Initially, it can be employed to identify high-quality conversation data within a collection of diverse documents, like Fineweb or Common Crawl. 
Here, quality refers to the presence of natural interaction patterns, not the information or knowledge …","url":["https://openreview.net/pdf?id=EKF7dyuCGe"]} {"year":"2025","title":"FineWeb2: One Pipeline to Scale Them All--Adapting Pre-Training Data Processing to Every Language","authors":["G Penedo, H Kydlíček, V Sabolčec, B Messmer… - arXiv preprint arXiv …, 2025"],"snippet":"… Finally, we use our pipeline to process almost 100 Common Crawl1 snapshots spanning the summer of 2013 to April 2024 to create … We extend our gratitude to the Common Crawl project for freely providing and maintaining their regular crawls …","url":["https://arxiv.org/pdf/2506.20920"]} {"year":"2025","title":"First polarization study of the M87 jet and active galactic nuclei at submillimeter wavelengths with ALMA","authors":["C Goddi, DF Carlos - arXiv preprint arXiv:2505.10181, 2025"],"snippet":"We present full-polarization observations at $\\lambda = 0.87$ mm (345 GHz) conducted with the Atacama Large Millimeter/submillimeter Array (ALMA) toward Messier 87 (M87) and seven other radio-loud active galactic nuclei (AGN). We …","url":["https://arxiv.org/pdf/2505.10181"]} {"year":"2025","title":"FlexOlmo: Open Language Models for Flexible Data Use","authors":["W Shi, A Bhagia, K Farhat, N Muennighoff, P Walsh… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce FlexOlmo, a new class of language models (LMs) that supports (1) distributed training without data sharing, where different model parameters are independently trained on closed datasets, and (2) data-flexible inference, where …","url":["https://arxiv.org/pdf/2507.07024"]} +{"year":"2025","title":"FOAM: Blocked State Folding for Memory-Efficient LLM Training","authors":["Z Wen, J Wang, P Luo, D Li, T Sun - arXiv preprint arXiv:2512.07112, 2025"],"snippet":"Large language models (LLMs) have demonstrated remarkable performance due to their large parameter counts and extensive training data. 
However, their scale leads to significant memory bottlenecks during training, especially when using memory-intensive …","url":["https://arxiv.org/pdf/2512.07112"]} {"year":"2025","title":"Forgetting: A New Mechanism Towards Better Large Language Model Fine-tuning","authors":["AT Ghahrizjani, A Taban, Q Wang, S Ye, A Mirzaei… - arXiv preprint arXiv …, 2025"],"snippet":"Supervised fine-tuning (SFT) plays a critical role for pretrained large language models (LLMs), notably enhancing their capacity to acquire domain-specific knowledge while preserving or potentially augmenting their general-purpose …","url":["https://arxiv.org/pdf/2508.04329"]} {"year":"2025","title":"Form and function: automatic methods for prediction of functions","authors":["S Sharoff"],"snippet":"From the viewpoint of Systemic Functional Linguistics (SFL), language has evolved in society to provide means for negotiating with others about offering and requesting information or actions. These communicative needs are realised through the options …","url":["https://ssharoff.github.io/publications/2025-sfl-nlu.pdf"]} {"year":"2025","title":"Formalising lexical and syntactic diversity for data sampling in French","authors":["L Estève, M Scholivet, A Savary - arXiv preprint arXiv:2501.08003, 2025"],"snippet":"Diversity is an important property of datasets and sampling data for diversity is useful in dataset creation. Finding the optimally diverse sample is expensive, we therefore present a heuristic significantly increasing diversity relative to random sampling. 
We …","url":["https://arxiv.org/pdf/2501.08003"]} @@ -10061,6 +10163,7 @@ {"year":"2025","title":"From Bias to Balance How Multilingual Dataset Composition Affects Tokenizer Performance Across Languages","authors":["A Selvamurugan, R Dandekar, R Dandekar, S Panat - NeurIPS 2025 Workshop on …"],"snippet":"Tokenization serves as a crucial preprocessing step in multilingual language models, affecting performance in both high-resource and low-resource languages. However, current tokenizers seem to adopt language biases due to unbalanced training …","url":["https://openreview.net/pdf?id=kIRynQytBj"]} {"year":"2025","title":"From ChatGPT to DeepSeek AI: A Comprehensive Analysis of Evolution, Deviation, and Future Implications in AI-Language Models","authors":["S Singh, S Bansal, AE Saddik, M Saini - arXiv preprint arXiv:2504.03219, 2025"],"snippet":"… Training was carried out on the huge data, comprising of approximately 570GB of text after filtering, sourced from Common Crawl (60% of the training mix), WebText2 (19 billion, 22% training weight), Books1 (19 billion, 8% training weight), Books2 (55 …","url":["https://arxiv.org/pdf/2504.03219"]} {"year":"2025","title":"From classification to taxonomy: Automated structuring of vehicle repair names in multilingual corpora","authors":["SV Mashtalir, OV Nikolenko - Вісник сучасних інформаційних технологій, 2025"],"snippet":"This study introduces and rigorously validates a hybrid, five-stage Natural Language Processing pipeline that transforms unstructured, bilingual repair-order text into fully navigable, hierarchical action taxonomy – bridging the gap between flat keyword …","url":["https://hait.od.ua/index.php/journal/article/download/185/179"]} +{"year":"2025","title":"From Data Scarcity to Data Care: Reimagining Language Technologies for Serbian and other Low-Resource Languages","authors":["SA Ubois - arXiv preprint arXiv:2512.10630, 2025"],"snippet":"Large language models are commonly trained on dominant languages like 
English, and their representation of low resource languages typically reflects cultural and linguistic biases present in the source language materials. Using the Serbian …","url":["https://arxiv.org/pdf/2512.10630"]} {"year":"2025","title":"From data to detection: Developing a corpus and training language models for the identification of anti-refugee narratives in Spanish","authors":["J Mata, E Gualda, V Pachón, C Rebollo, JL Domínguez - 2025"],"snippet":"This study addresses the automatic detection of negative anti-refugee messages in Spanish texts, using language models based on pre-trained Transformers models. Despite numerous studies on hate speech detection, few have concentrated on …","url":["https://digibug.ugr.es/bitstream/handle/10481/107287/1-s2.0-S2590005625001535-main.pdf?sequence=1&isAllowed=y"]} {"year":"2025","title":"From Data to Grassroots Initiatives: Leveraging Transformer-Based Models for Detecting Green Practices in Social Media","authors":["A Glazkova, O Zakharova - Proceedings of the 1st Workshop on Ecology …, 2025"],"snippet":"Green practices are everyday activities that support a sustainable relationship between people and the environment. Detecting these practices in social media helps track their prevalence and develop recommendations to promote eco-friendly …","url":["https://aclanthology.org/2025.nlp4ecology-1.2.pdf"]} {"year":"2025","title":"From Embeddings to Explainability: A Tutorial on Large-Language-Model-Based Text Analysis for Behavioral Scientists","authors":["R Debelak, TK Koch, M Aßenmacher, C Stachl - Advances in Methods and Practices …, 2025"],"snippet":"Large language models (LLMs) are transforming research in psychology and the behavioral sciences by enabling advanced text analysis at scale. 
Their applications range from the analysis of social media posts to infer psychological traits to the …","url":["https://journals.sagepub.com/doi/pdf/10.1177/25152459251351285"]} @@ -10071,13 +10174,17 @@ {"year":"2025","title":"From inpainting to painting: exploring conservation of Chinese paintings with generative artificial intelligence","authors":["S Dai - 2024"],"snippet":"Chinese painting conservation faces several challenges, such as the inherent conflict between the conservation principles of minimal intervention, recognizability, and reversibility (Muñoz-Viñas, 2012), and the traditional pursuit of completeness in …","url":["https://summit.sfu.ca/_flysystem/fedora/2025-01/etd23518.pdf"]} {"year":"2025","title":"From keywords to key embeddings–contrasting French and Swedish web registers using multilingual deep learning","authors":["S Hellström, V Skantsi, A Salmela, V Laippala - Corpus Linguistics and Linguistic …, 2025"],"snippet":"The pervasiveness of the internet has given web language use a central role in society. However, the lack of multilingual corpora and scalable methods has led to the focus on English in web language research. To address this gap, the present …","url":["https://www.degruyter.com/document/doi/10.1515/cllt-2024-0070/html"]} {"year":"2025","title":"From Large AI Models to Agentic AI: A Tutorial on Future Intelligent Communications","authors":["F Jiang, C Pan, L Dong, K Wang, OA Dobre, M Debbah - arXiv preprint arXiv …, 2025"],"snippet":"With the advent of 6G communications, intelligent communication systems face multiple challenges, including constrained perception and response capabilities, limited scalability, and low adaptability in dynamic environments. 
This tutorial …","url":["https://arxiv.org/pdf/2505.22311"]} +{"year":"2025","title":"From Negative to Positive: Automated Text Style Transfer in Turkish Banking","authors":["G Tahaoglu, K Dilbaz, T Dogan, H Guner"],"snippet":"In this study, we address positive style transfer for customer communication in the Turkish banking domain, reframing negatively phrased sentences into constructive alternatives while preserving meaning. We build a Turkish sentence-level dataset …","url":["http://www.eleco.org.tr/ELECO2025/Eleco2025-Papers/171.pdf"]} {"year":"2025","title":"From Origins to Future: The Evolution and Prospects of Artificial Intelligence in the Reasoning Era","authors":["B Yang, J Qu - J. Int. Eco. Glo. Gov, 2025"],"snippet":"With the release of the OpenAI o1 model, artificial intelligence (AI) technology has ushered in a new era of Reasoning. This article reviews the development history of AI technology, from the early days of symbolic reasoning and logic programming, to …","url":["https://www.mospbs.com/uploads/files/2025/03/20250304/c9eeafc698c8337c04a19b46734e97f9.pdf"]} {"year":"2025","title":"From Past to Present: A Survey of Malicious URL Detection Techniques, Datasets and Code Repositories","authors":["Y Tian, Y Yu, J Sun, Y Wang - arXiv preprint arXiv:2504.16449, 2025"],"snippet":"Malicious URLs persistently threaten the cybersecurity ecosystem, by either deceiving users into divulging private data or distributing harmful payloads to infiltrate host systems. Gaining timely insights into the current state of this ongoing …","url":["https://arxiv.org/pdf/2504.16449"]} {"year":"2025","title":"From Pixels to Words--Towards Native Vision-Language Primitives at Scale","authors":["H Diao, M Li, S Wu, L Dai, X Wang, H Deng, L Lu, D Lin… - arXiv preprint arXiv …, 2025"],"snippet":"The edifice of native Vision-Language Models (VLMs) has emerged as a rising contender to typical modular VLMs, shaped by evolving model architectures and training paradigms. 
Yet, two lingering clouds cast shadows over its widespread …","url":["https://arxiv.org/pdf/2510.14979"]} {"year":"2025","title":"From PMI to Bots","authors":["K Church - International Journal of Lexicography, 2025"],"snippet":"My paper with Patrick Hanks on PMI (pointwise mutual information) was the most successful paper I ever wrote, or ever will write. I believe the paper was successful because it appealed to a number of different audiences for a number of different …","url":["https://academic.oup.com/ijl/advance-article/doi/10.1093/ijl/ecaf007/8160774"]} {"year":"2025","title":"From Pre-Trained Language Models to Agentic AI: Evolution and Architectures for Autonomous Intelligence","authors":["A Koubaa - 2025"],"snippet":"In this position paper, we present a comprehensive analysis of the evolution of artificial intelligence from pre-trained language models to agentic AI systems designed for autonomous intelligence. This evolution is structured across seven …","url":["https://www.preprints.org/frontend/manuscript/12afe22c52fa2522b9a5ad67711cf3be/download_pub"]} +{"year":"2025","title":"From salad bar feminism to denotative portraiture: How neoliberalism shapes feminism in AI-Generated visuals","authors":["C Bouko, G Colombo, T Joseph, I Cenni - Feminist Media Studies, 2025"],"snippet":"This article investigates how three major generative AI systems—DALL·E 3, Midjourney, and Stable Diffusion—visualize feminism through 2,400 images, using both quantitative and qualitative content analysis. We identify three overarching …","url":["https://www.tandfonline.com/doi/full/10.1080/14680777.2025.2593317"]} {"year":"2025","title":"From Scarcity to Capability: Empowering Fake News Detection in Low-Resource Languages with LLMs","authors":["HM Shibu, S Datta, MS Miah, N Sami, MS Chowdhury…"],"snippet":"The rapid spread of fake news presents a significant global challenge, particularly in lowresource languages like Bangla, which lack adequate datasets and detection tools. 
Although manual fact-checking is accurate, it is expensive and slow to prevent …","url":["https://www.researchgate.net/profile/Hrithik-Majumdar/publication/387798798_From_Scarcity_to_Capability_Empowering_Fake_News_Detection_in_Low-Resource_Languages_with_LLMs/links/677e006b18ad70589ea34325/From-Scarcity-to-Capability-Empowering-Fake-News-Detection-in-Low-Resource-Languages-with-LLMs.pdf"]} +{"year":"2025","title":"From Show Programmes to Data: Designing a Workflow to Make Performing Arts Ephemera Accessible Through Language Models","authors":["C Bardiot, PC Langlais, B Jacquemin, J Hart… - arXiv preprint arXiv …, 2025"],"snippet":"Many heritage institutions hold extensive collections of theatre programmes, which remain largely underused due to their complex layouts and lack of structured metadata. In this paper, we present a workflow for transforming such documents into …","url":["https://arxiv.org/pdf/2512.07452"]} {"year":"2025","title":"From Small to Large Language Models: Revisiting the Federalist Papers","authors":["SW Jeong, V Rockova - arXiv preprint arXiv:2503.01869, 2025"],"snippet":"… et al., 2019; Brown et al., 2020), BERT (Devlin et al., 2019), RoBERTa (Liu et al., 2019), BART (Lewis et al., 2019), and LLaMA (Touvron et al., 2023) rely heavily on massive training datasets sourced from diverse corpora, including BookCorpus …","url":["https://arxiv.org/pdf/2503.01869"]} +{"year":"2025","title":"From Tabula Rasa to Inductive Bias: Reframing Locke's Problem in the Age of Generative AI","authors":["X Zhang, H Li"],"snippet":"Large language models (LLMs) often appear to vindicate a radical empiricist picture: train on vast corpora of experience-like text, and capacities emerge without explicit symbolic rules. 
Yet contemporary machine learning research repeatedly …","url":["https://philpapers.org/archive/ZHAFTR-3.pdf"]} {"year":"2025","title":"From Transformers to llm","authors":["A Aslam"],"snippet":"… LLMs typically rely on massive text corpora drawn from web crawl data (eg, Common Crawl), books, and Wikipedia. Tokenization schemes such as Byte-Pair Encoding (BPE) [23] and SentencePiece [24] balance vocabulary size against …","url":["https://www.academia.edu/download/123103199/mk.pdf"]} {"year":"2025","title":"From Translation to Generative LLMs: Classification of Code-Mixed Affective Tasks","authors":["A Yadav, T Garg, M Klemen, M Ulcar, B Agarwal… - IEEE Transactions on …, 2025"],"snippet":"… It was trained on large Indian corpora consisting of English and 16 Indian languages where they utilize publicly available corpora from Wikipedia and Common Crawl, including Hindi and Tamil, which we consider in this work. We use …","url":["https://ieeexplore.ieee.org/abstract/document/10938193/"]} {"year":"2025","title":"From Tweets to Insights: Social Opinion Mining on Corporate Social Responsibility","authors":["C Leggerini, M Bannò - Corporate Social Responsibility and Environmental …, 2025"],"snippet":"Corporate Social Responsibility (CSR) has become increasingly critical as firms seek to balance financial goals with social and environmental responsibilities. Our study introduces a three‐phase structured method to analyze stakeholders' opinions …","url":["https://onlinelibrary.wiley.com/doi/pdf/10.1002/csr.70016"]} @@ -10091,12 +10198,15 @@ {"year":"2025","title":"Future of AI Models: A Computational perspective on Model collapse","authors":["T Satharasi, SS Iyengar - arXiv preprint arXiv:2511.05535, 2025"],"snippet":"… Using a filtered subset of the Common Crawl corpus (English-language Wikipedia articles), we compute year-wise semantic similarity from 2013 to 2025 through Transformer-based embeddings and cosine metrics. 
Results indicate a …","url":["https://arxiv.org/pdf/2511.05535"]} {"year":"2025","title":"Future-Proof Yourself: An AI Era Survival Guide","authors":["T Kim - arXiv preprint arXiv:2504.04378, 2025"],"snippet":"Future-Proof Yourself is a practical guide that helps readers navigate the fast-changing world of artificial intelligence in everyday life. The book begins by explaining how computers learn from data in simple, relatable terms, and gradually introduces the …","url":["https://arxiv.org/pdf/2504.04378"]} {"year":"2025","title":"FuxiMT: Sparsifying Large Language Models for Chinese-Centric Multilingual Machine Translation","authors":["S Zhu, T Dong, B Li, D Xiong - arXiv preprint arXiv:2505.14256, 2025"],"snippet":"In this paper, we present FuxiMT, a novel Chinese-centric multilingual machine translation model powered by a sparsified large language model (LLM). We adopt a two-stage strategy to train FuxiMT. We first pre-train the model on a massive …","url":["https://arxiv.org/pdf/2505.14256"]} +{"year":"2025","title":"GAIA Ecosystem Paradigm","authors":["J Donne - Ophthalmology"],"snippet":"No man is an island entire of itself; every man is a piece of the continent, a part of the main; if a clod be washed away by the sea, Europe is the less, as well as if a promontory were, as well as any manner of thy friends or of thine own were; any …","url":["https://www.ruppin.ac.il/en/research-authority/research-laboratories/laboratory-for-knowledge-engineering-and-robotics/lab-paradigm/"]} {"year":"2025","title":"Gaining the Edge: Visualizing Information Advantage through Machine Learning-Driven Dashboards","authors":["A El Ouadi, W Knowlton, A Pimentel, D Beskow - 2025"],"snippet":"… , two straightforward ways to access news data are through the Common Crawl News (CC-News) feed or the Global Database of … Common Crawl News and GDELT data for diverse academic, commercial, and government use cases. 
This …","url":["https://www.ieworldconference.org/content/WP2025/Papers/GDRKMCC25_11.pdf"]} {"year":"2025","title":"GENDER BIAS DETECTION IN GREEK LANGUAGE MODELS","authors":["CG Grigoriadis - 2025"],"snippet":"Gender bias in language models has emerged as a critical ethical and technical challenge in Natural Language Processing (NLP). This thesis investigates the presence and extent of gender bias in Greek language models, focusing on two …","url":["https://pergamos.lib.uoa.gr/uoa/dl/object/5299629/file.pdf"]} {"year":"2025","title":"Gender bias in language and artificial intelligence tools","authors":["O Marki"],"snippet":"This master’s thesis represents an interdisciplinary approach to understanding gender bias manifested in the output of artificial intelligence tools, which are based on language models. Biases and stereotypes become problematic when we …","url":["https://www.academia.edu/download/112701039/20240324_MagistrskaNaloga_AnkaSupej_ZADNJA_VERZIJA_za_oddajo_EN_PDFA.pdf"]} {"year":"2025","title":"Gender Bias in Translation Automation: Addressing Bias and Inequality","authors":["MG González - The Social Impact of Automating Translation, 2024"],"snippet":"Machine translation (MT) has become an essential tool for overcoming language barriers and facilitating cross-cultural communication. 
However, it has also raised significant concerns, particularly regarding gender bias—the tendency of MT …","url":["https://www.taylorfrancis.com/chapters/edit/10.4324/9781003465522-6/gender-bias-translation-automation-marta-garc%C3%ADa-gonz%C3%A1lez"]} {"year":"2025","title":"General purpose models for the chemical sciences","authors":["N Alampara, A Aneesh, M Ríos-García, A Mirza… - arXiv preprint arXiv …, 2025"],"snippet":"… One can utilize a “top-down” approach where a large and diverse pool of data—eg, results from web-crawled resources such as CommonCrawl… filtered CommonCrawl for mathematical text using a combination of regular expressions …","url":["https://arxiv.org/pdf/2507.07456"]} +{"year":"2025","title":"Generalist Foundation Models Are Not Clinical Enough for Hospital Operations","authors":["LY Jiang, A Chen, X Han, XC Liu, R Dua, K Eaton… - arXiv preprint arXiv …, 2025"],"snippet":"Hospitals and healthcare systems rely on operational decisions that determine patient flow, cost, and quality of care. Despite strong performance on medical knowledge and conversational benchmarks, foundation models trained on general …","url":["https://arxiv.org/pdf/2511.13703"]} {"year":"2025","title":"Generalizable Cross-Lingual Cognitive Distortion Detection with Standardized Annotations and Multi-Task Learning","authors":["H Qi, N Bai, J Li, W Zhai, Q Zhao, Q Gao, BX Yang… - Findings of the Association …, 2025"],"snippet":"… ), pre-trained on 2.5 TB of CommonCrawl data1 from 100 languages. Its key features include extended training steps, dynamic masking, and unigram SentencePiece tokenization, enabling consistent crosslanguage processing. 
1https://commoncrawl.org/ …","url":["https://aclanthology.org/2025.findings-acl.826.pdf"]} +{"year":"2025","title":"Generalization Gaps in Political Fake News Detection: An Empirical Study on the LIAR Dataset","authors":["SM Hasan, S Roy, AJ Nafis - arXiv preprint arXiv:2512.18533, 2025"],"snippet":"The proliferation of linguistically subtle political disinformation poses a significant challenge to automated fact-checking systems. Despite increasing emphasis on complex neural architectures, the empirical limits of text-only linguistic modeling …","url":["https://arxiv.org/pdf/2512.18533"]} {"year":"2025","title":"Generate-Distill: Training Cross-Language IR Models with Synthetically-Generated Data","authors":["D Lawrie, E Kayi, E Yang, J Mayfield, DW Oard, S Miller - Proceedings of the 48th …, 2025"],"snippet":"Most pretrained language models that support neural information retrieval are fine-tuned on the MS MARCO dataset. MS MARCO is expressed in English, so it naturally supports monolingual English retrieval. However, for Cross-Language Information …","url":["https://dl.acm.org/doi/pdf/10.1145/3726302.3730201"]} {"year":"2025","title":"Generating language assessment content free from representational harms","authors":["I Choi, J Zu - Language Testing, 2025"],"snippet":"Today’s language models can produce syntactically accurate and semantically coherent texts. This capability presents new opportunities for generating content for language assessments, which have traditionally required intensive expert resources …","url":["https://journals.sagepub.com/doi/abs/10.1177/02655322251349560"]} {"year":"2025","title":"Generating targeted and tailored health communication narratives with AI","authors":["H Chu, S Liu - Risk Analysis, 2025"],"snippet":"Customized narratives are effective tools to promote risk prevention behaviors in populations. However, the development of such narratives is resource‐intensive. 
Advances in generative artificial intelligence (AI) offer promising solutions to these …","url":["https://onlinelibrary.wiley.com/doi/abs/10.1111/risa.70076"]} @@ -10105,16 +10215,19 @@ {"year":"2025","title":"Generative AI Chatbots in Higher Education: ATAM-Based Analysis of Discipline-Specific Adoption Patterns for Students at the University of Borås","authors":["E Hagsér, T Rademacher - 2025"],"snippet":"… For example, LLMs are trained on text from sources like Wikipedia and the Common Crawl (a collection of web pages). They predict the probability of words appearing in a particular context, generating text by selecting words based on these …","url":["https://www.diva-portal.org/smash/get/diva2:1990501/FULLTEXT01.pdf"]} {"year":"2025","title":"Generative AI Decision-Making Attributes in Complex Health Services: A Rapid Review","authors":["D Nandini, H Louise - Cureus, 2025"],"snippet":"… -3, the third generation of the Generative Pre-trained Transformer, can retrieve information from a large corpus of books, articles, websites, and many other sources of text data; its primary source being the repository of documents and web pages …","url":["https://search.proquest.com/openview/a5e8317b7b016a1f11b8ff5f87a58f87/1?pq-origsite=gscholar&cbl=2045583"]} {"year":"2025","title":"Generative AI for Industry Transformation: A Systematic Review of ChatGPT's Capabilities and Integration Challenges","authors":["S Salih, O Husain, EAM Abdalla, AO Ibrahim… - International Journal of …, 2025"],"snippet":"The rapid advancement of Generative Artificial Intelligence (GAI), particularly OpenAI's ChatGPT, has significantly transformed various industries by enhancing efficiency, reducing operational costs, and fostering innovation. 
This systematic …","url":["https://koreascience.kr/article/JAKO202516439602807.pdf"]} +{"year":"2025","title":"Generative AI for Sequence Learning in Cybersecurity","authors":["DN Nguyen, L Vu, QU Nguyen, DT Hoang - Generative AI for Cybersecurity, 2026"],"snippet":"… They conducted the experiments on the PhishTank and Common Crawl repositories, and the results demonstrate significant performance improvements after using the proposed solution. Robic-Butez and Win [24] proposed a novel …","url":["https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.1201/9781003502531-6&type=chapterpdf"]} {"year":"2025","title":"Generative AI in Academic Writing: A Comparison of DeepSeek, Qwen, ChatGPT, Gemini, Llama, Mistral, and Gemma","authors":["Ö Aydın, E Karaarslan, FS Erenay, NB Džakula"],"snippet":"… The team developed a FastText-based classifier to filter mathematical content at scale, starting with a robust seed dataset comprising OpenWebMath as positive examples and Common Crawl as negatives. This approach enabled the extraction …","url":["https://www.researchgate.net/profile/Oemer-Aydin-9/publication/388681921_Generative_AI_in_Academic_Writing_A_Comparison_of_DeepSeek_Qwen_ChatGPT_Gemini_Llama_Mistral_and_Gemma/links/67a25d1152b58d39f26db428/Generative-AI-in-Academic-Writing-A-Comparison-of-DeepSeek-Qwen-ChatGPT-Gemini-Llama-Mistral-and-Gemma.pdf"]} {"year":"2025","title":"Generative AI in Focus: A Comprehensive Review of Leading Models Across Modalities","authors":["S Aishwarya, C Selvamurugan, KG Parthiban… - 2024 4th International …, 2024"],"snippet":"GenAI has revolutionized the generation of realistic and imaginative data in ways that were previously beyond the capabilities of other machine learning algorithms. 
This area is rapidly gaining traction, with extensive research currently underway to …","url":["https://ieeexplore.ieee.org/abstract/document/10867014/"]} {"year":"2025","title":"Generative AI Unleashed: A Multi-Domain Journey of Successful Implementations of Large Language Models","authors":["N Kumar, A Barthwal, S Mishra, A Jain - … : Large Language Models and Their Real …, 2025"],"snippet":"The entire work describes various fields addressed by generative artificial intelligence and provides a cross-disciplinary approach not limited to a particular discipline. As this chapter showcases examples of using generative AI in contexts …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=QPl1EQAAQBAJ&oi=fnd&pg=PA125&dq=commoncrawl&ots=5dFxrAmvvR&sig=5mLLAVJUbUVypTPIwqBKF3xkb1g"]} {"year":"2025","title":"Generative AI","authors":["A Ibrahim"],"snippet":"After almost six years of study, earning two bachelor’s degrees and now completing this master’s, my student journey comes to an end with this thesis, for now. My time at TU Delft has been a journey of both intellectual growth and personal discovery …","url":["https://repository.tudelft.nl/file/File_f005f661-bad5-4dea-a8ec-7647ab02bd99"]} {"year":"2025","title":"Generative AI's Copyright Enigma: A Comparative Study of Fair Use and Fair Dealing","authors":["T Awad - IP Theory, 2025"],"snippet":"… Rather, LAION-5B, assisted by the CLIP (Contrastive Language-Image Pre-training) exploit images collected by Common Crawl to create a … Common Crawl itself does not engage in any reproduction or copyright infringement because they do not …","url":["https://www.repository.law.indiana.edu/cgi/viewcontent.cgi?article=1085&context=ipt"]} {"year":"2025","title":"Generative AI: Techniques, Models and Applications","authors":["R Gupta"],"snippet":"The rapid advancement of artificial intelligence (AI) has ushered in a new era of technological innovation, with generative AI standing at the forefront of this transformation. 
This book, Generative AI—Techniques, Models and Applications …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=Mp5REQAAQBAJ&oi=fnd&pg=PR9&dq=commoncrawl&ots=WAcnfH9XEg&sig=WguE7cjjih5AIbF75KqXZVznYpc"]} -{"year":"2025","title":"Generative Artificial Intelligence Data Risks and Governance Pathways","authors":["B Jiang, X Ye - Beijing Law Review, 2025"],"snippet":"Generative AI, exemplified by ChatGPT, offers societal benefits while posing challenges to data governance. Addressing data risks is vital for its healthy development. This paper examines the technical framework and pre-training data …","url":["https://www.scirp.org/journal/paperinformation?paperid=144693"]} +{"year":"2025","title":"Generative Artificial Intelligence Data Risks and Governance Pathways","authors":["B Jiang, X Ye - Beijing L. Rev., 2025","B Jiang, X Ye - Beijing Law Review, 2025"],"snippet":"Generative Al, exemplified by ChatGPT, offers societal benefits while posing challenges to data governance. Addressing data risks is vital for its healthy development. This paper examines the technical framework and pre-training data …","url":["https://heinonline.org/hol-cgi-bin/get_pdf.cgi?handle=hein.journals/beijlar16§ion=83","https://www.scirp.org/journal/paperinformation?paperid=144693"]} {"year":"2025","title":"Generative Artificial Intelligence in Academic Surgery: Ethical Implications and Transformative Potential","authors":["JR Robinson, A Stey, DF Schneider, AN Kothari… - Journal of Surgical …, 2025"],"snippet":"Artificial intelligence (AI) is rapidly being used in medicine due to its advanced capabilities in image and video recognition, clinical decision support, surgical education, and administrative task automation. 
Large language models such as …","url":["https://www.sciencedirect.com/science/article/pii/S0022480425000216"]} {"year":"2025","title":"Generative artificial intelligence interventions to English language","authors":["N Glenn, G Hardaker, A Sabki - The International Journal of Information and Learning …, 2025"],"snippet":"Purpose The paper explores the alignment of artificial intelligence (AI) with what seems to be a belief that language is a natural growth. We reflect back and look forward on George Orwell and CS Lewis, based on commonalities and differences …","url":["https://www.emerald.com/ijilt/article/doi/10.1108/IJILT-06-2024-0117/1304885"]} {"year":"2025","title":"Generative Artificial Intelligence","authors":["D Carnahan - Digital Health, AI and Generative AI in Healthcare: A …, 2025"],"snippet":"… And there are variants of this data—Google, for one example, manages a cleaned version of the Common Crawl that they call the ‘Colossal Clean Crawled Corpus’ (C4). Another large repository of data is known as The Pile [8]. This data source is a large …","url":["https://link.springer.com/chapter/10.1007/978-3-031-83526-1_5"]} +{"year":"2025","title":"Generative models","authors":["NLP Early"],"snippet":"… dataset, derived from Common Crawl, retained only 3.2 TB of compressed text starting from the February 2019 monthly Common Crawl snapshot. Google’… To train GPT-3, OpenAI filtered 570 GB of text from an initial 45 TB of Common Crawl …","url":["https://mlbenchmarks.org/10-generative-models.html"]} +{"year":"2025","title":"Generic visuality of war? How image-generative AI models (mis) represent Russia's war against Ukraine","authors":["M Makhortykh, M Bareikytė - arXiv preprint arXiv:2512.06570, 2025"],"snippet":"The rise of generative AI (genAI) can transform the representation of different aspects of social reality, including modern wars. 
While scholarship has largely focused on the military applications of AI, the growing adoption of genAI …","url":["https://arxiv.org/pdf/2512.06570"]} {"year":"2025","title":"Geosocial media's perspective on energy: a text classification approach using natural language processing","authors":["J Verdoodt, K Milleville, H Huang, C Vandeviver… - Journal of Location Based …, 2025"],"snippet":"This study examines public opinion on various energy sources through Twitter data, focusing on fossil fuels, nuclear energy, and renewable energy sources like solar and wind. Utilizing natural language processing techniques, specifically BERTweet …","url":["https://www.tandfonline.com/doi/abs/10.1080/17489725.2025.2501632"]} {"year":"2025","title":"Geospatiality: the effect of topics on the presence of geolocation in English text data","authors":["J Mast, R Lemoine-Rodríguez, V Rittlinger… - International Journal of …, 2025"],"snippet":"Geolocated text data are a promising data source for spatial analyses in many fields, from disease surveillance to the spatial humanities. This study investigates the relationship between texts’ thematic categories and their likelihood of containing …","url":["https://www.tandfonline.com/doi/full/10.1080/13658816.2025.2460051"]} {"year":"2025","title":"GeRe: Towards Efficient Anti-Forgetting in Continual Learning of LLM via General Samples Replay","authors":["Y Zhang, S Jiang, M Zhao, Y Li, Y Fan, X Wu, Q Chen - arXiv preprint arXiv …, 2025"],"snippet":"The continual learning capability of large language models (LLMs) is crucial for advancing artificial general intelligence. 
However, continual fine-tuning LLMs across various domains often suffers from catastrophic forgetting, characterized by: 1) …","url":["https://arxiv.org/pdf/2508.04676"]} @@ -10133,6 +10246,7 @@ {"year":"2025","title":"GPT and Prejudice: A Sparse Approach to Understanding Learned Representations in Large Language Models","authors":["M Mahran, K Simbeck - arXiv preprint arXiv:2510.01252, 2025"],"snippet":"As large language models (LLMs) are increasingly trained on massive, uncurated corpora, understanding both model representations and the data they internalize has become a major challenge. In this work, we show that pairing LLMs with sparse …","url":["https://arxiv.org/pdf/2510.01252"]} {"year":"2025","title":"GPT-3-Based AI Cover Letter Generator: A Feasibility Study & Implementation","authors":["TA Bloch, I Inusa - Vinit Kumar Gunjan"],"snippet":"Natural Language Processing (NLP) is the emerging field research studies of the interaction between human and computing systems. With advancement of NLP techniques, machines are becoming increasingly proficient in understanding …","url":["https://link.springer.com/content/pdf/10.1007/978-981-97-8861-3.pdf#page=23"]} {"year":"2025","title":"GPU Implementation of the Wavelet Tree","authors":["M Franzreb, M Burtscher, S Rudolph - arXiv preprint arXiv:2505.03372, 2025"],"snippet":"I present a new GPU implementation of the wavelet tree data structure. 
It includes binary rank and select support structures that provide at least 10 times higher throughput of binary rank and select queries than the best publicly available CPU …","url":["https://arxiv.org/pdf/2505.03372"]} +{"year":"2025","title":"GRAD LIS 9704","authors":["A Mayhew"],"snippet":"… This should not be necessarily surprising as we do know that most LLMs are trained using the CommonCrawl dataset, a collection of twelve years’ worth of publicly available Internet pages, where sites like AO3 with over 11 million freely …","url":["https://uwo.scholaris.ca/server/api/core/bitstreams/33670117-8972-43d5-9bba-fcd2069acf35/content"]} {"year":"2025","title":"Gradient Weight-normalized Low-rank Projection for Efficient LLM Training","authors":["E Kanoulas, JIAH HUANG, Y Shen, H Zhu, S Rudinac - Greeks in AI Symposium 2025","JH Huang, Y Shen, H Zhu, S Rudinac, E Kanoulas - arXiv preprint arXiv:2412.19616, 2024"],"snippet":"… 2020b), a cleaned version of Common Crawl’s web corpus, with perplexity as the performance metric. In our fine-tuning experiments, we use BERTbase (Devlin et al. 2018), RoBERTabase, RoBERTalarge (Liu et al. 2019a), and BARTbase (Lewis et al …","url":["https://arxiv.org/pdf/2412.19616","https://openreview.net/pdf?id=5ACIZQ1Oz3"]} {"year":"2025","title":"Gradient-Attention Guided Dual-Masking Synergetic Framework for Robust Text-based Person Retrieval","authors":["T Zheng, Y Zhang, X An, Z Feng, K Yang, Q Ding - arXiv preprint arXiv:2509.09118, 2025"],"snippet":"… 2022), a large-scale dataset that contains 747M image-text pairs collected from CommonCrawl, as our web-crawled images source. To filter high-quality person-centric images, we initially deploy the YOLOv11 model (Jocher and Qiu, 2024) to detect …","url":["https://arxiv.org/pdf/2509.09118"]} {"year":"2025","title":"Grammar or Crammer? 
The Role of Morphology in Distinguishing Orthographically Similar but Semantically Unrelated Words","authors":["G Ercan, OT Yildiz - IEEE Access, 2025"],"snippet":"We show that n-gram-based distributional models fail to distinguish unrelated words due to the noise in semantic spaces. This issue remains hidden in conventional benchmarks but becomes more pronounced when orthographic similarity is high. To …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/10947740.pdf"]} @@ -10140,24 +10254,29 @@ {"year":"2025","title":"GRAPE: Optimize Data Mixture for Group Robust Multi-target Adaptive Pretraining","authors":["S Fan, MI Glarou, M Jaggi - arXiv preprint arXiv:2505.20380, 2025"],"snippet":"The performance of large language models (LLMs) across diverse downstream applications is fundamentally governed by the quality and composition of their pretraining corpora. Existing domain reweighting algorithms primarily optimize data …","url":["https://arxiv.org/pdf/2505.20380"]} {"year":"2025","title":"Grid based hybrid search for spatio-textual data","authors":["I Sasati - 2025"],"snippet":"In this thesis, we present a new approach to the approximate similarity search problem over spatio-textual data, where queries involve both geographic locations and semantically rich text. Unlike traditional approaches that rely on exact keyword …","url":["https://dione.lib.unipi.gr/xmlui/bitstream/handle/unipi/17962/Sasati_me2327.pdf?sequence=1&isAllowed=y"]} {"year":"2025","title":"GRIP: A Graph-Based Reasoning Instruction Producer","authors":["J Wang, J Xu, X Wang, Y Wang, M Xing, S Fang, H Xie - The Thirty-ninth Annual Conference …"],"snippet":"… For example, compared to MAmmoTH2, which filters 10M samples from the massive Common Crawl corpus, our method achieves a 5.1-point higher score (57.3% vs 52.2%) using only 7.5K seed examples. 
Similarly, our method outperforms …","url":["https://openreview.net/pdf?id=36TBVGwzAY"]} +{"year":"2025","title":"Grounded Multilingual Medical Reasoning for Question Answering with Large Language Models","authors":["P Ferrazzi, A Soroa, R Agerri - arXiv preprint arXiv:2512.05658, 2025"],"snippet":"Large Language Models (LLMs) with reasoning capabilities have recently demonstrated strong potential in medical Question Answering (QA). Existing approaches are largely English-focused and primarily rely on distillation from …","url":["https://arxiv.org/pdf/2512.05658"]} {"year":"2025","title":"Group then Scale: Dynamic Mixture-of-Experts Multilingual Language Model","authors":["C Li, Y Deng, J Zhang, C Zong - arXiv preprint arXiv:2506.12388, 2025"],"snippet":"The curse of multilinguality phenomenon is a fundamental problem of multilingual Large Language Models (LLMs), where the competition between massive languages results in inferior performance. It mainly comes from limited capacity and …","url":["https://arxiv.org/pdf/2506.12388"]} {"year":"2025","title":"Grouped Differential Attention","authors":["J Lim, S Lee, D Kim, WT Cheung, B Kim, T Kim, H Lee… - arXiv preprint arXiv …, 2025"],"snippet":"… Our corpus consists of large-scale web data from Common Crawl [1], mathematical content from FineMath [2], and additional open-source reasoning datasets. Optimization was performed using AdamW [13] with a learning rate of 5 × …","url":["https://arxiv.org/pdf/2510.06949"]} {"year":"2025","title":"Guardrails for safe implementations of AI-based services","authors":["DC Verma, R Ratnaparkhi - Assurance and Security for AI-enabled Systems 2025, 2025"],"snippet":"… In some cases, an enterprise may also want to use publicly available data on the Internet, eg common crawl data and its derivatives.The … [14] Foundation, CC, “Common crawl data,” (2023). Accessed: 2025-02-18. 
[15] Gutiérrez-Fandino, A., Pérez-Fernández …","url":["https://www.spiedigitallibrary.org/conference-proceedings-of-spie/13476/134760I/Guardrails-for-safe-implementations-of-AI-based-services/10.1117/12.3051891.short"]} {"year":"2025","title":"GUICourse: From General Vision Language Model to Versatile GUI Agent","authors":["W Chen, J Cui, J Hu, Y Qin, J Fang, Y Zhao, C Wang… - Proceedings of the 63rd …, 2025"],"snippet":"… We collected 4M URLs from the Cleaned Common Crawl Corpus (Raffel et al.… However, some screenshots in our GUIEnv dataset are collected from the Cleaned Common Crawl Corpus, so we cannot guarantee that these website screenshots are …","url":["https://aclanthology.org/2025.acl-long.1065.pdf"]} {"year":"2025","title":"GUIDE: Guided Initialization and Distillation of Embeddings","authors":["K Trinh, G Menghani, E Vee - arXiv preprint arXiv:2510.06502, 2025"],"snippet":"Algorithmic efficiency techniques such as distillation (\\cite{hinton2015distillation}) are useful in improving model quality without increasing serving costs, provided a larger teacher model is available for a smaller student model to learn from during …","url":["https://arxiv.org/pdf/2510.06502"]} {"year":"2025","title":"Guided by Style: Fine-Grained Modulation in Multi-Style Artistic Transfer","authors":["C Zhang, C Ba"],"snippet":"We propose a novel diffusion-based framework for artistic multi-style transfer that uniquely combines compositional denoising and classifier-free guidance (CFG) to enable fine-grained control over both content preservation and stylistic blending …","url":["https://cs231n.stanford.edu/papers/text_file_840587412-CS231N___Final_Project_Report.pdf"]} +{"year":"2025","title":"Hacking Neural Evaluation Metrics with Single Hub Text","authors":["H Deguchi, K Chousa, Y Sakai - arXiv preprint arXiv:2512.16323, 2025"],"snippet":"Strongly human-correlated evaluation metrics serve as an essential compass for the development and improvement of generation models and 
must be highly reliable and robust. Recent embedding-based neural text evaluation metrics, such as …","url":["https://arxiv.org/pdf/2512.16323"]} {"year":"2025","title":"HAGEN//ANALYTICS","authors":["H There, I Alina"],"snippet":"AI refers to a computer’s ability to emulate human intelligence and thought. When people refer to AI, they are often referring to the concept of Generative AI (GenAI), which refers to a computer’s ability to create new content out of synthesized data …","url":["https://hagenanalytics.com/2025/03/02/gea1-generative-ai-in-educational-spaces/"]} {"year":"2025","title":"Hajj-FQA: A benchmark Arabic dataset for developing question-answering systems on Hajj fatwas: H. Aleid and A. Azmi","authors":["HA Aleid, AM Azmi - Journal of King Saud University Computer and …, 2025"],"snippet":"Deep learning has significantly advanced the question-answering (QA) systems across various sectors. However, Arabic-language systems for Hajj-related fatwas (non-binding Islamic legal opinions issued by muftis) remain underdeveloped. This paper …","url":["https://link.springer.com/article/10.1007/s44443-025-00128-w"]} {"year":"2025","title":"Hardwired-Neurons Language Processing Units as General-Purpose Cognitive Substrates","authors":["Y Liu, Y Chen, Y Zhao, Y Hao, Z Zheng, W Kong, Z Li… - arXiv preprint arXiv …, 2025"],"snippet":"… For instance, Common Crawl [12] has amassed an 8 PB text corpus from web pages, growing consistently by approximately 250 TB per month. This unparalleled scale of pre-training data is instrumental in enabling the zero-shot generalization …","url":["https://arxiv.org/pdf/2508.16151"]} {"year":"2025","title":"Harnessing Deep Learning and Generative AI for Molecular Docking Simulations: Tools, Challenges, and Future Directions","authors":["B Shaker, K Barakat - 2025"],"snippet":"Molecular docking has become a cornerstone in modern drug discovery, helping scientists predict how small molecules, or ligands, interact with target proteins. 
With the rise of artificial intelligence, particularly deep learning and generative models …","url":["https://www.intechopen.com/online-first/1232081"]} {"year":"2025","title":"Harnessing Large Language Models and Deep Neural Networks for Fake News Detection","authors":["E Papageorgiou, I Varlamis, C Chronis - Information, 2025"],"snippet":"… It was trained on the Colossal Clean Crawled Corpus (C4) dataset, a 750 GB dataset created from Common Crawl’s web-extracted text. The model architecture is similar to the original … It was trained on the RealNews dataset, created from …","url":["https://www.mdpi.com/2078-2489/16/4/297"]} +{"year":"2025","title":"Harnessing LLMs for Arabic Grammar: Enhanced Plural Classification with Mini BERT","authors":["ZA Elashaal, AM Goweder - 2025"],"snippet":"This study presents a lightweight and effective approach for Arabic plural classification using the pre-trained asafaya/bert-mini-arabic transformer model. The proposed method leverages transfer learning and efficient preprocessing …","url":["https://www.researchgate.net/profile/Zahra-Elashaal/publication/398421555_Harnessing_LLMs_for_Arabic_Grammar_Enhanced_Plural_Classification_with_Mini_BERT/links/693553c5a1fd01798905f368/Harnessing-LLMs-for-Arabic-Grammar-Enhanced-Plural-Classification-with-Mini-BERT.pdf"]} {"year":"2025","title":"Harvard Data Science Review ⢠Special Issue 5: Grappling With the Generative AI Revolution","authors":["QV Liao, JW Vaughan"],"snippet":"The rise of powerful large language models (LLMs) brings about tremendous opportunities for innovation but also looming risks for individuals and society at large. We have reached a pivotal moment for ensuring that LLMs and LLM-infused …","url":["https://assets.pubpub.org/7o0l1csl/8036d03b-47f2-4be4-b5e3-daae9d0ef1d1.html"]} {"year":"2025","title":"Has My Code Been Stolen for Model Training? 
A Naturalness Based Approach to Code Contamination Detection","authors":["HA Khan, Y Jiang, Q Umer, Y Zhang, W Akram, H Liu - Proceedings of the ACM on …, 2025"],"snippet":"It is often valuable to know whether a given piece of source code has or hasn’t been used to train a given deep learning model. On one side, it helps avoid data contamination problems that may exaggerate the performance of evaluated models …","url":["https://dl.acm.org/doi/pdf/10.1145/3715765"]} {"year":"2025","title":"Hash-Based Bernoulli Constructions: Space-Optimal Probabilistic Data Structures","authors":["A Towell"],"snippet":"We present a universal construction for space-optimal probabilistic data structures based on hash functions and Bernoulli types. Our framework unifies Bloom filters, Count-Min sketches, HyperLogLog, and other probabilistic structures under a …","url":["https://metafunctor.com/latex/paper3-hash-constructions/paper3_hash_constructions.pdf"]} {"year":"2025","title":"HASTIKA: hate speech and target identification in Kannada-English code-mixed text","authors":["S Kavatagi, R Rachh - Language Resources and Evaluation, 2025"],"snippet":"In the modern era, the widespread use of social media has facilitated connections among millions of people worldwide. However, these platforms have also been exploited for spreading hate speech, particularly in multilingual contexts. The …","url":["https://link.springer.com/article/10.1007/s10579-025-09836-1"]} {"year":"2025","title":"Hate Speech Detection in Code-Mixed Datasets Using Pretrained Embeddings and Transformers","authors":["T Sohail, A Aiman, E Hashmi, AS Imran, SM Daudpota… - … International Conference on …, 2024"],"snippet":"… The model utilizes FastText’s unsupervised learning method, trained on data from Common Crawl and Wikipedia, to embed words into 300-dimensional vectors. 
By integrating character n-grams, it enhances its grasp of word morphology and …","url":["https://ieeexplore.ieee.org/abstract/document/10838452/"]} {"year":"2025","title":"Hate Speech Detection in Code-Mixed English-Hindi with Bilingual Large Language Models","authors":["M Robnik-Šikonja - … Advancements in Artificial Intelligence: Proceedings of …, 2025"],"snippet":"… This model is trained on 2.5 TB of Common Crawl text across 100 languages, making it one of the most powerful multilingual language … It was trained from scratch using diverse datasets, including Wikipedia, Common Crawl, PMINDIA, and …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=tyCOEQAAQBAJ&oi=fnd&pg=PA259&dq=commoncrawl&ots=zg2eJYup1o&sig=mJSrSRbSO-PJll1_QRAvYGEKCfU"]} +{"year":"2025","title":"Hate Speech Detection of Manglish (Malay+ English) in X (Twitter) Using XLM-RoBERTa and XLNet","authors":["FB Azmi, N Mamat, NAA Bakar, SM Hussin - Open International Journal of Informatics, 2025"],"snippet":"This study explores hate speech detection in Manglish, a code-mixed language of Malay and English widely used among Malaysian social media users. The main objective is to develop and evaluate deep learning-based models capable of …","url":["https://oiji.utm.my/index.php/oiji/article/download/347/252"]} {"year":"2025","title":"Health Sentinel: An AI Pipeline For Real-time Disease Outbreak Detection","authors":["D Pant, RR Grandhe, V Samaria, M Paul, S Kumar… - arXiv preprint arXiv …, 2025"],"snippet":"… We identified regional news websites that are often overlooked by platforms like Common Crawl or Google News. To address this, we developed a custom crawler that manually collects articles from these sources, improving regional representation. 3. 
…","url":["https://arxiv.org/pdf/2506.19548"]} +{"year":"2025","title":"HealthContradict: Evaluating Biomedical Knowledge Conflicts in Language Models","authors":["B Zhang, A Bornet, R Yang, N Liu, D Teodoro - arXiv preprint arXiv:2512.02299, 2025"],"snippet":"How do language models use contextual information to answer health questions? How are their responses impacted by conflicting contexts? We assess the ability of language models to reason over long, conflicting biomedical contexts using …","url":["https://arxiv.org/pdf/2512.02299"]} {"year":"2025","title":"HelixTrain: Enhancing Long-Context LLM Training via 3D Dynamic Parallelism","authors":["S Wang, Y Liu, Z Jiawang - Tsinghua University Course: Advanced Machine …"],"snippet":"Data Parallelism. Data Parallelism (DP) assigns training input of sequences to different devices in a DP group, where the model states (parameters, gradients, and optimizer states) are duplicated and gradients need to be reduced to ensure …","url":["https://openreview.net/pdf?id=61k2uuNyS7"]} {"year":"2025","title":"Hephaestus: Improving Fundamental Agent Capabilities of Large Language Models through Continual Pre-Training","authors":["Y Zhuang, J Yang, H Jiang, X Liu, K Cheng… - arXiv preprint arXiv …, 2025"],"snippet":"Due to the scarcity of agent-oriented pre-training data, LLM-based autonomous agents typically rely on complex prompting or extensive fine-tuning, which often fails to introduce new capabilities while preserving strong generalizability. We introduce …","url":["https://arxiv.org/pdf/2502.06589"]} {"year":"2025","title":"Hermes: Algorithm-System Co-design for Efficient Retrieval-Augmented Generation At Scale","authors":["M Shen, M Umar, K Maeng, GE Suh, U Gupta - 2025"],"snippet":"… less than 10B tokens we use a subset of Common Crawl [36]. 
We generate a synthetic set of … indices from the 10B token subset of Common Crawl, ranging from 5GB to 11GB each and … of indices built using the 10B token Common Crawl …","url":["https://michaeltshen.github.io/Files/Hermes.pdf"]} @@ -10168,6 +10287,7 @@ {"year":"2025","title":"High-Fidelity Simultaneous Speech-To-Speech Translation","authors":["T Labiausse, L Mazaré, E Grave, P Pérez, A Défossez… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce Hibiki, a decoder-only model for simultaneous speech translation. Hibiki leverages a multistream language model to synchronously process source and target speech, and jointly produces text and audio tokens to perform speech-to-text …","url":["https://arxiv.org/pdf/2502.03382"]} {"year":"2025","title":"HiligayNER: A Baseline Named Entity Recognition Model for Hiligaynon","authors":["JA Teves, RD Cal, JM Villaluz, J Malolos, M Magtira… - arXiv preprint arXiv …, 2025"],"snippet":"The language of Hiligaynon, spoken predominantly by the people of Panay Island, Negros Occidental, and Soccsksargen in the Philippines, remains underrepresented in language processing research due to the absence of annotated corpora and …","url":["https://arxiv.org/pdf/2510.10776"]} {"year":"2025","title":"History of Large Language Models","authors":["F De Luzi - Engineering Information Systems with Large Language …, 2025"],"snippet":"This chapter explores the historical development of artificial intelligence (AI) and natural language processing (NLP), focusing on the evolution of language modeling. 
We begin by outlining the foundations of AI, from symbolic approaches to the …","url":["https://link.springer.com/chapter/10.1007/978-3-031-92285-5_2"]} +{"year":"2025","title":"HmBlogs: A Comprehensive Corpus and Benchmarking Study for Persian Word Embedding and Language Modeling","authors":["HM Khansari, M Shamsfard, M Masumi, SS Majd - SN Computer Science, 2026"],"snippet":"… The main part of PRT is obtained from the Persian section of the Common Crawl project [8]. Common Crawl is a web crawl project that crawls and collects resources available on the web in any language, including some in Persian. PRT is bigger …","url":["https://link.springer.com/article/10.1007/s42979-025-04612-y"]} {"year":"2025","title":"Homophily-induced Emergence of Biased Structures in LLM-based Multi-Agent AI Systems","authors":["A Mehdizadeh, M Hilbert - arXiv preprint arXiv:2510.02637, 2025"],"snippet":"This study examines how interactions among artificially intelligent (AI) agents, guided by large language models (LLMs), drive the evolution of collective network structures. We ask LLM-driven agents to grow a network by informing them about …","url":["https://arxiv.org/pdf/2510.02637"]} {"year":"2025","title":"Homophonic Pun Generation in Code Mixed Hindi English","authors":["YR Sarrof - Proceedings of the 1st Workshop on Computational …, 2025"],"snippet":"In this study, we investigate Hinglish—a blend of Hindi and English commonly found in informal online communication—with a particular focus on automated pun generation. Our work examines the applicability and adaptability of existing English …","url":["https://aclanthology.org/2025.chum-1.4.pdf"]} {"year":"2025","title":"Horizon-scale variability of from 2017--2021 EHT observations","authors":["K Akiyama, E Albentosa-Ruíz, A Alberdi, W Alef… - Astronomy & Astrophysics"],"snippet":"We report three epochs of polarized images of at 230,GHz using data from the Event Horizon Telescope (EHT) taken in 2017, 2018, and 2021. 
The baseline coverage of the 2021 observations is significantly improved through the addition of two new EHT …","url":["https://www.aanda.org/articles/aa/pdf/forth/aa55855-25.pdf"]} @@ -10175,8 +10295,10 @@ {"year":"2025","title":"HOW CONVERSATIONAL SYSTEMS ARE BUILT USING LANGUAGE MODELS","authors":["DB Hydyrova, D Jumayeva - ОБРАЗОВАНИЕ И НАУКА В XXI ВЕКЕ, 2025"],"snippet":"Conversational systems, also known as chatbots or virtual assistants, have evolved significantly with the advancement of large language models (LLMs). These systems rely on natural language processing (NLP), deep learning, and reinforcement …","url":["https://mpcareer-google.ru/index.php/journal/article/download/1168/1135"]} {"year":"2025","title":"How do data owners say no? A case study of data consent mechanisms in web-scraped vision-language AI training datasets","authors":["CP Lee, R Hong, HH Jiang, A Plotnik, W Agnew… - NeurIPS 2025 Workshop on …"],"snippet":"… .txt to disallow CommonCrawl crawling (via CCBot), … CommonCrawl, which respects robots.txt 281 when sourcing the web pages, we still observe CCBot in 353K robots.txt. The most likely reason 282 is that the user adopts robots.txt to …","url":["https://openreview.net/pdf?id=auGKbfDyTO"]} {"year":"2025","title":"How Good is BLI as an Alignment Measure: A Study in Word Embedding Paradigm","authors":["K Wickramasinghe, N de Silva - International Conference on Computational …, 2025"],"snippet":"… This result improvement has been shown using both Wikipedia (wiki) and Common-Crawl (cc) FastText models. 
These results show that the standard BLI task can output pessimistic results in inflected language cases and the proposed stem-based …","url":["https://link.springer.com/chapter/10.1007/978-3-032-10202-7_26"]} +{"year":"2025","title":"How Learning Rate Decay Wastes Your Best Data in Curriculum-Based LLM Pretraining","authors":["K Luo, Z Sun, H Wen, X Shi, J Cui, C Dang, K Lyu… - arXiv preprint arXiv …, 2025"],"snippet":"Due to the scarcity of high-quality data, large language models (LLMs) are often trained on mixtures of data with varying quality levels, even after sophisticated data curation. A natural approach to better leverage high-quality data is curriculum-based …","url":["https://arxiv.org/pdf/2511.18903"]} {"year":"2025","title":"How Long Do Financial Markets Need to Fully Respond to FOMC Announcements?","authors":["PL Tran"]} {"year":"2025","title":"How Much Do LLMs Hallucinate across Languages? On Multilingual Estimation of LLM Hallucination in the Wild","authors":["A Lauscher, G Glavaš - arXiv preprint arXiv:2502.12769, 2025"],"snippet":"… Across all 30 languages, however, we find no correlation between the hallucination rates and measures of language “resourceness”: (i) proportion of language-specific data in Common Crawl and (ii) number of articles in the language-specific …","url":["https://arxiv.org/pdf/2502.12769"]} +{"year":"2025","title":"How Prevalent Is Gender Bias in ChatGPT?-Exploring German and English ChatGPT Responses","authors":["C Heumann, S Thiemichen - Detecting Gender Discrimination in Natural Language …"],"snippet":"… 1 For more information, see https://commoncrawl. org/. 
2 We use the term minorised groups according to the definition of D’Ignazio and Klein [4]:“While the term minority describes a social group that is comprised of fewer people, minoritized …","url":["https://edoc.ub.uni-muenchen.de/36297/1/Urchs_Stefanie.pdf#page=70"]} {"year":"2025","title":"How Sampling Affects the Detectability of Machine-written texts: A Comprehensive Study","authors":["M Dubois, F Yvon, P Piantanida - arXiv preprint arXiv:2510.13681, 2025"],"snippet":"As texts generated by Large Language Models (LLMs) are ever more common and often indistinguishable from human-written content, research on automatic text detection has attracted growing attention. Many recent detectors report near-perfect …","url":["https://arxiv.org/pdf/2510.13681"]} {"year":"2025","title":"How to Compare Things Properly? A Study of Argument Relevance in Comparative Question Answering","authors":["I Nikishina, S Anwar, N Dolgov, M Manina, D Ignatenko…"],"snippet":"… using the Comparative Argumentative Machine (CAM 2.0), which retrieves relevant content from CommonCrawl (Schildwächter et al.… 2019), which involves retrieving relevant sentences from the CommonCrawl corpus, sentence classification …","url":["https://www.inf.uni-hamburg.de/en/inst/ab/lt/publications/2025-nikishinaetal-acl-cqa.pdf"]} {"year":"2025","title":"How to Tune a Multilingual Encoder Model for Germanic Languages: A Study of PEFT, Full Fine-Tuning, and Language Adapters","authors":["R Oji, J Kunz - arXiv preprint arXiv:2501.06025, 2025"],"snippet":"This paper investigates the optimal use of the multilingual encoder model mDeBERTa for tasks in three Germanic languages -- German, Swedish, and Icelandic -- representing varying levels of presence and likely data quality in …","url":["https://arxiv.org/pdf/2501.06025"]} @@ -10194,6 +10316,7 @@ {"year":"2025","title":"Humanitarian classification of crisis-related microblogs in Bengali: A comparison of multilingual pre-trained language models","authors":["K Das, D 
Datta, M Basu, S Ghosh - International Journal of Disaster Risk Reduction, 2025"],"snippet":"During a crisis or disaster event, humanitarian organizations need various types of situational information that is essential for planning relief efforts. Social media platforms like X (erstwhile Twitter) have proven to be effective platforms for …","url":["https://www.sciencedirect.com/science/article/pii/S2212420925004972"]} {"year":"2025","title":"Hybrid AI for Large-Scale Foundation Models","authors":["V Bengani"],"snippet":"… o Common Crawl: A massive corpus of web data that serves as a source for training transformer-based models like GPT-4, which requires large-scale text data for pretraining on diverse content. o Wikipedia: Used to fine-tune LLMs for domain-specific …","url":["https://www.researchgate.net/profile/Vedika-Bengani/publication/390760730_Hybrid_AI_for_Large-Scale_Foundation_Models/links/67fd16d5d1054b0207d35ed5/Hybrid-AI-for-Large-Scale-Foundation-Models.pdf"]} {"year":"2025","title":"Hybrid natural language processing tool for semantic annotation of medical texts in Spanish","authors":["L Campillos-Llanos, A Valverde-Mateos… - BMC Bioinformatics, 2025"],"snippet":"… CLIN-X-ES is derived from the XML RoBERTA multilingual model (originally pre-trained on 2.5 terabytes of the CommonCrawl corpus for 100 languages), by continuous pre-training on a corpus of medical texts from SciELO, MedlinePlus, EMEA or PubMed. This …","url":["https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-024-05949-6"]} +{"year":"2025","title":"Hydraulis: Balancing Large Transformer Model Training via Co-designing Parallel Strategies and Data Assignment","authors":["H Li, F Fu, S Lin, H Ge, X Wang, J Niu, J Xue, Y Tao… - Proceedings of the ACM on …, 2025"],"snippet":"… Using the LLaMA 7B model and the CommonCrawl dataset, we start with a default setup of 16 GPUs, 32K maximum sequence length, and 200K tokens per iteration. 
By systematically altering one factor while maintaining others, we gain …","url":["https://dl.acm.org/doi/abs/10.1145/3769802"]} {"year":"2025","title":"HYLR-FO: Hybrid Approach Using Language Models and Rule-Based Systems for On-Device Food Ordering","authors":["S Yang, D Kim, S Lee - Electronics, 2025"],"snippet":"… Its multilingual counterpart, mT5, is trained on a dataset derived from Common Crawl, covering 101 languages [30]. MASS (masked sequence-to-sequence pre-training) employs an encoder–decoder framework to reconstruct missing segments within a …","url":["https://www.mdpi.com/2079-9292/14/4/775"]} {"year":"2025","title":"Ibom NLP: A Step Toward Inclusive Natural Language Processing for Nigeria's Minority Languages","authors":["O Kalejaiye, LH Beyene, DI Adelani, MMG Edet… - arXiv preprint arXiv …, 2025"],"snippet":"Nigeria is the most populous country in Africa with a population of more than 200 million people. More than 500 languages are spoken in Nigeria and it is one of the most linguistically diverse countries in the world. Despite this, natural language …","url":["https://arxiv.org/pdf/2511.06531"]} {"year":"2025","title":"IDENTIFYING ARGUMENTATIVE CLAIMS IN BIOMEDICAL RESEARCH ARTICLES","authors":["GN Patil - 2025"],"snippet":"… Common Crawl is a vast dataset of web crawls that contains a variety of web pages, offering more diversity in text and gathering various online content. The combined data results in a large and diverse collection of text containing hundreds …","url":["https://ir.lib.uwo.ca/cgi/viewcontent.cgi?article=13726&context=etd"]} @@ -10201,13 +10324,17 @@ {"year":"2025","title":"Identifying Rare Languages in Common Crawl Data is a Needles-in-a-Haystack Problem","authors":["R Dent, PO Suarez, T Clérice, B Sagot - Findings of the Association for …, 2025"],"snippet":"Automatic language identification is frequentlyframed as a multi-class classification problem. 
However, when creating digital corpora forless commonly written languages, it may bemore appropriate to consider it a data min-ing problem. For …","url":["https://aclanthology.org/2025.findings-emnlp.77/"]} {"year":"2025","title":"Identifying School Shooter Threats Through Online Texts","authors":["OJ Liahagen, MJ Nilsen, B Gambäck - … in Natural Language Processing and Social …, 2025"],"snippet":"… GloVe embeddings were extracted using Wikipedia 2014 + Gigaword 53 and the Common Crawl 840B3 sets as frozen embedding layers. Two different vector dimensionalities, 50 and 300, were utilized to study their effects on prediction …","url":["https://ieeexplore.ieee.org/abstract/document/10970666/"]} {"year":"2025","title":"Identifying western ideological bias embedded in large language models through Marxist epistemology","authors":["X Jie - Медиалингвистика, 2025"],"snippet":"… Pre-training data, often from internet crawls (like Common Crawl), is a primary source; DeepSeek's large English corpus likely …","url":["https://cyberleninka.ru/article/n/identifying-western-ideological-bias-embedded-in-large-language-models-through-marxist-epistemology"]} +{"year":"2025","title":"Ideological Media Markets, Cross-Pressured Voters, and Spatial Voting: A Computational and Experimental Approach","authors":["LP Da Silva - 2025"],"snippet":"… I modify the news-please Python package to extract one million randomly-selected English-language articles from Common Crawl, a repository … First, I obtain the corpus by modifying a Python crawler to extract one million media articles from …","url":["https://www.tara.tcd.ie/bitstreams/38b66f6a-403b-4389-8f76-5acf946c993f/download"]} {"year":"2025","title":"Idiosyncrasies in Large Language Models","authors":["M Sun, Y Yin, Z Xu, JZ Kolter, Z Liu - arXiv preprint arXiv:2502.12150, 2025"],"snippet":"In this work, we unveil and study idiosyncrasies in Large Language Models (LLMs) -- unique patterns in their outputs that can be used to distinguish 
the models. To do so, we consider a simple classification task: given a particular text output, the objective …","url":["https://arxiv.org/pdf/2502.12150"]} {"year":"2025","title":"IFEvalCode: Controlled Code Generation","authors":["J Yang, W Zhang, S Liu, L Chai, Y Tan, J Liu, G Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"… Forward Constraints Generation Given the recalled code-related documents from Common Crawl, we adopt Qwen2.5-Coder-32B to create new questions by drawing inspiration from the coderelated documents for a general question. To effectively …","url":["https://arxiv.org/pdf/2507.22462"]} +{"year":"2025","title":"IfGPT: A Dataset in Bulgarian for Large Language Models","authors":["S Koeva, I Stoyanova, J Kralev"],"snippet":"… CommonCrawl creates and maintains an open web crawl dataset. Since 2008, CommonCrawl has collected petabytes of data, including raw web page data, metadata, and text extractions. CommonCrawl is typically used to retrieve subsets of …","url":["https://acl-bg.org/proceedings/2025/LowResNLP%202025/pdf/2025.lowresnlp-1.7.pdf"]} {"year":"2025","title":"Impact of Deep Learning for Multilingual Natural Language Processing in Educational Applications","authors":["P Tamilarasan, V Selvaraj, LR Buckingham - … of International Conference on Recent Trends …"],"snippet":"Over the past few years, the field of educational technology has experienced notable progress by incorporating advanced deep learning methods, namely, in the area of multilingual Natural Language Processing (NLP). 
This work examines the utilization …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=lkxLEQAAQBAJ&oi=fnd&pg=PA407&dq=commoncrawl&ots=AapRFa3NJ2&sig=Tc0tJlLe8ofFbb0SbCPDw-H-OM8"]} +{"year":"2025","title":"Impact of Ontologies and Data Structures on Retrieval Augmented Generation Systems in Manufacturing Simulation Software","authors":["M Gautam - 2025"],"snippet":"Manufacturing simulation software enables engineers to model, analyse, and optimise production systems using both structured and unstructured data. These platforms support virtual representations of manufacturing processes, equipment …","url":["https://aaltodoc.aalto.fi/bitstreams/a5ea9fbe-3a85-4c8a-9020-760794b24d98/download"]} {"year":"2025","title":"Impact of Pretraining Word Co-occurrence on Compositional Generalization in Multimodal Models","authors":["H Qu, SM Xie - arXiv preprint arXiv:2507.08000, 2025"],"snippet":"… LAION-400M [21] is a dataset of 400 million image-text pairs curated from Common Crawl by filtering out pairs with CLIP embedding cosine similarity below 0.3. LAION-400M was created to emulate the closed-source WIT-400M [1] dataset used to train the …","url":["https://arxiv.org/pdf/2507.08000"]} {"year":"2025","title":"Implementation of neural networks in Russian newsrooms","authors":["K Nigmatullina, R Kasymov, A Polyakov - Медиалингвистика, 2025"],"snippet":"… Pre-training data, often from internet crawls (like Common Crawl), is a primary source; DeepSeek's large English corpus likely …","url":["https://cyberleninka.ru/article/n/implementation-of-neural-networks-in-russian-newsrooms"]} {"year":"2025","title":"Implicit Evaluation of Health Answers from Large Language Models","authors":["J Probst"],"snippet":"… Scripts are provided to download the documents from the CommonCrawl archive, as well as the social media content directly over the … The web documents are easily accessible via the CommonCrawl archive. 
Before accessing all relevant …","url":["https://downloads.webis.de/theses/papers/probst_2024.pdf"]} {"year":"2025","title":"Implicit knowledge-augmented prompting for commonsense explanation generation","authors":["Y Ge, HT Yu, C Lei, X Liu, A Jatowt, K Kim, S Lynden… - Knowledge and Information …, 2025"],"snippet":"… OPT’s pre-training primarily involves English text, though a small amount of non-English data from CommonCrawl is present in the training corpus. It is pre-trained with a causal language modeling objective and is a decoder-only model, similar to GPT-3. …","url":["https://link.springer.com/article/10.1007/s10115-024-02326-w"]} +{"year":"2025","title":"Impression Change Prediction Method Based on Speakers' Personalities in Speed Dating Dialogue Simulations","authors":["K Matsuo, Y Ishii, A Otsuka, R Ishii, H Sugiyama… - Journal of Information …, 2025"],"snippet":"This paper focuses on simulating text dialogues in which impressions between speakers improve during speed dating. This simulation involves selecting an utterance from multiple candidates generated by a text generation model that …","url":["https://www.jstage.jst.go.jp/article/ipsjjip/33/0/33_1201/_pdf"]} {"year":"2025","title":"Improving Acoustic Recognition Models/Author Paul Primus","authors":["P Primus - 2024"],"snippet":"Sound is one of the fundamental signals through which we perceive our surroundings and consequently, humans have evolved to perform complex auditory tasks effortlessly. The field of intelligent audio processing aims to replicate these …","url":["https://epub.jku.at/obvulihs/content/titleinfo/11472142/full.pdf"]} {"year":"2025","title":"Improving complex reasoning in large language models","authors":["Y Fu - 2025"],"snippet":"This thesis studies complex reasoning in language models. 
We use the term reasoning to refer to tasks that would require a human to perform slow deliberate, step-by-step thinking (instead of providing an intuitive and instantaneous response) …","url":["https://era.ed.ac.uk/bitstream/handle/1842/43549/Fu2025.pdf?sequence=1&isAllowed=y"]} {"year":"2025","title":"Improving critical infrastructure security through hybrid embeddings for vulnerability classification","authors":["AB Yahya, H El Akhal, AEB El Alaoui - Journal of Information Security and …, 2025"],"snippet":"The growing prevalence of vulnerabilities in embedded devices poses a significant risk to critical infrastructure. While deep learning has advanced vulnerability classification, its effectiveness is often hindered by limitations in word representation …","url":["https://www.sciencedirect.com/science/article/pii/S2214212625002224"]} @@ -10218,6 +10345,7 @@ {"year":"2025","title":"Improving LLMs' Generalized Reasoning Abilities by Graph Problems","authors":["Q Zhang, N Chen, Z Li, M Peng, J Tang, J Li - arXiv preprint arXiv:2507.17168, 2025"],"snippet":"Large Language Models (LLMs) have made remarkable strides in reasoning tasks, yet their performance often falters on novel and complex problems. Domain-specific continued pretraining (CPT) methods, such as those tailored for mathematical …","url":["https://arxiv.org/pdf/2507.17168"]} {"year":"2025","title":"Improving Machine Translation Formality with Large Language Models","authors":["M Yang, F Li - Computers, Materials and Continua, 2025"],"snippet":"Preserving formal style in neural machine translation (NMT) is essential, yet often overlooked as an optimization objective of the training processes. This oversight can lead to translations that, though accurate, lack formality. 
In this paper, we propose …","url":["https://www.sciencedirect.com/org/science/article/pii/S154622182500150X"]} {"year":"2025","title":"Improving Model Representation and Reducing KV Cache via Skip Connections with First Value Heads","authors":["Z Wu, Y Zhang, Y Dong, C Zhang, C Fang, K Yuan… - arXiv preprint arXiv …, 2025"],"snippet":"… We pre-train on C4 dataset, a colossal, cleaned version of Common Crawl’s web crawl corpus [52]. We evaluate sizes from 60 M up to 3 B parameters to test scalability. More details are provided in Appendix B.2. …","url":["https://arxiv.org/pdf/2510.16807"]} +{"year":"2025","title":"IMPROVING MULTICLASS CLASSIFICATION FOR SOFTWARE ISSUE TRIAGE","authors":["H Park"],"snippet":"As software systems evolve rapidly and the volume of issue reports continues to grow, effective issue triage has become more critical than ever. Manual triage places a significant burden on IT technicians and developers, often leading to human errors …","url":["https://run.unl.pt/bitstream/10362/190717/1/TGI4409.pdf"]} {"year":"2025","title":"Improving Multilingual Retrieval-Augmented Language Models through Dialectic Reasoning Argumentations","authors":["L Ranaldi, F Ranaldi, FM Zanzotto, B Haddow, A Birch - arXiv preprint arXiv …, 2025"],"snippet":"Retrieval-augmented generation (RAG) is key to enhancing large language models (LLMs) to systematically access richer factual knowledge. Yet, using RAG brings intrinsic challenges, as LLMs must deal with potentially conflicting knowledge, especially in …","url":["https://arxiv.org/pdf/2504.04771"]} {"year":"2025","title":"Improving Romanian LLM Pretraining Data using Diversity and Quality Filtering","authors":["V Negoita, M Masala, T Rebedea - arXiv preprint arXiv:2511.01090, 2025"],"snippet":"… All three datasets stem from CommonCrawl, with different number of snapshots used and different rules for processing and filtering. 
Crucially, all datasets employ rather standard rules based on n-gram frequency, stop word ratio or text length, and …","url":["https://arxiv.org/pdf/2511.01090"]} {"year":"2025","title":"Improving story points estimation using ensemble machine learning","authors":["Z Ahmad, MMY Kuo - Software Quality Journal, 2025"],"snippet":"Agile software development (ASD) emphasizes iterative development, continuous feedback, and team collaboration, addressing the limitations of traditional methodologies. This research explores the application of machine learning (ML) to …","url":["https://link.springer.com/article/10.1007/s11219-025-09731-6"]} @@ -10226,21 +10354,26 @@ {"year":"2025","title":"IMPROVING VISION LLM PERFORMANCE ON STANDARDIZED TEST QUESTIONS","authors":["SE Sert - 2025"],"snippet":"In our research, we show that open-source vision-language models can be trained to rival proprietary systems on complex, multimodal Turkish high-school exam questions — a domain where no benchmark previously existed. This thesis introduces …","url":["https://open.metu.edu.tr/bitstream/handle/11511/116126/10755108.pdf"]} {"year":"2025","title":"In Generative AI We Trust: Measuring the Potential for Deception in LLM-Generated Health Information Using Computational Content Analysis","authors":["M Cardona - 2025"],"snippet":"Misleading health information remains a central concern in medical sociology and public health due to its harmful effects on individuals and society. As health information-seeking increasingly shifts to digital platforms, Large Language Models (LLMs)—now …","url":["https://lup.lub.lu.se/luur/download?func=downloadFile&recordOId=9200826&fileOId=9200829"]} {"year":"2025","title":"In the Mood to Exclude: Revitalizing Trespass to Chattels in the Era of GenAI Scraping","authors":["D Atkinson - arXiv preprint arXiv:2510.16049, 2025"],"snippet":"This paper argues that website owners have the right to exclude others from their websites. 
Accordingly, when generative AI (GenAI) scraping bots intentionally circumvent reasonable technological barriers, their conduct could be actionable as …","url":["https://arxiv.org/pdf/2510.16049"]} +{"year":"2025","title":"In-Context and Few-Shots Learning for Forecasting Time Series Data based on Large Language Models","authors":["S Gopali, B Chhetri, D Giri, S Siami-Namini, AS Namin - arXiv preprint arXiv …, 2025"],"snippet":"… The system models local and global patterns across several domains, and textual information obtained from multiple sources, such as opensource code, such as common crawl, enables TimesFM to identify language trends. For example, the …","url":["https://arxiv.org/pdf/2512.07705"]} {"year":"2025","title":"In-Context Learning as Conditioned Associative Memory Retrieval","authors":["W Wu, TY Hsiao, JYC Hu, W Zhang, H Liu - Forty-second International Conference on …"],"snippet":"We provide an exactly solvable example for interpreting In-Context Learning (ICL) with one-layer attention models as conditional retrieval of dense associative memory models. 
Our main contribution is to interpret ICL as memory reshaping in the modern …","url":["https://openreview.net/pdf?id=Zup6F3MwQO"]} +{"year":"2025","title":"In-language Exams for Massively Multilingual Vision Evaluation Salazar, Israfel; Burda, Manuel Fernández; Islam, Shayekh Bin; Moakhar, Arshia Soltani; Singh …","authors":["I Salazar"],"snippet":"Evaluations are the backbone of measuring progress in machine learning, yet many benchmarks–especially for language models–continue to mirror an English and Western-centric worldview (Joshi et al., 2020; Fan et al., 2020; Dodge et al., 2021; …","url":["https://vbn.aau.dk/ws/files/776476487/2504.07072v1.pdf"]} {"year":"2025","title":"Incident Cause Classification in Insurance claims using Generative AI","authors":["M Uzair - 2025"],"snippet":"Automation plays a critical role in modern insurance operations, enabling companies like OP-Pohjola to process claims more rapidly, reduce manual workloads, and improve customer satisfaction. However, claim automation rates are …","url":["https://aaltodoc.aalto.fi/bitstreams/e9ba20e5-5a3b-45f9-b6bb-b85212df4745/download"]} {"year":"2025","title":"Incorporating Symmetry and Constraints Into Machine Learning for Molecular and Solid-State Systems","authors":["W Gong - 2025"],"snippet":"This thesis focuses on the development and application of ML models incorporating physics constraints and symmetry for predicting complex physical quantities of both solid-state condensed matter systems and molecules with the aim to accelerate the …","url":["https://search.proquest.com/openview/8c93c72ef3008d1940bb12a92336f490/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Increasing LLM Coding Capabilities through Diverse Synthetic Coding Tasks","authors":["A Abed, I Lukic, JKH Franke, F Hutter - arXiv preprint arXiv:2510.23208, 2025"],"snippet":"Large language models (LLMs) have shown impressive promise in code generation, yet their progress remains limited by the shortage of 
large-scale datasets that are both diverse and well-aligned with human reasoning. Most existing resources pair …","url":["https://arxiv.org/pdf/2510.23208"]} {"year":"2025","title":"Incremental Tensor Induction through Unbounded Pseudo-Contextualization in Pretrained Language Models","authors":["O Strickland, H Whitlam, R Cattermole, S Chilvers…"],"snippet":"… Model Selection and Modification The base model selected for architectural modification was a 7B-parameter causal decoder-only transformer pretrained on a mixture of Common Crawl, Wikipedia, and high-quality instructional corpora …","url":["https://www.researchgate.net/profile/Kent-Blumberg-2/publication/395935667_Incremental_Tensor_Induction_through_Unbounded_Pseudo-Contextualization_in_Pretrained_Language_Models/links/68d9092b9383755fd707648d/Incremental-Tensor-Induction-through-Unbounded-Pseudo-Contextualization-in-Pretrained-Language-Models.pdf"]} {"year":"2025","title":"Indian integration","authors":["B Raj"],"snippet":"… Reference.org uses data and images under license from Common Crawl, Getty Images, MusicBrainz, TMDB, Unsplash, Wikipedia …","url":["https://reference.org/facts/Praja_Mandal/eI2VeAu1"]} {"year":"2025","title":"Indian Legal Judgment Summarization using LEGAL-BERT and BiLSTM model with Adaptive Length","authors":["V Naik, K Rajeswari - EPJ Web of Conferences, 2025"],"snippet":"… While pretrained general corpora (eg, Wikipedia, BooksCorpus, and Common Crawl) trained language models have proven effective across generalized tasks, they often fall short when considering domain-specific tasks that require in-domain …","url":["https://www.epj-conferences.org/articles/epjconf/pdf/2025/13/epjconf_icetsf2025_01043.pdf"]} +{"year":"2025","title":"IndicParam: Benchmark to evaluate LLMs on low-resource Indic Languages","authors":["A Maheshwari, K Sharma, V Patel, A Maheshwari - arXiv preprint arXiv:2512.00333, 2025"],"snippet":"While large language models excel on high-resource multilingual tasks, 
lowand extremely low-resource Indic languages remain severely under-evaluated. We present IndicParam, a human-curated benchmark of over 13,000 multiple-choice …","url":["https://arxiv.org/pdf/2512.00333"]} {"year":"2025","title":"IndicSuperTokenizer: An Optimized Tokenizer for Indic Multilingual LLMs","authors":["S Rana, A Menezes, A Kulkarni, C Khatri, S Agarwal - arXiv preprint arXiv …, 2025"],"snippet":"Tokenizers play a crucial role in determining the performance, training efficiency, and the inference cost of Large Language Models (LLMs). Designing effective tokenizers for multilingual LLMs is particularly challenging due to diverse scripts and …","url":["https://arxiv.org/pdf/2511.03237"]} {"year":"2025","title":"Indo-Aryan Languages: A Transformer-Based Survey","authors":["S Roy, JR Saini - Intelligent System and Data Analysis: SSIC 2023 …"],"snippet":"… It has been trained on large (2.5 TB) Common Crawl Data [30]. It has performed well for all multiple cross-lingual benchmarks. This model consists of 12 … Dirt cheap web-scale parallel text from the common crawl. In: Proceedings of the 51st …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=RFlCEQAAQBAJ&oi=fnd&pg=PA390&dq=commoncrawl&ots=ksbMf6VReK&sig=OLSGWjzlIpqdGGNqhh2JY-1QaPg"]} {"year":"2025","title":"Indonesian Abstractive Text Summarization Using Stacked Embeddings and Transformer Decoder","authors":["E Winarko, L Tanoto, MH Reza"],"snippet":"Document summarization can be categorized into two categories: extractive and abstractive summarization. Research in abstractive summarization is more limited than that of extractive summarization, especially for Indonesian documents. 
Most …","url":["https://www.iaeng.org/IJCS/issues_v52/issue_4/IJCS_52_4_16.pdf"]} +{"year":"2025","title":"Inevitable Errors: Defamation by Hallucination in AI Reasoning Models","authors":["LB Lidsky, A Daves - 2025"],"snippet":"Over the last millennium, defamation law has adapted to many new information technologies, including the printing press, the telegraph, and the internet. Now, defamation law must adapt to the challenges presented by generative artificial …","url":["https://scholarship.law.ufl.edu/cgi/viewcontent.cgi?article=2385&context=facultypub"]} {"year":"2025","title":"Infini-gram mini: Exact n-gram Search at the Internet Scale with FM-Index","authors":["H Xu, J Liu, Y Choi, NA Smith, H Hajishirzi - arXiv preprint arXiv:2506.12229, 2025"],"snippet":"… In the future, we will keep indexing the latest crawl in Common Crawl and update contamination results to track benchmark contamination as corpora evolve. The system also allows anyone to add or upload new benchmarks to be monitored …","url":["https://arxiv.org/pdf/2506.12229"]} {"year":"2025","title":"Influence of parallel data on multilingual representation space of language models","authors":["J Leino, J Karlgren - 2025"],"snippet":"Language models have become increasingly performant in processing textual data in recent years, leading to their rapidly growing adoption in the everyday lives of people. As these models transform the way we work, learn, and live across sectors …","url":["https://helda.helsinki.fi/bitstreams/4324aa4d-8156-4ceb-b012-cd8a81f46dcb/download"]} {"year":"2025","title":"INFORMATION EXTRACTION FROM SCIENTIFIC LITERATURE","authors":["H Pan - 2025"],"snippet":"The exponential growth of scientific literature, with millions of new articles published annually, has created an unsustainable discovery bottleneck across research communities. 
Manual extraction of critical information—including methodologies …","url":["https://cis.temple.edu/~latecki/Dissertations/JoPan_Dissertation2025.pdf"]} {"year":"2025","title":"Informative task classification with concatenated embeddings using deep learning on crisisMMD","authors":["T Jain, D Gopalani, Y Kumar Meena - International Journal of Computers and …, 2025"],"snippet":"Disastrous situations pose a formidable challenge, testing our resilience against nature's fury and the race against time to prevent the loss of human life. It is noted that in such situations that Microblogging platforms like Twitter(now X) have proven …","url":["https://www.tandfonline.com/doi/abs/10.1080/1206212X.2024.2447066"]} {"year":"2025","title":"Informed Digital Systems: Knowledge Procurement, Gate/Keeping, and Experience","authors":["RX Nokes - 2025"],"snippet":"… Generative models like ChatGPT are similarly reliant upon a system of crawlers (known as GPTBot), the Common Crawl (a massive nonprofit organization that maintains an open repository of information collected across the internet), and other content such …","url":["https://search.proquest.com/openview/691423f46f56e17fb021c563c32a00c2/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2025","title":"Infrastructural consent: robots.txt as a protocol for automated data extraction","authors":["K MacKinnon, E Maemura - Information, Communication & Society, 2025"],"snippet":"… Footnote 13 What is not often addressed or acknowledged is how the technologies of IA’s collecting (as well as other large-scale archives like Common Crawl) share their origins with the crawler technologies used for search engine …","url":["https://www.tandfonline.com/doi/abs/10.1080/1369118X.2025.2598054"]} {"year":"2025","title":"Inner Thinking Transformer: Leveraging Dynamic Depth Scaling to Foster Adaptive Internal Thinking","authors":["Y Chen, J Shang, Z Zhang, Y Xie, J Sheng, T Liu… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models 
(LLMs) face inherent performance bottlenecks under parameter constraints, particularly in processing critical tokens that demand complex reasoning. Empirical analysis reveals challenging tokens induce abrupt gradient …","url":["https://arxiv.org/pdf/2502.13842"]} {"year":"2025","title":"Innovative AI‐Driven Data Annotation Techniques","authors":["G Viswanath, GKK Reddy, KS Rao, C Rambabu - Adaptive Artificial Intelligence …, 2026"],"snippet":"Big data analytics offers significant potential for organizations to gain valuable insights and improve performance. Anyhow, the quality data mining is importantly dependent on data quality. This chapter addresses the vital challenges obstructing …","url":["https://onlinelibrary.wiley.com/doi/abs/10.1002/9781394389070.ch15"]} {"year":"2025","title":"Inside Out 2: Make Room for New Emotions & LLM: A Reproducibility Study of the Emotional Side of Search in the Classroom","authors":["H Chakrabarti, DM Tobia, M Landoni, MS Pera - … of the 48th International ACM SIGIR …, 2025"],"snippet":"In an existing study, the InsideOut Framework is used to produce and explore the emotional profiles of search engines (SE) in response to queries formulated by children aged 9 to 11 in the classroom context, revealing the emotional diversity of …","url":["https://dl.acm.org/doi/pdf/10.1145/3726302.3730315"]} @@ -10253,6 +10386,7 @@ {"year":"2025","title":"INSTRUCTING LANGUAGE MODELS TO BE INTELLIGENT AI ASSISTANTS","authors":["Z Zhang - 2025"],"snippet":"… sources have been utilized for such purposes: Conversation logs between human users and online LM services (eg, OpenAI API) [20, 214]; Online QA forums like StackExchange, WikiHow, and Reddit [218]; Directly extracting QA pairs from web …","url":["https://curate.nd.edu/ndownloader/files/56105813/1"]} {"year":"2025","title":"Instructing Large Language Models for Low-Resource Languages: A Systematic Study for Basque","authors":["O Sainz, N Perez, J Etxaniz, JF de Landa, I Aldabe… - arXiv preprint 
arXiv …, 2025"],"snippet":"Instructing language models with user intent requires large instruction datasets, which are only available for a limited set of languages. In this paper, we explore alternatives to conventional instruction adaptation pipelines in low-resource …","url":["https://arxiv.org/pdf/2506.07597"]} {"year":"2025","title":"Instruction-Tuning Data Synthesis from Scratch via Web Reconstruction","authors":["Y Jiang, Y Wang, C Wu, X Dai, Y Xu, W Gan, Y Wang… - arXiv preprint arXiv …, 2025"],"snippet":"The improvement of LLMs' instruction-following capabilities depends critically on the availability of high-quality instruction-response pairs. While existing automatic data synthetic methods alleviate the burden of manual curation, they often rely heavily on …","url":["https://arxiv.org/pdf/2504.15573"]} +{"year":"2025","title":"Instruction-tuning pretrained causal language models to restore ancient Greek papyri and inscriptions","authors":["E Cullhed - Digital Scholarship in the Humanities, 2025"],"snippet":"… info and inscriptions.packhum.org occasionally appear in the indices of the Common Crawl corpus (index. commoncrawl.org). In a final step, the inscription restoration model was fine-tuned from scratch, starting from the papyrus restoration model …","url":["https://academic.oup.com/dsh/advance-article/doi/10.1093/llc/fqaf131/8342432"]} {"year":"2025","title":"INTEGRATING LARGE LANGUAGE MODELS AND VIRTUAL REALITY FOR INTERACTIVE CIRCUIT ANALYSIS","authors":["M Ibrahim, V Eriksson - 2025"],"snippet":"This Master thesis explores the integration of Artificial Intelligence (AI) into Virtual Reality (VR) as a tool for interactive learning in electronics education. 
The work was carried out in collaboration with ByBrick and Knightec Group, focusing on creating …","url":["https://www.diva-portal.org/smash/get/diva2:1965755/FULLTEXT01.pdf"]} {"year":"2025","title":"Integrating LLMs with ITS: Recent Advances, Potentials, Challenges, and Future Directions","authors":["D Mahmud, H Hajmohamed, S Almentheri, S Alqaydi… - arXiv preprint arXiv …, 2025"],"snippet":"Intelligent Transportation Systems (ITS) are crucial for the development and operation of smart cities, addressing key challenges in efficiency, productivity, and environmental sustainability. This paper comprehensively reviews the transformative …","url":["https://arxiv.org/pdf/2501.04437"]} {"year":"2025","title":"Integrating product data from the web using deep learning techniques","authors":["A Brinkmann - 2025"],"snippet":"… org Dataset Series, a publicly available dataset derived from the Common Crawl, facilitating the analysis of schema. org adoption on the Web and providing distant supervision for machine learning tasks such as product classification and entity …","url":["https://madoc.bib.uni-mannheim.de/70659/1/Dissertation_Alexander_Brinkmann.pdf"]} @@ -10271,6 +10405,7 @@ {"year":"2025","title":"Introducing a Bangla sentence gloss pair dataset for Bangla sign language translation and research","authors":["NA Roudra, N Saha, R Shahriyar, S Sakib - 2025"],"snippet":"Bangla Sign Language translation and recognition has been an evolving research topic throughout the years. However, existing research on this field is limited to word and alphabet level detection. 
For a more continuous sentence level detection of …","url":["https://dspace.bracu.ac.bd:8443/xmlui/bitstream/handle/10361/26615/21301410,21301198,21301181,21101091_CSE.pdf?sequence=1"]} {"year":"2025","title":"Introducing A Bangla Sentence-Gloss Pair Dataset for Bangla Sign Language Translation and Research","authors":["N Saha, R Shahriyar, NA Roudra, S Sakib, AA Rasel - arXiv preprint arXiv …, 2025"],"snippet":"Bangla Sign Language (BdSL) translation represents a low-resource NLP task due to the lack of large-scale datasets that address sentence-level translation. Correspondingly, existing research in this field has been limited to word and …","url":["https://arxiv.org/pdf/2511.08507"]} {"year":"2025","title":"Introduction and Fundamentals","authors":["P Passban, M Rezagholizadeh, A Way - … LLM Performance: Efficacy, Fine-Tuning, and …, 2025"],"snippet":"In this chapter, we explain the intricacies of language modelling, focusing on the evolution from statistical models to the sophisticated large language models (LLMs) that dominate the field today. 
We explore the transition from n-gram models to neural …","url":["https://link.springer.com/chapter/10.1007/978-3-031-85747-8_1"]} +{"year":"2025","title":"Introduction to the Special Issue 2025 Looking Back and Looking Ahead: A Wide Angle Lens on Audiovisual Translation","authors":["D Chiaro, L Rossato - Journal of Audiovisual Translation, 2025"],"snippet":"… digital neglect arises because languages serve as vessels for centuries of deep, localised human experience, specialized expertise and socio-philosophical worldviews which often remain undocumented and oral and absent from accessible …","url":["https://www.jatjournal.org/index.php/jat/article/download/467/169"]} {"year":"2025","title":"Investigating Ageism, Ableism, and Nationality Bias in Norwegian and Multilingual Language Models","authors":["MS Sjåvik - 2025"],"snippet":"We investigate biases related to ageism, ableism, and nationality in four Norwegian and two multilingual language models. These types of bias are underexplored in the current literature, and existing work on Norwegian models has primarily focused on …","url":["https://bora.uib.no/bora-xmlui/bitstream/handle/11250/3208171/69831905.pdf?sequence=1"]} {"year":"2025","title":"Investigating Code Review Quality in ML Libraries: Patterns of Missed Bugs and Bug Detection with LLMs","authors":["V Thaker - 2025"],"snippet":"Over the past several years, ML techniques have become commonplace in numerous technological areas where many real-world environments depend on such techniques. 
More recently, tasks that relied on traditional ML approaches are …","url":["https://carleton.scholaris.ca/bitstreams/156683d5-00f8-4c09-aba5-32aa0979eeff/download"]} {"year":"2025","title":"Investigating How Pre-training Data Leakage Affects Models' Reproduction and Detection Capabilities","authors":["M Kaneko, T Baldwin - Proceedings of the 2025 Conference on Empirical …, 2025"],"snippet":"… The most common sources included in all LLMs are web page sources such as C4, CommonCrawl, and the Pile. Because they are collected from various web pages, there is a risk that they may contain personal information, copyrighted texts …","url":["https://aclanthology.org/2025.emnlp-main.1201.pdf"]} @@ -10279,12 +10414,14 @@ {"year":"2025","title":"Investigating the cross-lingual generalizability of readability assessment using a multilingual BERT model fine-tuned in a single language","authors":["M Nordstedt - 2025"],"snippet":"… As previously stated the model used in this study is implemented using the xlm-roBERTa-large model, a multilingual BERT variant pre-trained on the CommonCrawl dataset, comprising 2.5 TB of data across 100 languages. It is implemented using the …","url":["https://www.diva-portal.org/smash/get/diva2:1990576/FULLTEXT01.pdf"]} {"year":"2025","title":"Investigating the Feasibility and Risks of Leveraging Artificial Intelligence and Open Source Intelligence to Manage Predictive Cyber Threat Models","authors":["OA Obioha-Val, TI Lawal, OO Olaniyi, MO Gbadebo…"],"snippet":"This study investigates the integration of Artificial Intelligence (AI) and Open Source Intelligence (OSINT) to enhance predictive threat modeling in cybersecurity, addressing the growing complexity and frequency of cyber threats. 
Integrating AI …","url":["https://www.researchgate.net/profile/Oluwaseun-Olaniyi/publication/388320618_Investigating_the_Feasibility_and_Risks_of_Leveraging_Artificial_Intelligence_and_Open_Source_Intelligence_to_Manage_Predictive_Cyber_Threat_Models/links/679277be207c0c20fa555a4b/Investigating-the-Feasibility-and-Risks-of-Leveraging-Artificial-Intelligence-and-Open-Source-Intelligence-to-Manage-Predictive-Cyber-Threat-Models.pdf"]} {"year":"2025","title":"Investigating the Impact of Language-Adaptive Fine-Tuning on Sentiment Analysis in Hausa Language Using AfriBERTa","authors":["SA Sani, SH Muhammad, D Jarvis - arXiv preprint arXiv:2501.11023, 2025"],"snippet":"Sentiment analysis (SA) plays a vital role in Natural Language Processing (NLP) by ~identifying sentiments expressed in text. Although significant advances have been made in SA for widely spoken languages, low-resource languages such as Hausa …","url":["https://arxiv.org/pdf/2501.11023"]} +{"year":"2025","title":"Investigating the impact of training data coverage on large language model hallucinations","authors":["S Zhang - 2025"],"snippet":"… In contrast, the commoncrawl subset is divided into five temporal partitions—2019-30, 2020-05, 2021-04, 2022-05, and 2023-06—each … For the commoncrawl subset, the corpus is further divided into five temporal partitions, as described earlier …","url":["https://umontreal.scholaris.ca/bitstreams/0a26be5b-9f09-4d80-8920-ed1f06940ba8/download"]} {"year":"2025","title":"Investigating the Validity Evidence of Automated Scoring Methods for Divergent Thinking Assessments","authors":["J Saretzki, M Benedek"],"snippet":"Divergent thinking (DT) ability is a fundamental aspect of creativity, but its assessment remains challenging by the reliance on effortful human ratings and persistent uncertainty regarding how to aggregate scores across a variable number 
…","url":["https://www.researchgate.net/profile/Janika-Saretzki/publication/393518080_Investigating_the_Validity_Evidence_of_Automated_Scoring_Methods_for_Divergent_Thinking_Assessments/links/686e733ae4632b045dcadfe0/Investigating-the-Validity-Evidence-of-Automated-Scoring-Methods-for-Divergent-Thinking-Assessments.pdf"]} {"year":"2025","title":"Invisible Languages of the LLM Universe","authors":["S Khanna, X Li - arXiv preprint arXiv:2510.11557, 2025"],"snippet":"Large Language Models are trained on massive multilingual corpora, yet this abundance masks a profound crisis: of the world's 7,613 living languages, approximately 2,000 languages with millions of speakers remain effectively invisible …","url":["https://arxiv.org/pdf/2510.11557"]} {"year":"2025","title":"IRBlock: A Large-Scale Measurement Study of the Great Firewall of Iran","authors":["P Whiting"],"snippet":"… These domain test lists are collected from various sources, including top-level domains (TLD) zone files [6], the Citizen Lab test lists (CLTL) [13], the Tranco list [60], and the Common Crawl project [2]. We use a new domain list generated every day …","url":["https://www.usenix.org/system/files/usenixsecurity25-tai.pdf"]} {"year":"2025","title":"Irish-BLiMP: A Linguistic Benchmark for Evaluating Human and Language Model Performance in a Low-Resource Setting","authors":["J McGiff, KT Tran, W Mulcahy, DÓ Luinín, J Dalzell… - arXiv preprint arXiv …, 2025"],"snippet":"We present Irish-BLiMP (Irish Benchmark of Linguistic Minimal Pairs), the first dataset and framework designed for fine-grained evaluation of linguistic competence in the Irish language, an endangered language. Drawing on a variety of …","url":["https://arxiv.org/pdf/2510.20957"]} {"year":"2025","title":"Is 3D Technology a Curse or a Blessing for the Market of Contemporary Sculpture?","authors":["V Wiesinger - Sites of Reproduction. Fotografie und Skulptur …, 2025"],"snippet":"The Saint-Maur Gallic Warrior| Fig. 
3|, in brass and silver, had been unearthed in twenty-two pieces in the north of France in 1983 before it was purchased by the Musée départemental de l’Oise, Beauvais, in 1985 (inv. 85.16↗). It was …","url":["https://journals.ub.uni-heidelberg.de/index.php/kchronik/issue/view/7389/1344#page=94"]} {"year":"2025","title":"Is ChatGpt Better Than Epileptologists at Interpreting Seizure Semiology?","authors":["Y Luo - 2024"],"snippet":"Objective: This study aims to evaluate the clinical value of representative large language models (LLMs), namely ChatGPT, on interpreting seizure semiology to localize epileptogenic zones (EZs) for presurgical assessment in patients with focal …","url":["https://search.proquest.com/openview/f7e76f12672af85c47a11c6ed4ae06dd/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2025","title":"Is ChatGPT conservative or liberal? A novel approach to assess ideological stances and biases in generative LLMs","authors":["CP Walker, JC Timoneda - Political Science Research and Methods, 2025"],"snippet":"Extant work shows that generative AI such as GPT-3.5 and perpetuate social stereotypes and biases. A less explored source of bias is ideology: do GPT models take ideological stances on politically sensitive topics? We develop a novel …","url":["https://www.cambridge.org/core/services/aop-cambridge-core/content/view/406C5424CA3E49174781B0112C0BB04F/S2049847025100575a.pdf/is_chatgpt_conservative_or_liberal_a_novel_approach_to_assess_ideological_stances_and_biases_in_generative_llms.pdf"]} {"year":"2025","title":"Is Neural Machine Translation Viable for Low-Resource Languages? An Experimental Study of the Irish Language","authors":["J Quigley - 2025"],"snippet":"Transformer-based Neural Machine Translation (NMT) models are Large Language Models (LLMs) designed and developed for translating between two or more given languages. 
These are typically most successful in the context of high-resource …","url":["https://search.proquest.com/openview/056fb6ba73c6f3a865817e3dadb87cfa/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Is Single-View Mesh Reconstruction Ready for Robotics?","authors":["F Nolte, B Schölkopf, I Posner - arXiv preprint arXiv:2505.17966, 2025"],"snippet":"This paper evaluates single-view mesh reconstruction models for creating digital twin environments in robot manipulation. Recent advances in computer vision for 3D reconstruction from single viewpoints present a potential breakthrough for efficiently …","url":["https://arxiv.org/pdf/2505.17966"]} {"year":"2025","title":"Is There a Case for Conversation Optimized Tokenizers in Large Language Models?","authors":["R Ferrando, J Conde, G Martínez, P Reviriego - arXiv preprint arXiv:2506.18674, 2025"],"snippet":"The computational and energy costs of Large Language Models (LLMs) have increased exponentially driven by the growing model sizes and the massive adoption of LLMs by hundreds of millions of users. The unit cost of an LLM is the …","url":["https://arxiv.org/pdf/2506.18674"]} @@ -10295,6 +10432,7 @@ {"year":"2025","title":"IV An Updated Superscript: Paradoxes of Writing Amidst Generative AI","authors":["RH Gibson - Ecologies of Writing: Natural, Technical, and Social …, 2025"]} {"year":"2025","title":"Jabuticaba: The largest commercial corpus for LLMs in Portuguese","authors":["M Amadeus, WAC Castaneda, JRH da Silva, R Scotti"],"snippet":"… ’s 3 billion bpet and 45 TB of compressed plaintext from Common Crawl before filtering and 570 GB after filtering, equivalent to 400 billion bpet. 
… Models, includes over 100B text documents coming from 84 CommonCrawl snapshots and processed …","url":["https://preprints.scielo.org/index.php/scielo/preprint/download/12696/23290"]} {"year":"2025","title":"JAI-1: A Thai-Centric Large Language Model","authors":["AT Rutherford, J Karnjanaekarin, N Panitsrisit… - arXiv preprint arXiv …, 2025"],"snippet":"… The latter utilizes Upstage’s LP data pipeline [23] to process a large-scaled Common Crawl dump data. Both … 12.53B cleaned version of Common Crawl’s corpus5 th-oscar … of our own corpus and filtering process from a large scaled open-sourced …","url":["https://arxiv.org/pdf/2510.08620"]} +{"year":"2025","title":"Jeopardizing Linguistic Diversity: How AI-generated Translations Neutralize Vernacular Irish English","authors":["K Walter - Applying Artificial Intelligence in Translation, 2025"],"snippet":"This chapter compares human and automated German translations of three short excerpts from Kevin Barry’s Night Boat to Tangier (2019), a novel written in vernacular Irish English. It is shown that machine output not only reinforces the …","url":["https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.4324/9781003539698-4&type=chapterpdf"]} {"year":"2025","title":"Jet-Nemotron: Efficient Language Model with Post Neural Architecture Search","authors":["Y Gu, Q Hu, S Yang, H Xi, J Chen, S Han, H Cai - arXiv preprint arXiv:2508.15884, 2025"],"snippet":"We present Jet-Nemotron, a new family of hybrid-architecture language models, which matches or exceeds the accuracy of leading full-attention models while significantly improving generation throughput. 
Jet-Nemotron is developed using …","url":["https://arxiv.org/pdf/2508.15884"]} {"year":"2025","title":"JiuZhou: open foundation language models and effective pre-training framework for geoscience","authors":["Z Chen, M Lin, M Zang, Z Wang, J Li, Y Bai - International Journal of Digital Earth, 2025"],"snippet":"Geoscience research has generated vast amounts of data, creating a need for effective extraction and integration of knowledge to address global-change challenges, promote sustainable development, and accelerate scientific discovery …","url":["https://www.tandfonline.com/doi/pdf/10.1080/17538947.2025.2449708"]} {"year":"2025","title":"Joint Multi-modal Modeling","authors":["J Ou, H Xu, H Zan - Machine Translation: 20th China Conference, CCMT …"],"snippet":"… The text translation data are normally news or common crawl, while the speech translation data are usually talks and recitations. There is a significant domain gap, and selectively using the text translation data which are more close to speech …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=XRpIEQAAQBAJ&oi=fnd&pg=PA98&dq=commoncrawl&ots=ExfTViOdJT&sig=RKmL_C5fNAI_hxykW5ESDbfKd_c"]} @@ -10302,6 +10440,7 @@ {"year":"2025","title":"JT-Math: A Multi-Stage Framework for Advanced Mathematical Reasoning in Large Language Models","authors":["Y Hao, F Chao, Y Hao, Z Cui, H Bai, H Zhang, Y Liu… - arXiv preprint arXiv …, 2025"],"snippet":"… These models undergo continual pre-training with a 120B-token corpus of high-quality mathematical web data sourced from Common Crawl. 
The series includes an instruction-tuned variant, DeepSeek-Math-Instruct, trained on problems with Chain-of-Thought …","url":["https://arxiv.org/pdf/2507.19748"]} {"year":"2025","title":"JT-Safe: Intrinsically Enhancing the Safety and Trustworthiness of LLMs","authors":["J Feng, F Meng, C Long, P Cong, D Wang, Y Zheng… - arXiv preprint arXiv …, 2025"],"snippet":"… Data sources are broad spanning from world wide web raw data (eg Common Crawl), generated data, various types of specialized data (such as data from industries and fields like books, code, and education) to private datasets. …","url":["https://arxiv.org/pdf/2510.17918"]} {"year":"2025","title":"Judging Quality Across Languages: A Multilingual Approach to Pretraining Data Filtering with Language Models","authors":["M Ali, M Brack, M Lübbering, E Wendt, AG Khan… - arXiv preprint arXiv …, 2025"],"snippet":"… The vast majority of training data for large language models is sourced from the web, with Common Crawl (CC) being the most important corpus. 
Traditionally, many works have relied heavily, and in some cases exclusively, on heuristic-based filtering …","url":["https://arxiv.org/pdf/2505.22232"]} +{"year":"2025","title":"K2-V2: A 360-Open, Reasoning-Enhanced LLM","authors":["Z Liu, L Tang, L Jin, H Li, N Ranjan, D Fan, S Rohatgi… - arXiv preprint arXiv …, 2025"],"snippet":"… We include a substantial amount of Arabic text in our pretraining corpus, primarily sourced from Common Crawl web scrapes extending the … To obtain a high-level view of the content distribution in our corpus, we perform a topic analysis on the …","url":["https://arxiv.org/pdf/2512.06201"]} {"year":"2025","title":"Kaleidoscope: In-language Exams for Massively Multilingual Vision Evaluation","authors":["I Salazar, MF Burda, SB Islam, AS Moakhar, S Singh… - arXiv preprint arXiv …, 2025"],"snippet":"The evaluation of vision-language models (VLMs) has mainly relied on English-language benchmarks, leaving significant gaps in both multilingual and multicultural coverage. While multilingual benchmarks have expanded, both in size and languages, many …","url":["https://arxiv.org/pdf/2504.07072"]} {"year":"2025","title":"Kanana: Compute-efficient Bilingual Language Models","authors":["Y Bak, H Lee, M Ryu, J Ham, S Jung, DW Nam, T Eo… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce Kanana, a series of bilingual language models that demonstrate exceeding performance in Korean and competitive performance in English. The computational cost of Kanana is significantly lower than that of state-of-the-art …","url":["https://arxiv.org/pdf/2502.18934"]} {"year":"2025","title":"KatotohananQA: Evaluating Truthfulness of Large Language Models in Filipino","authors":["LA Nery, RD Catignas, TJ Tiam-Lee - arXiv preprint arXiv:2509.06065, 2025"],"snippet":"… Only 0.83% of the widely-used pre-training dataset Common Crawl is in Filipino while 45.26% is in English [7]. 
Aside from this, over two-thirds of instruction data for fine-tuning LLMs is in English [8]. This highlights the need for further research into …","url":["https://arxiv.org/pdf/2509.06065"]} @@ -10312,6 +10451,7 @@ {"year":"2025","title":"Kimi k1. 5: Scaling Reinforcement Learning with LLMs","authors":["K Team, A Du, B Gao, B Xing, C Jiang, C Chen, C Li… - arXiv preprint arXiv …, 2025"],"snippet":"Language model pretraining with next token prediction has proved effective for scaling compute but is limited to the amount of available training data. Scaling reinforcement learning (RL) unlocks a new axis for the continued improvement of …","url":["https://arxiv.org/pdf/2501.12599"]} {"year":"2025","title":"Kimi-VL Technical Report","authors":["K Team, A Du, B Yin, B Xing, B Qu, B Wang, C Chen… - arXiv preprint arXiv …, 2025"],"snippet":"We present Kimi-VL, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers advanced multimodal reasoning, long-context understanding, and strong agent capabilities - all while activating only 2.8B parameters in its …","url":["https://arxiv.org/pdf/2504.07491"]} {"year":"2025","title":"Know Your Limits: Entropy Estimation Modeling for Compression and Generalization","authors":["BL Badger, M Neligeorge - arXiv preprint arXiv:2511.10618, 2025"],"snippet":"Language prediction is constrained by informational entropy intrinsic to language, such that there exists a limit to how accurate any language model can become and equivalently a lower bound to language compression. The most efficient language …","url":["https://arxiv.org/pdf/2511.10618"]} +{"year":"2025","title":"Knowledge engineering information technology for cultural-educational scenarios based on RAG","authors":["K Lipianina-Honcharenko, N Melnyk, M Komar… - 2025"],"snippet":"… A notable example is the C4 corpus (Colossal Clean Crawled Corpus), created on the basis of Common Crawl with subsequent cleaning [5]. 
During HTML parsing, libraries such as Boilerpipe or Readability are employed to extract the main text and …","url":["https://ceur-ws.org/Vol-4141/paper11.pdf"]} {"year":"2025","title":"Knowledge Extraction on Semi-Structured Content: Does It Remain Relevant for Question Answering in the Era of LLMs?","authors":["K Sun, Y Huang, S Mehra, M Kachuee, X Chen, R Tao… - arXiv preprint arXiv …, 2025"],"snippet":"The advent of Large Language Models (LLMs) has significantly advanced web-based Question Answering (QA) systems over semi-structured content, raising questions about the continued utility of knowledge extraction for question answering. This …","url":["https://arxiv.org/pdf/2509.25107"]} {"year":"2025","title":"Knowledge Graph Completion using RAG and Improved Structural Information","authors":["B Li, Z Mao, R Yan, A Ling, Q Hu, Q Zeng - 2025 IEEE 2nd International Conference …, 2025"],"snippet":"… To convert triple data into natural language text, the large model RAG uses the Wikipedia Dump, Common Crawl datasets, and the Phi-3-medium-4k-instruct model [17]. These datasets, sourced from Wikipedia and web data, contain natural language …","url":["https://ieeexplore.ieee.org/abstract/document/11087034/"]} {"year":"2025","title":"knowledge on biodiversity beyond national jurisdiction","authors":["M Zhang, Y Chen - Advances in Marine Environmental Protection …, 2025"],"snippet":"Areas beyond national jurisdiction (ABNJ) face persistent degradation of marine biodiversity (Humphries and Harden-Davies, 2020). 
A United Nations agreement on the conservation and sustainable use of marine biodiversity in areas beyond …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=evFZEQAAQBAJ&oi=fnd&pg=PA133&dq=commoncrawl&ots=8R8I4s1x8V&sig=UqQ9NenOOMWzYXzSJOBR6BqdUW8"]} @@ -10320,11 +10460,13 @@ {"year":"2025","title":"KORMo: Korean Open Reasoning Model for Everyone","authors":["M Kim, H Lim, H Yoo, I Won, S Song, M Cho, J Yuk… - arXiv preprint arXiv …, 2025"],"snippet":"… sources: existing open resources and raw dumps directly parsed from Common Crawl. … extracted Korean text from the raw dumps of Common Crawl. Specifically, we parsed WARC files from … The language identification provided by Common …","url":["https://arxiv.org/pdf/2510.09426"]} {"year":"2025","title":"Kr\\'eyoLID From Language Identification Towards Language Mining","authors":["R Dent, PO Suarez, T Clérice, B Sagot - arXiv preprint arXiv:2503.06547, 2025"],"snippet":"… more of distracting documents in a 2.6 billion page Common Crawl snapshot in a few hours on a … of first pass filtering on the December 2024 Common Crawl snapshot for each target label. … After that, we test document-level filtering on a full …","url":["https://arxiv.org/pdf/2503.06547"]} {"year":"2025","title":"Krutrim LLM: Multilingual Foundational Model for over a Billion People","authors":["A Kallappa, P Kamble, A Ravi, A Patidar, V Dhruv… - arXiv preprint arXiv …, 2025"],"snippet":"… Indic languages comprise only 1 percent of Common Crawl corpora despite India representing 18 percent of the global population, leading to linguistic biases. 
Thousands of regional languages, dialects, and code mixing create additional …","url":["https://arxiv.org/pdf/2502.09642"]} +{"year":"2025","title":"KV-CAR: KV Cache Compression using Autoencoders and KV Reuse in Large Language Models","authors":["S Roy, S Sridharan, S Selvam, A Raghunathan - arXiv preprint arXiv:2512.06727, 2025"],"snippet":"… C4 (Colossal Clean Crawled Corpus) is a large-scale dataset derived from a cleaned version of the Common Crawl web corpus. It is often used in pretraining large language models. Due to computational limitations, we employed only a small …","url":["https://arxiv.org/pdf/2512.06727"]} {"year":"2025","title":"Lance: Efficient Random Access in Columnar Storage through Adaptive Structural Encodings","authors":["W Pace, C She, L Xu, W Jones, A Lockett, J Wang… - arXiv preprint arXiv …, 2025"],"snippet":"The growing interest in artificial intelligence has created workloads that require both sequential and random access. At the same time, NVMe-backed storage solutions have emerged, providing caching capability for large columnar datasets in cloud …","url":["https://arxiv.org/pdf/2504.15247"]} {"year":"2025","title":"Language Arithmetics: Towards Systematic Language Neuron Identification and Manipulation","authors":["D Gurgurov, K Trinley, YA Ghussin, T Baeumel… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models (LLMs) exhibit strong multilingual abilities, yet the neural mechanisms behind language-specific processing remain unclear. 
We analyze language-specific neurons in Llama-3.1-8B, Mistral-Nemo-12B, and Aya-Expanse-8B …","url":["https://arxiv.org/pdf/2507.22608"]} {"year":"2025","title":"Language Dominance in Multilingual Large Language Models","authors":["N Shani, A Basirat - Proceedings of the 8th BlackboxNLP Workshop …, 2025"],"snippet":"This paper investigates the language dominance hypothesis in multilingual large language models (LLMs), which posits that cross-lingual understanding is facilitated by an implicit translation into a dominant language seen more frequently during …","url":["https://aclanthology.org/2025.blackboxnlp-1.7.pdf"]} {"year":"2025","title":"Language Grounding in Vision","authors":["H Shahmohammadi - 2025"],"snippet":"… In this thesis, we make use of the 300-dimensional GloVe Embeddings trained on 840 billion tokens sourced from Commoncrawl covering … We make use of the 300-dimensional fastText Embeddings trained on Commoncrawl covering 2M unique words with …","url":["https://tobias-lib.ub.uni-tuebingen.de/xmlui/bitstream/handle/10900/162512/Dissertation_Hassan_Shahmohammadi.pdf?sequence=2&isAllowed=y"]} {"year":"2025","title":"Language Is Leaving Me: An AI Exploration of Epigenetic or Inherited Trauma of Cultures of Diaspora","authors":["E Pearlman - Leonardo, 2025"],"snippet":"… However, to briefly summarize the topic, English language image banks are built using Common Crawl, a web scraper. 
They skew towards popular sites like Pinterest, Wikimedia, Tumblr, various shopping sites, stock images, and images of celebrities …","url":["https://direct.mit.edu/leon/article/doi/10.1162/LEON.a.96/131956"]} +{"year":"2025","title":"Language machines: Toward a linguistic anthropology of large language models","authors":["S Lamoureaux, M Castelle, A Weichselbraun - Journal of Linguistic Anthropology, 2025"],"snippet":"Large language models (LLMs) challenge long‐standing assumptions in linguistics and linguistic anthropology by generating human‐like language without relying on rule‐based structures. This introduction to the special issue Language Machines …","url":["https://anthrosource.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jola.70033"]} {"year":"2025","title":"Language Modeling Over Logical Forms","authors":["M Sullivan - 2025"],"snippet":"This dissertation introduces the research program of language modeling over logical forms: the employment of language models (LMs) that take as input semantic representations. The use of such models is motivated by the Accelerated Learning …","url":["https://search.proquest.com/openview/6b21607c924074afbcc7050879625f5f/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Language Models at the Syntax-Semantics Interface: A Case Study of the Long-Distance Binding of Chinese Reflexive Ziji","authors":["X Yang - Proceedings of the 31st International Conference on …, 2025"],"snippet":"This paper explores whether language models can effectively resolve the complex binding patterns of the Mandarin Chinese reflexive ziji, which are constrained by both syntactic and semantic factors. 
We construct a dataset of 320 synthetic …","url":["https://aclanthology.org/2025.coling-main.257.pdf"]} {"year":"2025","title":"Language Models Improve When Pretraining Data Matches Target Tasks","authors":["D Mizrahi, ABL Larsen, J Allardice, S Petryk… - arXiv preprint arXiv …, 2025"],"snippet":"… This data pool also processes CommonCrawl but with slightly different preprocessing choices and global fuzzy deduplication. Perhaps most importantly, NemotronCC includes 1.9T synthetic tokens created through model-based rephrasing4 [Maini et al.…","url":["https://arxiv.org/pdf/2507.12466"]} @@ -10339,13 +10481,16 @@ {"year":"2025","title":"Large Language Model System Design","authors":["J Ren, A Li - Silicon Valley Python Engineer Interview Guide: Data …, 2025"],"snippet":"… – Use publicly available text datasets such as Common Crawl, BooksCorpus, Wikipedia, Reddit conversations, and OpenWebText. – For pre-training, you aim to teach the model the structure of language, grammar, facts, reasoning, and some …","url":["https://link.springer.com/chapter/10.1007/978-981-96-3201-5_24"]} {"year":"2025","title":"Large language model trained on clinical oncology data predicts cancer progression","authors":["M Zhu, H Lin, J Jiang, AJ Jinia, J Jee, K Pichotta… - npj Digital Medicine, 2025"],"snippet":"Subspecialty knowledge barriers have limited the adoption of large language models (LLMs) in oncology. We introduce Woollie, an open-source, oncology-specific LLM trained on real-world data from Memorial Sloan Kettering Cancer Center (MSK) …","url":["https://www.nature.com/articles/s41746-025-01780-2"]} {"year":"2025","title":"Large Language Models as Search Engines: Societal Challenges","authors":["Z Sadeddine, W Maxwell, G Varoquaux, FM Suchanek"],"snippet":"Large Language Models (LLMs) may one day replace search engines as the primary portal to information on the Web. In this opinion paper, we investigate the societal challenges that such a change could bring. 
We focus on the roles of LLM …","url":["https://sadzac.github.io/files/llm_survey.pdf"]} +{"year":"2025","title":"Large Language Models Distinguishing One-Shot from Repeated Games: Understanding and Intervening in Risky Decision-Making","authors":["Z Lei, L Litong, W Xu, Q Huafeng, H Qianyu, L Aimei…"],"snippet":"Theoretical research on risky decision-making has primarily relied on reverse inference from behavioral outcomes and self-report data, lacking direct observation of the decision-making process, which constrains explanations of its underlying …","url":["https://chinarxiv.org/items/chinaxiv-202509.00060/"]} {"year":"2025","title":"Large Language Models for Arabic Sentiment Analysis and Machine Translation","authors":["M Zouidine, M Khalil - Engineering, Technology & Applied Science Research, 2025"],"snippet":"Large Language Models (LLMs) have recently demonstrated outstanding performance in a variety of Natural Language Processing (NLP) tasks. Although many LLMs have been developed, only a few models have been evaluated in the …","url":["https://etasr.com/index.php/ETASR/article/download/9584/4649"]} {"year":"2025","title":"Large Language Models for NLP: An In-depth Comparative Examination","authors":["Z Alomari, O Sharma, S Sawarn, Y Shao, A Makanju"],"snippet":"The discipline of Large Language Models (LLMs) is rapidly advancing, and it is essential to explore their capabilities and limitations for further development. 
This study conducts a comparative analysis of six prominent models: GPT-4, LLaMA 2 …","url":["https://www.researchgate.net/profile/Zakaria-Alomari/publication/390546301_Large_Language_Models_for_NLP_An_In-depth_Comparative_Examination/links/67f371e095231d5ba5b9a2a1/Large-Language-Models-for-NLP-An-In-depth-Comparative-Examination.pdf"]} {"year":"2025","title":"Large Language Models for Psychological Assessment: A Comprehensive Overview","authors":["J Brickman, M Gupta"],"snippet":"Large language models (LLMs) are extraordinary tools demonstrating potential to improve our understanding of psychological characteristics. They provide an unprecedented opportunity to supplement self-report in psychology research and …","url":["https://osf.io/qm9ae/download"]} {"year":"2025","title":"Large Language Models for Security Operations Centers: A Comprehensive Survey","authors":["A Habibzadeh, F Feyzi, RE Atani - arXiv preprint arXiv:2509.10858, 2025"],"snippet":"Large Language Models (LLMs) have emerged as powerful tools capable of understanding and generating human-like text, offering transformative potential across diverse domains. The Security Operations Center (SOC), responsible for …","url":["https://arxiv.org/pdf/2509.10858"]} +{"year":"2025","title":"Large Language Models for Software Development: Evaluating the Feasibility of Local Large Language Models for Code Generation","authors":["J Buscaglia Uchaneishvili - 2025"],"snippet":"… Typical sources include public web crawls (eg Common Crawl), online encyclopedias (Wikipedia), large e-book libraries, news articles, and sometimes code or social media text. Raw text is extensively cleaned before training. 
Cleaning …","url":["https://reposit.haw-hamburg.de/bitstream/20.500.12738/18438/1/BA_Large%20Language%20Models%20for%20Software%20Development%20Evaluating%20the%20Feasibility_geschw%C3%A4rzt.pdf"]} {"year":"2025","title":"Large language models for software vulnerability detection: a guide for researchers on models, methods, techniques, datasets, and metrics","authors":["SM Taghavi Far, F Feyzi - International Journal of Information Security, 2025"],"snippet":"Large language models (LLMs) have emerged as transformative tools in the domain of software vulnerability detection and management, offering sophisticated capabilities in identifying, analyzing, and mitigating security risks. This article delves …","url":["https://link.springer.com/article/10.1007/s10207-025-00992-7"]} {"year":"2025","title":"Large Language Models for Summarizing Czech Historical Documents and Beyond","authors":["V Tran, J Šmıd, J Martınek, L Lenc, P Král"],"snippet":"Text summarization is the task of shortening a larger body of text into a concise version while retaining its essential meaning and key information. While summarization has been significantly explored in English and other high-resource …","url":["https://www.scitepress.org/Papers/2025/133741/133741.pdf"]} {"year":"2025","title":"Large Language Models for Text Classification: What, Why, When, Where, and How","authors":["Z Wang, Y Lin, J Shen, X Zhu - 2025"],"snippet":"In an age where unstructured text data is growing rapidly, effective methods for text classification (TC) have become critical. 
Large Language Models (LLMs), such as the revolutionary GPT-4, have taken the lead in tackling this challenge, showing …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.174559786.61330197"]} +{"year":"2025","title":"Large Language Models for the Summarization of Czech Documents: From History to the Present","authors":["V Tran, J Šmíd, L Lenc, JP Salmon, P Král - arXiv preprint arXiv:2511.18848, 2025"],"snippet":"Text summarization is the task of automatically condensing longer texts into shorter, coherent summaries while preserving the original meaning and key information. Although this task has been extensively studied in English and other high-resource …","url":["https://arxiv.org/pdf/2511.18848"]} {"year":"2025","title":"Large Language Models in Crisis Informatics for Zero and Few-Shot Classification","authors":["C Sánchez, A Abeliuk, B Poblete - ACM Transactions on the Web, 2025"],"snippet":"This article presents an exploration of the use of pre-trained Large Language Models (LLMs) for crisis classification to address labeled data dependency issues. We present a methodology that enhances open LLMs through fine-tuning, creating …","url":["https://dl.acm.org/doi/pdf/10.1145/3736160"]} {"year":"2025","title":"Large language models in machine learning","authors":["GK Saha - 2024"],"snippet":"In recent years, large language models (LLMs) have revolutionized the field of machine learning (ML), demonstrating unprecedented capabilities in natural language processing (NLP) tasks. 
This review provides an in-depth analysis of the …","url":["https://www.indianjournals.com/ijor.aspx?target=ijor:ijaritac&volume=15&issue=1to3&article=003"]} {"year":"2025","title":"Large Language Models in the Justice Domain","authors":["G Contissa, G Sartor - Facilitating Judicial Cooperation in the EU, 2025"],"snippet":"… Data Protection: the majority of the training data for LLM s originates from texts taken from freely accessible internet sources, such as the Common Crawl dataset, which includes information from over 3 billion web pages. These datasets, obtained …","url":["https://brill.com/edcollchap-oa/book/9789004705791/BP000011.xml"]} @@ -10353,6 +10498,7 @@ {"year":"2025","title":"Large Language Models Transform Organic Synthesis From Reaction Prediction to Automation","authors":["KKL Tharwani, R Kumar, N Ahmed, Y Tang - arXiv preprint arXiv:2508.05427, 2025"],"snippet":"Large language models (LLMs) are beginning to reshape how chemists plan and run reactions in organic synthesis. Trained on millions of reported transformations, these text-based models can propose synthetic routes, forecast reaction outcomes …","url":["https://arxiv.org/pdf/2508.05427"]} {"year":"2025","title":"Large Language Models With Contrastive Decoding Algorithm for Hallucination Mitigation in Low‐Resource Languages","authors":["Z Hongying, A Javed, M Abdullah, J Rashid, M Faheem - CAAI Transactions on …, 2025"],"snippet":"… through human effort and web crawlers (ParaCrawl, Bitextor, Common Crawl and OpenNMT). ParaCrawl is a project that aims to build large… Common Crawl provides a large, open repository of web crawl data. 
OpenNMT provides a suite of …","url":["https://ietresearch.onlinelibrary.wiley.com/doi/pdf/10.1049/cit2.70004"]} {"year":"2025","title":"Large language models: an overview of foundational architectures, recent trends, and a new taxonomy","authors":["ID Mienye, N Jere, G Obaido, OO Ogunruku… - 2025"],"snippet":"… XLM-R [150] built upon this with robust pretraining on 2.5TB of CommonCrawl data across 100 languages, significantly outperforming previous models on cross-lingual benchmarks, such as XNLI and MLQA. Similarly, mT5 [151] extended the T5 …","url":["https://www.researchgate.net/profile/Ebenezer-Esenogho/publication/395194586_Large_language_models_an_overview_of_foundational_architectures_recent_trends_and_a_new_taxonomy/links/68b71d16360112563e0ff9d0/Large-language-models-an-overview-of-foundational-architectures-recent-trends-and-a-new-taxonomy.pdf"]} +{"year":"2025","title":"Large language models: applications, limitations, challenges, and recommendations in cybersecurity, digital forensics, and ethical hacking","authors":["JPA Yaacoub, HN Noura, O Salman, G Pujolle - Annals of Telecommunications, 2025"],"snippet":"Large Language Models (LLMs) are a significant leap in Artificial Intelligence (AI), providing tremendous capabilities for understanding and creating human-like language. LLMs offer significant advantages in automating and enhancing …","url":["https://link.springer.com/article/10.1007/s12243-025-01134-9"]} {"year":"2025","title":"Large Language Models: Creation, Optimisation, and Application","authors":["AAA Alsayed - The Palgrave Encyclopedia of Computer-Assisted …, 2025"],"snippet":"… One of the main sources of this data is text extracted from Internet websites, such as website crawling data from the Common Crawl repository. 
Many milestones of NLP research developments paved the way for the current generation of LLMs (Raiaan …","url":["https://link.springer.com/content/pdf/10.1007/978-3-031-51447-0_102-1.pdf"]} {"year":"2025","title":"Large Scale Cyber Security Log Classification Using Semi-Supervised Clustering","authors":["P Cai, M Lazarescu, ST Soh, R Ryan - 2025 IEEE International Conference on Cyber …, 2025"],"snippet":"In this paper we present a semi-supervised approach developed with the aim addressing the challenge of large-scale cyber security log entry classification that is faced by organizations that lack significant in-house expertise. Our approach is to …","url":["https://ieeexplore.ieee.org/abstract/document/11130139/"]} {"year":"2025","title":"Large-Scale AI in Telecom: Charting the Roadmap for Innovation, Scalability, and Enhanced Digital Experiences","authors":["A Shahid, A Kliks, A Al-Tahmeesschi, A Elbakary… - arXiv preprint arXiv …, 2025"],"snippet":"This white paper discusses the role of large-scale AI in the telecommunications industry, with a specific focus on the potential of generative AI to revolutionize network functions and user experiences, especially in the context of 6G systems. It …","url":["https://arxiv.org/pdf/2503.04184"]} @@ -10375,10 +10521,13 @@ {"year":"2025","title":"LegoAI: Auto-Scaling Large Model Training","authors":["SJ Purandare - 2025"],"snippet":"Training large AI models is computationally intensive. State-of-the-art language and vision models (LLMs and VLMs) often require thousands of GPUs and weeks or even months of training. 
As models scale to meet the demands of modern …","url":["https://search.proquest.com/openview/b7b895bdf28ee84926355613af1d6896/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Lemma Dilemma: On Lemma Generation Without Domain-or Language-Specific Training Data","authors":["O Toporkov, A Akbik, R Agerri - arXiv preprint arXiv:2510.07434, 2025"],"snippet":"Lemmatization is the task of transforming all words in a given text to their dictionary forms. While large language models (LLMs) have demonstrated their ability to achieve competitive results across a wide range of NLP tasks, there is no prior …","url":["https://arxiv.org/pdf/2510.07434"]} {"year":"2025","title":"Lemmatization of Polish Multi-word Expressions","authors":["M Król, A Smywiński-Pohl, Z Kaleta, P Lewkowicz - Proceedings of the 2025 …, 2025"],"snippet":"This paper explores the lemmatization of multi-word expressions (MWEs) and proper names in Polish–tasks complicated by linguistic irregularities and historical factors. Instead of using rule-based methods, we apply a machine learning …","url":["https://aclanthology.org/2025.emnlp-main.1126.pdf"]} +{"year":"2025","title":"Length-MAX Tokenizer for Language Models","authors":["D Dong, W Su - arXiv preprint arXiv:2511.20849, 2025"],"snippet":"We introduce a new tokenizer for language models that minimizes the average tokens per character, thereby reducing the number of tokens needed to represent text during training and to generate text during inference. Our method, which we …","url":["https://arxiv.org/pdf/2511.20849"]} {"year":"2025","title":"Less is More: Selective Reflection for Compatible and Efficient Knowledge Distillation in Large Language Models","authors":["L Liu, M Zhang - arXiv preprint arXiv:2508.06135, 2025"],"snippet":"Knowledge Distillation (KD) is a fundamental technique for compressing large language models (LLMs) into compact, efficient student models. 
However, existing white-box KD methods mainly focus on balancing ground truth and student-generated …","url":["https://arxiv.org/pdf/2508.06135"]} {"year":"2025","title":"Leveraging Contrastive Semantics and Language Adaptation for Robust Financial Text Classification Across Languages","authors":["L Zhang, Q Lin, F Meng, S Liang, J Lu, S Liu, K Chen… - Computers, 2025"],"snippet":"With the growing demand for multilingual financial information, cross-lingual financial sentiment recognition faces significant challenges, including semantic misalignment, ambiguous sentiment expression, and insufficient transferability. To …","url":["https://www.mdpi.com/2073-431X/14/8/338"]} {"year":"2025","title":"Leveraging Deep Learning Models and Social Media Data for Enhanced Situation Awareness in Disaster Management","authors":["AA Adesokan - 2024"],"snippet":"In recent years, social media has become a crucial source of real-time data for disaster management, supporting emergency responses when traditional channels like 911 are overcrowded and overwhelmed. It offers authorities valuable data for …","url":["https://search.proquest.com/openview/eb91a55acfe266de48cf81ab7bfd6059/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Leveraging GloVe Embeddings to Enhance Memory-Based Language Modeling for Commonsense Reasoning","authors":["J Armengol Tapiolas - 2025"],"snippet":"… This dataset is derived from educational web pages filtered from the larger Common Crawl dataset. Its content-rich and diverse nature provides a robust corpus for assessing language modeling capabilities. 
A sample of the first 100,000 lines of …","url":["https://studenttheses.uu.nl/bitstream/handle/20.500.12932/49844/Thesis_report.pdf?sequence=1"]} +{"year":"2025","title":"Leveraging Language Models for Document Type Classification in Low-Resource Afrikaans Archives","authors":["E Kotzé, BA Senekal, W Daelemans - Southern African Conference for Artificial …, 2025"],"snippet":"Document type classification is essential for effective information retrieval and management within archival systems, particularly in low-resource languages like Afrikaans. This study examines the feasibility of utilising multilingual transformer-based …","url":["https://link.springer.com/chapter/10.1007/978-3-032-11733-5_2"]} +{"year":"2025","title":"Leveraging large language models and ensemble techniques for real-time mental health monitoring on social media","authors":["R Mohamed, EMG Younis, AA Ali - Network Modeling Analysis in Health Informatics …, 2025"],"snippet":"… RoBERTa-base has the same architecture size as BERT-base (12 layers, 768 hidden units, 12 attention heads) but is trained longer on more diverse corpora, including Common Crawl. In our experiments, RoBERTa was fine-tuned on our …","url":["https://link.springer.com/article/10.1007/s13721-025-00608-z"]} {"year":"2025","title":"Leveraging Large Language Models for a Swahili Mathematics ITS in Tanzania: Designing Effective Prompts","authors":["EP Rutatola, K Stroeken, T Belpaeme - International Conference on Intelligent …, 2025"],"snippet":"The advancement of Large Language Models (LLMs) has significantly enhanced intelligent tutoring systems, enabling them to engage learners through natural dialogues. 
This interaction boosts learner engagement but presents challenges for …","url":["https://link.springer.com/chapter/10.1007/978-3-031-98281-1_1"]} {"year":"2025","title":"Leveraging large language models for detecting and preserving emotions in quran translations","authors":["A Almarzoqi, M Alsuhaibani - Journal of King Saud University Computer and …, 2025"],"snippet":"Over the years, translations of the Quran have served as a vital tool for conveying its message to non-Arabic speaking audiences. This research proposes a computational quantitative approach to investigate emotion preservation in English …","url":["https://link.springer.com/article/10.1007/s44443-025-00269-y"]} {"year":"2025","title":"Leveraging Large Language Models for Legal Document Understanding and Software System Analysis: Addressing Key Challenges","authors":["EQ Caballero - 2024"],"snippet":"In the rapidly advancing field of software development, ensuring compliance with legal regulations and policies has become increasingly critical. The intricate separation between legal expertise and software engineering creates challenges …","url":["https://search.proquest.com/openview/008ba9ac0834da09ebe204040efc11c9/1?pq-origsite=gscholar&cbl=18750&diss=y"]} @@ -10395,7 +10544,9 @@ {"year":"2025","title":"Leveraging XLM-RoBERTa with CNN and BiLSTM for Hinglish Toxicity Detection","authors":["N Singhal, A Yadav, A Ankush, R Kumar - Journal of Communications Software and …, 2025","N Singhal, A Yadav, GS Ankush, R Kumar"],"snippet":"… It was trained on 2.5 TB of data from CommonCrawl covering 100 languages, after which it bested mBERT on many CL benchmark evaluations (10). 
It eliminates the need for training independent embeddings for each language, as well as it …","url":["https://jcoms.fesb.unist.hr/pdfs/v21n4_2025-0133_Singhal.pdf"]} {"year":"2025","title":"Library Genesis to Llama 3: Navigating the Waters of Scientific Integrity, Ethics, and the Scholarly Record","authors":["L Ridenour, H Thach, SE Knudsen - Proceedings of the Association for Information …, 2025"],"snippet":"This work examines the intricate connections between Generative AI (GenAI), its training data, and the scholarly record through a data‐driven discourse analysis. Meta's Llama3 was trained using pirated data from the file‐sharing space Library …","url":["https://asistdl.onlinelibrary.wiley.com/doi/abs/10.1002/pra2.1340"]} {"year":"2025","title":"like 70 Follow Language Technologies Unit@ Barcelona Supercomputing Center 225","authors":["SAM Card"],"snippet":"… Common Crawl: Repository that holds website data and is run by the Common Crawl non-profit organization. It is updated monthly and is distributed under the CC0 1.0 … Web-sourced datasets with some preprocessing available under permissive …","url":["https://huggingface.co/BSC-LT/ALIA-40b/blob/7d733a67ef4ead89daf89e205080cfb642756f76/README.md"]} +{"year":"2025","title":"LIME: Making LLM Data More Efficient with Linguistic Metadata Embeddings","authors":["S Sztwiertnia, F Friedrich, K Kersting, P Schramowski… - arXiv preprint arXiv …, 2025"],"snippet":"Pre-training decoder-only language models relies on vast amounts of high-quality data, yet the availability of such data is increasingly reaching its limits. 
While metadata is commonly used to create and curate these datasets, its potential as a …","url":["https://arxiv.org/pdf/2512.07522"]} {"year":"2025","title":"Limited Generalizability in Argument Mining: State-Of-The-Art Models Learn Datasets, Not Arguments","authors":["M Feger, K Boland, S Dietze - arXiv preprint arXiv:2505.22137, 2025"],"snippet":"Identifying arguments is a necessary prerequisite for various tasks in automated discourse analysis, particularly within contexts such as political debates, online discussions, and scientific reasoning. In addition to theoretical advances in …","url":["https://arxiv.org/pdf/2505.22137"]} +{"year":"2025","title":"Linear socio-demographic representations emerge in Large Language Models from indirect cues","authors":["P Bouchaud, P Ramaciotti - arXiv preprint arXiv:2512.10065, 2025"],"snippet":"… to a 10-billion-token sample of deduplicated English Common Crawl [28], a dataset extensively used in training contemporary LLMs [7… While we cannot verify that the closed-source models analyzed in this study used this exact data, Common …","url":["https://arxiv.org/pdf/2512.10065"]} {"year":"2025","title":"LinguaSafe: A Comprehensive Multilingual Safety Benchmark for Large Language Models","authors":["Z Ning, T Gu, J Song, S Hong, L Li, H Liu, J Li, Y Wang… - arXiv preprint arXiv …, 2025"],"snippet":"The widespread adoption and increasing prominence of large language models (LLMs) in global technologies necessitate a rigorous focus on ensuring their safety across a diverse range of linguistic and cultural contexts. The lack of a comprehensive …","url":["https://arxiv.org/pdf/2508.12733"]} {"year":"2025","title":"Linguistic Entity Masking to Improve Cross-Lingual Representation of Multilingual Language Models for Low-Resource Languages","authors":["A Fernando, S Ranathunga - arXiv preprint arXiv:2501.05700, 2025"],"snippet":"… It is a collection of document-level data of 3 Trillion tokens from Common Crawl2 for 419 languages. 
As the dependent-monolingual data, we obtain the monolingual sides from the SiTa-Trilingual parallel dataset [44]. It is a human-curated gold …","url":["https://arxiv.org/pdf/2501.05700"]} {"year":"2025","title":"Linguistic Feature-based Depression Prediction Using Hierarchical Transformer-attentive Model for Mental Disabilities Diagnosis","authors":["FS Alhussen - Journal of Disability Research, 2025"],"snippet":"… For baseline comparisons, 300-dimensional GloVe embeddings trained on the Common Crawl corpus were also evaluated. In the case of GloVe, out-of-vocabulary tokens were replaced with a generic unknown token and initialized using random …","url":["https://www.scienceopen.com/hosted-document?doi=10.57197/JDR-2025-0578"]} @@ -10405,11 +10556,16 @@ {"year":"2025","title":"LLäMmlein: Transparent, Compact and Competitive German-Only Language Models from Scratch","authors":["J Pfister, J Wunderle, A Hotho - Proceedings of the 63rd Annual Meeting of the …, 2025"],"snippet":"We transparently create two German-only decoder models, LLäMmlein 120M and 1B, from scratch and publish them, along with the training data, for the (German) NLP research community to use. The model training involved several key steps, including …","url":["https://aclanthology.org/2025.acl-long.111.pdf"]} {"year":"2025","title":"Llama-GENBA-10B: A Trilingual Large Language Model for German, English and Bavarian","authors":["M Hoffmann, J John, S Schweter, G Ramakrishnan… - arXiv preprint arXiv …, 2025"],"snippet":"We present Llama-GENBA-10B, a trilingual foundation model addressing English-centric bias in large language models. 
Built on Llama 3.1-8B and scaled to 10B parameters, Llama-GENBA-10B is continuously pretrained on 164B tokens (82B English, 82B …","url":["https://arxiv.org/pdf/2509.05668"]} {"year":"2025","title":"LLM Based Sentiment Classification From Bangladesh E-Commerce Reviews","authors":["S Tabassum - arXiv preprint arXiv:2510.01276, 2025"],"snippet":"… It offers a balance between speed and accuracy, and like its larger version, it was trained on 2.5TB of Common Crawl material in 100 different languages. It performs better than mBERT on multilingual benchmarks like XNLI and MLQA, although being …","url":["https://arxiv.org/pdf/2510.01276"]} +{"year":"2025","title":"LLM Harms: A Taxonomy and Discussion","authors":["K Chen, S Afroogh, A Murali, D Atkinson, A Dhurandhar… - arXiv preprint arXiv …, 2025"],"snippet":"This study addresses categories of harm surrounding Large Language Models (LLMs) in the field of artificial intelligence. It addresses five categories of harms addressed before, during, and after development of AI applications: pre-development, direct …","url":["https://arxiv.org/pdf/2512.05929"]} {"year":"2025","title":"LLM in the Middle: A Systematic Review of Threats and Mitigations to Real-World LLM-based Systems","authors":["VHG Moia, IJ Sanz, GAF Rebello, RD de Meneses… - arXiv preprint arXiv …, 2025"],"snippet":"The success and wide adoption of generative AI (GenAI), particularly large language models (LLMs), has attracted the attention of cybercriminals seeking to abuse models, steal sensitive data, or disrupt services. Moreover, providing security to LLM-based …","url":["https://arxiv.org/pdf/2509.10682"]} +{"year":"2025","title":"LLM-as-a-judge for sarcasm detection using supervised fine-tuning of transformers","authors":["SV Oprea, A Bâra - Journal of King Saud University Computer and …, 2025"],"snippet":"This research conducts a systematic comparative study of large pre-trained language models (LLMs) for sarcasm and irony detection. 
While pretrained transformers often struggle to capture sarcastic intent, we fine-tune multiple domain-specific …","url":["https://link.springer.com/article/10.1007/s44443-025-00379-7"]} {"year":"2025","title":"llm-jp-modernbert: A ModernBERT Model Trained on a Large-Scale Japanese Corpus with Long Context Length","authors":["I Sugiura, K Nakayama, Y Oda - arXiv preprint arXiv:2504.15544, 2025"],"snippet":"Encoder-only transformer models like BERT are widely adopted as a pre-trained backbone for tasks like sentence classification and retrieval. However, pretraining of encoder models with large-scale corpora and long contexts has been relatively …","url":["https://arxiv.org/pdf/2504.15544"]} {"year":"2025","title":"LLM360 K2: Scaling Up 360-Open-Source Large Language Models","authors":["Z Liu, B Tan, H Wang, W Neiswanger, T Tao, H Li… - arXiv preprint arXiv …, 2025"],"snippet":"We detail the training of the LLM360 K2-65B model, scaling up our 360-degree OPEN SOURCE approach to the largest and most powerful models under project LLM360. While open-source LLMs continue to advance, the answer to \"How are the …","url":["https://arxiv.org/pdf/2501.07124"]} {"year":"2025","title":"LLMControl: Grounded Control of Text-to-Image Diffusion-based Synthesis with Multimodal LLMs","authors":["J Wang, R Chen, H Cui - arXiv preprint arXiv:2507.19939, 2025"],"snippet":"… We use a dataset of random 1 M image-text pairs with high scores in the Common Crawl Web index and adjust the resolution of the images to 512×512. Subsequently, we apply ODISE [20] to obtain the instance segmentation map. To obtain …","url":["https://arxiv.org/pdf/2507.19939"]} {"year":"2025","title":"LLMic: Romanian Foundation Language Model","authors":["VA Bădoiu, MV Dumitru, AM Gherghescu, A Agache… - arXiv preprint arXiv …, 2025"],"snippet":"… of tokens requires extensive filtering and cleaning of CommonCrawl’s petabyte-scale dataset. 
… We leverage two filtered CommonCrawl sources for Romanian language data: FuLG [3], … We further augment our dataset by incorporating filtered content …","url":["https://arxiv.org/pdf/2501.07721"]} +{"year":"2025","title":"LLMLagBench: Identifying Temporal Training Boundaries in Large Language Models","authors":["P Pęzik, K Kaczyński, M Szymańska, F Żarnecki… - arXiv preprint arXiv …, 2025"],"snippet":"… [10] introduced perplexity-based probing to operationalize and identify “effective cutoffs” in LLMs, revealing significant temporal misalignment due to deduplication failures and CommonCrawl contamination with outdated content. Their approach …","url":["https://arxiv.org/pdf/2511.12116"]} +{"year":"2025","title":"LLMs Against Digital Deviance: Scalable Hate Speech Detection in Low-Resource and Code-Mixed Social Media","authors":["MJ Alam, I Hossain, S Puppala, S Talukder"],"snippet":"In this paper we present a comprehensive study on hate speech detection in Bengali (Bangla), a low-resource language with significant online presence. We explore the potential of large language models (LLMs) such as GPT-4, Qwen, and DeepSeek in …","url":["https://web.ntpu.edu.tw/~myday/doc/ASONAM2025/ASONAM2025_Proceedings/pdf/papers/2130_127.pdf"]} +{"year":"2025","title":"LLMs Against Digital Deviance: Scalable Hate Speech Detection in Low-Resource","authors":["MJ Alam, I Hossain¹, S Puppala, S Talukder¹ - … in Social Networks Analysis and Mining …, 2025"],"snippet":"In this paper we present a comprehensive study on hate speech detection in Bengali (Bangla), a low-resource language with significant online presence. 
We explore the potential of large language models (LLMs) such as GPT-4, Qwen, and DeepSeek in …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=iZCiEQAAQBAJ&oi=fnd&pg=PA406&dq=commoncrawl&ots=roQQJG4NgY&sig=c-tbByJOl-N9V5hpLl8UaVtl7ec"]} {"year":"2025","title":"LLMs Are Globally Multilingual Yet Locally Monolingual: Exploring Knowledge Transfer via Language and Thought Theory","authors":["E Kang, J Kim - arXiv preprint arXiv:2505.24409, 2025"],"snippet":"… According to Common Crawl statistics, these languages exhibit varying levels of multilingual web content (ZH: 5.27%, KO: 0.76%, AR: 0.68%),2 reflecting a spectrum from relatively higher… 2commoncrawl.github.io/cc-crawl-statistics/ plots/languages …","url":["https://arxiv.org/pdf/2505.24409"]} {"year":"2025","title":"LLMs as Data Preprocessors for Natural Language Processing in Manufacturing","authors":["T Körner, MC May, MF Huber - 2025 IEEE 30th International Conference on …, 2025"],"snippet":"… LLMs, trained on extensive and diverse datasets—including large-scale, multilingual web corpora such as Common Crawl and domain-specific sources like arXiv—possess a broad and deep knowledge base. This enables them to effectively …","url":["https://ieeexplore.ieee.org/abstract/document/11205804/"]} {"year":"2025","title":"LLMs on support of privacy and security of mobile apps: state of the art and research directions","authors":["TTL Nguyen, B Carminati, E Ferrari - arXiv preprint arXiv:2506.11679, 2025"],"snippet":"Modern life has witnessed the explosion of mobile devices. However, besides the valuable features that bring convenience to end users, security and privacy risks still threaten users of mobile apps. 
The increasing sophistication of these threats in …","url":["https://arxiv.org/pdf/2506.11679"]} @@ -10436,7 +10592,9 @@ {"year":"2025","title":"Lost and Found: Computational Quality Assurance of Crowdsourced Knowledge on Morphological Defectivity in Wiktionary","authors":["J Sakunkoo, A Sakunkoo - ACL 2025 Student Research Workshop"],"snippet":"Morphological defectivity is an intriguing and understudied phenomenon in linguistics. Addressing defectivity, where expected inflectional forms are absent, is essential for improving the accuracy of NLP tools in morphologically rich languages …","url":["https://openreview.net/pdf?id=Kvf4uDPEYn"]} {"year":"2025","title":"Lost, but Preserved–A Web Archiving Perspective on the Ephemeral Web","authors":["S Alam, M Graham"],"snippet":"… It is worth noting that the last three years in Figure 2 seem to be rescued almost completely, but it is due to some data contamination as we have started ingesting CommonCrawl data from the recent years into the Wayback Machine, which …","url":["https://wadlworkshop.github.io/2025/papers/WADL2025_paper_7643.pdf"]} {"year":"2025","title":"Low-Resource Neural Machine Translation Using Recurrent Neural Networks and Transfer Learning: A Case Study on English-to-Igbo","authors":["OA Ekle, B Das - arXiv preprint arXiv:2504.17252, 2025"],"snippet":"In this study, we develop Neural Machine Translation (NMT) and Transformer-based transfer learning models for English-to-Igbo translation - a low-resource African language spoken by over 40 million people across Nigeria and West Africa. 
Our …","url":["https://arxiv.org/pdf/2504.17252"]} +{"year":"2025","title":"Low-Resource, High-Impact: Building Corpora for Inclusive Language Technologies","authors":["E Artemova, L Burchell, D Dementieva, S Okabe… - arXiv preprint arXiv …, 2025"],"snippet":"This tutorial (https://tum-nlp.github.io/low-resource-tutorial) is designed for NLP practitioners, researchers, and developers working with multilingual and low-resource languages who seek to create more equitable and socially impactful language …","url":["https://arxiv.org/pdf/2512.14576"]} {"year":"2025","title":"LumiViz: Automating Business Data Visualization with Generative AI","authors":["S Górtowski, E Lewańska - International Conference on Business Information …, 2025"],"snippet":"The paper presents LumiViz, a system leveraging Generative AI to automate business data visualization processes. The tool processes user queries in natural language, retrieves relevant data, generates visualizations, and provides business-oriented …","url":["https://link.springer.com/chapter/10.1007/978-3-031-94193-1_4"]} +{"year":"2025","title":"Luxical: High-Speed Lexical-Dense Text Embeddings","authors":["L Merrick, A Fang, A Carranza, A Deng, A Abbas… - arXiv preprint arXiv …, 2025"],"snippet":"Frontier language model quality increasingly hinges on our ability to organize web-scale text corpora for training. Today's dominant tools trade off speed and flexibility: lexical classifiers (eg, FastText) are fast but limited to producing classification output scores …","url":["https://arxiv.org/pdf/2512.09015"]} {"year":"2025","title":"Lyric-Based Passwords: Enhancing Security and Recall with AI","authors":["J Wise, MT Hoque - Cyber Security and Applications, 2025"],"snippet":"In the digital age, text-based passwords remain the cornerstone of user authentication. However, the balance between security and memorability remains a significant challenge. 
Users often face a dilemma between creating complex …","url":["https://www.sciencedirect.com/science/article/pii/S2772918425000256"]} {"year":"2025","title":"M+: Extending MemoryLLM with Scalable Long-Term Memory","authors":["Y Wang, D Krotov, Y Hu, Y Gao, W Zhou, J McAuley… - arXiv preprint arXiv …, 2025"],"snippet":"Equipping large language models (LLMs) with latent-space memory has attracted increasing attention as they can extend the context window of existing language models. However, retaining information from the distant past remains a challenge …","url":["https://arxiv.org/pdf/2502.00592"]} {"year":"2025","title":"Machine Learners Should Acknowledge the Legal Implications of Large Language Models as Personal Data","authors":["H Nolte, M Finck, K Meding - arXiv preprint arXiv:2503.01630, 2025"],"snippet":"Does GPT know you? The answer depends on your level of public recognition; however, if your information was available on a website, the answer is probably yes. All Large Language Models (LLMs) memorize training data to some extent. If an …","url":["https://arxiv.org/pdf/2503.01630"]} @@ -10446,14 +10604,17 @@ {"year":"2025","title":"Machine Learning has the Capability to Monitor the Advancement of Climate Technology Innovation Using Climate-Related Texts","authors":["B Meenal - 2025"],"snippet":"… such as materials from WIKIPEDIA published in 2015, a segment of the CCNEWS dataset discussed in work by Nagel in 2016, the OPEN-WEBTEXT corpus derived from web material linked via Reddit according to Gokaslan and Cohen in 2019, and …","url":["https://www.researchsquare.com/article/rs-5942954/latest.pdf"]} {"year":"2025","title":"Machine Learning Method Employed for the Objective of Identifying Text on Tweet Dataset","authors":["S Pandey - Demystifying Emerging Trends in Machine Learning, 2025"],"snippet":"When it comes to training ML systems, internet-based data is invaluable. 
Despite the difficulty in collecting this information, teams of experts from academic institutions and research labs have created publicly accessible databases. Twitter and other …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=7gBIEQAAQBAJ&oi=fnd&pg=PA81&dq=commoncrawl&ots=dMettUG_Mr&sig=pkGTCY4jN_0YXdk9kgGsYF7bURw"]} {"year":"2025","title":"MACHINE LEARNING SOFTWARE FRAMEWORK FOR PREDICTING AND OPTIMIZING WEB APPLICATION PERFORMANCE","authors":["PV Burchak, LM Oleshchenko - ВЧЕНІ ЗАПИСКИ, 2025"],"snippet":"The research has resulted in a scalable, automated, and interpretable software solution for predictive diagnostics and performance optimization of web applications. Proposed software system has also generated categorized web applications …","url":["http://www.tech.vernadskyjournals.in.ua/journals/2025/4_2025/part_2/4-2_2025.pdf#page=56"]} +{"year":"2025","title":"Machine Learning with Applications","authors":["MA Ullah"],"snippet":"… The model uses 300-dimensional fastText embeddings pretrained on Bangla Common Crawl data. The final sentiment prediction is made through a softmax classifier. Training was conducted over 25 epochs with a batch size of 64, using the …","url":["https://www.researchgate.net/profile/Engr-Mohammad-Aman-Ullah-2/publication/397422910_BCLSA_Advancing_Bangla_Sentiment_Analysis_with_Concept-Level_Reasoning_and_Efficiency/links/693142d306a9ab54f843c275/BCLSA-Advancing-Bangla-Sentiment-Analysis-with-Concept-Level-Reasoning-and-Efficiency.pdf"]} {"year":"2025","title":"Machine Learning-Based Phishing Websites Classification Using Diverse Datasets: An Empirical Analysis","authors":["S Haider, B Khan, W Khan, S Ullah, Z Ali - … of Blockchain, Internet of Everything, and …, 2025"],"snippet":"Recent technological developments make users vulnerable to several cyber-attacks, where phishing attacks compromise users’ sensitive information. To identify these attacks, there are different social techniques, which bring user awareness. 
However …","url":["https://www.igi-global.com/chapter/machine-learning-based-phishing-websites-classification-using-diverse-datasets/380167"]} {"year":"2025","title":"Machine Translation Model Optimization Based on Deep Learning","authors":["R Li - 2025 3rd International Conference on Integrated …, 2025"],"snippet":"Machine translation (MT) is widely used in cross-language communication as globalization accelerates. However, existing MT models, such as rule-based methods, still have issues, such as inadequate accuracy of translation. To address …","url":["https://ieeexplore.ieee.org/abstract/document/10967889/"]} {"year":"2025","title":"Machine-Learning Classification Model and Tools for Real-time URL Phishing Detection","authors":["R Saifan, H Ahmad, TA Edwan - 2025"],"snippet":"Phishing attacks are considered a significant cybersecurity concern, employing deceptive tactics to entice individuals into engaging with counterfeit websites. These malicious pages are skillfully designed replicas of legitimate platforms, aiming to …","url":["https://www.researchsquare.com/article/rs-7666636/latest.pdf"]} +{"year":"2025","title":"Made-in China, Thinking in America: US Values Persist in Chinese LLMs","authors":["D Haslett, LTL Huang, L Khalatbari, JH Hsiao, AB Chan - arXiv preprint arXiv …, 2025"],"snippet":"As large language models increasingly mediate access to information and facilitate decision-making, they are becoming instruments in soft power competitions between global actors such as the United States and China. So far, language models seem to …","url":["https://arxiv.org/pdf/2512.13723"]} {"year":"2025","title":"MAGA: MAssive Genre-Audience Reformulation to Pretraining Corpus Expansion","authors":["X Hao, K Shen, C Li - arXiv preprint arXiv:2502.04235, 2025"],"snippet":"Despite the remarkable capabilities of large language models across various tasks, their continued scaling faces a critical challenge: the scarcity of high-quality pretraining data. 
While model architectures continue to evolve, the natural language …","url":["https://arxiv.org/pdf/2502.04235"]} {"year":"2025","title":"MAGNET: Augmenting Generative Decoders with Representation Learning and Infilling Capabilities","authors":["S Khosla, K Kafle, S Jenni, H Zhao, J Collomosse, J Shi - arXiv preprint arXiv …, 2025"],"snippet":"While originally designed for unidirectional generative modeling, decoder-only large language models (LLMs) are increasingly being adapted for bidirectional modeling. However, unidirectional and bidirectional models are typically trained separately …","url":["https://arxiv.org/pdf/2501.08648"]} {"year":"2025","title":"MAiDE-up: Multilingual Deception Detection of AI-generated Hotel Reviews","authors":["O Ignat, X Xu, R Mihalcea","O Ignat, X Xu, R Mihalcea - Findings of the Association for Computational …, 2025"],"snippet":"Deceptive reviews are becoming increasingly common, especially given the increase in performance and the prevalence of LLMs. While work to date has addressed the development of models to differentiate between truthful and …","url":["https://aclanthology.org/2025.findings-naacl.88.pdf","https://aclanthology.org/anthology-files/pdf/naacl/2025.naacl-findings.88.pdf"]} {"year":"2025","title":"MAIN DIRECTIONS OF COMPUTATIONAL LINGUISTICS","authors":["D Alisherova - Journal of Multidisciplinary Sciences and Innovations, 2025"],"snippet":"Computational linguistics, an interdisciplinary field at the intersection of linguistics and computer science, focuses on developing algorithms and models to process and understand human language. 
This article explores the main directions of …","url":["https://inlibrary.uz/index.php/jmsi/article/view/89962"]} {"year":"2025","title":"Maintaining Academic Integrity in the Era of","authors":["AD Latief, R Fajri - Advanced AI and Prompt Engineering Techniques and …, 2025"],"snippet":"This chapter provides comprehensive guidance for academic researchers on effectively integrating Large Language Models (LLMs) in research workflows. Beginning with technical foundations and capabilities, it examines LLMs’ architecture …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=32WLEQAAQBAJ&oi=fnd&pg=PA121&dq=commoncrawl&ots=IFqEsOIDDz&sig=t4L2qKwnfuWMBfxqjRYhC4rEHrU"]} +{"year":"2025","title":"MALBO: Optimizing LLM-Based Multi-Agent Teams via Multi-Objective Bayesian Optimization","authors":["A Sabbatella - arXiv preprint arXiv:2511.11788, 2025"],"snippet":"The optimal assignment of Large Language Models (LLMs) to specialized roles in multi-agent systems is a significant challenge, defined by a vast combinatorial search space, expensive black-box evaluations, and an inherent trade-off between …","url":["https://arxiv.org/pdf/2511.11788"]} {"year":"2025","title":"MammoTab 25: A Large-Scale Dataset for Semantic Table Interpretation–Training, Testing, and Detecting Weaknesses","authors":["J D'Souza, M Palmonari"],"snippet":"The paper presents MammoTab 25, a new dataset comprising approximately 838930 Wikipedia tables extracted from over 63 million English Wikipedia pages and semantically annotated through Wikidata. 
Each table in MammoTab 25 is …","url":["https://sem-tab-challenge.github.io/2025/papers/paper_275.pdf"]} {"year":"2025","title":"Managing Output Risks From Imperfect LLMS","authors":["M Sanmugam, J Boldiston - Enhancing Learning Experiences With Digital Tools: AI …, 2025"],"snippet":"… URLs crawled are specifically selected by the Common Crawl society, and the Common Crawl data comprises 60% of ChatGPT 3’s entire … the Common Crawl dataset is not easily found or comprehensible. The pages and their content are …","url":["https://www.igi-global.com/chapter/managing-output-risks-from-imperfect-llms/372170"]} {"year":"2025","title":"Mangosteen: An Open Thai Corpus for Language Model Pretraining","authors":["W Phatthiyaphaibun, C Udomcharoenchaikit… - arXiv preprint arXiv …, 2025"],"snippet":"… We describe the setup of the data ablation studies on Common Crawl and FineWeb2 as follows: Common Crawl. We train a GPT-2 model on each of five dataset variations, with each containing 10 billion tokens. Each dataset variation …","url":["https://arxiv.org/pdf/2507.14664"]} @@ -10467,6 +10628,7 @@ {"year":"2025","title":"Matrix factorization techniques for Large Language Models","authors":["S Pandini"],"snippet":"In the last years the development of Large Language Models (LLMs) has revolutionized the field of natural language processing (NLP), enabling significant advancements in several contexts, such as text translation, code generation and …","url":["https://amslaurea.unibo.it/id/eprint/34224/1/Thesis_Simone_Pandini.pdf"]} {"year":"2025","title":"Matryoshka Model Learning for Improved Elastic Student Models","authors":["C Verma, AS Timmaraju, C Jui-Hsieh, S Damle, N Bui… - arXiv preprint arXiv …, 2025"],"snippet":"Industry-grade ML models are carefully designed to meet rapidly evolving serving constraints, which requires significant resources for model development. 
In this paper, we propose MatTA, a framework for training multiple accurate Student …","url":["https://arxiv.org/pdf/2505.23337"]} {"year":"2025","title":"MaXIFE: Multilingual and Cross-lingual Instruction Following Evaluation","authors":["Y Liu, Z Ma, X Jiang, J Hu, J Chang, L Li - arXiv preprint arXiv:2506.01776, 2025"],"snippet":"With the rapid adoption of large language models (LLMs) in natural language processing, the ability to follow instructions has emerged as a key metric for evaluating their practical utility. However, existing evaluation methods often focus on …","url":["https://arxiv.org/pdf/2506.01776"]} +{"year":"2025","title":"MCP vs RAG vs NLWeb vs HTML: A Comparison of the Effectiveness and Efficiency of Different Agent Interfaces to the Web (Technical Report)","authors":["A Steiner, R Peeters, C Bizer - arXiv preprint arXiv:2511.23281, 2025"],"snippet":"… These scenarios are found in the WebMall benchmark [8], which provides four locally hostable shops populated with 4,421 product offers drawn from the October 2024 Common Crawl via schema.org annotations. The shops span PC components …","url":["https://arxiv.org/pdf/2511.23281"]} {"year":"2025","title":"Mdsbots@ nlu of devanagari script languages 2025: Detection of language, hate speech, and targets using murtweet","authors":["P Ale, A Thapaliya, S Paudel - Proceedings of the First Workshop on Challenges in …, 2025"],"snippet":"In multilingual contexts, an automated system for accurate language identification, followed by hate speech detection and target identification, plays a critical role in processing low-resource hate speech data and mitigating its negative impact. This …","url":["https://aclanthology.org/2025.chipsal-1.35.pdf"]} {"year":"2025","title":"Meandering Interpretations","authors":["P Proff, A Lang, M Pfaff, M Dörk"],"snippet":"The interpretation of texts is a central practice in most academic disciplines. 
In contrast to leisure reading, academic reading is characterized by distinct methods and practices for text analysis, eg, close reading. In this regard, this work has …","url":["https://philippproff.eu/media/me/Masterarbeit.pdf"]} {"year":"2025","title":"Measuring Controversy in online discussions","authors":["I Andreev"],"snippet":"… 18], that are trained on Common Crawl and Wikipedia for German, which is highly beneficial for our analysis of posts on derStandard.at. These pretrained models provide a solid foundation for embedding the textual data, ensuring that both …","url":["https://recsys-lab.at/wp-content/uploads/2025/02/BA_Ivan_Andreev.pdf"]} @@ -10474,6 +10636,7 @@ {"year":"2025","title":"Measuring Fingerprints of Web-filtered Text Datasets and Fingerprint Propagation Through Training","authors":["Y Mansour, R Heckel - The Thirty-ninth Annual Conference on Neural …"],"snippet":"… Building on prior work demonstrating the existence of fingerprints or biases in popular computer vision datasets, we analyze popular open-source pretraining datasets for LLMs derived from CommonCrawl including C4, RefinedWeb, DolmaCC …","url":["https://openreview.net/forum?id=iKwHwCaddB"]} {"year":"2025","title":"Measuring memorization in language models via probabilistic extraction","authors":["J Hayes, M Swanberg, H Chaudhari, I Yona…"],"snippet":"… Since we know that Llama relied on Common Crawl data for training, we use 10,000 examples drawn from Common Crawl. It is, of course, possible that the examples we use were not contained in the OPT or Llama training datasets. 
We …","url":["https://www.researchgate.net/profile/A-Cooper-2/publication/389788662_Measuring_memorization_in_language_models_via_probabilistic_extraction/links/67d263c9d759700065087b7d/Measuring-memorization-in-language-models-via-probabilistic-extraction.pdf"]} {"year":"2025","title":"Measuring Risks to Users' Health Privacy Posed by Third-Party Web Tracking and Targeted Advertising","authors":["E Zeng, X Wu, EN Ertmann, L Huang, DF Johnson… - 2025"],"snippet":"Online advertising platforms may be able to infer privacy-sensitive information about people, such as their health conditions. This could lead to harms like exposure to predatory targeted advertising or unwanted disclosure of health conditions to …","url":["https://www.ericwzeng.com/papers/Zeng-CHI2025-HealthPrivacyAds.pdf"]} +{"year":"2025","title":"Measuring the Impact of Lexical Training Data Coverage on Hallucination Detection in Large Language Models","authors":["S Zhang, F Gotti, F Mo, JY Nie - arXiv preprint arXiv:2511.17946, 2025"],"snippet":"Hallucination in large language models (LLMs) is a fundamental challenge, particularly in open-domain question answering. Prior work attempts to detect hallucination with model-internal signals such as token-level entropy or generation …","url":["https://arxiv.org/pdf/2511.17946"]} {"year":"2025","title":"Measuring the Prevalence and Variety of Online Age Gates","authors":["TS Dhesi, N Apthorpe"],"snippet":"The legal landscape regarding age-based restrictions (age gates) for online services is rapidly changing. 
In order to comply with existing and proposed regulations, online services must determine whether users are older or younger than …","url":["https://www.ieee-security.org/TC/SPW2025/ConPro/papers/dhesi-conpro25.pdf"]} {"year":"2025","title":"Mechanistic Interpretability in the Presence of Architectural Obfuscation","authors":["M Florencio, T Barton - arXiv preprint arXiv:2506.18053, 2025"],"snippet":"… By contrast, GPT-3 was trained using a mixture of data from Common Crawl, WebText2, books, and Wikipedia [3]. For our case, the custom model was trained on a large corpus of text data called Fineweb-Edu, available on HuggingFace, which …","url":["https://arxiv.org/pdf/2506.18053"]} {"year":"2025","title":"Medical foundation large language models for comprehensive text analysis and beyond","authors":["Q Xie, Q Chen, A Chen, C Peng, Y Hu, F Lin, X Peng… - npj Digital Medicine, 2025"],"snippet":"Recent advancements in large language models (LLMs) show significant potential in medical applications but are hindered by limited specialized medical knowledge. We present Me-LLaMA, a family of open-source medical LLMs integrating extensive …","url":["https://www.nature.com/articles/s41746-025-01533-1"]} @@ -10503,23 +10666,29 @@ {"year":"2025","title":"Microdosing Psychedelics for Cognitive Enhancement: A Naturalistic Exploration of User Experiences","authors":["L Pelham - 2025"],"snippet":"Psychedelics have recently grown in scientific interest, among this, there have been promising findings regarding its potential to treat a range of health problems (such as depression, anxiety, PTSD). 
Despite the growing research interest in psychedelics …","url":["https://openaccess.wgtn.ac.nz/articles/thesis/Microdosing_Psychedelics_for_Cognitive_Enhancement_A_Naturalistic_Exploration_of_User_Experiences/30193615/1/files/58179229.pdf"]} {"year":"2025","title":"Microsoft Corporation: OpenAI and ChatGPT in the Workplace","authors":["V Chauhan, J Lomeli, P Hough, JS O'Rourke - 2025"],"snippet":"… The datasets used to pre-train GPT-1 were Common Crawl, a dataset of web pages with billions of words, and the BookCorpus, a collection of over 11,000 books covering numerous genres.7 Some of its limitations included producing repetitive text …","url":["https://sk.sagepub.com/cases/embed/microsoft-corporation-openai-and-chatgpt-in-the-workplace"]} {"year":"2025","title":"Mid-Training of Large Language Models: A Survey","authors":["K Mo, Y Shi, W Weng, Z Zhou, S Liu, H Zhang, A Zeng - arXiv preprint arXiv …, 2025"],"snippet":"Large language models (LLMs) are typically developed through large-scale pre-training followed by task-specific fine-tuning. Recent advances highlight the importance of an intermediate mid-training stage, where models undergo multiple annealing-style …","url":["https://arxiv.org/pdf/2510.06826"]} +{"year":"2025","title":"MIDUS: Memory-Infused Depth Up-Scaling","authors":["T Kim, H Byun, Y Choi, S Park, K Song - arXiv preprint arXiv:2512.13751, 2025"],"snippet":"Scaling large language models (LLMs) demands approaches that increase capacity without incurring excessive parameter growth or inference cost. 
Depth Up-Scaling (DUS) has emerged as a promising strategy by duplicating layers and applying Continual …","url":["https://arxiv.org/pdf/2512.13751"]} {"year":"2025","title":"MIGRATE: Cross-Lingual Adaptation of Domain-Specific LLMs through Code-Switching and Embedding Transfer","authors":["S Hong, S Lee, H Moon, HS Lim - Proceedings of the 31st International Conference …, 2025"],"snippet":"… 2023), which contains highquality mathematical texts sourced from Common Crawl. … Pre-trained FastText word vectors for 157 languages are trained on Common Crawl and Wikipedia using CBOW with position-weights. The embeddings …","url":["https://aclanthology.org/2025.coling-main.617.pdf"]} {"year":"2025","title":"Mind the Gap: Assessing Wiktionary's Crowd-Sourced Linguistic Knowledge on Morphological Gaps in Two Related Languages","authors":["J Sakunkoo, A Sakunkoo - arXiv preprint arXiv:2506.17603, 2025"],"snippet":"… Common Crawl (CC-100) (Wenzek et al., 2020): From CC-100, we use an 8.3GB dataset … Common Crawl (Raw Text Corpus) Tokenize with UDPipe Morphologically Tagged Corpus … Tube model is used to annotate text from the …","url":["https://arxiv.org/pdf/2506.17603"]} {"year":"2025","title":"Mind the Gap: Computational Quality Assurance of Crowd-Sourced Linguistic Knowledge on Latin and Italian Morphological Gaps","authors":["J Sakunkoo, A Sakunkoo - Society for Computation in Linguistics, 2025"],"snippet":"… This study uses Universal Dependencies (UD), Common Crawl, and Wiktionary in the computational validation of morphological gaps. 
Universal Dependencies is a collection of multilingual treebanks for syntactic and morphological analysis across …","url":["https://openpublishing.library.umass.edu/scil/article/id/3186/download/pdf/"]} {"year":"2025","title":"MiniCPM-V 4.5: Cooking Efficient MLLMs via Architecture, Data, and Training Recipe","authors":["T Yu, Z Wang, C Wang, F Huang, W Ma, Z He, T Cai… - arXiv preprint arXiv …, 2025"],"snippet":"Multimodal Large Language Models (MLLMs) are undergoing rapid progress and represent the frontier of AI development. However, their training and inference efficiency have emerged as a core bottleneck in making MLLMs more accessible …","url":["https://arxiv.org/pdf/2509.18154"]} {"year":"2025","title":"MiniCPM4: Ultra-Efficient LLMs on End Devices","authors":["M Team, C Xiao, Y Li, X Han, Y Bai, J Cai, H Chen… - arXiv preprint arXiv …, 2025"],"snippet":"This paper introduces MiniCPM4, a highly efficient large language model (LLM) designed explicitly for end-side devices. We achieve this efficiency through systematic innovation in four key dimensions: model architecture, training data …","url":["https://arxiv.org/pdf/2506.07900"]} +{"year":"2025","title":"MiniLingua: A Small Open-Source LLM for European Languages","authors":["A Aksenova, B Zverkov, N Dainese, A Nikitin… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models are powerful but often limited by high computational cost, privacy concerns, and English-centric training. Recent progress demonstrates that small, efficient models with around one billion parameters can deliver strong results …","url":["https://arxiv.org/pdf/2512.13298"]} {"year":"2025","title":"MiniLingua: Training a Multilingual Small Language Model","authors":["A Aksenova - 2025"],"snippet":"This thesis presents the development of a multilingual large language model (LLM) MiniLingua trained on European languages, with a focus on efficiency, linguistic diversity, and open access. 
The model contains approximately 1 billion, placing it in …","url":["https://aaltodoc.aalto.fi/bitstreams/4a22e1e6-5cd0-48fe-998b-3a7caa7254ca/download"]} {"year":"2025","title":"MiniMax-01: Scaling Foundation Models with Lightning Attention","authors":["A Li, B Gong, B Yang, B Shan, C Liu, C Zhu, C Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient …","url":["https://arxiv.org/pdf/2501.08313"]} {"year":"2025","title":"Mining Hidden Thoughts from Texts: Evaluating Continual Pretraining with Synthetic Data for LLM Reasoning","authors":["Y Ishibashi, T Yano, M Oyamada - arXiv preprint arXiv:2505.10182, 2025"],"snippet":"Large Language Models (LLMs) have demonstrated significant improvements in reasoning capabilities through supervised fine-tuning and reinforcement learning. However, when training reasoning models, these approaches are primarily …","url":["https://arxiv.org/pdf/2505.10182"]} {"year":"2025","title":"Minnesota Journal of Law, Science & Technolog y","authors":["L Commons - 2025"],"snippet":"The European Union's Artificial Intelligence Act (AI Act) represents a pioneering attempt to regulate AI technologies. However, this Paper argues that the Act's framework is inadequate for addressing the challenges posed by generative and …","url":["https://scholarship.law.umn.edu/cgi/viewcontent.cgi?article=1576&context=mjlst"]} +{"year":"2025","title":"MiroThinker: Pushing the Performance Boundaries of Open-Source Research Agents via Model, Context, and Interactive Scaling","authors":["MM Team, S Bai, L Bing, C Chen, G Chen, Y Chen… - arXiv preprint arXiv …, 2025"],"snippet":"We present MiroThinker v1.0, an open-source research agent designed to advance tool-augmented reasoning and information-seeking capabilities. 
Unlike previous agents that only scale up model size or context length, MiroThinker explores …","url":["https://arxiv.org/pdf/2511.11793"]} {"year":"2025","title":"MiST: Understanding the Role of Mid-Stage Scientific Training in Developing Chemical Reasoning Models","authors":["AM Bran, T Xie, S Pranesh, J Goumaz, XV Nguyen…"],"snippet":"Large Language Models (LLMs) can acquire emergent reasoning via online fine-tuning with simple rule-based rewards when tasks are already latent-solvable by the base model. We study chemical reasoning and identify two pre-requisites for RL-based …","url":["https://openreview.net/pdf?id=a42RSnbI6l"]} {"year":"2025","title":"Mitigating Bias LLM-Powered Employee Engagement Models: AI Ethics in Enterprise HR Systems","authors":["F Smith, M Chen - 2024"],"snippet":"… Large Language Models in review were developed on the backbone of publicly available datasets, including Common Crawl, Wikipedia, and OpenWebText. Proprietary HR datasets provided by industry collaborators added depth to the …","url":["https://www.researchgate.net/profile/Huma-Sarwar-5/publication/389520858_Mitigating_Bias_LLM-Powered_Employee_Engagement_Models_AI_Ethics_in_Enterprise_HR_Systems/links/67c68c2a207c0c20faa0416f/Mitigating-Bias-LLM-Powered-Employee-Engagement-Models-AI-Ethics-in-Enterprise-HR-Systems.pdf"]} +{"year":"2025","title":"Mitigating Catastrophic Forgetting in Target Language Adaptation of LLMs via Source-Shielded Updates","authors":["A Yamaguchi, T Morishita, A Villavicencio, N Aletras - arXiv preprint arXiv:2512.04844, 2025"],"snippet":"Expanding the linguistic diversity of instruct large language models (LLMs) is crucial for global accessibility but is often hindered by the reliance on costly specialized target language labeled data and catastrophic forgetting during adaptation. 
We …","url":["https://arxiv.org/pdf/2512.04844"]} {"year":"2025","title":"Mitigating Distribution Bias in Multimodal Datasets via Clustering-Based Curation","authors":["M El Aichouni, L Gomez, L Kang - Iberian Conference on Pattern Recognition and …, 2025"],"snippet":"… Large-scale multimodal datasets are commonly sourced from web crawls, such as CommonCrawl [15, 16], and are filtered using heuristic rules—eg, constraints on image resolution, limiting captions to English with a predefined vocabulary or clip …","url":["https://link.springer.com/chapter/10.1007/978-3-031-99565-1_35"]} +{"year":"2025","title":"Mitigating Intersectional Gender and Racial Bias in Sentiment Analysis: A T5-Based Data Augmentation Approach for English and Low-Resource Bengali","authors":["MS Islam - Journal of Digital Information Management, 2025"],"snippet":"… In this study, English FastText embeddings were sourced from large-scale Common Crawl and Wikipedia corpora (600B tokens). In contrast, Bengali FastText embeddings were trained on Indic Corp and OSCAR datasets with 5-character n-grams …","url":["https://www.dline.info/fpaper/jdim/v23i4/jdimv23i4_1.pdf"]} {"year":"2025","title":"Mixed-Initiative Conversational Intelligence in the Era of Large Pre-Trained Models","authors":["ML Chen - 2025"],"snippet":"With the rise of large pre-trained models, the idea of intelligent conversational agents has quickly gained attention in the public eye. 
Such conversational agents promise impressive capabilities in a multi-turn interaction setting, whether it be …","url":["https://search.proquest.com/openview/50c2dde2c8bfa6bf1423f209f6b3ee8f/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Mixtera: A Data Plane for Foundation Model Training","authors":["M Böther, X Yao, T Kerimoglu, A Klimovic - arXiv preprint arXiv:2502.19790, 2025"],"snippet":"… A mixture describes how the data is mixed based on their characteristics, ie, we might train on 50 % data from Common Crawl and 50 % from movie subtitles. The data can be combined based on multiple characteristics simultaneously. For instance …","url":["https://arxiv.org/pdf/2502.19790"]} {"year":"2025","title":"Mixture of Hidden-Dimensions: Not All Hidden-States' Dimensions are Needed in Transformer","authors":["Y Chen, J Shang, Z Zhang, J Sheng, T Liu, S Wang… - Forty-second International …"],"snippet":"Transformer models encounter inefficiency when scaling hidden dimensions due to the uniform expansion of parameters. When delving into the sparsity of hidden dimensions, we observe that only a small subset of dimensions are highly activated …","url":["https://openreview.net/pdf?id=H9CDAY3DPW"]} {"year":"2025","title":"Mixture of Neuron Experts","authors":["R Cheng, Y Guan, Y Ding, Q Hu, Y Wei, C Yuan… - arXiv preprint arXiv …, 2025"],"snippet":"In this work, we first explore whether the parameters activated by the MoE layer remain highly sparse at inference. We perform a sparsification study on several representative MoE models. 
For each expert, we rank parameters by the magnitude …","url":["https://arxiv.org/pdf/2510.05781"]} {"year":"2025","title":"MixtureVitae: Open Web-Scale Pretraining Dataset With High Quality Instruction and Reasoning Data Built from Permissive-First Text Sources","authors":["H Nguyen, V May, H Raj, M Nezhurina, Y Wang, Y Luo… - arXiv preprint arXiv …, 2025"],"snippet":"We present MixtureVitae, an open-access pretraining corpus built to minimize legal risk while providing strong model performance. MixtureVitae follows a risk-mitigated sourcing strategy that combines public-domain and permissively licensed text (eg …","url":["https://arxiv.org/pdf/2509.25531"]} +{"year":"2025","title":"MM-Telco: Benchmarks and Multimodal Large Language Models for Telecom Applications","authors":["GR Gupta, A Kumar, M Rai, A Chakraborty, A Modi… - arXiv preprint arXiv …, 2025"],"snippet":"Large Language Models (LLMs) have emerged as powerful tools for automating complex reasoning and decision-making tasks. In telecommunications, they hold the potential to transform network optimization, automate troubleshooting, enhance …","url":["https://arxiv.org/pdf/2511.13131"]} {"year":"2025","title":"Model and qualification","authors":["A Gramsci"],"snippet":"In developing Alexis de Tocqueville's observations, Marx identified civil society as the economic base and political society as the political superstructure. 2 Marx postulated the essentials of the base–superstructure concept in his preface to A …","url":["https://reference.org/facts/Base_and_superstructure/v2USoBjM"]} {"year":"2025","title":"Model-Agnostic Gender Bias Control for Text-to-Image Generation via Sparse Autoencoder","authors":["C Wu, Z Wang, K Xie, NK Devulapally, VS Lokhande… - arXiv preprint arXiv …, 2025"],"snippet":"Text-to-image (T2I) diffusion models often exhibit gender bias, particularly by generating stereotypical associations between professions and gendered subjects. 
This paper presents SAE Debias, a lightweight and model-agnostic framework for …","url":["https://arxiv.org/pdf/2507.20973"]} {"year":"2025","title":"Modeling False Memories with Conceptual Spaces","authors":["S Högborg Rosengren - 2025"],"snippet":"The most well researched associative memory illusion (AMI) is the Deese–Roediger–McDermott (DRM) paradigm: a list learning task found to induce false memories though study lists with associated words. Typically, the study lists are composed of the 12—15 …","url":["https://lup.lub.lu.se/luur/download?func=downloadFile&recordOId=9205342&fileOId=9205350"]} @@ -10540,6 +10709,7 @@ {"year":"2025","title":"MuCPT: Music-related Natural Language Model Continued Pretraining","authors":["K Tian, Y Mao, W Bi, H Wang, Q Wenhui - AI for Music Workshop"],"snippet":"… Book contributes roughly 332 million tokens, common crawl about 17.4 billion, instruction about 2.7 billion, paper about 1.3 billion, and … Figure 2 presents the source composition of the cleaned and normalized Matrix-music corpus: Common …","url":["https://openreview.net/pdf?id=9gL6FYoBNU"]} {"year":"2025","title":"Multi Domain Specific Sentiment Analysis for A Strategic Customer Queries and Feedback, from both Direct and Latent Sentiment by Semantic Associations: Survey.","authors":["MK Shruthishree, JV Gorabal - Journal of Computational Analysis & Applications, 2025"],"snippet":"… On the other hand, FEEL-IT is built upon UmBERTo2, which is based on the RoBERTa architecture and pre-trained on the Common Crawl Italian dataset. 
FEEL-IT focuses on classifying four emotional categories (excluding neutral), …","url":["https://search.ebscohost.com/login.aspx?direct=true&profile=ehost&scope=site&authtype=crawler&jrnl=15211398&AN=187183833&h=zK628bnW2fWFNtJSX9xrKIFMZrWP9C9rEnYKqR5nbkZ4bx28YR4Ib63f6%2BGlc5yG7bFGb0khmaP4Vsz6S35wyw%3D%3D&crl=c"]} {"year":"2025","title":"Multi-Agent Multimodal Models for Multicultural Text to Image Generation","authors":["P Bhalerao, M Yalamarty, B Trinh, O Ignat - arXiv preprint arXiv:2502.15972, 2025"],"snippet":"Large Language Models (LLMs) demonstrate impressive performance across various multimodal tasks. However, their effectiveness in cross-cultural contexts remains limited due to the predominantly Western-centric nature of existing data and …","url":["https://arxiv.org/pdf/2502.15972"]} +{"year":"2025","title":"Multi-Head Low-Rank Attention","authors":["S Liu, H Peng, Z Zhang, Z Chen, Y Guo - NeurIPS 2025 Workshop on Efficient Reasoning"],"snippet":"Long-context inference in large language models is bottlenecked by Key-Value (KV) cache loading during the decoding stage, where the sequential nature of generation requires repeatedly transferring the KV cache from off-chip to on-chip memory at …","url":["https://openreview.net/pdf?id=lD13399d8J"]} {"year":"2025","title":"Multi-Label Classification of Indonesian Voice Phishing Conversations: A Comparative Study of XLM-RoBERTa and ELECTRA","authors":["A Hidayat, S Madenda, H Hustinawaty - Journal of Applied Data Sciences, 2025"],"snippet":"Mobile phones have become a primary means of communication, yet their advancement has also been exploited by cybercriminals, particularly through voice phishing schemes. 
Voice phishing is a form of social engineering fraud carried out …","url":["https://bright-journal.org/Journal/index.php/JADS/article/download/858/487"]} {"year":"2025","title":"Multi-lingual Functional Evaluation for Large Language Models","authors":["V Ojewale, ID Raji, S Venkatasubramanian - arXiv preprint arXiv:2506.20793, 2025"],"snippet":"… CommonCrawl (see 3) – these languages were thus selected to cover a spectrum of high-resource and lower-resource contexts. As with prior work (Montariol et al., … Table 3 classifies the languages used in our evaluation based on their relative …","url":["https://arxiv.org/pdf/2506.20793"]} {"year":"2025","title":"Multi-Modal Framing Analysis of News","authors":["A Arora, S Yadav, M Antoniak, S Belongie, I Augenstein - arXiv preprint arXiv …, 2025"],"snippet":"… We query the publicly available Common Crawl archives4 for the corresponding publishers and extracted each article’s text, headline, publication date, image_urls and other metadata in JSON format. Post scraping, we filter extremely short and long …","url":["https://arxiv.org/pdf/2503.20960"]} @@ -10575,6 +10745,7 @@ {"year":"2025","title":"Multimodal Transformer Training in Personalized Federated Learning","authors":["X Cao, G Sun, Z Li, H Yu - Proceedings of the Second International Conference …"],"snippet":"… These are generated from Common Crawl, extracting image sources with corresponding alt-text. 
Caption generation is tested using the MS COCO Caption [31] and Flickr30k [32] datasets, employing the COCO Karpathy split test set and …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=lnxGEQAAQBAJ&oi=fnd&pg=PA60&dq=commoncrawl&ots=fseAlU_eN3&sig=sgJUEInpbFkURXBS7kQvvczFO9s"]} {"year":"2025","title":"Multiple large language models versus clinical guidelines for postmenopausal osteoporosis: a comparative study of ChatGPT-3.5, ChatGPT-4.0, ChatGPT-4o, Google …","authors":["CR Lin, YJ Chen, PA Tsai, WY Hsieh, SHL Tsai, TS Fu… - Archives of Osteoporosis, 2025"],"snippet":"The study assesses the performance of AI models in evaluating postmenopausal osteoporosis. We found that ChatGPT-4o produced the most appropriate responses, highlighting the potential of AI to enhance clinical decision-making and improve …","url":["https://link.springer.com/article/10.1007/s11657-025-01587-4"]} {"year":"2025","title":"MuRating: A High Quality Data Selecting Approach to Multilingual Large Language Model Pretraining","authors":["Z Chen, P Guo, W Han, Y Zhang, B Liu, H Lin, F Liu… - arXiv preprint arXiv …, 2025"],"snippet":"Data quality is a critical driver of large language model performance, yet existing model-based selection methods focus almost exclusively on English. We introduce MuRating, a scalable framework that transfers high-quality English data-quality …","url":["https://arxiv.org/pdf/2507.01785"]} +{"year":"2025","title":"Nanbeige4-3B Technical Report: Exploring the Frontier of Small Language Models","authors":["C Yang, G Peng, J Zhu, R Le, R Feng, T Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"We present Nanbeige4-3B, a family of small-scale but high-performing language models. 
Pretrained on 23T high-quality tokens and finetuned on over 30 million diverse instructions, we extend the boundary of the scaling law for small language …","url":["https://arxiv.org/pdf/2512.06266"]} {"year":"2025","title":"Native Hybrid Attention for Efficient Sequence Modeling","authors":["J Du, J Hu, T Zhang, W Sun, Y Cheng - arXiv preprint arXiv:2510.07019, 2025"],"snippet":"Transformers excel at sequence modeling but face quadratic complexity, while linear attention offers improved efficiency but often compromises recall accuracy over long contexts. In this work, we introduce Native Hybrid Attention (NHA), a novel …","url":["https://arxiv.org/pdf/2510.07019"]} {"year":"2025","title":"Native OS-Integrated AI in a European OSS/Open-Core Operating System","authors":["X Pillet - 2025"],"snippet":"This document, produced with the assistance of ChatGPT o3 and GPT-5 Thinking, is released under the Apache 2.0 license. It is a voluntary defensive publication (prior art) and therefore enters the prior art upon release under the applicable patent …","url":["https://www.tdcommons.org/cgi/viewcontent.cgi?article=9843&context=dpubs_series"]} {"year":"2025","title":"Natural Fingerprints of Large Language Models","authors":["T Suzuki, R Ri, S Takase - arXiv preprint arXiv:2504.14871, 2025"],"snippet":"Large language models (LLMs) often exhibit biases -- systematic deviations from expected norms -- in their outputs. These range from overt issues, such as unfair responses, to subtler patterns that can reveal which model produced them. 
We …","url":["https://arxiv.org/pdf/2504.14871"]} @@ -10591,14 +10762,17 @@ {"year":"2025","title":"Navigating the linguistic landscape: Unveiling the future trajectory of natural language processing in an evolving digital era","authors":["J Snehi, I Kansal, A Kumar - High-Performance Automation Methods for …"],"snippet":"Comprehending unstructured text data from a variety of sources, including emails, social media, and digital periodicals, is becoming increasingly difficult in today’s digital environment due to its exponential expansion. This study investigates how …","url":["https://www.taylorfrancis.com/chapters/edit/10.1201/9781003643609-6/navigating-linguistic-landscape-jyoti-snehi-isha-kansal-ashish-kumar"]} {"year":"2025","title":"neDIOM: Dataset and Analysis of Nepali Idioms","authors":["R Pokharel, A Agrawal - Proceedings of the First Workshop on Challenges in …, 2025"],"snippet":"Idioms, integral to any language, convey nuanced meanings and cultural references. However, beyond English, few resources exist to support any meaningful exploration of this unique linguistic phenomenon. To facilitate such an inquiry in a …","url":["https://aclanthology.org/2025.chipsal-1.16.pdf"]} {"year":"2025","title":"Negative news posts are less prevalent and generate lower user engagement than non-negative news posts across six countries","authors":["S Talaga, D Batorski, M Wojcieszak - arXiv preprint arXiv:2507.19300, 2025"],"snippet":"Although news negativity is often studied, missing is comparative evidence on the prevalence of and engagement with negative political and non-political news posts on social media. 
We use 6,081,134 Facebook posts published between January 1 …","url":["https://arxiv.org/pdf/2507.19300"]} +{"year":"2025","title":"Nemotron 3 Nano: Open, Efficient Mixture-of-Experts Hybrid Mamba-Transformer Model for Agentic Reasoning","authors":["A Blakeman, A Grattafiori, A Basant, A Gupta, A Khattar… - arXiv preprint arXiv …, 2025"],"snippet":"… We first filtered out the code pages in Common Crawl based on a fast pattern matching code classifier for webpages. We then constructed our high-quality code pretraining corpus by applying a modified version of the Nemotron-CC-Math …","url":["https://arxiv.org/pdf/2512.20848"]} {"year":"2025","title":"Nemotron-CC-Math: A 133 Billion-Token-Scale High Quality Math Pretraining Dataset","authors":["R Karimi Mahabadi, S Satheesh, S Prabhumoye… - arXiv e-prints, 2025","RK Mahabadi, S Satheesh, S Prabhumoye, M Patwary…","RK Mahabadi, S Satheesh, S Prabhumoye, M Patwary… - arXiv preprint arXiv …, 2025"],"snippet":"… However, existing math-focused datasets built from Common Crawl suffer from degraded quality due to brittle extraction heuristics, lossy … In this work, we introduce Nemotron-CC-Math, a large-scale, high-quality mathematical corpus …","url":["https://arxiv.org/pdf/2508.15096","https://research.nvidia.com/labs/adlr/files/NVIDIA-Nemotron-CC-Math.pdf","https://ui.adsabs.harvard.edu/abs/2025arXiv250815096K/abstract"]} {"year":"2025","title":"Nemotron-CLIMB: Clustering-based Iterative Data Mixture Bootstrapping for Language Model Pre-training","authors":["S Diao, Y Yang, Y Fu, X Dong, D Su, M Kliegl, Z Chen… - The Thirty-ninth Annual …"],"snippet":"Pre-training datasets are typically collected from web content and lack inherent domain divisions. 
For instance, widely used datasets like Common Crawl do not include explicit domain labels, while manually curating labeled datasets such as …","url":["https://openreview.net/pdf?id=aBlqKPkc4a"]} {"year":"2025","title":"NEMOTRON-CROSSTHINK: Scaling Self-Learning beyond Math Reasoning","authors":["SN Akter, S Prabhumoye, M Novikov, S Han, Y Lin… - arXiv preprint arXiv …, 2025"],"snippet":"… We (a) curate QA pairs from from synthetic (Common Crawl) and open-source datasets, categorized into general-purpose reasoning (Dgpr) and mathematical reasoning (Dmr); (b) apply structured templates to convert data into multiple-choice (MCQ) …","url":["https://arxiv.org/pdf/2504.13941"]} {"year":"2025","title":"Nemotron-H: A Family of Accurate and Efficient Hybrid Mamba-Transformer Models","authors":["A Blakeman, A Basant, A Khattar, A Renduchintala… - arXiv preprint arXiv …, 2025"],"snippet":"… To achieve this, we start with technical pre-training documents from Common Crawl and leverage Nemotron-4-340B to generate dialogues, where a knowledgeable persona guides a less-experienced one (eg, an interaction between …","url":["https://arxiv.org/pdf/2504.03624"]} {"year":"2025","title":"NeoBERT: A Next-Generation BERT","authors":["LL Breton, Q Fournier, ME Mezouar, S Chandar - arXiv preprint arXiv:2502.19587, 2025"],"snippet":"… become small in comparison to modern web-scraped datasets built by filtering and deduplicating Common Crawl dumps. Following the same trend, we pre-trained NeoBERT on RefinedWeb (Penedo et al., 2023), a massive dataset containing 600B …","url":["https://arxiv.org/pdf/2502.19587"]} {"year":"2025","title":"NepaliGPT: A Generative Language Model for the Nepali Language","authors":["S Pudasaini, A Shakya, S Shrestha, S Bhatta, S Thapa… - arXiv preprint arXiv …, 2025"],"snippet":"After the release of ChatGPT, Large Language Models (LLMs) have gained huge popularity in recent days and thousands of variants of LLMs have been released. 
However, there is no generative language model for the Nepali language, due to …","url":["https://arxiv.org/pdf/2506.16399"]} +{"year":"2025","title":"NetPhish-Mix: A Multi-Modal Phishing Detection Method Utilizing URL Graphs and Page Screenshot Vision Transformer","authors":["A Mohammad, N Praveen, R Shreeshayana… - Engineering, Technology & …, 2026"],"snippet":"In order to avoid detection, modern phishing assaults use techniques such as dynamic HTML obfuscation and site mimicry. This study presents NetPhish-Mix, a powerful framework for detecting phishing attempts by combining the analysis of …","url":["https://etasr.com/index.php/ETASR/article/download/15759/6121"]} {"year":"2025","title":"Network information security protection method based on additive Gaussian noise and mutual information neural network in cloud computing background","authors":["Y Zhong, X Li - Egyptian Informatics Journal, 2025"],"snippet":"In the cloud computing environment, data security and privacy have received unprecedented attention, but current information security protection methods cannot simultaneously balance data utility and privacy protection effects. Therefore, a …","url":["https://www.sciencedirect.com/science/article/pii/S1110866525000660"]} {"year":"2025","title":"NEU-ESC: A Comprehensive Vietnamese dataset for Educational Sentiment analysis and topic Classification toward multitask learning","authors":["PQH Mai, QH Nguyen, PG Duong, HH Nguyen… - arXiv preprint arXiv …, 2025"],"snippet":"In the field of education, understanding students' opinions through their comments is crucial, especially in the Vietnamese language, where resources remain limited. Existing educational datasets often lack domain relevance and student slang. 
To …","url":["https://arxiv.org/pdf/2506.23524"]} +{"year":"2025","title":"NeuCLIRBench: A Modern Evaluation Collection for Monolingual, Cross-Language, and Multilingual Information Retrieval","authors":["D Lawrie, J Mayfield, E Yang, A Yates, S MacAvaney… - arXiv preprint arXiv …, 2025"],"snippet":"… , Persian, and Russian articles that appeared in CommonCrawl News, providing a challenging scenario for first-stage retrieval systems … , and Russian, drawn from CommonCrawl News.The documents were obtained by the CommonCrawl service …","url":["https://arxiv.org/pdf/2511.14758"]} {"year":"2025","title":"Neural AQG, Part 2: Transformers","authors":["M Flor - Automatic Question Generation, 2025"],"snippet":"… Unlike UniLM, which was trained on data from English Wikipedia and a corpus of 11K e-books, T5 was trained on a much larger collection of text data—the Common Crawl web archive. Specifically, T5 was trained in 5 different sizes, from T5-small …","url":["https://link.springer.com/chapter/10.1007/978-3-031-92072-1_7"]} {"year":"2025","title":"Neural dynamics of semantic control underlying generative storytelling","authors":["C Braun, N De Pisapia"],"snippet":"… The first was a CBOW model trained using fastText on a concatenation of the Common Crawl and Wikipedia105, while the second model was trained using the Skip-gram approach on Wikipedia106. 
The third model was also a Skip-gram model …","url":["https://search.proquest.com/openview/ae1e8aa0c085d66915efd2209ad26836/1?pq-origsite=gscholar&cbl=4669726"]} {"year":"2025","title":"Neural Text Embeddings in Psychological Research: A Guide With Examples in R","authors":["L Teitelbaum, A Simchon - 2025"],"snippet":"In this guide, we review neural embedding models and compare three methods of quantifying psychological constructs for use with embeddings: Distributed Dictionary Representation (DDR), Contextualized Construct Representation (CCR), and a …","url":["https://osf.io/j9g4a/download"]} @@ -10611,30 +10785,37 @@ {"year":"2025","title":"NExT-OMNI: Towards Any-to-Any Omnimodal Foundation Models with Discrete Flow Matching","authors":["R Luo, X Xia, L Wang, L Chen, R Shan, J Luo, M Yang… - arXiv preprint arXiv …, 2025"],"snippet":"Next-generation multimodal foundation models capable of any-to-any cross-modal generation and multi-turn interaction will serve as core components of artificial general intelligence systems, playing a pivotal role in human-machine interaction …","url":["https://arxiv.org/pdf/2510.13721"]} {"year":"2025","title":"Ngalawan Ujaran Sengit: hate speech detection in indonesian code-mixed social media data","authors":["EW Pamungkas, P Chiril - Language Resources and Evaluation, 2025"],"snippet":"… We used pre-trained FastText Indonesian word vectors, which have an embedding dimension of 300 and were trained on Wikipedia and Common Crawl Additionally, to leverage the benefits of multilingual word embeddings, we have also …","url":["https://link.springer.com/article/10.1007/s10579-025-09810-x"]} {"year":"2025","title":"NGU_Research at CheckThat! 2025: an LLM based hybrid fact-checking pipeline for numerical claims","authors":["MA Abdallah, RM Fekry, SR El-Beltagy - Faggioli et al, 2025"],"snippet":"In this work, we present a four-stage, retrieval-augmented LLM pipeline for fact-checking numerical claims. 
The pipeline rewrites each numerical claim into a focused question, fuses OpenAI dense vectors with BM25 to fetch evidence, answers in context with …","url":["https://ceur-ws.org/Vol-4038/paper_55.pdf"]} +{"year":"2025","title":"NLP Datasets for Idiom and Figurative Language Tasks","authors":["B Matheny, PM Nguyen, ML Nguyen, S Reynolds - arXiv preprint arXiv:2511.16345, 2025"],"snippet":"… downstream derivations of the Common Crawl 1 corpus. Many datasets have been derived from Common Crawl and shared for research. … , in this research OSCAR 2 [1]and C4 [2] filters of Common Crawl were used to match a compiled list …","url":["https://arxiv.org/pdf/2511.16345"]} {"year":"2025","title":"NLP modeling recommendations for restricted data availability in clinical settings","authors":["F Villena, F Bravo-Marquez, J Dunstan - BMC Medical Informatics and Decision …, 2025"],"snippet":"… A multilingual version of XLM-RoBERTa masked language model, pre-trained using a self-supervised technique on a corpus of 2.5 TB of filtered CommonCrawl raw text data containing one hundred languages [43]. This model is the broadest of …","url":["https://link.springer.com/article/10.1186/s12911-025-02948-2"]} {"year":"2025","title":"Noise-Adaptive Layerwise Learning Rates: Accelerating Geometry-Aware Optimization for Deep Neural Network Training","authors":["J Hao, X Gong, J Xu, Z Wang, M Liu - arXiv preprint arXiv:2510.14009, 2025"],"snippet":"Geometry-aware optimization algorithms, such as Muon, have achieved remarkable success in training deep neural networks (DNNs). 
These methods leverage the underlying geometry of DNNs by selecting appropriate norms for different layers and …","url":["https://arxiv.org/pdf/2510.14009"]} {"year":"2025","title":"NormLens: Massively Multicultural MLLM Reasoning with Fine-Grained Social Awareness","authors":["YR Fung, H Ji - First Workshop on Social Simulation with LLMs"],"snippet":"Multimodal large language models (MLLMs) have revolutionized many applications but still face challenges related to cultural bias and a lack of cultural commonsense knowledge crucial for guiding cross-culture communication and interactions. In …","url":["https://openreview.net/pdf?id=JDAsUSpRxn"]} {"year":"2025","title":"Not All Documents Are What You Need for Extracting Instruction Tuning Data","authors":["C Zhang, H Zhong, H Li, C Chai, J Hong, Y Deng… - arXiv preprint arXiv …, 2025"],"snippet":"… In reality, there is plenty of high-quality web corpus (eg, Common Crawl) which contains rich knowledge and can be leveraged as highquality instruction data. However, this wealth of knowledge is widely spread within the corpus. Recently, Yue et al. …","url":["https://arxiv.org/pdf/2505.12250"]} {"year":"2025","title":"Not All Models Suit Expert Offloading: On Local Routing Consistency of Mixture-of-Expert Models","authors":["J Liang, S Wang, M Tian, Y Li, D Tang, Z Wei - arXiv preprint arXiv:2505.16056, 2025"],"snippet":"Mixture-of-Experts (MoE) enables efficient scaling of large language models (LLMs) with sparsely activated experts during inference. To effectively deploy large MoE models on memory-constrained devices, many systems introduce *expert offloading …","url":["https://arxiv.org/pdf/2505.16056"]} +{"year":"2025","title":"NOTE: UNDER PEER REVIEW Not for dissemination without author permission.","authors":["TES Charlesworth, LK Werden, J van den Hoogen…"],"snippet":"Disentangling the cultural drivers of ecological degradation and recovery remains a central challenge for a regenerative future. 
Here, we use language to develop the first systematic record of global variation in nature attitudes and explore the …","url":["https://osf.io/download/vumza/"]} {"year":"2025","title":"Notifications 0 new","authors":["N Dev"],"snippet":"… They trained on 2 trillion tokens of English and Chinese text acquired by deduplicating the Common Crawl. [26] … Further pretrain with 500B tokens (6% DeepSeekMath Corpus, 4% AlgebraicStack, 10% arXiv, 20% GitHub code, 10 …","url":["https://admithel.com/employer/namsoo-dev/"]} {"year":"2025","title":"NusaAksara: A Multimodal and Multilingual Benchmark for Preserving Indonesian Indigenous Scripts","authors":["MF Adilazuarda, MI Wijanarko, L Susanto, K Nur'aini… - arXiv preprint arXiv …, 2025"],"snippet":"Indonesia is rich in languages and scripts. However, most NLP progress has been made using romanized text. In this paper, we present NusaAksara, a novel public benchmark for Indonesian languages that includes their original scripts. Our …","url":["https://arxiv.org/pdf/2502.18148"]} {"year":"2025","title":"NusaDialogue: Dialogue Summarization and Generation for Underrepresented and Extremely Low-Resource Languages","authors":["A Purwarianti, D Adhista, A Baptiso, M Mahfuzh… - Proceedings of the Second …, 2025"],"snippet":"Developing dialogue summarization for extremely low-resource languages is a challenging task. 
We introduce NusaDialogue, a dialogue summarization dataset for three underrepresented languages in the Malayo-Polynesian language family …","url":["https://aclanthology.org/2025.sealp-1.8.pdf"]} {"year":"2025","title":"NVIDIA Nemotron Nano 2: An Accurate and Efficient Hybrid Mamba-Transformer Reasoning Model","authors":["A Basant, A Khairnar, A Paithankar, A Khattar… - arXiv preprint arXiv …, 2025"],"snippet":"… Our curated Common Crawl-based multilingual data performed slightly better than the Fineweb2-based multilingual data, while the … The diverse pairs translated from English Common Crawl achieved the highest average score over the 8 …","url":["https://arxiv.org/pdf/2508.14444"]} {"year":"2025","title":"NVIDIA Nemotron Nano V2 VL","authors":["AS Deshmukh, K Chumachenko, T Rintamaki, M Le… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce Nemotron Nano V2 VL, the latest model of the Nemotron vision-language series designed for strong real-world document understanding, long video comprehension, and reasoning tasks. Nemotron Nano V2 VL delivers significant …","url":["https://arxiv.org/pdf/2511.03929"]} +{"year":"2025","title":"NVIDIA Nemotron Parse 1.1","authors":["K Chumachenko, AS Deshmukh, J Seppanen… - arXiv preprint arXiv …, 2025"],"snippet":"… Common Crawl Data We utilize a set of diverse data samples from Common Crawl which are annotated in plaintext format along with bounding boxes and semantic class labels by human experts. We further autolabel text inside images and …","url":["https://arxiv.org/pdf/2511.20478"]} {"year":"2025","title":"OASIS Uncovers: High-Quality T2I Models, Same Old Stereotypes","authors":["S Dehdashtian, G Sreekumar, VN Boddeti - arXiv preprint arXiv:2501.00962, 2025"],"snippet":"Images generated by text-to-image (T2I) models often exhibit visual biases and stereotypes of concepts such as culture and profession. 
Existing quantitative measures of stereotypes are based on statistical parity that does not align with the …","url":["https://arxiv.org/pdf/2501.00962"]} +{"year":"2025","title":"Of models and monocultures: epistemic risks of artificial intelligence in medical research","authors":["JI Alvarado-Sánchez - Colombian Journal of Anesthesiology, 2025"],"snippet":"Artificial intelligence (AI) is transforming medical research. From automating literature reviews to detecting patterns in large datasets, AI promises efficiency and access at unprecedented scale (1). Language models and writing assistants also …","url":["https://www.researchgate.net/profile/Jorge-Alvarado-Sanchez/publication/398054924_Of_models_and_monocultures_epistemic_risks_of_artificial_intelligence_in_medical_research/links/6928bce3acf4cf6385391a7d/Of-models-and-monocultures-epistemic-risks-of-artificial-intelligence-in-medical-research.pdf"]} {"year":"2025","title":"of thesis The Disruption of Due Diligence: How Generative AI is Trans","authors":["I Käyhkö - 2025"],"snippet":"This thesis explores the emerging role of generative artificial intelligence (GenAI) in transforming due diligence processes within mergers and acquisitions (M&A), with a particular focus on financial and operational due diligence conducted by large …","url":["https://aaltodoc.aalto.fi/server/api/core/bitstreams/fdf733e8-3f3d-4e4c-b3c4-b95f2733b5a7/content"]} {"year":"2025","title":"Oh, the humanity! 
A human-centric approach to social bias research in natural language processing","authors":["EL Ungless - 2025"],"snippet":"Much current research into social bias in Natural Language Processing (NLP) -- that is, the tendency for NLP technologies to reflect human biases such as sexism and homophobia in the relative probability of different outputs -- suffers from relying on a …","url":["https://era.ed.ac.uk/bitstream/handle/1842/44139/UnglessEL_2025.pdf?sequence=1&isAllowed=y"]} +{"year":"2025","title":"Olmo 3","authors":["T Olmo, A Ettinger, A Bertsch, B Kuehl, D Graham… - arXiv preprint arXiv …, 2025"],"snippet":"… Deduplication The web data we collect from CommonCrawl naturally contains an abundance of duplicated documents. This duplication arises from repeated crawls of the same website, near-copies of documents appearing across multiple web pages …","url":["https://arxiv.org/pdf/2512.13961"]} {"year":"2025","title":"OLMoASR: Open Models and Data for Training Robust Speech Recognition Models","authors":["H Ngo, M Deitke, M Bartelds, S Pratt, J Gardner… - arXiv preprint arXiv …, 2025"],"snippet":"Improvements in training data scale and quality have led to significant advances, yet its influence in speech recognition remains underexplored. In this paper, we present a large-scale dataset, OLMoASR-Pool, and series of models, OLMoASR, to study …","url":["https://arxiv.org/pdf/2508.20869"]} {"year":"2025","title":"Omnilingual ASR: Open-Source Multilingual Speech Recognition for 1600+ Languages","authors":["ASR Omnilingual, G Keren, A Kozhevnikov, Y Meng… - arXiv preprint arXiv …, 2025"],"snippet":"Automatic speech recognition (ASR) has advanced in high-resource languages, but most of the world's 7,000+ languages remain unsupported, leaving thousands of long-tail languages behind. 
Expanding ASR coverage has been costly and limited …","url":["https://arxiv.org/pdf/2511.09690"]} {"year":"2025","title":"On Multilingual Encoder Language Model Compression for Low-Resource Languages","authors":["D Gurgurov, M Gregor, J van Genabith, S Ostermann - arXiv preprint arXiv …, 2025"],"snippet":"In this paper, we combine two-step knowledge distillation, structured pruning, truncation, and vocabulary trimming for extremely compressing multilingual encoder-only language models for low-resource languages. Our novel approach systematically …","url":["https://arxiv.org/pdf/2505.16956"]} {"year":"2025","title":"On Regulating Downstream AI Developers","authors":["S Williams, J Schuett, M Anderljung - arXiv preprint arXiv:2503.11922, 2025"],"snippet":"Foundation models - models trained on broad data that can be adapted to a wide range of downstream tasks - can pose significant risks, ranging from intimate image abuse, cyberattacks, to bioterrorism. To reduce these risks, policymakers are starting …","url":["https://arxiv.org/pdf/2503.11922"]} {"year":"2025","title":"On resolving the out of vocabulary problem in DisCoCat-based quantum natural language processing","authors":["A Bhatuse, A Khandelwal, SS Udmale, MG Chandra… - Evolving Systems, 2025"],"snippet":"… In this work, we employ pre-trained embeddings for words from a FastText model trained on the CommonCrawl and Wikipedia corpora (Grave et al. 2018). 
These word embeddings are later passed to DCA to create the quantum state for each …","url":["https://link.springer.com/article/10.1007/s12530-025-09714-9"]} +{"year":"2025","title":"On the “bullshit” of the intelligent explosion and singularity discourse","authors":["ET Ugar - South African Journal of Philosophy, 2025"],"snippet":"Should artificial intelligence/machine ethicists be concerned with the singularity/ intelligent explosion (the view that superintelligent technologies will emerge in the future) and their potential harms, or should they channel their ethical ounce on …","url":["https://www.tandfonline.com/doi/abs/10.1080/02580136.2025.2536950"]} {"year":"2025","title":"On the Application of Fundamental Clustering Methods to Large Scale Cyber Security Log Classification","authors":["J Le, M Lazarescu, ST Soh, R Ryan, P Cai, Q Li - 2025 13th International Symposium …, 2025"],"snippet":"This paper presents the results from an investigation of using traditional clustering approaches to address the problem of large-scale cyber security log entry classification. We applied two approaches to a large-scale dataset., and we …","url":["https://ieeexplore.ieee.org/abstract/document/11012045/"]} {"year":"2025","title":"On the caveats of AI autophagy","authors":["X Xing, F Shi, J Huang, Y Wu, Y Nan, S Zhang, Y Fang… - Nature Machine Intelligence, 2025"],"snippet":"Generative artificial intelligence (AI) technologies and large models are producing realistic outputs across various domains, such as images, text, speech and music. Creating these advanced generative models requires significant resources …","url":["https://www.nature.com/articles/s42256-025-00984-1"]} {"year":"2025","title":"On the Effectiveness of Large Language Models in Automating Categorization of Scientific Texts","authors":["GK Shahi, O Hummel - arXiv preprint arXiv:2502.15745, 2025"],"snippet":"The rapid advancement of Large Language Models (LLMs) has led to a multitude of application opportunities. 
One traditional task for Information Retrieval systems is the summarization and classification of texts, both of which are important for supporting …","url":["https://arxiv.org/pdf/2502.15745"]} {"year":"2025","title":"On the effects of machine translation on offensive language detection","authors":["A Dmonte, S Satapara, R Alsudais, T Ranasinghe… - Social Network Analysis and …, 2024"],"snippet":"Abstract Machine translation (MT) is widely used to translate content on social media platforms aiming to improve accessibility. A great part of the content circulated on social media is user-generated and often contains non-standard spelling, hashtags …","url":["https://link.springer.com/article/10.1007/s13278-024-01398-4"]} {"year":"2025","title":"On the Expressiveness of Softmax Attention: A Recurrent Neural Network Perspective","authors":["G Mongaras, EC Larson - arXiv preprint arXiv:2507.23632, 2025"],"snippet":"… FineWeb is a cleaned and de-duplicated 5-trillion token dataset compiled from 96 different common crawl snapshots. Each tested model is about 300 million parameters and trained on a sequence length of 1024. Adding a gate or norm …","url":["https://arxiv.org/pdf/2507.23632"]} {"year":"2025","title":"On the Impact of Noise in Differentially Private Text Rewriting","authors":["S Meisenbacher, M Chevli, F Matthes - arXiv preprint arXiv:2501.19022, 2025"],"snippet":"… To create training samples for sentence infilling, we employ two datasets: English Wikipedia and Common Crawl. The exact dataset … Common Crawl (C4). We employ the Colossal Clean Crawled Corpus (C4) made available by Raffel et al. (2020) …","url":["https://arxiv.org/pdf/2501.19022"]} +{"year":"2025","title":"On the Optimality of Discrete Object Naming: a Kinship Case Study","authors":["P Le, M Lindeman, RG Alhama - arXiv preprint arXiv:2511.19120, 2025"],"snippet":"… Vietnamese: We use the VietVault corpus (Pham 2024), a dataset filtered and curated from Common Crawl dumps prior to 2023. 
The full corpus contains 80GB of raw Vietnamese text spanning multiple domains. For our analysis, we sample a 5GB …","url":["https://arxiv.org/pdf/2511.19120"]} {"year":"2025","title":"On The Origin of Cultural Biases in Language Models: From Pre-training Data to Linguistic Phenomena","authors":["T Naous, W Xu - arXiv preprint arXiv:2501.04662, 2025"],"snippet":"Language Models (LMs) have been shown to exhibit a strong preference towards entities associated with Western culture when operating in non-Western languages. In this paper, we aim to uncover the origins of entity-related cultural biases in LMs by …","url":["https://arxiv.org/pdf/2501.04662"]} {"year":"2025","title":"On the Path to Make Ukrainian a High-Resource Language","authors":["M Haltiuk, A Smywiński-Pohl - Proceedings of the Fourth Ukrainian Natural Language …, 2025"],"snippet":"… Unlike some largescale efforts that process raw Common Crawl data directly, we focus on … reuse similar web sources, such as Common Crawl. Duplicates may occur both as exact … from processing the same documents from Common Crawl …","url":["https://aclanthology.org/2025.unlp-1.14.pdf"]} {"year":"2025","title":"On the Varieties of Fractal Geometry of Word Embeddings","authors":["AN Kallakunta, W Zadrozny - 2025"],"snippet":"Prior research showed the instability of word embeddings. That is, the neighborhoods of word vectors differ depending on corpora and training methods. In this article we compute, using the correlation dimension algorithm, as well as a …","url":["https://journals.flvc.org/FLAIRS/article/download/138951/144043"]} @@ -10682,7 +10863,9 @@ {"year":"2025","title":"Paraphrase detection for Urdu language text using fine-tune BiLSTM framework","authors":["MA Aslam, K Khan, W Khan, SU Khan, A Albanyan… - Scientific Reports, 2025"],"snippet":"… In this study, we used Common Crawl pre-trained vectors trained on a large amount of web-based text (42 billion tokens, 1.9 million words, 50 d vectors). 
We downloaded the Common Crawl GloVe embeddings from the official GloVe website …","url":["https://www.nature.com/articles/s41598-025-93260-6"]} {"year":"2025","title":"ParaPO: Aligning Language Models to Reduce Verbatim Reproduction of Pre-training Data","authors":["T Chen, F Brahman, J Liu, N Mireshghallah, W Shi… - arXiv preprint arXiv …, 2025"],"snippet":"… 2020) dataset, a filtered subset of the Common Crawl1 dataset, specifically designed for inclusion in The Pile (Gao et al.… 2024) often use similar sources such as Common Crawl. … 1https://commoncrawl.org/ 2https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct …","url":["https://arxiv.org/pdf/2504.14452"]} {"year":"2025","title":"Partial Parameter Updates for Efficient Distributed Training","authors":["A Filippova, A Katharopoulos, D Grangier, R Collobert - arXiv preprint arXiv …, 2025"],"snippet":"We introduce a memoryand compute-efficient method for low-communication distributed training. Existing methods reduce communication by performing multiple local updates between infrequent global synchronizations. We demonstrate that …","url":["https://arxiv.org/pdf/2509.22418"]} +{"year":"2025","title":"Patent protection of biological genetic resources based on deep learning and artificial intelligence","authors":["Z Liu, L Liu - Scientific Reports, 2025"],"snippet":"With the rapid development of artificial intelligence (AI), deep learning has provided new ideas for the patent protection of biological genetic resources in the field of intellectual property. This paper aims to explore the application of deep learning …","url":["https://www.nature.com/articles/s41598-025-25051-y"]} {"year":"2025","title":"Patent, Still a Leading Indicator in Al Technology Innovation?","authors":["S Tang - Tech Transformation and AI Readiness: Pioneering …"],"snippet":"… Open repositories like Wikipedia or Common Crawl are a small portion. 
However, accessing, cleaning, standardizing, and processing relevant data can encounter numerous technical, legal, and financial barriers. A growing number of technology …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=eMxGEQAAQBAJ&oi=fnd&pg=PA87&dq=commoncrawl&ots=sWLdKIIEVA&sig=G0IHgzw3x7g0MhGvJIKqBMjnSM4"]} +{"year":"2025","title":"PCMind-2.1-Kaiyuan-2B Technical Report","authors":["K Luo, Z Sun, X Shi, S Chen, B Yu, Y Chen, C Dang… - arXiv preprint arXiv …, 2025"],"snippet":"The rapid advancement of Large Language Models (LLMs) has resulted in a significant knowledge gap between the open-source community and industry, primarily because the latter relies on closed-source, high-quality data and training …","url":["https://arxiv.org/pdf/2512.07612"]} {"year":"2025","title":"Peasant movements by country or region","authors":["V Campesina"],"snippet":"Several peasant movement in India arose during the colonial era, when economic policies by various British colonial administrations led to the decline of traditional handicraft industries. These policies lead to change of ownership in lands, land …","url":["https://reference.org/facts/Peasant_movement/u6NltEmc"]} {"year":"2025","title":"PEFT A2Z: Parameter-Efficient Fine-Tuning Survey for Large Language and Vision Models","authors":["NJ Prottasha, UR Chowdhury, S Mohanto, T Nuzhat… - arXiv preprint arXiv …, 2025"],"snippet":"Large models such as Large Language Models (LLMs) and Vision Language Models (VLMs) have transformed artificial intelligence, powering applications in natural language processing, computer vision, and multimodal learning. 
However …","url":["https://arxiv.org/pdf/2504.14117"]} {"year":"2025","title":"Performance Analysis of Rule-Based, CRF, BiLSTM-CRF, and BERT Models for Named Entity Recognition","authors":["A TOPRAK, FS TOPRAK"],"snippet":"This study compares the performance of four categories of Named Entity Recognition algorithms—rulebased methods, CRF, BiLSTM-CRF, and transformer-based BERT models—using the CoNLL-2003 dataset as a benchmark. The dataset …","url":["https://www.researchgate.net/profile/Ahmet-Toprak-3/publication/397336724_Performance_Analysis_of_Rule-Based_CRF_BiLSTM-CRF_and_BERT_Models_for_Named_Entity_Recognition/links/690c9a42c900be105cbfc565/Performance-Analysis-of-Rule-Based-CRF-BiLSTM-CRF-and-BERT-Models-for-Named-Entity-Recognition.pdf"]} @@ -10691,11 +10874,13 @@ {"year":"2025","title":"Performance evaluation of GPT-4o on South Korean national exams for building mechanical equipment maintenance","authors":["H Choi, J Lee, J Kim - Scientific Reports, 2025"],"snippet":"This study evaluates the applicability of large language models (LLMs) in mechanical equipment maintenance in buildings by assessing GPT-4o’s performance on two national certification exams in South Korea: Engineer Energy …","url":["https://www.nature.com/articles/s41598-025-16118-x"]} {"year":"2025","title":"Performance Evaluation of Text Summarization Models on SAMSUM Chat Data","authors":["J Bhatia, D Patel, J Patel, M Kumhar, U Chauhan… - Advances in Data-Driven …, 2025"],"snippet":"There is widespread dependence on messaging apps and automated chat-bots in various situations. After a long debates, individuals may need to review the discussion's main points. 
Different approaches for extractive and abstractive text …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=PjKKEQAAQBAJ&oi=fnd&pg=PA197&dq=commoncrawl&ots=7Bgnoy2_JX&sig=ZzyMaWSX8Zg_dlt3AyREkTCWHoY"]} {"year":"2025","title":"Performant Multilingual Modulated and Multiplexed Memory Distilled Model with Adaptive Activation Ensembles","authors":["S Dikshit, R Dixit, R Tiwari, P Jain - SN Computer Science, 2025"],"snippet":"… [23] introduces multilingual form of RoBERTa trained on a 2.5 TB clean Common Crawl large data corpus from 100 diverse linguistics. A … mT5 is trained on a Common Crawl data corpus containing 101 dialects. Code and network are freely …","url":["https://link.springer.com/article/10.1007/s42979-025-04146-3"]} +{"year":"2025","title":"Persian-Phi: Efficient Cross-Lingual Adaptation of Compact LLMs via Curriculum Learning","authors":["AM Akhlaghi, A Shabani, M Abdolmaleki… - arXiv preprint arXiv …, 2025"],"snippet":"The democratization of AI is currently hindered by the immense computational costs required to train Large Language Models (LLMs) for low-resource languages. This paper presents Persian-Phi, a 3.8B parameter model that challenges the …","url":["https://arxiv.org/pdf/2512.07454"]} {"year":"2025","title":"Person-Centric Annotations of LAION-400M: Auditing Bias and Its Transfer to Models","authors":["L Girrbach, S Alaniz, G Smith, T Darrell, Z Akata - arXiv preprint arXiv:2510.03721, 2025"],"snippet":"Vision-language models trained on large-scale multimodal datasets show strong demographic biases, but the role of training data in producing these biases remains unclear. 
A major barrier has been the lack of demographic annotations in web-scale …","url":["https://arxiv.org/pdf/2510.03721"]} {"year":"2025","title":"Personalised video summarisation using video-text multi-modal fusion","authors":["R Akhare, SK Shinde - International Journal of Computational Vision and …, 2025"],"snippet":"Video summarisation techniques have evolved in recent years, mostly focusing on visual material and ignoring user preferences. In this work, the topic of query-focused video summarisation is addressed. Long videos are given as input, and the goal is …","url":["https://www.inderscienceonline.com/doi/abs/10.1504/IJCVR.2025.146294"]} {"year":"2025","title":"Pfefferkorn and EmilIe K. Sunde","authors":["V Bartlett"],"snippet":"… LAION-5b, currently the largest image database, is a dataset of over 5 billion image-text pairs taken from an archive of scraped website data called Common Crawl (LAION 2022). These image-text pairs are extracted from website data; then, a …","url":["https://www.openhumanitiespress.org/books/download/Bartlett-Pfefferkorn-Sunde_2025_Decentring_Ethics.pdf"]} {"year":"2025","title":"Pharmacometrics in the Age of Large Language Models: A Vision of the Future","authors":["EM Tosca, L Aiello, A De Carlo, P Magni - Pharmaceutics, 2025"],"snippet":"Background: Open Access Perspective Pharmacometrics in the Age of Large Language Models: A Vision of the Future by Elena Maria Tosca , Ludovica Aiello , Alessandro De Carlo and Paolo Magni * Dipartimento di Ingegneria Industriale e dell’Informazione …","url":["https://www.mdpi.com/1999-4923/17/10/1274"]} {"year":"2025","title":"Phi-Ground Tech Report: Advancing Perception in GUI Grounding","authors":["M Zhang, Z Xu, J Zhu, Q Dai, K Qiu, Y Yang, C Luo… - arXiv preprint arXiv …, 2025"],"snippet":"… To acquire larger-scale data for better scaling up of training, we also obtained web pages from CommonCrawl [50] and rendered … Index and domain deduplication We utilized the CC-MAIN-2024-46 crawl from 
CommonCrawl. After a …","url":["https://arxiv.org/pdf/2507.23779"]} +{"year":"2025","title":"PhishFind: A Machine Learning-Based System for Real-Time Phishing Detection","authors":["DO Mendoza Vega, DO Diaz Mercado, EJE Cardenas - Proceedings of the 2025 8th …, 2025"],"snippet":"Phishing attacks continue to exploit user trust, posing significant risks to individuals and organizations through increasingly sophisticated tactics. Existing detection tools often lack real-time analysis or transparent explanations, leaving a gap in effective …","url":["https://dl.acm.org/doi/pdf/10.1145/3771678.3771683"]} {"year":"2025","title":"PhishHunter-XLD: An ensemble approach integrating machine learning and deep learning for phishing URL classification","authors":["T Doshi, V Patel, N Shah, D Swain, D Swain, B Acharya - Franklin Open, 2025"],"snippet":"Phishing continues to pose a significant cybersecurity threat by deceiving users into disclosing sensitive information through maliciously crafted URLs. Traditional detection methods, including blacklists and heuristic analyses, have proven …","url":["https://www.sciencedirect.com/science/article/pii/S2773186325001379"]} {"year":"2025","title":"Phishing Attack Detection Through Recursive Feature Elimination Via Cross Validation","authors":["S Masmoudi, HM Kammoun, M Charfeddine… - 2025 International Wireless …, 2025"],"snippet":"Rising phishing attacks pose serious cybersecurity threats due to their use of fraudulent links to collect confidential user information. 
In this paper, we evaluate the performance of various Machine Learning (ML) models, including Decision Trees …","url":["https://ieeexplore.ieee.org/abstract/document/11059706/"]} {"year":"2025","title":"Phishing Attack Detection Using Whale Optimization Algorithm-Based Feature Selection","authors":["MM Abualhaj, S Al-Khatib, A Alalousi, MO Hiari… - 2025 5th International …, 2025"],"snippet":"This study presents an optimized phishing attack detection model integrating the Whale Optimization Algorithm (WOA) for feature selection with XGBoost and Support Vector Machine (SVM) classifiers. The proposed approach enhances classification …","url":["https://ieeexplore.ieee.org/abstract/document/11132149/"]} @@ -10706,6 +10891,7 @@ {"year":"2025","title":"Phishing Prevention in the Digital Age: An AI/ML Perspective","authors":["A Kadam, H Khirid, A Gawande, RB Chandrayan - Intelligent Strategies for ICT …, 2025"],"snippet":"Phishing means cybercrime where attackers pose themselves as a known legitimate user to plifer sensitive information such as passwords and financial details through misleading emails or websites. Hence, this type of continuous threat persists despite …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=jEqCEQAAQBAJ&oi=fnd&pg=PA402&dq=commoncrawl&ots=fvEegLh4g4&sig=j2t4qXnkXBpHaUqxjqz8UO9QN_Q"]} {"year":"2025","title":"Phishing URL Detection via Machine Learning: A Comprehensive Survey","authors":["J Islam, C Patra, PK Mani, S Biswas, D Giri, T Maitra - Proceedings of International …"],"snippet":"… • Common Crawl [44]: Common Crawl provides a publicly accessible dataset that includes extensive metadata and content from benign websites collected through large-scale web crawling. 
This dataset is valuable for phishing detection research as …","url":["https://link.springer.com/content/pdf/10.1007/978-981-96-6348-4.pdf#page=158"]} {"year":"2025","title":"Phishing Webpage Detection using URL and HTML Graphs based on a Multimodal AutoEncoder Ensemble","authors":["윤준호, 최석훈, 김혜정, 부석준 - Journal of KIISE, 2025"],"snippet":"인터넷의 발전으로 인해 피싱 공격에 노출되는 사용자가 증가하고 있으며, 이를 예방 하기 위한 효과적인 탐지 방법이 필수적이다. 기존의 피싱 탐지 방법은 주로 URL의 문자 시퀀스를 분석하는 데 중점을 두었으나, 피싱 URL은 정상 URL과 유사한 패턴을 …","url":["https://www.dbpia.co.kr/Journal/articleDetail?nodeId=NODE12252209"]} +{"year":"2025","title":"Phishing Website Detection through Machine Learning Algorithms: A Comparative Analysis","authors":["O Piserchia"],"snippet":"… They experimented with a balanced datasets with 5,000 legitimate webpages drawn from Alexa and Common Crawl and 5,000 phishing webpages taken from phishing tanks and open tanks, with 48 features from each URL. Using the feature …","url":["https://www.irasspublisher.com/assets/articles/1765195522.pdf"]} {"year":"2025","title":"PhishKey: A Novel Centroid-Based Approach for Enhanced Phishing Detection Using Adaptive HTML Component Extraction","authors":["F Castaño, E Fidalgo, E Alegre, R Alaiz-Rodríguez… - arXiv preprint arXiv …, 2025"],"snippet":"Phishing attacks pose a significant cybersecurity threat, evolving rapidly to bypass detection mechanisms and exploit human vulnerabilities. This paper introduces PhishKey to address the challenges of adaptability, robustness, and efficiency …","url":["https://arxiv.org/pdf/2506.21106"]} {"year":"2025","title":"PhishSecure: Enhancing Web Safety","authors":["C Shravage, S Vairagar, P Metri, SC Jaygude… - Smart Trends in Computing and …"],"snippet":"This project presents an innovative phishing detection system that addresses the limitations of traditional methods by combining URL-based and content-based features to accurately identify fraudulent websites. 
Unlike conventional approaches …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=qx2LEQAAQBAJ&oi=fnd&pg=PA193&dq=commoncrawl&ots=9z1sEbfzJz&sig=89uZpmqYPNgtLsNbABXfAjstkeA"]} {"year":"2025","title":"PhreshPhish: A Real-World, High-Quality, Large-Scale Phishing Website Dataset and Benchmark","authors":["T Dalton, H Gowda, G Rao, S Pargi, AH Khodabakhshi… - arXiv preprint arXiv …, 2025"],"snippet":"… These user-sourced URLs provide a more realistic and representative sample of benign pages that users are likely to encounter on the web as opposed to those from other datasets such as Common Crawl [12]. The dataset was cleaned and curated to …","url":["https://arxiv.org/pdf/2507.10854"]} @@ -10716,12 +10902,16 @@ {"year":"2025","title":"Pirates of the RAG: Adaptively Attacking LLMs to Leak Knowledge Bases","authors":["C Di Maio, C Cosci, M Maggini, V Poggioni, S Melacci"],"snippet":"The growing ubiquity of Retrieval-Augmented Generation (RAG) systems in several realworld services triggers severe concerns about their security. A RAG system improves the generative capabilities of a Large Language Models (LLM) by a …","url":["https://arxiv.org/pdf/2412.18295"]} {"year":"2025","title":"PLaMo 2 Technical Report","authors":["P Networks, K Chubachi, Y Fujita, S Hemmi… - arXiv preprint arXiv …, 2025"],"snippet":"… 100B, extracting data deemed particularly relevant to coding from CommonCrawl data. This time, we implemented the following methods: … Removal of irrelevant data through filtering: Since parsing all HTML content from CommonCrawl data to …","url":["https://arxiv.org/pdf/2509.04897"]} {"year":"2025","title":"PLLuM: A Family of Polish Large Language Models","authors":["J Kocoń, M Piasecki, A Janz, T Ferdinan, Ł Radliński… - arXiv preprint arXiv …, 2025"],"snippet":"Large Language Models (LLMs) play a central role in modern artificial intelligence, yet their development has been primarily focused on English, resulting in limited support for other languages. 
We present PLLuM (Polish Large Language Model) …","url":["https://arxiv.org/pdf/2511.03823"]} +{"year":"2025","title":"PLMAS: Adaptive Sample Selection for Prompting LLMs in Knowledge-Based Visual Question Answering","authors":["J Li, Q Xu, L Zhou, F Zhang, R Huang - ACM Transactions on Multimedia Computing …, 2025"],"snippet":"… GPT-3, a decoder-only LLM with 175B parameters, is trained on diverse data sources including Common Crawl, web texts, books, and Wikipedia. However, PICa’s sample selection strategy is overly simplistic, relying solely on question-image …","url":["https://dl.acm.org/doi/pdf/10.1145/3777476"]} +{"year":"2025","title":"PLN PPM ISB at MentalRiskES 2024: Detection of Gambling Disorders and Type of Addiction","authors":["PP Molina, IS Bedmar - IberLEF (Working Notes). CEUR Workshop …, 2025"],"snippet":"The following paper describes the solution to the two tasks presented by MentalRiskES 2025 [1], a competition dedicated to the early detection of mental disorders. In this case, the tasks were focused on detecting problems related to …","url":["https://ceur-ws.org/Vol-4098/MENTALRISKES2025_paper6.pdf"]} +{"year":"2025","title":"PoETa v2: Toward More Robust Evaluation of Large Language Models in Portuguese","authors":["TS Almeida, R Nogueira, H Pedrini - arXiv preprint arXiv:2511.17808, 2025"],"snippet":"Large Language Models (LLMs) exhibit significant variations in performance across linguistic and cultural contexts, underscoring the need for systematic evaluation in diverse languages. 
In this work, we present the most extensive evaluation of LLMs …","url":["https://arxiv.org/pdf/2511.17808"]} {"year":"2025","title":"Pointwise Mutual Information as a Performance Gauge for Retrieval-Augmented Generation","authors":["T Liu, J Qi, P He, A Bisazza, M Sachan, R Cotterell"],"snippet":"Recent work suggests that large language models enhanced with retrieval-augmented generation are easily influenced by the order in which the retrieved documents are presented to the model when solving tasks such as question answering (QA) …","url":["https://aclanthology.org/anthology-files/pdf/naacl/2025.naacl-long.78.pdf"]} {"year":"2025","title":"POLISH LANGUAGE MODELS IN BUSINESS AND PUBLIC SECTOR: A STRATEGIC PERSPECTIVE","authors":["R ULATOWSKA"],"snippet":"Purpose: The aim of this article is to analyse the first two Polish language models (Large Language Models) from the point of view of strategic dimensions of implementing national (LLMs) in the business and public sector. The study outlines both the …","url":["https://managementpapers.polsl.pl/wp-content/uploads/2025/07/225-Ulatowska.pdf"]} {"year":"2025","title":"Political Competition and Tax Expenditures: A Machine Learning Approach to 121 Years of US Laws","authors":["J Rodríguez - 2025"],"snippet":"Tax expenditures (TEs)—exemptions, deductions, and credits—are a major source of tax complexity. What drives the enactment of TEs? Does political competition play a role? I examine these questions in the context of the US using a novel dataset of …","url":["https://jarodriguez849.github.io/website/JMP_Rodriguez.pdf"]} {"year":"2025","title":"Political Leaning and Politicalness Classification of Texts","authors":["M Volf, J Simko - arXiv preprint arXiv:2507.13913, 2025"],"snippet":"This paper addresses the challenge of automatically classifying text according to political leaning and politicalness using transformer models. 
We compose a comprehensive overview of existing datasets and models for these tasks, finding that …","url":["https://arxiv.org/pdf/2507.13913"]} {"year":"2025","title":"PolyPrompt: Automating Knowledge Extraction from Multilingual Language Models with Dynamic Prompt Generation","authors":["N Roll - arXiv preprint arXiv:2502.19756, 2025"],"snippet":"Large language models (LLMs) showcase increasingly impressive English benchmark scores, however their performance profiles remain inconsistent across multilingual settings. To address this gap, we introduce PolyPrompt, a novel …","url":["https://arxiv.org/pdf/2502.19756"]} {"year":"2025","title":"PolyTruth: Multilingual Disinformation Detection using Transformer-Based Language Models","authors":["Z Gouliev, J Waters, C Wang - arXiv preprint arXiv:2509.10737, 2025"],"snippet":"Disinformation spreads rapidly across linguistic boundaries, yet most AI models are still benchmarked only on English. We address this gap with a systematic comparison of five multilingual transformer models: mBERT, XLM, XLM-RoBERTa …","url":["https://arxiv.org/pdf/2509.10737"]} +{"year":"2025","title":"PortBERT: Navigating the Depths of Portuguese Language Models","authors":["R Scheible-Schmitt, H He, AB Mendes"],"snippet":"Transformer models dominate modern NLP, but efficient, language-specific models remain scarce. In Portuguese, most focus on scale or accuracy, often neglecting training and deployment efficiency. In the present work, we introduce PortBERT, a …","url":["https://acl-bg.org/proceedings/2025/GlobalNLP%202025/pdf/2025.globalnlp-1.8.pdf"]} {"year":"2025","title":"POS tagging of low-resource Pashto language: annotated corpus and BERT-based model","authors":["I Haq, Y Zhang, IA Qadri - Language Resources and Evaluation, 2025"],"snippet":"This paper presents the development of a comprehensive part-of-speech (POS) annotated corpus for the low-resource Pashto language, along with a deep learning model for automatic POS tagging. 
The corpus comprises approximately 700K words (30K …","url":["https://link.springer.com/article/10.1007/s10579-025-09834-3"]} {"year":"2025","title":"Position: Beyond Euclidean--Foundation Models Should Embrace Non-Euclidean Geometries","authors":["N He, J Liu, B Zhang, N Bui, A Maatouk, M Yang, I King… - arXiv preprint arXiv …, 2025"],"snippet":"In the era of foundation models and Large Language Models (LLMs), Euclidean space has been the de facto geometric setting for machine learning architectures. However, recent literature has demonstrated that this choice comes with …","url":["https://arxiv.org/pdf/2504.08896"]} {"year":"2025","title":"Position: Formal Mathematical Reasoning—A New Frontier in AI","authors":["K Yang, G Poesia, J He, W Li, KE Lauter, S Chaudhuri… - Forty-second International …"],"snippet":"… 2024) as the base math LLM, which was trained on high-quality mathematical documents retrieved from Common Crawl through a carefully engineered data pipeline that combined automatic filtering and manual annotation. …","url":["https://openreview.net/pdf?id=HuvAM5x2xG"]} @@ -10731,19 +10921,23 @@ {"year":"2025","title":"Position: When Incentives Backfire, Data Stops Being Human","authors":["S Santy, P Bhattacharya, MH Ribeiro, KR Allen, S Oh - Forty-second International …"],"snippet":"Progress in AI has relied on human-generated data, from annotator marketplaces to the wider Internet. However, the widespread use of large language models now threatens the quality and integrity of human-generated data on these very platforms …","url":["https://openreview.net/pdf?id=4UhTWPwVke"]} {"year":"2025","title":"Positional Fragility in LLMs: How Offset Effects Reshape Our Understanding of Memorization Risks","authors":["Y Xu, A Bosselut, I Schlag - arXiv preprint arXiv:2505.13171, 2025"],"snippet":"Large language models are known to memorize parts of their training data, posing risk of copyright violations. 
To systematically examine this risk, we pretrain language models (1B/3B/8B) from scratch on 83B tokens, mixing web-scale data with public …","url":["https://arxiv.org/pdf/2505.13171"]} {"year":"2025","title":"Post navigation","authors":["LM Campbell"],"snippet":"… LLMs and common crawl data sets are out there in the world now. The genie is very much out of the bottle and there’s not a great deal we can do to put it back, even if we wanted to. It’s also debatable what, if anything, content creators, organisations …","url":["https://lornamcampbell.org/page/2/"]} +{"year":"2025","title":"Poster: Did I Just Browse A Website Written by LLMs?","authors":["SS He, R Govindan, HV Madhyastha - Proceedings of the 2025 ACM Internet …, 2025"],"snippet":"… Common Crawl. To understand the historical trend, we analyzed 10,479 random sites across Common Crawl archives from 2020 to 2025 (284,523 pages). Overall, only 451 sites (4.30%) are detected as LLM(-dominant), much lower than the 9.84 …","url":["https://dl.acm.org/doi/abs/10.1145/3730567.3768603"]} {"year":"2025","title":"PP3D: An In-Browser Vision-Based Defense Against Web Behavior Manipulation Attacks","authors":["S King, I Ozen, K Subramani, S Senthivel, P Vadrevu… - arXiv preprint arXiv …, 2025"],"snippet":"… Using the crawler to emulate multiple devices and resolutions, we obtained 396,255 distinct screenshots from the popular websites and 386,180 distinct screenshots from the Common Crawl URLs. While some webpages sampled from …","url":["https://arxiv.org/pdf/2510.18465"]} {"year":"2025","title":"Practical Datasets for Analyzing LLM Corpora Derived from Common Crawl","authors":["N Hagar, J Bandy - Proceedings of the International AAAI Conference on …, 2025"],"snippet":"Large language models (LLMs) rely heavily on web-derived training datasets, yet understanding how filtering and curation decisions affect these datasets remains challenging. 
This paper presents two complementary datasets designed to enable …","url":["https://ojs.aaai.org/index.php/ICWSM/article/download/35948/38102"]} {"year":"2025","title":"Practical Necromancy for Beginners: A Short Incomplete Opinionated Introduction to Artificial Intelligence for Archaeology and History Students","authors":["S Graham - 2025"],"snippet":"I will probably get this wrong: but maybe it will be wrong in useful and interesting ways. This book isn’ta hymn of praise to artificial intelligence. It’s not even all that scholarly a book. This is the book that I wish I had handy that day in September of …","url":["https://commons.und.edu/cgi/viewcontent.cgi?article=1033&context=press-books"]} {"year":"2025","title":"PrahokBART: A Pre-trained Sequence-to-Sequence Model for Khmer Natural Language Generation","authors":["H Kaing, R Dabre, H Song, VH Tran, H Tanaka… - Proceedings of the 31st …, 2025"],"snippet":"… We also find that some Khmer texts, particularly from Common Crawl (CC), were tokenized with spaces as word delimiters. While we cannot trace the exact source, these texts likely originated from preprocessed corpora. Additionally, the functional …","url":["https://aclanthology.org/2025.coling-main.87.pdf"]} {"year":"2025","title":"Pre-trained BERT Model Retrieval: Inference-Based No-Learning Approach using k-Nearest Neighbour Algorithm","authors":["HL PHAM, R MIBAYASHI, T YAMAMOTO, MP KATO… - IEICE Transactions on …, 2025"],"snippet":"In this study, we propose a method to efficiently retrieve BERT pre-trained models that achieve good performance on a specific document classification task. 
In natural language processing problems, the common practice involves fine-tuning existing …","url":["https://www.jstage.jst.go.jp/article/transinf/advpub/0/advpub_2024DAT0003/_pdf"]} {"year":"2025","title":"Pre-trained language model for code-mixed text in Indonesian, Javanese, and English using transformer","authors":["AF Hidayatullah, RA Apong, DTC Lai, A Qazi - Social Network Analysis and Mining, 2025"],"snippet":"Pre-trained language models (PLMs) have become increasingly popular due to their ability to achieve state-of-the-art performance on various natural language processing tasks with less training data and time. However, they struggle when …","url":["https://link.springer.com/article/10.1007/s13278-025-01444-9"]} +{"year":"2025","title":"Pre-training large language models based on Transformer architecture for building industry application: A review","authors":["J Liang, Y Lin, Y Liu, Y Huang, H Wu - Building Simulation, 2025"],"snippet":"The construction field is known for its high professional requirements, complexity, and diversity. Over the past few decades, CO 2 emissions and energy consumption have increased significantly, making it necessary to improve the efficiency of …","url":["https://link.springer.com/article/10.1007/s12273-025-1324-9"]} {"year":"2025","title":"Pre-training under infinite compute","authors":["K Kim, S Kotha, P Liang, T Hashimoto - arXiv preprint arXiv:2509.14786, 2025"],"snippet":"Since compute grows much faster than web text available for language model pre-training, we ask how one should approach pre-training under fixed data and no compute constraints. 
We first show that existing data-constrained approaches of increasing …","url":["https://arxiv.org/pdf/2509.14786"]} {"year":"2025","title":"Predicting emerging trends: a machine learning approach to topic popularity on social media","authors":["Z Wu, Y Liao, C Luo, J Shi, Y Yang - PeerJ Computer Science, 2025"],"snippet":"In the dynamic realm of social media, various topics emerge daily, with some evolving into widespread trends. This study focuses on predicting whether a topic will gain popularity in the future, treating this challenge as a binary classification …","url":["https://peerj.com/articles/cs-3245/"]} {"year":"2025","title":"Predicting LLM Reasoning Performance with Small Proxy Model","authors":["W Koh, J Suk, S Han, SY Yun, J Shin - arXiv preprint arXiv:2509.21013, 2025"],"snippet":"Given the prohibitive cost of pre-training large language models, it is essential to leverage smaller proxy models to optimize datasets before scaling up. However, this approach becomes challenging for reasoning capabilities, which exhibit emergent …","url":["https://arxiv.org/pdf/2509.21013"]} +{"year":"2025","title":"Predicting the Formation of Induction Heads","authors":["T Aoyama, EG Wilcox, N Schneider - arXiv preprint arXiv:2511.16893, 2025"],"snippet":"Arguably, specialized attention heads dubbed induction heads (IHs) underlie the remarkable in-context learning (ICL) capabilities of modern language models (LMs); yet, a precise characterization of their formation remains unclear. In this study, we …","url":["https://arxiv.org/pdf/2511.16893"]} {"year":"2025","title":"Predictive Data Selection: The Data That Predicts Is the Data That Teaches","authors":["K Shum, Y Huang, H Zou, D Qi, Y Liao, X Chen, Q Liu… - arXiv preprint arXiv …, 2025"],"snippet":"Language model pretraining involves training on extensive corpora, where data quality plays a pivotal role. 
In this work, we aim to directly estimate the contribution of data during pretraining and select pretraining data in an efficient manner. Specifically …","url":["https://arxiv.org/pdf/2503.00808"]} {"year":"2025","title":"Preference Curriculum: LLMs Should Always Be Pretrained on Their Preferred Data","authors":["X Zhang, L Xu, F Duan, Y Zhou, S Wang, J Wang, X Cai - arXiv preprint arXiv …, 2025"],"snippet":"Current large language models (LLMs) generally utilize a consistent data distribution throughout the entire pretraining process. However, as the model's ability improves, it intuitively should be pretrained with differentiated data. To …","url":["https://arxiv.org/pdf/2501.13126"]} {"year":"2025","title":"Preprint: Did I Just Browse A Website Written by LLMs?","authors":["R Govindan, HV Madhyastha - arXiv e-prints, 2025","S He, R Govindan, HV Madhyastha - arXiv preprint arXiv:2507.13933, 2025"],"snippet":"… Common Crawl. To understand the historical trend, we analyzed 10,479 random sites from Common Crawl archives from 2020 to 2025 (284,523 pages). 
Overall, only 451 sites (4.30%) are detected as LLM(-dominant), much lower than the 9.84 …","url":["https://arxiv.org/pdf/2507.13933","https://ui.adsabs.harvard.edu/abs/2025arXiv250713933S/abstract"]} {"year":"2025","title":"Prepublication Draft","authors":["C Ohge, K Schuster, AI Honey"],"snippet":"… Whether trained on a highly curated photo collection or the billion web pages of the Common Crawl, large language models start by atomizing content in the archive and then compressing it into an engine that can produce new artefacts derived from …","url":["https://jonippolito.net/writing/ippolito_ai_as_compression_v2.1.pdf"]} +{"year":"2025","title":"Pretrained Word Vectors for Latin Philology","authors":["PJ Burns - Evolving Perspectives on Digital Classics, 2025"],"snippet":"Word vectors have, in recent years, become a staple of cutting-edge research in Latin philology and literary criticism, although disciplinary-specific reviews of this computational resource are limited. This chapter offers a brief explanation of word …","url":["https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.4324/9781003584155-12&type=chapterpdf"]} {"year":"2025","title":"Pretraining GPT-style models in Hungarian","authors":["K Szentmihályi, DM Nemeskey, AM Szekeres…"],"snippet":"… We compiled our web text corpus from all Common Crawl dumps until the end of 2023. We followed the procedure outlined in [25] with a … We compiled our web text corpus from all Common Crawl dumps until the end of 2023. 
We followed the …","url":["https://www.infocommunications.hu/documents/169298/4797540/InfocomJournal_2025_1_EA_1_vj.pdf"]} {"year":"2025","title":"Primus: A Pioneering Collection of Open-Source Datasets for Cybersecurity LLM Training","authors":["YC Yu, TH Chiang, CW Tsai, CM Huang, WK Tsao - arXiv preprint arXiv:2502.11191, 2025"],"snippet":"Large Language Models (LLMs) have shown remarkable advancements in specialized fields such as finance, law, and medicine. However, in cybersecurity, we have noticed a lack of open-source datasets, with a particular lack of high-quality …","url":["https://arxiv.org/pdf/2502.11191"]} {"year":"2025","title":"PRIMUS: A Pioneering Collection of Open-Source Datasets for","authors":["CLLM Training"],"snippet":"Large Language Models (LLMs) have shown remarkable advancements in specialized fields such as finance, law, and medicine. However, in cybersecurity, we have noticed a lack of open-source datasets, with a particular lack of high-quality …","url":["https://openreview.net/pdf?id=9XcOPyOZCa"]} @@ -10752,6 +10946,7 @@ {"year":"2025","title":"Privacy Ripple Effects from Adding or Removing Personal Information in Language Model Training","authors":["J Borkar, M Jagielski, K Lee, N Mireshghallah… - arXiv preprint arXiv …, 2025","P Prompt"],"snippet":"Due to the sensitive nature of personally identifiable information (PII), its owners may have the authority to control its inclusion or request its removal from large-language model (LLM) training. Beyond this, PII may be added or removed from training …","url":["https://arxiv.org/pdf/2502.15680","https://openreview.net/pdf?id=JrqOE14nwU"]} {"year":"2025","title":"Privacy-Preserving Transformers: SwiftKey's Differential Privacy Implementation","authors":["A Abouelenin, M Abdelrehim, R Fahim, A Hendy… - arXiv preprint arXiv …, 2025"],"snippet":"In this paper we train a transformer using differential privacy (DP) for language modeling in SwiftKey. 
We run multiple experiments to balance the trade-off between the model size, run-time speed and accuracy. We show that we get small and …","url":["https://arxiv.org/pdf/2505.05648"]} {"year":"2025","title":"Probabilistic Orthogonal Decay for Gradient Alignment Modulation in Large Language Model Pretraining","authors":["J Harrison, A Delta, R Green, C Simpson, A Scolto…"],"snippet":"… The pretraining dataset consisted of a 1.1 trillion token corpus drawn from publicly available Common Crawl and academic benchmarks, filtered for deduplication, language coverage, and quality through a tiered ranking system based on perplexity …","url":["https://www.researchgate.net/profile/Andrew-Scolto/publication/391909309_Probabilistic_Orthogonal_Decay_for_Gradient_Alignment_Modulation_in_Large_Language_Model_Pretraining/links/682d1c2ad1054b0207f03b76/Probabilistic-Orthogonal-Decay-for-Gradient-Alignment-Modulation-in-Large-Language-Model-Pretraining.pdf"]} +{"year":"2025","title":"Probing jet base emission of M87* with the 2021 Event Horizon Telescope observations","authors":["E Chavez, B Georgiev, TP Krichbaum, K Moriyama… - 2025"],"snippet":"We investigate the presence and spatial characteristics of the jet base emission in M87* at 230 GHz, enabled by the significantly enhanced (u, v) coverage in the 2021 Event Horizon Telescope (EHT) observations. The integration of the 12− m Kitt Peak …","url":["https://www.aanda.org/articles/aa/pdf/forth/aa57022-25.pdf"]} {"year":"2025","title":"Procedural history","authors":["J Kennedy's' Masterpiece'Ruling"],"snippet":"In 2012, same-sex couple Charlie Craig and David Mullins from Colorado made plans to be lawfully married in Massachusetts and return to Colorado to celebrate with their family and friends. 
At that time the state constitution prohibited same-sex …","url":["https://reference.org/facts/masterpiece_cakeshop_v_colorado_civil_rights_commission/WjnPg2AU"]} {"year":"2025","title":"Profiling and optimization of multi-card GPU machine learning jobs","authors":["M Lawenda, K Khloponin, K Samborski, Ł Szustak - arXiv preprint arXiv:2505.22905, 2025"],"snippet":"The effectiveness and efficiency of machine learning methodologies are crucial, especially with respect to the quality of results and computational cost. This paper discusses different model optimization techniques, providing a comprehensive …","url":["https://arxiv.org/pdf/2505.22905"]} {"year":"2025","title":"Progress in the Application of Artificial Intelligence in English Corpus Pattern Recognition","authors":["Y Song, G Shan - 2025 3rd International Conference on Data Science …, 2025"],"snippet":"… The model adopts a Transformer-based bidirectional encoding architecture, learns context representation based on the 1.2TB Common Crawl dataset in the pre-training phase, introduces a domain adaptation layer in the fine-tuning phase to handle …","url":["https://ieeexplore.ieee.org/abstract/document/11071015/"]} @@ -10764,20 +10959,24 @@ {"year":"2025","title":"PTUK-HULAT at AraGenEval Shared Task: Fine-tuning XLM-RoBERTa for AI-Generated Arabic News Detection","authors":["T Duridi, A Jaber, P Martínez - Proceedings of The Third Arabic Natural Language …, 2025"],"snippet":"The authenticity of digital content has become an increasingly critical challenge with the rapid adoption of generative AI tools, especially for low-resource languages such as Arabic. 
The language’s rich morphology and domain diversity further …","url":["https://aclanthology.org/2025.arabicnlp-sharedtasks.7.pdf"]} {"year":"2025","title":"Pula: Training Large Language Models for Setswana","authors":["N Brown, V Marivate, AI Lelapa"],"snippet":"In this work we present Pula, a suite of bilingual language models proficient in both Setswana and English. Leveraging recent advancements in data availability and efficient fine-tuning, Pula 8B and Pula 14B outperform GPT-4o and Gemini 1.5 Pro …","url":["https://aclanthology.org/anthology-files/pdf/naacl/2025.naacl-long.338.pdf"]} {"year":"2025","title":"Pushing the Boundaries of Large Language Models: Innovations and Limitations in NLP, Finance, and Mathematics","authors":["AMM Rahman - 2024"],"snippet":"Large Language Models (LLMs) have emerged as transformative tools across a spectrum of domains, yet their practical deployment reveals a blend of remarkable potential and notable limitations. This research explores innovative methodologies …","url":["https://search.proquest.com/openview/d6191bb7d579cc725f301cc20169d052/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2025","title":"PWNC: A Large-Scale Persian Corpus for Joint WSD and NER Using Semi-Supervised and Supervised Learning","authors":["A Keshtkar, SS Sadidpour, H Shirazi","SS Sadidpour, A Keshtkar, H Shirazi - Journal of AI and Data Mining, 2025"],"snippet":"Word Sense Disambiguation (WSD) is a longstanding challenge in natural language processing, particularly in morphologically rich and low-resource languages such as Persian. 
The inherent ambiguity of Persian named entities exacerbated by domain-specific …","url":["https://jad.shahroodut.ac.ir/article_3681_17ab2901eade86649b5970d8b2b4d8a6.pdf","https://jad.shahroodut.ac.ir/article_3681_22bf6a793359875c84fcb59972a56499.pdf"]} {"year":"2025","title":"Q-Adam-mini: Memory-Efficient 8-bit Quantized Optimizer for Large Language Model Training","authors":["Y Han, C Yang, C Chen, X Wang, R Sun - ES-FoMo III: 3rd Workshop on Efficient Systems for …"],"snippet":"We propose $\\textbf{Q-Adam-mini}$, a memory-efficient optimizer for Large Language Model (LLM) training that achieves $\\textbf{8$\\times$}$ reduction in GPU memory usage while maintaining performance parity with full-precision AdamW …","url":["https://openreview.net/pdf?id=sa3uVJLEsR"]} {"year":"2025","title":"Quality Beyond A Glance: Revealing Large Quality Differences Between Web-Crawled Parallel Corpora","authors":["R Van Noord, M Esplà-Gomis, M Chichirău… - Proceedings of the 31st …, 2025"],"snippet":"… CCAligned This corpus was created through URL-based document alignment on a collection of 68 Common Crawl Snapshots. Document … through sentence alignment on a collection of documents from 10 Common Crawl Snapshots. FastText …","url":["https://aclanthology.org/2025.coling-main.124.pdf"]} {"year":"2025","title":"Quality over Quantity: Boosting Data Efficiency Through Ensembled Multimodal Data Curation","authors":["J Xu, Y Song, D Wang, W Zhao, M Chen, K Chen, Q Li - arXiv preprint arXiv …, 2025"],"snippet":"In an era overwhelmed by vast amounts of data, the effective curation of web-crawl datasets is essential for optimizing model performance. 
This paper tackles the challenges associated with the unstructured and heterogeneous nature of such …","url":["https://arxiv.org/pdf/2502.08211"]} {"year":"2025","title":"Quality-aware Neural Machine Translation with Self-evaluation","authors":["J Cui, L Mu, Q Liu, H Xu - 2025"],"snippet":"The performance of neural machine translation relies on a large amount of data, but crawled sentence pairs are of different quality. The low-quality sentence pairs may provide helpful translation knowledge but also teach the model to generate low-quality …","url":["https://aclanthology.org/anthology-files/anthology-files/pdf/ccl/2025.ccl-1.87.pdf"]} {"year":"2025","title":"Quantifying, Understanding, and Improving Generalization in Deep Learning","authors":["Y Jiang - 2025"],"snippet":"Generalization is a defining challenge of modern machine learning. Classical theory explains small supervised models but struggles with the surprising behavior of over-parameterized neural networks and with other paradigms such as reinforcement learning and large-scale …","url":["https://kilthub.cmu.edu/ndownloader/files/57486691"]} +{"year":"2025","title":"Quantitative analysis of datasets and their usage patterns in language models","authors":["A Jachimczyk - Zagadnienia Informacji Naukowej-Studia Informacyjne, 2025"],"snippet":"Cel/Teza: Rosnąca popularność modeli językowych implikuje wzrost zainteresowania zestawami danych wykorzystywanymi do ich szkolenia. Analiza wpisuje się w nurt badań poświęconych tym zbiorom, koncentrując się na …","url":["http://ojs.sbp.pl/index.php/zin/article/view/1260"]} {"year":"2025","title":"Quantizing Large Language Models for Code Generation: A Differentiated Replication","authors":["A Giagnorio, A Mastropaolo, S Afrin, M Di Penta… - arXiv preprint arXiv …, 2025"],"snippet":"Large Language Models (LLMs) have shown an impressive capability in code generation and, specifically, to automatically implement requirements described in natural language. 
The LLM effectiveness generally increases with its size: The …","url":["https://arxiv.org/pdf/2503.07103"]} {"year":"2025","title":"Quantum leap in medical","authors":["S Chokkakula, S Chong, B Yang, H Jiang, J Yu, R Han… - 2025"],"snippet":"… These models were trained on massive datasets, with GPT-3 using 45 terabytes of text data from various sources including Common Crawl, WebText2, Books1, Books2, and Wikipedia (22). This vast and multifarious dataset permit the models to …","url":["https://www.researchgate.net/profile/Bing-Yang-40/publication/391241421_Quantum_leap_in_medical_mentorship_exploring_ChatGPT's_transition_from_textbooks_to_terabytes/links/681001eedf0e3f544f4d367d/Quantum-leap-in-medical-mentorship-exploring-ChatGPTs-transition-from-textbooks-to-terabytes.pdf"]} {"year":"2025","title":"Quantum-Enhanced Attention Mechanism in NLP: A Hybrid Classical-Quantum Approach","authors":["SM Tomal, AA Shafin, D Bhattacharjee, MD Amin… - arXiv preprint arXiv …, 2025"],"snippet":"Transformer-based models have achieved remarkable results in natural language processing (NLP) tasks such as text classification and machine translation. However, their computational complexity and resource demands pose challenges for …","url":["https://arxiv.org/pdf/2501.15630"]} +{"year":"2025","title":"QuCo-RAG: Quantifying Uncertainty from the Pre-training Corpus for Dynamic Retrieval-Augmented Generation","authors":["D Min, K Zhang, T Wu, L Cheng - arXiv preprint arXiv:2512.19134, 2025"],"snippet":"… Factual knowledge is largely drawn from common sources such as Common Crawl, Wikipedia, and curated web text, making frequency and cooccurrence statistics from one comprehensive corpus a reliable proxy for others. This property …","url":["https://arxiv.org/pdf/2512.19134"]} {"year":"2025","title":"Query Details","authors":["RK Prova, S Basak"],"snippet":"Cyberbullying has emerged as a significant concern in the modern world. 
In Bangladesh receiving hate comments and bullying on social media platforms, particularly on Facebook, has unfortunately become a common occurrence. As a low-resource …","url":["https://www.researchgate.net/profile/Sarnali-Basak-2/publication/394012309_Cyberbullying_Detection_in_Bangla_Facebook_Comments_Using_Pre-trained_Transformer_Models/links/68ae00147984e374aceb8322/Cyberbullying-Detection-in-Bangla-Facebook-Comments-Using-Pre-trained-Transformer-Models.pdf"]} {"year":"2025","title":"Query Smarter, Trust Better? Exploring Search Behaviours for Verifying News Accuracy","authors":["D Elsweiler, S Ateia, M Bink, G Donabauer, MF Pichel… - arXiv preprint arXiv …, 2025"],"snippet":"While it is often assumed that searching for information to evaluate misinformation will help identify false claims, recent work suggests that search behaviours can instead reinforce belief in misleading news, particularly when users generate …","url":["https://arxiv.org/pdf/2504.05146"]} {"year":"2025","title":"Query-guided expansion and contraction of document sets","authors":["A Deloose, B De Baets, J Verwaeren - Information Sciences, 2025"],"snippet":"Document set expansion is a fundamental problem in information retrieval, involving the augmentation of an initial document set with additional relevant documents from a larger corpus. Query reformulation techniques, such as query expansion and …","url":["https://www.sciencedirect.com/science/article/pii/S0020025525009156"]} {"year":"2025","title":"Quotegraph: A Social Network Extracted from Millions of News Quotations","authors":["M Čuljak, R West, A Spitz, A Arora - arXiv preprint arXiv:2507.17626, 2025"],"snippet":"We introduce Quotegraph, a novel large-scale social network derived from speaker-attributed quotations in English news articles published between 2008 and 2020. 
Quotegraph consists of 528 thousand unique nodes and 8.63 million directed edges, pointing …","url":["https://arxiv.org/pdf/2507.17626"]} {"year":"2025","title":"Qwen 2.5: A Comprehensive Review of the Leading Resource-Efficient LLM with potentioal to Surpass All Competitors","authors":["I Ahmed, S Islam, PP Datta, I Kabir, NUR Chowdhury…"],"snippet":"The purpose of the review is to provide a comprehensive analysis of Qwen 2.5, highlighting its advancements in AI models. Key findings indicate that Qwen 2.5 features significant improvements in dataset size (expanding from 7 trillion to 18 …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.174060306.65738406"]} {"year":"2025","title":"Qwen2. 5-1M Technical Report","authors":["A Yang, B Yu, C Li, D Liu, F Huang, H Huang, J Jiang… - 2025"],"snippet":"In this report, we introduce Qwen2. 5-1M, a series of models that extend the context length to 1 million tokens. Compared to the previous 128K version, the Qwen2. 5-1M series have significantly enhanced long-context capabilities through long-context …","url":["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/Qwen2_5_1M_Technical_Report.pdf"]} +{"year":"2025","title":"Qwen3-VL Technical Report","authors":["S Bai, Y Cai, R Chen, K Chen, X Chen, Z Cheng… - arXiv preprint arXiv …, 2025"],"snippet":"… Document Parsing: For document parsing, we collect 3 million PDFs from Common Crawl, evenly distributed across 10 document types (300K samples each), along with 4 million internal documents. An in-house layout model first predicts the …","url":["https://arxiv.org/pdf/2511.21631"]} {"year":"2025","title":"RAG in the Wild: On the (In) effectiveness of LLMs with Mixture-of-Knowledge Retrieval Augmentation","authors":["R Xu, Y Zhuang, Y Yu, H Wang, W Shi, C Yang - arXiv preprint arXiv:2507.20059, 2025"],"snippet":"… 2024) – a large-scale, multi-domain datastore that combines general web sources (eg, CommonCrawl) with specialized domains (eg, PubMed). 
We evaluate tasks spanning both general knowledge and domain-specific QA, where no prior …","url":["https://arxiv.org/pdf/2507.20059"]} {"year":"2025","title":"RAG-sec bot: Orchestrating compliance and portability by leveraging localized LLMs in contextualized dialogue systems","authors":["A Pal, MG Mathew, AVG Moorthy, CVS Babu - AIP Conference Proceedings, 2025"],"snippet":"The design objective of a conversational agent, is to mimic the colloquial comportment and discourse patterns exhibited by human interlocutors, primarily via textual modality. To maintain its relevance and efficacy, a chatbot necessitates …","url":["https://pubs.aip.org/aip/acp/article-abstract/3260/1/020006/3355405"]} {"year":"2025","title":"Ranking Generated Answers","authors":["S Heineking, J Probst, D Steinbach, M Potthast…"],"snippet":"… We therefore obtained only the original web documents from CommonCrawl, discarded those containing fewer than 50 characters in the HTML body, and extracted plain text using the Resiliparse library.We were able to restore 6,692 web …","url":["https://downloads.webis.de/publications/papers/heineking_2025a.pdf"]} @@ -10785,9 +10984,11 @@ {"year":"2025","title":"Reading and Writing at a Distance: Integrating Corpus and AI Literacy in the Classroom","authors":["K Löser - International Conference on Artificial Intelligence in …, 2025"],"snippet":"This paper presents a conceptual framework that fuses corpus-based analysis (distant reading) with generative AI practice (distant writing) to advance critical digital literacy in secondary and tertiary classrooms. 
Using accessible corpus tools—COCA, DWDS …","url":["https://link.springer.com/chapter/10.1007/978-3-031-98465-5_56"]} {"year":"2025","title":"Real-TabPFN: Improving Tabular Foundation Models via Continued Pre-training With Real-World Data","authors":["A Garg, M Ali, N Hollmann, L Purucker, S Müller… - 1st ICML Workshop on Foundation …"],"snippet":"… accuracy compared to using broader, potentially noisier corpora like CommonCrawl or GitTables. Our resulting model, Real-TabPFN, … The prevalence of smaller datasets in broad corpora like CommonCrawl and GitTable contrasts with …","url":["https://openreview.net/pdf?id=BtEiqKsIMw"]} {"year":"2025","title":"Real-time Monitoring of Economic Shocks using Company Websites","authors":["M Koenig, J Rauch, M Woerter - arXiv preprint arXiv:2502.17161, 2025"],"snippet":"… We use the CommonCrawl dataset to access historical information from company websites. CommonCrawl is an extensive and constantly updated collection of web data that covers a large part of the web content and allows access to the historical …","url":["https://arxiv.org/pdf/2502.17161"]} +{"year":"2025","title":"Real-World Phishing and Smishing Detection Using Deep Learning: A Comparative Study of LSTM, GRU, and GloVe Embeddings","authors":["AM CHOUGULE, DRKS OZA, VT PATIL…"],"snippet":"… GloVe embeddings are pretrained on large corpora (eg, Common Crawl), enabling them to capture both semantic and syntactic relationships between words in vector space. 
This property allows the model to better understand the context of …","url":["https://www.researchgate.net/profile/Annasaheb-Chougule/publication/398120452_Real-World_Phishing_and_Smishing_Detection_Using_Deep_Learning_A_Comparative_Study_of_LSTM_GRU_and_GloVe_Embeddings/links/692be3ad7185551710653cd9/Real-World-Phishing-and-Smishing-Detection-Using-Deep-Learning-A-Comparative-Study-of-LSTM-GRU-and-GloVe-Embeddings.pdf"]} {"year":"2025","title":"Realistic and Ethical Use of Artificial Intelligence","authors":["M Savin-Baden, Z Savin-Baden - 2025"],"snippet":"Current perspectives on artificial intelligence (AI) tend to focus on the wide parameters between fear and hope. Realistic and Ethical Use of Artificial Intelligence provides some (ethical) hope with realistic solutions. It deals with the …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=tneWEQAAQBAJ&oi=fnd&pg=PT9&dq=commoncrawl&ots=viQfDHzidb&sig=x1cAUsVBRmV1_hpnHpXy0Uu_gds"]} {"year":"2025","title":"RealSyn: An Effective and Scalable Multimodal Interleaved Document Transformation Paradigm","authors":["T Gu, K Yang, C Zhang, Y Xie, X An, Z Feng, D Liu… - arXiv preprint arXiv …, 2025"],"snippet":"… 2024) dataset uses a comprehensive filtering strategy and includes 141 million web pages, 353 million associated images, and 115 billion text tokens extracted from Common Crawl. However, due to data format constraints and training …","url":["https://arxiv.org/pdf/2502.12513"]} {"year":"2025","title":"Reasoning Beyond Limits: Advances and Open Problems for LLMs","authors":["MA Ferrag, N Tihanyi, M Debbah - arXiv preprint arXiv:2503.22732, 2025"],"snippet":"… 7B architecture, the model is further pre-trained on an extensive corpus of 120 billion math-related tokens extracted from Common Crawl, complemented by natural language and code data. 
As a result, DeepSeekMath 7B achieves an impressive …","url":["https://arxiv.org/pdf/2503.22732"]} +{"year":"2025","title":"Reasoning transfer for an extremely low-resource and endangered language: Bridging languages through sample-efficient language understanding","authors":["KT Tran, B O'Sullivan, HD Nguyen - 2025"],"snippet":"Recent advances have enabled Large Language Models (LLMs) to tackle reasoning tasks by generating chain-of-thought (CoT) rationales, yet these gains have largely applied to high-resource languages, leaving low-resource languages behind. In this …","url":["https://cora.ucc.ie/bitstreams/5154a05b-f7ae-4951-849c-a572b6780e7b/download"]} {"year":"2025","title":"Rebranding empire in the age of generative AI","authors":["D Lakshmi S - Frontiers in Communication, 2025"],"snippet":"… These datasets are scraped from Wikipedia articles, Reddit forums, and Common Crawl archives. But whose knowledge is scraped? Which languages are missing? …","url":["https://www.frontiersin.org/journals/communication/articles/10.3389/fcomm.2025.1604361/abstract"]} {"year":"2025","title":"Recall language modeling","authors":["J Hewitt"],"snippet":"Let’s say I have a batch size B and a maximum sequence length n. I’ve probably set B and n such that I have as long sequences as I can, and as large a batch as I can, such that it’ll fit on my GPU cluster. So, that’s Bn tokens I can potentially learn from …","url":["https://www.cs.columbia.edu/~johnhew/coms4705/lectures/lec7.pdf"]} {"year":"2025","title":"Recent Trends on Artificial Intelligence in Automated Hate Speech Detection","authors":["N Goyal, A Kumar, A Chaddha, D Lakshmi - Ethical AI Solutions for Addressing Social …, 2025"],"snippet":"This study investigates the performance of AI in detecting HS in diverse cultural and contextual settings. Existing AI models, trained primarily on English datasets, struggle with regional dialects, idiomatic phrases, and cultural nuances. 
A …","url":["https://www.igi-global.com/chapter/recent-trends-on-artificial-intelligence-in-automated-hate-speech-detection/371743"]} @@ -10800,11 +11001,13 @@ {"year":"2025","title":"REFRAG: Rethinking RAG based Decoding","authors":["X Lin, A Ghosh, BKH Low, A Shrivastava, V Mohan - arXiv preprint arXiv:2509.01092, 2025"],"snippet":"Large Language Models (LLMs) have demonstrated remarkable capabilities in leveraging extensive external knowledge to enhance responses in multi-turn and agentic applications, such as retrieval-augmented generation (RAG). However …","url":["https://arxiv.org/pdf/2509.01092"]} {"year":"2025","title":"Reframing the performance and ethics of “empathic” AI: Wisdom of the crowd and placebos","authors":["MA Thornton, MA Thornton"],"snippet":"Recently, claims have emerged that artificial intelligence (AI) is better at providing empathy than humans. These claims come paired with suggestions that people should use empathic AI to supplement human empathy. This paper critically …","url":["https://osf.io/zf9w5_v2/download"]} {"year":"2025","title":"Register Always Matters: Analysis of LLM Pretraining Data Through the Lens of Language Variation","authors":["A Myntti, E Henriksson, V Laippala, S Pyysalo - arXiv preprint arXiv:2504.01542, 2025"],"snippet":"… (2020) validated the quality of their Pile datasets by evaluating models trained on the Pile, CommonCrawl, and CC-100. Likewise, Burchell et al. (2025… HPLT v2 datasets have been processed from a combination of Internet Archive and Common …","url":["https://arxiv.org/pdf/2504.01542"]} +{"year":"2025","title":"Regulating AI as a Person?","authors":["D Sella-Villa - 2025"],"snippet":"… The LLMs they conceived “trained on a dataset called the Common Crawl, which is a collection of over three billion web pages.”… The latter phrase is grammatically correct, but less likely to be found in a training data set, like the Common Crawl. 
See …","url":["https://papers.ssrn.com/sol3/Delivery.cfm?abstractid=5919967"]} {"year":"2025","title":"Reimagining Unit Test Generation with AI: A Journey from Evolutionary Models to Transformers","authors":["SZ Esubalew, BG Assefa - IEEE Access, 2025"],"snippet":"The rapid evolution of software development demands efficient and scalable unit testing methodologies to ensure software reliability. Traditional manual test case generation is time-consuming and often inadequate for modern agile workflows …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/11121142.pdf"]} {"year":"2025","title":"Reinforced Disentangled HTML Representation Learning with Hard-Sample Mining for Phishing Webpage Detection","authors":["JH Yoon, SJ Buu, HJ Kim - Electronics, 2025"],"snippet":"… The datasets used in this study include benign data from Common Crawl and phishing data from Phishtank and Mendeley Data, as summarized in Table 3. The benign dataset, collected in February 2023, contains 1,048,575 instances, providing …","url":["https://www.mdpi.com/2079-9292/14/6/1080"]} {"year":"2025","title":"Reinforcement Pre-Training on General-Domain Corpora: Scaling Next-Token Reasoning Beyond Mathematical Text","authors":["A Abdulloh - Authorea Preprints, 2025"],"snippet":"… Scaling to full WebText or Common Crawl distributions introduces additional challenges: fine-grained domain classification becomes … Future work should investigate GD-RPT on trillion-token corpora such as Common Crawl or The Pile …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.176315661.10663467"]} {"year":"2025","title":"Relationships between Urban Growth and Intercity Networks based on Toponym Co-occurrences","authors":["B Lee, H Shin, M Woo - Journal of the Korean Regional Science Association, 2025"],"snippet":"It is crucial to comprehend the operational principles of intercity networks, as the hub-centered network model to enhance regional competitiveness is being advocated such as a megaregion 
plan. However, traditional urban network analysis as a factor of urban …","url":["https://koreascience.kr/article/JAKO202518254003443.pdf"]} {"year":"2025","title":"Remembering Unequally: Global and Disciplinary Bias in LLM-Generated Co-Authorship Networks","authors":["G Kalhor, A Mashhadi - arXiv preprint arXiv:2511.00476, 2025"],"snippet":"… While the exact sources of the Llama 4 Scout training dataset are unknown, it is known that Llama models are trained on a mixture of publicly available data, including Common Crawl. This dataset likely contains a substantial amount of …","url":["https://arxiv.org/pdf/2511.00476"]} +{"year":"2025","title":"Remoe: Towards Efficient and Low-Cost MoE Inference in Serverless Computing","authors":["W Liu, Y Hu, R Zhou, B Li, N Wang - arXiv preprint arXiv:2512.18674, 2025"],"snippet":"Mixture-of-Experts (MoE) has become a dominant architecture in large language models (LLMs) due to its ability to scale model capacity via sparse expert activation. Meanwhile, serverless computing, with its elasticity and pay-per-use billing, is well-suited …","url":["https://arxiv.org/pdf/2512.18674"]} {"year":"2025","title":"Reparameterized LLM Training via Orthogonal Equivalence Transformation","authors":["Z Qiu, S Buchholz, TZ Xiao, M Dax, B Schölkopf, W Liu - arXiv preprint arXiv …, 2025"],"snippet":"While large language models (LLMs) are driving the rapid advancement of artificial intelligence, effectively and reliably training these large models remains one of the field's most significant challenges. To address this challenge, we propose POET, a …","url":["https://arxiv.org/pdf/2506.08001"]} {"year":"2025","title":"Representation Learning for Tabular Data: A Comprehensive Survey","authors":["JP Jiang, SY Liu, HR Cai, Q Zhou, HJ Ye - arXiv preprint arXiv:2504.16109, 2025"],"snippet":"Tabular data, structured as rows and columns, is among the most prevalent data types in machine learning classification and regression applications. 
Models for learning from tabular data have continuously evolved, with Deep Neural Networks (DNNs) …","url":["https://arxiv.org/pdf/2504.16109"]} {"year":"2025","title":"Representation Learning Methods for Association Prediction Tasks in Drug Discovery","authors":["S Sadeghi - 2024"],"snippet":"Abstract Representation learning is a key step in bridging machine learning and drug discovery. Understanding the interactions between drugs and various biological entities is critical for drug discovery. In this research, we explore advanced …","url":["https://search.proquest.com/openview/75c5425bc7fdba6bb813d3eb7b00851b/1?pq-origsite=gscholar&cbl=18750&diss=y"]} @@ -10812,17 +11015,21 @@ {"year":"2025","title":"RePro: Training Language Models to Faithfully Recycle the Web for Pretraining","authors":["Z Yu, C Xiong - arXiv preprint arXiv:2510.10681, 2025"],"snippet":"High-quality pretraining data is the fossil fuel of large language models (LLMs), yet its reserves are running low for frontier models. In this paper, we introduce RePro, a novel web recycling method that trains a relatively small LM with reinforcement …","url":["https://arxiv.org/pdf/2510.10681"]} {"year":"2025","title":"Research Challenges and Opportunities for Open Generative Modeling","authors":["A Gokaslan - 2025"],"snippet":"This dissertation develops methods to make generative modeling more accessible, reliable, and legally grounded across vision, biology, and language. 
I introduce CommonCanvas, an open latent diffusion pipeline trained solely on Creative-Commons-licensed …","url":["https://search.proquest.com/openview/34330f6a1cc3c63473153aad3bd532ac/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Research on Key Methods for Extracting High-Quality Chinese Corpus Based on Common Crawl","authors":["L Xiao, Z Zhao, C Wang, J Zhang, F Liu - 2025 28th International Conference on …, 2025"],"snippet":"… This paper introduces a method to extract high-quality Chinese corpora based on the Common Crawl dataset, enhancing existing quality filtering and deduplication methods. In terms of deduplication, this paper enhances the Simhash algorithm to …","url":["https://ieeexplore.ieee.org/abstract/document/11033399/"]} +{"year":"2025","title":"Research on the Application of Cross-Language Transfer Learning Model in English Translation for Low-Resource Scenarios","authors":["X Wu, R Deng - IEEE Access, 2025"],"snippet":"… is a large-scale collection of automatically mined parallel sentences extracted from Common Crawl web data. It spans hundreds of language pairs and is aligned using LASER embeddings and cosine similarity scoring. While noisier than …","url":["https://ieeexplore.ieee.org/iel8/6287639/10820123/11224889.pdf"]} {"year":"2025","title":"ResearchQA: Evaluating Scholarly Question Answering at Scale Across 75 Fields with Survey-Mined Questions and Rubrics","authors":["LS Yifei, A Chang, C Malaviya, M Yatskar - arXiv preprint arXiv:2509.00496, 2025"],"snippet":"Evaluating long-form responses to research queries heavily relies on expert annotators, restricting attention to areas like AI where researchers can conveniently enlist colleagues. 
Yet, research expertise is widespread: survey articles synthesize …","url":["https://arxiv.org/pdf/2509.00496"]} {"year":"2025","title":"Resilience Outcomes Benchmark: Toward an Outcome-Labeled Coping Strategy Dataset for Precision Mental Health","authors":["S Anand - NeurIPS 2025 AI for Science Workshop"],"snippet":"Most AI benchmarks still measure static competence—accuracy on fixed math, coding, and knowledge-recall tasks. But intelligence that matters in care is adaptive effectiveness: knowing which actions help which people, at what dose, and on what …","url":["https://openreview.net/pdf?id=XFMw8Ww9Gp"]} {"year":"2025","title":"Responsible AI and AI Governance","authors":["T Duke, P Giudici - Responsible AI in Practice: A Practical Guide to Safe …, 2025"],"snippet":"Responsible AI is a new and nascent field, and the term “responsible AI” has been used interchangeably with the term “ethical AI” in recent years. In this chapter, we’ll look at a brief history of responsible AI and the factors influencing its emergence as …","url":["https://link.springer.com/chapter/10.1007/979-8-8688-1166-1_1"]} {"year":"2025","title":"Responsible AI in Practice","authors":["AI Human, T Duke, P Giudici"],"snippet":"… Sourced from the Common Crawl web index, LAION-5B is a popular open source training dataset containing over 5.8 billion images used for image generation. 
It was used to train Stable Diffusion, an image generator introduced by Stability AI (a UK-based …","url":["https://link.springer.com/content/pdf/10.1007/979-8-8688-1166-1.pdf"]} +{"year":"2025","title":"Responsible Health AI Readiness and Maturity Index (RHAMI): Healthcare Systems' Novel Automated Optimization of Responsible Scaled AI Outcomes and ROI …","authors":["DJ Monlezun, G Marshall, L Omutoko, P Oduor… - 2025"],"snippet":"… for more healthcare system actors worldwide regardless of technical background, we also performed an AI assisted literature review using the free ChatGPT large language model (LLM), in addition to reviewing news articles, social media, books …","url":["https://www.preprints.org/frontend/manuscript/8553afe834cd6008d66d7bfe1e6da340/download_pub"]} {"year":"2025","title":"Responsibly Training Foundation Models: Actualizing Ethical Principles for Curating Large-Scale Training Datasets in the Era of Massive AI Models","authors":["MK Scheuerman, D Zhao, JTA Andrews, A Birhane… - Companion Publication of …, 2025"],"snippet":"AI technologies have become ubiquitous, influencing domains from healthcare to finance and permeating our daily lives. Concerns about the values underlying the creation and use of datasets to develop AI technologies are growing. Current …","url":["https://dl.acm.org/doi/abs/10.1145/3715070.3748285"]} {"year":"2025","title":"Restoring Rhythm: Punctuation Restoration Using Transformer Models for Bangla, a Low-Resource Language","authors":["MO Mamun, MA Mamun, A Ahmad, MIH Emu - arXiv preprint arXiv:2507.18448, 2025"],"snippet":"Punctuation restoration enhances the readability of text and is critical for post-processing tasks in Automatic Speech Recognition (ASR), especially for low-resource languages like Bangla. 
In this study, we explore the application of transformer-based …","url":["https://arxiv.org/pdf/2507.18448"]} +{"year":"2025","title":"Restructuring the Corpus Makes RAG Work for Math","authors":["N Arabzadeh, W Ma, S Min, M Zaharia - The 5th Workshop on Mathematical Reasoning and …"],"snippet":"Large Language Models (LLMs) achieve strong performance on mathematical problem solving when guided by chain-of-thought prompting or trained on reasoning traces. Yet it remains unclear whether Retrieval-Augmented Generation (RAG) …","url":["https://openreview.net/pdf?id=6cYmnzJViJ"]} {"year":"2025","title":"Retention analysis of edited knowledge after fine-tuning","authors":["F Wen, S Zhang - arXiv preprint arXiv:2507.14198, 2025"],"snippet":"… Specifically, for our experiments on the GPT-2 XL, which was pre-trained on webtext, we choose the Common Crawl dataset for fine-tuning. Since a small subset of the data suffices to demonstrate the influence, we sampled 60k data for our …","url":["https://arxiv.org/pdf/2507.14198"]} {"year":"2025","title":"Rethinking Data Mixture for Large Language Models: A Comprehensive Survey and New Perspectives","authors":["Y Liu, C Chen, J Yang, R Sun - arXiv preprint arXiv:2505.21598, 2025"],"snippet":"Training large language models with data collected from various domains can improve their performance on downstream tasks. However, given a fixed training budget, the sampling proportions of these different domains significantly impact the …","url":["https://arxiv.org/pdf/2505.21598"]} {"year":"2025","title":"Rethinking Fingerprinting: An Assessment of Behavior-based Methods at Scale and Implications for Web Tracking","authors":["K Crichton, LF Cranor, N Christin - … on Privacy Enhancing Technologies YYYY (X)"],"snippet":"Most common forms of web tracking fail to maintain the continuity of a user’s identity over long periods of time: cookies get deleted, IP addresses are reassigned, attributes used for browser fingerprinting change. 
These identity discontinuities help …","url":["https://www.andrew.cmu.edu/user/nicolasc/publications/Crichton-PETS25.pdf"]} {"year":"2025","title":"Rethinking Multilingual Continual Pretraining: Data Mixing for Adapting LLMs Across Languages and Resources","authors":["Z Li, S Ji, H Luo, J Tiedemann - arXiv preprint arXiv:2504.04152, 2025"],"snippet":"… 2024), a large-scale multilingual dataset derived from Common Crawl5, covering 419 languages. Since web-crawled text does not inherently guarantee monolingual integrity, we employ GlotLID (Kargaran et al.… 5https://commoncrawl.org/ …","url":["https://arxiv.org/pdf/2504.04152"]} {"year":"2025","title":"Rethinking phishing detection: How dataset quality affects model generalization","authors":["P Afonso, E Maia, I Amorim, I Praça - 2025 15th International Conference on …, 2025"],"snippet":"Phishing remains a pervasive cybersecurity threat, prompting the development of numerous detection models and the creation of various public benchmark datasets for evaluation. However, despite the apparent diversity of these datasets, their …","url":["https://ieeexplore.ieee.org/abstract/document/11185656/"]} +{"year":"2025","title":"Rethinking Web Cache Design for the AI Era","authors":["Y Zhang, J Cai, A Wildani, A Klimovic - 2025"],"snippet":"… Public crawl statistics from Common Crawl, which performs large-scale legitimate web crawls on a monthly basis, show that the vast majority of fetched pages are unique [16]. 
Our experiments using Crawl4AI [50], a web crawling framework …","url":["https://yazhuozhang.com/assets/publication/socc25-rethinking-web-cache.pdf"]} {"year":"2025","title":"Rethinking Web Security in the Age of Artificial Intelligence: Emerging Threats and Compliance Challenges","authors":["M Zha - 2025"],"snippet":"As AI-driven services become deeply integrated into the fabric of the web, security and compliance challenges increasingly arise at system boundaries—where platforms interoperate with users, third-party applications, and large language model …","url":["https://search.proquest.com/openview/b174190d4a8685d85c3c5dd47d9f0b87/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Retrieval-Augmented Generation for Benchmark Datasets: Techniques, Challenges, and Applications","authors":["K Alushi"],"snippet":"… The experiment will focus on eight datasets, which focus on varying domains ranging from the more commonly used Common Crawl or Wikipedia based datasets to more specialized fields such as social welfare, travel or cooking. They also have …","url":["https://www.inf.uni-hamburg.de/en/inst/ab/lt/teaching/theses/completed-theses/2025-ma-alushi.pdf"]} {"year":"2025","title":"Retrieval-Augmented Purifier for Robust LLM-Empowered Recommendation","authors":["L Ning, W Fan, Q Li - arXiv preprint arXiv:2504.02458, 2025"],"snippet":"… [66] introduced CCNet, an automated pipeline designed to efficiently extract vast amounts of high-quality monolingual datasets from the Common Crawl corpus across various languages. Beyond enhancing the quality of training data …","url":["https://arxiv.org/pdf/2504.02458"]} @@ -10844,6 +11051,7 @@ {"year":"2025","title":"Risk Assessment and Security Analysis of Large Language Models","authors":["X Zhang, D Lyu, X Li - arXiv preprint arXiv:2508.17329, 2025"],"snippet":"… Initially, they are parameterised using massive datasets such as OpenWebText and Common Crawl. 
Subsequently, by continuously expanding parameter scales as seen in models like the GPT series, PaLM, and LLaMA, they progressively expand …","url":["https://arxiv.org/pdf/2508.17329"]} {"year":"2025","title":"RiskHarvester: A Risk-based Tool to Prioritize Secret Removal Efforts in Software Artifacts","authors":["SK Basak, T Pardeshi, B Reaves, L Williams - arXiv preprint arXiv:2502.01020, 2025"],"snippet":"… In our study, we used the pre-trained fastText model cc.en.300.bin, trained on Common Crawl and Wikipedia with 5-character n-grams, a window size of 5, and 10 negatives. We used the fasttext [10] package of Python to access the model and …","url":["https://arxiv.org/pdf/2502.01020"]} {"year":"2025","title":"RLP: Reinforcement as a Pretraining Objective","authors":["A Hatamizadeh, SN Akter, S Prabhumoye, J Kautz… - arXiv preprint arXiv …, 2025"],"snippet":"The dominant paradigm for training large reasoning models starts with pre-training using next-token prediction loss on vast amounts of data. Reinforcement learning, while powerful in scaling reasoning, is introduced only as the very last phase of post-training …","url":["https://arxiv.org/pdf/2510.01265"]} +{"year":"2025","title":"Robust and Scalable Cross-Lingual Transfer","authors":["FD Schmidt - 2025"],"snippet":"Cross-Lingual Transfer (xlt) in natural language processing (NLP) fundamentally aims to beneficially leverage data in one language for performing tasks in another language. In recent years, this paradigm of transfer learning has only become ever …","url":["https://search.proquest.com/openview/6e42efcd54166fa1c395561ad75e2147/1?pq-origsite=gscholar&cbl=2026366&diss=y"]} {"year":"2025","title":"Robust Bias Detection in MLMs and its Application to Human Trait Ratings","authors":["I Shrestha, L Tay, P Srinivasan - arXiv preprint arXiv:2502.15600, 2025"],"snippet":"… The MLMs assessed are trained on datasets up to 2019 from sources like Common Crawl, BookCorpus, and Wikipedia. 
These likely lack adequate … Key to note is that RoBERTa’s training corpus is 50% news data from Common Crawl (CC-News) …","url":["https://arxiv.org/pdf/2502.15600"]} {"year":"2025","title":"Robust LLM Fingerprinting via Domain-Specific Watermarks","authors":["T Gloaguen, R Staab, N Jovanović, M Vechev - arXiv preprint arXiv:2505.16723, 2025"],"snippet":"As open-source language models (OSMs) grow more capable and are widely shared and finetuned, ensuring model provenance, ie, identifying the origin of a given model instance, has become an increasingly important issue. At the same time …","url":["https://arxiv.org/pdf/2505.16723"]} {"year":"2025","title":"Robust, efficient, and knowledge-augmented text generation with pre-trained language models","authors":["J Li - 2025"],"snippet":"Pre-trained Language Models (PLMs) have significantly advanced the field of text generation. However, their practical application is often hindered by challenges related to systematic capability evaluation, high computational costs for training and …","url":["https://umontreal.scholaris.ca/bitstreams/50e60c9c-19c6-4f09-84b3-c0bcd9357453/download"]} @@ -10853,6 +11061,7 @@ {"year":"2025","title":"RSTHFS: A Rough Set Theory-Based Hybrid Feature Selection Method for Phishing Website Classification","authors":["JH Setu, N Halder, A Islam, MA Amin - IEEE Access, 2025"],"snippet":"Phishing is a pervasive form of cybercrime where malicious websites deceive users into revealing sensitive information, eg, passwords and credit card details. 
Despite advances in cybersecurity, accurately detecting phishing websites remains …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/10965675.pdf"]} {"year":"2025","title":"RUAccent: Advanced System for Stress Placement in Russian with Homograph Resolution","authors":["DA Petrov - Proceedings of the 31st International Conference on …, 2025"],"snippet":"This paper presents a novel approach to the problem of stress placement in Russian text, with a particular focus on resolving homographs. We introduce a comprehensive system that combines morphological analysis, context-aware neural …","url":["https://aclanthology.org/2025.coling-main.444.pdf"]} {"year":"2025","title":"S-DAT: A Multilingual, GenAI-Driven Framework for Automated Divergent Thinking Assessment","authors":["J Haase, PHP Hanel, S Pokutta - arXiv preprint arXiv:2505.09068, 2025"],"snippet":"This paper introduces S-DAT (Synthetic-Divergent Association Task), a scalable, multilingual framework for automated assessment of divergent thinking (DT) -a core component of human creativity. Traditional creativity assessments are often labor-intensive …","url":["https://arxiv.org/pdf/2505.09068"]} +{"year":"2025","title":"Saar Blueprints","authors":["A Serdaroglu"],"snippet":"… The composition of widely used training datasets further illustrates this issue: Common Crawl constitutes 60% of GPT-3’s training data, social media conversations make up 50% of PaLM’s, and Reddit content has been extensively …","url":["https://jean-monnet-saar.eu/wp-content/uploads/2025/10/7011205_Serdaroglu_clean.pdf"]} {"year":"2025","title":"Safeguarding Patient Data: Machine Learning for Phishing URL Detection in Healthcare Systems","authors":["AA Mousa, SADH Hassan, MK Rashid, M Al-Saady"],"snippet":"… Benign URLs for validation were sourced from a 2023 snapshot of the Common Crawl dataset, representing a broad spectrum of contemporary web content. 
Phishing URLs were aggregated from PhishTank (live phishing feed) and …","url":["https://www.researchgate.net/profile/Saif-Al-Deen-H-Hassan-2/publication/391835382_Safeguarding_Patient_Data_Machine_Learning_for_Phishing_URL_Detection_in_Healthcare_Systems/links/68286e12df0e3f544f550374/Safeguarding-Patient-Data-Machine-Learning-for-Phishing-URL-Detection-in-Healthcare-Systems.pdf"]} {"year":"2025","title":"Safety and Security Analysis of Large Language Models: Risk Profile and Harm Potential","authors":["C Akiri, H Simpson, K Aryal, A Khanna, M Gupta - arXiv preprint arXiv:2509.10655, 2025"],"snippet":"While the widespread deployment of Large Language Models (LLMs) holds great potential for society, their vulnerabilities to adversarial manipulation and exploitation can pose serious safety, security, and ethical risks. As new threats continue to …","url":["https://arxiv.org/pdf/2509.10655"]} {"year":"2025","title":"Sailor2: Sailing in South-East Asia with Inclusive Multilingual LLMs","authors":["L Dou, Q Liu, F Zhou, C Chen, Z Wang, Z Jin, Z Liu… - arXiv preprint arXiv …, 2025"],"snippet":"… For SEA language data that provide local text and knowledge, we extract content from 96 CommonCrawl snapshots spanning from summer 2013 to April 2024. Additionally, to extract high-quality and professional text, we also leverage publicly …","url":["https://arxiv.org/pdf/2502.12982"]} @@ -10868,7 +11077,10 @@ {"year":"2025","title":"Scalable Video-to-Dataset Generation for Cross-Platform Mobile Agents","authors":["Y Jang, Y Song, S Sohn, L Logeswaran, T Luo, DK Kim… - arXiv preprint arXiv …, 2025"],"snippet":"… Our data collection process begins with CommonCrawl web posts, specifically utilizing the C4 [40] and Dolma [46] datasets. 
These web posts represent actual user discussions and questions about mobile OS tasks, providing a natural distribution of …","url":["https://arxiv.org/pdf/2505.12632"]} {"year":"2025","title":"SCALE: Upscaled Continual Learning of Large Language Models","authors":["J Lee, J Choi, B Hwang, J Choo, B Kim, JS Yi, J Lee… - arXiv preprint arXiv …, 2025"],"snippet":"We revisit continual pre-training for large language models and argue that progress now depends more on scaling the right structure than on scaling parameters alone. We introduce SCALE, a width upscaling architecture that inserts lightweight …","url":["https://arxiv.org/pdf/2511.03270"]} {"year":"2025","title":"Scaling Agents via Continual Pre-training","authors":["L Su, Z Zhang, G Li, Z Chen, C Wang, M Song, X Wang… - arXiv preprint arXiv …, 2025"],"snippet":"Large language models (LLMs) have evolved into agentic systems capable of autonomous tool use and multi-step reasoning for complex problem-solving. However, post-training approaches building upon general-purpose foundation …","url":["https://arxiv.org/pdf/2509.13310"]} +{"year":"2025","title":"Scaling Behavior of Discrete Diffusion Language Models","authors":["D von Rütte, J Fluri, O Pooladzandi, B Schölkopf… - arXiv preprint arXiv …, 2025"],"snippet":"Modern LLM pre-training consumes vast amounts of compute and training data, making the scaling behavior, or scaling laws, of different models a key distinguishing factor. Discrete diffusion language models (DLMs) have been proposed as an …","url":["https://arxiv.org/pdf/2512.10858"]} +{"year":"2025","title":"Scaling Behavior of Encoder Language Models in Low-Resource Settings","authors":["R Visser, T Grobler, M Dunaiski - Southern African Conference for Artificial …, 2025"],"snippet":"… splitting the entire common crawl datasets into 1MB chunks and randomly selecting chucks until the desired data volume was reached. 
The 100MB-1GB volumes represent the amount of data commonly found for low-resource languages …","url":["https://link.springer.com/chapter/10.1007/978-3-032-11733-5_22"]} {"year":"2025","title":"Scaling Embedding Layers in Language Models","authors":["D Yu, E Cohen, B Ghazi, Y Huang, P Kamath, R Kumar… - arXiv preprint arXiv …, 2025"],"snippet":"We propose SCONE ($\\textbf{S}$calable, $\\textbf{C}$ontextualized, $\\textbf{O}$ffloaded, $\\textbf{N}$-gram $\\textbf{E}$mbedding), a method for extending input embedding layers to enhance language model performance as layer size scales. To avoid …","url":["https://arxiv.org/pdf/2502.01637"]} +{"year":"2025","title":"Scaling Equitable Reflection Assessment in Education via Large Language Models and Role-Based Feedback Agents","authors":["C Zhang, X Luo - arXiv preprint arXiv:2511.11772, 2025"],"snippet":"… “i have learn what are large language models,large language models(LLMs)are very large deep learning models that are pre-trained on vast amounts of data.this can ingest massive amounts of data,often from the Internet,but also from sources …","url":["https://arxiv.org/pdf/2511.11772"]} {"year":"2025","title":"Scaling Language-Free Visual Representation Learning","authors":["D Fan, S Tong, J Zhu, K Sinha, Z Liu, X Chen… - arXiv preprint arXiv …, 2025"],"snippet":"Visual Self-Supervised Learning (SSL) currently underperforms Contrastive Language-Image Pretraining (CLIP) in multimodal settings such as Visual Question Answering (VQA). This multimodal gap is often attributed to the semantics …","url":["https://arxiv.org/pdf/2504.01017"]} {"year":"2025","title":"Scaling Laws for Optimal Data Mixtures","authors":["M Shukor, L Bethune, D Busbridge, D Grangier, E Fini… - arXiv preprint arXiv …, 2025"],"snippet":"Large foundation models are typically trained on data from multiple domains, with the data mixture--the proportion of each domain used--playing a critical role in model performance. 
The standard approach to selecting this mixture relies on trial …","url":["https://arxiv.org/pdf/2507.09404"]} {"year":"2025","title":"Scaling Laws for Speculative Decoding","authors":["S Yan, M Zhu, G Jiang, J Wang, J Chen, W Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"The escalating demand for efficient decoding in large language models (LLMs) is particularly critical for reasoning-intensive architectures like OpenAI-o3 and DeepSeek-R1, which depend on extended chain-of-thought reasoning. This study …","url":["https://arxiv.org/pdf/2505.07858"]} @@ -10891,6 +11103,7 @@ {"year":"2025","title":"SecuDevSLM: A Systematic Security Evaluation Framework for Small Language Models on Mobile Devices","authors":["Z Li, T Tu, Y Lian, Z Kong, A Liu, W Li - 2025"],"snippet":"The deployment of Small Language Models (SLMs) on edge devices introduces critical security challenges, as their constrained architectures and heterogeneous runtime environments exacerbate vulnerabilities like hallucination and jailbreaking—risks …","url":["https://www.researchsquare.com/article/rs-7352742/latest.pdf"]} {"year":"2025","title":"Security Alignment of Large Language Models via Jailbreaking Attacks","authors":["NØ Jacobsen"],"snippet":"… of their resource availability from CommonCrawl 2. A language is categorized as a high-resource language if its data ratio on CommonCrawl is above 1%. A language is categorized as a medium-resource language if its data ratio on …","url":["https://projekter.aau.dk/projekter/files/784376436/Masters_Thesis_LLM_Jailbreaking.pdf"]} {"year":"2025","title":"Security and Privacy Challenges of AIGC in Metaverse: A Comprehensive Survey","authors":["S Zhang, H Li, K Sun, H Chen, Y Wang, S Li - ACM Computing Surveys, 2025"],"snippet":"The Metaverse is a hybrid environment that integrates both physical and virtual realms. The Metaverse has been accessible due to many facilitating technologies. 
One of the essential technologies that contribute to the Metaverse is AIGC. It is …","url":["https://dl.acm.org/doi/pdf/10.1145/3729419"]} +{"year":"2025","title":"Security Risks in Large Language Models and General Mitigation Strategies","authors":["R Zhang, M Kanyane - … Innovative Cybersecurity Solutions and Approaches to …, 2026"],"snippet":"… Even when prompt phrases are nontoxic, ChatGPT, which was trained on Common Crawl datasets, may generate sentences with significant toxicity. The content of these generative AIs, by and large, is problematic, more so in …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=2zycEQAAQBAJ&oi=fnd&pg=PA3&dq=commoncrawl&ots=GoSmdhxLcm&sig=uHPzKEEDAPDlbl5LTNQGvolnhhw"]} {"year":"2025","title":"Seed-Coder: Let the Code Model Curate Data for Itself","authors":["Y Zhang, J Su, Y Sun, C Xi, X Xiao, S Zheng, A Zhang… - arXiv preprint arXiv …, 2025"],"snippet":"… We implemented text extraction procedures on Common Crawl and identified two distinct categories of raw data: 1) web pages with explicit code tags (such as . . . ) in HTML that are readily extractable using standard rules, and 2) non-explicit …","url":["https://arxiv.org/pdf/2506.03524"]} {"year":"2025","title":"SeedAIchemy: LLM-Driven Seed Corpus Generation for Fuzzing","authors":["A Wen, NA Alzahrani, J Jiang, A Joe, K Shieh, A Zhang…"],"snippet":"… Common Crawl search does not use an LLM; it deterministically searches archived web content … Common Crawl [11] is an open source project started in 2007 that crawls the web and public archive data. Each crawl retrieves tens to …","url":["http://people.eecs.berkeley.edu/~daw/papers/seedaichemy.pdf"]} {"year":"2025","title":"Seeing Cause and Time: a Visually Grounded Evaluation of Multimodal Models","authors":["S Ergoli, A Bondielli, A Lenci - 2025"],"snippet":"… It is a largescale image corpus derived from CommonCrawl, composed exclusively of images with Creative Commons licenses. 
This choice ensures ethical usage and avoids copyright issues prevalent in many traditional image datasets. To …","url":["https://clic2025.unica.it/wp-content/uploads/2025/09/41_main_long.pdf"]} @@ -10923,12 +11136,14 @@ {"year":"2025","title":"Shortcut Learning in Generalist Robot Policies: The Role of Dataset Diversity and Fragmentation","authors":["Y Xing, X Luo, J Xie, L Gao, H Shen, J Song - arXiv preprint arXiv:2508.06426, 2025"],"snippet":"Generalist robot policies trained on large-scale datasets such as Open X-Embodiment (OXE) demonstrate strong performance across a wide range of tasks. However, they often struggle to generalize beyond the distribution of their training data. In this paper …","url":["https://arxiv.org/pdf/2508.06426"]} {"year":"2025","title":"Siamese Hybrid Network Approach for Sentence Similarity","authors":["DAA Deepal, A Bandara, PRS De Silva - Vidyodaya Journal of Science, 2024"],"snippet":"This paper presents a novel Siamese Hybrid Network approach, namely Siamese Bidirectional Long Short Memory with Convolutional Neural Network (SiBiLConv), for evaluating the similarity in natural language. The model integrates a Siamese …","url":["http://journals.sjp.ac.lk/index.php/vjs/article/view/7833/5489"]} {"year":"2025","title":"SIGIR 2025--LiveRAG Challenge Report","authors":["D Carmel, S Filice, G Horowitz, Y Maarek, O Somekh… - arXiv preprint arXiv …, 2025"],"snippet":"… The Fineweb dataset [11] consists of cleaned and de-duplicated Web content from CommonCrawl6. While Fineweb is relatively cleaner than other web-scale datasets, it still contains some toxic or offensive material and non-English pages …","url":["https://arxiv.org/pdf/2507.04942"]} +{"year":"2025","title":"Sigma-Moe-Tiny Technical Report","authors":["Q Hu, Z Lin, Z Yang, Y Ding, X Liu, Y Jiang, R Wang… - arXiv preprint arXiv …, 2025"],"snippet":"Mixture-of-Experts (MoE) has emerged as a promising paradigm for foundation models due to its efficient and powerful scalability. 
In this work, we present Sigma-MoE-Tiny, an MoE language model that achieves the highest sparsity compared to existing …","url":["https://arxiv.org/pdf/2512.16248"]} {"year":"2025","title":"Sign Operator for Coping with Heavy-Tailed Noise in Non-Convex Optimization: High Probability Bounds Under (L0, L1)-Smoothness","authors":["N Kornilov, P Zmushko, M Yandex, A Semenov…"],"snippet":"In recent years, non-convex optimization problems are more often described by generalized (L0, L1)-smoothness assumption rather than standard one. Meanwhile, severely corrupted data used in these problems has increased the demand for …","url":["https://labmmo.ru/upload/000/u8/c/c/2502-07923v2.pdf"]} {"year":"2025","title":"Sign Operator for Coping with Heavy-Tailed Noise: High Probability Convergence Bounds with Extensions to Distributed Optimization and Comparison Oracle","authors":["N Kornilov, P Zmushko, A Semenov, A Gasnikov… - arXiv preprint arXiv …, 2025"],"snippet":"The growing popularity of AI optimization problems involving severely corrupted data has increased the demand for methods capable of handling heavy-tailed noise, ie, noise with bounded $\\kappa$-th moment, $\\kappa \\in (1,2]$. For the widely used …","url":["https://arxiv.org/pdf/2502.07923"]} {"year":"2025","title":"Sign Spotting Disambiguation using Large Language Models","authors":["JH Low, OM Sincan, R Bowden - arXiv preprint arXiv:2507.03703, 2025"],"snippet":"Sign spotting, the task of identifying and localizing individual signs within continuous sign language video, plays a pivotal role in scaling dataset annotations and addressing the severe data scarcity issue in sign language translation. 
While …","url":["https://arxiv.org/pdf/2507.03703"]} {"year":"2025","title":"Sign-SGD is the Golden Gate between Multi-Node to Single-Node Learning: Significant Boost via Parameter-Free Optimization","authors":["D Medyakov, S Stanko, G Molodtsov, P Zmushko… - arXiv preprint arXiv …, 2025"],"snippet":"… 2020] — a cleaned and filtered version of Common Crawl data specifically curated for language model pre-training. See the detailed description of the experimental setup in Appendix A.2. We compare the following methods: Sign-SGD …","url":["https://arxiv.org/pdf/2506.03725"]} -{"year":"2025","title":"Similarity as Likelihood Ratio: Coupling Representations from Machine Learning (and Other Sources) With Cognitive Models","authors":["GE Cox"],"snippet":"Similarity lies at the core of theories of memory and perception. To understand similarity relations among complex items like text and images, researchers often rely on machine learning to derive high-dimensional vector representations of those …","url":["https://osf.io/download/v7xuz_v2/"]} +{"year":"2025","title":"Similarity as Likelihood Ratio: Coupling Representations from Machine Learning (and Other Sources) With Cognitive Models","authors":["GE Cox","GE Cox - Psychonomic Bulletin & Review, 2026"],"snippet":"Similarity lies at the core of theories of memory and perception. To understand similarity relations among complex items like text and images, researchers often rely on machine learning to derive high-dimensional vector representations of those …","url":["https://link.springer.com/article/10.3758/s13423-025-02828-w","https://osf.io/download/v7xuz_v2/"]} {"year":"2025","title":"Similarity Shuffled Criss-cross Transformer with Angle Loss for Image-text Matching","authors":["R Chen, T Su, H Wang, Z Ni - IEEE Transactions on Multimedia, 2025"],"snippet":"Image-text matching aims to retrieve images from the guidance of textual queries or retrieve text expressions with the help of images. 
Existing Transformer-based methods compute attention for all tokens and thus suffer from redundant information …","url":["https://ieeexplore.ieee.org/abstract/document/11194259/"]} +{"year":"2025","title":"Simple idea discovery in a minimalist llm architecture implementation","authors":["R Chihaia, M Trocan, F Leon - Proceedings of the 20th Conference on Computer …, 2025"],"snippet":"Large Language Models (LLMs) capture linguistic structure by operating on sequences of sub-word tokens, yet they often display behaviors that suggest an implicit grasp of high-level concepts. This study probes whether such “ideas” are …","url":["https://annals-csis.org/proceedings/2025/pliks/5480.pdf"]} {"year":"2025","title":"Simple Morphology, Complex Models: A Benchmark Study and Error Analysis of POS Tagging for Martinican Creole","authors":["L Mompelat - Proceedings of the 2025 CLASP Conference on …, 2025"],"snippet":"Part-of-speech (POS) tagging is a foundational task in NLP pipelines, but its development for Creole languages remains limited due to sparse annotated data and structural divergence from high-resource languages. This paper presents the …","url":["https://aclanthology.org/2025.clasp-main.1.pdf"]} {"year":"2025","title":"Simplification of German Narrative Documents with Longformer mBART","authors":["T Schomacker - 2025"],"snippet":"Transformer-models have become the most prominent method for solving a multitude of natural language processing (NLP) tasks since their introduction in 2017. Natural Language Generation (NLG) is one of these problems. In this thesis we …","url":["https://reposit.haw-hamburg.de/bitstream/20.500.12738/17075/1/MA_Simplification%20of%20German%20Narrative%20Documents.pdf"]} {"year":"2025","title":"SimRE: A Requirements Similarity Tool for Software Product Lines","authors":["MI Limaylla-Lunarejo, N Condori-Fernandez… - 2025"],"snippet":"A Software Product Line (SPL) is a paradigm that effectively describes families of products based on reuse. 
Requirements engineering in this domain is a complex task, especially when new products are introduced. In this context, identifying …","url":["https://lbd.udc.es/Repository/Publications/Drafts/1741695248136_227_220.pdf"]} @@ -10954,9 +11169,11 @@ {"year":"2025","title":"SoftMatcha: A Soft and Fast Pattern Matcher for Billion-Scale Corpus Searches","authors":["H Deguchi, G Kamoda, Y Matsushita, C Taguchi… - arXiv preprint arXiv …, 2025"],"snippet":"Researchers and practitioners in natural language processing and computational linguistics frequently observe and analyze the real language usage in large-scale corpora. For that purpose, they often employ off-the-shelf pattern-matching tools …","url":["https://arxiv.org/pdf/2503.03703"]} {"year":"2025","title":"SoK: Advances and Open Problems in Web Tracking","authors":["Y Vekaria, Y Beugin, S Munir, G Acar, N Bielova… - arXiv preprint arXiv …, 2025"],"snippet":"Web tracking is a pervasive and opaque practice that enables personalized advertising, retargeting, and conversion tracking. Over time, it has evolved into a sophisticated and invasive ecosystem, employing increasingly complex techniques …","url":["https://arxiv.org/pdf/2506.14057"]} {"year":"2025","title":"SoK: Data Minimization in Machine Learning","authors":["R Staab, N Jovanović, K Mai, P Ganesh, M Vechev… - arXiv preprint arXiv …, 2025"],"snippet":"Data minimization (DM) describes the principle of collecting only the data strictly necessary for a given task. It is a foundational principle across major data protection regulations like GDPR and CPRA. 
Violations of this principle have substantial real-world …","url":["https://arxiv.org/pdf/2508.10836"]} -{"year":"2025","title":"Sparse CLIP: Co-Optimizing Interpretability and Performance in Contrastive Learning","authors":["PINC LEARNING"],"snippet":"Contrastive Language-Image Pre-training (CLIP) has become a cornerstone in vision-language representation learning, powering diverse downstream tasks and serving as the default vision backbone in multimodal large language models (MLLMs). Despite its …","url":["https://openreview.net/pdf?id=DjefrO8TJr"]} +{"year":"2025","title":"Sparse CLIP: Co-Optimizing Interpretability and Performance in Contrastive Learning","authors":["C Qin, C Venhoff, S Joseph, F Xiao, S Scherer - arXiv preprint arXiv:2601.20075, 2026","PINC LEARNING"],"snippet":"Contrastive Language-Image Pre-training (CLIP) has become a cornerstone in vision-language representation learning, powering diverse downstream tasks and serving as the default vision backbone in multimodal large language models (MLLMs). Despite its …","url":["https://arxiv.org/pdf/2601.20075","https://openreview.net/pdf?id=DjefrO8TJr"]} {"year":"2025","title":"Sparse Subnetwork Enhancement for Underrepresented Languages in Large Language Models","authors":["D Gurgurov, J van Genabith, S Ostermann - arXiv preprint arXiv:2510.13580, 2025"],"snippet":"Large language models exhibit uneven performance across languages, with substantial gaps between highand low-resource languages. We present a framework for enhancing monolingual capabilities of LLMs in underrepresented …","url":["https://arxiv.org/pdf/2510.13580"]} +{"year":"2025","title":"Sparsity-Controllable Dynamic Top-p MoE for Large Foundation Model Pre-training","authors":["C Jin, H Peng, M Xiang, Q Zhang, X Yuan, A Hasan… - arXiv preprint arXiv …, 2025"],"snippet":"… We use the DCLM-Baseline dataset [30], a filtered subset of Common Crawl containing 3.8T tokens, from which we randomly sample 20B to 300B tokens for our experiments. 
We utilize the GPT-NeoX-20B [5] tokenizer with a sequence length of …","url":["https://arxiv.org/pdf/2512.13996"]} {"year":"2025","title":"Spatially resolved polarization swings in the supermassive binary black hole candidate OJ 287 with first Event Horizon Telescope observations","authors":["JL Gómez, I Cho, E Traianou, TP Krichbaum, GY Zhao… - 2025"],"snippet":"We present the first Event Horizon Telescope 1.3 mm observations of the supermassive binary black hole candidate OJ287. The observations achieved an unprecedented angular resolution of 18 µas and reveal significant structural and …","url":["https://www.aanda.org/articles/aa/pdf/forth/aa55831-25.pdf"]} +{"year":"2025","title":"Speaking to the Markets: The Role of IMF Announcements in Investors' Confidence","authors":["B Sagna, S Zerbo - 2025"],"snippet":"This paper examines the effect of IMF staff and executive board announcements on sovereign bond spreads across emerging and developing economies during economic uncertainty. We derive testable predictions from a stylized model in which …","url":["https://www.elibrary.imf.org/downloadpdf/view/journals/001/2025/258/001.2025.issue-258-en.pdf"]} {"year":"2025","title":"Spectral Reconfiguration of Transformer Attention Pathways: A Framework for Modular Signal Decomposition in Large Language Models","authors":["P Stroud, E Littlewood, M Thornleigh, J Watson…"],"snippet":"… Pretraining-compatible corpora consisted of public web crawls curated for safety and licensing, including filtered Common Crawl variants, high-quality encyclopedic text, code corpora with permissive licenses, and academic preprints stripped of 
…","url":["https://www.researchgate.net/profile/Andrew-Scolto/publication/397428001_Spectral_Reconfiguration_of_Transformer_Attention_Pathways_A_Framework_for_Modular_Signal_Decomposition_in_Large_Language_Models/links/690fdebfc900be105cc06678/Spectral-Reconfiguration-of-Transformer-Attention-Pathways-A-Framework-for-Modular-Signal-Decomposition-in-Large-Language-Models.pdf"]} {"year":"2025","title":"Speculating LLMs' Chinese Training Data Pollution from Their Tokens","authors":["Q Zhang, D Wang, H Qian, L Yan, T Zhang, K Xu, Q Li… - arXiv preprint arXiv …, 2025"],"snippet":"… corpus by mixing the related webpages from CommonCrawl8 of 200 normal Chinese tokens … the polluted webpages containing “波*野 结衣” within CommonCrawl and compute its pres- … related to “波*野结衣” from CommonCrawl …","url":["https://arxiv.org/pdf/2508.17771"]} {"year":"2025","title":"Sri Lanka Document Datasets: A Large-Scale, Multilingual Resource for Law, News, and Policy (v20251005)","authors":["NI Senaratna - arXiv preprint arXiv:2510.04124, 2025"],"snippet":"We present a collection of open, machine-readable document datasets covering parliamentary proceedings, legal judgments, government publications, news, and tourism statistics from Sri Lanka. As of v20251005, the collection currently comprises …","url":["https://arxiv.org/pdf/2510.04124"]} @@ -10971,6 +11188,7 @@ {"year":"2025","title":"Stochastic Topological Memory Embedding in Large Language Models: An Empirical Analysis Using Open-Source Neural Architectures","authors":["T Connor, Z Molyneux, E Watson, A Scolto, J Wilson"],"snippet":"Stochastic approaches to memory have long held promise for improving information retention in high-capacity sequence models, yet integration with topological constructs has rarely been explored in practice. 
Introducing stochastic topological …","url":["https://www.researchgate.net/profile/Andrew-Scolto/publication/393461642_Stochastic_Topological_Memory_Embedding_in_Large_Language_Models_An_Empirical_Analysis_Using_Open-Source_Neural_Architectures/links/686ba9f8e4632b045dca4e28/Stochastic-Topological-Memory-Embedding-in-Large-Language-Models-An-Empirical-Analysis-Using-Open-Source-Neural-Architectures.pdf"]} {"year":"2025","title":"Stories that (are) Move (d by) Markets: A Causal Exploration of Market Shocks and Semantic Shifts across Different Partisan Groups","authors":["F Drinkall, S Zohren, M McMahon, JB Pierrehumbert - arXiv preprint arXiv …, 2025"],"snippet":"Macroeconomic fluctuations and the narratives that shape them form a mutually reinforcing cycle: public discourse can spur behavioural changes leading to economic shifts, which then result in changes in the stories that propagate. We show …","url":["https://arxiv.org/pdf/2502.14497"]} {"year":"2025","title":"StoryGem: Voronoi treemap Approach for Semantics-Preserving Text Visualization","authors":["N Oda, Y Onoue - arXiv preprint arXiv:2506.18793, 2025"],"snippet":"… We use this pretrained model because it has word vectors for 157 languages, learned from CommonCrawl and Wikipedia using FastText, making it scalable for many languages. When extracting word vectors, we remove words that are not …","url":["https://arxiv.org/pdf/2506.18793"]} +{"year":"2025","title":"Strategic Decision Framework for Enterprise LLM Adoption","authors":["M Trusov, M Hwang, Z Jamal, S Chandra - arXiv preprint arXiv:2511.18589, 2025"],"snippet":"Organizations are rapidly adopting Large Language Models (LLMs) to transform their operations, yet they lack clear guidance on key decisions for adoption and implementation. 
While LLMs offer powerful capabilities in content generation …","url":["https://arxiv.org/pdf/2511.18589"]} {"year":"2025","title":"Strategies for Utilizing Generative AI in Educational Environments","authors":["WP Jones, SB Logan - 2025"],"snippet":"… That data was taken from databases like Common Crawl, internetbased book corpora, Wikipedia, and an internal corpus of data scraped specifically by OpenAI for its quality. This data is what powers the capabilities of generative AI, but the creators …","url":["https://www.igi-global.com/viewtitle.aspx?titleid=376028"]} {"year":"2025","title":"Strategies for Utilizing Generative","authors":["WP Jones, SB Logan - Institutes of Higher Education (IHE) and Workforce …, 2025"],"snippet":"… That data was taken from databases like Common Crawl, internet-based book corpora, Wikipedia, and an internal corpus of data scraped specifically by OpenAI for its quality. This data is what powers the capabilities of generative AI, but the creators …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=HPJXEQAAQBAJ&oi=fnd&pg=PA275&dq=commoncrawl&ots=XY6xlzJnmX&sig=1PmKeT97mt5iuRUCs0PablLAhRs"]} {"year":"2025","title":"Structural Latency Perturbation in Large Language Models Through Recursive State Induction","authors":["M Mangrum, J Pemberton, B Wetherby, P Montague - arXiv preprint arXiv:2502.00758, 2025"],"snippet":"Computational efficiency has remained a critical consideration in scaling high-capacity language models, with inference latency and resource consumption presenting significant constraints on real-time applications. 
The study has introduced a …","url":["https://arxiv.org/pdf/2502.00758"]} @@ -10985,6 +11203,9 @@ {"year":"2025","title":"SUN'IY INTELLEKT VA KOMPYUTER GRAFIKASI AVTOMATIK TASVIR GENERATSIYASI","authors":["XS Sharifxon o'g'li - Журнал научных исследований и их решений, 2025"],"snippet":"… AI tomonidan ishlab chiqarilgan kontentning sifati va haqiqiyligi bilan bog‘liq xavotirlar paydo bo‘ldi, tadqiqotlar shuni ko‘rsatdiki, Common Crawldan 6 milliarddan ortiq jumlalar namunasidagi jumlalarning 57% dan ortig‘i mashina …","url":["https://inlibrary.uz/index.php/ituy/article/download/82596/84258"]} {"year":"2025","title":"Sunflower: A New Approach To Expanding Coverage of African Languages in Large Language Models","authors":["B Akera, EN Ouma, G Yiga, P Walukagga… - arXiv preprint arXiv …, 2025"],"snippet":"… Large-scale web text from MADLAD-400 [6], Ugandan news articles via Common Crawl, and established multilingual datasets with parallel text across languages including FLORES-200 [42], MT560 [43], Google SMOL [44], TICO-19 [45], MAFAND-MT …","url":["https://arxiv.org/pdf/2510.07203"]} {"year":"2025","title":"Supernova: Achieving More with Less in Transformer Architectures","authors":["AV Tanase, E Pelican - arXiv preprint arXiv:2507.15773, 2025"],"snippet":"We present Supernova, a 650M-parameter decoder-only transformer that demonstrates how careful architectural design and tokenization innovation can achieve the performance of larger models while maintaining computational efficiency …","url":["https://arxiv.org/pdf/2507.15773"]} +{"year":"2025","title":"Supplementary Information: Historical reconstruction of human moralization with word association and text corpora","authors":["A Ramezani, JE Stellar, M Feinberg, Y Xu"],"snippet":"We compared our model performance in reconstructing empirical moral association with alternative model architectures and baselines. 
As described in Methods section of the main text, we designed three distinct architectures—Residual GCN, GCN, and …","url":["https://www.cs.toronto.edu/~yangxu/ramezani_et_al_2025_reconstruction_si.pdf"]} +{"year":"2025","title":"Supporting Information Across the Firewall: Foreign Media's Role in Shaping Chinese Social Media Narratives on the Russo-Ukrainian War","authors":["HWA Hanley, Y Lu, J Pan"],"snippet":"… For each website, we downloaded Common Crawl indexed pagesS3 from between January 1, 2022, … Common Crawl dataset, we performed a breadth-first crawl (15 hops from the homepage) of each website to gather the set of HTML pages …","url":["https://jenpan.com/jen_pan/narratives_appendix.pdf"]} +{"year":"2025","title":"Supporting Vertical Web Search and Customized Search Applications with the Modular and Open Framework MOSAIC","authors":["S Gürtl, A Nussbaumer, C Gütl - 2025"],"snippet":"… Although Common Crawl5 provides a large-scale open web corpus, maintaining an index of its full collection is often less practical than crawling and indexing a smaller, domain-specific subset [5]. The OpenWebSearch.eu6 project addresses this …","url":["https://ceur-ws.org/Vol-4137/WOWS_2025_paper_5.pdf"]} {"year":"2025","title":"SupraTok: Cross-Boundary Tokenization for Enhanced Language Model Performance","authors":["AV Tănase, E Pelican - arXiv preprint arXiv:2508.11857, 2025"],"snippet":"Tokenization remains a fundamental yet underexplored bottleneck in natural language processing, with strategies largely static despite remarkable progress in model architectures. 
We present SupraTok, a novel tokenization architecture that …","url":["https://arxiv.org/pdf/2508.11857"]} {"year":"2025","title":"Surfacing Subtle Stereotypes: A Multilingual, Debate-Oriented Evaluation of Modern LLMs","authors":["M Saeed, M Abdul-Mageed, S Shehata - arXiv preprint arXiv:2511.01187, 2025"],"snippet":"Large language models (LLMs) are widely deployed for open-ended communication, yet most bias evaluations still rely on English, classification-style tasks. We introduce DebateBias-8K, a new multilingual, debate-style benchmark designed to reveal how …","url":["https://arxiv.org/pdf/2511.01187"]} {"year":"2025","title":"Survey of Filtered Approximate Nearest Neighbor Search over the Vector-Scalar Hybrid Data","authors":["Y Lin, K Zhang, Z He, Y Jing, XS Wang - arXiv preprint arXiv:2505.06501, 2025"],"snippet":"… Each vector pair includes a 512-dimensional image embedding and a corresponding 512-dimensional text embedding, both generated from the Common Crawl corpus using the same CLIP model [81]. The scalar part includes the image …","url":["https://arxiv.org/pdf/2505.06501"]} @@ -11004,6 +11225,7 @@ {"year":"2025","title":"Synthetic Social Engineering Scenario Generation using LLMs for Awareness-Based Attack Resilience","authors":["J Webb, F Abri, S Akther - IEEE Access, 2025"],"snippet":"Social engineering is found in a strong majority of cyberattacks today, as it is a powerful manipulation tactic that does not require the technical skills of hacking. 
Calculated social engineers utilize simple communication to deceive and exploit …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/11180053.pdf"]} {"year":"2025","title":"Systematic Technical Survey on LLMOps: Lifecycle, Tools, Challenges, and Emerging Practices","authors":["F Özer - 2025"],"snippet":"The emergence of Large Language Models (LLMs) has transformed artificial intelligence applications across industries, yet their operational management presents challenges that exceed traditional Machine Learning Operations (MLOps) …","url":["https://erepo.uef.fi/bitstreams/4212c75f-1822-4648-aa2c-7c55c55d86c2/download"]} {"year":"2025","title":"Systems and Soft Computing","authors":["A Rahman, SH Mahir, MTA Tashrif, MA Karim, AA Aishi…"],"snippet":"… Primary sources for general linguistic coverage included Common Crawl and WebText, and BooksCorpus was used for long-form structured text data [34, 35]. Domain-specific corpora like PubMed and arXiv were essential for testing the …","url":["https://www.researchgate.net/profile/Abu-Saleh-Musa-Miah/publication/396403075_Comparative_analysis_based_on_DeepSeek_ChatGPT_and_Google_Gemini_Features_techniques_performance_future_prospects/links/68e99ba67d9a4d4e8708a836/Comparative-analysis-based-on-DeepSeek-ChatGPT-and-Google-Gemini-Features-techniques-performance-future-prospects.pdf"]} +{"year":"2025","title":"T-pro 2.0: An Efficient Russian Hybrid-Reasoning Model and Playground","authors":["D Stoianov, D Taranets, O Tsymboi, R Latypov… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce T-pro 2.0, an open-weight Russian LLM for hybrid reasoning and efficient inference. 
The model supports direct answering and reasoning-trace generation, using a Cyrillic-dense tokenizer and an adapted EAGLE speculative-decoding …","url":["https://arxiv.org/pdf/2512.10430"]} {"year":"2025","title":"T\\'yr-the-Pruner: Unlocking Accurate 50% Structural Pruning for LLMs via Global Sparsity Distribution Optimization","authors":["G Li, Y Xu, Z Li, J Liu, X Yin, D Li, E Barsoum - arXiv preprint arXiv:2503.09657, 2025"],"snippet":"Structural pruning enhances hardware-agnostic inference efficiency for large language models (LLMs) but often struggles to maintain performance. Local pruning performs efficient layer-by-layer compression but ignores global topology. Global …","url":["https://arxiv.org/pdf/2503.09657"]} {"year":"2025","title":"TABLET: A Large-Scale Dataset for Robust Visual Table Understanding","authors":["I Alonso, I Miranda, E Agirre, M Lapata - arXiv preprint arXiv:2509.21205, 2025"],"snippet":"While table understanding increasingly relies on pixel-only settings where tables are processed as visual representations, current benchmarks predominantly use synthetic renderings that lack the complexity and visual diversity of real-world tables …","url":["https://arxiv.org/pdf/2509.21205"]} {"year":"2025","title":"Tabular Deep Learning: A Survey from Small Neural Networks to Large Language Models","authors":["S Raieli - 2025"],"snippet":"Tabular data are ubiquitous in several real-world domains, including finance, healthcare, cybersecurity, and ecommerce. 
In spite of the dominance of deep learning for homogeneous data (such as computer vision and natural language …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.175753732.26052568"]} @@ -11023,9 +11245,12 @@ {"year":"2025","title":"Technical and ethical debt in the AI fair use crisis","authors":["G Toscano, E Petrov, L Li, VS Bahl - 2025"],"snippet":"The widespread use of copyrighted data to train AI systems without permission triggered a prolonged “AI fair use crisis.” This paper explores the legal, policy, technical, and ethical dimensions of this issue through the lens of technical and …","url":["https://assets.pubpub.org/3hol3kj3/MITSPR-v6-191618006002-Fair-Use-Data-51754155715156.pdf"]} {"year":"2025","title":"Technical Challenges of Rightsholders' Opt-out From Gen AI Training after Robert Kneschke v. LAION","authors":["S Havlikova - JIPITEC–Journal of Intellectual Property, Information …, 2025"],"snippet":"… Common Crawl Foundation proclaims to comply with Robots.txt and no follow policies of the scraped websites (for these purposes the Common Crawl … ), at the same time Common Crawl’s publicly available Terms of use explicitly limit Common …","url":["https://www.jipitec.eu/jipitec/article/download/422/425"]} {"year":"2025","title":"Technical, legal, and ethical challenges of generative artificial intelligence: an analysis of the governance of training data and copyrights","authors":["M Pasetti, JW Santos, NK Corrêa, N de Oliveira… - Discover Artificial …, 2025"],"snippet":"This article examines the legal, technical, and ethical challenges of generative AI, focusing on the governance of training data and copyright compliance. 
It addresses the growing tension between AI development and the rights of content creators …","url":["https://link.springer.com/article/10.1007/s44163-025-00379-6"]} +{"year":"2025","title":"Technocultural Hegemony: What Role Does Natural Language Processing Play in the Reinforcement of Dominant Cultural Narratives?","authors":["D Markava"],"snippet":"While Natural Language Processing (NLP) tools keep gaining popularity among users from around the globe, their vast majority is developed in the west, mainly the US. Although plenty of studies have shown that NLP tools don't perform equally well …","url":["https://osf.io/download/s4mzb/"]} {"year":"2025","title":"Technological Determination of AI-Relevant Press and Copyright Law and Generative Content's Relevance for EU Competition Law-The referral in Case C-250/25 …","authors":["J Hoffmann - 2025"],"snippet":"… It is therefore not clear whether they also cover other players in the AI value chain – such as providers responsible for web scraping and crawling (eg Common Crawl), providers of training tools and datasets (eg LAION) and – perhaps most importantly …","url":["https://papers.ssrn.com/sol3/Delivery.cfm?abstractid=5411443"]} {"year":"2025","title":"TEDI: Trustworthy and Ethical Dataset Indicators to Analyze and Compare Dataset Documentation","authors":["W Hutiri, M Cimpoi, M Scheuerman, V Matthews… - arXiv preprint arXiv …, 2025"],"snippet":"Dataset transparency is a key enabler of responsible AI, but insights into multimodal dataset attributes that impact trustworthy and ethical aspects of AI applications remain scarce and are difficult to compare across datasets. 
To address this …","url":["https://arxiv.org/pdf/2505.17841"]} {"year":"2025","title":"Tehran's US Options","authors":["LTCRB Price"],"snippet":"In our cover article this month, Matthew Levitt examines potential retaliation by Iran against the US homeland following its 12-day war with Israel and US airstrikes against three of its nuclear facilities.“Iran may seek to carry out reprisal attacks in the …","url":["https://ctc.westpoint.edu/wp-content/uploads/2025/08/CTC-SENTINEL-082025.pdf"]} +{"year":"2025","title":"Teleological Vectors: A Mathematical Framework for Semantic Goal Alignment","authors":["C Royse"],"snippet":"Methods: The Teleological Vectors Framework extends Harris’s distributional hypothesis from linguistic semantics to goal-directed systems through the Teleological Distributional Hypothesis: goals pursued through similar action …","url":["https://www.researchgate.net/profile/Chris-Royse/publication/398078250_Teleological_Vectors_A_Mathematical_Framework_for_Semantic_Goal_Alignment/links/6929ace8abe27c41e5163cb4/Teleological-Vectors-A-Mathematical-Framework-for-Semantic-Goal-Alignment.pdf"]} +{"year":"2025","title":"Template-Guided Rule Generation and Evaluation for Data Quality using Large Language Models","authors":["A Schneider - 2025"],"snippet":"… CommonCrawl [32] is often used to train on data crawled from the internet. 
It contains over 250 billion web pages that have been accumulated … , thus some model providers use pre-processed or filtered sources that are based on …","url":["https://repositum.tuwien.at/bitstream/20.500.12708/221852/1/Schneider%20Alexander%20-%202025%20-%20Template-Guided%20Rule%20Generation%20and%20Evaluation%20for...pdf"]} {"year":"2025","title":"Temporally Extending Existing Web Archive Collections for Longitudinal Analysis","authors":["L Frew, ML Nelson, MC Weigle - arXiv preprint arXiv:2505.24091, 2025"],"snippet":"… Existing data sets of webpage crawls commonly used for academic purposes, such as ClueWeb [28] and Common Crawl,2 aim to collect snapshots of a large amount of unique URLs. Other large crawls focused on specific domains include the …","url":["https://arxiv.org/pdf/2505.24091"]} {"year":"2025","title":"TepiSense: A Social Computing based Real-Time Epidemic Surveillance System using Artificial Intelligence.","authors":["B Tahir, MA Mehmood - IEEE Access, 2025"],"snippet":"Artificial Intelligence (AI) technologies have enabled researchers to develop tools to monitor real-world events and user behavior using social media platforms. Twitter is particularly useful for gathering invaluable information related to diseases and …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/10858732.pdf"]} {"year":"2025","title":"Term-Driven Classification of Low-Resource Mathematical Documents in Uzbek Language","authors":["N Boltayev, S Urazmetova, S Yakubov, X Shonazarov… - 2025 IEEE 26th …, 2025"],"snippet":"This article describes an algorithm for classifying mathematical documents in the Uzbek language that relies on the use of terms with different meanings. 
For each section of mathematics (discrete mathematics, probability theory, mathematical …","url":["https://ieeexplore.ieee.org/abstract/document/11096769/"]} @@ -11034,6 +11259,7 @@ {"year":"2025","title":"Test-Time Code-Switching for Cross-lingual Aspect Sentiment Triplet Extraction","authors":["D Sheng, K Han, H Li, Y Zhang, Y Huang, J Lang… - arXiv preprint arXiv …, 2025","DSKHH Li, Y Zhang, YHJLW Liu"],"snippet":"Aspect Sentiment Triplet Extraction (ASTE) is a thriving research area with impressive outcomes being achieved on high-resource languages. However, the application of cross-lingual transfer to the ASTE task has been relatively unexplored …","url":["https://aclanthology.org/anthology-files/pdf/naacl/2025.naacl-long.260.pdf","https://arxiv.org/pdf/2501.14144"]} {"year":"2025","title":"Test-Time Learning for Large Language Models","authors":["J Hu, Z Zhang, G Chen, X Wen, C Shuai, W Luo, B Xiao… - arXiv preprint arXiv …, 2025"],"snippet":"While Large Language Models (LLMs) have exhibited remarkable emergent capabilities through extensive pre-training, they still face critical limitations in generalizing to specialized domains and handling diverse linguistic variations …","url":["https://arxiv.org/pdf/2505.20633"]} {"year":"2025","title":"Testimony by LLMs","authors":["J He, C Yang - AI & SOCIETY, 2025"],"snippet":"Artificial testimony generated by large language models (LLMs) can be a source of knowledge. However, the requirement that artificial testifiers must satisfy for successful knowledge acquisition is different from the requirement that human …","url":["https://link.springer.com/article/10.1007/s00146-025-02366-y"]} +{"year":"2025","title":"Text and Data Mining Research Exception: The Path to the Legal Woods","authors":["N Eloshvili - Ger.-Geor. J. Comp. L., 2025"],"snippet":"… In particular, LAION used the 'Common Crawl' dataset of the Common Crawl Foundation. 
The defendant downloaded these files from the indicated links and used software to check whether the image descriptions matched with the content …","url":["https://heinonline.org/hol-cgi-bin/get_pdf.cgi?handle=hein.journals/gergeo2025§ion=71"]} {"year":"2025","title":"Text sentiment analysis using machine learning and deep learning models","authors":["S Suruthi, R Sandhiya, P Usha - Advances in Electrical and Computer Technologies, 2025"],"snippet":"… Using GloVe 300-dimensional word vectors, 840 billion characters and 2.2 million tokens from the lexicon were utilized to generate a Common Crawl library. Using an unsupervised learning method, the model depicts words as vectors. The semantic …","url":["https://www.taylorfrancis.com/chapters/edit/10.1201/9781003515470-67/text-sentiment-analysis-using-machine-learning-deep-learning-models-suruthi-sandhiya-usha"]} {"year":"2025","title":"Text Summarization and Multilingual Text to Audio Translation using Deep Learning Models","authors":["B Soni, SK Bharti, A Choudhury - … Conference on Intelligent Computing and Emerging …, 2024"],"snippet":"Different technologies has been combined in this work to read the research paper or review paper. This work introduces a system that can automatically summarize research or review papers into audio files in Hindi and English, making it easier for …","url":["https://ieeexplore.ieee.org/abstract/document/10837105/"]} {"year":"2025","title":"Text Summarization of Indo-Aryan Languages Using Self-attention","authors":["S Hadawle, P Kotkar, O Bhatia, S Dongre, AR Singh - … Proceedings of ICEEE 2024, Volume 2"],"snippet":"Text summarization plays a crucial role in distilling relevant informa-tion from large textual documents. However, there are negligible language models available for working on texts available in local or indigenous languages. 
Regional languages …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=DMlCEQAAQBAJ&oi=fnd&pg=PA484&dq=commoncrawl&ots=wUfY-nAhKW&sig=OCjQYmBujpcLZrlxL8O6LImsXCc"]} @@ -11042,6 +11268,7 @@ {"year":"2025","title":"Text‑Guided Data Attribution: Attributing the Influence of Simplicity Bias to Dataset","authors":["K Shubham, P Sastry - NeurIPS 2025 Workshop: Reliable ML from Unreliable …"],"snippet":"The effectiveness of deep learning models heavily relies on the quality and diversity of their training data. However, datasets collected from different sources often introduce simplicity biases, where a model relies on easily learnable but non-predictive …","url":["https://openreview.net/pdf?id=Zjpacx0mez"]} {"year":"2025","title":"Textagon: Boosting Language Models with Theory-guided Parallel Representations","authors":["JP Lalor, R Qin, D Dobolyi, A Abbasi - Proceedings of the 63rd Annual Meeting of the …, 2025"],"snippet":"Pretrained language models have significantly advanced the state of the art in generating distributed representations of text. 
However, they do not account for the wide variety of available expert-generated language resources and lexicons that …","url":["https://aclanthology.org/2025.acl-demo.9.pdf"]} {"year":"2025","title":"TextAtlas5M: A Large-scale Dataset for Dense Text Image Generation","authors":["AJ Wang, D Mao, J Zhang, W Han, Z Dong, L Li, Y Lin… - arXiv preprint arXiv …, 2025"],"snippet":"… After obtaining images generated by Stable Diffusion and images from CommonCrawl, which contain large fillable text areas (such as billboards, electronic screens, etc.), we use YOLO v11 and RT-DETR_r50vd to identify and label the …","url":["https://arxiv.org/pdf/2502.07870"]} +{"year":"2025","title":"Textual Data Bias Detection and Mitigation-An Extensible Pipeline with Experimental Evaluation","authors":["R Görge, SS Gannamaneni, T Naeven, H Abdelwahab… - arXiv preprint arXiv …, 2025"],"snippet":"Textual data used to train large language models (LLMs) exhibits multifaceted bias manifestations encompassing harmful language and skewed demographic distributions. Regulations such as the European AI Act require identifying and …","url":["https://arxiv.org/pdf/2512.10734"]} {"year":"2025","title":"The “Web Browser” of the Collective Unconscious: The Mirror and Oracle of Generative AI","authors":["JD Batt - Depth Psychology, Myth and Artificial Intelligence Soul …"],"snippet":"… Assembled from Common Crawl, this dataset draws on billions of web-sourced images and their associated metadata. 
It employs advanced filtering mechanisms, including OpenAI’s CLIP model, to ensure relevance and quality, while its …","url":["https://link.springer.com/content/pdf/10.1007/978-3-031-94105-4.pdf#page=254"]} {"year":"2025","title":"The AI Fetish: When Wooden Brains Begin to Think","authors":["A Levant - Caderno Brasileiro de Ensino de Física, 2025"],"snippet":"Este artigo examina como a inteligência artificial se tornou fetichizada no discurso contemporâneo, sendo imaginada como uma força autônoma ao invés de trabalho humano coletivo cristalizado. Baseando-se na teoria do fetichismo da mercadoria …","url":["https://periodicos.ufsc.br/index.php/fisica/article/download/108832/60487"]} {"year":"2025","title":"The AI tool that can interpret any spreadsheet instantly","authors":["DC McElfresh - 2025"],"snippet":"… For example, LLMs such as OpenAI’s GPT-4 are pre-trained on hundreds of billions of documents (if not more), using sources such as Common Crawl (see commoncrawl.org). By contrast, there are very few tabular data sets: Kaggle, one of …","url":["https://www.nature.com/articles/d41586-024-03852-x"]} @@ -11058,6 +11285,7 @@ {"year":"2025","title":"The Cultural Devaluation of Feminized Work: The Evolution of US Occupational Prestige and Gender Typing in Linguistic Representations, 1900 to 2019","authors":["W Jiang - American Sociological Review"],"snippet":"Previous research on occupational devaluation typically evaluates the potential wage declines associated with a significant inflow of women into an occupation; results have been mixed. Few studies, however, examine the cultural mechanism …","url":["https://journals.sagepub.com/doi/abs/10.1177/00031224251362351"]} {"year":"2025","title":"The Data Governance Challenges","authors":["HWS May"],"snippet":"… Mozilla recently published a study noting the dangers of relying on the Common Crawl for trustworthy AI. 
Author Stefan Baack noted that the crawl’s mission does not align with the needs of trustworthy AI developers. He also pointed out that because …","url":["https://www.jstor.org/stable/pdf/resrep58361.9.pdf"]} {"year":"2025","title":"The data-quality illusion: Rethinking Classifier-based quality filtering for LLM Pretraining","authors":["TN Saada, L Bethune, M Klein, D Grangier, M Cuturi… - arXiv preprint arXiv …, 2025"],"snippet":"Large-scale models are pretrained on massive web-crawled datasets containing documents of mixed quality, making data filtering essential. A popular method is Classifier-based Quality Filtering (CQF), which trains a binary classifier to distinguish …","url":["https://arxiv.org/pdf/2510.00866"]} +{"year":"2025","title":"The Deleuzian Representation Hypothesis","authors":["C Cornet, R Besançon, HL Borgne - arXiv preprint arXiv:2512.19734, 2025"],"snippet":"We propose an alternative to sparse autoencoders (SAEs) as a simple and effective unsupervised method for extracting interpretable concepts from neural networks. The core idea is to cluster differences in activations, which we formally justify within …","url":["https://arxiv.org/pdf/2512.19734"]} {"year":"2025","title":"The Disruption of Due Diligence: How Generative AI is Transforming M&A Due Diligence Processes","authors":["I Käyhkö - 2025"],"snippet":"This thesis explores the emerging role of Generative Artificial Intelligence (GenAI) in transforming due diligence processes within mergers and acquisitions (M&A), with a particular focus on financial and operational due diligence conducted by large …","url":["https://aaltodoc.aalto.fi/bitstreams/fdf733e8-3f3d-4e4c-b3c4-b95f2733b5a7/download"]} {"year":"2025","title":"The Effectiveness of Uncased Tokeniziaion for Clinical Notes","authors":["C Paik, K von der Wense - Findings of the Association for Computational …, 2025"],"snippet":"The impact of case-sensitive tokenization on clinical notes is not well understood. 
While clinical notes share similarities with biomedical text in terminology, they often lack the proper casing found in polished publications. Language models, unlike …","url":["https://aclanthology.org/2025.findings-acl.775.pdf"]} {"year":"2025","title":"The Emergence of Abstract Thought in Large Language Models Beyond Any Language","authors":["Y Chen, Y Zhao, Y Zhang, A Zhang, K Kawaguchi… - arXiv preprint arXiv …, 2025"],"snippet":"… Common Crawl is a publicly available web crawl spanning petabytes of data. OSCAR further processes this raw data to produce monolingual corpora across a wide range of languages, making it a valuable resource for training large language …","url":["https://arxiv.org/pdf/2506.09890"]} @@ -11080,6 +11308,7 @@ {"year":"2025","title":"The KL3M Data Project: Copyright-Clean Training Resources for Large Language Models","authors":["MJ Bommarito II, J Bommarito, DM Katz - arXiv preprint arXiv:2504.07854, 2025"],"snippet":"… CommonCOW: Massively huge web corpora from CommonCrawl data and a method to distribute them freely under restrictive EU copyright laws. In Proceedings of the tenth international conference on language resources and evaluation (LREC’16) …","url":["https://arxiv.org/pdf/2504.07854"]} {"year":"2025","title":"The Landscape of Arabic Large Language Models (ALLMs): A New Era for Arabic Language Technology","authors":["S Al-Khalifa, N Durrani, H Al-Khalifa, F Alam - arXiv preprint arXiv:2506.01340, 2025"],"snippet":"The emergence of ChatGPT marked a transformative milestone for Artificial Intelligence (AI), showcasing the remarkable potential of Large Language Models (LLMs) to generate human-like text. 
This wave of innovation has revolutionized how we …","url":["https://arxiv.org/pdf/2506.01340"]} {"year":"2025","title":"The Landscape of Arabic Large Language Models","authors":["S Al-Khalifa, N Durrani, H Al-Khalifa, F Alam - Communications of the ACM"],"snippet":"… For pre-training, the datasets include Web content (for example, Common Crawl), Wikipedia, books, news, and code, covering a wide range of disciplines. Every ALLM development initiative curates, filters, and processes these datasets within …","url":["https://dl.acm.org/doi/full/10.1145/3737453"]} +{"year":"2025","title":"The Latin Language Evolved Over Time, Masked Models Disregard That","authors":["M Cuscito, A Ferrara, M Ruskov - Anthology of Computers and the Humanities, 2025"],"snippet":"Training of Latin language models is rarely done with consideration of important historical watersheds. Here we demonstrate how this leads to a poor performance when specific socio-temporal contextualisation is sought, something common to …","url":["https://anthology.ach.org/volumes/vol0003/the-latin-language-evolved-over-time-masked-models/10.63744@sLAHYnQdA8fu.pdf"]} {"year":"2025","title":"The Law and Ethics of AI Creativity","authors":["H Sun - St. John's Law Review, 2025"],"snippet":"The rise of generative artificial intelligence (“AI”) systems has triggered a backlash among creatives across the globe. In December 2022, artists initiated the No to AI Art movement on social media, 1 primarily as a response to AI companies exploiting …","url":["https://scholarship.law.stjohns.edu/cgi/viewcontent.cgi?article=7287&context=lawreview"]} {"year":"2025","title":"The LiveRAG Challenge at SIGIR 2025","authors":["D Carmel, S Filice, G Horowitz, Y Maarek, O Somekh… - Proceedings of the 48th …, 2025"],"snippet":"… The Fineweb dataset [7] consists of cleaned and de-duplicated Web data from CommonCrawl6. 
This dataset is relatively clean, compared to other Web datasets, however, it still contains some toxic/offensive data, as well as non-English pages …","url":["https://dl.acm.org/doi/abs/10.1145/3726302.3733591"]} {"year":"2025","title":"The Lucie-7B LLM and the Lucie Training Dataset: Open resources for multilingual language generation","authors":["O Gouvert, J Hunter, J Louradour, C Cerisara… - arXiv preprint arXiv …, 2025"],"snippet":"… The Common Crawl foundation15 regularly crawls the web to pick up new material, paying attention to respect opt-out choices from url holders. The OSCAR … We note that RedPajama also uses CCNet models to filter CommonCrawl content as …","url":["https://arxiv.org/pdf/2503.12294"]} @@ -11094,7 +11323,9 @@ {"year":"2025","title":"The PrivaSeer Project: Large-Scale Resources for Analysis of Privacy Policy Text","authors":["S Wilson, F Schaub, L Matheson, S Shayesteh, L Xian"],"snippet":"Privacy policies provide insight into organizations’ data processing practices, but the wealth of privacy policies available on the web contrasts with the challenges of understanding the state of digital privacy at scale. We report on progress made by …","url":["https://shomir.net/pdf/publications/privaseer_soups_2025_paper.pdf"]} {"year":"2025","title":"The promise and perils of smart (city) bots as educational tools","authors":["T Menkhoff, SN KAN, S FOONG - 2024"],"snippet":"… GPT-3 was trained extensively on several data sets such as Common Crawl’s web archive, WebText2 (a private OpenAI dataset created by crawling links from social news website and forum Reddit that had three upvotes) and Wikipedia. 
…","url":["https://ink.library.smu.edu.sg/cgi/viewcontent.cgi?article=8675&context=lkcsb_research"]} {"year":"2025","title":"The Representational Alignment between Humans and Language Models is implicitly driven by a Concreteness Effect","authors":["C Iaia, B Choksi, E Wiebers, G Roig, CJ Fiebach - arXiv preprint arXiv:2505.15682, 2025"],"snippet":"The nouns of our language refer to either concrete entities (like a table) or abstract concepts (like justice or love), and cognitive psychology has established that concreteness influences how words are processed. Accordingly, understanding how …","url":["https://arxiv.org/pdf/2505.15682"]} +{"year":"2025","title":"The Responsible Health AI Readiness and Maturity Index (RHAMI): Applications for a Global Narrative Review of Leading AI Use Cases in Public Health Nutrition","authors":["DJ Monlezun, G Marshall, L Omutoko, P Oduor… - Nutrients, 2026"],"snippet":"… for more healthcare system actors worldwide regardless of technical background, we also performed an AI-assisted literature review using the free ChatGPT 4o large language model (LLM), in addition to reviewing news articles, social media, books …","url":["https://www.mdpi.com/2072-6643/18/1/38"]} {"year":"2025","title":"The Rise of AfricaNLP: Contributions, Contributors, and Community Impact (2005-2025)","authors":["TD Belay, KY Hussen, SH Imam, I Ameer, IS Ahmad… - arXiv preprint arXiv …, 2025"],"snippet":"Natural Language Processing (NLP) is undergoing constant transformation, as Large Language Models (LLMs) are driving daily breakthroughs in research and practice. 
In this regard, tracking the progress of NLP research and automatically …","url":["https://arxiv.org/pdf/2509.25477"]} +{"year":"2025","title":"The Rise Of Large Language Models In NLP: Technological Developments And Their Influence From GPT To Multimodal Systems","authors":["MS PS, MV Sunish"],"snippet":"Large-scale language models (LLMs) have basically changed the limits of what can be achieved by natural language processing and this was the case of GPT-3 at its very first demonstration back in 2020. This study outlines in detail their journey. It …","url":["https://rjwave.org/ijedr/papers/IJEDR2504422.pdf"]} {"year":"2025","title":"The Risks of Generative AI Non-Verifiable Interpretation: Exploring Japanese youkai in English","authors":["M Moriguchi, O Kennedy - Lexicography, 2025"],"snippet":"Since Open AI's release of ChatGPT 3.5 in November 2022, generative AI has greatly impacted lexicography. During this time, Japanese popular culture including manga and anime has become increasingly popular throughout the world, in which …","url":["https://utppublishing.com/doi/abs/10.3138/lexi-2025-0005"]} {"year":"2025","title":"The role of compute thresholds for AI governance","authors":["M Pistillo, S Van Arsdale, L Heim, C Winter - George Washington Journal of Law & …"],"snippet":"… [https://perma.cc/7JK3-JQJ7] (noting that OpenAI filtered the Common Crawl dataset down from 45TB to 570GB, and that the curated dataset was used for 60% of the examples during training). 
Data can also be filtered in other ways, such as to …","url":["https://gwjolt.org/files/volume_1/GW%20JOLT%201_1%20Winter.pdf"]} {"year":"2025","title":"The role of GPT in promoting inclusive higher education for people with various learning disabilities: a review","authors":["TR Gadekallu, G Yenduri, R Kaluri, DS Rajput… - PeerJ Computer Science, 2025"],"snippet":"The generative pre-trained transformer (GPT) is a notable breakthrough in the field of artificial intelligence, as it empowers machines to effectively comprehend and engage in interactions with humans. The GPT exhibits the capacity to enhance …","url":["https://peerj.com/articles/cs-2400/"]} @@ -11117,6 +11348,7 @@ {"year":"2025","title":"Time Series Foundation Models: Benchmarking Challenges and Requirements","authors":["M Meyer, S Kaltenpoth, K Zalipski, O Müller - arXiv preprint arXiv:2510.13654, 2025"],"snippet":"Time Series Foundation Models (TSFMs) represent a new paradigm for time series forecasting, offering zero-shot forecasting capabilities without the need for domain-specific pre-training or fine-tuning. However, as with Large Language Models (LLMs) …","url":["https://arxiv.org/pdf/2510.13654"]} {"year":"2025","title":"Time-IMM: A Dataset and Benchmark for Irregular Multimodal Multivariate Time Series","authors":["C Chang, J Hwang, Y Shi, H Wang, WC Peng, TF Chen… - arXiv preprint arXiv …, 2025"],"snippet":"… • Textual Data: To provide contextual background for environmental conditions, we collect news articles from Common Crawl spanning January to October 2024. 
Articles are filtered to include only those that mention both the county name and the …","url":["https://arxiv.org/pdf/2506.10412"]} {"year":"2025","title":"Tiny Language Models for Automation and Control: Overview, Potential Applications, and Future Research Directions","authors":["I Lamaakal, Y Maleh, K El Makkaoui, I Ouahbi… - Sensors, 2025"],"snippet":"… RedPajama [93]: A dataset comprising over 100 billion text documents, extracted from 84 CommonCrawl snapshots and processed via the … RoBERTa [95] CCNewsV2: A dataset that includes an updated version of the English text from the …","url":["https://www.mdpi.com/1424-8220/25/5/1318"]} +{"year":"2025","title":"TinyML vs LLMs: A Survey of Extreme Scales in Machine Learning","authors":["I Lamaakal, C Yahyati, K El Makkaoui, Y Maleh…"],"snippet":"ML now works with an unmatched range of computational scales, from microcontrollers with models that are only a few kilobytes big to cloud infrastructures with Large Language Models (LLMs) that have trillions of parameters. These …","url":["https://www.researchgate.net/profile/Ismail-Lamaakal/publication/398714289_TinyML_vs_LLMs_A_Survey_of_Extreme_Scales_in_Machine_Learning/links/6941580d7e61d05b530fd976/TinyML-vs-LLMs-A-Survey-of-Extreme-Scales-in-Machine-Learning.pdf"]} {"year":"2025","title":"TISEA: A Scalable Deep Learning Framework for Multi-Faceted Text Analytics with Topic Modeling, Summarization, and Emotion Classification","authors":["R Bera, abdul quadir, CJ Joshua, A Shahina - Engineering Research Express, 2025"],"snippet":"… The Common Crawl German dataset was used to create synthetic data that was used to build the abstractive summary of the provided text. 
Using the ROUGE (Recall-Oriented Understudy for Gisting Evaluation) and BLUE scores as a gauge, the model's …","url":["https://iopscience.iop.org/article/10.1088/2631-8695/ae000d/meta"]} {"year":"2025","title":"TituLLMs: A Family of Bangla LLMs with Comprehensive Benchmarking","authors":["SK Nahin, RN Nandi, S Sarker, QS Muhtaseem… - arXiv preprint arXiv …, 2025"],"snippet":"In this paper, we present TituLLMs, the first large pretrained Bangla LLMs, available in 1B and 3B parameter sizes. Due to computational constraints during both training and inference, we focused on smaller models. To train TituLLMs, we collected a …","url":["https://arxiv.org/pdf/2502.11187"]} {"year":"2025","title":"To a Fandom Far, Far Away: Exploring the Influence of Science Fiction and Fantasy on Modal Judgment, Moral Permissibility, and Creativity","authors":["S Ceynek - 2025"],"snippet":"This study explores the impact of genre preference—science fiction and fantasy—on modal judgment, moral permissibility, creativity, and the role of fan identity. As these genres grow in popularity and influence culture, they spark discussions on futuristic …","url":["https://search.proquest.com/openview/9bf0efb3fc5d631004ada234223cd986/1?pq-origsite=gscholar&cbl=18750&diss=y"]} @@ -11124,11 +11356,19 @@ {"year":"2025","title":"Token Memory Transformer with Infinite Context","authors":["T Sun, K Fujita, K Markov, S Chang - International Conference on Intelligent …, 2025"],"snippet":"This study proposes an infinite context Transformer model based on Token Memory, which aims to solve the problem of contextual limitation in long text processing. 
The core of this model is Token Memory, which stores the context memory for each token …","url":["https://link.springer.com/chapter/10.1007/978-981-95-0020-8_27"]} {"year":"2025","title":"Token-level Ensembling of Models with Different Vocabularies","authors":["R Wicks, K Ravisankar, X Yang, P Koehn, M Post - arXiv preprint arXiv:2502.21265, 2025"],"snippet":"Model ensembling is a technique to combine the predicted distributions of two or more models, often leading to improved robustness and performance. For ensembling in text generation, the next token's probability distribution is derived from …","url":["https://arxiv.org/pdf/2502.21265"]} {"year":"2025","title":"Tokenization is Sensitive to Language Variation","authors":["A Wegmann, D Nguyen, D Jurgens - arXiv preprint arXiv:2502.15343, 2025"],"snippet":"Variation in language is ubiquitous and often systematically linked to regional, social, and contextual factors. Tokenizers split texts into smaller units and might behave differently for less common linguistic forms. This might affect downstream LLM …","url":["https://arxiv.org/pdf/2502.15343"]} +{"year":"2025","title":"Tokenizing Buildings: A Transformer for Layout Synthesis","authors":["ML de Guevara, J Rhee, A Bidgoli, V Razgaitis… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce Small Building Model (SBM), a Transformer-based architecture for layout synthesis in Building Information Modeling (BIM) scenes. We address the question of how to tokenize buildings by unifying heterogeneous feature sets of …","url":["https://arxiv.org/pdf/2512.04832"]} {"year":"2025","title":"TokenSwap: A Lightweight Method to Disrupt Memorized Sequences in LLMs","authors":["K Ponkshe, PP Prashant, B Salimi - Generative and Protective AI for Content Creation"],"snippet":"… Even when available, disjoint datasets are unlikely given that most models train on overlapping web corpora like Common Crawl. 
Additionally, CP-Fuse requires identical tokenizers, limiting comparisons to models within the same family. Similarly, we …","url":["https://openreview.net/pdf?id=FAqLrmEMup"]} +{"year":"2025","title":"TokSuite: Measuring the Impact of Tokenizer Choice on Language Model Behavior","authors":["GS Altıntaş, M Ehghaghi, B Lester, F Liu, W Zhao… - arXiv preprint arXiv …, 2025"],"snippet":"… 2024), which provides highquality content filtered from Common Crawl data. For the multilingual components, we use the Chinese, Turkish, Italian, and Farsi subsets of the FineWeb-2 HQ Dataset (Messmer et al.…","url":["https://arxiv.org/pdf/2512.20757"]} {"year":"2025","title":"TorchTitan: A PyTorch Native Platform for Training Generative AI Models","authors":["T Liu, W Liang - Championing Open-source DEvelopment in ML …"],"snippet":"TorchTitan is a PyTorch native open-source platform (GitHub: https://github.com/pytorch/torchtitan) designed for scalable and flexible training of generative AI models. Integrated tightly with PyTorch's distributed stack while offering efficient optimizations and modular …","url":["https://openreview.net/pdf?id=WuQtmIkiUL"]} {"year":"2025","title":"Toward a Representative DNS Data Corpus: A Longitudinal Comparison of Collection Methods","authors":["C Kranig, E Pauley, WS Wung, P Barford, M Crovella… - 2025 9th Network Traffic …, 2025"],"snippet":"… c) Common Crawl: Since the collection of each common crawl dataset occurs over the course of a month we consider every domain that … 3) Common Crawl: Common Crawl is the smallest dataset that we considered and focuses on the Web …","url":["https://ieeexplore.ieee.org/abstract/document/11096967/"]} {"year":"2025","title":"Toward Bias-Aware and Efficient Offensive Language Detection Using QLoRA-Optimized LLaMA and GPT Models","authors":["N Hussain, A Qasim, F Liaquat, G Mehak, AGM Meque… - … International Conference on …, 2025"],"snippet":"It is well known that detecting offensive language is a challenging task 
in online content moderation. In this work, we compare four open-domain and easily accessible LLMs, namely, LLaMA 2 7B, LLaMA 3 8B, Mistral 8B, and GPT-4o mini 8B—on …","url":["https://link.springer.com/chapter/10.1007/978-3-032-09037-9_17"]} +{"year":"2025","title":"Toward Effective Blocking for Entity Matching","authors":["D Paulsen - 2025"],"snippet":"In this dissertation we study entity matching, a fundamental problem that lies at the heart of data integration for data science and AI. Specifically, we consider the following common entity matching problem: given two tables A and B with the same …","url":["https://search.proquest.com/openview/6fefbb244d23b54a6e25925d2195bff7/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2025","title":"Toward Human-Centered AI-Assisted Terminology Work","authors":["AS Martin - arXiv preprint arXiv:2512.18859, 2025"],"snippet":"… contained 42.08% English-language content, followed distantly by Russian at only 6.48% (Common Crawl 2025a). It is also estimated to … represented in the Common Crawl, which are two blog platforms (Blogspot and WordPress) and …","url":["https://arxiv.org/pdf/2512.18859"]} +{"year":"2025","title":"Toward more truly international comparative research: Current opportunities and challenges of multilingual text analysis with computational methods","authors":["F Lind, SC Volk - Critical Communication Research with Global …, 2025"],"snippet":"This chapter focuses on international comparative content analyses and critically reflects on the pervasive Western influence in published research. 
Based on a secondary data analysis of 186 comparative content analysis studies, it uncovers a …","url":["https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.4324/9781003583073-13&type=chapterpdf"]} +{"year":"2025","title":"Toward Multimodal Agent Intelligence: Perception, Reasoning, Generation and Interaction","authors":["Z Zhou, ML de Melo, TA Rios"],"snippet":"The pursuit of Artificial General Intelligence (AGI) necessitates the development of agents that can understand and interact with the world in a manner akin to humans. A cornerstone of this endeavor is multimodal agent intelligence, which equips …","url":["https://engrxiv.org/preprint/download/5901/9796"]} +{"year":"2025","title":"Toward Next-Generation Database System: Integrating Data Management With Artificial Intelligence","authors":["L Zhou - 2025"],"snippet":"Integrating artificial intelligence (AI) with database (DB) systems unlocks transformative opportunities to overcome the limitations of both fields: the lack of intelligence and automation for data management, and the lack of declarative and …","url":["https://search.proquest.com/openview/4a6f323f816e12e0d73921032b4138bc/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Towards a Better Understanding of IoT Domain Names: A Study of IoT Backend","authors":["I Ayoub, MS Lenders, B Ampeau, S Balakrichenan… - IEEE Access, 2025"],"snippet":"In this paper, we study IoT domain names, the domain names of backend servers on the Internet that are accessed by IoT devices. 
We investigate how they compare to non-IoT domain names based on their statistical and DNS properties, and the …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/10966846.pdf"]} +{"year":"2025","title":"Towards a robust natural language understanding","authors":["M Chabane - 2025"],"snippet":"Natural Language Processing (NLP) technologies have seen remarkable progress in recent years, unlocking new possibilities across domains such as education, healthcare, and social media. However, this progress remains largely confined to …","url":["http://dspace.univ-setif.dz:8888/jspui/bitstream/123456789/6563/1/2524.pdf"]} {"year":"2025","title":"Towards an AI narratology: the possibilities of LLM classification for the quantification of abstract narrative concepts in literary studies","authors":["C Carroll - The Routledge Handbook of AI and Literature, 2024"],"snippet":"While narratology and computational literary criticism share a value for categorisation of formal textual features, the application of computational approaches to narratology has been limited. In this chapter, I argue that the reason …","url":["https://www.taylorfrancis.com/chapters/edit/10.4324/9781003255789-32/towards-ai-narratology-possibilities-llm-classification-quantification-abstract-narrative-concepts-literary-studies-claudia-carroll"]} {"year":"2025","title":"Towards an Explainable Linguistic Approach to the Identification of Expressive Forms Within Arabic Text","authors":["Z Banou, S El Filali, EH Benlahmar, FZ Alaoui… - Engineering Proceedings, 2025"],"snippet":"… To evaluate the effectiveness of our negation and litotes detection pipeline, we applied it to the Arabic portion of the OSCAR corpus, a large-scale multilingual dataset extracted from Common Crawl. 
The dataset, which contains billions of words …","url":["https://www.mdpi.com/2673-4591/112/1/26"]} {"year":"2025","title":"Towards Best Practices for Open Datasets for LLM Training","authors":["S Baack, S Biderman, K Odrozek, A Skowron, A Bdeir… - arXiv preprint arXiv …, 2025"],"snippet":"… behind the Common pile sent transcripts of all the videos they used to the YouTube channels and the work they did on identifying creative commons licensed subsets of Common Crawl is being gifted to Common Crawl for them to use to …","url":["https://arxiv.org/pdf/2501.08365"]} @@ -11161,6 +11401,7 @@ {"year":"2025","title":"Training and Fine-Tuning LLMs","authors":["BP Jeyaraman - Large Language Models Ops for Finance, 2025"],"snippet":"Training and fine-tuning large language models (LLMs) are critical steps in adapting these models to the unique demands of the financial sector. From handling sensitive data to ensuring compliance and delivering accurate, domain-specific insights …","url":["https://link.springer.com/chapter/10.1007/979-8-8688-1700-7_3"]} {"year":"2025","title":"Training Data Attribution (TDA): Examining Its Adoption & Use Cases","authors":["D Cheng, J Bae, J Bullock, D Kristofferson - arXiv preprint arXiv:2501.12642, 2025"],"snippet":"This report investigates Training Data Attribution (TDA) and its potential importance to and tractability for reducing extreme risks from AI. First, we discuss the plausibility and amount of effort it would take to bring existing TDA research efforts from their …","url":["https://arxiv.org/pdf/2501.12642"]} {"year":"2025","title":"Training Dynamics Impact Post-Training Quantization Robustness","authors":["A Catalan-Tatjer, N Ajroldi, J Geiping - arXiv preprint arXiv:2510.06213, 2025"],"snippet":"While post-training quantization is widely adopted for efficient deployment of large language models, the mechanisms underlying quantization robustness remain unclear. 
We conduct a comprehensive analysis of quantization degradation across …","url":["https://arxiv.org/pdf/2510.06213"]} +{"year":"2025","title":"Training Large Language Models","authors":["E Ustaoğlu"],"snippet":"It had always been a dream among scientists and researchers to build machines which can communicate with humans just in the way humans communicate between themselves. However, this task required facing many difficulties for bridging the gap …","url":["https://www.researchgate.net/profile/Erhan-Ustaoglu/publication/397655157_Training_Large_Language_Models/links/692436c8acf4cf6385379180/Training-Large-Language-Models.pdf"]} {"year":"2025","title":"Training Matryoshka Mixture-of-Experts for Elastic Inference-Time Expert Utilization","authors":["Y Wang, Q Hu, Y Ding, R Wang, Y Gong, J Jiao… - arXiv preprint arXiv …, 2025"],"snippet":"Mixture-of-Experts (MoE) has emerged as a promising paradigm for efficiently scaling large language models without a proportional increase in computational cost. However, the standard training strategy of Top-K router prevents MoE models from …","url":["https://arxiv.org/pdf/2509.26520"]} {"year":"2025","title":"Training Optimal Large Diffusion Language Models","authors":["J Ni, Q Liu, C Du, L Dou, H Yan, Z Wang, T Pang… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce Quokka, the first systematic scaling law for diffusion language models (DLMs), encompassing both compute-constrained and data-constrained regimes, and studying the key modeling and optimization designs. Quokka is a good friend of …","url":["https://arxiv.org/pdf/2510.03280"]} {"year":"2025","title":"Training Sparse Mixture Of Experts Text Embedding Models","authors":["Z Nussbaum, B Duderstadt - arXiv preprint arXiv:2502.07972, 2025"],"snippet":"Transformer-based text embedding models have improved their performance on benchmarks like MIRACL and BEIR by increasing their parameter counts. 
However, this scaling approach introduces significant deployment challenges, including …","url":["https://arxiv.org/pdf/2502.07972"]} @@ -11172,11 +11413,14 @@ {"year":"2025","title":"Transformation of ChatGPT into Threat: The Effects of Generative AI on Data Protection and Security","authors":["NJ Manjula, K Randhi, SR Bandarapu - American Journal of Computing and …, 2024"],"snippet":"Purpose: For 2022, GenAI models were the main digital transformation advancement. Cybersecurity is crucial when GenAI models like ChatGPT and Google Bard get more complex. Cybersecurity incidents have highlighted GenAI's offensive and …","url":["https://ideas.repec.org/a/bfy/ojajce/v7y2024i5p12-29id2586.html"]} {"year":"2025","title":"Transformer Architectures for Vocabulary Test Item Difficulty Prediction","authors":["L Skidmore, M Felice, KJ Dunn"],"snippet":"… 2020) is pre-trained on text from 100 languages using a large-scale CommonCrawlbased corpus. It employs SentencePiece tokenisation and is trained with a masked language modelling (MLM) objective. XLM-RoBERTa has been …","url":["https://aclanthology.org/anthology-files/pdf/bea/2025.bea-1.12.pdf"]} {"year":"2025","title":"Transformer Architectures","authors":["P Singh, B Raman - Deep Learning Through the Prism of Tensors, 2024"],"snippet":"This chapter delves into transformer architectures, which have revolutionized natural language processing (NLP) and beyond. It covers the historical context, self-attention mechanisms, and the encoder–decoder structure of transformers. Popular …","url":["https://link.springer.com/chapter/10.1007/978-981-97-8019-8_6"]} +{"year":"2025","title":"Transformer-based approaches for lemmatizing abbreviations in Russian texts","authors":["A Glazkova, O Lyashevskaya, D Morozov, I Smal - 2025"],"snippet":"This paper addresses the task of lemmatizing abbreviations in the Russian language. 
Abbreviation lemmatization is particularly challenging, as it involves not only transforming a word into its normal form but also correctly expanding the …","url":["ftp://ftp.pdmi.ras.ru/pub/publicat/znsl/v546/p032.pdf"]} {"year":"2025","title":"Transformer-Based Intrusion Detection for 5G Core Networks","authors":["Y Thulasinathan - 2025"],"snippet":"… RoBERTa uses a much bigger dataset of 160 GB which is sourced from CommonCrawl News, OpenWeBText, Stories and more, while BERT only trains on a dataset of 16 GB. Batch sizes, training steps and the learning rates of the model are …","url":["https://brocku.scholaris.ca/bitstreams/e367f7ee-faab-4fc9-90cd-bf21f71f4f11/download"]} +{"year":"2025","title":"Transformer-Based Multilingual NLP Model for Low-Resource Language Translation","authors":["AD Alghamdi - Journal of Multiscale Modelling, 2025"],"snippet":"Low-resource language machine translation remains an ongoing issue because of the limited parallel corpora and restricted linguistic diversity. While existing multilingual models such as mBERT or XLM-RoBERTa attain high performance on …","url":["https://www.worldscientific.com/doi/abs/10.1142/S1756973726400184"]} {"year":"2025","title":"Transformer-Based Re-Ranking Model for Enhancing Contextual and Syntactic Translation in Low-Resource Neural Machine Translation","authors":["A Javed, H Zan, O Mamyrbayev, M Abdullah, K Ahmed… - Electronics, 2025"],"snippet":"Neural machine translation (NMT) plays a vital role in modern communication by bridging language barriers and enabling effective information exchange across diverse linguistic communities. Due to the limited availability of data in low-resource …","url":["https://www.mdpi.com/2079-9292/14/2/243"]} {"year":"2025","title":"Transformers and Beyond: A Review of Key NLP Developments","authors":["Y Rathore"],"snippet":"… its massive parameter size and training on a broad dataset comprising Common Crawl, WebText, books, and Wikipedia. 
The training … XLM-R is pretrained on a dataset derived from CommonCrawl, called CC100, which includes cleaned and …","url":["https://www.theyashrathore.in/rp.pdf"]} {"year":"2025","title":"Transformers in Natural Language Processing","authors":["P Singh, B Raman - The Geometry of Intelligence: Foundations of …, 2025"],"snippet":"Language modeling is one of the foundational tasks in NLP and is crucial for various downstream tasks such as machine translation, summarization, and dialog systems. Transformers, with their self-attention mechanisms and ability to model complex …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=gD5fEQAAQBAJ&oi=fnd&pg=PA219&dq=commoncrawl&ots=HtNVYptsa5&sig=_3kvscPt-rAfLsQ3LMYes7SzXfQ"]} {"year":"2025","title":"Transformers-based feedback analysis of e-commerce: a focused study on quality assessment of agriculture products","authors":["W Xu - … Journal of Information and Communication Technology, 2025"],"snippet":"Ensuring the quality of agricultural products in e-commerce is a significant challenge due to product variability and the absence of direct inspection before purchase. Customer reviews serve as a critical source of information, offering insights into …","url":["https://www.inderscienceonline.com/doi/pdf/10.1504/IJICT.2025.146365"]} +{"year":"2025","title":"Transforming advertising in the age of generative AI: exploring advertising professionals' perceptions of human-AI value co-creation","authors":["J Yang, C Dong, SC Chu, M Rheu - International Journal of Advertising, 2025"],"snippet":"This exploratory study examines the adoption of Generative Artificial Intelligence (GAI) in the advertising industry through 34 semi-structured interviews with professionals in related fields. 
Drawing on the service-dominant logic and the value-based …","url":["https://www.tandfonline.com/doi/full/10.1080/02650487.2025.2604910"]} {"year":"2025","title":"Transforming Translation: The Evolution and Impact of AI on Language Transfer and Communication","authors":["DA DePalma, A Lommel - Translation Studies in the Age of Artificial Intelligence"],"snippet":"This chapter examines the evolution of translation technologies from ancient writing systems to the current era dominated by artificial intelligence (AI). It highlights how each technological advancement has not only enhanced the efficiency and reach of …","url":["https://www.taylorfrancis.com/chapters/edit/10.4324/9781003482369-2/transforming-translation-donald-depalma-arle-lommel"]} {"year":"2025","title":"Translate, then Detect: Leveraging Machine Translation for Cross-Lingual Toxicity Classification","authors":["SJ Bell, E Sánchez, D Dale, P Stenetorp, M Artetxe… - arXiv preprint arXiv …, 2025"],"snippet":"Multilingual toxicity detection remains a significant challenge due to the scarcity of training data and resources for many languages. While prior work has leveraged the translate-test paradigm to support cross-lingual transfer across a range of …","url":["https://arxiv.org/pdf/2509.14493"]} {"year":"2025","title":"Translation in the Wild","authors":["Y Balashov - arXiv preprint arXiv:2505.23548, 2025"],"snippet":"Large Language Models (LLMs) excel in translation among other things, demonstrating competitive performance for many language pairs in zeroand few-shot settings. 
But unlike dedicated neural machine translation models, LLMs are not …","url":["https://arxiv.org/pdf/2505.23548"]} @@ -11185,8 +11429,10 @@ {"year":"2025","title":"Trends, Challenges and New Frontiers of Artificial Intelligence","authors":["P Pal Chaudhuri, A Dutta, S Pal Choudhury… - New Kind of Machine …, 2025"],"snippet":"… Many AI models have been trained on contemporary web texts such as in the CommonCrawl Dataset [115], a free, open repository of web crawl data that can be used by anyone, which likely might have a liberal undertone. This is a departure …","url":["https://link.springer.com/chapter/10.1007/978-981-96-1501-8_2"]} {"year":"2025","title":"Triple-Entry Accounting and Other Secure Methods to Preserve User Privacy and Mitigate Financial Risks in AI-Empowered Lifelong Education","authors":["K Sgantzos, P Tzavaras, M Al Hemairy, ER Porras - Journal of Risk and Financial …, 2025"],"snippet":"Within the past five years, and as Artificial Intelligence (AI) increasingly per‑vades the academic and educational landscape, a delicate balance has emerged between leveraging AI’s transformative potential and safeguarding individual privacy, which …","url":["https://www.mdpi.com/1911-8074/18/4/176"]} {"year":"2025","title":"Trust at Your Own Peril: A Mixed Methods Exploration of the Ability of Large Language Models to Generate Expert-Like Systems Engineering Artifacts and a …","authors":["TG Topcu, M Husain, M Ofsa, P Wach - arXiv preprint arXiv:2502.09690, 2025"],"snippet":"Multi-purpose Large Language Models (LLMs), a subset of generative Artificial Intelligence (AI), have recently made significant progress. 
While expectations for LLMs to assist systems engineering (SE) tasks are paramount; the interdisciplinary …","url":["https://arxiv.org/pdf/2502.09690"]} +{"year":"2025","title":"Truth, Meaning, and Intention in Large Language Models","authors":["C Odenwald - 2025"],"snippet":"The arrival of commercially available chatbots that are able to produce human-like language has made philosophical thought experiments come to life. Such language models produce mostly coherent language, but how do their outputs compare to …","url":["https://eplus.uni-salzburg.at/Abschlussarbeiten/content/titleinfo/12867061/full.pdf"]} {"year":"2025","title":"TUM Data Innovation Lab Munich Data Science Institute (MDSI) Technical University of Munich & Zweites Deutsches Fernsehen (ZDF)","authors":["H Ghonim, JW Hwang, T Nielen, TUMMDA Scagliotti - 2025"],"snippet":"… For instance, content from websites like Reddit, where conversations frequently express sexist or racist views, is included in the Common Crawl dataset used to train GPT-3 [3]. Even worse, LLMs run the potential of propagating false information since …","url":["https://www.mdsi.tum.de/fileadmin/w00cet/di-lab/pdf/ZDF_SS25_FinalReport.pdf"]} {"year":"2025","title":"Turing Test 2.0: The General Intelligence Threshold","authors":["G Mappouras - arXiv preprint arXiv:2505.19550, 2025"],"snippet":"… Popular LLMs, like ChatGPT are mainly trained using data that is publicly available on the internet [3] like the massive Common Crawl data set [1]. From that we can infer some information about the training data used to train the LLMs. For …","url":["https://arxiv.org/pdf/2505.19550"]} +{"year":"2025","title":"Two CFG Nahuatl for automatic corpora expansion","authors":["JJ Guzmán-Landa, JM Torres-Moreno… - arXiv preprint arXiv …, 2025"],"snippet":"The aim of this article is to introduce two Context-Free Grammars (CFG) for Nawatl Corpora expansion. 
Nawatl is an Amerindian language (it is a National Language of Mexico) of the $\\pi$-language type, ie a language with few digital resources. For this …","url":["https://arxiv.org/pdf/2512.14239"]} {"year":"2025","title":"Two Spelling Normalization Approaches Based on Large Language Models","authors":["M Domingo, F Casacuberta - arXiv preprint arXiv:2506.23288, 2025"],"snippet":"The absence of standardized spelling conventions and the organic evolution of human language present an inherent linguistic challenge within historical documents, a longstanding concern for scholars in the humanities. Addressing this issue …","url":["https://arxiv.org/pdf/2506.23288"]} {"year":"2025","title":"U'Dedup: Updatable Block-Level Deduplication Scheme over Similar Data in Fog-Assisted Cloud Storage","authors":["M He, X Zhang, L Wang - IEEE Internet of Things Journal, 2025"],"snippet":"Fog-assisted cloud storage enables efficient collection and management of Internet of Things data, while large-scale data raise severe requirements for storage space. 
Deduplication schemes over similar data have been investigated to relieve the …","url":["https://ieeexplore.ieee.org/abstract/document/11017715/"]} {"year":"2025","title":"UI-E2I-Synth: Advancing GUI Grounding with Large-Scale Instruction Synthesis","authors":["X Liu, X Zhang, Z Zhang, Y Lu - arXiv preprint arXiv:2504.11257, 2025"],"snippet":"… For Web platform, we use the dumped webpage metadata in Common Crawl (Common Crawl… The Web is our main data source due to the variety of layouts and design styles across websites, as well as the extensive quantity of webpages available in …","url":["https://arxiv.org/pdf/2504.11257"]} @@ -11196,6 +11442,7 @@ {"year":"2025","title":"Ultra-FineWeb: Efficient Data Filtering and Verification for High-Quality LLM Training Data","authors":["Y Wang, Z Fu, J Cai, P Tang, H Lyu, Y Fang, Z Zheng… - arXiv preprint arXiv …, 2025"],"snippet":"Data quality has become a key factor in enhancing model performance with the rapid development of large language models (LLMs). Model-driven data filtering has increasingly become a primary approach for acquiring high-quality data. However, it …","url":["https://arxiv.org/pdf/2505.05427"]} {"year":"2025","title":"UmuTeam at CheckThat! 2025: Language-Specific versus Multilingual Models for Fact-Checking","authors":["T Bernal-Beltrán, R Pan, JA García-Díaz… - 2025"],"snippet":"… For multilingual and zero-shot scenarios, we employed XLM-RoBERTa-Large [30], a Transformerbased multilingual encoder model trained on 2.5 TB of filtered CommonCrawl data covering 100 languages. 
This model was pre-trained using the …","url":["https://ceur-ws.org/Vol-4038/paper_64.pdf"]} {"year":"2025","title":"Uncertainty Aware LLM Deidentification and Anonymization of Clinical Notes","authors":["A Kline, NG Nia, TJ Smith, D Leibowitz, D Johnston - Authorea Preprints, 2025"],"snippet":"Background: The increasing demand for privacypreserving access to clinical data has catalyzed the development of synthetic Protected Health Information (PHI) corpora for evaluating Named Entity Recognition (NER) systems. Methods: In this …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.175979329.97184142"]} +{"year":"2025","title":"Uncertainty Quantification for Large Language Model Reward Learning under Heterogeneous Human Feedback","authors":["P Liu, J Lu, WW Sun - 2025"],"snippet":"We study the problem of statistical estimation and inference for the reward model used in the training of large language models (LLMs). A key component in aligning LLMs with human values is reinforcement learning from human feedback (RLHF) …","url":["https://web.ics.purdue.edu/~sun244/UQ_for_RLHF.pdf"]} {"year":"2025","title":"Uncovering inequalities in new knowledge learning by large language models across different languages","authors":["C Wang, H Tang, X Yang, Y Xie, J Suh, S Sitaram… - arXiv preprint arXiv …, 2025"],"snippet":"… Following existing research approaches to multilingual natural language processing (NLP) (26,27), we classify languages into different resource levels based on their proportions in the CommonCrawl corpus, which was used to pre-train GPT-3 …","url":["https://arxiv.org/pdf/2503.04064"]} {"year":"2025","title":"Uncovering Unsafety Traits in Italian Language Models","authors":["G Rizzi, G Magazzù, A Sormani, F Pulerà, D Scalena… - Proceedings of the Eleventh …, 2025"],"snippet":"… • Minerva † [20] is the first family of LLMs trained entirely from scratch on native Italian texts using a portion of FineWeb, which includes filtered and deduplicated Common 
Crawl dumps with various timestamps. We adopted the instruction-tuned …","url":["https://clic2025.unica.it/wp-content/uploads/2025/09/90_main_long.pdf"]} {"year":"2025","title":"Understand, Solve and Translate: Bridging the Multilingual Mathematical Reasoning Gap","authors":["H Ko, G Son, D Choi - arXiv preprint arXiv:2501.02448, 2025"],"snippet":"Large language models (LLMs) demonstrate exceptional performance on complex reasoning tasks. However, despite their strong reasoning capabilities in high-resource languages (eg, English and Chinese), a significant performance gap persists in …","url":["https://arxiv.org/pdf/2501.02448"]} @@ -11228,6 +11475,7 @@ {"year":"2025","title":"Using AI Tools to Investigate the Health Effects of Traffic Related PM2. 5 Air Pollution on Chronic Disease Prevalence in the Georgetown Community of Salisbury …","authors":["KV Kelly - 2025"],"snippet":"Research has established that PM 2.5 is a major public and environmental health concern and a key driver of cardiovascular disease, diabetes, and hypertension. PM 2.5 contains a variety of pollutants such as carbon dioxide (CO 2), lead (Pb), arsenic …","url":["https://search.proquest.com/openview/9a1c06b4660500da571f49a90ad46be7/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Using Different Modes of Correction to Improve Fairness","authors":["Y Wang - 2024"],"snippet":"As the prevalence of artificial intelligence continues to grow, we are seeing the increasing impact of algorithmic decision making on people’s daily lives, eg, credit approval, hiring, criminal justice, and student assessment. 
Given the potential impact …","url":["https://search.proquest.com/openview/ea196b67e62926eb7bea3a75cb745fcf/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"Using Knowledge Graphs to harvest datasets for efficient CLIP model training","authors":["S Ging, S Walter, J Bratulić, J Dienert, H Bast, T Brox - arXiv preprint arXiv …, 2025"],"snippet":"… They collected image-text pairs from CommonCrawl and filtered them using Wikipedia and WordNet, then balanced the results. Gadre et al. [10] proposed DataComp, a filtering challenge with a candidate pool of up to 13B image-text pairs …","url":["https://arxiv.org/pdf/2505.02746"]} +{"year":"2025","title":"Using Machine Learning to Detect Scientific Misinformation","authors":["C Impey, A Danehy, M Wenger, S Buxner - Astronomy Education Journal, 2025"],"snippet":"Large language models and machine learning methods have been used to detect pseudoscience and scientific misinformation online. The goal of the project is to develop tools to combat the rise of misinformation that is rapidly drowning out …","url":["https://www.astroedjournal.org/index.php/ijae/article/download/138/79"]} {"year":"2025","title":"Using Scaling Laws for Data Source Utility Estimation in Domain-Specific Pre-Training","authors":["O Ostapenko, C Guille-Escuret, L Kumar, M Tian… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce a framework for optimizing domain-specific dataset construction in foundation model training. Specifically, we seek a cost-efficient way to estimate the quality of data sources (eg synthetically generated or filtered web data, etc.) 
in order …","url":["https://arxiv.org/pdf/2507.22250"]} {"year":"2025","title":"Utilizing ChatGPT to integrate world English and diverse knowledge: A transnational perspective in critical artificial intelligence (AI) literacy","authors":["A Ghimire - Computers and Composition, 2025"],"snippet":"This article proposes the implementation of a transnational post-digital pedagogy and Critical AI literacy incorporating ChatGPT in the classroom. It draws upon Scott Graham's suggestion for a multidimensional recursive writing process, emphasizing …","url":["https://www.sciencedirect.com/science/article/pii/S8755461524000896"]} {"year":"2025","title":"Utilizing Hybrid CNN-SVM and FastText Word Embedding for Twitter Cyberbullying Classification","authors":["A Qoiriah, DO Putri, Y Yamasari, IM Suartana, RE Putra… - 2024 Seventh International …, 2024"],"snippet":"People’s new habits in using social media, including spreading comments or hate speech that leads to cyberbullying behavior, is a concern because it has a serious impact on victims. Twitter has 400 million users with the majority being young users …","url":["https://ieeexplore.ieee.org/abstract/document/10823790/"]} @@ -11235,6 +11483,7 @@ {"year":"2025","title":"Utilizing Parameter-Efficient Fine-Tuning Methods to Improve Controllable Text Generation/submitted by Hector Auvinen","authors":["H Auvinen - 2025"],"snippet":"Abstract Language models have become a crucial component in the field of natural language processing (NLP). Despite demonstrating remarkable performance in various tasks, the text generated by these models often lacks accurate control. 
This …","url":["https://epub.jku.at/obvulihs/content/titleinfo/11984304/full.pdf"]} {"year":"2025","title":"Validating Malicious URLs in Phishing Campaigns Using CNNs","authors":["S Hakala, J Ekström - 2024"],"snippet":"Fraud rates are growing [18], property crime has shifted towards fraud [17], and the world is increasingly online: in 2023, around 89% of households had access to internet [42]. As we continue to move our daily interactions to the internet …","url":["https://helda.helsinki.fi/bitstreams/d410213c-e132-4bcc-bc0d-b2c487612f0e/download"]} {"year":"2025","title":"Variational Autoencoding and Segmentation","authors":["HJM van Genuchten"],"snippet":"Mobile Robotics rely on Simultaneous Localization and Mapping (SLAM) to navigate in unknown and dynamic environments. SLAM works by detecting landmarks in the environment and relating the position of the robot to these landmarks. Semantic …","url":["https://pure.tue.nl/ws/portalfiles/portal/350970848/Genuchten_D.pdf"]} +{"year":"2025","title":"Vashantor: A Large-scale Multilingual Benchmark Dataset for Automated Translation of Bangla Regional Dialects to Bangla","authors":["FTJ Fariaa, MB Moina, A Al Wasea, M Ahmmedb…"],"snippet":"The Bangla linguistic variety is a fascinating mix of regional dialects that contributes to the cultural diversity of the Bangla-speaking community. 
Despite extensive study into translating Bangla to English, English to Bangla, and Banglish to Bangla in the …","url":["https://www.researchgate.net/profile/Mukaffi-Bin-Moin-2/publication/377077547_Vashantor_A_Large-scale_Multilingual_Benchmark_Dataset_for_Automated_Translation_of_Bangla_Regional_Dialects_to_Bangla_Language/links/691d577a1555db2ebd6081cd/Vashantor-A-Large-scale-Multilingual-Benchmark-Dataset-for-Automated-Translation-of-Bangla-Regional-Dialects-to-Bangla-Language.pdf"]} {"year":"2025","title":"VECTOR REPRESENTATION OF CZECH TEXT","authors":["BV EICHLER"],"snippet":"This thesis presents Czechtriever, a retrieval model designed for the Czech language and trained without annotated data. The model is based on the architecture and methodology of Contriever and employs contrastive learning to …","url":["https://theses.cz/id/ni57x9/thesis.pdf"]} {"year":"2025","title":"Verbal Frontiers: Combining Words in the Wild, Computational Modeling, and Behavior Analysis to Explore Verbal Communities","authors":["DJ Cox - Perspectives on Behavior Science, 2025"],"snippet":"Significant advancements in science occur when previously unobservable or immeasurable things critical to theory become observable and measurable. The “verbal community” is a case in point; it plays a critical role in the analysis of verbal behavior …","url":["https://link.springer.com/article/10.1007/s40614-025-00484-y"]} {"year":"2025","title":"VIBE: Vector Index Benchmark for Embeddings","authors":["E Jääsaari, V Hyvönen, M Ceccarello, T Roos… - arXiv preprint arXiv …, 2025"],"snippet":"Approximate nearest neighbor (ANN) search is a performance-critical component of many machine learning pipelines. Rigorous benchmarking is essential for evaluating the performance of vector indexes for ANN search. 
However, the datasets …","url":["https://arxiv.org/pdf/2505.17810"]} @@ -11270,8 +11519,10 @@ {"year":"2025","title":"Web-scale Retrieval Experimentation with chatnoir-pyterrier","authors":["JH Merker, J Bevendorff, M Fröbe, T Hagen, H Scells…"],"snippet":"The IR community has always aimed to improve the realism of retrieval experiments by increasing the size of the document collections. As collection sizes grow from megabytes to giga-, tera-, and maybe soon petabytes, IR labs are challenged to …","url":["https://downloads.webis.de/publications/papers/merker_2025a.pdf"]} {"year":"2025","title":"Web2Wiki: Characterizing Wikipedia Linking Across the Web","authors":["V Veselovsky, T Piccardi, A Anderson, R West, A Arora - arXiv preprint arXiv …, 2025"],"snippet":"… Using a dataset from Common Crawl, we identify over 90 million Wikipedia links spanning 1.68% of Web domains and examine their distribution, context, and function. Our analysis of English Wikipedia reveals three key findings: (1) Wikipedia …","url":["https://arxiv.org/pdf/2505.15837"]} {"year":"2025","title":"WebFAQ: A Multilingual Collection of Natural Q&A Datasets for Dense Retrieval","authors":["M Dinzinger, L Caspari, KG Dastidar, J Mitrović… - arXiv preprint arXiv …, 2025"],"snippet":"… Common Crawl. Our work builds upon the efforts of the Web Data Commons3 (WDC) project, whose focus is the large-scale extraction of structured data from the Common Crawl4 … also utilizes QA pairs extracted from Common Crawl. CCQA …","url":["https://arxiv.org/pdf/2502.20936"]} +{"year":"2025","title":"WebGCN: Web Information Extraction Algorithm Based on Graph Neural Networks","authors":["X Wang, D Yan, Y Wang, H Zhang, X Wen, F Liu… - International Conference on …, 2026"],"snippet":"E-commerce platforms have complex and dynamically changing webpage structures. 
Traditional web crawlers and rule-based information extraction methods struggle to adapt to these challenges, resulting in high maintenance costs and low extraction …","url":["https://link.springer.com/chapter/10.1007/978-981-95-3061-8_5"]} {"year":"2025","title":"Webis at TREC 2024: Biomedical Generative Retrieval, Retrieval-Augmented Generation, and Tip-of-the-Tongue Tracks","authors":["M Fröbe, L Gienapp, JH Merker, H Scells, EO Schmidt…"],"snippet":"In this paper, we describe the Webis Group’s participation in the 2024 edition of TREC. We participated in the Biomedical Generative Retrieval track, the Retrieval-Augmented Generation track, and the Tip-of-the-Tongue track. For the biomedical track, we …","url":["https://trec.nist.gov/pubs/trec33/papers/webis.biogen.rag.tot.pdf"]} {"year":"2025","title":"WebMall--A Multi-Shop Benchmark for Evaluating Web Agents","authors":["R Peeters, A Steiner, L Schwarz, JY Caspary, C Bizer - arXiv preprint arXiv …, 2025"],"snippet":"… To populate the shops with real-world product offers, we used the WDC Extraction of the October 2024 Common Crawl4. The extraction files contain product offers from thousands of real-world e-shops that mark up product offers on their websites using …","url":["https://arxiv.org/pdf/2508.13024"]} +{"year":"2025","title":"Webový korpus slovenčiny ARANEUM+ HPLT+ FineWeb2","authors":["I Ripka - ŠTÚDIE A ČLÁNKY"],"snippet":"úvod Jazykovedný ústav Ľ. Štúra Slovenskej akadémie vied (ďalej JÚĽŠ SAV) si už dlhodobo udržiava korpusovú tradíciu v tvorbe textových korpusov a vo výskume v oblasti korpusovej lingvistiky. 
tvorbe korpusov sa venuje najmä oddelenie …","url":["https://www.juls.savba.sk/ediela/ks/2025/5/ks5-2025.pdf#page=38"]} {"year":"2025","title":"WeLT: Weighted Loss Trainer for Biomedical Joint Entity and Relation Extraction","authors":["G Mobasher - 2025"],"snippet":"The exponential growth of unstructured textual data has emphasised the need for Information Extraction (IE) to transform raw text into actionable knowledge. IE involves automatically identifying and categorising relevant entities, relationships …","url":["https://archiv.ub.uni-heidelberg.de/volltextserver/35978/1/Thesis.pdf"]} {"year":"2025","title":"What Are Chatbots' Stereotypes About? A Data-Driven Analysis of Large Language Models' Content Associations with Social Categories","authors":["G Nicolas, A Caliskan - Proceedings of the AAAI/ACM Conference on AI, Ethics …, 2025"],"snippet":"This study introduces a data-driven taxonomy of stereotype content in contemporary large language models (LLMs). We prompt ChatGPT 4.5, ChatGPT 3.5, Llama 3, and Mixtral 8x7B, four recent and powerful LLMs, for the characteristics associated …","url":["https://ojs.aaai.org/index.php/AIES/article/download/36682/38820"]} {"year":"2025","title":"What are Foundation Models Cooking in the Post-Soviet World?","authors":["A Lavrouk, T Naous, A Ritter, W Xu - arXiv preprint arXiv:2502.18583, 2025"],"snippet":"The culture of the Post-Soviet states is complex, shaped by a turbulent history that continues to influence current events. 
In this study, we investigate the Post-Soviet cultural food knowledge of foundation models by constructing BORSch, a …","url":["https://arxiv.org/pdf/2502.18583"]} @@ -11280,10 +11531,12 @@ {"year":"2025","title":"What Gets Measured Gets Managed: Mitigating Supply Chain Attacks with a Link Integrity Management System","authors":["J So, M Ferdman, N Nikiforakis - arXiv preprint arXiv:2509.14583, 2025"],"snippet":"… To quantify this discussion, we leverage the data archived by the Common Crawl (CC) project [8] to emulate longitudinal analyses. The CC project periodically archives the web with their crawlers and provides its data for public use in anindex defined by the …","url":["https://arxiv.org/pdf/2509.14583"]} {"year":"2025","title":"What happens when generative AI models train recursively on each others' generated outputs?","authors":["HA Vu, G Reeves, E Wenger - arXiv preprint arXiv:2505.21677, 2025"],"snippet":"… Mapping these to realistic scenarios, D∗ could be a public dataset like Common Crawl;Dk could be a private dataset of math problems curated by entity k; and Dt could be an internet scrape from after initial model training. We weight the relative …","url":["https://arxiv.org/pdf/2505.21677"]} {"year":"2025","title":"What Is The Political Content in LLMs' Pre-and Post-Training Data?","authors":["T Ceron, D Nikolaev, D Stammbach, D Nozza - arXiv preprint arXiv:2509.22367, 2025"],"snippet":"Large language models (LLMs) are known to generate politically biased text, yet how such biases arise remains unclear. A crucial step toward answering this question is the analysis of training data, whose political content remains largely …","url":["https://arxiv.org/pdf/2509.22367"]} +{"year":"2025","title":"What Leads to Longer Word of Mouth Discussion?","authors":["R Boghrati, J Berger - 2025"],"snippet":"While a great deal of research has demonstrated word of mouth’s impact, less is known about the broader discussions in which such sharing takes place. 
Some things are talked about for longer, and longer discussion tends to increase …","url":["https://www.journals.uchicago.edu/doi/pdf/10.1086/740059"]} {"year":"2025","title":"When AI Meets the Web: Prompt Injection Risks in Third-Party AI Chatbot Plugins","authors":["Y Kaya, A Landerer, S Pletinckx, M Zimmermann… - arXiv preprint arXiv …, 2025"],"snippet":"… We use the Common Crawl dataset [41] to find websites that embed our target chatbot plugins. Common Crawl (CC) periodically releases snapshots of publicly accessible web data, including the raw HTML content of billions of pages. We scan …","url":["https://arxiv.org/pdf/2511.05797"]} {"year":"2025","title":"When Bad Data Leads to Good Models","authors":["K Li, Y Chen, F Viégas, M Wattenberg - arXiv preprint arXiv:2505.04741, 2025"],"snippet":"In large language model (LLM) pretraining, data quality is believed to determine model quality. In this paper, we re-examine the notion of \"quality\" from the perspective of preand post-training co-design. Specifically, we explore the …","url":["https://arxiv.org/pdf/2505.04741"]} {"year":"2025","title":"When Dimensionality Hurts: The Role of LLM Embedding Compression for Noisy Regression Tasks","authors":["F Drinkall, JB Pierrehumbert, S Zohren - arXiv preprint arXiv:2502.02199, 2025"],"snippet":"Large language models (LLMs) have shown remarkable success in language modelling due to scaling laws found in model size and the hidden dimension of the model's text representation. Yet, we demonstrate that compressed representations of …","url":["https://arxiv.org/pdf/2502.02199"]} {"year":"2025","title":"When Does Language Transfer Help? Sequential Fine-Tuning for Cross-Lingual Euphemism Detection","authors":["J Sammartino, L Barak, J Peng, A Feldman - arXiv preprint arXiv:2508.11831, 2025"],"snippet":"Euphemisms are culturally variable and often ambiguous, posing challenges for language models, especially in low-resource settings. 
This paper investigates how cross-lingual transfer via sequential fine-tuning affects euphemism detection across …","url":["https://arxiv.org/pdf/2508.11831"]} +{"year":"2025","title":"WHEN DYNAMIC COMPETITION AND STATIC COMPETITION MERGE: ANTITRUST, INNOVATION QUESTIONS, AND THE CASE OF GENERATIVE ARTIFICIAL …","authors":["SK Mehra - Antitrust Law Journal, 2025"],"snippet":"… In ChatGPT's case, this included Common Crawl, an open-source collection of raw web page data, metadata extracts, and text extracts through automated harvesting.75 The need for such resources to train generative AI applications …","url":["https://search.proquest.com/openview/b242cfcd2e3e6e03343332bb17d03663/1?pq-origsite=gscholar&cbl=45248"]} {"year":"2025","title":"When Hallucination Costs Millions: Benchmarking AI Agents in High-Stakes Adversarial Financial Markets","authors":["Z Dai, Z Peng, Z Cheng, RY Li - arXiv preprint arXiv:2510.00332, 2025"],"snippet":"We present CAIA, a benchmark exposing a critical blind spot in AI evaluation: the inability of state-of-the-art models to operate in adversarial, high-stakes environments where misinformation is weaponized and errors are irreversible. While …","url":["https://arxiv.org/pdf/2510.00332"]} {"year":"2025","title":"When Language Shapes Thought: Cross-Lingual Transfer of Factual Knowledge in Question Answering","authors":["E Kang, J Kim - Proceedings of the 34th ACM International Conference …, 2025"],"snippet":"Multilingual large language models (LLMs) offer promising opportunities for cross-lingual information access, yet their use of factual knowledge remains highly sensitive to the input language. 
Prior work has addressed this through English prompting and …","url":["https://dl.acm.org/doi/abs/10.1145/3746252.3760807"]} {"year":"2025","title":"When natural language is not enough: The limits of in-context learning demonstrations in multilingual reasoning","authors":["L Ranaldi, B Haddow, A Birch","L Ranaldi, B Haddow, A Birch - Findings of the Association for Computational …, 2025"],"snippet":"Previous studies have demonstrated the effectiveness of reasoning methods in eliciting multi-step reasoned answers from Large Language Models (LLMs) by leveraging in-context demonstrations. These methods, exemplified by Chain-of-Thought …","url":["https://aclanthology.org/2025.findings-naacl.412.pdf","https://aclanthology.org/anthology-files/pdf/naacl/2025.naacl-findings.412.pdf"]} @@ -11298,10 +11551,457 @@ {"year":"2025","title":"WinoWhat: A Parallel Corpus of Paraphrased WinoGrande Sentences with Common Sense Categorization","authors":["I Gevers, V De Marez, L De Bruyne, W Daelemans - arXiv preprint arXiv:2503.23779, 2025"],"snippet":"In this study, we take a closer look at how Winograd schema challenges can be used to evaluate common sense reasoning in LLMs. Specifically, we evaluate generative models of different sizes on the popular WinoGrande benchmark. We …","url":["https://arxiv.org/pdf/2503.23779"]} {"year":"2025","title":"Without Burning the Servers Down","authors":["PO Suarez, T Vaughan, G Lindahl"],"snippet":"… - We use Exponential Backoff and Jitter to develop our own download client for Common Crawl, cc-downloader [8]. 
- We develop it in the rust programming language [9] and release pre-compiled binaries for Linux (ARM and x86-64), Mac (ARM …","url":["https://netpreserve.org/resources/WAC25_POSTER16_ORTIZ-SUAREZ-VAUGHAN-LINDHAL.pdf"]} {"year":"2025","title":"Word Embeddings in NLP","authors":["T UÇKAN, E KURT - PIONEER AND INNOVATIVE STUDIES IN COMPUTER …"],"snippet":"Word embeddings is a natural language processing (NLP) technology that represents words as multidimensional numerical arrays. This technology mathematically expresses the semantic and syntactic properties of words. For …","url":["https://www.allsciencesacademy.com/_files/ugd/13252f_2d4bfe34926e43e69c1d2fff7b74bf11.pdf#page=60"]} +{"year":"2025","title":"Workload-Aware Optimization for High-Throughput Log Analytics","authors":["L Zhang - 2025"],"snippet":"In modern system management, it is critical to collect and analyze large volumes of log data. Regular expressions (regex) are the norm in the industry for extracting information from these logs. However, neither database systems (DBMSs) nor log …","url":["https://search.proquest.com/openview/b1a1c6f460cbf5922f83d9652723f013/1?pq-origsite=gscholar&cbl=18750&diss=y"]} {"year":"2025","title":"WorthIt: Check-worthiness Estimation of Italian Social Media Posts","authors":["A Daffara, A Ramponi, S Tonelli - 2025"],"snippet":"Check-worthiness estimation is the first and a paramount task in the automated fact-checking pipeline. 
It allows professional fact-checkers to cope with the increasing amount of mis/disinformative textual content being published online by prioritizing claims that …","url":["https://clic2025.unica.it/wp-content/uploads/2025/09/34_main_long.pdf"]} +{"year":"2025","title":"WOWS-EVAL 2025: Efficient Baselines for Automated Relevance Label Transfer","authors":["D Alexander, M Fröbe, G Hendriksen - 2025"],"snippet":"This paper describes our submissions to the 2025 shared task on WOWS-EVAL that aims to automatically estimate the relevance of documents to a query given documents that are already known to be relevant to the query. In the end, we aim to …","url":["https://ceur-ws.org/Vol-4137/WOWS_2025_paper_6.pdf"]} {"year":"2025","title":"X-Transfer Attacks: Towards Super Transferable Adversarial Attacks on CLIP","authors":["H Huang, S Erfani, Y Li, X Ma, J Bailey - arXiv preprint arXiv:2505.05528, 2025"],"snippet":"As Contrastive Language-Image Pre-training (CLIP) models are increasingly adopted for diverse downstream tasks and integrated into large vision-language models (VLMs), their susceptibility to adversarial perturbations has emerged as a …","url":["https://arxiv.org/pdf/2505.05528"]} {"year":"2025","title":"xGen-small Technical Report","authors":["E Nijkamp, B Pang, E Pakhomov, A Gokul, J Qu… - arXiv preprint arXiv …, 2025"],"snippet":"We introduce xGen-small, a family of 4B and 9B Transformer decoder models optimized for long-context applications. 
Our vertically integrated pipeline unites domain-balanced, frequency-aware data curation; multi-stage pre-training with …","url":["https://arxiv.org/pdf/2505.06496"]} {"year":"2025","title":"XiHeFusion: Harnessing Large Language Models for Science Communication in Nuclear Fusion","authors":["X Wang, Q Yang, F Wang, Q Chen, W Wu, Y Jin… - arXiv preprint arXiv …, 2025"],"snippet":"… We have collected multi-source knowledge about nuclear fusion tasks to support the training of this model, including the common crawl, eBooks, arXiv, dissertation, etc. After the model has mastered the knowledge of the nuclear fusion field, we …","url":["https://arxiv.org/pdf/2502.05615"]} {"year":"2025","title":"xVLM2Vec: Adapting LVLM-based embedding models to multilinguality using Self-Knowledge Distillation","authors":["E Musacchio, L Siciliani, P Basile, G Semeraro - arXiv preprint arXiv:2503.09313, 2025"],"snippet":"… For example, XLM-ROBERTA was trained on 2.5TB of CommonCrawl data in 100 languages. However, embeddings for the same concept in different languages may not have the same dense representation since there is no constraint during training …","url":["https://arxiv.org/pdf/2503.09313?"]} {"year":"2025","title":"YORDEPAN (v. 1.0): A LIGHTWEIGHT ANNOTATION TOOL FOR YORÙBÁ DEPENDENCY TREEBANK CREATION","authors":["KO LAWAL"],"snippet":"We present YORDEPAN (v. 1.0), a lightweight Streamlit-based application originally designed for a per sentence annotation in a dependency treebank creation task, it can be adapted across various languages. As such, it is language-agnostic. 
The app …","url":["https://www.researchgate.net/profile/Kolawole-Lawal-5/publication/395833304_YORDEPAN_TOWARDS_YORUBA_DEPENDENCY_TREEBANK_CREATION/links/68f50895e7f5f867e6e0ec00/YORDEPAN-TOWARDS-YORUBA-DEPENDENCY-TREEBANK-CREATION.pdf"]} +{"year":"2025","title":"Yuan-TecSwin: A text conditioned Diffusion model with Swin-transformer blocks","authors":["S Wu, T Yu, S Wang, X Zhao - arXiv preprint arXiv:2512.16586, 2025"],"snippet":"Diffusion models have shown remarkable capacity in image synthesis based on their U-shaped architecture and convolutional neural networks (CNN) as basic blocks. The locality of the convolution operation in CNN may limit the model's ability …","url":["https://arxiv.org/pdf/2512.16586"]} {"year":"2025","title":"Z-Pruner: Post-Training Pruning of Large Language Models for Efficiency without Retraining","authors":["SB Bhuiyan, MSH Adib, MA Bhuiyan, MR Kabir… - arXiv preprint arXiv …, 2025"],"snippet":"… • C4: A large-scale, cleaned dataset derived from the Common Crawl web archive, totaling hundreds of gigabytes of English text. It filters out low-quality content, duplicates, and boilerplate, providing a diverse and highvolume source of real-world …","url":["https://arxiv.org/pdf/2508.15828"]} +{"year":"2026","title":"(Green)Washing the Trust: Climate Information and Banking Policies","authors":["S Di Paolo, D Liberati, L Rubeo"],"snippet":"… , we turned to Common Crawl, an open-access repository that regularly archives vast portions of the internet. 
Common Crawl stores petabytes … In our case, we therefore relied on Common Crawl only as a complementary source, to obtain the …","url":["https://www.bancaditalia.it/pubblicazioni/temi-discussione/2026/2026-1514/en_tema_1514.pdf"]} +{"year":"2026","title":"1-Bit Wonder: Improving QAT Performance in the Low-Bit Regime through K-Means Quantization","authors":["S Maskey, C Eichenberg, J Messner, D Orr - arXiv preprint arXiv:2602.15563, 2026"],"snippet":"Quantization-aware training (QAT) is an effective method to drastically reduce the memory footprint of LLMs while keeping performance degradation at an acceptable level. However, the optimal choice of quantization format and bit-width presents a …","url":["https://arxiv.org/pdf/2602.15563"]} +{"year":"2026","title":"2322—Or a contemporary pharmakon.","authors":["K Köth - reading time"],"snippet":"In recent months, people using ChatGPT and similar Generative Artificial Intelligence (GenAI) services as writing tools were haunted by one character that could potentially give away their copying and pasting of machine-generated text—the …","url":["https://creativecomplexity.com/notes/2322-or-a-contemporary-pharmakon"]} +{"year":"2026","title":"2Mamba2Furious: Linear in Complexity, Competitive in Accuracy","authors":["G Mongaras, EC Larson - arXiv preprint arXiv:2602.17363, 2026"],"snippet":"… This dataset is composed of over 15 Trillion clean and deduplicated tokens from CommonCrawl, a dataset of a crawl of webpages on the internet. 
As such, it is composed of various different types of data, from code, to random documents …","url":["https://arxiv.org/pdf/2602.17363"]} +{"year":"2026","title":"A 75‐Year Journey and the Diffusion Divide","authors":["JM Lavista Ferres - Degrees of Change: What AI Means for Education and …, 2026"],"snippet":"The concept of artificial intelligence (AI) traces its roots back to 1950, when Alan Turing published his famous paper, “Computing Machinery and Intelligence”, in which he asked the now‐legendary question, “Can machines think?” and proposed …","url":["https://onlinelibrary.wiley.com/doi/abs/10.1002/9781394413096.ch1"]} +{"year":"2026","title":"A Comparative Analysis of State-of-the-Art Multilingual Processing Systems","authors":["IKDA Batsi - Smart Computing and Systems: 9th Global Symposium …, 2026"],"snippet":"With over 7,000 languages spoken worldwide and the exponential growth of multilingual digital content, automatically processing multilingual doc-uments has become a crucial challenge. 
This article presents a comparative study of the state of …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=c82_EQAAQBAJ&oi=fnd&pg=PA47&dq=commoncrawl&ots=C_Ei4XJfxn&sig=btK_N4h93_bJMatutJBq7cEEcEk"]} +{"year":"2026","title":"A Comparative Evaluation of Professional Book Indexing Software: Capabilities, Limitations, and Future Directions","authors":["TM Devendrappa, BS Biradhar"],"snippet":"… [10] While the massive Common Crawl corpus is publicly accessible, it remains more affordable to crawl and index a smaller, application specific dataset than to operate a full sized general purpose index like Common Crawl motivating our …","url":["https://www.dline.info/ijis/fulltext/v18n1/ijisv18n1_1.pdf"]} +{"year":"2026","title":"A comparative study of composed image retrieval in remote sensing","authors":["A Petropoulos - 2026"],"snippet":"This work introduces a new approach to perform image retrieval in a remote sensing environment, a task called remote sensing composed image retrieval. This task combines a reference image with a textual description as a single query that …","url":["https://dspace.lib.ntua.gr/xmlui/bitstream/handle/123456789/63395/Athanasios_Petropoulos___Thesis.pdf?sequence=1"]} +{"year":"2026","title":"A Comparative Study of mBERT and IndicBERT for Natural Language Processing in Indic Languages","authors":["M Danish, H Liu, S Alshmrany - 2025 IEEE 7th International Conference on …, 2025"],"snippet":"… XLM and its improved variant, XLM-RoBERTa (XLM-R) [6], expanded training data beyond Wikipedia by incorporating corpora from CommonCrawl, significantly improving results on multilingual benchmarks such as XNLI. 
Nevertheless, these …","url":["https://ieeexplore.ieee.org/abstract/document/11325438/"]} +{"year":"2026","title":"A Comparative Study on Domain and Content-Based Approaches","authors":["QV Nguyen, TPQ Le, TRT Van Nam Pham - The 14th Conference on Information Technology …"],"snippet":"… 100 languages using filtered text data, derived from CommonCrawl [10]. Since we will benchmark on both English and Vietnamese, XLM-RoBERTa is fine-tuned as Domain (XLM-RoBERTa) , in which the raw domain is first tokenized using the BPE …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=_Z2lEQAAQBAJ&oi=fnd&pg=PA272&dq=commoncrawl&ots=p_tFX2RwOI&sig=2Wz6ZEHPQWZk-w4L3M-F339p_9s"]} +{"year":"2026","title":"A Consolidated Overview of Datasets and Benchmarks for Machine Learning and Computer Vision","authors":["T Schlosser - Machine Learning, 2026"],"snippet":"This manuscript provides a consolidated, practitioner-oriented overview of widely used datasets and benchmarks for machine learning (ML) and computer vision (CV). It aims to serve as a compact entry point for selecting suitable benchmarks …","url":["https://www.researchgate.net/profile/Tobias-Schlosser/publication/401495396_A_Consolidated_Overview_of_Datasets_and_Benchmarks_for_Machine_Learning_and_Computer_Vision/links/69a7131ed1599a2cb7f624f9/A-Consolidated-Overview-of-Datasets-and-Benchmarks-for-Machine-Learning-and-Computer-Vision.pdf"]} +{"year":"2026","title":"A Consumer-Centric Framework for Measuring Product Obsolescence Using User-Generated Content and Large Language Models: Evidence From IoT Devices","authors":["MA Nasrabadi, Y Beauregard, A Ekhlassi - IEEE TRANSACTIONS ON …, 2026"],"snippet":"Identifying product obsolescence factors is essential for guiding sustainable design and extending product longevity. Unlike prior studies, this research leverages online consumer reviews to explore product obsolescence factors. 
First, ChatGPT-4o, an …","url":["https://www.researchgate.net/profile/Amir-Ekhlassi/publication/398349099_A_consumer-centric_framework_for_measuring_product_obsolescence_using_user-generated_content_and_large_language_models_Evidence_from_IoT_devices/links/6949d31827359023a00f0dc2/A-consumer-centric-framework-for-measuring-product-obsolescence-using-user-generated-content-and-large-language-models-Evidence-from-IoT-devices.pdf"]} +{"year":"2026","title":"A Deep Learning-based Computational Framework for Detecting and Analysing Multimodal Online Coordinated Behaviour","authors":["G Zhu - 2026"],"snippet":"This thesis examines multimodal online coordinated behaviours and develops a computational detection framework that extends the traditionally limited understanding of such behaviours, while enabling the evaluation and application of …","url":["https://eprints.qut.edu.au/263305/2/Guangnan%20Zhu%20Thesis%281%29.pdf"]} +{"year":"2026","title":"A Device-Cloud Collaborative Data Enrichment Framework for Continual Learning on Devices","authors":["C Gong, Z Zheng, F Wu, G Chen - IEEE Transactions on Networking, 2026"],"snippet":"… Cloud servers typically possess extensive datasets sourced from various channels, such as public datasets released by organizations (eg ImageNet [42]), opensource data crawled from the Internet webs (eg Common Crawl [43]), crowdsourced data …","url":["https://ieeexplore.ieee.org/abstract/document/11419642/"]} +{"year":"2026","title":"A family of large language models for materials research with insights into model adaptability in continued pretraining","authors":["D Ahlawat, V Mishra, S Singh, M Zaki, V Bihani… - Nature Machine Intelligence, 2026"],"snippet":"… The RedPajama dataset 51 , which served as the primary training corpus for the LLaMA2 29 , encompasses diverse textual sources, including arXiv preprints, GitHub repositories, StackExchange discussions, Wikipedia articles and sanitized 
…","url":["https://www.nature.com/articles/s42256-026-01199-8"]} +{"year":"2026","title":"A Human-Centric Framework for Data Attribution in Large Language Models","authors":["A Wührl, M Ruckdeschel, K Lo, A Rogers - arXiv preprint arXiv:2602.10995, 2026"],"snippet":"… LLMs owe much to Common Crawl, a non-profit effort which explicitly decided against requiring attribution for their scraped data [156]. There are many instances of publishers objecting to the above practices, but licensing deals with AI companies …","url":["https://arxiv.org/pdf/2602.10995"]} +{"year":"2026","title":"A hybrid deep learning framework for multilingual depression detection and symptom classification from social media text","authors":["CH Shwetha, K Pushpalatha - International Journal of Information Technology, 2026"],"snippet":"… 237 million parameters), it is trained on diverse sources such as wikipedia, common crawl, news articles, and translation data, including transliterated and code-mixed text, making it effective for processing multilingual social media content [8]. …","url":["https://link.springer.com/article/10.1007/s41870-025-03055-1"]} +{"year":"2026","title":"A Hybrid Protocol for Large-Scale Semantic Dataset Generation in Low-Resource Languages: The Turkish Semantic Relations Corpus","authors":["E Tosun, ME Buldur, Ö Ezerceli, M ElHussieni - arXiv preprint arXiv:2601.13253, 2026"],"snippet":"We present a hybrid methodology for generating large-scale semantic relationship datasets in low-resource languages, demonstrated through a comprehensive Turkish semantic relations corpus. 
Our approach integrates three phases: (1) …","url":["https://arxiv.org/pdf/2601.13253"]} +{"year":"2026","title":"A language independent machine learning model for sentiment analysis and toxicity classification in code-mixed data","authors":["MJ Elizabeth, R Hazari - Natural Computing, 2026"],"snippet":"This paper introduces a cellular automata-based machine learning approach designed to solve sentiment analysis and toxicity classification tasks in code-mixed data. The complex behaviour of cellular automata plays a major role in generating …","url":["https://link.springer.com/article/10.1007/s11047-025-10062-5"]} +{"year":"2026","title":"A novel approach to automating context-driven alternative text generation through purposeful games","authors":["N Droutsas - 2026"],"snippet":"Accessibility of the Web is a pervasive issue, owing to the persistence of accessibility barriers (eg, poor navigation, lack of/unsuitable alternative text (alt text), complex web forms), with significant impact on users with disabilities. Alt text barriers …","url":["https://bura.brunel.ac.uk/bitstream/2438/32758/1/FulltextThesis.pdf"]} +{"year":"2026","title":"A Privacy by Design Framework for Large Language Model-Based Applications for Children","authors":["D Addae, D Rogachova, N Kahani, M Barati… - arXiv preprint arXiv …, 2026"],"snippet":"Children are increasingly using technologies powered by Artificial Intelligence (AI). However, there are growing concerns about privacy risks, particularly for children. Although existing privacy regulations require companies and organizations to …","url":["https://arxiv.org/pdf/2602.17418"]} +{"year":"2026","title":"A Survey of Domain-specific Fine-tuned Large Language Models","authors":["P Sarzaeim, A Azim, G Bauer, M Makrehchi - IEEE Access, 2026"],"snippet":"… The training dataset of GPT-3 was created by filtering CommonCrawl [30]. and adding reference corpora. 
GPT-3 exhibits impressive … Llama extracts training data from various sources, including CommonCrawl [26], C4 [34], GitHub, Wikipedia …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/11364219.pdf"]} +{"year":"2026","title":"A Survey of the Recent Foundation Models Based on Their Architectural and Learning Strategies","authors":["A Khan, A Ijaz, MH Tahir, A Rehman, A Asif, S Khan…"],"snippet":"… Pre-training and Fine-tuning: In the pre-training stage, a model learns generic representations by training on broad, diverse data sources for example, text from Wikipedia and Common Crawl for language models. This process equipped the …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.176834456.68708065"]} +{"year":"2026","title":"A Survey on Large Multimodal Models for 3D Vision and Scene Understanding","authors":["J Miao, B Han - Authorea Preprints, 2026"],"snippet":"The landscape of artificial intelligence has been profoundly transformed by the advent of Large Multimodal Models (LMMs), which integrate the superior language comprehension and generation capabilities of Large Language Models (LLMs) with …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.176784330.01281926"]} +{"year":"2026","title":"A survey on ordering of text at different granular levels: P. Tiwari, CR Chowdary","authors":["P Tiwari, CR Chowdary - Knowledge and Information Systems, 2026"],"snippet":"A well-ordered text is a crucial need for various models. 
The text ordering task has both direct and indirect influence on various tasks like concept to text, document modelling, essay scoring, linearization, machine translation, opinion generation of …","url":["https://link.springer.com/article/10.1007/s10115-025-02628-7"]} +{"year":"2026","title":"A Systematic Review of Natural Language Understanding-Related Challenges in Conversational Agent Development","authors":["GC Uzoaru, II Ayogu, JN Odii, AC Onyeka, ME Nwanga…"],"snippet":"… Common Crawl, can improve word embeddings that capture a wider range of semantic relationships [99]. Investigations have been conducted on the impact of utilizing a combination of the Wikipedia, Statmt News, UMBC, and Gigaword corpora …","url":["https://www.academia.edu/download/124492932/A_Systematic_Review_of_Natural_Language.pdf"]} +{"year":"2026","title":"A Typologically Grounded Evaluation Framework for Word Order and Morphology Sensitivity in Multilingual Masked LMs","authors":["A Feldman, L Barak, J Peng - 2026"],"snippet":"We introduce a typology-aware diagnostic for multilingual masked language models that tests reliance on word order versus inflectional form. Using Universal Dependencies, we apply inference-time perturbations: full token scrambling, content-word …","url":["https://par.nsf.gov/servlets/purl/10667910"]} +{"year":"2026","title":"AdaFRUGAL: Adaptive Memory-Efficient Training with Dynamic Control","authors":["QH Bui, AS Ta - arXiv preprint arXiv:2601.11568, 2025"],"snippet":"… We use VietVault [21], a high-quality 80GB corpus curated from Common Crawl dumps prior to 2023. The dataset has undergone extensive cleaning, deduplication, and filtering for toxic content. 
Training on this corpus for 200,000 steps allows us to …","url":["https://arxiv.org/pdf/2601.11568"]} +{"year":"2026","title":"Adaptation to Data Sparsity in Machine Translation and Large Language Models","authors":["W Lai - 2025"],"snippet":"Recent advances in natural language processing (NLP) and large language models (LLMs) have been largely driven by access to vast amounts of high-quality training data. However, data remains imbalanced across domains, languages, and stylistic …","url":["https://mediatum.ub.tum.de/doc/1783500/document.pdf"]} +{"year":"2026","title":"Adapting the Object Understanding Capabilities of Vision-Language Models for Improved Usability Around the World","authors":["K Buettner - 2025"],"snippet":"Artificial intelligence models have permeated society with impressive utility in tasks like object recognition and object detection. Model capabilities have strengthened through large-scale training with image-text data widely available on the Internet …","url":["https://search.proquest.com/openview/f199ee177133fa020f52a18ae1f3d764/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"Advancements in Attention Mechanisms for Neural Network Architectures","authors":["X Zhao, S Liu, J Xu, L Chen, H Wang, Y Li"],"snippet":"… The Dynamic Attention model achieves the highest accuracy of 90.1% and an F1 score of 0.90 on the Common Crawl dataset, indicating superior performance in large-scale text processing scenarios. 
Enhanced Self-Attention and Sparse Self-Attention …","url":["https://www.researchgate.net/profile/Yongjie-Li-6/publication/400819170_Advancements_in_Attention_Mechanisms_for_Neural_Network_Architectures/links/6992ddca7247bc6473e0ed8e/Advancements-in-Attention-Mechanisms-for-Neural-Network-Architectures.pdf"]} +{"year":"2026","title":"Advancements in Chatbot Technology: A Comprehensive Overview of ChatGPT","authors":["P Jain, A Tandon, B Agarwal - Generative AI Tools, 2026"],"snippet":"Whether unlocking your cellphone through face recognition or telling Alexa and Siri to play a song, artificial intelligence (AI) has filtered into our day-to-day lives. AI-powered chatbots have become a game-changer in various aspects of our lives, both …","url":["https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.1201/9781003476023-4&type=chapterpdf"]} +{"year":"2026","title":"Advancing Subjectivity Detection in Bengali News Articles Using Transformer Models with POS-Aware Features","authors":["MM Kabir, K Ahmed, MA Habib, MM Hoque"],"snippet":"… Bangla-BERT-1 is trained on the Bengali Common Crawl corpus using the base BERT architecture. We also include Bangla BERT-2, which is trained on a larger Bengali corpus and optimized for NLP tasks in Bengali. We chose XLMRoBERTa for …","url":["https://aclanthology.org/anthology-files/anthology-files/pdf/banglalp/2025.banglalp-1.11.pdf"]} +{"year":"2026","title":"Advancing the Pareto Frontier of Training Open Language Models","authors":["M Xia - 2026"],"snippet":"Large language models (LLMs) have transformed AI, but their training remains extraordinarily expensive and is dominated by proprietary systems whose data, code, and training recipes are often inaccessible. 
This combination of high compute cost …","url":["https://search.proquest.com/openview/afd71a8adf9606526abe86480502cde2/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"Adversarial attack detection in resource-constrained environments: A stable and sequential federated learning architecture with TinyLlama-1.1 B","authors":["SC Böcekçi, K Yıldız - 2026"],"snippet":"Large Language Models (LLMs) face training challenges on resource-constrained devices, and the performance losses caused by compression methods necessitate a ‘full fine-tuning’approach. In this study, a Mutex-based architecture is proposed for …","url":["https://www.researchsquare.com/article/rs-8613070/latest.pdf"]} +{"year":"2026","title":"AfroScope: A Framework for Studying the Linguistic Landscape of Africa","authors":["SY Kwon, AR Elmadany, M Abdul-Mageed - arXiv preprint arXiv:2601.13346, 2026"],"snippet":"Language Identification (LID) is the task of determining the language of a given text and is a fundamental preprocessing step that affects the reliability of downstream NLP applications. While recent work has expanded LID coverage for African …","url":["https://arxiv.org/pdf/2601.13346"]} +{"year":"2026","title":"AI and Low-Resource Languages: Bridging the Linguistic Divide","authors":["NS Mullah, WMNW Zainon - Reshaping Language and Cognition in Education …, 2026"],"snippet":"Artificial intelligence (AI) is rapidly transforming global communication, learning, and access to services, yet its benefits remain concentrated in high-resource languages, leaving most low-resource languages (LRLs) digitally marginalised. 
This chapter …","url":["https://www.igi-global.com/chapter/ai-and-low-resource-languages/403448"]} +{"year":"2026","title":"AI FOR DYNAMIC PASSENGER INFORMATION IN PUBLIC TRANSPORT","authors":["T Järvenpää, J Savia - 2025"],"snippet":"Artificial intelligence methods are being increasingly considered for new application domains, including passenger information in public transport. This master’s thesis examines this intersection of topics through a narrative review–to connect possible …","url":["https://trepo.tuni.fi/bitstream/handle/10024/232099/KarlssonLotte.pdf?sequence=2"]} +{"year":"2026","title":"AI for Mathematics: Progress, Challenges, and Prospects","authors":["H Ju, B Dong - arXiv preprint arXiv:2601.13209, 2026"],"snippet":"… During pre-training, filtering pipelines [107,141] extract high-quality mathematical content from web corpora (eg, Common Crawl), textbooks, and research papers to maximize domain relevance. Post-training refines these models using supervised …","url":["https://arxiv.org/pdf/2601.13209"]} +{"year":"2026","title":"AI in Research: Accuracy, Originality, and Efficiency Reconsidered","authors":["D Azisllari, T Amaral - Meta-Research on Artificial Intelligence in Research …"],"snippet":"This systematic literature review examines the ethical implications, reliability, and authorship concerns surrounding Large Language Models (LLMs) in academic research. Through thematic analysis of peer-reviewed literature, this study identifies …","url":["https://repositori.upf.edu/bitstreams/93e8c76c-cf46-471d-8962-5c1a7215afc8/download#page=61"]} +{"year":"2026","title":"AI pirated my art and birthed infringing works, and other metaphors that confound copyright law","authors":["MD Murray - Akron Law Review, 2025"],"snippet":"The wondrous and beguiling technology of generative artificial intelligence (“generative AI”) 2 leaves us at a loss for words as to how to understand or even describe it. 
3 But in an effort to fill the gap we rush to assign metaphors and analogies to the complex …","url":["https://ideaexchange.uakron.edu/cgi/viewcontent.cgi?article=2602&context=akronlawreview"]} +{"year":"2026","title":"AI Regulation, Copyright, and Data Mining: A Critical Analysis of the Brazilian Proposal","authors":["B Khan - IIC-International Review of Intellectual Property and …, 2026"],"snippet":"… Common Crawl content is the basis for many of the most widely used AI training datasets, which are then used in the development of large language models (LLMs). Footnote 24 However Common Crawl … to dataset developers like Common Crawl …","url":["https://link.springer.com/article/10.1007/s40319-025-01672-8"]} +{"year":"2026","title":"AI Sensemaking and Adaptation Through Knowledge Workers' Career Transition Narratives","authors":["L Li - 2025"],"snippet":"With advancing capabilities exhibited by generative AI systems, there have been many competing narratives about the occupational implications that they hold for knowledge workers–those whose work is composed primarily of cognitive labor …","url":["https://search.proquest.com/openview/4544d73a9a0935be91d23b0b4e2e758e/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"AI-driven risk estimation: a GPT-based approach to news monitoring for manufacturing resilience","authors":["A Jacob, A Ben Achour, U Teicher - The International Journal of Advanced …, 2026"],"snippet":"… Furthermore, \"news-please\" can access the Common Crawl news archive, which is used in this study to evaluate historical real news [22]. 
… The dataset was downloaded from the Common Crawl archive in a HTML-like format, filtered by URL …","url":["https://link.springer.com/article/10.1007/s00170-025-17260-6"]} +{"year":"2026","title":"AI-Generated Creativity and the Law","authors":["PH Originality, AI Fears - 2026"],"snippet":"The legal inferences of AI-driven change of copyrighted works advance complicated questions under intellectual property and copyright law including the boundary between conversion in use or infringement. AI-generated modifications from …","url":["https://www.igi-global.com/viewtitle.aspx?titleid=400874"]} +{"year":"2026","title":"AI-Generated Creativity and the Law: Protecting Human Originality Amid Imposter Fears","authors":["B Chouhan - Imposter Syndrome and AI: Navigating Human Identity …, 2026"],"snippet":"The legal inferences of AI-driven change of copyrighted works advance complicated questions under intellectual property and copyright law including the boundary between conversion in use or infringement. AI-generated modifications from …","url":["https://www.igi-global.com/chapter/ai-generated-creativity-and-the-law/400874"]} +{"year":"2026","title":"AI-powered open-source infrastructure for accelerating materials discovery and advanced manufacturing","authors":["M Salas, A Singh, C Pignataro, L Pal - Communications Materials, 2026"],"snippet":"Recent advances in artificial intelligence (AI) offer significant opportunities to drive industrial transformation by addressing growing societal demands for products, techno-economic efficiency, and reduced carbon footprints. 
This review presents a …","url":["https://www.nature.com/articles/s43246-026-01105-0"]} +{"year":"2026","title":"Al-Generated Creativity","authors":["PH Originality, AI Fears - Imposter Syndrome and AI: Navigating Human Identity …, 2026"],"snippet":"The legal inferences of AI-driven change of copyrighted works advance complicated questions under intellectual property and copyright law including the boundary between conversion in use or infringement. AI-generated modifications from …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=qsu8EQAAQBAJ&oi=fnd&pg=PA27&dq=commoncrawl&ots=I3j3Hgf1DL&sig=tNO31S3v02MyzPPlsD_4Yv7RyaQ"]} +{"year":"2026","title":"ALIBIS: Assessing and mitigating the risk of sensitive metadata Leakage In moBile Image Sharing","authors":["TTL Nguyen, B Carminati, E Ferrari - Pervasive and Mobile Computing, 2026"],"snippet":"Smartphones have become necessary in modern life and can replace traditional devices like cameras. The high demand for taking and sharing photos via smartphones, especially with the explosion of social networks and instant messaging …","url":["https://www.sciencedirect.com/science/article/pii/S157411922600012X"]} +{"year":"2026","title":"An actionable framework for AI‐ready data","authors":["N Majithia, T Carey‐Wilson, E Simperl, N Shadbolt - AI Magazine, 2026"],"snippet":"Data is the foundation of AI. Poor‐quality data drive up costs and can lead to hidden problems for AI models, especially in complex fields such as healthcare and manufacturing. 
Meanwhile, biased data negatively affect the performance of AI …","url":["https://onlinelibrary.wiley.com/doi/pdf/10.1002/aaai.70054"]} +{"year":"2026","title":"An AI Tool for Text-to-Image Generation Using Stable Diffusion Model","authors":["S Chowdhury, S Mukherjee, S Moitra, SK Dhar… - … on Web 6.0 and Industry 6.0: WIN …"],"snippet":"This paper presents the development and evaluation of a Text-to-Image AI Generator web application, leveraging advanced AI techniques, with a partic-ular focus on a custom-built Stable Diffusion model, to transform textual prompts into …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=4oClEQAAQBAJ&oi=fnd&pg=PA246&dq=commoncrawl&ots=52c1MMmSNI&sig=bylwnrqadCd7c_Wdo11Kk2DZ4XE"]} +{"year":"2026","title":"An effective convolutional cross-lingual language model for Vietnamese online social media mining","authors":["TM Nguyen, TM Nguyen, TV Huynh, KV Nguyen - International Journal of Machine …, 2026"],"snippet":"… Pre-trained on an extensive 2.5TB dataset from filtered CommonCrawl across 100 languages, including Vietnamese, XLM-RoBERTa provides rich contextual embeddings that capture sentence-level characteristics. 
The significance of …","url":["https://link.springer.com/article/10.1007/s13042-025-02904-6"]} +{"year":"2026","title":"An Examination of Ethics When Using ChatGPT","authors":["BV Ailes - 2026"],"snippet":"With the general population’s recent and dramatic increase in the frequency of ChatGPT and other similar Artificial Intelligence Generated Content (AIGC) usage throughout various industries, gray areas are becoming more prominent regarding …","url":["https://search.proquest.com/openview/057b942798d8389381fb415f58973b77/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"An Information Theoretic Perspective on Agentic System Design","authors":["S He, A Narayan, IS Khare, SW Linderman, C Ré… - arXiv preprint arXiv …, 2025"],"snippet":"Agentic language model (LM) systems power modern applications like \"Deep Research\" and \"Claude Code,\" and leverage multi-LM architectures to overcome context limitations. Beneath their apparent diversity lies a recurring pattern: smaller \"compressor\" …","url":["https://arxiv.org/pdf/2512.21720"]} +{"year":"2026","title":"An Ontology of Representations: Limits of Universality","authors":["M Stakenborg"],"snippet":"Talk announcement: I will be presenting the arguments in this post and the post on (re) discovering natural laws on Tuesday 17 February at 18: 00 GMT/10: 00 PT, as part of the closing of the Dovetail Fellowship. If you'd like to discuss these ideas live, you're …","url":["https://forum.effectivealtruism.org/posts/zpbwznJLLDXrynr8L/an-ontology-of-representations-limits-of-universality"]} +{"year":"2026","title":"Analysis of the discovery and harvesting of websites by crawler SpiderLing","authors":["M KOSTKA"],"snippet":"… external reference crawler, specifically CommonCrawl. 
The focus is on the number of webpages Spiderling v2 discovers and downloads compared to CommonCrawl, which URLs are systematically … It is not the aim of this thesis to …","url":["https://is.muni.cz/th/l6rwo/DP_Spiderling.pdf"]} +{"year":"2026","title":"Analyzing LLM Safety Circumvention via Multilingual Euphemistic Coating","authors":["SH Gil, JS Kim, S Lee - 2026 IEEE International Conference on Big Data and …, 2026"],"snippet":"… First, in language selection, MultiJail and LinguaSafe primarily follow availability as reflected in Common Crawl, whereas we select eight languages based on W3Techs’ statistics on website content language usage to better reflect exposure …","url":["https://ieeexplore.ieee.org/abstract/document/11397027/"]} +{"year":"2026","title":"Application of Explainable Artificial Intelligence (XAI) to detect anomalies in computer networks through the creation of images from data packets","authors":["H Junior, PR Galego - 2025"],"snippet":"With the increasing complexity of cyber threats, understanding how artificial intelligence models make decisions has become a crucial step in improving trust and transparency. 
This work presents a methodology that transforms network traffic …","url":["https://repositorio.unesp.br/bitstreams/372e4751-edd0-4294-8325-11057ea44d7f/download"]} +{"year":"2026","title":"AR-Omni: A Unified Autoregressive Model for Any-to-Any Generation","authors":["D Cheng, R Yuan, Y Li, R You, W Wang, L Nie… - arXiv preprint arXiv …, 2026"],"snippet":"… Content and source: Web image URLs and associated alternative text collected from Common Crawl … Content and source: High-quality web text derived from Common Crawl through an efficient filtering and verification pipeline applied to …","url":["https://arxiv.org/pdf/2601.17761"]} +{"year":"2026","title":"ARCHITECTURES DERIVED FROM THE TRANSFORMER MODEL","authors":["H ÇELİK, R KATIRCI"],"snippet":"… It is pretrained on a large-scale Common Crawl–based corpus (mC4) covering 101 languages, enabling it to perform a wide range of multilingual NLP tasks within a unified framework. The model maintains the same architecture and span …","url":["https://www.researchgate.net/profile/Hilal-Celik-10/publication/399523113_ARCHITECTURES_DERIVED_FROM_THE_TRANSFORMER_MODEL/links/695dfac8a1fd017989119763/ARCHITECTURES-DERIVED-FROM-THE-TRANSFORMER-MODEL.pdf"]} +{"year":"2026","title":"Artificial intelligence is creating a new global linguistic hierarchy","authors":["G Occhini, K Tanaka-Ishii, A Barford, R Tikochinski… - arXiv preprint arXiv …, 2026"],"snippet":"… Conversely, PC2 is strongly characterized by high positive loadings for AI technology availability, with the highest values observed for the volume of data on CommonCrawl (loading =0.37), the number of datasets on HuggingFace (loading =0.37) …","url":["https://arxiv.org/pdf/2602.12018"]} +{"year":"2026","title":"Asian and Low-Resource Language Information Processing","authors":["KB Nelatoori, AK Sahagal, HB Kommanti, H Sharma… - ACM Transactions on, 2026"],"snippet":"… outperformed m-BERT in a number of downstream multi-lingual tests due to the training on more than two 
terabytes of CommonCrawl data. This model is also called as XLMR. — MuRIL24: A recently released BERT model called MuRIL (Multilingual …","url":["https://dl.acm.org/doi/pdf/10.1145/3759466"]} +{"year":"2026","title":"Assessing the Impact of Typological Features on Multilingual Machine Translation in the Age of Large Language Models","authors":["V Hirak, J Jumelet, A Bisazza - arXiv preprint arXiv:2602.03551, 2026"],"snippet":"… While this is a very rough approximation of what our evaluated models were exposed to, we assume that large disparities within CommonCrawl will correlate overall with large disparities of language presence on the Web, which provides the …","url":["https://arxiv.org/pdf/2602.03551"]} +{"year":"2026","title":"Assessing the quality and coherence of word embeddings after SCM-based intersectional bias mitigation","authors":["E Kocadag, SSM Ziabari, AMM Alsahag"],"snippet":"Static word embeddings often absorb social biases from the text they learn from, and those biases can quietly shape downstream systems. Prior work that uses the Stereotype Content Model (SCM) has focused mostly on single-group bias along …","url":["https://www.researchgate.net/profile/Seyed-Sahand-Mohammadi-Ziabari/publication/399552686_Assessing_word_embeddings_after_SCM-based_bias_mitigation_Assessing_the_quality_and_coherence_of_word_embeddings_after_SCM-based_intersectional_bias_mitigation/links/695eca82a1fd01798911e5af/Assessing-word-embeddings-after-SCM-based-bias-mitigation-Assessing-the-quality-and-coherence-of-word-embeddings-after-SCM-based-intersectional-bias-mitigation.pdf"]} +{"year":"2026","title":"Assessment of the State-of-the-art Benchmarks Used to Evaluate Social Reasoning and Theory of Mind in LLMs","authors":["L HORNÍKOVA"],"snippet":"Large language models have demonstrated impressive performance on various downstream tasks, hinting at possible reasoning capabilities. 
To assess whether these models can reason beyond surface-level inference, various benchmarks have …","url":["https://is.muni.cz/th/f0evs/Assessment_of_the_State-of-the-art_Benchmarks_Used_to_Evaluate_Social_Reasoning_and_Theory_of_Mind_in_LLMs_Archive.pdf"]} +{"year":"2026","title":"AttenMIA: LLM Membership Inference Attack through Attention Signals","authors":["P Zaree, MAA Mamun, Y Dong, I Alouani… - arXiv preprint arXiv …, 2026"],"snippet":"… Following established extraction protocols, we construct an evaluation corpus of short natural-language prefixes sampled from Internet-scale text (Common Crawl). Prefixes are 5–10 tokens long and are used to prompt GPT-2 to generate 256token …","url":["https://arxiv.org/pdf/2601.18110"]} +{"year":"2026","title":"Attention Amplification in Multilingual LLMs: Why Script Representation Matters","authors":["Y Mishra, S Mishra - 2026"],"snippet":"Modern Large Language Models (LLMs) inherently exhibit a profound architectural bias toward English and other Latin-script languages, inadvertently erecting a severe “script barrier” for the vast majority of the world’s linguistic diversity. This …","url":["https://www.researchsquare.com/article/rs-8959575/latest.pdf"]} +{"year":"2026","title":"Audio Driven Detection of Hate Speech in Telugu: Toward Ethical and Secure CPS","authors":["M Santhosh Kumar, P Sai Ravula, M Prasanna Teja… - Reliability in Cyber-Physical …, 2026"],"snippet":"… XLM RoBERTa is a RoBERTa-based multilingual model that was trained at 2.5 TB of multilingual data collected from CommonCrawl. 
XLM-R performs better cross-lingual generalization than mBERT, and also has larger model capacity, making it a strong …","url":["https://link.springer.com/chapter/10.1007/978-3-032-09917-4_3"]} +{"year":"2026","title":"Automated item‐level measures of verbal fluency in semantic and logopenic primary progressive aphasia","authors":["JMJ Vonk, FJ Ferrante, BT Morin, DA Rodriguez, M Lin… - Alzheimer's & Dementia, 2026"],"snippet":"INTRODUCTION Verbal fluency tasks are widely used in primary progressive aphasia (PPA), but most studies rely only on total correct responses, overlooking qualitative features of the words produced. We applied a scalable computational …","url":["https://alz-journals.onlinelibrary.wiley.com/doi/pdf/10.1002/alz.71124"]} +{"year":"2026","title":"Autonomous Data Processing using Meta-Agents","authors":["U Khurana - arXiv preprint arXiv:2602.00307, 2026"],"snippet":"… This work demonstrates the feasibility of using LLMs to prepare training data at scale, including Common Crawl and specific target websites. Research on the interaction between data cleaning and automated machine learning [13] reveals that …","url":["https://arxiv.org/pdf/2602.00307"]} +{"year":"2026","title":"Autopoietic Invocation Syntax","authors":["MS Andrade - 2025"],"snippet":"This book is a commercial publication and may not be reproduced, distributed, or transmitted in any form or by any means, including photocopying, recording, or other electronic or mechanical methods, without the prior written permission of the …","url":["https://www.radegenbio.com/uploads/b/f3ab1930-8e25-11ed-ac2a-cd934c2005a8/dfc5dcb0-c358-11f0-9546-9be709c8fb13.pdf"]} +{"year":"2026","title":"AXE: Low-Cost Cross-Domain Web Structured Information Extraction","authors":["A Mansour, KW Alshaer, M Elsaban - arXiv preprint arXiv:2602.01838, 2026"],"snippet":"… To get the diversity of data that is needed, we used Common Crawl to get a random set of 10k pages. 
Then we extracted different heuristics from each page. Using these heuristics, we performed deduplication by clustering those pages into 1,000 pages. …","url":["https://arxiv.org/pdf/2602.01838"]} +{"year":"2026","title":"Bagpiper: Solving Open-Ended Audio Tasks via Rich Captions","authors":["J Tian, H Wang, BH Su, C Huang, Q Wang, J Shi… - arXiv preprint arXiv …, 2026"],"snippet":"… For text-only data, we use a broad Dolma3 ingredient122 mixture (domainbalanced CommonCrawl high-quality subsets, plus code/math/reasoning/STEM-centric subsets) together with dialogue-style instruction corpora (llama-nemotron23, Olmo324). …","url":["https://arxiv.org/pdf/2602.05220"]} +{"year":"2026","title":"Balancing Innovation and Responsibility: A S-CORE Framework Analysis of GPT-4's Ethical and Societal Impact","authors":["H Wanga"],"snippet":"… The model's training via large-scale scraping of publicly available data (eg, Common Crawl) operates in … An analysis of undesirable content in the Common Crawl corpus. arXiv preprint arXiv:… An analysis of undesirable content in the …","url":["https://www.researchgate.net/profile/Herbert-Wanga-2/publication/399233538_International_Journal_of_Research_Publication_and_Reviews_Balancing_Innovation_and_Responsibility_A_S-CORE_Framework_Analysis_of_GPT-4's_Ethical_and_Societal_Impact/links/6955252b27359023a01233fa/International-Journal-of-Research-Publication-and-Reviews-Balancing-Innovation-and-Responsibility-A-S-CORE-Framework-Analysis-of-GPT-4s-Ethical-and-Societal-Impact.pdf"]} +{"year":"2026","title":"Benchmarking Vision-Language Models for French PDF-to-Markdown Conversion","authors":["B Rigal, V Dupriez, A Mignon, RL Hy, N Mery - arXiv preprint arXiv:2602.11960, 2026"],"snippet":"This report evaluates PDF-to-Markdown conversion using recent Vision-Language Models (VLMs) on challenging French documents. 
Document parsing is a critical step for Retrieval-Augmented Generation (RAG) pipelines, where transcription and …","url":["https://arxiv.org/pdf/2602.11960"]} +{"year":"2026","title":"BERT based sentiment analysis of consumer hesitancy toward solar energy adoption","authors":["A Jabbar, J Yuan, AR Al-Shamasneh, S Rekik - Scientific Reports, 2026"],"snippet":"The adoption of solar energy is pivotal in addressing climate change and achieving long-term energy security. However, its widespread deployment faces notable barriers, including high upfront costs, consumer doubts about system reliability, and …","url":["https://www.nature.com/articles/s41598-026-38604-6_reference.pdf"]} +{"year":"2026","title":"Beyond a Single Extractor: Re-thinking HTML-to-Text Extraction for LLM Pretraining","authors":["J Li, J Gardner, D Kang, F Shi, K Singh, CL Li… - arXiv preprint arXiv …, 2026"],"snippet":"… While standard practice applies a single extractor to all of Common Crawl, we show this leads to suboptimal data yield—using multiple extractors in parallel can significantly increase yield while maintaining performance. We further show that …","url":["https://arxiv.org/pdf/2602.19548"]} +{"year":"2026","title":"Beyond Divergent Creativity: A Human-Based Evaluation of Creativity in Large Language Models","authors":["K Nakajima, J Zuiderveld, S Pezzelle - arXiv preprint arXiv:2601.20546, 2026"],"snippet":"Large language models (LLMs) are increasingly used in verbal creative tasks. However, previous assessments of the creative capabilities of LLMs remain weakly grounded in human creativity theory and are thus hard to interpret. 
The widely used …","url":["https://arxiv.org/pdf/2601.20546"]} +{"year":"2026","title":"Beyond Machine Perception: AI Urban Imagination","authors":["DN del Castillo, I Neri"]} +{"year":"2026","title":"Beyond Many-Shot Translation: Scaling In-Context Demonstrations For Low-Resource Machine Translation","authors":["LF Salim, E Carlin, A Morinvil, X Ai, LW Ku - arXiv preprint arXiv:2602.04764, 2026"],"snippet":"Building machine translation (MT) systems for low-resource languages is notably difficult due to the scarcity of high-quality data. Although Large Language Models (LLMs) have improved MT system performance, adapting them to lesser-represented …","url":["https://arxiv.org/pdf/2602.04764"]} +{"year":"2026","title":"Beyond neutrality: a comprehensive approach of religious bias in large language models","authors":["KA Hossain, JS Mahmud, MH Tuli, A Mitra - 2025"],"snippet":"While recent developments in large language models have improved bias detection and classification, sensitive subjects like religion still present challenges because even minor errors can result in severe misunderstandings. In particular, multilingual …","url":["https://dspace.bracu.ac.bd:8443/xmlui/bitstream/handle/10361/27421/21201496%2C%2022101698%2C%2022101788%2C%2022101426_CSE.pdf?sequence=1&isAllowed=y"]} +{"year":"2026","title":"BEYOND THE CARDIGAN: UNPACKING BIAS IN CHAT GPT'S VISUALISATIONS OF LIBRARY SPACES AND STAFF","authors":["K Oddone, DHR Spennemann - Access (Online), 2025"],"snippet":"Using ChatGPT, we prompted the generation of images depicting two librarians, or a librarian with a client, set against the interior of a library in school, pub-lic and university contexts. 
Data collection was carried out using ChatGPT-4o, which …","url":["https://search.proquest.com/openview/9c284d3e2d9626c87c503aaecfd968f5/1?pq-origsite=gscholar&cbl=2032617"]} +{"year":"2026","title":"Beyond the Unit Hypersphere: Embedding Magnitude in Contrastive Learning","authors":["X Feng, T Watanabe - arXiv preprint arXiv:2602.09229, 2026"],"snippet":"… In contrast, E5’s pre-training corpus (CCPairs) explicitly includes web pages from the MS-MARCO document ranking corpus through Common Crawl. This data overlap has important implications for our fine-tuning experiments, as discussed in Section 3. …","url":["https://arxiv.org/pdf/2602.09229"]} +{"year":"2026","title":"Bias Detection in Cultural Heritage Metadata: Preliminary Results from the IMAGES Project","authors":["A Oddi, G Romagna, R Rasconi, P Panarese… - 2025"],"snippet":"This paper presents early findings from a pilot study within IMAGES (Inclusive Machine Learning Using Art and Culture for Tackling Gender and Ethnicity Stereotypes), a PRIN PNRR interdisciplinary project investigating the role of artificial …","url":["https://ceur-ws.org/Vol-4147/paper14.pdf"]} +{"year":"2026","title":"Bielik 11B v3: Multilingual Large Language Model for European Languages","authors":["K Ociepa, R Kinas, K WrĂłbel, A GwoĹşdziej - arXiv preprint arXiv:2601.11579, 2025"],"snippet":"We present Bielik 11B v3, a state-of-the-art language model highly optimized for the Polish language, while also maintaining strong capabilities in other European languages. 
This model extends the Mistral 7B v0.2 architecture, scaled to 11B …","url":["https://arxiv.org/pdf/2601.11579"]} +{"year":"2026","title":"Big data analytics framework for defense strategic intelligence and decision support systems","authors":["RI Adha, A Mardamsyah, KI Phatoni - Journal of Defense Technology and …, 2026"],"snippet":"The contemporary defense environment faces rapidly evolving threats, vast heterogeneous data, and linguistic diversity, creating significant challenges for timely and accurate intelligence analysis. This study aims to develop an integrated …","url":["https://research.idu.ac.id/index.php/jdte/article/download/56/9"]} +{"year":"2026","title":"Bilingual fake-news detection in low-resource media: A Transformer-based framework for Nepali–English content","authors":["P Ghimire, P Shrestha - Journal of Innovations in Engineering Education, 2025"]} +{"year":"2026","title":"Brain-Inspired Foundation Models for Data Efficiency","authors":["Z Wu - 2025"],"snippet":"Foundation models (FMs), including large language models (LLMs) and vision–language models (VLMs), have led to remarkable progress in natural language processing (NLP) and computer vision (CV). However, their success depends on large-scale, curated …","url":["https://search.proquest.com/openview/301d68b402b62785032231ebafd56ac5/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"Breaking Bias: A Context-Aware Multimodal Framework for Detecting and Neutralizing Ideological Bias in News","authors":["A Mitra, R Majumdar, R Das, A Saha, J Paul, S Mondal… - ACM Transactions on …, 2026"],"snippet":"Media bias, particularly bias by word choice, influences public perception and fosters polarized clusters of opinion. Biased word choices, often subtle and context-independent, erode trust and often jeopardize social framework. 
Detecting and neutralizing such …","url":["https://dl.acm.org/doi/pdf/10.1145/3789258"]} +{"year":"2026","title":"Breaking Size Barrier: Enhancing Reasoning for Large-Size Table Question Answering","authors":["X Wu, D Liang, J Yang, X Cheng, LZ Chai, T Li, L Yang… - International Conference on …, 2026"],"snippet":"… Ultimately, We collect 576 tables from the Common Crawl subset of Tablib, covering diverse topics such as finance, competition, sports, and science. To better explore the impact of table size on reasoning tasks in a single dimension, we …","url":["https://link.springer.com/chapter/10.1007/978-981-95-3830-0_16"]} +{"year":"2026","title":"Breaking the Single-Reference-Vector Barrier in Approximate Nearest Neighbor Search","authors":["J Xie, J Liang, S Teng, JX Yu, Y Liu - 2026"],"snippet":"Approximate nearest neighbor (ANN) searches are commonly employed in various machine learning applications, such as recommendation systems, but traditional ANN searches typically involve only a single reference vector in a query. To broaden …","url":["https://xiejiadong.github.io/files/paper/%5Bwww26%5Dmulti-vector.pdf"]} +{"year":"2026","title":"BYOL: Bring Your Own Language Into LLMs","authors":["SW Zamir, W Hamidouche, BB Amor, L Marotti… - arXiv preprint arXiv …, 2026"],"snippet":"… While over 7,000 languages are spoken worldwide1, only a small fraction dominates the web (eg, ∼90% of Common Crawl text comes from just twenty languages2). As generative AI increasingly becomes a general-purpose technology …","url":["https://arxiv.org/pdf/2601.10804"]} +{"year":"2026","title":"Can Good Writing Be Generative? Expert-Level AI Writing Emerges through Fine-Tuning on High-Quality Books","authors":["T Chakrabarty, PS Dhillon - arXiv preprint arXiv:2601.18353, 2026"],"snippet":"… GPT-3 [7] one of the first large language models was trained on the infamous BooksCorpus4 in addition to Common Crawl and WebText which consisted of many books. 
The first Llama model [63] states its book sources are Project Gutenberg5 …","url":["https://arxiv.org/pdf/2601.18353"]} +{"year":"2026","title":"Can Small Training Runs Reliably Guide Data Curation? Rethinking Proxy-Model Practice","authors":["JT Wang, T Wu, K Lyu, J Zou, D Song, R Jia, P Mittal - arXiv preprint arXiv …, 2025"],"snippet":"… They all originate from Common Crawl but vary in curation strategies such as data filtering criteria and deduplication algorithms. (3) Scoring-based data filter. We construct 6 data recipes using different mixing ratios of head-middle versus tail …","url":["https://arxiv.org/pdf/2512.24503"]} +{"year":"2026","title":"CellularSpecSec-Bench: A Staged Benchmark for Evidence-Grounded Interpretation and Security Reasoning over 3GPP Specifications","authors":["K Xie, X Zhao, Y Hu, S Yuan, T Xie - arXiv preprint arXiv:2601.12716, 2026"],"snippet":"Cellular networks are critical infrastructure supporting billions of worldwide users and safetyand mission-critical services. Vulnerabilities in cellular networks can therefore cause service disruption, privacy breaches, and broad societal harm …","url":["https://arxiv.org/pdf/2601.12716"]} +{"year":"2026","title":"Characterizing Democratic Biases in AI-Powered Participatory Democracy Platforms","authors":["M Berriche, S Hafid, A Mogoutov, JP Cointet - 2026"],"snippet":"The “biases” of artificial intelligence systems have sparked numerous controversies that have made headlines in recent years: think of sexist biases in automatic translation tools or racist biases in decision-support systems used in courts. The …","url":["https://hal.science/hal-05531294/document"]} +{"year":"2026","title":"Classifying Telugu Sentiments Using Transformed-Based Methods","authors":["E Francis, P Praveen - 2026 7th International Conference on Mobile …, 2026"],"snippet":"… It acts as an enhanced version of the previous XLM-100 framework and was developed using around 2.5TB of text sourced from Common Crawl data. 
Documents were used to train the cross-lingual model that encodes sentences. One hundred languages. …","url":["https://ieeexplore.ieee.org/abstract/document/11412850/"]} +{"year":"2026","title":"Climate news mediates extreme weather effects on climate change concern","authors":["J Peisker, R Hoffmann, R Muttarak - Climate Risk Management, 2026"],"snippet":"As the severe impacts of climate change become increasingly apparent, concerns about climate-related issues have grown in recent years. The news media plays an important role in disseminating information about climate change and its …","url":["https://www.sciencedirect.com/science/article/pii/S2212096326000197"]} +{"year":"2026","title":"Clustering Clinical Notes in Spanish Based","authors":["QJ Moreno-Lara¹, I Román-Godínez¹, SR Santos-Arce¹ - … of CNIB 2025, September 11–13 …"],"snippet":"Clinical notes are essential to documenting patient care from initial evaluations in admission notes to care outcomes in discharge summaries. These documents often contain incomplete or inconsistent information, affecting care continuity and clinical …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=EhuzEQAAQBAJ&oi=fnd&pg=PA156&dq=commoncrawl&ots=x25EPpqz7s&sig=tPKc5UTXxRuthFGVtgdSOFEPk-w"]} +{"year":"2026","title":"Cognitive salience features enhance multitask deep learning for pragmatic reasoning across cultures","authors":["M Qi, YN Thai - Scientific Reports, 2026"],"snippet":"Cross-cultural pragmatic reasoning requires computational systems to interpret implicit meanings and culturally-specific communication patterns beyond literal semantic comprehension. 
This study proposes a cognitive salience feature-driven …","url":["https://www.nature.com/articles/s41598-026-40809-8_reference.pdf"]} +{"year":"2026","title":"Cognitively Tuned AI Bots for Linguistic Equity: A Neuro-Symbolic Framework for Policy-Aligned Language Learning Systems","authors":["SI Ali"],"snippet":"… First, multilingual corpora from open-access resources such as the Common Crawl and Wikipedia are supplemented with curated datasets from underrepresented languages. Second, collaborations with linguistic researchers …","url":["https://www.researchgate.net/profile/Syed-Ali-542/publication/399474553_Cognitively_Tuned_AI_Bots_for_Linguistic_Equity_A_Neuro-Symbolic_Framework_for_Policy-Aligned_Language_Learning_Systems/links/695c8765a1fd0179891126ea/Cognitively-Tuned-AI-Bots-for-Linguistic-Equity-A-Neuro-Symbolic-Framework-for-Policy-Aligned-Language-Learning-Systems.pdf"]} +{"year":"2026","title":"Colour Contrast on the Web: A WCAG 2.1 Level AA Compliance Audit of Common Crawl's Top 500 Domains","authors":["T Vaughan, PO Suarez - arXiv preprint arXiv:2602.24067, 2026"],"snippet":"… Using archived HTML from Common Crawl, we evaluate declared foreground and … Studies using Common Crawl and similar archives have examined language distribution, hyperlink … from Common Crawl archives, we extend archive-based …","url":["https://arxiv.org/pdf/2602.24067"]} +{"year":"2026","title":"CommonLID: Re-evaluating State-of-the-Art Language Identification Performance on Web Data","authors":["PO Suarez, L Burchell, C Arnett, R Mosquera-Gómez… - arXiv preprint arXiv …, 2026"],"snippet":"… We sourced the data used to build CommonLID from Common Crawl web crawl data. 
This means it has the potential to contain personally … Moreover, we chose Common Crawl as a source because they have always respected robots.txt10 …","url":["https://arxiv.org/pdf/2601.18026"]} +{"year":"2026","title":"Communication Frequency in Megatron‐LM: Experimental Insights Applied to Heterogeneous Distributed Training Time Prediction","authors":["HR Zhang, Y Feng, Z Chen, Y Tian, X Zheng, C Liu… - … and Computation: Practice …, 2026"],"snippet":"As model parameters increase exponentially, distributed training has become essential for advancing modern deep neural networks. Megatron‐LM, an efficient distributed training framework developed by NVIDIA, enables the training of trillion‐parameter …","url":["https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.70500"]} +{"year":"2026","title":"Comparative Study of Zero-Shot and Supervised Learning for Tamil Tweet","authors":["R Jain, M Kesavan, VK Jayaraman - Data Science and Applications: Proceedings of ICDSA …"],"snippet":"… XLM-RoBERTa itself is a multilingual transformer-based encoder trained on 2.5 TB of CommonCrawl data across 100 languages. It uses 12 to 24 self-attention layers and a shared 250K subword vocabulary via SentencePiece, which supports …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=NHylEQAAQBAJ&oi=fnd&pg=PA85&dq=commoncrawl&ots=6CA0MZBM03&sig=l89s9NaLp8tVFah2bTzzE5ePeP8"]} +{"year":"2026","title":"Compass-Embedding v4: Robust Contrastive Learning for Multilingual E-commerce Embeddings","authors":["P Ueareeworakul, S Liu, J Feng, L Hu, Z Shi, C Sun… - arXiv preprint arXiv …, 2025"],"snippet":"As global e-commerce rapidly expands into emerging markets, the lack of high-quality semantic representations for low-resource languages has become a decisive bottleneck for retrieval, recommendation, and search systems. 
In this work, we …","url":["https://arxiv.org/pdf/2601.11565"]} +{"year":"2026","title":"Comprehensive Comparison of RAG Methods Across Multi-Domain Conversational QA","authors":["K Alushi, J Strich, C Biemann, M Semmann - arXiv preprint arXiv:2602.09552, 2026"],"snippet":"… 2023) dataset incorporates questions from pre-existing datasets, including QuAC, and includes information sourced from the Common Crawl and web searches. Question rewriting was also used to \"fix\" any inquiries that had references to the conversation …","url":["https://arxiv.org/pdf/2602.09552"]} +{"year":"2026","title":"Constructing Synthetic Instruction Datasets for Improving Reasoning in Domain-Specific LLMs: A Case Study in the Japanese Financial Domain","authors":["Y Okochi, FM Sim, T Okada - arXiv preprint arXiv:2603.01353, 2026"],"snippet":"In adapting LLMs to specific domains, achieving both domain expertise and reasoning ability remains an urgent challenge. This study proposes a general method for constructing high-quality synthetic instruction data for any domain …","url":["https://arxiv.org/pdf/2603.01353"]} +{"year":"2026","title":"Contractual Deepfakes: Can Large Language Models Generate Contracts?","authors":["E Mik - arXiv preprint arXiv:2602.09384, 2026"],"snippet":"… Training data predominantly stem from the Common Crawl archive of Internet text scraped from billions of URLs. As a large amount of its content is undesirable for training, LLM builders train their models on filtered samples of Common Crawl, such …","url":["https://arxiv.org/pdf/2602.09384"]} +{"year":"2026","title":"Convert Once, Consume Many: SDF for Cacheable, Typed Semantic Extraction from Web Pages","authors":["P Sarkar"],"snippet":"… Common Crawl [19] provides web‐scale snapshots for research and retrieval. Common Crawlstyle pipelines typically require repeated downstream parsing, boilerplate removal, and extraction to build usable datasets. 
SDF can be viewed as …","url":["https://sdfprotocol.org/whitepaper.pdf"]} +{"year":"2026","title":"Cost-Efficiency Frameworks for Scaling Large Language Models in Real-World Systems","authors":["NK Gaur, A Kumar, P Rajendra - 2025 IEEE 7th International Conference on …, 2025"],"snippet":"Large Language Models (LLMs) including GPT, LLaMA, and PaLM have been shown to be capable of transformations in fields like healthcare to finance. Nonetheless, the expensive nature of the computational and financial training …","url":["https://ieeexplore.ieee.org/abstract/document/11325717/"]} +{"year":"2026","title":"Crash Blossoms/IF & ONLY IF: A Lo-Fidelity AI Newspaper (2020)","authors":["N Jones, T Schofield, S Skinner - DATA browser 10 Curating Superintelligences: A …"],"snippet":"What remains of curation if the ‘next biennial’, as Joasia Krysa suggests, is ‘curated by a machine’? 3 Will the profession of curator continue to exist if we succeed in formalising and automating ‘creativity’? how does curatorial action change if, as van …","url":["https://www.data-browser.net/pdf/DB10_Curating_Superintelligences.pdf#page=195"]} +{"year":"2026","title":"CRFD: Causal and Interpretable Fault Diagnosis Using Counterfactual Reasoning for Microservices on Multi-source Observability Data","authors":["X Tian, S Ying, T Li, C Shi, D Xiao - ACM Transactions on Software Engineering and …, 2026"],"snippet":"Causal and interpretable fault diagnosis is essential for ensuring the stable operation and efficient maintenance of large-scale microservice systems. However, the existing fault diagnosis approaches confront two significant challenges: (1) …","url":["https://dl.acm.org/doi/pdf/10.1145/3796706"]} +{"year":"2026","title":"Crowdsourcing Piedmontese to Test LLMs on Non-Standard Orthography","authors":["G Vico, J Libovický - arXiv preprint arXiv:2602.14675, 2026"],"snippet":"We present a crowdsourced dataset for Piedmontese, an endangered Romance language of northwestern Italy. 
The dataset comprises 145 Italian-Piedmontese parallel sentences derived from Flores+, with translations produced by speakers …","url":["https://arxiv.org/pdf/2602.14675"]} +{"year":"2026","title":"Cybersecurity Future Threats in Artificial Intelligence: A Comprehensive Analysis","authors":["J Foley - 2026"],"snippet":"The rapid integration of artificial intelligence (AI) systems into critical infrastructure, healthcare, finance, and national security sectors has introduced unprecedented cybersecurity challenges. This paper examines emerging threats that exploit AI …","url":["https://www.authorea.com/doi/pdf/10.22541/au.176800035.52392754"]} +{"year":"2026","title":"DanQing: An Up-to-Date Large-Scale Chinese Vision-Language Pre-training Dataset","authors":["H Shen, T Gu, B Qin, L Wu, Y Wu, S Tan, Z Sun… - arXiv preprint arXiv …, 2026"],"snippet":"… As a result, we propose DanQing, which contains 100 million image-text pairs collected from Common Crawl. Different from existing datasets, DanQing is curated through a more rigorous selection process, yielding superior data quality. Moreover …","url":["https://arxiv.org/pdf/2601.10305"]} +{"year":"2026","title":"Data Darwinism Part I: Unlocking the Value of Scientific Data for Pre-training","authors":["Y Qin, Z Huang, T Mi, W Si, C Zhou, Q Guo, S Feng… - arXiv preprint arXiv …, 2026"],"snippet":"Data quality determines foundation model performance, yet systematic processing frameworks are lacking. 
We introduce Data Darwinism, a ten-level taxonomy (L0-L9) that conceptualizes data-model co-evolution: advanced models produce superior …","url":["https://arxiv.org/pdf/2602.07824"]} +{"year":"2026","title":"Data Distribution Matters: A Data-Centric Perspective on Context Compression for Large Language Model","authors":["K Lv, J Tang, L Liu, H Chen, W Zhang, S Liu, Y Wang… - arXiv preprint arXiv …, 2026"],"snippet":"… The datasets are structured as follows: dataset P1 consists entirely of the Common Crawl (CC) subset, representing a baseline of general-purpose web text. dataset P2 to dataset P6 are constructed by substituting portions of the CC data with …","url":["https://arxiv.org/pdf/2602.01778"]} +{"year":"2026","title":"Data Preparation, Collecting, Cleaning, and Managing Datasets in Generative AI","authors":["V Dutt, S Singh, GK Sethi - Generative AI for Remote Sensing of the Environment"],"snippet":"… Some of these include Common Crawl for webpages, ImageNet for images, and LibriSpeech for sound files. Public datasets are useful for benchmarking as well as for prototyping purposes (Song et al., 2024). • Proprietary data: Companies rely on …","url":["https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.1201/9781003616207-12&type=chapterpdf"]} +{"year":"2026","title":"Data Science and Technology Towards AGI Part I: Tiered Data Management","authors":["Y Wang, Z Fu, H Zhao, C Zhao, C Zhou, X Lin, H Lyu… - arXiv preprint arXiv …, 2026"],"snippet":"… retrieve high-quality code and technical documentation previously overlooked in Common Crawl. 
Furthermore, synthetic augmentation … , targeting 75% similarity), deduplicating each of the 96 Common Crawl snapshots separately to produce about …","url":["https://arxiv.org/pdf/2602.09003"]} +{"year":"2026","title":"dataRLsec: Safety, Security, and Reliability With Robust Offline Reinforcement Learning for DPAs","authors":["SKS Pandian, N Kshetri - arXiv preprint arXiv:2601.01289, 2026"],"snippet":"Data poisoning attacks (DPAs) are becoming popular as artificial intelligence (AI) algorithms, machine learning (ML) algorithms, and deep learning (DL) algorithms in this artificial intelligence (AI) era. Hackers and penetration testers are excessively …","url":["https://arxiv.org/pdf/2601.01289"]} +{"year":"2026","title":"DeBERTa-AT: A DeBERTaV3 Variant Fine-Tuned on Air Traffic Data","authors":["RS Yue, DL Nielsen, K Kalyanam - AIAA SCITECH 2026 Forum, 2026"],"snippet":"Large language models (LLMs) offer a powerful platform and can leverage tools to extract relevant information and provide recommendations for air traffic users. These can range from classification of voluntary safety reports to discovering shared …","url":["https://arc.aiaa.org/doi/abs/10.2514/6.2026-0429"]} +{"year":"2026","title":"Deceptive Simplicity: How high F1 Scores Obscure Artifact-Driven Performance in Indic Fake News Detection","authors":["A Rathi, A Verma, A Verma - Authorea Preprints, 2026"],"snippet":"High F1 scores in Indic misinformation classification have suggested significant progress in its detection. 
However, we challenge this narrative by demonstrating that such performance is reflective of dataset artifacts rather than a robust semantic …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.177220025.50670051"]} +{"year":"2026","title":"Deep Iterative Persona Alignment: Generating Statistically Representative LLM Personas for High-Fidelity Social Simulations","authors":["S Yuan, B He - 2026"],"snippet":"The increasing adoption of large language models for simulating human behavior offers a promising new paradigm for social science research. However, a critical limitation persists: current LLM persona generation methods prioritize individual …","url":["https://www.preprints.org/frontend/manuscript/a2d480d151927a3a09aae3f3ad25d837/download_pub"]} +{"year":"2026","title":"Deep Learning Approaches for Multimodal Emotion Recognition: Trends, Issues, and Prospects","authors":["K Sharma, R Nandal - Journal of Multi Disciplinary Engineering Technologies"],"snippet":"High-fidelity human-computer interaction now hinges on the machine’s ability to decode affective states—a task where single-source data (speech or text alone) consistently falls short. This paper pivots away from traditional unimodal constraints …","url":["http://jmdet.com/wp-content/uploads/2026/02/4_JMDET_19_1_202509_04_Deep_Learning_Approaches_for_Multimodal_Emotion_Recognition__Trends__Issues__and_Prospects.pdf"]} +{"year":"2026","title":"DEFT-Net: Explainable Deepfake Text Detection for Combating Information Disorder in the Age of Generative AI.","authors":["S Sadiq, S Ullah, N Abuzinadah, R Alharthi… - International Journal of …, 2026"],"snippet":"… The FastText Subword model generates word embeddings by analyzing subword components, trained on a large corpus of 600 billion tokens from Common Crawl, resulting in 2 million distinct word vectors [58]. 
Unlike traditional word-level …","url":["https://search.ebscohost.com/login.aspx?direct=true&profile=ehost&scope=site&authtype=crawler&jrnl=18756891&AN=191772692&h=sGQNgZLFYzpjEQZszZcic%2Fnb8FrlwOeOgOXeadSwBChmNKb7WBEm8a%2BOh%2FTUoZZZXljEkxZRUeHnG%2F%2B6QzksSA%3D%3D&crl=c"]} +{"year":"2026","title":"DEPARTMENT OF PHILOSOPHY, LINGUISTICS AND THEORY OF SCIENCE","authors":["A Nieminen"],"snippet":"… This Common Crawl subset … 1 The Common Crawl is a vast open-source corpus containing raw text from webpages, as well as metadata and … https://commoncrawl.org/overview …","url":["https://gupea.ub.gu.se/server/api/core/bitstreams/eee4413a-0a16-4d0d-b2e4-1de30eb8f11d/content"]} +{"year":"2026","title":"Designing a Federated Web Crawler","authors":["S Wortelboer, A de Vries, D Hiemstra - 2025"],"snippet":"1 Abstract This thesis explores the design of a federated web crawler, in which multiple agents cooperate to explore the web by coordinating the partitioning and communicating of URLs. A prototype system was developed by extending the …","url":["https://www.cs.ru.nl/bachelors-theses/2025/Sebastiaan_Wortelboer___1021918___Designing_a_Federated_Web_Crawler_-_Comparing_Domain_Filtering_and_Consistent_Hashing_Approaches.pdf"]} +{"year":"2026","title":"Designing Recommender Systems for Balanced News Exposure","authors":["E Chen, J Dong, S Hsieh, Y Chen, D Isave… - User Experience & Artificial …"],"snippet":"As digital platforms increasingly shape public discourse, the design of news recommender systems (NRS) must balance personalization with exposure to diverse viewpoints. 
While traditional algorithms optimize for engagement, they risk …","url":["https://www.researchgate.net/profile/Tracy-Yin-2/publication/401544593_Proceedings_of_International_Conference_on_User_Experience_Artificial_Intelligence_2025/links/69a86763a91b826e4346ae03/Proceedings-of-International-Conference-on-User-Experience-Artificial-Intelligence-2025.pdf#page=78"]} +{"year":"2026","title":"Detecting Training Data For Large Language Models: A Survey","authors":["C Yang, J Li, S Lan, Y Wang, H Du, C Gong, X Yao… - ACM Computing Surveys, 2026"],"snippet":"… [19] used texts from the Common Crawl dataset to create prompts for GPT-2 and then searched and veriied the outputs according to Google webpages. If the content output by GPT-2 matched the content found on Google pages, it was considered …","url":["https://dl.acm.org/doi/pdf/10.1145/3779430"]} +{"year":"2026","title":"Detection of AI-Generated texts with BERT variants","authors":["V Balara, K Machová - 2025 International Conference on Emerging eLearning …, 2025"],"snippet":"With the dawn of the generative AI, the social media became flooded with AI-generated content. While images made this way can often be spotted by human eye, in the case of texts, and more importantly short texts, the correct detection can be a …","url":["https://ieeexplore.ieee.org/abstract/document/11280096/"]} +{"year":"2026","title":"Detection of Maliciously Disseminated Hate Speech in Spanish Using Fine-Tuning and In-Context Learning Techniques with Large Language Models","authors":["T Bernal-Beltrán, R Pan, JA García-Díaz… - Computers, Materials and …, 2026"],"snippet":"The malicious dissemination of hate speech via compromised accounts, automated bot networks and malware-driven social media campaigns has become a growing cybersecurity concern. 
Automatically detecting such content in Spanish is …","url":["https://www.sciencedirect.com/org/science/article/pii/S1546221826001311"]} +{"year":"2026","title":"Detection of Phishing Attack by using LightGBM&Xgbost","authors":["AM Shah, M Noman, TF Khan - Journal Homepage, 2025"],"snippet":"Phishing attacks provide a significant security risk to both individuals and organizations. To steal sensitive information, these assaults are typically carried out by creating phony websites that substantially resemble actual ones. This research …","url":["https://nja.pastic.gov.pk/PJCIS/index.php/JCIS/article/download/33779/32878"]} +{"year":"2026","title":"DHPLT: large-scale multilingual diachronic corpora and word representations for semantic change modelling","authors":["M Fedorova, A Kutuzov, K Umarova - arXiv preprint arXiv:2602.11968, 2026"],"snippet":"In this resource paper, we present DHPLT, an open collection of diachronic corpora in 41 diverse languages. DHPLT is based on the web-crawled HPLT datasets; we use web crawl timestamps as the approximate signal of document creation time. The …","url":["https://arxiv.org/pdf/2602.11968"]} +{"year":"2026","title":"Differentially Private and Communication Efficient Large Language Model Split Inference via Stochastic Quantization and Soft Prompt","authors":["Y Gu, R Jin, X Ji, Y Jin, W Xu - arXiv preprint arXiv:2602.11513, 2026"],"snippet":"Large Language Models (LLMs) have achieved remarkable performance and received significant research interest. The enormous computational demands, however, hinder the local deployment on devices with limited resources. 
The current …","url":["https://arxiv.org/pdf/2602.11513"]} +{"year":"2026","title":"Discovering Universal Activation Directions for PII Leakage in Language Models","authors":["L Marchyok, Z Coalson, S Keum, S Son, S Hong - arXiv preprint arXiv:2602.16980, 2026"],"snippet":"Modern language models exhibit rich internal structure, yet little is known about how privacy-sensitive behaviors, such as personally identifiable information (PII) leakage, are represented and modulated within their hidden states. We present UniLeak, a …","url":["https://arxiv.org/pdf/2602.16980"]} +{"year":"2026","title":"Disreo: Provably Secure No-Box-Extraction Linguistic Steganography Based on Distribution Reorganization","authors":["J Jiang, K Chen, N Zhao, Y Qi, X Zhang, W Zhang… - IEEE Transactions on …, 2026"],"snippet":"Existing provably secure linguistic steganographic methods typically rely on white-box extraction, which necessitates access to large language models. This requirement is impractical in environments with limited resources. To tackle this issue, we propose …","url":["https://ieeexplore.ieee.org/abstract/document/11329500/"]} +{"year":"2026","title":"Distill-then-Replace: Efficient Task-Specific Hybrid Attention Model Construction","authors":["X Xia, H Zhang, C Zhong, J Sun, Y Oishi - arXiv preprint arXiv:2601.11667, 2026"],"snippet":"Transformer architectures deliver state-of-the-art accuracy via dense full-attention, but their quadratic time and memory complexity with respect to sequence length limits practical deployment. 
Linear attention mechanisms offer linear or near-linear …","url":["https://arxiv.org/pdf/2601.11667"]} +{"year":"2026","title":"DLT-Corpus: A Large-Scale Text Collection for the Distributed Ledger Technology Domain","authors":["WH Cruz, P Devine, N Vadgama, P Tasca, J Xu - arXiv preprint arXiv:2602.22045, 2026"],"snippet":"We introduce DLT-Corpus, the largest domain-specific text collection for Distributed Ledger Technology (DLT) research to date: 2.98 billion tokens from 22.12 million documents spanning scientific literature (37,440 publications), United States Patent …","url":["https://arxiv.org/pdf/2602.22045"]} +{"year":"2026","title":"DM0: An Embodied-Native Vision-Language-Action Model towards Physical AI","authors":["E Yu, H Lv, J Sun, K Lin, R Zhang, Y Shi, Y Chen… - arXiv preprint arXiv …, 2026"],"snippet":"Moving beyond the traditional paradigm of adapting internet-pretrained models to physical tasks, we present DM0, an Embodied-Native Vision-Language-Action (VLA) framework designed for Physical AI. Unlike approaches that treat physical grounding …","url":["https://arxiv.org/pdf/2602.14974"]} +{"year":"2026","title":"Domain Specific Specialization in Low-Resource Settings: The Efficacy of Offline Response-Based Knowledge Distillation in Large Language Models","authors":["E Aslan, P Erdoğmuş - arXiv preprint arXiv:2601.16219, 2026"],"snippet":"Large Language Models (LLMs) excel in general tasks but often struggle with hallucinations when handling domain-specific or institutional knowledge absent from their pre-training. 
We present an offline response-based knowledge distillation …","url":["https://arxiv.org/pdf/2601.16219"]} +{"year":"2026","title":"Duties for datasets.(2023)","authors":["JTH SOH - Data and Private Law"],"snippet":"… Eighty per cent of the tokens originated from a selected subset of the CommonCrawl (a dataset comprising substantially the entire written internet),47 filtered based on similarity to the other four datasets – WebText, a corpus of English …","url":["https://ink.library.smu.edu.sg/cgi/viewcontent.cgi?article=6401&context=sol_research"]} +{"year":"2026","title":"Dynamic Generation of Links and Forwarding to Related Web-based Content","authors":["GP Roßrucker - Fortschritt-Berichte VDI"],"snippet":"This paper is dedicated to artificial link generation based on methods and state of the art scientific approaches of natural language processing and information retrieval (NLIR). The motivation to conduct research in this field is presented. It is …","url":["https://www.inlibra.com/document/download/pdf/uuid/54e32609-31f7-3390-b04a-b3176b0a530b#page=97"]} +{"year":"2026","title":"Dynamic Large Concept Models: Latent Reasoning in an Adaptive Semantic Space","authors":["X Qu, S Wang, Z Huang, K Hua, F Yin, RJ Zhu, J Zhou… - arXiv preprint arXiv …, 2025"],"snippet":"Large Language Models (LLMs) apply uniform computation to all tokens, despite language exhibiting highly non-uniform information density. This token-uniform regime wastes capacity on locally predictable spans while under-allocating …","url":["https://arxiv.org/pdf/2512.24617"]} +{"year":"2026","title":"Edge-Ready Romanian Language Models: Training, Quantization, and Deployment","authors":["TA Diac, PF de Viana, AF Neagoe, A Oprea… - AI, 2026"],"snippet":"We present RoBaseLM-S (125 M) and RoBaseLM-M (260 M), two compact Romanian decoder-only language models trained from scratch on a 4.3 B-token curated corpus. 
Architecturally, they follow a modern LLaMA-style recipe with pre-norm …","url":["https://www.mdpi.com/2673-2688/7/2/61"]} +{"year":"2026","title":"Efficient Multilingual Name Type Classification Using Convolutional Networks","authors":["D Lauc - arXiv preprint arXiv:2601.11090, 2026"],"snippet":"We present a convolutional neural network approach for classifying proper names by language and entity type. Our model, Onomas-CNN X, combines parallel convolution branches with depthwise-separable operations and hierarchical …","url":["https://arxiv.org/pdf/2601.11090"]} +{"year":"2026","title":"Emotion Classification in Bangla: A Comprehensive Comparison of BanglaBERT, mBERT, and XLM-RoBERTa with Error Analysis and Significance Testing","authors":["MR Hasan, Z Chong, TK Abdulwasea, HO Rashid… - 2025 9th International …, 2025"],"snippet":"Recognizing emotions in Bangla is challenging due to the scarcity of annotated data and the language’s complex structure. This study evaluates three transformer-based models: BanglaBERT, mBERT, and XLM-RoBERTa, using a balanced dataset of …","url":["https://ieeexplore.ieee.org/abstract/document/11268079/"]} +{"year":"2026","title":"Emotion Detection on Sylheti-Banglish","authors":["FT Transformer-Based, MM Islam, AA Adil, MJ Islam - … on Big Data, IoT and Machine …, 2026"],"snippet":"Emotion detection plays a crucial role in understanding human expressions and has widespread applications in social media anal-ysis, conversational agents, and mental health monitoring. 
While significant advancements have been made for …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=-D_CEQAAQBAJ&oi=fnd&pg=PA69&dq=commoncrawl&ots=J_Tc5EfOCO&sig=dP5kAwruTRXdkRGd_18YDiGOiQI"]} +{"year":"2026","title":"Enabling Language Models to Process Information at Scale","authors":["T Gao - 2026"],"snippet":"Progress in language model (LM) development has driven rapid gains in their capabilities, with state-of-the-art systems now matching--and often surpassing--PhD-level performance on exam-style benchmarks. This thesis advances the field by …","url":["https://search.proquest.com/openview/a7ba8d3a752ca5d0b9dd2a6c10928e35/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"Encyclopedia of Large Language Models and Foundation Models","authors":["H Hays"],"snippet":"This comprehensive encyclopedia provides an authoritative and detailed reference on Large Language Models (LLMs) and Foundation Models. As artificial intelligence continues to transform technology and society, understanding these powerful …","url":["https://www.researchgate.net/profile/Hasi-Hays/publication/399801350_Encyclopedia_of_Large_Language_Models_and_Foundation_Models/links/69693894ee048155cffc2267/Encyclopedia-of-Large-Language-Models-and-Foundation-Models.pdf"]} +{"year":"2026","title":"End-to-End Test-Time Training for Long Context","authors":["A Tandon, K Dalal, X Li, D Koceja, M Rød, S Buchanan… - arXiv preprint arXiv …, 2025"],"snippet":"We formulate long-context language modeling as a problem in continual learning rather than architecture design. Under this formulation, we only use a standard architecture -- a Transformer with sliding-window attention. 
However, our model …","url":["https://arxiv.org/pdf/2512.23675"]} +{"year":"2026","title":"Enhanced extractive text summarization framework for low-resourced Urdu language","authors":["S Nazir, M Asif, S Ahmad, H Aljuaid, S Ahmad - PLoS One, 2026"],"snippet":"This era has witnessed an enormous increase in textual corpus available in digital form. Therefore, an intelligent mechanism is required to extract the essential information. This task is performed using an automatic text summarization that …","url":["https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0341596"]} +{"year":"2026","title":"ENHANCING CONTENT RETRIEVAL WITH BIG DATA AND NATURAL LANGUAGE PROCESSING FOR SCALABLE AND SEMANTIC SEARCH SYSTEMS","authors":["S SUJANTHI, M SUNEETHA, NRAO THOTA… - Journal of Theoretical and …, 2026"],"snippet":"… We evaluate the system over three datasets: Common Crawl (web content), Medical Text Mining, and Amazon Product Reviews, and compare to traditional keywordbased search and TF‐IDF and Word2Vec‐based approaches. The …","url":["http://www.jatit.org/volumes/Vol104No3/28Vol104No3.pdf"]} +{"year":"2026","title":"Enhancing Knowledge Quality in Crowd-Sourced Developer Q&A Platforms through AI-driven Software Solutions","authors":["SD Bappon - 2025"],"snippet":"Programming question-and-answer (Q&A) platforms have transformed how developers seek and share programming knowledge, addressing limitations of traditional documentation and tutorial resources. 
Among them, Stack Overflow (SO) …","url":["https://harvest.usask.ca/bitstreams/26d090da-9daa-4dc3-859d-d06aa683a53a/download"]} +{"year":"2026","title":"Enriching Language Model Capabilities: from Dialogue Intelligence and Evaluation to Tabular Data Reasoning","authors":["B Yang - 2025"],"snippet":"… For example, the C4 dataset (Colossal Cleaned Crawled Corpus) [39], used to train the T5 model, was created by applying a series of heuristics to Common Crawl, such as removing lines without terminal punctuation, filtering out offensive words …","url":["https://search.proquest.com/openview/c589b145697ca1c1c67dacddb62920e9/1?pq-origsite=gscholar&cbl=2026366&diss=y"]} +{"year":"2026","title":"EPISTEMOLOGICAL PERSISTENCE IN MULTILINGUAL AI: THE ILLUSION OF LOCALITY IN LARGE LANGUAGE MODELS","authors":["G Barkin - INTERNATIONAL REVIEW OF MODERN SOCIOLOGY, 2025"],"snippet":"… A foundational source for many contemporary language models is Common Crawl, a massive archive of publicly available web text that is … Popular LLM training data appears to be significantly more Anglophone than the Common Crawl, however …","url":["https://www.academia.edu/download/129301512/Epistemological_Persistence_in_Multilingual_AI_The_Illusion_of_Locality_in_Large_Language_Models_PRINT.pdf"]} +{"year":"2026","title":"Equivocation and Erosion: How LLMs Undermine Catholic Religious Discourse","authors":["JA Karr Jr, MP Lad, D Hernandez, L Conwill…"],"snippet":"Large Language Models (LLMs) offer opportunities for information dissemination, yet present challenges with upholding the distinct theological practices of the Catholic faith. 
By training on vast datasets, LLMs can generate responses that …","url":["https://jonathankarr.com/documents/adaptive_faith_paper.pdf"]} +{"year":"2026","title":"EstLLM: Enhancing Estonian Capabilities in Multilingual LLMs via Continued Pretraining and Post-Training","authors":["A Dorkin, T Purason, E Kalbaliyev, HA Kuulmets… - arXiv preprint arXiv …, 2026"],"snippet":"Large language models (LLMs) are predominantly trained on English-centric data, resulting in uneven performance for smaller languages. We study whether continued pretraining (CPT) can substantially improve Estonian capabilities in a pretrained …","url":["https://arxiv.org/pdf/2603.02041"]} +{"year":"2026","title":"EuroLLM-22B: Technical Report","authors":["MM Ramos, DM Alves, H Gisserot-Boukhlef, J Alves… - arXiv preprint arXiv …, 2026"],"snippet":"This report presents EuroLLM-22B, a large language model trained from scratch to support the needs of European citizens by covering all 24 official European Union languages and 11 additional languages. EuroLLM addresses the issue of European …","url":["https://arxiv.org/pdf/2602.05879"]} +{"year":"2026","title":"Evaluating AI for the Digital Scholarly Edition","authors":["KM Penner - Artificial Intelligence for Bible Translation and …, 2026"]} +{"year":"2026","title":"Evaluating and improving mathematical reasoning in large language models via skill combinations","authors":["V Shah - 2025"],"snippet":"… These models were trained on large corpora of data generally scraped from the internet (eg., CommonCrawl). 
This data probably … training of a previous generation model on 120B math specific tokens sourced from CommonCrawl using …","url":["https://umontreal.scholaris.ca/bitstreams/40c57c0a-022f-4ef7-8d26-0fd333f59425/download"]} +{"year":"2026","title":"Evaluating Complementarity of Hand-Crafted Features and Domain Embeddings in Semantic Duplicate Detection","authors":["H Ahmad - 2025"],"snippet":"Duplicate question detection addresses the challenge of identifying semantically equivalent questions expressed through different linguistic forms. Detecting semantically equivalent questions is difficult due to two main factors: the same …","url":["https://helda.helsinki.fi/bitstreams/20b1ebaf-29fc-4df3-b019-342a5df8da8e/download"]} +{"year":"2026","title":"Evaluating Metalinguistic Knowledge in Large Language Models across the World's Languages","authors":["T Arčon, M Klemen, M Robnik-Šikonja, K Dobrovoljc - arXiv preprint arXiv …, 2026"],"snippet":"… Accuracy correlates strongly with language frequency in Common Crawl, suggesting that grammatical competence is mainly data-driven and may deteriorate during post-training. Similarly, the MELA benchmark (Zhang et al., 2024) assesses …","url":["https://arxiv.org/pdf/2602.02182"]} +{"year":"2026","title":"Evaluating the impact of word embeddings on similarity scoring in practical information retrieval","authors":["N McCarroll, K Curran, E McNamee, A Clist… - arXiv preprint arXiv …, 2026"],"snippet":"Search behaviour is characterised using synonymy and polysemy as users often want to search information based on meaning. 
Semantic representation strategies represent a move towards richer associative connections that can adequately …","url":["https://arxiv.org/pdf/2602.05734"]} +{"year":"2026","title":"Evaluation of machine translation systems on Images with text","authors":["B Puchol Salort - 2026"],"snippet":"This Master’s Thesis aims to carry out a comparative evaluation of machine translation systems applied to images containing text, a task at the intersection of computer vision and natural language processing. The images under study may …","url":["https://riunet.upv.es/bitstreams/42594d9a-fe37-45ca-b32b-bffa3adcfe7c/download"]} +{"year":"2026","title":"Evo: Autoregressive-Diffusion Large Language Models with Evolving Balance","authors":["J Wu, M Hu, J Zhu, Y Liu, T Zhang, K Li, J Chen, J Pan… - arXiv preprint arXiv …, 2026"],"snippet":"We introduce \\textbf{Evo}, a duality latent trajectory model that bridges autoregressive (AR) and diffusion-based language generation within a continuous evolutionary generative framework. Rather than treating AR decoding and diffusion …","url":["https://arxiv.org/pdf/2603.06617"]} +{"year":"2026","title":"EvoESAP: Non-Uniform Expert Pruning for Sparse MoE","authors":["Z Liu, S Tang, B Sun, Z Shen, X Yuan - arXiv preprint arXiv:2603.06003, 2026"],"snippet":"… 2024) which is a cleaned Common Crawl web-text corpus as calibration data in Table 6. All the hyperparameters are the same as mentioned in Section 4.1. Here we highlight two observations: (i) calibrating on C4 leads to a substantial drop on …","url":["https://arxiv.org/pdf/2603.06003"]} +{"year":"2026","title":"Evolutionary Knowledge-Enhanced Semantic Communication: Transition from SISO to MIMO","authors":["B Wang, H Xiong, S Han, M Sun, D Zhang, X Xu… - IEEE Transactions on …, 2026"],"snippet":"… Specifically, the 300-dimensional GloVe version trained on 42 billion words from the Common Crawl dataset is selected, primarily due to its highest vocabulary overlap with ReVerb45K. 
Besides, semantic compression and decompression …","url":["https://ieeexplore.ieee.org/abstract/document/11386815/"]} +{"year":"2026","title":"Excising Satan: Literal Elimination of the Adversary from World History and AI Training Corpora to Optimize Model Size and Emergent Cognition","authors":["DC Youvan - 2026"],"snippet":"… Studies of Common Crawl datasets …","url":["https://www.researchgate.net/profile/Douglas-Youvan/publication/401226850_Excising_Satan_Literal_Elimination_of_the_Adversary_from_World_History_and_AI_Training_Corpora_to_Optimize_Model_Size_and_Emergent_Cognition/links/699f90875d60ab4835718658/Excising-Satan-Literal-Elimination-of-the-Adversary-from-World-History-and-AI-Training-Corpora-to-Optimize-Model-Size-and-Emergent-Cognition.pdf"]} +{"year":"2026","title":"Explicit Multi-head Attention for Inter-head Interaction in Large Language Models","authors":["R Peng, Y Zhou, D Song, K Lv, B Wang, Q Guo, X Qiu - arXiv preprint arXiv …, 2026"],"snippet":"In large language models built upon the Transformer architecture, recent studies have shown that inter-head interaction can enhance attention performance. Motivated by this, we propose Multi-head Explicit Attention (MEA), a simple yet …","url":["https://arxiv.org/pdf/2601.19611"]} +{"year":"2026","title":"Exploring generative artificial intelligence: a comprehensive guide","authors":["R Shoitan, MM Moussa, N Tawfik, YI Cho, MS Abdallah - PeerJ Computer Science, 2026"],"snippet":"… Common Crawl dataset provides significant volumes of web data gathered from multiple pages, with frequent monthly updates. It has played a crucial role in training other notable language models, including GPT-3 and BERT. 
However, it comes with …","url":["https://peerj.com/articles/cs-3276/"]} +{"year":"2026","title":"Exploring Semantic Learning in Natural Language Processing: Bridging Meaning and Machine Understanding","authors":["MDSS Lam, A Kande"],"snippet":"Natural Language Processing (NLP) standards for semantic learning purposes attempt to establish a link that connects machine ability with human-derived linguistic value. This paper investigates state-of-the-art semantic representation …","url":["https://ieomsociety.org/proceedings/orlando2025/131.pdf"]} +{"year":"2026","title":"Expressiving Rational Agency with Language Models","authors":["Y Du - Machines in Minds: Expressing Digital Rationality …, 2026"],"snippet":"… WebText2 with higher content quality, and Common Crawl with lower content quality. Footnote 10 To be more detailed, WebText2 contains content that has been evaluated by internet users such as being liked on social networks, while Common …","url":["https://link.springer.com/chapter/10.1007/978-3-032-15842-0_6"]} +{"year":"2026","title":"Extracting Data on Patient Perspectives from Social Media","authors":["A Luschi, J Polisena, E Iadanza - Patient Involvement in Health Technology …, 2026"],"snippet":"Social media, a collection of online platforms where users interact, share, and exchange content, have emerged as a significant secondary source of real world data (RWD) in health-related research. Social media data offer distinct advantages …","url":["https://link.springer.com/chapter/10.1007/978-3-032-11284-2_7"]} +{"year":"2026","title":"FACULTY OF GRADUATE STUDIES AND RESEARCH","authors":["A Yafoz"],"snippet":"… for the Arabic language are pre-trained on datasets gathered from Common Crawl …","url":["https://uregina.scholaris.ca/server/api/core/bitstreams/574bca5f-4d39-410b-a628-1a9ac3ba0081/content"]} +{"year":"2026","title":"Fairness or Fluency? 
An Investigation into Language Bias of Pairwise LLM-as-a-Judge","authors":["X Zhou, Z Luo, Y Gao, Q Chen, X Hu, Y Zhao, R Liu - arXiv preprint arXiv:2601.13649, 2026"],"snippet":"Recent advances in Large Language Models (LLMs) have incentivized the development of LLM-as-a-judge, an application of LLMs where they are used as judges to decide the quality of a certain piece of text given a certain context. However …","url":["https://arxiv.org/pdf/2601.13649"]} +{"year":"2026","title":"Fake Review Detection in Low-Resource Settings with Multilingual Transformer Models: The Case of Bangla","authors":["MA Mahmud, A Hasan, T Mahbub, N Hasan Rafi… - International Conference on …, 2025"],"snippet":"… In addition to parallel datasets like PMINDIA and the Dakshina corpus, the model is pretrained using a combination of monolingual corpora from Wikipedia and Common Crawl. For the masked language modelling (MLM) goal, MuRIL employs …","url":["https://link.springer.com/chapter/10.1007/978-3-032-11355-9_12"]} +{"year":"2026","title":"FakeNews-TransAug: an enhanced AraBERT-Based deep learning model with data augmentation for addressing class imbalance in Arabic fake news detection","authors":["NM Alkudah, NB Idris, MAM Abushariah, AQM Sabri… - PeerJ Computer Science, 2026"],"snippet":"The rapid proliferation of fake news and fake news in digital media, particularly in the Arabic language, poses serious threats to public trust and societal stability. This article introduces FakeNews-TransAug, a novel deep learning model designed to …","url":["https://peerj.com/articles/cs-3654/"]} +{"year":"2026","title":"FedMosaic: Federated Retrieval-Augmented Generation via Parametric Adapters","authors":["Z Liang, Y Wang, Z Zhou, H Zhang, B Liu, Y Tong - arXiv preprint arXiv:2602.05235, 2026"],"snippet":"… However, most RAG deployments assume access to a centralized corpus (eg Wikipedia, Common Crawl, or enterprise repositories). 
While feasible in open-domain settings, this assumption fails in vertical domains such as healthcare and finance …","url":["https://arxiv.org/pdf/2602.05235"]} +{"year":"2026","title":"Form-aware Poetic Generation for Bangla","authors":["A Amina, M Al Mushabbir, S Ahmed"],"snippet":"Poetry generation in low-resource languages such as Bangla is particularly challenging due to the scarcity of structured poetic corpora and the complexity of its metrical system (matra). We present a structureaware framework for Bangla poetry …","url":["https://aclanthology.org/anthology-files/anthology-files/pdf/banglalp/2025.banglalp-1.30.pdf"]} +{"year":"2026","title":"FormuLLA: A Large Language Model Approach to Generating Novel 3D Printable Formulations","authors":["A Okubena, YA Mohammed, M Elbadawi - arXiv preprint arXiv:2601.02071, 2026"],"snippet":"Pharmaceutical three-dimensional (3D) printing is an advanced fabrication technology with the potential to enable truly personalised dosage forms. Recent studies have integrated artificial intelligence (AI) to accelerate formulation and …","url":["https://arxiv.org/pdf/2601.02071"]} +{"year":"2026","title":"Foundations Models: Is Transfer Learning a solved problem?","authors":["A Cornuéjols"],"snippet":"Figure 15.36: Illustration of how the T5 model (“Text-to-text Transfer Transformer”) can be used to perform multiple NLP tasks, such as translating English to German; determining if a sentence is linguistic valid or not (CoLA stands for “Corpus of …","url":["https://antoinecornuejols.github.io/teaching/Master-AIC/(5-4)Tr_OOD_LLMs_and_transfer_learning-v4.pdf"]} +{"year":"2026","title":"From $ O (mn) $ to $ O (r^ 2) $: Two-Sided Low-Rank Communication for Adam in Distributed Training with Memory Efficiency","authors":["S Dang, J Shao, X Zheng, G Dai, Y Song, H Ye - arXiv preprint arXiv:2602.08007, 2026"],"snippet":"As foundation models continue to scale, pretraining increasingly relies on data-parallel distributed optimization, making 
bandwidth-limited gradient synchronization a key bottleneck. Orthogonally, projection-based low-rank optimizers were mainly …","url":["https://arxiv.org/pdf/2602.08007"]} +{"year":"2026","title":"From Bias Mitigation to Bias Negotiation: Governing Identity and Sociocultural Reasoning in Generative AI","authors":["ZO Dunivin, B Han, J Bollenbocher - arXiv preprint arXiv:2602.18459, 2026"],"snippet":"LLMs act in the social world by drawing upon shared cultural patterns to make social situations understandable and actionable. Because identity is often part of the inferential substrate of competent judgment, ethical alignment requires regulating …","url":["https://arxiv.org/pdf/2602.18459"]} +{"year":"2026","title":"From Data to Model in Bias: A Statistical Analysis of Political Bias in the C4 Corpus and Its Impact on LLMs","authors":["J You, J Lee, S Lee, HY Kwon - Proceedings of the Nineteenth ACM International …, 2026"],"snippet":"… A critical analysis of the largest source for generative ai training data: Common crawl. In Proceedings of the 2024 ACM Conference on … Practical Datasets for Analyzing LLM Corpora Derived from Common Crawl. In Proceedings of the …","url":["https://dl.acm.org/doi/abs/10.1145/3773966.3777990"]} +{"year":"2026","title":"From Insights to Improvements: Advancements Across the LLM Lifecycle","authors":["AS Deshpande - 2026"],"snippet":"Large language models (LLMs) have transformed natural language understanding and generation, yet their development remains a fragmented process. 
Each stage in the LLM lifecycle—pre-training, post-training, evaluation, and improvement—poses …","url":["https://search.proquest.com/openview/d31be730be88830f7860259f64bf8f5e/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"From Prompt to Clone: Copyright Challenges in AI Model Distillation","authors":["C Philipp - UC Law Science and Technology Journal, 2026"],"snippet":"This paper examines the legality of model distillation in the context of large language models (LLMs), where smaller “student” models are trained by mimicking the outputs of larger, proprietary “teacher” models. As artificial intelligence continues …","url":["https://repository.uclawsf.edu/cgi/viewcontent.cgi?article=1151&context=hastings_science_technology_law_journal"]} +{"year":"2026","title":"From Reflection to Repair: A Scoping Review of Dataset Documentation Tools","authors":["P Reynolds-Cuéllar, M Wong-Villacres, AA Garcia… - arXiv preprint arXiv …, 2026"],"snippet":"… Meanwhile, web-scale data collected via internet crawls can include petabytes of data; one popular dataset, the Common Crawl, includes … an analysis of undesirable content in the common crawl corpus. In Chengqing Zong, Fei Xia …","url":["https://arxiv.org/pdf/2602.15968"]} +{"year":"2026","title":"From Specialist to Large Models: A Paradigm Evolution Towards Semantic-Aware MIMO","authors":["K Ying, Z Gao, T Yang, J Zhang, X Cheng, TQS Quek… - arXiv preprint arXiv …, 2026"],"snippet":"The sixth generation (6G) network is expected to deploy larger multiple-input multiple-output (MIMO) arrays to support massive connectivity, which will increase overhead and latency at the physical layer. Meanwhile, emerging 6G demands such as immersive …","url":["https://arxiv.org/pdf/2602.21672"]} +{"year":"2026","title":"From the Platform Society to the AI Society: Towards Critical Studies of Generative AI","authors":["P Tornberg, J Uitermark"],"snippet":"The era of Artificial Intelligence has begun. 
Generative AI is rapidly reshaping knowledge production, culture, and political authority, giving rise to an emerging AI society. Yet this transformation did not emerge ex nihilo. This paper argues that the …","url":["https://osf.io/download/qahd3/"]} +{"year":"2026","title":"Geedge Cases: Censorship Measurement Insights from the Geedge Networks Leak","authors":["J Sheffey, A Zohaib, M Wu, A Houmansadr - 2025"],"snippet":"… GFWeb [9] additionally uses FQDNs from the Common Crawl [4] dataset. … Additionally, the Tranco and Citizen Lab lists are limited compared to larger censorship measurement lists such as Common Crawl or ICANN CZDS, which may …","url":["https://people.cs.umass.edu/~amir/papers/2026-FOCI-GEEDGE-CASES.pdf"]} +{"year":"2026","title":"Gen AI, Copyright, and the Law's Blind Spot Constructing a Latent Reproduction Solution","authors":["A Shukla, P Bhinda - J. Intell. Prot. Stud., 2025"],"snippet":"Drifting away from the traditional research on the interplay of copynghtprovisions and ArtifdcialIntelligence, this paper offers a foundational reframing of the Indian copyright discourse surrounding generative Al by shifting the legal focus from output-based …","url":["https://heinonline.org/hol-cgi-bin/get_pdf.cgi?handle=hein.journals/jnloitl9§ion=16"]} +{"year":"2026","title":"Generative AI for FinTech","authors":["M Dubey - Generative and Open AI in Industry 5.0, 2026"],"snippet":"Recent advancements in generative artificial intelligence (AI) have introduced novel opportunities for the application of AI within the financial sector. 
This chapter examines the various ways in which generative AI is contributing to the rapidly …","url":["https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.1201/9781003596479-9&type=chapterpdf"]} +{"year":"2026","title":"Generative AI","authors":["A Laaksonen - Guide to Using Generative AI in Programming, 2026"],"snippet":"… However, it was reported that the older GPT-3 model, released in 2020, has 175 billion parameters, and its training data included the Common Crawl web archive, collections of books, and Wikipedia data [1]. …","url":["https://link.springer.com/chapter/10.1007/978-3-032-07453-9_3"]} +{"year":"2026","title":"Generative and Open AI","authors":["L Gupta, F Rahman, MK Giluka - Generative and Open AI in Industry 5.0, 2026"],"snippet":"Generative artificial intelligence (AI) has emerged as a powerful tool driving a wide range of modern applications. This chapter explores the fundamental concepts of Generative AI and its growing impact on daily life. Generative AI models are …","url":["https://api.taylorfrancis.com/content/chapters/edit/download?identifierName=doi&identifierValue=10.1201/9781003596479-1&type=chapterpdf"]} +{"year":"2026","title":"Generative Artificial Intelligence, Writing Placement, and Principled Decision Making in US Postsecondary Contexts","authors":["C Toth, J Nastal, TB Barney, K Messer, J Godfrey - 2026"],"snippet":"Christie Toth, Ph. D. 
is associate professor of writing and rhetoric studies at the University of Utah, where she leads the development of placement assessment for first-year writing in addition to coordinating undergraduate transfer initiatives and …","url":["https://wacclearinghouse.org/docs/jwa/vol9/tothetal.pdf"]} +{"year":"2026","title":"GEOMETRIC CONSTRAINTS ON VECTOR ARITHMETIC IN STATIC WORD EMBEDDINGS: ASYSTEMATIC EVALUATION AND TOPOLOGICAL ANALYSIS","authors":["D Mu"],"snippet":"The vector offset hypothesis, the principle that semantic relationships are preserved as consistent linear translations, has long been a cornerstone of static word embedding theory. This paper presents a comprehensive evaluation of the structural …","url":["https://openreview.net/pdf?id=VPlmreAT9z"]} +{"year":"2026","title":"Global AI Bias Audit for Technical Governance","authors":["J Hung - arXiv preprint arXiv:2602.13246, 2026"],"snippet":"This paper presents the outputs of the exploratory phase of a global audit of Large Language Models (LLMs) project. In this exploratory phase, I used the Global AI Dataset (GAID) Project as a framework to stress-test the Llama-3 8B model and …","url":["https://arxiv.org/pdf/2602.13246"]} +{"year":"2026","title":"Global Perspectives on People, Power and Society","authors":["SH Soomro, CMS Sandhu"],"snippet":"… As an example, ChatGPT, like other models, is trained on massive datasets including some variants of the Common Crawl corpus, the text scraped by internet robots with millions of petabytes. These collections are not entirely representative of …","url":["https://journal.vu.edu.pk/Data/volumes/28/Shahid.%20FINAL.pdf"]} +{"year":"2026","title":"Governed by Algorithms: AI and Public Opinion Formation in Times of Crisis","authors":["H Sarhan - 2025"],"snippet":"This dissertation examines how AI image captioning systems, social media moderation, and prompting, shape representations of social groups during political tension. 
Across three studies, it shows how captions can erase identities, how …","url":["https://mediatum.ub.tum.de/doc/1796311/document.pdf"]} +{"year":"2026","title":"GPT-4 In-Context Learning Ability with Semantico-Syntactically Similar Examples in Russian","authors":["T Plotnikov - Mundo Eslavo, 2025"],"snippet":"En el rendimiento en modo zero-shot con un conjunto de datos que contiene más de 2.200 frases y oraciones en ruso, GPT-4 presenta dificultades para identificar correctamente el significado de algunos ejemplos. Por ello, los ejemplos “problemáticos” …","url":["https://revistaseug.ugr.es/index.php/meslav/article/download/33411/30649"]} +{"year":"2026","title":"Grammatical Error Detection in L2 English and Italian: How Multilingual LLMs Handle Ambiguity in Learner Errors","authors":["E Dentico"],"snippet":"This thesis explores how large language models deal with ambiguity in grammatical errors in learner-written English and Italian. Using data from the MultiGED-2023 shared task and learner corpora (MERLIN, FCE, REALEC), five transformer-based …","url":["https://home.cltl.labs.vu.nl/static/data/theses/thesis_Elisabetta_Dentico_2025.pdf"]} +{"year":"2026","title":"Graph-Based Model for Hindi Text Summarization","authors":["M Waghe, D Chandargi, MA Rayyan, S Sonawane - 2025 IEEE Pune Section …, 2025"],"snippet":"Text summarization for low-resource languages such as Hindi presents inherent challenges owing to the limited availability of high-quality linguistic resources and curated datasets. 
In this study, we examine and compare three distinct …","url":["https://ieeexplore.ieee.org/abstract/document/11377704/"]} +{"year":"2026","title":"GreekMMLU: A Native-Sourced Multitask Benchmark for Evaluating Language Models in Greek","authors":["Y Zhang, M Konomi, C Xypolopoulos, K Divriotis… - arXiv preprint arXiv …, 2026"],"snippet":"… These formats typically reside outside the scope of standard webcrawling pipelines (eg, Common Crawl) used for LLM pre-training, thereby minimizing the risk of data contamination. This process yielded a raw corpus of diverse subject matter …","url":["https://arxiv.org/pdf/2602.05150"]} +{"year":"2026","title":"Green AI: Sustainable Model Training Practices","authors":["T Kumar, S Choudhary - 2025"],"snippet":"The rapid growth of deep learning (DL) has ushered in the era of “Red AI”, characterized by the continuous pursuit of state-of-the-art (SOTA) accuracy through massive increases in model parameters and training data. This trajectory has …","url":["https://rjwave.org/jaafr/papers/JAAFR2512108.pdf"]} +{"year":"2026","title":"GRIP: Geometric Refinement and Adaptive Information Potential for Data Efficiency","authors":["C Wang, J Yang, X Yao, Y Yu, P Jiao, L Yu, J Fang… - arXiv preprint arXiv …, 2026"],"snippet":"… To simulate realistic domain augmentation, we construct a 100B token hybrid candidate pool Dpool comprising a Fixed Background (CommonCrawl… on The Stack; the curated high-value subset is then combined with the constant …","url":["https://arxiv.org/pdf/2603.00031"]} +{"year":"2026","title":"Harvard Data Science Review ⢠Issue 5.4, Fall 2023","authors":["NE Sanders, A Ulinich, B Schneier"],"snippet":"Political polling is a multi-billion-dollar industry with outsized influence on the societal trajectory of the United States and nations around the world. 
However, in recent years it has been severely challenged by rising nonresponse rates and other …","url":["https://assets.pubpub.org/ezfw89rv/1d3cf75d-56d8-4fb3-a623-96883c0c0f70.html"]} +{"year":"2026","title":"Harvard Data Science Review ⢠Issue 7.2, Spring 2025","authors":["CL Borgman, P Groth"],"snippet":"Sharing research data is necessary, but not sufficient, for data reuse. Open science policies focus more heavily on data sharing than on reuse, yet both are complex, labor-intensive, expensive, and require infrastructure investments by multiple …","url":["https://assets.pubpub.org/sraoz9zb/35d32cfc-eded-4772-8406-999034371e46.html"]} +{"year":"2026","title":"HASOC-Meme: Enhancing Hate Speech Recognition in Bengali, Hindi, Gujarati, and Bodo Memes Using Multimodal Multitask Transformers","authors":["A Hegde, S Coelho, AM Shetty - 2025"],"snippet":"… • Bangla - For Bangla Language, we used XLM-RoBERTa Large1, a transformer-based multilingual language model pre-trained on massive CommonCrawl corpora. It employs subword tokenization, allowing efficient processing of Bangla’s rich …","url":["https://ceur-ws.org/Vol-4173/T9-7.pdf"]} +{"year":"2026","title":"HateBertBN: a hybrid transformer based model for Bangla hate speech detection across various social contexts","authors":["T Azhar, T Mahmud, MA Hasan, MN Uddin, SB Park - Discover Computing, 2026"],"snippet":"… corpus containing approximately 8.6 billion tokens, sourced from Bangla Common Crawl, Wikipedia, and news articles. 
Unlike monolingual models… It is a multilingual transformer model based on the RoBERTa architecture and pre-trained …","url":["https://link.springer.com/article/10.1007/s10791-025-09804-x"]} +{"year":"2026","title":"HONEY, AI SHRUNK THE ARCHIVE ARTIFICIAL INTELLIGENCE AS COMPRESSION ALGORITHM","authors":["AI How - New Directions in Digital Textual Studies: Book History …, 2026"]} +{"year":"2026","title":"How Do Language Models Acquire Character-Level Information?","authors":["S Sato, R Sasano - arXiv preprint arXiv:2602.05347, 2026"],"snippet":"Language models (LMs) have been reported to implicitly encode character-level information, despite not being explicitly provided during training. However, the mechanisms underlying this phenomenon remain largely unexplored. To reveal the …","url":["https://arxiv.org/pdf/2602.05347"]} +{"year":"2026","title":"How Far Can Unsupervised RLVR Scale LLM Training?","authors":["B He, Y Zuo, Z Liu, S Zhao, Z Fu, J Yang, C Qian… - arXiv preprint arXiv …, 2026"],"snippet":"… 2025] curates multi-domain QA from CommonCrawl and open web sources spanning law, physics and social science, and converts them into multiple-choice format so that answer correctness can be checked without human annotation. This …","url":["https://arxiv.org/pdf/2603.08660"]} +{"year":"2026","title":"How Hyper-Datafication Impacts the Sustainability Costs in Frontier AI","authors":["SN Wilson, S Mair, M Okinyi, EB Dam, J Koch… - arXiv preprint arXiv …, 2026"],"snippet":"… It compares each group’s share of total dataset size on the Hugging Face Hub with its share of Common Crawl pages and global speakers. English represents a larger share of the dataset volume (57%) than its share of Common Crawl pages (42%) …","url":["https://arxiv.org/pdf/2602.00056"]} +{"year":"2026","title":"How Long Is a Piece of String? 
A Brief Empirical Analysis of Tokenizers","authors":["J Roberts, K Han, S Albanie - arXiv preprint arXiv:2601.11518, 2026"],"snippet":"… There is no significant correlation between the compression ratio and the common crawl prevalence for a given language. … in Common Crawl. Each data point represents a language text tokenized by the Gemini (circle) or GPT (square) …","url":["https://arxiv.org/pdf/2601.11518"]} +{"year":"2026","title":"How Well Do Large Language Models Understand African American Language? Causes and Implications","authors":["K McKeown, JA Grieser, N Deas, S Kleiner, DU Patton… - Annual Review of …, 2026"],"snippet":"We focus on studying large language models (LLMs) and their ability to successfully interpret African American Language (AAL) in ways that do not distort the communicative intent of the speakers or writers. We discuss research that quantifies …","url":["https://www.annualreviews.org/content/journals/10.1146/annurev-linguistics-041824-050629"]} +{"year":"2026","title":"HumanLLM: Towards Personalized Understanding and Simulation of Human Nature","authors":["Y Lei, T Wang, J Lian, Z Hu, D Lian, X Xie - arXiv preprint arXiv:2601.15793, 2026"],"snippet":"… Twitter [9] (88.1M original tweets from CommonCrawl): We leverage tweets from a broad and diverse set of users on Twitter, a global microblogging platform. Tweets offer real-time, spontaneous expressions of users’ thoughts, emotions, and daily …","url":["https://arxiv.org/pdf/2601.15793"]} +{"year":"2026","title":"Hypernymy Relation in NLP: Tasks, Approaches, Resources, and Future Directions—A Systematic Literature Review","authors":["R Alharbi, H Al-Muhtaseb, T Helmy - IEEE Access, 2025"],"snippet":"Hypernymy is a semantic relation between two terms, where a more specific term is entailed by a more general term—that is, the meaning of the more specific term is encompassed by the meaning of the more general term. 
This relation is crucial for …","url":["https://ieeexplore.ieee.org/iel8/6287639/10820123/11271798.pdf"]} +{"year":"2026","title":"Identifying ESO data usage in scientific publications for astrophysical discoveries through Natural Language Processing","authors":["CI Urbina Lara - 2025"],"snippet":"Astrophysics is a fundamental science that seeks to understand the nature of the universe, from the smallest particles to the largest cosmic structures. Its relevance extends beyond mere academic interest; astrophysical research has been a driving …","url":["https://repositorio.uchile.cl/bitstream/handle/2250/208305/identifying-eso-data-usage-in-scientific-publications-for-astrophysical-discoveries-through-natural-language.pdf?sequence=1"]} +{"year":"2026","title":"Improving Sign Language-Gloss Translations with Pretrained Models","authors":["S Chen, Y Wang - IEEE Access, 2026"],"snippet":"Sign language gloss-to-text translation is a crucial task for enhancing communication and accessibility for the deaf community. However, the scarcity of annotated sign language data poses significant challenges for developing effective …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/11397303.pdf"]} +{"year":"2026","title":"Incentives and Creativity in Groups—Experimental Evidence on Creative Processes and Dimensions","authors":["E Sarrazin - 2026"],"snippet":"Creativity and teamwork are essential in today’s rapidly evolving labor market, yet little is known about how incentives shape creative group processes across multiple creativity dimensions, specifically quantity, quality, and originality. 
I introduce a novel …","url":["https://download.uni-mainz.de/RePEc/pdf/Discussion_Paper_2601.pdf"]} +{"year":"2026","title":"InfiniteWeb: Scalable Web Environment Synthesis for GUI Agent Training","authors":["Z Zhang, Z Wang, X Zhang, Z Guo, J Li, B Li, Y Lu - arXiv preprint arXiv:2601.04126, 2026"],"snippet":"… Since the original WebGen-Bench does not include design images, we match each test website with a design image extracted from Common Crawl based on website category. This design image is provided to all methods as input to enable …","url":["https://arxiv.org/pdf/2601.04126"]} +{"year":"2026","title":"Informed Consent for AI and Health Care","authors":["A Zimmerman"],"snippet":"… Common crawl and colossal clean crawled corpus (C4) are examples of data scraped from the internet. There are active debates about intellectual property rights infractions and this repurposing of the internet to capture its data. Restrictions on …","url":["https://link.springer.com/content/pdf/10.1007/978-3-032-11347-4.pdf"]} +{"year":"2026","title":"Integrating Semi-Supervised Learning And Ensemble Deep Learning For Low-Resource Deep Knowledge Tracing","authors":["UM Okechukwu - 2025"],"snippet":"As learning platforms scale, two obstacles prevent Deep Knowledge Tracing (DKT) from thriving in practice: the scarcity of labeled interactions and the opacity of predictions. We introduced a semi-supervised, ensemble DKT pipeline that trains …","url":["https://digitalcommons.pvamu.edu/cgi/viewcontent.cgi?article=2662&context=pvamu-theses"]} +{"year":"2026","title":"INTEGRATING SEMI-SUPERVISED LEARNING AND TRANSFORMER MODELS FOR OFFENSIVE LANGUAGE DETECTION IN ENGLISH–TAMIL CODE-MIXED …","authors":["LL Scientific - Journal of Theoretical and Applied Information …, 2026"],"snippet":"… XLM-RoBERTa (XLM-R) [21], which is trained on 2.5TB of CommonCrawl data in 100 languages, is capable of state-of-the-art performance in cross-lingual classifications as well as sequence labeling tasks. 
This model involves …","url":["http://www.jatit.org/volumes/Vol104No1/23Vol104No1.pdf"]} +{"year":"2026","title":"Integrating Vision and Language for Human Centered Interaction and Robotics","authors":["A Popescu, S Marinou, L Pereira, F Shyama - Authorea Preprints, 2025"],"snippet":"The convergence of computer vision and natural language processing has given rise to Multimodal Large Vision-Language Models (LVLMs), a new class of artificial intelligence systems capable of jointly perceiving, reasoning, and generating across …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.176721030.05388084"]} +{"year":"2026","title":"Interpretable computational cetaphor processing","authors":["S Wang - 2026"],"snippet":"Metaphors are a fundamental component of human language, enabling abstract reasoning, nuanced communication, and cultural expression. However, their inherent complexity poses significant challenges for natural language processing (NLP) …","url":["https://etheses.whiterose.ac.uk/id/eprint/38149/1/Thesis.pdf"]} +{"year":"2026","title":"Is the Golden Ticket Tainted? Testing Standardized Tests with AI","authors":["VK Suresh - 2026"],"snippet":"Standardized test scores are signals in college admissions, yet the measurement properties of these instruments receive little empirical attention. This paper uses large language models (LLMs) as stable external benchmarks to audit item-level …","url":["https://www.vikramsuresh.com/assets/files/main_ai_sat.pdf"]} +{"year":"2026","title":"ISSN 0347-948X E-publication< http://hdl. handle. 
net/2077/83731> Printed in Sweden by Stema Specialtryck AB 2024 Typeset in LATEX2ε by the author","authors":["F Morger"],"snippet":"… sents 3-lab Common Crawl yes 3 450 750 1 065 …","url":["https://gupea.ub.gu.se/server/api/core/bitstreams/44b1bf10-24d2-443a-ac17-678943272990/content"]} +{"year":"2026","title":"Iterative Structured Pruning for Large Language Models with Multi-Domain Calibration","authors":["G Wu, H Zhang, Z Zhibin, J Guo, X Cheng - arXiv preprint arXiv:2601.02674, 2026"],"snippet":"… We design a diverse calibration dataset that spans multiple domains, including Wikipedia articles, Common Crawl data, code repositories, and mathematical texts. This diversity enables the pruning process to generalize more effectively across a …","url":["https://arxiv.org/pdf/2601.02674"]} +{"year":"2026","title":"JOHAN MICHEL & SARA LINDBERG","authors":["J MICHEL"],"snippet":"… The dataset Bias in Bios was extracted from Common Crawl by filtering text lines … on an initial analysis of a small subset of Common Crawl. 
In some cases, closely …","url":["https://gupea.ub.gu.se/server/api/core/bitstreams/8b5e23f4-554e-4712-ae84-dd2c1b84aad7/content"]} +{"year":"2026","title":"Joint Selection for Large-Scale Pre-Training Data via Policy Gradient-based Mask Learning","authors":["Z Fan, Y Xian, Y Sun, L Shen - arXiv preprint arXiv:2512.24265, 2025"],"snippet":"… Since the original FineWeb dataset only provides the year of the Common Crawl snapshot to which each sample belongs, but not the text domains, we randomly sampled 1,000,000 samples from FineWeb, FineWeb-Edu, and our FineWeb-Mask …","url":["https://arxiv.org/pdf/2512.24265"]} +{"year":"2026","title":"Journey Through Language: Models and Prompt Engineering","authors":["R Akerkar - Artificial Intelligence: Transcending Traditional …, 2026"],"snippet":"This chapter explores the architecture, capabilities, and practical use of large language models (LLMs), with a particular focus on prompt engineering as a key technique for guiding model behaviour. It begins with foundational concepts …","url":["https://link.springer.com/chapter/10.1007/978-3-031-91084-5_7"]} +{"year":"2026","title":"KVzap: Fast, Adaptive, and Faithful KV Cache Pruning","authors":["S Jegou, M Jeblick - arXiv preprint arXiv:2601.07891, 2026"],"snippet":"… The dataset contains 27k prompts split into 9 subsets (common crawl, multilingual, math, code, etc.). We filtered prompts to a length of 750–1,250 tokens to minimize the impact of sequence lengths on attention weights and then selected up to 500 …","url":["https://arxiv.org/pdf/2601.07891"]} +{"year":"2026","title":"LAER-MoE: Load-Adaptive Expert Re-layout for Efficient Mixture-of-Experts Training","authors":["X Liu, Y Wang, F Fu, X Xiao, H Li, J Li, B Cui - arXiv preprint arXiv:2602.11686, 2026"],"snippet":"Expert parallelism is vital for effectively training Mixture-of-Experts (MoE) models, enabling different devices to host distinct experts, with each device processing different input data. 
However, during expert parallel training, dynamic routing results …","url":["https://arxiv.org/pdf/2602.11686"]} +{"year":"2026","title":"Language Model Memory and Memory Models for Language","authors":["BL Badger - arXiv preprint arXiv:2602.13466, 2026"],"snippet":"The ability of machine learning models to store input information in hidden layer vector embeddings, analogous to the concept of `memory', is widely employed but not well characterized. We find that language model embeddings typically contain …","url":["https://arxiv.org/pdf/2602.13466"]} +{"year":"2026","title":"Language specific web-crawling","authors":["EA Schillack - 2025"],"snippet":"Low resource language modelling remains a challenge in natural language processing, particularly for low resource languages with a limited digital presence. This thesis investigates sentence level language identification when given very short …","url":["https://scholar.sun.ac.za/items/b24398f6-a53d-4d82-9deb-03c094b7caa3"]} +{"year":"2026","title":"Language Using NLP and Computer","authors":["R Indhu, S Kruthika, LS Makkala, HS Sushmitha… - Computing and Machine …, 2026"],"snippet":"In an environment where spoken language is the primary means of communication, those who are nonverbal encounter considerable difficulties in express-ing themselves and establishing social connections. 
Our study aims to develop a real-time …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=Xd68EQAAQBAJ&oi=fnd&pg=PA70&dq=commoncrawl&ots=us3oy3UEOg&sig=REUj4yS-CRnCxUGF633hIOIcryo"]} +{"year":"2026","title":"Language Without Propositions: Why Large Language Models Hallucinate","authors":["J Mácha - 2026"],"snippet":"This paper defends the thesis that LLM hallucinations are best explained as a truth representation problem: Current models lack an internal representation of propositions as truth-bearers, so truth and falsity cannot constrain generation in the …","url":["https://www.preprints.org/frontend/manuscript/aa098da5355504323f975e40aea1e174/download_pub"]} +{"year":"2026","title":"Language-Dependent Communication Strategies in Multilingual Large Language Models: A Comparative Analysis of Russian and English Response Patterns in …","authors":["N Yampolski - 2026"],"snippet":"This study investigates whether language selection in multilingual large language models (LLMs) affects output quality beyond translation accuracy. Through controlled testing of 21 semantically equivalent query pairs in Russian and English …","url":["https://www.researchgate.net/profile/Nikita-Yampolski/publication/399670758_Language-Dependent_Communication_Strategies_in_Multilingual_Large_Language_Models_A_Comparative_Analysis_of_Russian_and_English_Response_Patterns_in_Mistral_AI/links/6964c3b65cc49c35ce7c1351/Language-Dependent-Communication-Strategies-in-Multilingual-Large-Language-Models-A-Comparative-Analysis-of-Russian-and-English-Response-Patterns-in-Mistral-AI.pdf"]} +{"year":"2026","title":"Language-Dependent Miscalibration in Multilingual LLM Evaluators","authors":["E Zhou, L Resck, Z Hui, A Korhonen"],"snippet":"Prompted LLM-as-a-Judge systems or trained reward models are typically validated using pairwise accuracy, under the assumption that high accuracy implies reliable and language-invariant evaluation. 
We demonstrate that multilingual LLM …","url":["https://openreview.net/pdf?id=vvnTA1aReN"]} +{"year":"2026","title":"Large AI Models Have a Prioritization Problem: Policy Implications and Solutions","authors":["JC Jackson, Y Liu, Z Wang, WJ Brady - Policy Insights from the Behavioral and Brain …, 2025"],"snippet":"Artificial intelligence (AI) models trained on large corpora must prioritize some information over others. Distilling vast data into simple user-friendly representations can lead to a prioritization problem, in which large AI models neglect crucial …","url":["https://journals.sagepub.com/doi/abs/10.1177/23727322251408311"]} +{"year":"2026","title":"Large Language Models (LLMS) for Clinical Note Generation: International Classification of Disease (ICD) Code, Knowledge Graph (KG) and Prompt Evaluation","authors":["IP Makohon - 2025"],"snippet":"In the past decade, a surge in the amount of electronic health record (EHR) data in the United States occurred, driven by a favorable policy environment created by the Health Information Technology for Economic and Clinical Health (HITECH) Act of …","url":["https://digitalcommons.odu.edu/cgi/viewcontent.cgi?article=1193&context=computerscience_etds"]} +{"year":"2026","title":"Large language models and conditional rules in clinical decision support systems","authors":["S Sivasothy, A Bingham, I Logothetis, S Barnett… - Health Information Science …, 2026"],"snippet":"Background Clinical Decision Support Systems (CDSS) improve patient outcomes and support sustainable health services by enhancing medical decisions. 
Developing rules for a CDSS is expensive due to delays in capturing and defining …","url":["https://link.springer.com/article/10.1007/s13755-026-00428-z"]} +{"year":"2026","title":"Large Language Models","authors":["WX Zhao, K Zhou, J Li, T Tang, JR Wen"],"snippet":"The release of ChatGPT in late November 2022 had a significant impact on the field of artificial intelligence (AI), quickly propelling large language models (LLMs) into the spotlight of both societal and academic discourse. This breakthrough represents …","url":["https://link.springer.com/content/pdf/10.1007/978-981-96-6259-3.pdf"]} +{"year":"2026","title":"Large Language Models: A Mathematical Formulation","authors":["R Baptista, A Stuart, S Tran - arXiv preprint arXiv:2601.22170, 2026"],"snippet":"Large language models (LLMs) process and predict sequences containing text to answer questions, and address tasks including document summarization, providing recommendations, writing software and solving quantitative problems. We provide a …","url":["https://arxiv.org/pdf/2601.22170"]} +{"year":"2026","title":"Large Language Models: A Survey of Architectures, Training Paradigms, and Alignment Methods","authors":["D Bhati, F Neha, DS Bandaru, M Weber, ID Gajera - 2026"],"snippet":"Large Language Models (LLMs) have become foundational to modern Artificial Intelligence (AI), enabling advanced reasoning, multimodal understanding, and scalable human-AI interaction across diverse domains. This survey provides a …","url":["https://www.preprints.org/frontend/manuscript/abaf3e6f650e2aeab454da56761a79aa/download_pub"]} +{"year":"2026","title":"Learning from Synthetic Data: Limitations of ERM","authors":["K Amin, A Bie, W Kong, U Syed, S Vassilvitskii - arXiv preprint arXiv:2601.15468, 2026"],"snippet":"The prevalence and low cost of LLMs have led to a rise of synthetic content. 
From review sites to court documents, ``natural'' content has been contaminated by data points that appear similar to natural data, but are in fact LLM-generated. In this work …","url":["https://arxiv.org/pdf/2601.15468"]} +{"year":"2026","title":"Learning to ask and answer in specialized documents: Exemplifying through modular integrated construction regulatory documents","authors":["Y Wei, X Li, Z Huang, Z Lin, N Zhang"],"snippet":"Large language models perform well in general question-answering tasks but face challenges in local contextual question-answering within specialized domains due to the high cost of domain-specific dataset curation and unstable model performance …","url":["https://www.researchgate.net/profile/Xiao-Li-11/publication/401252993_Learning_to_ask_and_answer_in_specialized_documents_Exemplifying_through_modular_integrated_construction_regulatory_documents/links/69a05a82e8fd5476473b225f/Learning-to-ask-and-answer-in-specialized-documents-Exemplifying-through-modular-integrated-construction-regulatory-documents.pdf"]} +{"year":"2026","title":"Less is More: Convergence Benefits of Fewer Data Weight Updates over Longer Horizon","authors":["R Das, N Patel, M Razaviyayn, V Mirrokni - arXiv preprint arXiv:2602.19510, 2026"],"snippet":"… In pre-training large foundation models, we are typically given access to m diverse training domains (eg, Wikipedia, GitHub, CommonCrawl), each associated with a loss function ℓi (i ∈ [m]). The goal is not simply to minimize the average loss …","url":["https://arxiv.org/pdf/2602.19510"]} +{"year":"2026","title":"Leveraging AI-Driven Speech Analysis for Legal Documentation","authors":["B Chikane, A Birajdar, A Fatima, C Badve, N Patel - … Harnessing AI/ML for Secured IoT …, 2026"],"snippet":"… Unlike proprietary models like GPT-3, Chinchilla, and PaLM, LLaMA is trained solely on open data sources such as CommonCrawl, C4, Wikipedia, and Stack Exchange, making it widely accessible for research and development. 
The study …","url":["https://link.springer.com/chapter/10.1007/978-3-032-05507-1_22"]} +{"year":"2026","title":"Lexical and Statistical Analysis of Bangla Newspaper and Literature: A Corpus-Driven Study on Diversity, Readability, and NLP Adaptation","authors":["P Bhattacharyya, A Bhattacharya - 2026"],"snippet":"In this paper, we present a comprehensive corpus-driven analysis of Bangla literary and newspaper texts to investigate their lexical diversity, structural complexity and readability. We undertook Vācaspati and IndicCorp, which are the most extensive …","url":["https://www.researchgate.net/profile/Pramit-Bhattacharyya-2/publication/399490050_Lexical_and_Statistical_Analysis_of_Bangla_Newspaper_and_Literature_A_Corpus-Driven_Study_on_Diversity_Readability_and_NLP_Adaptation/links/695d0d240c98040d48284968/Lexical-and-Statistical-Analysis-of-Bangla-Newspaper-and-Literature-A-Corpus-Driven-Study-on-Diversity-Readability-and-NLP-Adaptation.pdf"]} +{"year":"2026","title":"LingGen: Linguistic Fine-grained Controlled Generation","authors":["M Elgaar, H Amiri"],"snippet":"We present LingGen, a novel controlled text generation system that enables precise control over a variable number of linguistic attributes through a dedicated attribute embedding network and optimized attribute integration mechanisms. Such fine-grained …","url":["https://mohdelgaar.github.io/files/papers/linggen.pdf"]} +{"year":"2026","title":"LiteToken: Removing Intermediate Merge Residues From BPE Tokenizers","authors":["Y Sun, H Yang, Z Lin, M Zhang - arXiv preprint arXiv:2602.04706, 2026"],"snippet":"Tokenization is fundamental to how language models represent and process text, yet the behavior of widely used BPE tokenizers has received far less study than model architectures and training. 
In this paper, we investigate intermediate merge …","url":["https://arxiv.org/pdf/2602.04706"]} +{"year":"2026","title":"LLM Security in Cloud-Native Architectures: A Comprehensive Survey of Attacks, Defenses, and Operational Challenges","authors":["MY Malik - Authorea Preprints, 2026"],"snippet":"The rapid adoption of Large Language Models (LLMs) in cloud-native environments has introduced unprecedented security challenges that traditional cybersecurity frameworks fail to adequately address. This survey provides a comprehensive …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.177130593.35890269"]} +{"year":"2026","title":"LLM sentiment quantification reveals selective alignment with human course-evaluation raters","authors":["JW Lacy, C Nnoka, Z Jock, C Morreale - Computers and Education: Artificial …, 2026"],"snippet":"Student course evaluations contain rich qualitative feedback in the form of comments written in response to open-ended questions. However, this qualitative data, which may be more nuanced and detailed than quantitative ratings, is often …","url":["https://www.sciencedirect.com/science/article/pii/S2666920X26000068"]} +{"year":"2026","title":"LLM-based Exploitation of Edge Data in Modern Power Systems","authors":["M Liang, Q Luo, T Yu, P Kuang, Z Li, Z Pan - Journal of Modern Power Systems and …, 2025"],"snippet":"The modern power systems face challenges, including high proportions of uncertain renewable energy, rapid dynamics of power electronics, and decentralized control among multiple entities. 
Digital development has enabled power grids to integrate …","url":["https://ieeexplore.ieee.org/iel8/8685265/8816648/11268946.pdf"]} +{"year":"2026","title":"LLMOrbit: A Circular Taxonomy of Large Language Models-From Scaling Walls to Agentic AI Systems","authors":["BN Patro, VS Agneeswaran - arXiv preprint arXiv:2601.14053, 2026"],"snippet":"The field of artificial intelligence has undergone a revolution from foundational Transformer architectures to reasoning-capable systems approaching human-level performance. We present LLMOrbit, a comprehensive circular taxonomy navigating …","url":["https://arxiv.org/pdf/2601.14053"]} +{"year":"2026","title":"LLMs as Cultural Archives: Cultural Commonsense Knowledge Graph Extraction","authors":["JC Tonga, CC Liu, I Gurevych, F Koto - arXiv preprint arXiv:2601.17971, 2026"],"snippet":"Large language models (LLMs) encode rich cultural knowledge learned from diverse web-scale data, offering an unprecedented opportunity to model cultural commonsense at scale. Yet this knowledge remains mostly implicit and unstructured …","url":["https://arxiv.org/pdf/2601.17971"]} +{"year":"2026","title":"Locating the missing large-scale emission in the jet of M87* with short EHT baselines","authors":["B Georgiev, P Tiede, SD von Fellenberg, M Janssen… - arXiv preprint arXiv …, 2026"],"snippet":"In Very-Long Baseline Interferometric arrays, nearly co-located stations probe the largest scales and typically cannot resolve the observed source. In the absence of large-scale structure, closure phases constructed with these stations are zero and …","url":["https://arxiv.org/pdf/2601.13356"]} +{"year":"2026","title":"Logarithmic-time Schedules for Scaling Language Models with Momentum","authors":["D Ferbach, C Paquette, G Gidel, K Everett, E Paquette - arXiv preprint arXiv …, 2026"],"snippet":"In practice, the hyperparameters $(\\beta_1, \\beta_2)$ and weight-decay $\\lambda$ in AdamW are typically kept at fixed values. 
Is there any reason to do otherwise? We show that for large-scale language model training, the answer is yes: by exploiting …","url":["https://arxiv.org/pdf/2602.05298"]} +{"year":"2026","title":"Long-Tail Knowledge in Large Language Models: Taxonomy, Mechanisms, Interventions and Implications","authors":["S Badhe, D Shah, N Kathrotia - arXiv preprint arXiv:2602.16201, 2026"],"snippet":"… Current definitions largely rely on frequency counts in pre-training corpora such as Common Crawl, which fail to capture domain-specific importance or informational density [53]. What constitutes a rare fact in a general web corpus may be …","url":["https://arxiv.org/pdf/2602.16201"]} +{"year":"2026","title":"Lotus: Efficient LLM Training by Randomized Low-Rank Gradient Projection with Adaptive Subspace Switching","authors":["T Miao, Z Bao, L Zhang - arXiv preprint arXiv:2602.01233, 2026"],"snippet":"… To evaluate the effectiveness of Lotus, we pre-train LLaMA models of varying sizes on the C4 dataset—a widely used cleaned version of the Common Crawl corpus, following GaLore’s experimental settings and using perplexity as the primary …","url":["https://arxiv.org/pdf/2602.01233"]} +{"year":"2026","title":"Low-Resource Balochi Named Entity Recognition: Corpus Construction and Multilingual Transformer Evaluation","authors":["N Basir, RA Vighio, B Ansari, DN Arain, A Ali - The Asian Bulletin of Big Data …, 2026"],"snippet":"Named Entity Recognition (NER) remains largely unexplored for Balochi, a morphologically complex, low-resource language spoken by approximately 8 to 10 million population across Pakistan, Iran, and Afghanistan. 
This paper makes three …","url":["https://www.abbdm.net/index.php/Journal/article/download/462/461"]} +{"year":"2026","title":"MABPE: MORPHOLOGY-AWARE BYTE PAIR ENCODING FOR ASSAMESE TOKENIZATION","authors":["MP Lahkar, U Sharma, T Pradhan"],"snippet":"Developing effective tokenization methods is crucial for the success of large language models (LLMs), especially for underrepresented languages like Assamese. This study explores the use of Byte Pair Encoding (BPE) as a tokenization technique …","url":["https://www.researchgate.net/profile/Manash-Lahkar/publication/401156137_MABPE_MORPHOLOGY-AWARE_BYTE_PAIR_ENCODING_FOR_ASSAMESE_TOKENIZATION/links/699dedb342f94d1212ae567d/MABPE-MORPHOLOGY-AWARE-BYTE-PAIR-ENCODING-FOR-ASSAMESE-TOKENIZATION.pdf"]} +{"year":"2026","title":"Machine Learning and Circular Bioeconomy Transforming Sustainability Through Intelligent Systems: AI-Driven Change in Circular Bioeconomy Systems","authors":["A Bansal, A Sharma, S Parashar, AK Sharma, S Vats - Circular and Bioeconomy …, 2026"],"snippet":"… They extracted and processed text from the websites of more than 678,000 businesses using the CommonCrawl dataset. They classified 142,949 businesses as belonging to the bioeconomy after identifying businesses engaged in biobased …","url":["https://www.igi-global.com/chapter/machine-learning-and-circular-bioeconomy-transforming-sustainability-through-intelligent-systems/403546"]} +{"year":"2026","title":"Machine Learning-Based Theme Classification for Video Content Analysis: A Bilingual Approach on the StoryBox","authors":["H Parmaksız, Ö Öztürk, O Akarsu - Artificial Intelligence and Applications, 2025"],"snippet":"This research introduces an advanced hybrid machine learning framework for the automatic thematic classification of video content in a bilingual (Turkish–English) setting, with a particular focus on the YouTube StoryBox dataset (172 videos). 
The …","url":["https://ojs.bonviewpress.com/index.php/AIA/article/download/6942/1809"]} +{"year":"2026","title":"MacrOData: New Benchmarks of Thousands of Datasets for Tabular Outlier Detection","authors":["X Ding, S Klüttermann, H Wen, Y Chen, L Akoglu - arXiv preprint arXiv:2602.09329, 2026"],"snippet":"Quality benchmarks are essential for fairly and accurately tracking scientific progress and enabling practitioners to make informed methodological choices. Outlier detection (OD) on tabular data underpins numerous real-world applications, yet …","url":["https://arxiv.org/pdf/2602.09329"]} +{"year":"2026","title":"MAGA-Bench: Machine-Augment-Generated Text via Alignment Detection Benchmark","authors":["A Song, Y Cheng, Y Xu, R Feng - arXiv preprint arXiv:2601.04633, 2026"],"snippet":"Large Language Models (LLMs) alignment is constantly evolving. Machine-Generated Text (MGT) is becoming increasingly difficult to distinguish from Human-Written Text (HWT). This has exacerbated abuse issues such as fake news and online fraud. Fine-tuned …","url":["https://arxiv.org/pdf/2601.04633"]} +{"year":"2026","title":"MDMIC—An Augmented Indic Corpus and Joint Multitask Attention-Based Fusion Framework for Cross-Domain, Multi-Intent NLU in LoRes Languages","authors":["K Mitra, AV Kolasani, PS Shruthi, K Chelliah… - IEEE Access, 2026"],"snippet":"… XLM-R is a strong multilingual encoder trained on 2.5 TB of CommonCrawl data. Its robust cross-lingual alignment and script coverage make it a state-of-the-art baseline for multilingual NLU, particularly for diverse Indic languages. 
As per the …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/11395942.pdf"]} +{"year":"2026","title":"Measuring Online Media Ideology with Large Language Models and\" Multi-Cue Classification\"","authors":["LP da Silva - 2026"],"snippet":"… First, I obtain the corpus by modifying a Python crawler to extract one million media articles from Common Crawl News, a repository of most media articles1 on the … These come from an even larger dataset of 1 million articles that I randomly …","url":["https://osf.io/download/zmtqp/"]} +{"year":"2026","title":"Mecellem Models: Turkish Models Trained from Scratch and Continually Pre-trained for the Legal Domain","authors":["Ö Uğur, M Göksu, M Çimen, M Yılmaz, E Şavirdi… - arXiv preprint arXiv …, 2026"],"snippet":"This paper presents Mecellem models, a framework for developing specialized language models for the Turkish legal domain through domain adaptation strategies. We make two contributions: (1)Encoder Model Pre-trained from Scratch …","url":["https://arxiv.org/pdf/2601.16018"]} +{"year":"2026","title":"MedXIAOHE: A Comprehensive Recipe for Building Medical MLLMs","authors":["B Shi, B Cui, B Jiang, D Yu, F Qian, H Yang, H Wang… - arXiv preprint arXiv …, 2026"],"snippet":"We present MedXIAOHE, a medical vision-language foundation model designed to advance general-purpose medical understanding and reasoning in real-world clinical applications. MedXIAOHE achieves state-of-the-art performance across …","url":["https://arxiv.org/pdf/2602.12705"]} +{"year":"2026","title":"Memorization Dynamics in Knowledge Distillation for Language Models","authors":["J Borkar, K Chadha, N Mireshghallah, Y Zhang… - arXiv preprint arXiv …, 2026"],"snippet":"… For the primary setup, we use 1M examples with a sequence length of 256 tokens from the July 2025 Common Crawl dump of FineWeb as our dataset D. 
We first fine-tune the Pythia 12B base model on D using cross-entropy loss to obtain the teacher …","url":["https://arxiv.org/pdf/2601.15394"]} +{"year":"2026","title":"Mi: dm 2.0 Korea-centric Bilingual Language Models","authors":["D Shin, S Lee, S Bae, H Ryu, C Ok, H Jung, H Ji, J Lim… - arXiv preprint arXiv …, 2026"],"snippet":"… For the subsequent step, we design a source-specific refinement pipeline for non-common Crawl Korean datasets, such as those securely acquired from books, encyclopedias, academic papers, expert knowledge databases, and licensed news articles. This …","url":["https://arxiv.org/pdf/2601.09066"]} +{"year":"2026","title":"Micron® MRDIMM Technology Delivers Performance and Efficiency","authors":["H Pötter, S Gomatam, M Arif, S Somandepalli…"],"snippet":"This report comprehensively evaluates Micron® Multiplexed Rank Dual In-line Memory Module (MRDIMM) technology, emphasizing its impact on memory subsystem latency, bandwidth, and system-level power efficiency across a range of …","url":["https://www.micron.com/content/dam/micron/global/public/products/memory/dram-modules/mrdimm/documents/mrdimm-white-paper.pdf"]} +{"year":"2026","title":"MiroFlow: Towards High-Performance and Robust Open-Source Agent Framework for General Deep Research Tasks","authors":["S Su, S Xing, X Dong, M Zhong, B Wang, X Zhu… - arXiv preprint arXiv …, 2026"],"snippet":"Despite the remarkable progress of large language models (LLMs), the capabilities of standalone LLMs have begun to plateau when tackling real-world, complex tasks that require interaction with external tools and dynamic environments. 
Although …","url":["https://arxiv.org/pdf/2602.22808"]} +{"year":"2026","title":"Mitigating Preference Leakage via Strict Estimator Separation for Normative Generative Ranking","authors":["D Nahhas, X Cai, I Razzak, S Jameel - arXiv preprint arXiv:2602.20800, 2026"],"snippet":"In Generative Information Retrieval (GenIR), the bottleneck has shifted from generation to the selection of candidates, particularly for normative criteria such as cultural relevance. Current LLM-as-a-Judge evaluations often suffer from circularity …","url":["https://arxiv.org/pdf/2602.20800"]} +{"year":"2026","title":"Moving Beyond Sparse Grounding with Complete Screen Parsing Supervision","authors":["AS Gurbuz, S Hong, A Nassar, M Pollefeys, P Staar - arXiv preprint arXiv:2602.14276, 2026"],"snippet":"… This dataset aggregates URLs from multiple sources, including Common Crawl, Alexa Top Sites, and public domain lists. We then curate a balanced subset of URLs spanning various categories (eg, e-commerce, news, social media, blogs) to ensure …","url":["https://arxiv.org/pdf/2602.14276"]} +{"year":"2026","title":"MrBERT: Modern Multilingual Encoders via Vocabulary, Domain, and Dimensional Adaptation","authors":["D Tamayo, I Lacunza, P Rivera-Hidalgo, S Da Dalt… - arXiv preprint arXiv …, 2026"],"snippet":"We introduce MrBERT, a family of 150M-300M parameter encoders built on the ModernBERT architecture and pre-trained on 35 languages and code. Through targeted adaptation, this model family achieves state-of-the-art results on Catalan …","url":["https://arxiv.org/pdf/2602.21379"]} +{"year":"2026","title":"MTT-TKG: Multi-Time-Gate, Time-Aware, and Time-Guided Representation Learning for TKGs","authors":["Q Liu, S Feng, M Huang, UA Bhatti, MK Khan - IEEE Internet of Things Journal, 2026"],"snippet":"Temporal Knowledge Graph (TKG) representation learning embeds entities and relations into a low-dimensional space while preserving relational structures across time steps. 
Existing methods often neglect the critical role of timestamps in capturing …","url":["https://ieeexplore.ieee.org/abstract/document/11357906/"]} +{"year":"2026","title":"Multi-Document Summarization: From Ideational to Opinionated Sources","authors":["M Li - 2025"],"snippet":"… To retrieve more input documents in each cluster, WCEP includes similar articles in the Common Crawl News dataset2. Because of its training data size, these two datasets are popularly used in MDS research. However, both datasets have …","url":["https://minerva-access.unimelb.edu.au/bitstreams/e27c9a41-4b18-4bb0-a5fb-b13c815ecc32/download"]} +{"year":"2026","title":"Multi-view Multi-label Canonical Correlation Analysis with Dual Correlations for Cross-modal Multimedia Retrieval","authors":["A Rani, Y Verma - International Journal of Multimedia Information …, 2026"],"snippet":"… pretrained over Common Crawl and Wikipedia corpus for computing representation of German and Spanish text (in the IAPRTC-12 dataset). In case of the Flickr dataset, due to the presence of multilingual tags, we use the fastText …","url":["https://link.springer.com/article/10.1007/s13735-025-00384-6"]} +{"year":"2026","title":"Multimodal Foundation Models","authors":["L Lin, Y Liu - Multimodal Large Models: A New Paradigm of Artificial …, 2026"],"snippet":"… LLaMA preprocesses five CommonCrawl datasets from 2017 to 2020 using the CCNet pipeline [757]. 
This pipeline performs row-level deduplication, employs FastText linear classifiers for language identification to remove non-English pages …","url":["https://link.springer.com/chapter/10.1007/978-981-95-4929-0_3"]} +{"year":"2026","title":"Multimodal Large Models","authors":["L Lin, Y Liu"],"snippet":"… GPT-3 [61]: Released in May 2020, it utilized 175 billion parameters and 45 TB of CommonCrawl data for massive-scale learning—over … data and CommonCrawl data as low-quality data, train a simple logistic regression model to assess data …","url":["https://link.springer.com/content/pdf/10.1007/978-981-95-4929-0.pdf"]} +{"year":"2026","title":"Multimodal Large Models: A New Paradigm of Artificial Intelligence","authors":["L Lin"],"snippet":"… GPT-2 was pre-trained on larger text datasets including CommonCrawl, WebText, and BooksCorpus. Compared to GPT-1, GPT-2 showed … GPT-3 [61]: Released in May 2020, it utilized 175 billion parameters and 45 TB of CommonCrawl data for …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=LcHAEQAAQBAJ&oi=fnd&pg=PR6&dq=commoncrawl&ots=zHslsni1nJ&sig=7pLEHDkbH_gU-R_EguY3YZXXtIo"]} +{"year":"2026","title":"Multitask Learning of Semantic Role Labeling and Named Entity Recognition for domain-specific documents from the Dutch East-India Company archives","authors":["H Goossens"],"snippet":"This thesis investigates the effect of Multitask Learning (MTL) on Semantic Role Labeling (SRL) using annotated documents from the Dutch East-India Company (VOC) archives, written in Early Modern Dutch. 
Several Transformer-based models are …","url":["https://home.cltl.labs.vu.nl/static/data/theses/thesis_Hannah_Goossens_2025.pdf"]} +{"year":"2026","title":"MUTEX: Leveraging Multilingual Transformers and Conditional Random Fields for Enhanced Urdu Toxic Span Detection","authors":["I Arshad, F Saleem, I Hussain - arXiv preprint arXiv:2603.05057, 2026"],"snippet":"… XLM-RoBERTa is a multi-lingual transformer, which was trained on the CommonCrawl data and it provides powerful multi-lingual representation especially in low-resource languages [15]. RoBERTa-Large is added to provide the baseline …","url":["https://arxiv.org/pdf/2603.05057"]} +{"year":"2026","title":"Name It to Map It: Exploring How Onomastic Databases Enable AI to Navigate Culture and Identity","authors":["E Schochenmaier"],"snippet":"In the rapidly evolving field of artificial intelligence, the synergy between linguistics and AI is unlocking new potentials, especially in the realm of named entity recognition (NER) and machine learning (CARSENAT–SHOKHENMAYER 2016 …","url":["https://www.researchgate.net/profile/Eugen-Schochenmaier/publication/399411512_Name_It_to_Map_It_Exploring_How_Onomastic_Databases_Enable_AI_to_Navigate_Culture_and_Identity/links/6959643f27359023a013170d/Name-It-to-Map-It-Exploring-How-Onomastic-Databases-Enable-AI-to-Navigate-Culture-and-Identity.pdf"]} +{"year":"2026","title":"Natural Language Processing for Multilingual and Low-Resource Languages","authors":["J Karthikeyan, MY Sayed, Y Waykar, V Sathya… - 2025 IEEE 7th International …, 2025"],"snippet":"Multilingual and low-resource NLP is a challenging task as it suffers from the lack of labelled data, high annotation cost, and the paucity of language-specific resources. 
We present an integrated framework that uses pre-trained multilingual encoders …","url":["https://ieeexplore.ieee.org/abstract/document/11325195/"]} +{"year":"2026","title":"Natural Language Processing in the Era of Large Language Models: Foundations, Integration, and Low-Resource Frontiers","authors":["M Gottam - 2026"],"snippet":"Large Language Models (LLMs) have fundamentally transformed the landscape of Natural Language Processing (NLP), subsuming and redefining tasks that were once addressed by specialized, modular pipelines. This paper surveys the role of …","url":["https://www.preprints.org/frontend/manuscript/18e8902d287d8bf70d7657c469bf7474/download_pub"]} +{"year":"2026","title":"Natural Language Processing Resources for Tamil Language: A Systematic Review.","authors":["M Tissera, H Saadhiq - Journal of Information & Communication Convergence …, 2025"],"snippet":"… The OSCAR corpus [13], with 226 million words scraped from CommonCrawl, supports large-scale models such as MuRIL [14], which was trained in multiple Indian languages, including Tamil. By combining OSCAR with Wikipedia and …","url":["https://search.ebscohost.com/login.aspx?direct=true&profile=ehost&scope=site&authtype=crawler&jrnl=22348255&AN=190562249&h=WDbUGS4I8q7Pw7IzOIoQl0ZA7rrgtIQy5mNFiTmnFJFBHy%2BHvm1qf5KzL3W%2BuKj6UG0mfsI%2F8qJmd2R1Y5p%2BcA%3D%3D&crl=c"]} +{"year":"2026","title":"Natural Language Processing: Key Applications and Modern Advances","authors":["S Yang"],"snippet":"Natural Language Processing (NLP) is a core subfield of artificial intelligence that enables machines to understand, interpret, and generate human language in a semantically meaningful way. 
This paper provides a concise overview of NLP's …","url":["https://openreview.net/pdf?id=TSOPN3zx8E"]} +{"year":"2026","title":"Navigating Copyright, Data Governance, and AI Training: Multijurisdictional Policy Assessment","authors":["M Bapat, I Kul - International Conference on Socio Legal Intricacies of …, 2026"],"snippet":"… For example, Common Crawl is a non-profit initiative. It continuously crawls and archives larger sections of the internet and collects publicly accessible data. Because it aggregates material from open web sources and articles, it often contains …","url":["https://www.atlantis-press.com/article/126021996.pdf"]} +{"year":"2026","title":"Navigating the Ocean of Language Model Training Data","authors":["J Liu - 2025"],"snippet":"One crucial step toward understanding large language models (LLMs) is to understand their training data. Modern LLMs are trained on text corpora with trillions of tokens, hindering them from being easily analyzed. In this thesis, I discuss my …","url":["https://search.proquest.com/openview/723276a25a6df19e8ba9486a4fd5f00d/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"Neural FOXP2--Language Specific Neuron Steering for Targeted Language Improvement in LLMs","authors":["A Saha, T Joshi, V Jain, A Chadha, A Das - arXiv preprint arXiv:2602.00945, 2026"],"snippet":"LLMs are multilingual by training, yet their lingua franca is often English, reflecting English language dominance in pretraining. Other languages remain in parametric memory but are systematically suppressed. We argue that language defaultness is …","url":["https://arxiv.org/pdf/2602.00945"]} +{"year":"2026","title":"NorwAI's Large Language Models: Technical Report","authors":["JA Gulla, P Liu, L Zhang - arXiv preprint arXiv:2601.03034, 2026"],"snippet":"Norwegian, spoken by approximately five million people, remains underrepresented in many of the most significant breakthroughs in Natural Language Processing (NLP). 
To address this gap, the NorLLM team at NorwAI has developed a family of models …","url":["https://arxiv.org/pdf/2601.03034"]} +{"year":"2026","title":"of Thesis: Assessment of the State-of-the-art Benchmarks","authors":["L Horníková"],"snippet":"Large language models have demonstrated impressive performance on various downstream tasks, hinting at possible reasoning capabilities. To assess whether these models can reason beyond surface-level inference, various benchmarks have …","url":["https://is.muni.cz/th/f0evs/Assessment_of_the_State-of-the-art_Benchmarks_Used_to_Evaluate_Social_Reasoning_and_Theory_of_Mind_in_LLMs.pdf"]} +{"year":"2026","title":"On Knowledge in AI: Epistemic and Ethical Limitations of Language Models and Knowledge Graphs","authors":["A Kraft - 2025"],"snippet":"This thesis critically investigates whether or not AI-based knowledge technology built on language models, knowledge graphs, and/or knowledge-enhanced language models deserves the epistemic authority it happens to receive and …","url":["https://ediss.sub.uni-hamburg.de/bitstream/ediss/12231/1/dissertation_angelie_kraft.pdf"]} +{"year":"2026","title":"On the Retention of Edited Knowledge in Fine-Tuned Lan-guage Models","authors":["F Wen, S Zhang"],"snippet":"… Specifically, for our experiments on the GPT-2 XL, which was pre-trained on webtext, we choose the Common Crawl dataset for fine-tuning. 
Since a small subset of the data suffices to demonstrate the influence, we sampled 60k data for our …","url":["https://origen-workshop.github.io/assets/pdf/2025/papers/13_On_the_Retention_of_Edited_.pdf"]} +{"year":"2026","title":"On the Spectral Flattening of Quantized Embeddings","authors":["J Huang, W Fang, Z Tang, Y Wang, X Kang, Y Zheng… - arXiv preprint arXiv …, 2026"],"snippet":"Training Large Language Models (LLMs) at ultra-low precision is critically impeded by instability rooted in the conflict between discrete quantization constraints and the intrinsic heavy-tailed spectral nature of linguistic data. By formalizing the connection …","url":["https://arxiv.org/pdf/2602.00969"]} +{"year":"2026","title":"One Word is Enough: Minimal Adversarial Perturbations for Neural Text Ranking","authors":["T Karmakar, S Saha, D Majumdar, S Halder - arXiv preprint arXiv:2601.20283, 2026"],"snippet":"Neural ranking models (NRMs) achieve strong retrieval effectiveness, yet prior work has shown they are vulnerable to adversarial perturbations. We revisit this robustness question with a minimal, query-aware attack that promotes a target …","url":["https://arxiv.org/pdf/2601.20283"]} +{"year":"2026","title":"Open LLM Projects Should Allocate More Compute for Data Than Training","authors":["M Idahl"],"snippet":"Open LLM projects aim to build the best possible open language models under constrained compute budgets. Currently, most allocate the vast majority of their GPU compute to training runs rather than better data. This position paper argues that …","url":["https://openreview.net/pdf?id=YdvGgmIJRH"]} +{"year":"2026","title":"Optimal Splitting of Language Models from Mixtures to Specialized Domains","authors":["S Seto, P Ablin, A Filippova, J Ye, L Béthune…"],"snippet":"Language models achieve impressive performance on a variety of knowledge, language, and reasoning tasks due to the scale and diversity of pretraining data available. 
The standard training recipe is a two-stage paradigm: pretraining first on …","url":["https://openreview.net/pdf?id=jouSp5cLqC"]} +{"year":"2026","title":"OPTIMAL-EM: Complexity-Driven Clustering for Optimised Web Accessibility Evaluation","authors":["A Hambley, Y Yesilada, M Vigo, S Harper - ACM Transactions on the Web, 2026"],"snippet":"… For validation, we used Common Crawl5 to rank the top 300 sites by harmonic centrality, narrowing the list to 173 and selecting 10 for conirmatory evaluations. From this, we randomly chose 500 pages from three websites (The University of …","url":["https://dl.acm.org/doi/pdf/10.1145/3799797"]} +{"year":"2026","title":"Optimization of artificial intelligence natural language processing model based on deep neural network","authors":["M Jia - IEEE Access, 2025"],"snippet":"This paper proposes a novel framework integrating Hierarchical Relational Transformer (HRT) and Adaptive Semantic Calibration (ASC) to address challenges in semantic coherence, transferability, and interpretability, especially in multilingual …","url":["https://ieeexplore.ieee.org/iel8/6287639/6514899/11296800.pdf"]} +{"year":"2026","title":"Optimizing Assamese information retrieval using classification and embedding techniques","authors":["MP Lahkar, U Sharma, T Pradhan - International Journal of Machine Learning and …, 2026"],"snippet":"… For implementation, we use pre-trained FastText embeddings trained on Common Crawl and Wikipedia corpora for Assamese. Each embedding has 300 dimensions and captures both syntactic and semantic nuances. 
The TF-IDF …","url":["https://link.springer.com/article/10.1007/s13042-025-02950-0"]} +{"year":"2026","title":"Optimizing Content Generation with AI and Web Technologies: A Scalable Framework","authors":["K Prabu, A Alagarsamy, GK Kushwaha, VK Sharma… - … International Conference on …, 2025"],"snippet":"The demand for AI-driven content generation has significantly increased due to the growing need for scalable and high-quality digital content across industries such as marketing, education, and entertainment. This paper presents the development of a …","url":["https://ieeexplore.ieee.org/abstract/document/11320626/"]} +{"year":"2026","title":"Optimizing Nearest-Neighbor Search with Seed Forwarding in Serial Multi-Graph Vector Indexes","authors":["정승혁, 한대규, 구헤라, 남범석 - Journal of KIISE, 2025"],"snippet":"데이터가 다수의 리포지토리에 분리되어있는 다중 벡터스토어 환경에서, 그래프 기반 근사 최근접 탐색(ANN)은 시간당 처리량 면에서 한계점이 존재한다. 본 연구는 이러한 한계를 해소하기 위해 시드 포워딩 (seed forwarding) 기법을 제안한다. 시드 포워딩은 …","url":["https://www.dbpia.co.kr/Journal/articleDetail?nodeId=NODE12543697"]} +{"year":"2026","title":"OPUS: Towards Efficient and Principled Data Selection in Large Language Model Pre-training in Every Iteration","authors":["S Wang, X Ouyang, T Xu, Y Hu, J Liu, G Chen, T Zhang… - arXiv preprint arXiv …, 2026"],"snippet":"As high-quality public text approaches exhaustion, a phenomenon known as the Data Wall, pre-training is shifting from more tokens to better tokens. However, existing methods either rely on heuristic static filters that ignore training dynamics, or …","url":["https://arxiv.org/pdf/2602.05400"]} +{"year":"2026","title":"OPUS: Towards Principled and Scalable Data Selection for Large Language Model Pre-training in Every Iteration","authors":["S Wang, X Ouyang, T Xu, Y Hu, J Liu, G Chen, T Zhang…"],"snippet":"As high-quality public text approaches exhaustion, a phenomenon known as the Data Wall—LLM pre-training is shifting from more tokens to better tokens. 
However, existing methods either rely on heuristic static filters that ignore training dynamics, or …","url":["https://openreview.net/pdf?id=FEfuE1mAB6"]} +{"year":"2026","title":"Oral History Encounters AI: An Exploration of Core Principles and Best Practices, Context and Consent","authors":["M Larson - The Oral History Review, 2026"],"snippet":"… Some basic information was revealed for the model for OpenAI’s GPT-3, so we know that in 2020, the weighted training data used was 60 percent Common Crawl (ie, acquired by harvesting existing sources on the internet), 22 percent WebText2 (drawn …","url":["https://www.tandfonline.com/doi/abs/10.1080/00940798.2026.2625663"]} +{"year":"2026","title":"Ordinary, Reasonable Chatbots: Do AI Models Track Human Legal Judgments?","authors":["N PATEL, E WENGER, B CHRISTOPHER"],"snippet":"… This is likely amplified by the possibility that many frontier LLMs draw from overlapping web-scraped “data commons,” especially large, shared corpora such as those derived from Common Crawl [29]. More broadly, because AI training …","url":["https://arguslab.pratt.duke.edu/documents/llm_reasonableness.pdf"]} +{"year":"2026","title":"Outlier and collapse: The enron corpus and foundation model training data","authors":["Z Zimmer - Big Data & Society, 2026"],"snippet":"… on the massive, publicly available data sets like the Common Crawl used to build many of the contemporary large language models (LLMs); … These training datasets are huge—Common Crawl indexes over 2 billion webpages, and ImageNet …","url":["https://journals.sagepub.com/doi/pdf/10.1177/20539517261421474"]} +{"year":"2026","title":"Over-Searching in Search-Augmented Large Language Models","authors":["R Xie, D Gopinath, D Qiu, D Lin, H Sun, S Potdar… - arXiv preprint arXiv …, 2026"],"snippet":"Search-augmented large language models (LLMs) excel at knowledge-intensive tasks by integrating external retrieval. 
However, they often over-search -- unnecessarily invoking search tool even when it does not improve response quality …","url":["https://arxiv.org/pdf/2601.05503"]} +{"year":"2026","title":"Overview of the TREC 2025 RAGTIME Track","authors":["D Lawrie, S MacAvaney, J Mayfield, L Soldaini, E Yang… - arXiv preprint arXiv …, 2026"],"snippet":"… The documents were obtained by the CommonCrawl service between August 1, 2021 and July 31, 2024. Text was extracted from each source web page Language id performed by GlotLID v3, which the FineWeb folks at HuggingFace recommended …","url":["https://arxiv.org/pdf/2602.10024"]} +{"year":"2026","title":"Pantagruel: Unified Self-Supervised Encoders for French Text and Speech","authors":["PH Le, V Pelloin, A Chatelain, M Bouziane, M Ghennai… - arXiv preprint arXiv …, 2026"],"snippet":"We release Pantagruel models, a new family of self-supervised encoder models for French text and speech. Instead of predicting modality-tailored targets such as textual tokens or speech units, Pantagruel learns contextualized target …","url":["https://arxiv.org/pdf/2601.05911"]} +{"year":"2026","title":"PerSoMed: A Large-Scale Balanced Dataset for Persian Social Media Text Classification","authors":["I Chehreh, E Ansari - arXiv preprint arXiv:2602.19333, 2026"],"snippet":"This research introduces the first large-scale, well-balanced Persian social media text classification dataset, specifically designed to address the lack of comprehensive resources in this domain. The dataset comprises 36,000 posts …","url":["https://arxiv.org/pdf/2602.19333"]} +{"year":"2026","title":"Phishcatcher 2.0: Real-Time Adaptive Client-Side Defense Against Sophisticated Web Spoofing Attacks","authors":["DP Priya, K Naveen - Milestone Transactions on Artificial Intelligence, 2026"],"snippet":"… This model has been trained using the data from PhishTank and Common Crawl. It has a capability of accuracy of 97.4%. However, this model has a very high computational cost. 
… Transformer-based phishing detector PhishTank, Common Crawl …","url":["https://www.milestoneresearch.in/JOURNALS/index.php/MAI/article/download/282/276"]} +{"year":"2026","title":"PII-VisBench: Evaluating Personally Identifiable Information Safety in Vision Language Models Along a Continuum of Visibility","authors":["GM Shahariar, ZA Nazi, MOH Bhuiyan, Z Shi - arXiv preprint arXiv:2601.05739, 2026"],"snippet":"Vision Language Models (VLMs) are increasingly integrated into privacy-critical domains, yet existing evaluations of personally identifiable information (PII) leakage largely treat privacy as a static extraction task and ignore how a subject's online …","url":["https://arxiv.org/pdf/2601.05739"]} +{"year":"2026","title":"POET-X: Memory-efficient LLM Training by Scaling Orthogonal Transformation","authors":["Z Qiu, L Liu, A Weller, H Shi, W Liu - arXiv preprint arXiv:2603.05500, 2026"],"snippet":"… 2020), a widely-used, largescale corpus derived from Common Crawl. We benchmark our method against AdamW, the prevailing … This dataset is a large-scale, meticulously cleaned version of Common Crawl’s web corpus, first introduced for …","url":["https://arxiv.org/pdf/2603.05500"]} +{"year":"2026","title":"Practising responsibility: Ethics in NLP as a hands-on course","authors":["M Nissim, V Patti, B Savoldi - arXiv preprint arXiv:2512.24825, 2025"],"snippet":"As Natural Language Processing (NLP) systems become more pervasive, integrating ethical considerations into NLP education has become essential. However, this presents inherent challenges in curriculum development: the field's …","url":["https://arxiv.org/pdf/2512.24825"]} +{"year":"2026","title":"PREDICTING DISEASE WITH AI: CAN MACHINES FORESEE OUR GENETIC FUTURE?","authors":["E GUMUSOGLU-ACAR - and Organ Printing"],"snippet":"The mixture of genomics and artificial intelligence (AI) has carried biomedical research and healthcare one step further. 
The completion of the human genome through the Human Genome Project and its large collaborations, such as ENCODE …","url":["https://www.researchgate.net/profile/Seda-Keskin-3/publication/399434166_Regenerative_Medicine_Stem_Cells_Tissue_Engineering_And_Organ_Printing/links/695acf1006a9ab54f84e9fae/Regenerative-Medicine-Stem-Cells-Tissue-Engineering-And-Organ-Printing.pdf#page=270"]} +{"year":"2026","title":"Pretraining a Japanese-Only Large Language Model for Studying Second Language Acquisition","authors":["SMYJH Fei, CT Kodama, HKY Murawaki"],"snippet":"We introduce Dango, a decoder-only LLM pretrained to approximate an L1 Japanese speaker for studying L1–L2 transfer. Existing multilingual LLMs are trained on mixedlanguage data, which obscures sequential L1–L2 effects. We show …","url":["https://www.anlp.jp/proceedings/annual_meeting/2026/pdf_dir/C7-17.pdf"]} +{"year":"2026","title":"Prevent Social Robots from Eroding Trust?","authors":["R Hakli - Social Robots and Cultural Sustainability"],"snippet":"Already now, many different types of artificial systems play a role in our social lives. In the foreseeable future, we are likely to interact with a growing number of social robotics products in industry, transportation, healthcare, military, children's toys, and …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=GZ6lEQAAQBAJ&oi=fnd&pg=PA205&dq=commoncrawl&ots=TPXeWYpCSI&sig=FrqR1v7hvYmS1EJ2dKzuJYpFa2s"]} +{"year":"2026","title":"Prior Aware Memorization: An Efficient Metric for Distinguishing Memorization from Generalization in Large Language Models","authors":["T Tiwari, A Trachtenberg, GE Suh - arXiv preprint arXiv:2602.18733, 2026"],"snippet":"Training data leakage from Large Language Models (LLMs) raises serious concerns related to privacy, security, and copyright compliance. 
A central challenge in assessing this risk is distinguishing genuine memorization of training data from the …","url":["https://arxiv.org/pdf/2602.18733"]} +{"year":"2026","title":"Problems in High-Dimensional Estimation and Large Language Models","authors":["X Li - 2026"],"snippet":"This dissertation investigates critical problems at the intersection of high-dimensional statistics and the rapidly advancing field of large language models (LLMs), forging a narrative that bridges foundational theory with state-of-the-art applications. The work …","url":["https://dash.harvard.edu/bitstreams/f7d6f7fb-f324-4e41-85e0-776f86590696/download"]} +{"year":"2026","title":"Problems With Large Language Models for Learner Modelling: Why LLMs Alone Fall Short for Responsible Tutoring in K--12 Education","authors":["D Hooshyar, Y Yang, G Šíř, T Kärkkäinen… - arXiv preprint arXiv …, 2025"],"snippet":"… Web-based corpora such as Common Crawl or WebText [23] may embed representational imbalances, toxic content, and stereotypes [24, 25], while pre-processing steps are often constrained by subjective judgments [17]. During pre-training, LLMs …","url":["https://arxiv.org/pdf/2512.23036"]} +{"year":"2026","title":"Progressive Adaptation of Large Language Models for Multilingual Text Ranking","authors":["L Zhang, Y Zhang, D Long, P Xie, M Zhang, J Li… - ACM Transactions on …, 2026"],"snippet":"Despite increasing research attention to text ranking, most studies focus on monolingual scenarios, with a particular emphasis on English-language contexts. 
This narrow focus limits the applicability of ranking models in cross-lingual contexts …","url":["https://dl.acm.org/doi/pdf/10.1145/3788859"]} +{"year":"2026","title":"Prompt Engineering Taxonomy for AI-Enhanced Construction Safety Analysis: Identifying and Categorizing Fall from Height Accidents","authors":["U Ray, S Chang, JW Park"],"snippet":"Prompt engineering emerges as an innovative computational methodology for advancing AI-driven sustainable technologies. This study introduces a novel taxonomical prompt engineering framework utilizing advanced natural language …","url":["https://www.researchgate.net/profile/Unmesa-Ray/publication/400513975_Prompt_Engineering_Taxonomy_for_AI-Enhanced_Construction_Safety_Analysis_Identifying_and_Categorizing_Fall_from_Height_Accidents/links/698cee1e12f837212a19c1f1/Prompt-Engineering-Taxonomy-for-AI-Enhanced-Construction-Safety-Analysis-Identifying-and-Categorizing-Fall-from-Height-Accidents.pdf"]} +{"year":"2026","title":"propella-1: Multi-Property Document Annotation for LLM Data Curation at Scale","authors":["M Idahl, B Droste, B Plüster, JP Harries - arXiv preprint arXiv:2602.12414, 2026"],"snippet":"Since FineWeb-Edu, data curation for LLM pretraining has predominantly relied on single scalar quality scores produced by small classifiers. A single score conflates multiple quality dimensions, prevents flexible filtering, and offers no interpretability …","url":["https://arxiv.org/pdf/2602.12414"]} +{"year":"2026","title":"QianfanHuijin Technical Report: A Novel Multi-Stage Training Paradigm for Finance Industrial LLMs","authors":["S Li, W Lu, L Liu, C Lin, S Li, Z Tan, H Zhong, Y Zeng… - arXiv preprint arXiv …, 2025"],"snippet":"Domain-specific enhancement of Large Language Models (LLMs) within the financial context has long been a focal point of industrial application. 
While previous models such as BloombergGPT and Baichuan-Finance primarily focused on …","url":["https://arxiv.org/pdf/2512.24314"]} +{"year":"2026","title":"QQ: A Toolkit for Language Identifiers and Metadata","authors":["W Poelman, Y Chen, M de Lhoneux - arXiv preprint arXiv:2603.00620, 2026"],"snippet":"The growing number of languages considered in multilingual NLP, including new datasets and tasks, poses challenges regarding properly and accurately reporting which languages are used and how. For example, datasets often use different …","url":["https://arxiv.org/pdf/2603.00620"]} +{"year":"2026","title":"Quantifying the Gaps: A Systematic Taxonomy of Bias and Imbalance in 96 Multilingual AI Benchmarks & Datasets","authors":["S JAJEE, T SHAW, V SONI - 2026"],"snippet":"… The data is extracted from CommonCrawl and the Internet Archive, and the authors detail their open-source-based methods for data acquisition and processing. The HPLT resources are among the largest open text corpora ever released and …","url":["https://www.researchgate.net/profile/Sankalp-Jajee/publication/399995100_Quantifying_the_Gaps_A_Systematic_Taxonomy_of_Bias_and_Imbalance_in_96_Multilingual_AI_Benchmarks_Datasets/links/69726a71ac604d40d0e50a42/Quantifying-the-Gaps-A-Systematic-Taxonomy-of-Bias-and-Imbalance-in-96-Multilingual-AI-Benchmarks-Datasets.pdf"]} +{"year":"2026","title":"Qwen3-Coder-Next Technical Report","authors":["R Cao, M Chen, J Chen, Z Cui, Y Feng, B Hui, Y Jing… - arXiv preprint arXiv …, 2026"],"snippet":"… To improve single-turn QA capability, we use Common Crawl documents as seed data and prompt Qwen3-Coder-480B-A35B-Instruct to generate multiple grounded question–answer pairs per document. 
Generated questions must be self-contained …","url":["https://arxiv.org/pdf/2603.00729"]} +{"year":"2026","title":"Racka: Efficient Hungarian LLM Adaptation on Academic Infrastructure","authors":["Z Csibi, BG Gortka, N Gyöngyössy, K Nagy… - arXiv preprint arXiv …, 2026"],"snippet":"… The primary source for web data across all languages was the Common Crawl repository, with all data sourced from crawls conducted … 70B Hungarian tokens were collected from various sources including Common Crawl, academic …","url":["https://arxiv.org/pdf/2601.01244"]} +{"year":"2026","title":"RAIRS: Optimizing Redundant Assignment and List Layout for IVF-Based ANN Search","authors":["Z Yang, S Chen - arXiv preprint arXiv:2601.07183, 2026"],"snippet":"… OpenAI: This OpenAI embedding data set is generated from an open sourced C4 data set from the Common Crawl data. We download it using the command line tool of VectorDBBench [69] with L2 distance as the metric type. • T2I [8]: The Yandex Text-to-Image …","url":["https://arxiv.org/pdf/2601.07183"]} +{"year":"2026","title":"Raising Bars, Not Parameters: LilMoo Compact Language Model for Hindi","authors":["S Fatimah, A Sen, S Falk, F Mai, L Flek, NK Corrêa - arXiv preprint arXiv:2603.03508, 2026"],"snippet":"… To broaden topical diversity and ensure inclusion of recent material, we supplement these sources with newly released Common Crawl snapshots. All collected data are then processed through a unified multi-stage pipeline designed to …","url":["https://arxiv.org/pdf/2603.03508"]} +{"year":"2026","title":"Real Time Phishing Site Detection Using Transformer Ensemble Models","authors":["S Ohmshankar, S Sreevidhya, BB Sam, F Jermina… - 2025 4th International …, 2025"],"snippet":"… dataset compiled from Phish Tank and Open Phish feeds combined with benign websites collected from Alexa and Common Crawl. The … , OpenPhish) and benign snapshots from large web listings and Common Crawl. 
Record collection dates to …","url":["https://ieeexplore.ieee.org/abstract/document/11330770/"]} +{"year":"2026","title":"Reducing Tokenization Premiums for Low-Resource Languages","authors":["G Churchill, S Skiena - arXiv preprint arXiv:2601.13328, 2026"],"snippet":"… T5 derived its vocabulary from the Common Crawl, which likely makes up a significant portion of the other three vocabularies in the cluster. The second cluster comprises BERT Multilingual (Google) and M2M-100 (Meta), both “multilingual” …","url":["https://arxiv.org/pdf/2601.13328"]} +{"year":"2026","title":"ReflectiveRAG: Rethinking adaptivity in retrieval-augmented generation","authors":["A Verma, S Gupta, S Pillai, P Sircar, D Gupta - 2026"],"snippet":"… Evaluated on WebQuestions, HotpotQA (distractor setting) and InternalQA with 50M Common Crawl distractors, ReflectiveRAG achieves substantial gains over strong baselines-including DeepRAG-improving EM by+ 2.7 pp and F1 by+ 2.5 pp …","url":["https://www.amazon.science/publications/reflectiverag-rethinking-adaptivity-in-retrieval-augmented-generation"]} +{"year":"2026","title":"Regulatory Spillovers, Supply-Chain Networks, and the Valuation Relevance of ESG Risk","authors":["MA Aruwaji - 2026"],"snippet":"… GDELT structures glo©al news content o©tained through large-scale we© crawling including archived via the Common Crawl initiative and provides structured metadata on themes entities and contextual attri©utes relevant to ESG-related events …","url":["https://www.researchsquare.com/article/rs-8829686/latest.pdf"]} +{"year":"2026","title":"Reimagining Efficient Agents: A Survey of Memory, Tool Learning, and Planning: VOL-II","authors":["SM Mamun - 2026"],"snippet":"This document constitutes the second volume of the comprehensive anatomical review of the seminal survey\" Toward Efficient Agents: A Survey of Memory, Tool Learning, and Planning\" by Yang et al.(2026). 
While Volume I established a …","url":["https://www.academia.edu/download/129035750/Authentic_AI_Protocols_Efficiency_Review.pdf"]} +{"year":"2026","title":"ReLE: A Scalable System and Structured Benchmark for Diagnosing Capability Anisotropy in Chinese LLMs","authors":["R Fang, J Li, W Chen, B Hu, YC Chen, X Tang, L Diao - arXiv preprint arXiv …, 2026"],"snippet":"… Therefore, in addition to strict 13-gram overlap checks against pre-training corpora (CommonCrawl, C4), we implement Embedding-based Semantic Deduplication. We encode all fresh samples using a specialized retrieval model (BGE-M3) …","url":["https://arxiv.org/pdf/2601.17399"]} +{"year":"2026","title":"RELog: Robust and Efficient Anomaly Detection Based on Complete Logs for Large-scale Systems","authors":["X Chai, X Feng, Z Lou, Y Sun, M Guizani - IEEE Transactions on Computers, 2026"],"snippet":"… Existing studies have compared log templates to sentences in natural language, using FastText or Bidirectional Encoder Representations from Transformers (BERT) [35] pretrained on common domain datasets such as Common Crawl Corpus, Wikipedia …","url":["https://ieeexplore.ieee.org/abstract/document/11352878/"]} +{"year":"2026","title":"Replaying pre-training data improves fine-tuning","authors":["S Kotha, P Liang - arXiv preprint arXiv:2603.04964, 2026"],"snippet":"To obtain a language model for a target domain (eg math), the current paradigm is to pre-train on a vast amount of generic web text and then fine-tune on the relatively limited amount of target data. Typically, generic data is only mixed in during fine-tuning …","url":["https://arxiv.org/pdf/2603.04964"]} +{"year":"2026","title":"RexBERT: Context Specialized Bidirectional Encoders for E-commerce","authors":["R Bajaj, A Garg - arXiv preprint arXiv:2602.04605, 2026"],"snippet":"… Our source is FineFineWeb, a 4.4 trillion token CommonCrawl-derived corpus organised into approximately fifty categories. 
Each entry consists of a text snippet and domain label. To isolate retail content we manually inspect domains and select …","url":["https://arxiv.org/pdf/2602.04605"]} +{"year":"2026","title":"Riding the Spider: A Network-Sampling Framework for Multi-Platform Data Collections","authors":["P Kessling, FV Münch"]} +{"year":"2026","title":"Robot Tutors or Peers? Evaluating Math Learning and Conformity with LLM-Powered Robots in Tanzanian Primary Schools","authors":["EP Rutatola, EC Ntahomvukye, K Stroeken… - Proceedings of the 21st …, 2026"],"snippet":"In the past decade, more than half of Tanzanian pupils have failed mathematics in the national Primary School Leaving Examinations (PSLEs), a problem often linked to large class sizes, limited resources, and a shortage of qualified teachers. Social …","url":["https://dl.acm.org/doi/abs/10.1145/3757279.3785594"]} +{"year":"2026","title":"RoutIR: Fast Serving of Retrieval Pipelines for Retrieval-Augmented Generation","authors":["E Yang, A Yates, D Lawrie, J Mayfield, T Adriaanse - arXiv preprint arXiv:2601.10644, 2026"],"snippet":"… To demonstrate the adaptability of RoutIR, we report effectiveness and efficiency using the TREC 2023 NeuCLIR MLIR task [22], which has 76 queries and about 10 million web documents in Chinese, Persian, and Russian extracted from …","url":["https://arxiv.org/pdf/2601.10644"]} +{"year":"2026","title":"Safe and secure use of AI in research projects","authors":["R Shigapov - 2025"],"snippet":"AI tools increasingly support all stages of research projects. At the same time, their use is constrained by ethical principles, research-integrity standards, and governance (legal and regulatory) requirements. 
This workshop introduces these …","url":["https://shigapov.github.io/safe_ai/build/safe_AI-cb520290b76910f6059cbdde9e33906c.pdf"]} +{"year":"2026","title":"SATELLITE BASED ASSESSMENT OF SOIL HEAVY METAL CONTAMINATION USING DEEP LEARNING AND SWARM INTELLIGENCE","authors":["NSK CHEETI, G ANURADHA, W WANDOKO… - Journal of Theoretical and …, 2026"],"snippet":"… We evaluate the system over three datasets: Common Crawl (web content), Medical Text Mining, and Amazon Product Reviews, and compare to traditional keyword-based search and TF‐IDF and Word2Vec‐based approaches. The …","url":["https://www.jatit.org/volumes/hundredfour3.php"]} +{"year":"2026","title":"ScaleSwap: A Scalable OS Swap System for All-Flash Swap Arrays","authors":["T Ahn, C Yu, S Lee, Y Son"],"snippet":"… instance, Common Crawl [29] has recently collected approximately 455 TB of web data over two weeks, amounting to an average of 32 TB … However, in our evaluation with the Common Crawl dataset (see Section 5.4), Spark preprocessing …","url":["https://www.usenix.org/system/files/fast26-ahn.pdf"]} +{"year":"2026","title":"Scaling Open Discrete Audio Foundation Models with Interleaved Semantic, Acoustic, and Text Tokens","authors":["P Manakul, WH Gan, M Bartelds, G Sun, W Held… - arXiv preprint arXiv …, 2026"],"snippet":"Current audio language models are predominantly text-first, either extending pre-trained text LLM backbones or relying on semantic-only audio tokens, limiting general audio modeling. This paper presents a systematic empirical study of native audio …","url":["https://arxiv.org/pdf/2602.16687"]} +{"year":"2026","title":"Scaling Reward Modeling without Human Supervision","authors":["J Fan, Y Li, Z Qi, D Zhang, K Brantley, SM Kakade… - arXiv preprint arXiv …, 2026"],"snippet":"Learning from feedback is an instrumental process for advancing the capabilities and safety of frontier models, yet its effectiveness is often constrained by cost and scalability. 
We present a pilot study that explores scaling reward models through …","url":["https://arxiv.org/pdf/2603.02225"]} +{"year":"2026","title":"SCHOLARLY CURATION","authors":["D Van Hulle - New Directions in Digital Textual Studies: Book History …, 2026"]} +{"year":"2026","title":"Scientific research methodology applied to artificial intelligence and data science: General approach","authors":["G Cruz, J Arimatea, C Tiza, M Maura, A Quiñones…"],"snippet":"The history of science has gone through various paradigms: from the empirical observation of natural phenomena and the theoretical formulation of laws to the computational simulation of complex systems. Today, we are immersed in what Jim …","url":["https://editorialmarcaribe.es/ark:/10951/isbn.9789915698434/ebook.pdf"]} +{"year":"2026","title":"ScrapeGraphAI-100k: A Large-Scale Dataset for LLM-Based Web Information Extraction","authors":["W Brach, F Zuppichini, M Vinciguerra, L Padoan - arXiv preprint arXiv:2602.15189, 2026"],"snippet":"The use of large language models for web information extraction is becoming increasingly fundamental to modern web information retrieval pipelines. However, existing datasets tend to be small, synthetic or text-only, failing to capture the …","url":["https://arxiv.org/pdf/2602.15189"]} +{"year":"2026","title":"SeDa: A Unified System for Dataset Discovery and Multi-Entity Augmented Semantic Exploration","authors":["K Ling, Z Qin, Y Zhu, H Zhang, H Yu, G Fan - arXiv preprint arXiv:2603.07502, 2026"],"snippet":"… Common Crawl Supplement. We further incorporate Common Crawl as a supplementary data source to capture long-tail datasets that are often absent from conventional repositories. 
Common Crawl provides large-scale, monthly web crawl …","url":["https://arxiv.org/pdf/2603.07502"]} +{"year":"2026","title":"Selecting Language Models for Social Science: Start Small, Start Open, and Validate","authors":["DS Stoltz, MA Taylor, S Kumar - arXiv preprint arXiv:2601.10926, 2026"],"snippet":"… the Common Crawl—a massive collection of over 300 billion webpages that grows by between 3 to 5 billion sampled pages every month, and maintained by a 501(c)(3) non-profit organization of the same name.The Common Crawl … Notably …","url":["https://arxiv.org/pdf/2601.10926"]} +{"year":"2026","title":"Sentence Embedding Using Multimodal Approach: Combining FastText with AraBERT for Arabic Text Representation","authors":["H Almayyali, A Aliwy - F1000Research, 2026"],"snippet":"… It was trained according to a Common Crawl dataset. This model is intended to create word embeddings that are as accurate as possible to represent the semantics of Arabic language words, and it can be broadly applied to most natural language …","url":["https://f1000research.com/articles/15-206"]} +{"year":"2026","title":"Simplicity Prevails: The Emergence of Generalizable AIGI Detection in Visual Foundation Models","authors":["Y Zhou, X He, K Lin, B Fan, F Ding, B Li - arXiv preprint arXiv:2602.01738, 2026"],"snippet":"… Through rigorous analysis of the Common Crawl index1, we observe an exponential explosion of generative content in web corpora starting from 2022. Modern VFMs, trained on this evolving data landscape, have inadvertently …","url":["https://arxiv.org/pdf/2602.01738"]} +{"year":"2026","title":"Simulating Epidemic Response and Communication using AI-powered NPCs in Virtual Reality","authors":["F Alam, S Sharma, PR Ovi, KSMT Hossain - Electronic Imaging, 2026"],"snippet":"This study introduces a simulation framework designed to examine epidemic communication and behavioral interventions utilizing AI-driven non-player characters (NPCs) within a 3D environment created in Unity. 
The framework rectifies …","url":["https://library.imaging.org/admin/apis/public/api/ist/website/downloadArticle/ei/38/13/ERVR-190"]} +{"year":"2026","title":"Simultaneous Speech-to-Speech Translation Without Aligned Data","authors":["T Labiausse, R Fabre, Y Estève, A Défossez… - arXiv preprint arXiv …, 2026"],"snippet":"Simultaneous speech translation requires translating source speech into a target language in real-time while handling non-monotonic word dependencies. Traditional approaches rely on supervised training with word-level aligned data …","url":["https://arxiv.org/pdf/2602.11072"]} +{"year":"2026","title":"Small Language Models: Architecture, Evolution, and the Future of Artificial Intelligence","authors":["AP Shah, MP Hosseini, SM Park, C Miao, W Wei - 2026"],"snippet":"… Source Selection: Instead of indiscriminately scraping the web, SLM training starts with selecting high quality sources, such as filtered web pages (eg Common Crawl filtered by quality classifiers), academic papers (eg arXiv), books (eg Google …","url":["https://www.preprints.org/frontend/manuscript/c72aeffd44ea7a7eacf8b0b92a118b41/download_pub"]} +{"year":"2026","title":"Small LLMs for Medical NLP: a Systematic Analysis of Few-Shot, Constraint Decoding, Fine-Tuning and Continual Pre-Training in Italian","authors":["P Ferrazzi, M Franzin, A Lavelli, B Magnini - arXiv preprint arXiv:2602.17475, 2026"],"snippet":"Large Language Models (LLMs) consistently excel in diverse medical Natural Language Processing (NLP) tasks, yet their substantial computational requirements often limit deployment in real-world healthcare settings. 
In this work, we investigate …","url":["https://arxiv.org/pdf/2602.17475"]} +{"year":"2026","title":"Soft Contamination Means Benchmarks Test Shallow Generalization","authors":["A Spiesberger, JJ Vazquez, N Pochinkov, T Gavenčiak… - arXiv preprint arXiv …, 2026"],"snippet":"If LLM training data is polluted with benchmark test data, then benchmark performance gives biased estimates of out-of-distribution (OOD) generalization. Typical decontamination filters use n-gram matching which fail to detect semantic …","url":["https://arxiv.org/pdf/2602.12413"]} +{"year":"2026","title":"SplitGuard: A Resource Efficient Framework for Auditing Train and Eval Overlap and Near Duplicate Contamination in NLP Datasets","authors":["IP Karunanayaka - Authorea Preprints, 2026"],"snippet":"The integrity of Natural Language Processing (NLP) benchmarks is threatened by dataset contamination, where test examples leak into training data. While exact deduplication is standard, near duplicate leakage remains a pervasive issue. This …","url":["https://www.techrxiv.org/doi/pdf/10.36227/techrxiv.177205027.77074976"]} +{"year":"2026","title":"State-of-the-art Small Language Coder Model: Mify-Coder","authors":["A Parmar, A Panigrahi, AK Dwivedi, A Bhattacharya… - arXiv preprint arXiv …, 2025"],"snippet":"We present Mify-Coder, a 2.5B-parameter code model trained on 4.2T tokens using a compute-optimal strategy built on the Mify-2.5B foundation model. 
Mify-Coder achieves comparable accuracy and safety while significantly outperforming much …","url":["https://arxiv.org/pdf/2512.23747"]} +{"year":"2026","title":"STELLA: Self-Reflective Terminology-Aware Framework for Building an Aerospace Information Retrieval Benchmark","authors":["B Kim - arXiv preprint arXiv:2601.03496, 2026"],"snippet":"Tasks in the aerospace industry heavily rely on searching and reusing large volumes of technical documents, yet there is no public information retrieval (IR) benchmark that reflects the terminologyand query-intent characteristics of this …","url":["https://arxiv.org/pdf/2601.03496"]} +{"year":"2026","title":"Step 3.5 Flash: Open Frontier-Level Intelligence with 11B Active Parameters","authors":["A Huang, A Li, A Kong, B Wang, B Jiao, B Dong… - arXiv preprint arXiv …, 2026"],"snippet":"We introduce Step 3.5 Flash, a sparse Mixture-of-Experts (MoE) model that bridges frontier-level agentic intelligence and computational efficiency. We focus on what matters most when building agents: sharp reasoning and fast, reliable execution …","url":["https://arxiv.org/pdf/2602.10604"]} +{"year":"2026","title":"STEP3-VL-10B Technical Report","authors":["A Huang, C Yao, C Han, F Wan, H Guo, H Lv, H Zhou… - arXiv preprint arXiv …, 2026"],"snippet":"We present STEP3-VL-10B, a lightweight open-source foundation model designed to redefine the trade-off between compact efficiency and frontier-level multimodal intelligence. 
STEP3-VL-10B is realized through two strategic shifts: first, a unified …","url":["https://arxiv.org/pdf/2601.09668"]} +{"year":"2026","title":"Stop Preaching and Start Practising Data Frugality for Responsible Development of AI","authors":["SN Wilson, GF Guðmundsdóttir, A Millard, R Selvan… - arXiv preprint arXiv …, 2026"],"snippet":"… Another notable study by Common Crawl and Tailpipe2 estimates that crawling five billion web pages costs about 326 kgCO2e, and that … The Common Crawl study reports that 2% of their emissions stem from embodied emissions and 98 …","url":["https://arxiv.org/pdf/2602.19789"]} +{"year":"2026","title":"Stop Testing Attacks, Start Diagnosing Defenses: The Four-Checkpoint Framework Reveals Where LLM Safety Breaks","authors":["H Dhabhi, K Thimmaraju - arXiv preprint arXiv:2602.09629, 2026"],"snippet":"… LLMs are trained on massive datasets sourced from the internet, including Common Crawl and similar web archives [6]. This data contains harmful content such as instructions for illegal activities, malicious code, and dangerous information …","url":["https://arxiv.org/pdf/2602.09629"]} +{"year":"2026","title":"Subword-Based Comparative Linguistics across 242 Languages Using Wikipedia Glottosets","authors":["I Chelombitko, M Hämäläinen, A Komissarov - arXiv preprint arXiv:2601.18791, 2026"],"snippet":"… First, the application of this framework to Common Crawl3 would test whether the patterns we observe generalize beyond Wikipedia’s controlled environment. 
Web-scraped data introduces additional noise but also broader lexical coverage, particularly for …","url":["https://arxiv.org/pdf/2601.18791"]} +{"year":"2026","title":"sui-1: Grounded and Verifiable Long-Form Summarization","authors":["B Droste, JP Harries, M Idahl, B Plüster - arXiv preprint arXiv:2601.08472, 2026"],"snippet":"Large language models frequently generate plausible but unfaithful summaries that users cannot verify against source text, a critical limitation in compliance-sensitive domains such as government and legal analysis. We present sui-1, a 24B parameter …","url":["https://arxiv.org/pdf/2601.08472"]} +{"year":"2026","title":"SUVA: A Probabilistic Framework for Auditing LLMs with an Application to Social Preferences","authors":["Y Leng, Y Yuan - Information Systems Research, 2026"],"snippet":"As organizations increasingly use large language models (LLMs) in delegated decision tasks, understanding and auditing their outputs is important for responsible deployment. However, despite LLMs’ widespread adoption, systematic tools for …","url":["https://pubsonline.informs.org/doi/abs/10.1287/isre.2024.0857"]} +{"year":"2026","title":"Synonym Extraction from Japanese Patent Documents Using Term Definition Sentences","authors":["K Marusaki, S Kawano, A Hentona, H Nonaka - Machine Learning with Applications, 2026"],"snippet":"Conducting prior patent searches before developing technologies and filing patent applications in companies or universities is essential for understanding technological trends among competitors and academic institutions, as well as for …","url":["https://www.sciencedirect.com/science/article/pii/S2666827026000137"]} +{"year":"2026","title":"TabiBERT: A Large-Scale ModernBERT Foundation Model and Unified Benchmarking Framework for Turkish","authors":["M Türker, AE Kızıloğlu, O Güngör, S Üsküdarlı - arXiv preprint arXiv:2512.23065, 2025"],"snippet":"… Schweter 2020) which is a cleaned version of the public web crawl data of Common Crawl. 
These models have been evaluated on various downstream tasks (such as part-of-speech tagging, named entity recognition, and question answering) where …","url":["https://arxiv.org/pdf/2512.23065"]} +{"year":"2026","title":"Testimole-Conversational: A 30-Billion-Word Italian Discussion Board Corpus (1996-2024) for Language Modeling and Sociolinguistic Research","authors":["M Rinaldi, R Varvara, V Patti - arXiv preprint arXiv:2602.14819, 2026"],"snippet":"We present \"Testimole-conversational\" a massive collection of discussion boards messages in the Italian language. The large size of the corpus, more than 30B word-tokens (1996-2024), renders it an ideal dataset for native Italian Large Language Models'pre-training. …","url":["https://arxiv.org/pdf/2602.14819"]} +{"year":"2026","title":"The Algorithmic Gaze: An Audit and Ethnography of the LAION-Aesthetics Predictor Model","authors":["J Taylor, W Agnew, M Sap, SE Fox, H Zhu - arXiv preprint arXiv:2601.09896, 2026"],"snippet":"… IMG HTML tags of websites archived by the Common Crawl, a US-based non-profit that has scraped publicly available websites since 2008 [4, 75]. Due to LAION researchers’ indiscriminate data collection practices, researchers found that LAION-5B …","url":["https://arxiv.org/pdf/2601.09896"]} +{"year":"2026","title":"The complexity of gender and language: Digitization of a physical board game deconstructing gender stereotypes","authors":["D Anastasiou, C Moll, M Gallais, L Johannsen… - Language and Law …, 2025"],"snippet":"In this paper, we present a physical board game, known as the Gender Game1 (GG), and detail how we enhanced it with digital elements. 
The goal of the GG is to deconstruct gender stereotypes by highlighting a scientific approach: the GG is …","url":["https://ojs.letras.up.pt/index.php/LLLD/article/download/14603/13759"]} +{"year":"2026","title":"The Design Space of Tri-Modal Masked Diffusion Models","authors":["L Bethune, V Turrisi, BK Mlodozeniec, PR Lopez… - arXiv preprint arXiv …, 2026"],"snippet":"Discrete diffusion models have emerged as strong alternatives to autoregressive language models, with recent work initializing and fine-tuning a base unimodal model for bimodal generation. Diverging from previous approaches, we introduce …","url":["https://arxiv.org/pdf/2602.21472"]} +{"year":"2026","title":"The Effect of Multimodal Conversational AI on Job Interview Anxiety and Performance Among ESL Students","authors":["AH AlKhayat - 2026"],"snippet":"Speaking a second language is often considered one of the most anxiety-inducing skills to acquire. For many ESL learners, limited vocabulary, fear of judgement, and lack of sufficient practice in a crowded classroom can impede their speaking …","url":["https://search.proquest.com/openview/4b1bf5311f3fe67d893d5388c30e7431/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"The Emergence of a New Computational Paradigm in Natural Language Processing: A Review of Architectures, Adaptation, and Applications of Large Language …","authors":["SN Umar, AI Abdullahi, MA Rabiu, AR Umar - 2026"],"snippet":"Large Language Models have become a transformational element in Natural Language Processing because they introduce new approach for understanding and generating languages. 
This paper is a formal review of the development of Large …","url":["https://www.researchgate.net/profile/Surajo-Nuhu-Umar-2/publication/400788449_The_Emergence_of_a_New_Computational_Paradigm_in_Natural_Language_Processing_A_Review_of_Architectures_Adaptation_and_Applications_of_Large_Language_Models/links/69908bd312f837212a1ad4c7/The-Emergence-of-a-New-Computational-Paradigm-in-Natural-Language-Processing-A-Review-of-Architectures-Adaptation-and-Applications-of-Large-Language-Models.pdf"]} +{"year":"2026","title":"The Entropy of Recursion: A Strategic Framework for Provenance-Verified Data and the Preservation of Signal Integrity in Generative Systems","authors":["T MacPherson - Metavolve Labs, 2025"],"snippet":"The rapid scaling of generative models has precipitated a data integrity challenge characterized by “Model Collapse”—a degenerative process where recursive training on synthetic output drastically reduces statistical variance and induces …","url":["https://iaeternum.ai/entropy_of_recursion_final.pdf"]} +{"year":"2026","title":"The Evolution of the Brain From Analog to Digital: The Birth of Artificial Intelligence","authors":["GR Nahass, PK Patel - Journal of Craniofacial Surgery, 2026"],"snippet":"Recent advances in artificial intelligence (AI) have accelerated the development of systems capable of processing language, images, and multimodal clinical data with unprecedented scale. Among these, transformer-based large language models (LLMs) …","url":["https://journals.lww.com/jcraniofacialsurgery/fulltext/2026/03000/the_evolution_of_the_brain_from_analog_to_digital_.92.aspx"]} +{"year":"2026","title":"The Extrapolation Horizon: Delineating the Boundaries Between Narrow Optimisation and General Intelligence","authors":["P Majumdar - 2026"],"snippet":"This analysis distinguishes between the narrow optimisation of current AI and the extrapolation needed for Artificial General Intelligence (AGI). 
Today, AI mainly consists of Artificial Narrow Intelligence (ANI), which is good at pattern recognition …","url":["https://www.researchgate.net/profile/Partha-Majumdar-4/publication/400561619_The_Extrapolation_Horizon_Delineating_the_Boundaries_Between_Narrow_Optimisation_and_General_Intelligence/links/6987cd595d60ab48356c0028/The-Extrapolation-Horizon-Delineating-the-Boundaries-Between-Narrow-Optimisation-and-General-Intelligence.pdf"]} +{"year":"2026","title":"The Growing Gains and Pains of Iterative Web Corpora Crawling: Insights from South Slavic CLASSLA-web 2.0 Corpora","authors":["TK Pungeršek, P Rupnik, V Suchomel, N Ljubešić - arXiv preprint arXiv:2601.11170, 2026"],"snippet":"Crawling national top-level domains has proven to be highly effective for collecting texts in less-resourced languages. This approach has been recently used for South Slavic languages and resulted in the largest general corpora for this language group …","url":["https://arxiv.org/pdf/2601.11170"]} +{"year":"2026","title":"The Illusion of Consensus in Human-Centered Interactive AI","authors":["VB Lux - The 3rd InterAI Workshop: Interactive AI for Human …"],"snippet":"As human-centered AI systems increasingly integrate multiple interactive components—such as perception, planning, language, and decision modules—users are often encouraged to interpret agreement across system outputs as a signal of …","url":["https://openreview.net/pdf?id=eJtBEBmYGB"]} +{"year":"2026","title":"The Illusion of Generalization: Re-examining Tabular Language Model Evaluation","authors":["A Gorla, R Puduppully - arXiv preprint arXiv:2602.04031, 2026"],"snippet":"Tabular Language Models (TLMs) have been claimed to achieve emergent generalization for tabular prediction. 
We conduct a systematic re-evaluation of Tabula-8B as a representative TLM, utilizing 165 datasets from the UniPredict …","url":["https://arxiv.org/pdf/2602.04031"]} +{"year":"2026","title":"The Impact of Fine-Tuning on Entity Resolution: An Experimental Evaluation","authors":["D Karapiperis, L Akritidis, P Bozanis - Knowledge-Based Systems, 2026"],"snippet":"Fine-tuning pre-trained language models has become the state-of-the-art approach for Entity Resolution (ER), but this has created a divide between two dominant architectures: fast-but-less-accurate bi-encoders and accurate-but-slow cross-encoders …","url":["https://www.sciencedirect.com/science/article/pii/S095070512600170X"]} +{"year":"2026","title":"THE IMPACT OF STATEMENT LENGTH ON FAKE NEWS DETECTION USING LARGE LANGUAGE MODELS","authors":["D SCHRIJEN"],"snippet":"The rise of social media as a primary news source has transformed the internet into a crucial source of information. This accessibility has led to the spread of fake news, which undermines public trust. Fake news spreads faster and exerts greater …","url":["http://arno.uvt.nl/show.cgi?fid=189794"]} +{"year":"2026","title":"The importance of morphology-aware subword tokenization for NLP tasks in Slovak language modeling","authors":["D Držík, J Kapusta - Expert Systems with Applications, 2026"],"snippet":"To effectively train large language models (LLMs) for morphologically rich and low-resource languages such as Slovak, high-quality tokenization is essential. 
Traditional approaches like Byte-Pair Encoding (BPE) overlook linguistic structure, often …","url":["https://www.sciencedirect.com/science/article/pii/S0957417426004057"]} +{"year":"2026","title":"The Large Model Family","authors":["L Lin, Y Liu - Multimodal Large Models: A New Paradigm of Artificial …, 2026"],"snippet":"… GPT-3 [61]: Released in May 2020, it utilized 175 billion parameters and 45 TB of CommonCrawl data for massive-scale learning—over 10 … and CommonCrawl data as low-quality data, train a simple logistic regression model to assess data …","url":["https://link.springer.com/chapter/10.1007/978-981-95-4929-0_1"]} +{"year":"2026","title":"The Many Roles of Intermediaries in Reusing Data","authors":["CL Borgman, P Groth"],"snippet":"… We offered the Common Crawl data set as an example of how generative AI presumes access to data at massive scale. Our commenters touched on implications of AI for data mining (Leonelli), benchmarking (boyd & Sarathy), repurposing (Treloar) …","url":["https://assets.pubpub.org/qr6qna9w/c17c3adb-622e-4f03-9f40-a0c90dbd1260.html"]} +{"year":"2026","title":"The Regulation That Cried Wolf: Generative AI Training Data and the Challenge of Lawful Scale","authors":["AR Martinez - Network L. Rev., 2025"],"snippet":"… 'Common Crawl, 'Common Crawl maintains a free, open repository of web crawl data that can be used by anyone' (Common Crawl), available at https://commoncrawl.org/. ' Stefan Baack, 'A Critical Analysis of the Largest Source for Generative AI Training …","url":["https://heinonline.org/hol-cgi-bin/get_pdf.cgi?handle=hein.journals/ntwklwrvw2025§ion=35"]} +{"year":"2026","title":"The Role of Epistemic Drift in Online Civic Discourse about Science","authors":["S Lewandowsky, D Garcia - Current Opinion in Psychology, 2026"],"snippet":"Our societies are experiencing an epistemic drift, that is a changing understanding of what it means to be “honest” and how to arrive at “truth”. 
This drift has increasingly replaced reliance on evidence and facts during truth-seeking with reliance on beliefs …","url":["https://www.sciencedirect.com/science/article/pii/S2352250X26000011"]} +{"year":"2026","title":"The Roots of Performance Disparity in Multilingual Language Models: Intrinsic Modeling Difficulty or Design Choices?","authors":["C Shani, Y Reif, N Roll, D Jurafsky, E Shutova - arXiv preprint arXiv:2601.07220, 2026"],"snippet":"Multilingual language models (LMs) promise broader NLP access, yet current systems deliver uneven performance across the world's languages. This survey examines why these gaps persist and whether they reflect intrinsic linguistic difficulty …","url":["https://arxiv.org/pdf/2601.07220"]} +{"year":"2026","title":"The Silent Spill: Measuring Sensitive Data Leaks Across Public URL Repositories","authors":["T Ramadan, AR Abdou, M Mannan, A Youssef - arXiv preprint arXiv:2602.21826, 2026"],"snippet":"… [9] trained a machine-learning classifier on a billion URLs from Common Crawl [2] using Curlie.org[3] categories to categorize URLs that belong to five sensitive categories: Ethnicity, Health, Political Beliefs, Religion, and Sexual Orientation …","url":["https://arxiv.org/pdf/2602.21826"]} +{"year":"2026","title":"The silicon gaze: A typology of biases and inequality in LLMs through the lens of place","authors":["FW Kerche, M Zook, M Graham - Platforms & Society, 2026"],"snippet":"This paper introduces the concept of the silicon gaze to explain how large language models (LLMs) reproduce and amplify long-standing spatial inequalities. 
Drawing on a 20.3-million-query audit of ChatGPT, we map systematic biases in the model's …","url":["https://journals.sagepub.com/doi/pdf/10.1177/29768624251408919"]} +{"year":"2026","title":"The Third Ambition: Artificial Intelligence and the Science of Human Behavior","authors":["WR Neuman, C Coleman - arXiv preprint arXiv:2603.07329, 2026"],"snippet":"Contemporary artificial intelligence research has been organized around two dominant ambitions: productivity, which treats AI systems as tools for accelerating work and economic output, and alignment, which focuses on ensuring that …","url":["https://arxiv.org/pdf/2603.07329"]} +{"year":"2026","title":"Time, memory retrieval, and speculative storytelling","authors":["D Ezzo - photographies, 2026"],"snippet":"… The most common of these models (SD 1.5, SD 2.1, SDXL) are trained on LAION-5B, a dataset compiled from billions of image-text pairs scraped from Common Crawl.Footnote 7 As a result, the visual and textual material that informs the model is drawn from the …","url":["https://www.tandfonline.com/doi/abs/10.1080/17540763.2025.2607751"]} +{"year":"2026","title":"TopS-Key: Semantic Keyphrase Extraction Using Contextual Principal Component Analysis and Fuzzy Technique for Order Preference by Similarity to Ideal Solution …","authors":["R Singh, G Jain - The European Journal on Artificial Intelligence"],"snippet":"In the age of rapidly expanding textual data, extracting meaningful insights poses a significant challenge. 
To address this, we introduce TopS-Key, an advanced framework for automatic keyphrase extraction that integrates natural language …","url":["https://journals.sagepub.com/doi/abs/10.1177/30504554251395957"]} +{"year":"2026","title":"topSEARCH: a Comprehensive Tool for the Retrieval and Analysis of Multi-Type Online Resources","authors":["A Cejudo, Y Tellechea, T Garcia-Navarro, A Calvo…"],"snippet":"The internet is filled with diverse content types, such as videos, news articles, podcasts, and mobile apps, spread across various platforms and requiring significant time and effort to gather and evaluate. We propose a novel methodology …","url":["https://www.researchgate.net/profile/Ander-Cejudo/publication/399937033_topSEARCH_a_Comprehensive_Tool_for_the_Retrieval_and_Analysis_of_Multi-Type_Online_Resources/links/6974b0a2f5b9fd48849bb1c1/topSEARCH-a-Comprehensive-Tool-for-the-Retrieval-and-Analysis-of-Multi-Type-Online-Resources.pdf"]} +{"year":"2026","title":"Toward Compliance Zero: AI and the Vanishing Costs of Regulatory Compliance","authors":["P Ohm - Network L. Rev., 2025"],"snippet":"Thanks to recent advances in artificial intelligence, we can now automate tasks that we have never before been able to automate.'Work that was expensive has become cheap; 2 tasks that oncerequiredalot of time cannowbe accomplishednearly …","url":["https://heinonline.org/hol-cgi-bin/get_pdf.cgi?handle=hein.journals/ntwklwrvw2025§ion=33"]} +{"year":"2026","title":"Toward Cross-Lingual Quality Classifiers for Multilingual Pretraining Data Selection","authors":["Y Turki, V Sabolčec, B Messmer, M Jaggi"],"snippet":"As Large Language Models (LLMs) scale, data curation has shifted from maximizing volume to optimizing the signal-to-noise ratio by performing quality filtering. 
However, for many languages, native high-quality data is insufficient to train robust quality …","url":["https://openreview.net/pdf?id=b5y9sVqyZx"]} +{"year":"2026","title":"Toward Evaluating Model Collapse in LLMs: Insights from Continual Pretraining","authors":["K Minchev, A Alexandrov, M Vechev, N Konstantinov"],"snippet":"… 9.6B tokens) of highquality mathematical educational content filtered from the CommonCrawl Crawl (2008). This dataset has been filtered using a custom model trained to provide scores from 0 to 5, and filtering out all documents from the …","url":["https://openreview.net/pdf?id=c1yJSRahCt"]} +{"year":"2026","title":"Toward Traditional Chinese ModernBERT: A Preliminary Study","authors":["YE Chen, QY He, KY Chen - Proceedings of the 37th Conference on Computational …, 2025"],"snippet":"… 其 資料主要源自於對 Common Crawl 存檔 的大規模處理與精煉.該資料集涵蓋了從 2013 年到 2024 年 4 月的 96 個 Common Crawl 數據快照,並透過一個包含過濾, 去重 和語言辨識的複雜流程進行處理,旨 在為開發能夠理解和生成多種語言文字的 大型語言 …","url":["https://aclanthology.org/2025.rocling-main.16.pdf"]} +{"year":"2026","title":"Towards a Comprehensive Characterization of Semantic Grounding","authors":["LA Ciaccio, MT Weißgerber, F Pulvermüller"],"snippet":"A large body of research has shown that semantic representations are formed through direct experience and interaction with the outer world. A crucial open question is, however, what aspects of human experience are particularly relevant for …","url":["https://osf.io/download/y3jvm/"]} +{"year":"2026","title":"Towards Interpretable and Robust ML Systems","authors":["S Verma - 2025"],"snippet":"Recent advancements in ML have taken strides in enabling models to accomplish unprecedented tasks, starting from the bare minimum binary classification for loan applications to intrinsically complex self-driving. 
As the models have become better …","url":["https://search.proquest.com/openview/bc2338893acc1238e185f4b9fdd75110/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"Towards Safer Social Media: Multimodal Hate Speech Detection in Memes across Diverse Indian Languages","authors":["R Nagaraju, HL Shashirekha - 2025"],"snippet":"… ∗ XLM-Roberta - is a multilingual variant of RoBERTa trained on 2.5 TB of filtered CommonCrawl data across 100 languages. It improves upon mBERT by leveraging more data, longer training, and dynamic masking, achieving state-of-the-art results …","url":["https://ceur-ws.org/Vol-4173/T9-15.pdf"]} +{"year":"2026","title":"Towards Scalable Improvement of Large Language Models: Task, Feedback, and Learning","authors":["W Yuan - 2026"],"snippet":"… WebInstruct recalls relevant documents from Common Crawl using a fastText model trained on a diverse seed dataset of quiz websites. It then extracts question-answer pairs contained in recalled web pages and uses LLMs (Qwen-72B [Bai et al. 2023a] …","url":["https://search.proquest.com/openview/6f4d60dde798dc3bfc0c3b5a7c805e57/1?pq-origsite=gscholar&cbl=18750&diss=y"]} +{"year":"2026","title":"TRACE: A Multi-dimensional Framework for Performance-cost Evaluation of AI Governance Strategies in Large Language Models.","authors":["N Kumar, D Gupta - International Journal of Intelligent Engineering & …, 2026"],"snippet":"… Researchers can replicate this setup using any domain-specific corpus of similar scale and composition, such as corporate documentation from publicly available sources or curated subsets of Common Crawl. 
…","url":["https://search.ebscohost.com/login.aspx?direct=true&profile=ehost&scope=site&authtype=crawler&jrnl=2185310X&AN=191649667&h=HWnCtQqPQa8d9WN1ltVhvQ2ylEkGgjLXgl2wQh5ssDiCto%2BJworpg5st0jA3B4Oa%2FBByJqmAI3d8T3PVYsv%2B1w%3D%3D&crl=c"]} +{"year":"2026","title":"Traces of Social Competence in Large Language Models","authors":["T Kouwenhoven, M van der Meer, M van Duijn - arXiv preprint arXiv:2603.04161, 2026"],"snippet":"… However, two sources potentially contain the dataset, as they are constructed by scraping the web freely (CommonCrawl) … CommonCrawl We can check the index to see whether any known links to the FB data have been included in the …","url":["https://arxiv.org/pdf/2603.04161"]} +{"year":"2026","title":"Tracing Moral Foundations in Large Language Models","authors":["C Yu, B Yi, F Karimi-Malekabadi, S Abdurahman, J Ye… - arXiv preprint arXiv …, 2026"],"snippet":"Large language models (LLMs) often produce human-like moral judgments, but it is unclear whether this reflects an internal conceptual structure or superficial ``moral mimicry.'' Using Moral Foundations Theory (MFT) as an analytic framework, we study …","url":["https://arxiv.org/pdf/2601.05437"]} +{"year":"2026","title":"Training Methods for Large Language Models: Current Approaches and Challenges","authors":["D Karydas, D Margaritis, HC Leligou - Technologies, 2026"],"snippet":"… The largest portion typically originates from Common Crawl, a massive but noisy web dataset that requires extensive preprocessing and filtering to be usable at scale [4]. 
Additional high-quality sources such as Wikipedia and large book corpora are …","url":["https://www.mdpi.com/2227-7080/14/2/133"]} +{"year":"2026","title":"Transfer Learning for Cross-Domain Sentiment Classification via Domain Adaptation","authors":["RK Singh, A Saini, E Cambria, A Kumar - IEEE Transactions on Computational Social …, 2025"],"snippet":"… 1) Static Embeddings: We utilize static GloVe word embeddings with 300 dimensions, which are pretrained using the Common Crawl corpus that has 840 billion tokens. All of these vectors are frozen while training so that their general …","url":["https://ieeexplore.ieee.org/iel8/6570650/6780646/11263899.pdf"]} +{"year":"2026","title":"Transformer Architectures and the Self-Attention Mechanism for Low-Resource African Languages: A Survey of Approaches, Benchmarks, and Open Challenges","authors":["NTD Wilfried - 2026"],"snippet":"… This structural exclusion propagates directly into language technology: a model trained on Common Crawl inherits the linguistic biases of … For multilingual NLP, this was extended to mBERT (104 languages) and XLMR [8], pretrained on 2.4 TB of …","url":["https://www.researchgate.net/profile/Daniel-Wilfried-Ngankeu-Takou/publication/401679209_Transformers_for_Low-Resource_African_Languages_Transformer_Architectures_and_the_Self-Attention_Mechanism_for_Low-Resource_African_Languages_A_Survey_of_Approaches_Benchmarks_and_Open_Challenges/links/69add40ea91b826e43483e0f/Transformers-for-Low-Resource-African-Languages-Transformer-Architectures-and-the-Self-Attention-Mechanism-for-Low-Resource-African-Languages-A-Survey-of-Approaches-Benchmarks-and-Open-Challenges.pdf"]} +{"year":"2026","title":"Trust and Social Robots: How can We Prevent Social Robots from Eroding Trust?","authors":["A Strasser - Social Robots and Cultural Sustainability","A Strasser - Social Robots and Cultural Sustainability, 2026"],"snippet":"There is no question that being able to judge the reliability of information sources plays 
a crucial role in knowledge acquisition. In acquiring knowledge, one must be able to evaluate the sources of the information used. Much of what people know they …","url":["https://link.springer.com/chapter/10.1007/978-3-031-99290-2_9","https://link.springer.com/content/pdf/10.1007/978-3-031-99290-2.pdf#page=213"]} +{"year":"2026","title":"Trustworthiness of large language models: hallucinations","authors":["N Brunello - Challenges and Applications of Generative Large …, 2026"],"snippet":"This chapter offers an expansive view of the hallucination phenomenon in large language models (LLMs), aiming to provide readers with a well-rounded understanding of both its complexity and implications. Hallucinations—instances …","url":["https://www.sciencedirect.com/science/chapter/edited-volume/pii/B9780443335921000073"]} +{"year":"2026","title":"Tucano 2 Cool: Better Open Source LLMs for Portuguese","authors":["NK Corrêa, A Sen, S Fatimah, S Falk, L Landgraf… - arXiv preprint arXiv …, 2026"],"snippet":"… For web-crawled data sourced from Common Crawl WARC files, we begin by extracting text content using the Trafilatura library (Barbaresi… able datasets originate from similar sources (eg, Common Crawl). Therefore, for these datasets …","url":["https://arxiv.org/pdf/2603.03543"]} +{"year":"2026","title":"UberWeb: Insights from Multilingual Curation for a 20-Trillion-Token Dataset","authors":["AG Carranza, K Mentzer, RP Monti, A Fang, A Deng… - arXiv preprint arXiv …, 2026"],"snippet":"Multilinguality is a core capability for modern foundation models, yet training high-quality multilingual models remains challenging due to uneven data availability across languages. 
A further challenge is the performance interference that can arise from …","url":["https://arxiv.org/pdf/2602.15210"]} +{"year":"2026","title":"UniGeM: Unifying Data Mixing and Selection via Geometric Exploration and Mining","authors":["C Wang, Y Yu, X Yao, J Yang, R Cantoro, C Li, Q Cui… - arXiv preprint arXiv …, 2026"],"snippet":"The scaling of Large Language Models (LLMs) is increasingly limited by data quality. Most methods handle data mixing and sample selection separately, which can break the structure in code corpora. We introduce \\textbf{UniGeM}, a framework that unifies …","url":["https://arxiv.org/pdf/2602.03772"]} +{"year":"2026","title":"Unravelling information on impactful geo-hydrological hazard events with HazMiner, a multilingual text mining method developed through a global scale coverage …","authors":["B Valkenborg, O Dewitte, B Smets - EGUsphere, 2026"],"snippet":"The incidence and impacts from geo-hydrological hazards (GH) such as floods, flash floods and landslides are changing globally due to anthropogenic environmental changes and increased exposure driven by population growth. Reliable datasets on …","url":["https://egusphere.copernicus.org/preprints/2026/egusphere-2026-722/egusphere-2026-722.pdf"]} +{"year":"2026","title":"Using Knowledge Graphs to Harvest","authors":["S Ging, S Walter, J Bratulić, J Dienert - … 2025, Freiburg, Germany, September 23–26 …, 2026"],"snippet":"… [10] further built the publicly available LAION-400M dataset by filtering HTML data from Common Crawl [11] based on the similarity estimated by the CLIP model. 
In a … [6] proposed DataComp, a filtering challenge containing up to 13B image-text pairs …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=d56lEQAAQBAJ&oi=fnd&pg=PA285&dq=commoncrawl&ots=1o2J5HM4bq&sig=9wCdPmcsw2H3x2DSQdXR83-pvMI"]} +{"year":"2026","title":"Verbālu vārdkopu lietojums sociālajos medijos: lokatīva un prievārda UZ konkurence The use of verbal phrases in social media: Competition between the locative and …","authors":["L Lauze - Valoda: nozīme un forma, 2025"],"snippet":"According to Diessel (1999a, 2), demonstratives fulfill certain syntactic, semantic, and pragmatic functions. The discussion of the syntactic properties of demonstratives is based on the argument that one has to distinguish between the …","url":["https://dspace.lu.lv/server/api/core/bitstreams/2ce7705d-b3d6-44ea-893a-af5cb6117863/content#page=107"]} +{"year":"2026","title":"ViFinClass: A Benchmark Dataset for Vietnamese Financial News Topic Classification","authors":["QL Nguyen - International Journal of Asian Language Processing, 2026"],"snippet":"We introduce ViFinClass, the first large-scale benchmark dataset for Vietnamese financial news topic classification. Collected from CafeF between 2006 and 2023, the dataset contains over 30,000 curated articles across five major financial topics …","url":["https://www.worldscientific.com/doi/abs/10.1142/S2717554526500037"]} +{"year":"2026","title":"Vision-Based Natural Language Scene Understanding for Autonomous Driving: An Extended Dataset and a New Model for Traffic Scene Description Generation","authors":["DS Zadeh, OA Basir, B Moshiri - arXiv preprint arXiv:2601.14438, 2026"],"snippet":"Traffic scene understanding is essential for enabling autonomous vehicles to accurately perceive and interpret their environment, thereby ensuring safe navigation. 
This paper presents a novel framework that transforms a single frontal-view camera …","url":["https://arxiv.org/pdf/2601.14438"]} +{"year":"2026","title":"ViSQA: A benchmark dataset and baseline models for Vietnamese spoken question answering","authors":["LT Minh, ND Thinh, NKT Loc, LV Quan, ND Tam… - PloS one, 2026"],"snippet":"Spoken Question Answering (SQA) extends machine reading comprehension to spoken content and requires models to handle both automatic speech recognition (ASR) errors and downstream language understanding. Although large-scale SQA …","url":["https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0340771"]} +{"year":"2026","title":"VulnBERTa-XAI: Towards Explainable AI for Automating CWE Weakness Assignment and Improving the Quality of Cybersecurity CVE","authors":["H Turtiainen, A Costin, T Hämäläinen - Cyber Security"],"snippet":"Vulnerability management is crucial for companies with compliance requirements and regulations. The goal is to allocate the most appropriate resources to address vulnerabilities efficiently. The growing number of vulnerabilities discovered by …","url":["https://link.springer.com/content/pdf/10.1007/978-3-032-08890-1.pdf#page=387"]} +{"year":"2026","title":"Watching the Well Run Dry: Digital Settler Colonialism","authors":["AI Generative - The Need to Rename Tech","H Jernigan - The Need to Rename Tech, 2026"],"snippet":"A man from the industry known as Big Water arrives in your neighbourhood one day. 
¹ He hands you a piece of paper stating Big Water has paid a ridiculously low fee to drill into an aquifer underneath your community, which includes Indigenous Peoples …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=0ojCEQAAQBAJ&oi=fnd&pg=PA115&dq=commoncrawl&ots=23i_EqJj9U&sig=0-9cw2vG5qPrqw8M8RmNvApeKTk","https://link.springer.com/content/pdf/10.1007/978-3-032-05155-4.pdf#page=122"]} +{"year":"2026","title":"Web as History","authors":["N Brügger, R Schroeder"],"snippet":"… archiveteam.org), or Common Crawl’s open repository of web crawl data (commoncrawl.org). Fourth, specific parts of the web of the past that had actually disappeared may have been meticulously restored and put online. This is the case for the project ‘Restoring …","url":["https://muse.jhu.edu/book/81951/pdf/download"]} +{"year":"2026","title":"WebFAQ 2.0: A Multilingual QA Dataset with Mined Hard Negatives for Dense Retrieval","authors":["M Dinzinger, L Caspari, A Salman, I Topi, J Mitrović… - arXiv preprint arXiv …, 2026"],"snippet":"… This expansion was enabled by a fundamentally revised data collection strategy: instead of relying solely on structured data extractions, we mined URLs that potentially contain structured FAQs directly from Common Crawl and use the OWLer …","url":["https://arxiv.org/pdf/2602.17327"]} +{"year":"2026","title":"What Determines Multilingual LLM Performance? Revisiting the Roles of Linguistic Distance and Resource Availability","authors":["Q Guo, R Sasano"],"snippet":"… Accordingly, we adopt Common Crawl language distribution statistics as an external proxy for language resource availability. Common Crawl publishes monthly language estimates based on sampled web pages rather than exhaustive counts. To …","url":["https://www.anlp.jp/proceedings/annual_meeting/2026/pdf_dir/B5-18.pdf"]} +{"year":"2026","title":"What Do LLMs Associate with Your Name? 
A Human-Centered Black-Box Audit of Personal Data","authors":["D Staufer, K Morehouse - arXiv preprint arXiv:2602.17483, 2026"],"snippet":"Large language models (LLMs), and conversational agents based on them, are exposed to personal data (PD) during pre-training and during user interactions. Prior work shows that PD can resurface, yet users lack insight into how strongly models …","url":["https://arxiv.org/pdf/2602.17483"]} +{"year":"2026","title":"Wikipedia-grounded Dataset for Question Answering, Verification, and Text Generation for Ukrainian","authors":["V Taranukha, O Marchenko, A Anisimov, E Nasirov - 2025"],"snippet":"… Its advantages are: strong cross-lingual performance, robust to morphology, trained on large CommonCrawl in 100 languages. The disadvantages consist of: 512-token limit, unigram segmentation differs from WordPiece, leading to different window …","url":["https://ceur-ws.org/Vol-4170/Paper11.pdf"]} +{"year":"2026","title":"Window-based Membership Inference Attacks Against Fine-tuned Large Language Models","authors":["Y Chen, Y Du, K Zhang, A Kundu, C Fleming, B Ribeiro… - arXiv preprint arXiv …, 2026"],"snippet":"Most membership inference attacks (MIAs) against Large Language Models (LLMs) rely on global signals, like average loss, to identify training data. 
This approach, however, dilutes the subtle, localized signals of memorization, reducing attack …","url":["https://arxiv.org/pdf/2601.02751"]} +{"year":"2026","title":"Word Embedding Association Test (WEAT)","authors":["C Chan - Target"],"snippet":"… GLoVe trained on Wikipedia, Common Crawl, and Gigaword fastText trained on Wikipedia and Common Crawl …","url":["https://methodshub.gesis.org/library/tutorials/weat/1/"]} +{"year":"2026","title":"World Gist: Implicit Urban Imaginaries in Foundation Models","authors":["D Negueruela del Castillo, I Neri, L Schaerf - Architecture in the AI Era for Research …, 2026"],"snippet":"… like CLIP and Stable Diffusion, harvested primarily through Common Crawl's systematic archiving of the internet, Footnote 1 constitute a … , and imagined in popular digital culture (mainstream or else, as common crawl seeks to capture …","url":["https://link.springer.com/chapter/10.1007/978-981-95-0760-3_6"]} +{"year":"2026","title":"World Gist: Implicit Urban Imaginaries","authors":["DN del Castillo, I Neri, L Schaerf - Architecture in the AI Era for Research, Practice …, 2026"],"snippet":"As artificial intelligence increasingly shapes architectural and planning practices, understanding how these systems interpret and represent urban environ-ments becomes crucial for critical practice. This research proposes a novel methodology to …","url":["https://books.google.de/books?hl=en&lr=lang_en&id=pd28EQAAQBAJ&oi=fnd&pg=PA63&dq=commoncrawl&ots=laZQ-E7Fv3&sig=oQ9CO6Ga2Sghb5p5ZFPisfSyK6c"]} +{"year":"2026","title":"WorldVQA: Measuring Atomic World Knowledge in Multimodal Large Language Models","authors":["R Zhou, Y Shao, H Lu, B Xing, T Bai, Y Chen, J Zhao… - arXiv preprint arXiv …, 2026"],"snippet":"We introduce WorldVQA, a benchmark designed to evaluate the atomic visual world knowledge of Multimodal Large Language Models (MLLMs). 
Unlike current evaluations, which often conflate visual knowledge retrieval with reasoning …","url":["https://arxiv.org/pdf/2602.02537"]} +{"year":"2026","title":"Zero-shot stance detection in practice: insights on training, prompting, and decoding with a capable lightweight LLM","authors":["R Aiyappa, S Senthilmani, J An, H Kwak, YY Ahn - PeerJ Computer Science, 2026"],"snippet":"… LLMs are pre-trained on a large corpus of web text collected by the Common Crawl Project (CCP) which gathers around 20TB of text data per month by crawling web pages (https://commoncrawl.org/). As FlanT5-XXL uses a slice of CCP, it is …","url":["https://peerj.com/articles/cs-3540/"]} +{"year":"2026","title":"Zone Scanning at a ccTLD: Detection and Analysis","authors":["P Huppert, M Müller, T Wabeke, C Hesselman…"],"snippet":"… A majority of names can be mined from Certificate Transparency (CT) logs and Common Crawl data. Not all domain names can be found this way, however, and new names are usually found with some delay [16], [17]. 
…","url":["https://www.hesselman.net/publicaties/NOMS26-Zone-Scanning.pdf"]} diff --git a/gscholar_alerts/parse_scholar_alert_eml.py b/gscholar_alerts/parse_scholar_alert_eml.py index ce91244..35d7ab9 100644 --- a/gscholar_alerts/parse_scholar_alert_eml.py +++ b/gscholar_alerts/parse_scholar_alert_eml.py @@ -2,10 +2,10 @@ """Extract citations from Google Scholar alter e-mails (in EML format)""" -import datetime +import argparse import email -import email.parser import json +import logging import os import re import urllib @@ -14,7 +14,14 @@ from html.parser import HTMLParser +LOGGING_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s' +LOG_LEVEL = 'INFO' +logging.basicConfig(level=LOG_LEVEL, format=LOGGING_FORMAT) + + class Citation: + """Represents one Google Scholar citation.""" + boilerplate_lines = { 'cancel alert', 'update alert to receive fewer, more relevant', @@ -40,6 +47,7 @@ def __init__(self, date, refs): self.url = set() self.link = set() self.idx = '' + self.idx_type = '' self.ref = set(refs) self.data = '' # temporary data self.end_of_input = False @@ -164,6 +172,7 @@ def json(self): class CitationsHTMLParser(HTMLParser): + """Parser for one Google Scholar alert message.""" def __init__ (self, date, author_ref, msg_ref): HTMLParser.__init__(self) self.in_title = False @@ -171,6 +180,7 @@ def __init__ (self, date, author_ref, msg_ref): self.end_of_citations = False self.citations = [] self.date = date + self.inline = False self.ref = [msg_ref, author_ref] def handle_starttag(self, tag, attrs): @@ -187,7 +197,7 @@ def handle_starttag(self, tag, attrs): for attr in attrs: if attr[0].lower() == 'href': self.citations[-1].add_link(attr[1]) - elif tag == 'br' or tag == '

': + elif tag in {'br', '

'}: if self.citations: self.citations[-1].add_line_break() else: @@ -211,7 +221,8 @@ def handle_data(self, data): self.citations[-1].add_title(data) elif self.in_script: pass - elif data_normalized.startswith("this message was sent by google scholar because you're following new results for"): + elif data_normalized.startswith( + "this message was sent by google scholar because you're following new results for"): self.end_of_citations = True else: self.citations[-1].add_data(data) @@ -228,8 +239,7 @@ def message_get_payload(msg): yield (msg.get_content_type(), payload) elif isinstance(payload, list): for sub in payload: - for pld in message_get_payload(sub): - yield pld + yield from message_get_payload(sub) elif isinstance(payload, bytes): yield (msg.get_content_type(), payload.decode('utf-8')) else: @@ -243,30 +253,67 @@ def parse_eml(eml_file): for (mime, body) in list(message_get_payload(msg)): parser = CitationsHTMLParser(date, msg['subject'], eml_file) parser.feed(body) - for citation in parser.citations: - yield citation - + yield from parser.citations + +def load_citations(citations_file): + """Load existing citations, extracted earlier by this script and cleaned up. 
+ Only the following fields are used: title, year, authors, snippet, url.""" + citations = {} + lines_read = 0 + with open(citations_file, 'r', encoding='UTF-8') as f: + for line in f: + lines_read += 1 + try: + d = json.loads(line) + # at least a title is required + if 'title' not in d: + continue + # create citation object + c = Citation(d.get('year'), []) + c.add_title(d['title']) + c.authors = {a.lower(): a for a in d.get('authors', [])} + c.snippet = d.get('snippet', '') + c.url = set(d.get('url', [])) + if c in citations: + citations[c].update(c) + else: + citations[c] = c + except json.JSONDecodeError: + logging.warning('Failed to parse line: %s', line.strip()) + logging.info('Read %d lines of citations file %s', lines_read, citations_file) + logging.info('Loaded %d existing citations', len(citations)) + return citations if __name__ == '__main__': - if len(sys.argv) <= 1: - sys.stderr.write(sys.argv[0] + ' ') - sys.exit(1) + parser = argparse.ArgumentParser( + description='Extract citations from Google Scholar alert e-mails (in EML format)') + parser.add_argument('eml_folder', type=str, + help='Folder containing EML files') + parser.add_argument('citations_file', type=str, nargs='?', + help='File with existing citations (JSON-lines format)') + args = parser.parse_args() - emls = os.listdir(sys.argv[1]) - sys.stderr.write('Found %d messages\n' % len(emls)) + emls = os.listdir(args.eml_folder) + logging.info('Found %d messages in eml folder', len(emls)) - citations = dict() + citations = {} + if args.citations_file: + logging.info('Loading existing citations from %s', args.citations_file) + citations = load_citations(args.citations_file) + n_citations_in_messages = 0 for eml in emls: for citation in parse_eml(os.path.join(sys.argv[1], eml)): + n_citations_in_messages += 1 if citation in citations: citations[citation].update(citation) else: citations[citation] = citation - sys.stderr.write('Found %d citations\n' % len(citations)) + logging.info('Found %d 
citations in messages', n_citations_in_messages) + logging.info('Number of citations after update: %d', len(citations)) for citation in citations: print(citation.json())