Skip to content

Commit d212090

Browse files
Complete 2018 citations
- improve count commands in Makefile - add related 2017 citations - fix snippets
1 parent 09a2de4 commit d212090

3 files changed

Lines changed: 854 additions & 38 deletions

File tree

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ tmp/commoncrawl_site_wp.csv: tmp/commoncrawl.bib
2929
cc-annotations:
3030
perl -lne '$$h{$$1}++ if /^\s*(cc(?:-[a-z_0-9]+)+)\s*=/; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
3131
cc-classes:
32-
perl -lne 'next unless s/^\s*cc-class\s*=\s*//; s/^["{]//; s/["}],?$$//; $$h{$$_}++ for split /,\s*/; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
33-
cc-derived-dataset-used:
34-
perl -lne 'next unless s/^\s*cc-derived-dataset-used\s*=\s*//; s/^["{]//; s/["}],?$$//; $$h{$$_}++ for split /,\s*/; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
32+
perl -lne 'if (s/^\s*cc-class\s*=\s*["{]// .. s/["}],?$$//) { $$classes .= $$_ } elsif (defined $$classes) { $$h{$$_}++ for split /,\s*/, $$classes; $$classes = undef; }; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
33+
cc-derived-datasets:
34+
perl -lne 'if (s/^\s*cc-derived-dataset-(?:used|cited|about)\s*=\s*["{]// .. s/["}],?$$//) { $$datasets .= $$_ } elsif (defined $$datasets) { $$h{$$_}++ for split /,\s*/, $$datasets; $$datasets = undef; } END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
3535
count:
3636
grep -c '^@' bib/*.bib | perl -aF':' -lne 'print join("\t", $$F[1], $$F[0], @F[2..$$#F])' | sort -k2,2

bib/cc2017.bib

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,25 @@ @Article{Schaefer:2017:boilerplate-detection
1515
acmid = "3135309",
1616
publisher = "Springer-Verlag New York, Inc.",
1717
address = "Secaucus, NJ, USA",
18-
abstract = "Removal of boilerplate is one of the essential tasks in web corpus construction and web indexing. Boilerplate (redundant and automatically inserted material like menus, copyright notices, navigational elements, etc.) is usually considered to be linguistically unattractive for inclusion in a web corpus. Also, search engines should not index such material because it can lead to spurious results for search terms if these terms appear in boilerplate regions of the web page. The size of large web corpora necessitates the use of efficient algorithms while a high accuracy directly improves the quality of the final corpus. In this paper, I present and evaluate a supervised machine learning approach to general-purpose boilerplate detection for languages based on Latin alphabets which is both very efficient and very accurate. Using a Multilayer Perceptron and a high number of carefully engineered features, I achieve between 95\% and 99\% correct classifications (depending on the input language) with precision and recall over 0.95. Since the perceptrons are trained on language-specific data, I also evaluate how well perceptrons trained on one language perform on other languages. The single features are also evaluated for the merit they contribute to the classification. I show that the accuracy of the Multilayer Perceptron is on a par with that of other classifiers such as Support Vector Machines. I conclude that the quality of general-purpose boilerplate detectors depends mainly on the availability of many well-engineered features and which are highly language-independent. The method has been implemented in the open-source texrex web page cleaning software, and large corpora constructed using it are available from the COW initiative, including the CommonCOW corpora created from CommonCrawl data sets.",
18+
abstract = "Removal of boilerplate is one of the essential tasks in web corpus construction and web indexing.
19+
Boilerplate (redundant and automatically inserted material like menus, copyright notices, navigational
20+
elements, etc.) is usually considered to be linguistically unattractive for inclusion in a web corpus.
21+
Also, search engines should not index such material because it can lead to spurious results for search
22+
terms if these terms appear in boilerplate regions of the web page. The size of large web corpora
23+
necessitates the use of efficient algorithms while a high accuracy directly improves the quality of the
24+
final corpus. In this paper, I present and evaluate a supervised machine learning approach to
25+
general-purpose boilerplate detection for languages based on Latin alphabets which is both very
26+
efficient and very accurate. Using a Multilayer Perceptron and a high number of carefully engineered
27+
features, I achieve between 95\% and 99\% correct classifications (depending on the input language)
28+
with precision and recall over 0.95. Since the perceptrons are trained on language-specific data, I
29+
also evaluate how well perceptrons trained on one language perform on other languages. The single
30+
features are also evaluated for the merit they contribute to the classification. I show that the
31+
accuracy of the Multilayer Perceptron is on a par with that of other classifiers such as Support Vector
32+
Machines. I conclude that the quality of general-purpose boilerplate detectors depends mainly on the
33+
availability of many well-engineered features and which are highly language-independent. The method has
34+
been implemented in the open-source texrex web page cleaning software, and large corpora constructed
35+
using it are available from the COW initiative, including the CommonCOW corpora created from
36+
CommonCrawl data sets.",
1937
keywords = "Boilerplate, Corpus construction, Non-destructive corpus normalization, Web corpora",
2038
cc-author-affiliation = "Freie Universität Berlin, Germany",
2139
cc-class = "nlp/boilerplate-removal, web-corpora, corpus-construction",
@@ -212,7 +230,6 @@ @InProceedings{cc:MehmoodShafiqWaheed:2017:regional-context-www
212230
cc-class = "web-science, webometrics",
213231
}
214232

215-
216233
@Article{cc:PanchenkoEtAl:2017:web-scale-dependency-corpus,
217234
author = "Alexander Panchenko and Eugen Ruppert and Stefano Faralli and Simone Paolo Ponzetto and Chris
218235
Biemann",
@@ -228,3 +245,13 @@ @Article{cc:PanchenkoEtAl:2017:web-scale-dependency-corpus
228245
cc-class = "nlp/dependency-parsing, corpus-construction",
229246
}
230247

248+
@Article{cc:KaleTaulaHewavitharanaSrivastava:2017:semantic-query-segmentation,
249+
title = "Towards semantic query segmentation",
250+
author = "Kale, Ajinkya and Taula, Thrivikrama and Hewavitharana, Sanjika and Srivastava, Amit",
251+
journal = "arXiv preprint arXiv:1707.07835",
252+
year = "2017",
253+
URL = "https://arxiv.org/abs/1707.07835",
254+
cc-author-affiliation = "eBay Inc.",
255+
cc-derived-dataset-cited = "GloVe-word-embeddings",
256+
cc-class = "ir/query-segmentation, nlp/word-embeddings, patent",
257+
}

0 commit comments

Comments
 (0)