Complete 2018 citations

sebastian-nagel · sebastian-nagel · commit d2120901bd08 · 2019-07-08T15:28:21.000+02:00
- improve count commands in Makefile
- add related 2017 citations
- fix snippets
diff --git a/Makefile b/Makefile
@@ -29,8 +29,8 @@ tmp/commoncrawl_site_wp.csv: tmp/commoncrawl.bib
 cc-annotations:
 	perl -lne '$$h{$$1}++ if /^\s*(cc(?:-[a-z_0-9]+)+)\s*=/; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
 cc-classes:
-	perl -lne 'next unless s/^\s*cc-class\s*=\s*//; s/^["{]//; s/["}],?$$//; $$h{$$_}++ for split /,\s*/; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
-cc-derived-dataset-used:
-	perl -lne 'next unless s/^\s*cc-derived-dataset-used\s*=\s*//; s/^["{]//; s/["}],?$$//; $$h{$$_}++ for split /,\s*/; END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
+	perl -lne 'if (my $$r = (s/^\s*cc-class\s*=\s*["{]// .. s/["}],?$$//)) { $$classes .= $$_; if ($$r =~ /E0$$/) { $$h{$$_}++ for split /,\s*/, $$classes; $$classes = ""; } } END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
+cc-derived-datasets:
+	perl -lne 'if (my $$r = (s/^\s*cc-derived-dataset-(?:used|cited|about)\s*=\s*["{]// .. s/["}],?$$//)) { $$datasets .= $$_; if ($$r =~ /E0$$/) { $$h{$$_}++ for split /,\s*/, $$datasets; $$datasets = ""; } } END {print $$v, "\t", $$k while (($$k,$$v)=each %h)}' bib/*.bib | sort -k1,1nr
 count:
 	grep -c '^@' bib/*.bib | perl -aF':' -lne 'print join("\t", $$F[1], $$F[0], @F[2..$$#F])' | sort -k2,2
diff --git a/bib/cc2017.bib b/bib/cc2017.bib
@@ -15,7 +15,25 @@ @Article{Schaefer:2017:boilerplate-detection
   acmid        = "3135309",
   publisher    = "Springer-Verlag New York, Inc.",
   address      = "Secaucus, NJ, USA",
-  abstract     = "Removal of boilerplate is one of the essential tasks in web corpus construction and web indexing. Boilerplate (redundant and automatically inserted material like menus, copyright notices, navigational elements, etc.) is usually considered to be linguistically unattractive for inclusion in a web corpus. Also, search engines should not index such material because it can lead to spurious results for search terms if these terms appear in boilerplate regions of the web page. The size of large web corpora necessitates the use of efficient algorithms while a high accuracy directly improves the quality of the final corpus. In this paper, I present and evaluate a supervised machine learning approach to general-purpose boilerplate detection for languages based on Latin alphabets which is both very efficient and very accurate. Using a Multilayer Perceptron and a high number of carefully engineered features, I achieve between 95\% and 99\% correct classifications (depending on the input language) with precision and recall over 0.95. Since the perceptrons are trained on language-specific data, I also evaluate how well perceptrons trained on one language perform on other languages. The single features are also evaluated for the merit they contribute to the classification. I show that the accuracy of the Multilayer Perceptron is on a par with that of other classifiers such as Support Vector Machines. I conclude that the quality of general-purpose boilerplate detectors depends mainly on the availability of many well-engineered features and which are highly language-independent. The method has been implemented in the open-source texrex web page cleaning software, and large corpora constructed using it are available from the COW initiative, including the CommonCOW corpora created from CommonCrawl data sets.",
+  abstract     = "Removal of boilerplate is one of the essential tasks in web corpus construction and web indexing.
+                 Boilerplate (redundant and automatically inserted material like menus, copyright notices, navigational
+                 elements, etc.) is usually considered to be linguistically unattractive for inclusion in a web corpus.
+                 Also, search engines should not index such material because it can lead to spurious results for search
+                 terms if these terms appear in boilerplate regions of the web page. The size of large web corpora
+                 necessitates the use of efficient algorithms while a high accuracy directly improves the quality of the
+                 final corpus. In this paper, I present and evaluate a supervised machine learning approach to
+                 general-purpose boilerplate detection for languages based on Latin alphabets which is both very
+                 efficient and very accurate. Using a Multilayer Perceptron and a high number of carefully engineered
+                 features, I achieve between 95\% and 99\% correct classifications (depending on the input language)
+                 with precision and recall over 0.95. Since the perceptrons are trained on language-specific data, I
+                 also evaluate how well perceptrons trained on one language perform on other languages. The single
+                 features are also evaluated for the merit they contribute to the classification. I show that the
+                 accuracy of the Multilayer Perceptron is on a par with that of other classifiers such as Support Vector
+                 Machines. I conclude that the quality of general-purpose boilerplate detectors depends mainly on the
+                 availability of many well-engineered features and which are highly language-independent. The method has
+                 been implemented in the open-source texrex web page cleaning software, and large corpora constructed
+                 using it are available from the COW initiative, including the CommonCOW corpora created from
+                 CommonCrawl data sets.",
   keywords     = "Boilerplate, Corpus construction, Non-destructive corpus normalization, Web corpora",
   cc-author-affiliation = "Freie Universität Berlin, Germany",
   cc-class     = "nlp/boilerplate-removal, web-corpora, corpus-construction",
@@ -212,7 +230,6 @@ @InProceedings{cc:MehmoodShafiqWaheed:2017:regional-context-www
   cc-class     = "web-science, webometrics",
 }
 
-
 @Article{cc:PanchenkoEtAl:2017:web-scale-dependency-corpus,
   author       = "Alexander Panchenko and Eugen Ruppert and Stefano Faralli and Simone Paolo Ponzetto and Chris
                  Biemann",
@@ -228,3 +245,13 @@ @Article{cc:PanchenkoEtAl:2017:web-scale-dependency-corpus
   cc-class     = "nlp/dependency-parsing, corpus-construction",
 }
 
+@Article{cc:KaleTaulaHewavitharanaSrivastava:2017:semantic-query-segmentation,
+  title        = "Towards semantic query segmentation",
+  author       = "Kale, Ajinkya and Taula, Thrivikrama and Hewavitharana, Sanjika and Srivastava, Amit",
+  journal      = "arXiv preprint arXiv:1707.07835",
+  year         = "2017",
+  URL          = "https://arxiv.org/abs/1707.07835",
+  cc-author-affiliation = "eBay Inc.",
+  cc-derived-dataset-cited = "GloVe-word-embeddings",
+  cc-class     = "ir/query-segmentation, nlp/word-embeddings, patent",
+}
diff --git a/bib/cc2018.bib b/bib/cc2018.bib