% Source: cc-citations/bib/cc2013.bib (main branch), commoncrawl/cc-citations on GitHub

% University of Edinburgh system description for the IWSLT 2013 evaluation (title completed from the URL filename).
% NOTE(review): stored as an Article without a journal; this looks like a workshop paper - confirm the venue.
@Article{cc:BirchDurraniKoehn:2013:Edinburgh-SLT-and-MT-system,
  year         = "2013",
  author       = "Alexandra Birch and Nadir Durrani and Philipp Koehn",
  title        = "Edinburgh {SLT} and {MT} System Description for the {IWSLT} 2013 Evaluation",
  URL          = "http://workshop2013.iwslt.org/downloads/Edinburgh_SLT_and_MT_System_Description_for_the_IWSLT_2013_Evaluation.pdf",
  cc-author-affiliation = "School of Informatics, University of Edinburgh",
}

% Paper on mining parallel text from the Common Crawl; published by the ACL (anthology id P13-1135 per the URL).
% NOTE(review): stored as an Article without a journal; the ACL Anthology URL suggests a proceedings paper - confirm.
@Article{cc:SmithSaint-AmandPlamadaKoehnEtAl:2013:Dirt-cheap-web-scale-parallel-corpus,
  year         = "2013",
  author       = "Jason R. Smith and Herve Saint-Amand and Magdalena Plamada and Philipp Koehn and Chris Callison-Burch
                 and Adam Lopez",
  title        = "Dirt Cheap Web-Scale Parallel Text from the Common Crawl",
  pages        = "1374--1383",
  URL          = "http://www.aclweb.org/anthology/P13-1135",
  abstract     = "Parallel text is the fuel that drives modern machine translation systems. The Web is a comprehensive
                 source of preexisting parallel text, but crawling the entire web is impossible for all but the largest
                 companies. We bring web-scale parallel text to the masses by mining the Common Crawl, a public Web
                 crawl hosted on Amazon's Elastic Cloud. Starting from nothing more than a set of common two-letter
                 language codes, our open-source extension of the STRAND algorithm mined 32 terabytes of the crawl in
                 just under a day, at a cost of about $500. Our large-scale experiment uncovers large amounts of
                 parallel text in dozens of language pairs across a variety of domains and genres, some previously
                 unavailable in curated datasets. Even with minimal cleaning and filtering, the resulting data boosts
                 translation performance across the board for five different language pairs in the news domain, and on
                 open domain test sets we see improvements of up to 5 BLEU. We make our code and data available for
                 other researchers seeking to mine this rich new data resource.",
  publisher    = "Association for Computational Linguistics",
  cc-author-affiliation = "Johns Hopkins University, University of Edinburgh, University of Zurich, University of
                 Pennsylvania",
}

% Uppsala University paper on tunable distortion limits and corpus cleaning for SMT; PDF hosted under statmt.org/wmt13.
% NOTE(review): stored as an Article without a journal; the URL suggests a WMT 2013 workshop paper - confirm the venue.
@Article{cc:StymneHardmeierTiedemannNivre:2013:Tunable-distortion-limits,
  year         = "2013",
  author       = "Sara Stymne and Christian Hardmeier and Jörg Tiedemann and Joakim Nivre",
  title        = "Tunable Distortion Limits and Corpus Cleaning for {SMT}",
  URL          = "http://statmt.org/wmt13/pdf/WMT29.pdf",
  cc-author-affiliation = "Uppsala University: Department of Linguistics and Philology",
}

% Description of the KIT translation systems for IWSLT 2013; PDF hosted on the workshop download site.
@article{cc:HaHerrmannNiehuesMedianiEtAl:2013:KIT-translation-systems-for-IWSLT-2013,
  author = {Thanh-Le Ha and Teresa Herrmann and Jan Niehues and Mohammed Mediani and Eunah Cho and Yuqi Zhang and Isabel Slawik and Alex Waibel},
  title  = {The {KIT} Translation Systems for {IWSLT} 2013},
  year   = {2013},
  url    = {http://workshop2013.iwslt.org/downloads/The_KIT_Translation_Systems_for_IWSLT_2013.pdf},
  cc-author-affiliation = {Institute for Anthropomatics},
}

% "Traitor" paper on associating concepts using the World Wide Web; authors affiliated with the University of Twente.
@article{cc:DrijfhoutJundtWeversHiemstra:2013:Traitor,
  author = {Wanno Drijfhout and Oliver Jundt and Lesley Wevers and Djoerd Hiemstra},
  title  = {Traitor: Associating Concepts using the World Wide Web},
  year   = {2013},
  url    = {http://doc.utwente.nl/88328/},
  cc-author-affiliation = {University of Twente},
}

% Quantitative analysis of RDFa, Microdata, and Microformats deployment on the Web (URL filename mentions ISWC In-Use 2013).
% NOTE(review): the citation key contains a non-ASCII character; classic BibTeX toolchains may reject it. The key is
% kept unchanged so existing citations keep resolving.
@Article{cc:BizerEckertMeuselMühleisenEtAl:2013:RDFa-microdata-microformats,
  year         = "2013",
  author       = "Christian Bizer and Kai Eckert and Robert Meusel and Hannes Mühleisen and Michael Schuhmacher and
                 Johanna Völker",
  title        = "Deployment of {RDFa}, Microdata, and Microformats on the Web – {A} Quantitative Analysis",
  URL          = "http://hannes.muehleisen.org/Bizer-etal-DeploymentRDFaMicrodataMicroformats-ISWC-InUse-2013.pdf",
  cc-author-affiliation = "Data and Web Science Group – University of Mannheim, Database Architectures Group, Centrum
                 Wiskunde & Informatica, Netherlands",
}