diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..d0b59ac --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,29 @@ +name: cc-warc-examples build + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + java: [ 8, 11, 17, 21 ] + name: Java ${{ matrix.java }} + steps: + - uses: actions/checkout@v6 + + - name: Setup JDK + uses: actions/setup-java@v5 + with: + distribution: 'temurin' + java-version: ${{ matrix.java }} + cache: 'maven' + + - name: Build + run: mvn verify javadoc:aggregate diff --git a/README.md b/README.md index fe0977a..74345fd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![Common Crawl Logo](http://commoncrawl.org/wp-content/uploads/2016/12/logocommoncrawl.png) +![Common Crawl Logo](https://avatars.githubusercontent.com/u/1194841?s=64) # Common Crawl WARC Examples diff --git a/eclipse-formatter.xml b/eclipse-formatter.xml new file mode 100644 index 0000000..e9ac2f0 --- /dev/null +++ b/eclipse-formatter.xml @@ -0,0 +1,404 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pom.xml b/pom.xml index 5e0ddb7..9dc804c 100644 --- a/pom.xml +++ b/pom.xml @@ -1,188 +1,213 @@ - - 4.0.0 + + + 4.0.0 - org.commoncrawl - cc-warc-examples - 0.5-SNAPSHOT - jar + org.commoncrawl + cc-warc-examples + 0.6-SNAPSHOT + jar - cc-warc-examples - - Common Crawl WARC Examples. - Contains both wrappers for processing WARC files in Hadoop MapReduce jobs and Hadoop examples to get you started. - - https://github.com/commoncrawl/cc-warc-examples + cc-warc-examples + Common Crawl WARC Examples. + Contains both wrappers for processing WARC files in Hadoop MapReduce jobs and Hadoop examples to get you started. + https://github.com/commoncrawl/cc-warc-examples - - - The MIT License - http://www.opensource.org/licenses/mit-license.php - repo - - + + + The MIT License + http://www.opensource.org/licenses/mit-license.php + repo + + - - scm:git:git@github.com:commoncrawl/cc-warc-examples.git - git@github.com:commoncrawl/cc-warc-examples.git - - - - UTF-8 - ${maven.build.timestamp} - yyyyMMddhhmmss - - - sonatype-nexus-staging - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - sonatype-nexus-snapshots - https://oss.sonatype.org/content/repositories/snapshots/ - + + scm:git:git@github.com:commoncrawl/cc-warc-examples.git + git@github.com:commoncrawl/cc-warc-examples.git + + + UTF-8 + ${maven.build.timestamp} + yyyyMMddhhmmss + + + sonatype-nexus-staging + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + sonatype-nexus-snapshots + https://oss.sonatype.org/content/repositories/snapshots/ + - - - log4j - log4j - 1.2.17 - + + + log4j + log4j + 1.2.17 + - - commons-io - commons-io - 2.11.0 - + + commons-io + commons-io + 2.11.0 + - - org.netpreserve.commons - webarchive-commons - 1.2.0 - - - org.apache.hadoop - hadoop-core - - - - - - org.apache.hadoop - hadoop-client - 3.3.6 - provided - - - net.java.dev.jets3t - jets3t - 0.9.4 - - + + org.netpreserve.commons + webarchive-commons + 3.0.2 + + + org.apache.hadoop + hadoop-core + + + - - src - - - - org.apache.maven.plugins - maven-compiler-plugin - 2.3.2 - - 1.7 - 1.7 - - - - maven-assembly-plugin - 2.4 - - - jar-with-dependencies - - cc-warc-examples-${project.version} - - - - package - - single - - - - - - maven-javadoc-plugin - 2.7 - - true - .svn - UTF-8 - UTF-8 - - - - - jar - javadoc - - - - site - pre-site - - javadoc - - - - - - maven-source-plugin - 2.1.1 - - - - jar - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.2.2 - - - + + org.apache.hadoop + hadoop-client + 3.3.6 + provided + + + net.java.dev.jets3t + jets3t + 0.9.4 + + - - - release-sign-artifacts - - - performRelease - true - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.4 - - ${gpg.passphrase} - - - - sign-artifacts - verify - - sign - - - - - - - - + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.15.0 + + 1.8 + 1.8 + + + + maven-assembly-plugin + 3.8.0 + + + jar-with-dependencies + + cc-warc-examples-${project.version} + + + + + single + + package + + + + + maven-javadoc-plugin + 3.12.0 + + true + .svn + UTF-8 + + + + + jar + javadoc + + + + site + + javadoc + + pre-site + + + + + maven-source-plugin + 3.4.0 + + + + jar + + + + + + org.apache.maven.plugins + maven-release-plugin + 3.3.1 + + + com.diffplug.spotless + spotless-maven-plugin + 2.46.1 + + + + + pom.xml + + + all + true + false + -1 + recommended_2008_06 + + + + + ${project.basedir}/eclipse-formatter.xml + + + + + + src + + + + + release-sign-artifacts + + + performRelease + true + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 3.2.8 + + ${gpg.passphrase} + + + + sign-artifacts + + sign + + verify + + + + + + + diff --git a/src/org/commoncrawl/examples/S3ReaderTest.java b/src/org/commoncrawl/examples/S3ReaderTest.java index cef8e73..1355573 100644 --- a/src/org/commoncrawl/examples/S3ReaderTest.java +++ b/src/org/commoncrawl/examples/S3ReaderTest.java @@ -1,4 +1,5 @@ package org.commoncrawl.examples; + import java.io.IOException; import org.archive.io.ArchiveReader; @@ -10,8 +11,8 @@ import org.jets3t.service.model.S3Object; /** - * This is a raw example of how you can retrieve a file from the - * Common Crawl S3 bucket without credentials using JetS3t. + * This is a raw example of how you can retrieve a file from the Common Crawl S3 bucket without + * credentials using JetS3t. * * @author Stephen Merity (Smerity) */ @@ -19,36 +20,37 @@ public class S3ReaderTest { public static void main(String[] args) throws IOException, ServiceException { // We're accessing a publicly available bucket so don't need to fill in our credentials S3Service s3s = new RestS3Service(null); - + // Let's grab a file out of the CommonCrawl S3 bucket String fn = "crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz"; S3Object f = s3s.getObject("commoncrawl", fn, null, null, null, null, null, null); - + // The file name identifies the ArchiveReader and indicates if it should be decompressed ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true); - + // Once we have an ArchiveReader, we can work through each of the records it contains int i = 0; - for(ArchiveRecord r : ar) { + for (ArchiveRecord r : ar) { // The header file contains information such as the type of record, size, creation time, and URL System.out.println("Header: " + r.getHeader()); System.out.println("URL: " + r.getHeader().getUrl()); System.out.println(); - + // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream // Create a byte array that is as long as all the record's stated length byte[] rawData = new byte[r.available()]; r.read(rawData); // Note: potential optimization would be to have a large buffer only allocated once - + // Why don't we convert it to a string and print the start of it? Let's hope it's text! String content = new String(rawData); System.out.println(content.substring(0, Math.min(500, content.length()))); System.out.println((content.length() > 500 ? "..." : "")); - - // Pretty printing to make the output more readable + + // Pretty printing to make the output more readable System.out.println("=-=-=-=-=-=-=-=-="); - if (i++ > 4) break; + if (i++ > 4) + break; } } } \ No newline at end of file diff --git a/src/org/commoncrawl/examples/WARCReaderTest.java b/src/org/commoncrawl/examples/WARCReaderTest.java index ea18f6a..972e9ea 100644 --- a/src/org/commoncrawl/examples/WARCReaderTest.java +++ b/src/org/commoncrawl/examples/WARCReaderTest.java @@ -1,4 +1,5 @@ package org.commoncrawl.examples; + import java.io.FileInputStream; import java.io.IOException; @@ -8,43 +9,44 @@ import org.archive.io.warc.WARCReaderFactory; /** - * A raw example of how to process a WARC file using the org.archive.io package. - * Common Crawl S3 bucket without credentials using JetS3t. + * A raw example of how to process a WARC file using the org.archive.io package. Common Crawl S3 + * bucket without credentials using JetS3t. * * @author Stephen Merity (Smerity) */ public class WARCReaderTest { /** * @param args - * @throws IOException + * @throws IOException */ public static void main(String[] args) throws IOException { - // Set up a local compressed WARC file for reading + // Set up a local compressed WARC file for reading String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz"; FileInputStream is = new FileInputStream(fn); // The file name identifies the ArchiveReader and indicates if it should be decompressed ArchiveReader ar = WARCReaderFactory.get(fn, is, true); - + // Once we have an ArchiveReader, we can work through each of the records it contains int i = 0; - for(ArchiveRecord r : ar) { + for (ArchiveRecord r : ar) { // The header file contains information such as the type of record, size, creation time, and URL System.out.println(r.getHeader()); System.out.println(r.getHeader().getUrl()); System.out.println(); - + // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream // Create a byte array that is as long as the record's stated length byte[] rawData = IOUtils.toByteArray(r, r.available()); - + // Why don't we convert it to a string and print the start of it? Let's hope it's text! String content = new String(rawData); System.out.println(content.substring(0, Math.min(500, content.length()))); System.out.println((content.length() > 500 ? "..." : "")); - - // Pretty printing to make the output more readable + + // Pretty printing to make the output more readable System.out.println("=-=-=-=-=-=-=-=-="); - if (i++ > 4) break; + if (i++ > 4) + break; } } } \ No newline at end of file diff --git a/src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java b/src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java index 40d717c..c290150 100644 --- a/src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java +++ b/src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java @@ -15,10 +15,11 @@ public class ServerTypeMap { private static final Logger LOG = Logger.getLogger(ServerTypeMap.class); + protected static enum MAPPERCOUNTER { - RECORDS_IN, - NO_SERVER, - EXCEPTIONS + RECORDS_IN, // + NO_SERVER, // + EXCEPTIONS // } protected static class ServerMapper extends Mapper { @@ -39,13 +40,16 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti String content = new String(rawData); JSONObject json = new JSONObject(content); try { - String warcType = json.getJSONObject("Envelope").getJSONObject("WARC-Header-Metadata") + String warcType = json.getJSONObject("Envelope") + .getJSONObject("WARC-Header-Metadata") .getString("WARC-Type"); if (!warcType.equals("response")) { continue; } - JSONObject httpHeaders = json.getJSONObject("Envelope").getJSONObject("Payload-Metadata") - .getJSONObject("HTTP-Response-Metadata").getJSONObject("Headers"); + JSONObject httpHeaders = json.getJSONObject("Envelope") + .getJSONObject("Payload-Metadata") + .getJSONObject("HTTP-Response-Metadata") + .getJSONObject("Headers"); JSONArray httpHeaderNames = httpHeaders.names(); for (int i = 0, l = httpHeaders.length(); i < l; i++) { String headerName = httpHeaderNames.getString(i); diff --git a/src/org/commoncrawl/examples/mapreduce/TagCounterMap.java b/src/org/commoncrawl/examples/mapreduce/TagCounterMap.java index 1fb4290..cc5c663 100644 --- a/src/org/commoncrawl/examples/mapreduce/TagCounterMap.java +++ b/src/org/commoncrawl/examples/mapreduce/TagCounterMap.java @@ -15,9 +15,10 @@ public class TagCounterMap { private static final Logger LOG = Logger.getLogger(TagCounterMap.class); + protected static enum MAPPERCOUNTER { - RECORDS_IN, - EXCEPTIONS + RECORDS_IN, // + EXCEPTIONS // } protected static class TagCounterMapper extends Mapper { @@ -33,7 +34,7 @@ protected static class TagCounterMapper extends Mapper { private Text outKey = new Text(); private LongWritable outVal = new LongWritable(1); @@ -83,9 +83,9 @@ public void setup(Context context) { Configuration conf = context.getConfiguration(); maxOutlinksPerPage = conf.getInt("wat.outlinks.max.per.page", 80); /** - * weighted link counts: each page can distributed `wat.outlinks.max.per.page` - * points, links from pages with many links get a lower weight, the weight is - * calculated as `wat.outlinks.max.per.page / num_links_of_page` + * weighted link counts: each page can distributed `wat.outlinks.max.per.page` points, links + * from pages with many links get a lower weight, the weight is calculated as + * `wat.outlinks.max.per.page / num_links_of_page` */ outlinksWeightedCount = conf.getBoolean("wat.outlinks.weighted.count", false); extractFeed = conf.getBoolean("wat.outlinks.extract.feed", false); @@ -95,8 +95,8 @@ public void setup(Context context) { String nofollowBotPatternString = conf.get("wat.outlinks.respect.nofollow.bot.pattern", ""); if (!nofollowBotPatternString.isBlank()) { try { - nofollowBotPattern = Pattern.compile("\\s*" + nofollowBotPatternString + "\\s*", - Pattern.CASE_INSENSITIVE); + nofollowBotPattern = Pattern + .compile("\\s*" + nofollowBotPatternString + "\\s*", Pattern.CASE_INSENSITIVE); } catch (IllegalArgumentException e) { LOG.error("Failed to compile wat.outlinks.respect.nofollow.bot.pattern", e); } @@ -106,8 +106,7 @@ public void setup(Context context) { @Override public void map(Text key, ArchiveReader value, Context context) throws IOException { - record: - for (ArchiveRecord r : value) { + record: for (ArchiveRecord r : value) { // Skip any records that are not JSON if (!r.getHeader().getMimetype().equals("application/json")) { continue record; @@ -128,7 +127,7 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti String base = warcHeader.getString("WARC-Target-URI"); if (base.charAt(0) == '<') { // some WARC file enclose the WARC-Target-URI in <...> - base = base.substring(1, (base.length()-2)); + base = base.substring(1, (base.length() - 2)); } URL baseUrl = new URL(base); JSONObject responseMetaData = json.getJSONObject("Envelope") @@ -159,12 +158,13 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti } } } else { - LOG.error("Unexpected JSON value type when processing X-Robots-Tag: " - + headerValue.getClass().getName()); + LOG.error( + "Unexpected JSON value type when processing X-Robots-Tag: " + + headerValue.getClass().getName()); } /* - * Note: continue to iterate over all HTTP headers because there might be - * variants (lower/upper case) of the "X-Robots-Tag" header + * Note: continue to iterate over all HTTP headers because there might be variants + * (lower/upper case) of the "X-Robots-Tag" header */ } } @@ -205,7 +205,8 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti || (nofollowBotPattern != null && nofollowBotPattern.matcher(meta.getString("name")).matches()))) { // check HTML meta "robots" - if (meta.has("content") && nofollowPattern.matcher(meta.getString("content")).find()) { + if (meta.has("content") + && nofollowPattern.matcher(meta.getString("content")).find()) { context.getCounter(COUNTER.RECORDS_NOFOLLOW_META_SKIPPED).increment(1); continue record; } @@ -245,7 +246,8 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti context.getCounter(COUNTER.EXCEPTIONS_JSON).increment(1); LOG.error("Caught JSONException while processing record for " + r.getHeader().getUrl(), ex); } catch (MalformedURLException ex) { - LOG.error("Caught MalformedURLException while processing record for " + r.getHeader().getUrl(), + LOG.error( + "Caught MalformedURLException while processing record for " + r.getHeader().getUrl(), ex); context.getCounter(COUNTER.EXCEPTIONS_URL_MALFORMED).increment(1); } catch (Exception ex) { @@ -263,17 +265,16 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti private void addOutLinks(Context context, Collection outLinks, URL baseUrl, JSONArray links) throws JSONException { context.getCounter(COUNTER.LINKS_TOTAL).increment(links.length()); - links: - for (int i = 0, l = links.length(); i < l; i++) { + links: for (int i = 0, l = links.length(); i < l; i++) { JSONObject link = links.getJSONObject(i); if (link.has("url") && link.has("path")) { String linkTypeMarker = ""; String path = link.getString("path"); String urlStr = link.getString("url"); - path: - switch (path) { + path: switch (path) { case "A@/href": - if (respectNofollow && link.has("rel") && nofollowPattern.matcher(link.getString("rel")).find()) { + if (respectNofollow && link.has("rel") + && nofollowPattern.matcher(link.getString("rel")).find()) { context.getCounter(COUNTER.LINKS_REL_NOFOLLOW_SKIPPED).increment(1); continue links; } @@ -291,21 +292,20 @@ private void addOutLinks(Context context, Collection outLinks, URL baseU continue links; case "LINK@/href": if (link.has("rel")) { - switch(link.getString("rel")) { + switch (link.getString("rel")) { case "canonical": break path; case "alternate": if (extractFeed && link.has("type")) { String type = link.getString("type"); - if ("application/atom+xml".equals(type) - || "application/rss+xml".equals(type)) { + if ("application/atom+xml".equals(type) || "application/rss+xml".equals(type)) { linkTypeMarker = extractFeedMarker; break path; } } // fall-through for non-feed rel links default: - // ignore rels not explicitly listed + // ignore rels not explicitly listed context.getCounter(COUNTER.LINKS_MEDIA_SKIPPED).increment(1); continue links; } @@ -335,13 +335,12 @@ private void addOutLinks(Context context, Collection outLinks, URL baseU } } - protected static class OutLinkCombiner extends Reducer { private LongWritable outVal = new LongWritable(1); /** - * @return true if text is safe and does not contain any control - * characters (U+0000 - U+001F) including '\t', '\r', '\n' + * @return true if text is safe and does not contain any control characters (U+0000 - U+001F) + * including '\t', '\r', '\n' */ public static boolean isSafeText(Text text) { int pos = 0; @@ -359,8 +358,8 @@ public static boolean isSafeText(Text text) { } @Override - public void reduce(Text key, Iterable values, - Context context) throws IOException, InterruptedException { + public void reduce(Text key, Iterable values, Context context) + throws IOException, InterruptedException { if (!isSafeText(key)) { context.getCounter(COUNTER.LINKS_UNSAFE_TEXT_SKIPPED).increment(1); return; @@ -390,8 +389,8 @@ public void setup(Context context) { } @Override - public void reduce(Text key, Iterable values, - Context context) throws IOException, InterruptedException { + public void reduce(Text key, Iterable values, Context context) + throws IOException, InterruptedException { if (!isSafeText(key)) { context.getCounter(COUNTER.LINKS_UNSAFE_TEXT_SKIPPED).increment(1); return; @@ -400,7 +399,7 @@ public void reduce(Text key, Iterable values, for (LongWritable val : values) { sum += val.get(); } - if (sampleProbability <= 0.0 || (sum*Math.random()) >= sampleProbability) { + if (sampleProbability <= 0.0 || (sum * Math.random()) >= sampleProbability) { // multiply random by number of times outlink URL has been observed outVal.set(sum); context.write(key, outVal); @@ -440,7 +439,8 @@ public int run(String[] args) throws Exception { return run(outputPath, inputPaths.toArray(new Path[inputPaths.size()])); } - public int run(Path outputPath, Path[] inputPaths) throws IOException, ClassNotFoundException, InterruptedException { + public int run(Path outputPath, Path[] inputPaths) + throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = getConf(); Job job = Job.getInstance(conf); diff --git a/src/org/commoncrawl/examples/mapreduce/WATServerType.java b/src/org/commoncrawl/examples/mapreduce/WATServerType.java index 106db97..3e635fa 100644 --- a/src/org/commoncrawl/examples/mapreduce/WATServerType.java +++ b/src/org/commoncrawl/examples/mapreduce/WATServerType.java @@ -26,9 +26,9 @@ */ public class WATServerType extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(WATServerType.class); - + /** - * Main entry point that uses the {@link ToolRunner} class to run the Hadoop job. + * Main entry point that uses the {@link ToolRunner} class to run the Hadoop job. */ public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new WATServerType(), args); @@ -59,13 +59,14 @@ public int run(String[] args) throws Exception { return run(outputPath, inputPaths.toArray(new Path[inputPaths.size()])); } - public int run(Path outputPath, Path[] inputPaths) throws IOException, ClassNotFoundException, InterruptedException { + public int run(Path outputPath, Path[] inputPaths) + throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = getConf(); Job job = Job.getInstance(conf); job.setJarByClass(WATServerType.class); job.setNumReduceTasks(1); - + for (int i = 0; i < inputPaths.length; i++) { LOG.info("Input path: " + inputPaths[i]); FileInputFormat.addInputPath(job, inputPaths[i]); @@ -73,20 +74,20 @@ public int run(Path outputPath, Path[] inputPaths) throws IOException, ClassNotF LOG.info("Output path: " + outputPath); FileOutputFormat.setOutputPath(job, outputPath); - + job.setInputFormatClass(WARCFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); - + job.setOutputKeyClass(Text.class); - job.setOutputValueClass(LongWritable.class); - - job.setMapperClass(ServerTypeMap.ServerMapper.class); - job.setReducerClass(LongSumReducer.class); - - if (job.waitForCompletion(true)) { - return 0; - } else { - return 1; - } + job.setOutputValueClass(LongWritable.class); + + job.setMapperClass(ServerTypeMap.ServerMapper.class); + job.setReducerClass(LongSumReducer.class); + + if (job.waitForCompletion(true)) { + return 0; + } else { + return 1; + } } } diff --git a/src/org/commoncrawl/examples/mapreduce/WordCounterMap.java b/src/org/commoncrawl/examples/mapreduce/WordCounterMap.java index 3f0211a..d8d549e 100644 --- a/src/org/commoncrawl/examples/mapreduce/WordCounterMap.java +++ b/src/org/commoncrawl/examples/mapreduce/WordCounterMap.java @@ -13,11 +13,12 @@ public class WordCounterMap { private static final Logger LOG = Logger.getLogger(WordCounterMap.class); + protected static enum MAPPERCOUNTER { - RECORDS_IN, - EMPTY_PAGE_TEXT, - EXCEPTIONS, - NON_PLAIN_TEXT + RECORDS_IN, // + EMPTY_PAGE_TEXT, // + EXCEPTIONS, // + NON_PLAIN_TEXT // } protected static class WordCountMapper extends Mapper { @@ -48,8 +49,7 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti } else { context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1); } - } - catch (Exception ex) { + } catch (Exception ex) { LOG.error("Caught Exception", ex); context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1); } diff --git a/src/org/commoncrawl/warc/WARCFileInputFormat.java b/src/org/commoncrawl/warc/WARCFileInputFormat.java index 89d2433..d752774 100644 --- a/src/org/commoncrawl/warc/WARCFileInputFormat.java +++ b/src/org/commoncrawl/warc/WARCFileInputFormat.java @@ -12,8 +12,8 @@ import org.archive.io.ArchiveReader; /** - * Minimal implementation of FileInputFormat for WARC files. - * Hadoop is told that splitting these compressed files is not possible. + * Minimal implementation of FileInputFormat for WARC files. Hadoop is told that splitting these + * compressed files is not possible. * * @author Stephen Merity (Smerity) */ @@ -24,7 +24,7 @@ public RecordReader createRecordReader(InputSplit split, Ta throws IOException, InterruptedException { return new WARCFileRecordReader(); } - + @Override protected boolean isSplitable(JobContext context, Path filename) { // As these are compressed files, they cannot be (sanely) split diff --git a/src/org/commoncrawl/warc/WARCFileRecordReader.java b/src/org/commoncrawl/warc/WARCFileRecordReader.java index b1e8e1e..a31d6ad 100644 --- a/src/org/commoncrawl/warc/WARCFileRecordReader.java +++ b/src/org/commoncrawl/warc/WARCFileRecordReader.java @@ -15,9 +15,9 @@ import org.archive.io.warc.WARCReaderFactory; /** - * The WARC File Record Reader processes a single compressed input. - * The Record Reader returns a single WARC ArchiveReader that can contain - * numerous individual documents, each document handled in a single mapper. + * The WARC File Record Reader processes a single compressed input. The Record Reader returns a + * single WARC ArchiveReader that can contain numerous individual documents, each document + * handled in a single mapper. * * @author Stephen Merity (Smerity) */ @@ -28,8 +28,7 @@ public class WARCFileRecordReader extends RecordReader { private boolean hasBeenRead = false; @Override - public void initialize(InputSplit inputSplit, TaskAttemptContext context) - throws IOException, InterruptedException { + public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) inputSplit; Configuration conf = context.getConfiguration(); Path path = split.getPath();