diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..d0b59ac
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,29 @@
+name: cc-warc-examples build  # CI workflow: builds the project with Maven on several JDKs
+
+on:  # triggers: pushes to master and pull requests targeting master
+ push:
+ branches:
+ - master
+ pull_request:
+ branches:
+ - master
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ java: [ 8, 11, 17, 21 ]  # one job per listed Java version. NOTE(review): String.isBlank() (Java 11+) is called in the sources touched by this patch; confirm the Java 8 job compiles
+ name: Java ${{ matrix.java }}  # job label shows the Java version of the matrix entry
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Setup JDK
+ uses: actions/setup-java@v5
+ with:
+ distribution: 'temurin'
+ java-version: ${{ matrix.java }}  # JDK version comes from the matrix entry
+ cache: 'maven'  # NOTE(review): presumably enables setup-java's Maven dependency cache; confirm against the action docs
+
+ - name: Build
+ run: mvn verify javadoc:aggregate  # runs the Maven verify lifecycle, then the javadoc:aggregate goal
diff --git a/README.md b/README.md
index fe0977a..74345fd 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-
+
# Common Crawl WARC Examples
diff --git a/eclipse-formatter.xml b/eclipse-formatter.xml
new file mode 100644
index 0000000..e9ac2f0
--- /dev/null
+++ b/eclipse-formatter.xml
@@ -0,0 +1,404 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pom.xml b/pom.xml
index 5e0ddb7..9dc804c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,188 +1,213 @@
-
- 4.0.0
+
+
+ 4.0.0
- org.commoncrawl
- cc-warc-examples
- 0.5-SNAPSHOT
- jar
+ org.commoncrawl
+ cc-warc-examples
+ 0.6-SNAPSHOT
+ jar
- cc-warc-examples
-
- Common Crawl WARC Examples.
- Contains both wrappers for processing WARC files in Hadoop MapReduce jobs and Hadoop examples to get you started.
-
- https://github.com/commoncrawl/cc-warc-examples
+ cc-warc-examples
+ Common Crawl WARC Examples.
+ Contains both wrappers for processing WARC files in Hadoop MapReduce jobs and Hadoop examples to get you started.
+ https://github.com/commoncrawl/cc-warc-examples
-
-
- The MIT License
- http://www.opensource.org/licenses/mit-license.php
- repo
-
-
+
+
+ The MIT License
+ http://www.opensource.org/licenses/mit-license.php
+ repo
+
+
-
- scm:git:git@github.com:commoncrawl/cc-warc-examples.git
- git@github.com:commoncrawl/cc-warc-examples.git
-
-
-
- UTF-8
- ${maven.build.timestamp}
- yyyyMMddhhmmss
-
-
- sonatype-nexus-staging
- https://oss.sonatype.org/service/local/staging/deploy/maven2/
- sonatype-nexus-snapshots
- https://oss.sonatype.org/content/repositories/snapshots/
-
+
+ scm:git:git@github.com:commoncrawl/cc-warc-examples.git
+ git@github.com:commoncrawl/cc-warc-examples.git
+
+
+ UTF-8
+ ${maven.build.timestamp}
+ yyyyMMddhhmmss
+
+
+ sonatype-nexus-staging
+ https://oss.sonatype.org/service/local/staging/deploy/maven2/
+ sonatype-nexus-snapshots
+ https://oss.sonatype.org/content/repositories/snapshots/
+
-
-
- log4j
- log4j
- 1.2.17
-
+
+
+ log4j
+ log4j
+ 1.2.17
+
-
- commons-io
- commons-io
- 2.11.0
-
+
+ commons-io
+ commons-io
+ 2.11.0
+
-
- org.netpreserve.commons
- webarchive-commons
- 1.2.0
-
-
- org.apache.hadoop
- hadoop-core
-
-
-
-
-
- org.apache.hadoop
- hadoop-client
- 3.3.6
- provided
-
-
- net.java.dev.jets3t
- jets3t
- 0.9.4
-
-
+
+ org.netpreserve.commons
+ webarchive-commons
+ 3.0.2
+
+
+ org.apache.hadoop
+ hadoop-core
+
+
+
-
- src
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 2.3.2
-
- 1.7
- 1.7
-
-
-
- maven-assembly-plugin
- 2.4
-
-
- jar-with-dependencies
-
- cc-warc-examples-${project.version}
-
-
-
- package
-
- single
-
-
-
-
-
- maven-javadoc-plugin
- 2.7
-
- true
- .svn
- UTF-8
- UTF-8
-
-
-
-
- jar
- javadoc
-
-
-
- site
- pre-site
-
- javadoc
-
-
-
-
-
- maven-source-plugin
- 2.1.1
-
-
-
- jar
-
-
-
-
-
- org.apache.maven.plugins
- maven-release-plugin
- 2.2.2
-
-
-
+
+ org.apache.hadoop
+ hadoop-client
+ 3.3.6
+ provided
+
+
+ net.java.dev.jets3t
+ jets3t
+ 0.9.4
+
+
-
-
- release-sign-artifacts
-
-
- performRelease
- true
-
-
-
-
-
- org.apache.maven.plugins
- maven-gpg-plugin
- 1.4
-
- ${gpg.passphrase}
-
-
-
- sign-artifacts
- verify
-
- sign
-
-
-
-
-
-
-
-
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.15.0
+
+ 1.8
+ 1.8
+
+
+
+ maven-assembly-plugin
+ 3.8.0
+
+
+ jar-with-dependencies
+
+ cc-warc-examples-${project.version}
+
+
+
+
+ single
+
+ package
+
+
+
+
+ maven-javadoc-plugin
+ 3.12.0
+
+ true
+ .svn
+ UTF-8
+
+
+
+
+ jar
+ javadoc
+
+
+
+ site
+
+ javadoc
+
+ pre-site
+
+
+
+
+ maven-source-plugin
+ 3.4.0
+
+
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-release-plugin
+ 3.3.1
+
+
+ com.diffplug.spotless
+ spotless-maven-plugin
+ 2.46.1
+
+
+
+
+ pom.xml
+
+
+ all
+ true
+ false
+ -1
+ recommended_2008_06
+
+
+
+
+ ${project.basedir}/eclipse-formatter.xml
+
+
+
+
+
+ src
+
+
+
+
+ release-sign-artifacts
+
+
+ performRelease
+ true
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 3.2.8
+
+ ${gpg.passphrase}
+
+
+
+ sign-artifacts
+
+ sign
+
+ verify
+
+
+
+
+
+
+
diff --git a/src/org/commoncrawl/examples/S3ReaderTest.java b/src/org/commoncrawl/examples/S3ReaderTest.java
index cef8e73..1355573 100644
--- a/src/org/commoncrawl/examples/S3ReaderTest.java
+++ b/src/org/commoncrawl/examples/S3ReaderTest.java
@@ -1,4 +1,5 @@
package org.commoncrawl.examples;
+
import java.io.IOException;
import org.archive.io.ArchiveReader;
@@ -10,8 +11,8 @@
import org.jets3t.service.model.S3Object;
/**
- * This is a raw example of how you can retrieve a file from the
- * Common Crawl S3 bucket without credentials using JetS3t.
+ * This is a raw example of how you can retrieve a file from the Common Crawl S3 bucket without
+ * credentials using JetS3t.
*
* @author Stephen Merity (Smerity)
*/
@@ -19,36 +20,37 @@ public class S3ReaderTest {
public static void main(String[] args) throws IOException, ServiceException {
// We're accessing a publicly available bucket so don't need to fill in our credentials
S3Service s3s = new RestS3Service(null);
-
+
// Let's grab a file out of the CommonCrawl S3 bucket
String fn = "crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
S3Object f = s3s.getObject("commoncrawl", fn, null, null, null, null, null, null);
-
+
// The file name identifies the ArchiveReader and indicates if it should be decompressed
ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
-
+
// Once we have an ArchiveReader, we can work through each of the records it contains
int i = 0;
- for(ArchiveRecord r : ar) {
+ for (ArchiveRecord r : ar) {
// The header file contains information such as the type of record, size, creation time, and URL
System.out.println("Header: " + r.getHeader());
System.out.println("URL: " + r.getHeader().getUrl());
System.out.println();
-
+
// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
// Create a byte array that is as long as all the record's stated length
byte[] rawData = new byte[r.available()];
r.read(rawData);
// Note: potential optimization would be to have a large buffer only allocated once
-
+
// Why don't we convert it to a string and print the start of it? Let's hope it's text!
String content = new String(rawData);
System.out.println(content.substring(0, Math.min(500, content.length())));
System.out.println((content.length() > 500 ? "..." : ""));
-
- // Pretty printing to make the output more readable
+
+ // Pretty printing to make the output more readable
System.out.println("=-=-=-=-=-=-=-=-=");
- if (i++ > 4) break;
+ if (i++ > 4)
+ break;
}
}
}
\ No newline at end of file
diff --git a/src/org/commoncrawl/examples/WARCReaderTest.java b/src/org/commoncrawl/examples/WARCReaderTest.java
index ea18f6a..972e9ea 100644
--- a/src/org/commoncrawl/examples/WARCReaderTest.java
+++ b/src/org/commoncrawl/examples/WARCReaderTest.java
@@ -1,4 +1,5 @@
package org.commoncrawl.examples;
+
import java.io.FileInputStream;
import java.io.IOException;
@@ -8,43 +9,44 @@
import org.archive.io.warc.WARCReaderFactory;
/**
- * A raw example of how to process a WARC file using the org.archive.io package.
- * Common Crawl S3 bucket without credentials using JetS3t.
+ * A raw example of how to process a local, compressed WARC file using the org.archive.io
+ * package.
*
* @author Stephen Merity (Smerity)
*/
public class WARCReaderTest {
/**
* @param args
- * @throws IOException
+ * @throws IOException
*/
public static void main(String[] args) throws IOException {
- // Set up a local compressed WARC file for reading
+ // Set up a local compressed WARC file for reading
String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
FileInputStream is = new FileInputStream(fn);
// The file name identifies the ArchiveReader and indicates if it should be decompressed
ArchiveReader ar = WARCReaderFactory.get(fn, is, true);
-
+
// Once we have an ArchiveReader, we can work through each of the records it contains
int i = 0;
- for(ArchiveRecord r : ar) {
+ for (ArchiveRecord r : ar) {
// The header file contains information such as the type of record, size, creation time, and URL
System.out.println(r.getHeader());
System.out.println(r.getHeader().getUrl());
System.out.println();
-
+
// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
// Create a byte array that is as long as the record's stated length
byte[] rawData = IOUtils.toByteArray(r, r.available());
-
+
// Why don't we convert it to a string and print the start of it? Let's hope it's text!
String content = new String(rawData);
System.out.println(content.substring(0, Math.min(500, content.length())));
System.out.println((content.length() > 500 ? "..." : ""));
-
- // Pretty printing to make the output more readable
+
+ // Pretty printing to make the output more readable
System.out.println("=-=-=-=-=-=-=-=-=");
- if (i++ > 4) break;
+ if (i++ > 4)
+ break;
}
}
}
\ No newline at end of file
diff --git a/src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java b/src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java
index 40d717c..c290150 100644
--- a/src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java
+++ b/src/org/commoncrawl/examples/mapreduce/ServerTypeMap.java
@@ -15,10 +15,11 @@
public class ServerTypeMap {
private static final Logger LOG = Logger.getLogger(ServerTypeMap.class);
+
protected static enum MAPPERCOUNTER {
- RECORDS_IN,
- NO_SERVER,
- EXCEPTIONS
+ RECORDS_IN, //
+ NO_SERVER, //
+ EXCEPTIONS //
}
protected static class ServerMapper extends Mapper {
@@ -39,13 +40,16 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti
String content = new String(rawData);
JSONObject json = new JSONObject(content);
try {
- String warcType = json.getJSONObject("Envelope").getJSONObject("WARC-Header-Metadata")
+ String warcType = json.getJSONObject("Envelope")
+ .getJSONObject("WARC-Header-Metadata")
.getString("WARC-Type");
if (!warcType.equals("response")) {
continue;
}
- JSONObject httpHeaders = json.getJSONObject("Envelope").getJSONObject("Payload-Metadata")
- .getJSONObject("HTTP-Response-Metadata").getJSONObject("Headers");
+ JSONObject httpHeaders = json.getJSONObject("Envelope")
+ .getJSONObject("Payload-Metadata")
+ .getJSONObject("HTTP-Response-Metadata")
+ .getJSONObject("Headers");
JSONArray httpHeaderNames = httpHeaders.names();
for (int i = 0, l = httpHeaders.length(); i < l; i++) {
String headerName = httpHeaderNames.getString(i);
diff --git a/src/org/commoncrawl/examples/mapreduce/TagCounterMap.java b/src/org/commoncrawl/examples/mapreduce/TagCounterMap.java
index 1fb4290..cc5c663 100644
--- a/src/org/commoncrawl/examples/mapreduce/TagCounterMap.java
+++ b/src/org/commoncrawl/examples/mapreduce/TagCounterMap.java
@@ -15,9 +15,10 @@
public class TagCounterMap {
private static final Logger LOG = Logger.getLogger(TagCounterMap.class);
+
protected static enum MAPPERCOUNTER {
- RECORDS_IN,
- EXCEPTIONS
+ RECORDS_IN, //
+ EXCEPTIONS //
}
protected static class TagCounterMapper extends Mapper {
@@ -33,7 +34,7 @@ protected static class TagCounterMapper extends Mapper {
private Text outKey = new Text();
private LongWritable outVal = new LongWritable(1);
@@ -83,9 +83,9 @@ public void setup(Context context) {
Configuration conf = context.getConfiguration();
maxOutlinksPerPage = conf.getInt("wat.outlinks.max.per.page", 80);
/**
- * weighted link counts: each page can distributed `wat.outlinks.max.per.page`
- * points, links from pages with many links get a lower weight, the weight is
- * calculated as `wat.outlinks.max.per.page / num_links_of_page`
+ * weighted link counts: each page can distribute `wat.outlinks.max.per.page` points, links
+ * from pages with many links get a lower weight, the weight is calculated as
+ * `wat.outlinks.max.per.page / num_links_of_page`
*/
outlinksWeightedCount = conf.getBoolean("wat.outlinks.weighted.count", false);
extractFeed = conf.getBoolean("wat.outlinks.extract.feed", false);
@@ -95,8 +95,8 @@ public void setup(Context context) {
String nofollowBotPatternString = conf.get("wat.outlinks.respect.nofollow.bot.pattern", "");
if (!nofollowBotPatternString.isBlank()) {
try {
- nofollowBotPattern = Pattern.compile("\\s*" + nofollowBotPatternString + "\\s*",
- Pattern.CASE_INSENSITIVE);
+ nofollowBotPattern = Pattern
+ .compile("\\s*" + nofollowBotPatternString + "\\s*", Pattern.CASE_INSENSITIVE);
} catch (IllegalArgumentException e) {
LOG.error("Failed to compile wat.outlinks.respect.nofollow.bot.pattern", e);
}
@@ -106,8 +106,7 @@ public void setup(Context context) {
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
- record:
- for (ArchiveRecord r : value) {
+ record: for (ArchiveRecord r : value) {
// Skip any records that are not JSON
if (!r.getHeader().getMimetype().equals("application/json")) {
continue record;
@@ -128,7 +127,7 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti
String base = warcHeader.getString("WARC-Target-URI");
if (base.charAt(0) == '<') {
// some WARC file enclose the WARC-Target-URI in <...>
- base = base.substring(1, (base.length()-2));
+ base = base.substring(1, (base.length() - 2));
}
URL baseUrl = new URL(base);
JSONObject responseMetaData = json.getJSONObject("Envelope")
@@ -159,12 +158,13 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti
}
}
} else {
- LOG.error("Unexpected JSON value type when processing X-Robots-Tag: "
- + headerValue.getClass().getName());
+ LOG.error(
+ "Unexpected JSON value type when processing X-Robots-Tag: "
+ + headerValue.getClass().getName());
}
/*
- * Note: continue to iterate over all HTTP headers because there might be
- * variants (lower/upper case) of the "X-Robots-Tag" header
+ * Note: continue to iterate over all HTTP headers because there might be variants
+ * (lower/upper case) of the "X-Robots-Tag" header
*/
}
}
@@ -205,7 +205,8 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti
|| (nofollowBotPattern != null
&& nofollowBotPattern.matcher(meta.getString("name")).matches()))) {
// check HTML meta "robots"
- if (meta.has("content") && nofollowPattern.matcher(meta.getString("content")).find()) {
+ if (meta.has("content")
+ && nofollowPattern.matcher(meta.getString("content")).find()) {
context.getCounter(COUNTER.RECORDS_NOFOLLOW_META_SKIPPED).increment(1);
continue record;
}
@@ -245,7 +246,8 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti
context.getCounter(COUNTER.EXCEPTIONS_JSON).increment(1);
LOG.error("Caught JSONException while processing record for " + r.getHeader().getUrl(), ex);
} catch (MalformedURLException ex) {
- LOG.error("Caught MalformedURLException while processing record for " + r.getHeader().getUrl(),
+ LOG.error(
+ "Caught MalformedURLException while processing record for " + r.getHeader().getUrl(),
ex);
context.getCounter(COUNTER.EXCEPTIONS_URL_MALFORMED).increment(1);
} catch (Exception ex) {
@@ -263,17 +265,16 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti
private void addOutLinks(Context context, Collection outLinks, URL baseUrl, JSONArray links)
throws JSONException {
context.getCounter(COUNTER.LINKS_TOTAL).increment(links.length());
- links:
- for (int i = 0, l = links.length(); i < l; i++) {
+ links: for (int i = 0, l = links.length(); i < l; i++) {
JSONObject link = links.getJSONObject(i);
if (link.has("url") && link.has("path")) {
String linkTypeMarker = "";
String path = link.getString("path");
String urlStr = link.getString("url");
- path:
- switch (path) {
+ path: switch (path) {
case "A@/href":
- if (respectNofollow && link.has("rel") && nofollowPattern.matcher(link.getString("rel")).find()) {
+ if (respectNofollow && link.has("rel")
+ && nofollowPattern.matcher(link.getString("rel")).find()) {
context.getCounter(COUNTER.LINKS_REL_NOFOLLOW_SKIPPED).increment(1);
continue links;
}
@@ -291,21 +292,20 @@ private void addOutLinks(Context context, Collection outLinks, URL baseU
continue links;
case "LINK@/href":
if (link.has("rel")) {
- switch(link.getString("rel")) {
+ switch (link.getString("rel")) {
case "canonical":
break path;
case "alternate":
if (extractFeed && link.has("type")) {
String type = link.getString("type");
- if ("application/atom+xml".equals(type)
- || "application/rss+xml".equals(type)) {
+ if ("application/atom+xml".equals(type) || "application/rss+xml".equals(type)) {
linkTypeMarker = extractFeedMarker;
break path;
}
}
// fall-through for non-feed rel links
default:
- // ignore rels not explicitly listed
+ // ignore rels not explicitly listed
context.getCounter(COUNTER.LINKS_MEDIA_SKIPPED).increment(1);
continue links;
}
@@ -335,13 +335,12 @@ private void addOutLinks(Context context, Collection outLinks, URL baseU
}
}
-
protected static class OutLinkCombiner extends Reducer {
private LongWritable outVal = new LongWritable(1);
/**
- * @return true if text is safe and does not contain any control
- * characters (U+0000 - U+001F) including '\t', '\r', '\n'
+ * @return true if text is safe and does not contain any control characters (U+0000 - U+001F)
+ * including '\t', '\r', '\n'
*/
public static boolean isSafeText(Text text) {
int pos = 0;
@@ -359,8 +358,8 @@ public static boolean isSafeText(Text text) {
}
@Override
- public void reduce(Text key, Iterable values,
- Context context) throws IOException, InterruptedException {
+ public void reduce(Text key, Iterable values, Context context)
+ throws IOException, InterruptedException {
if (!isSafeText(key)) {
context.getCounter(COUNTER.LINKS_UNSAFE_TEXT_SKIPPED).increment(1);
return;
@@ -390,8 +389,8 @@ public void setup(Context context) {
}
@Override
- public void reduce(Text key, Iterable values,
- Context context) throws IOException, InterruptedException {
+ public void reduce(Text key, Iterable values, Context context)
+ throws IOException, InterruptedException {
if (!isSafeText(key)) {
context.getCounter(COUNTER.LINKS_UNSAFE_TEXT_SKIPPED).increment(1);
return;
@@ -400,7 +399,7 @@ public void reduce(Text key, Iterable values,
for (LongWritable val : values) {
sum += val.get();
}
- if (sampleProbability <= 0.0 || (sum*Math.random()) >= sampleProbability) {
+ if (sampleProbability <= 0.0 || (sum * Math.random()) >= sampleProbability) {
// multiply random by number of times outlink URL has been observed
outVal.set(sum);
context.write(key, outVal);
@@ -440,7 +439,8 @@ public int run(String[] args) throws Exception {
return run(outputPath, inputPaths.toArray(new Path[inputPaths.size()]));
}
- public int run(Path outputPath, Path[] inputPaths) throws IOException, ClassNotFoundException, InterruptedException {
+ public int run(Path outputPath, Path[] inputPaths)
+ throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = getConf();
Job job = Job.getInstance(conf);
diff --git a/src/org/commoncrawl/examples/mapreduce/WATServerType.java b/src/org/commoncrawl/examples/mapreduce/WATServerType.java
index 106db97..3e635fa 100644
--- a/src/org/commoncrawl/examples/mapreduce/WATServerType.java
+++ b/src/org/commoncrawl/examples/mapreduce/WATServerType.java
@@ -26,9 +26,9 @@
*/
public class WATServerType extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(WATServerType.class);
-
+
/**
- * Main entry point that uses the {@link ToolRunner} class to run the Hadoop job.
+ * Main entry point that uses the {@link ToolRunner} class to run the Hadoop job.
*/
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new WATServerType(), args);
@@ -59,13 +59,14 @@ public int run(String[] args) throws Exception {
return run(outputPath, inputPaths.toArray(new Path[inputPaths.size()]));
}
- public int run(Path outputPath, Path[] inputPaths) throws IOException, ClassNotFoundException, InterruptedException {
+ public int run(Path outputPath, Path[] inputPaths)
+ throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = getConf();
Job job = Job.getInstance(conf);
job.setJarByClass(WATServerType.class);
job.setNumReduceTasks(1);
-
+
for (int i = 0; i < inputPaths.length; i++) {
LOG.info("Input path: " + inputPaths[i]);
FileInputFormat.addInputPath(job, inputPaths[i]);
@@ -73,20 +74,20 @@ public int run(Path outputPath, Path[] inputPaths) throws IOException, ClassNotF
LOG.info("Output path: " + outputPath);
FileOutputFormat.setOutputPath(job, outputPath);
-
+
job.setInputFormatClass(WARCFileInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
-
+
job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(LongWritable.class);
-
- job.setMapperClass(ServerTypeMap.ServerMapper.class);
- job.setReducerClass(LongSumReducer.class);
-
- if (job.waitForCompletion(true)) {
- return 0;
- } else {
- return 1;
- }
+ job.setOutputValueClass(LongWritable.class);
+
+ job.setMapperClass(ServerTypeMap.ServerMapper.class);
+ job.setReducerClass(LongSumReducer.class);
+
+ if (job.waitForCompletion(true)) {
+ return 0;
+ } else {
+ return 1;
+ }
}
}
diff --git a/src/org/commoncrawl/examples/mapreduce/WordCounterMap.java b/src/org/commoncrawl/examples/mapreduce/WordCounterMap.java
index 3f0211a..d8d549e 100644
--- a/src/org/commoncrawl/examples/mapreduce/WordCounterMap.java
+++ b/src/org/commoncrawl/examples/mapreduce/WordCounterMap.java
@@ -13,11 +13,12 @@
public class WordCounterMap {
private static final Logger LOG = Logger.getLogger(WordCounterMap.class);
+
protected static enum MAPPERCOUNTER {
- RECORDS_IN,
- EMPTY_PAGE_TEXT,
- EXCEPTIONS,
- NON_PLAIN_TEXT
+ RECORDS_IN, //
+ EMPTY_PAGE_TEXT, //
+ EXCEPTIONS, //
+ NON_PLAIN_TEXT //
}
protected static class WordCountMapper extends Mapper {
@@ -48,8 +49,7 @@ public void map(Text key, ArchiveReader value, Context context) throws IOExcepti
} else {
context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1);
}
- }
- catch (Exception ex) {
+ } catch (Exception ex) {
LOG.error("Caught Exception", ex);
context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
}
diff --git a/src/org/commoncrawl/warc/WARCFileInputFormat.java b/src/org/commoncrawl/warc/WARCFileInputFormat.java
index 89d2433..d752774 100644
--- a/src/org/commoncrawl/warc/WARCFileInputFormat.java
+++ b/src/org/commoncrawl/warc/WARCFileInputFormat.java
@@ -12,8 +12,8 @@
import org.archive.io.ArchiveReader;
/**
- * Minimal implementation of FileInputFormat for WARC files.
- * Hadoop is told that splitting these compressed files is not possible.
+ * Minimal implementation of FileInputFormat for WARC files. Hadoop is told that splitting these
+ * compressed files is not possible.
*
* @author Stephen Merity (Smerity)
*/
@@ -24,7 +24,7 @@ public RecordReader createRecordReader(InputSplit split, Ta
throws IOException, InterruptedException {
return new WARCFileRecordReader();
}
-
+
@Override
protected boolean isSplitable(JobContext context, Path filename) {
// As these are compressed files, they cannot be (sanely) split
diff --git a/src/org/commoncrawl/warc/WARCFileRecordReader.java b/src/org/commoncrawl/warc/WARCFileRecordReader.java
index b1e8e1e..a31d6ad 100644
--- a/src/org/commoncrawl/warc/WARCFileRecordReader.java
+++ b/src/org/commoncrawl/warc/WARCFileRecordReader.java
@@ -15,9 +15,9 @@
import org.archive.io.warc.WARCReaderFactory;
/**
- * The WARC File Record Reader processes a single compressed input.
- * The Record Reader returns a single WARC ArchiveReader that can contain
- * numerous individual documents, each document handled in a single mapper.
+ * The WARC File Record Reader processes a single compressed input. The Record Reader returns a
+ * single WARC ArchiveReader that can contain numerous individual documents, each document
+ * handled in a single mapper.
*
* @author Stephen Merity (Smerity)
*/
@@ -28,8 +28,7 @@ public class WARCFileRecordReader extends RecordReader {
private boolean hasBeenRead = false;
@Override
- public void initialize(InputSplit inputSplit, TaskAttemptContext context)
- throws IOException, InterruptedException {
+ public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
FileSplit split = (FileSplit) inputSplit;
Configuration conf = context.getConfiguration();
Path path = split.getPath();