From 5c358f06aa2f0e73268ff61cb5b8d5babf9e12e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20J=C3=A4schke?= Date: Fri, 5 Jun 2015 14:49:39 +0200 Subject: [PATCH 1/2] added reading of HTTP body --- .../org/archive/hadoop/ResourceRecordReader.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/hadoop/ResourceRecordReader.java b/src/main/java/org/archive/hadoop/ResourceRecordReader.java index 06d3ce2e..8091b076 100644 --- a/src/main/java/org/archive/hadoop/ResourceRecordReader.java +++ b/src/main/java/org/archive/hadoop/ResourceRecordReader.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.util.logging.Logger; +import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -25,6 +26,7 @@ import org.archive.streamcontext.HDFSStream; import org.archive.streamcontext.Stream; import org.archive.util.StreamCopy; +import org.json.JSONException; public class ResourceRecordReader extends RecordReader{ private final static Logger LOG = @@ -110,7 +112,10 @@ public boolean nextKeyValue() throws IOException, InterruptedException { Resource r = producer.getNext(); if(r != null) { - StreamCopy.readToEOF(r.getInputStream()); + //StreamCopy.readToEOF(r.getInputStream()); + byte[] bytes = IOUtils.toByteArray(r.getInputStream()); + r.getMetaData().getTopMetaData().put("httpbody", bytes); + LOG.info(String.format("Extracted offset %d\n", series.getCurrentMemberStartOffset())); cachedK = new ResourceContext(name, @@ -124,6 +129,12 @@ public boolean nextKeyValue() throws IOException, InterruptedException { String.format("ResourceParseException at(%s)(%d)", name,series.getCurrentMemberStartOffset()), e); + } catch (JSONException e) { + e.printStackTrace(); + throw new IOException( + String.format("JSONException at(%s)(%d)", + name,series.getCurrentMemberStartOffset()), + e); } return false; } From 7a9113e1852458c5614571c7117188a83d7659d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20J=C3=A4schke?= Date: Fri, 5 Jun 2015 15:52:06 +0200 Subject: [PATCH 2/2] POM for CDH5 --- pom.xml | 150 +++++++++++++++++++++++++------------------------------- 1 file changed, 68 insertions(+), 82 deletions(-) diff --git a/pom.xml b/pom.xml index 6664efd8..81ccacf6 100644 --- a/pom.xml +++ b/pom.xml @@ -1,63 +1,19 @@ - + 4.0.0 - - org.sonatype.oss - oss-parent - 7 - - - org.netpreserve.commons - webarchive-commons - 1.1.5-SNAPSHOT + org.archive + ia-web-commons + 1.0-SNAPSHOT jar - webarchive-commons - https://github.com/iipc/webarchive-commons - - - The International Internet Preservation Consortium - http://netpreserve.org/ - - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - - - - - many-devs - Many Others Developers Proceed Me - many@dev.org - - - anjackson - Andrew Jackson - Andrew.Jackson@bl.uk - - - - GitHub Issues - https://github.com/iipc/webarchive-commons/issues - - - scm:git:git@github.com:iipc/webarchive-commons.git - scm:git:git@github.com:iipc/webarchive-commons.git - git@github.com:iipc/webarchive-commons.git - + ia-web-commons + http://maven.apache.org UTF-8 ${maven.build.timestamp} yyyyMMddhhmmss - - - sonatype-nexus-staging - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - sonatype-nexus-snapshots - https://oss.sonatype.org/content/repositories/snapshots/ @@ -71,13 +27,13 @@ com.google.guava guava - 17.0 + 14.0.1 org.json json - 20131018 + 20090211 org.htmlparser @@ -86,7 +42,7 @@ - com.googlecode.juniversalchardet + org.mozilla juniversalchardet 1.0.3 @@ -100,7 +56,7 @@ org.apache.hadoop hadoop-core - 0.20.2-cdh3u4 + 2.5.0-mr1-cdh5.3.3 commons-httpclient @@ -130,17 +86,32 @@ tomcat jasper-compiler - - hsqldb - hsqldb - + provided + + + org.apache.hadoop + hadoop-common + 2.5.0-cdh5.3.3 + provided + + + org.apache.hadoop + hadoop-mapreduce-client-common + 2.5.0-cdh5.3.3 + provided + + + org.apache.hadoop + hadoop-mapreduce-client-core + 2.5.0-cdh5.3.3 + provided org.apache.pig pig - 0.10.0 + 0.11.1 provided @@ -163,26 +134,28 @@ it.unimi.dsi - dsiutils - 2.0.12 + mg4j + + 1.0.1 + compile + + + it.unimi.dsi + fastutil + 7.0.6 compile - - - ch.qos.logback - logback-classic - - + + it.unimi.dsi + dsiutils + 2.2.4 + compile + org.apache.httpcomponents httpcore 4.3 - - - joda-time - joda-time - 1.6 - + @@ -203,7 +176,7 @@ jar-with-dependencies - webarchive-commons + ia-web-commons @@ -224,6 +197,24 @@ + + internetarchive + Internet Archive Maven Repository + http://builds.archive.org:8080/maven2 + default + + + true + daily + warn + + + true + daily + warn + + + cloudera Cloudera Hadoop @@ -244,17 +235,12 @@ - ${repository.url} - - ${snapshotRepository.id} - ${snapshotRepository.url} - --->