From dadf5514ace51f3921b83ba66175f9537fe02160 Mon Sep 17 00:00:00 2001
From: Jordan Mendelson
Date: Mon, 25 Nov 2013 21:44:19 -0800
Subject: [PATCH 01/83] Write out WET files (WARC Encapsulated Text) using the
same parser that we have already run
---
.../ExtractingResourceFactoryMapper.java | 7 +-
.../org/archive/extract/ProducerUtils.java | 2 +-
.../archive/extract/ResourceExtractor.java | 5 +
.../archive/extract/WATExtractorOutput.java | 32 +++-
.../archive/extract/WETExtractorOutput.java | 167 ++++++++++++++++++
.../org/archive/format/json/JSONUtils.java | 10 ++
.../archive/format/warc/WARCConstants.java | 4 +-
.../archive/format/warc/WARCRecordWriter.java | 40 ++++-
.../archive/hadoop/ResourceRecordReader.java | 18 +-
.../archive/resource/ResourceConstants.java | 2 +-
.../html/ExtractingParseObserver.java | 47 ++++-
.../archive/resource/html/HTMLMetaData.java | 4 +
.../http/HTTPResponseResourceFactory.java | 1 -
.../resource/warc/WARCResourceFactory.java | 1 -
14 files changed, 312 insertions(+), 28 deletions(-)
create mode 100644 src/main/java/org/archive/extract/WETExtractorOutput.java
diff --git a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
index ad10be40..99a93d50 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java
@@ -152,8 +152,11 @@ private boolean isWARCInfoResource(MetaData envelope) {
}
private boolean isHTTPResponseWARCResource(MetaData envelope) {
return childFieldEquals(envelope,WARC_HEADER_METADATA,
- WARCConstants.CONTENT_TYPE,
- WARCConstants.HTTP_RESPONSE_MIMETYPE);
+ WARCConstants.CONTENT_TYPE,
+ WARCConstants.HTTP_RESPONSE_MIMETYPE) ||
+ childFieldEquals(envelope,WARC_HEADER_METADATA,
+ WARCConstants.CONTENT_TYPE,
+ WARCConstants.HTTP_RESPONSE_MIMETYPE_NS);
}
private boolean isWARCJSONResource(MetaData envelope) {
return childFieldEquals(envelope,WARC_HEADER_METADATA,
diff --git a/src/main/java/org/archive/extract/ProducerUtils.java b/src/main/java/org/archive/extract/ProducerUtils.java
index b75d2f15..666b0714 100644
--- a/src/main/java/org/archive/extract/ProducerUtils.java
+++ b/src/main/java/org/archive/extract/ProducerUtils.java
@@ -29,7 +29,7 @@ public static ResourceProducer getProducer(String path, long offset) throws IOEx
wf.setStrict(STRICT_GZ);
File file = new File(path);
- if(path.startsWith("hdfs://")) {
+ if(path.startsWith("hdfs://") || path.startsWith("s3a://")) {
String name = file.getName();
Path fsPath = new Path(path);
FileSystem fs = fsPath.getFileSystem(new Configuration());
diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java
index 7f4d6e7a..ff7d8c50 100644
--- a/src/main/java/org/archive/extract/ResourceExtractor.java
+++ b/src/main/java/org/archive/extract/ResourceExtractor.java
@@ -53,6 +53,8 @@ private static int USAGE(int exitCode) {
System.err.println("\t\t\t (note that column 1 is NOT standard Wayback canonicalized)\n");
System.err.println("\t\t-wat\tembed JSON output in a compressed WARC" +
"wrapper, for storage, or sharing.");
+ System.err.println("\t\t-wet\tembed text extracts in a compressed WARC" +
+ "wrapper, for storage, or sharing.");
return exitCode;
}
@@ -101,6 +103,9 @@ public int run(String[] args)
} else if(args[arg].equals("-wat")) {
path = args[arg+1];
out = new WATExtractorOutput(os);
+ } else if(args[arg].equals("-wet")) {
+ path = args[arg+1];
+ out = new WETExtractorOutput(os);
} else {
String filter = args[arg+1];
out = new JSONViewExtractorOutput(os, filter);
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index f4d27147..ee803672 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -29,11 +29,17 @@ public class WATExtractorOutput implements ExtractorOutput {
private static int DEFAULT_BUFFER_RAM = 1024 * 1024;
private int bufferRAM = DEFAULT_BUFFER_RAM;
private final static Charset UTF8 = Charset.forName("UTF-8");
-
- public WATExtractorOutput(OutputStream out) {
+ private String outFilename;
+
+ public WATExtractorOutput(OutputStream out) {
+ this(out, null);
+ }
+
+ public WATExtractorOutput(OutputStream out, String filename) {
gzW = new GZIPMemberWriter(out);
recW = new WARCRecordWriter();
wroteFirst = false;
+ outFilename = filename;
}
private CommitedOutputStream getOutput() {
@@ -55,6 +61,11 @@ public void output(Resource resource) throws IOException {
// hrm...
throw new IOException("Missing Envelope.Format");
}
+
+ // remove the text extract if it exists
+ JSONUtils.removeObject(top, "Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata", "Text");
+
+
cos = getOutput();
if(envelopeFormat.equals("ARC")) {
writeARC(cos,top);
@@ -68,16 +79,23 @@ public void output(Resource resource) throws IOException {
}
private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
- String filename = JSONUtils.extractSingle(md, "Container.Filename");
- if(filename == null) {
- throw new IOException("No Container.Filename...");
- }
+ String filename = outFilename;
+
+ if (filename == null) {
+ filename = JSONUtils.extractSingle(md, "Container.Filename");
+
+ if(filename == null) {
+ throw new IOException("No Container.Filename...");
+ }
+ }
+
HttpHeaders headers = new HttpHeaders();
headers.add("Software-Info", IAUtils.COMMONS_VERSION);
headers.addDateHeader("Extracted-Date", new Date());
+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
headers.write(baos);
- recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray());
+ recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray());
}
private String extractOrIO(MetaData md, String path) throws IOException {
diff --git a/src/main/java/org/archive/extract/WETExtractorOutput.java b/src/main/java/org/archive/extract/WETExtractorOutput.java
new file mode 100644
index 00000000..b306f59b
--- /dev/null
+++ b/src/main/java/org/archive/extract/WETExtractorOutput.java
@@ -0,0 +1,167 @@
+package org.archive.extract;
+
+import org.archive.format.gzip.GZIPMemberWriter;
+import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream;
+import org.archive.format.http.HttpHeaders;
+import org.archive.format.json.JSONUtils;
+import org.archive.format.warc.WARCRecordWriter;
+import org.archive.resource.MetaData;
+import org.archive.resource.Resource;
+import org.archive.util.DateUtils;
+import org.archive.util.IAUtils;
+import org.archive.util.StreamCopy;
+import org.archive.util.io.CommitedOutputStream;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.text.ParseException;
+import java.util.Date;
+
+/**
+ * This is for generating a WARC Encapsulated Text file
+ *
+ * These are implemented as WARC conversion records. Only
+ * Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata.Text fields are included
+ */
+public class WETExtractorOutput implements ExtractorOutput {
+ WARCRecordWriter recW;
+ private boolean wroteFirst;
+ private GZIPMemberWriter gzW;
+ private static int DEFAULT_BUFFER_RAM = 1024 * 1024;
+ private int bufferRAM = DEFAULT_BUFFER_RAM;
+ private final static Charset UTF8 = Charset.forName("UTF-8");
+ private String outFilename;
+
+ public WETExtractorOutput(OutputStream out) {
+ this(out, null);
+ }
+
+ public WETExtractorOutput(OutputStream out, String filename) {
+ gzW = new GZIPMemberWriter(out);
+ recW = new WARCRecordWriter();
+ wroteFirst = false;
+ outFilename = filename;
+ }
+
+ private CommitedOutputStream getOutput() {
+ return new GZIPMemberWriterCommittedOutputStream(gzW,bufferRAM);
+ }
+
+
+ private String extractOrIO(MetaData md, String path) throws IOException {
+ String value = JSONUtils.extractSingle(md, path);
+ if(value == null) {
+ throw new IOException("No "+path+" found.");
+ }
+ return value;
+ }
+
+ public void output(Resource resource) throws IOException {
+ StreamCopy.readToEOF(resource.getInputStream());
+ MetaData top = resource.getMetaData().getTopMetaData();
+ CommitedOutputStream cos;
+
+ if(!wroteFirst) {
+ cos = getOutput();
+ writeWARCInfo(cos, top);
+ cos.commit();
+ wroteFirst = true;
+ }
+ String envelopeFormat = JSONUtils.extractSingle(top, "Envelope.Format");
+ if(envelopeFormat == null) {
+ throw new IOException("Missing Envelope.Format");
+ }
+
+ String warctype = JSONUtils.extractSingle(top, "Envelope.WARC-Header-Metadata.WARC-Type");
+ if (warctype != null && warctype.equals("response")) {
+ String textExtract = JSONUtils.extractSingle(top, "Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata.Text");
+
+ if (textExtract != null) {
+ cos = getOutput();
+ if(envelopeFormat.equals("WARC")) {
+ writeWARC(cos, top, textExtract);
+ } else {
+ // hrm...
+ throw new IOException("Unknown Envelope.Format");
+ }
+ cos.commit();
+ }
+ }
+ }
+
+ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
+ String filename = outFilename;
+
+ if (filename == null) {
+ filename = JSONUtils.extractSingle(md, "Container.Filename");
+
+ if(filename == null) {
+ throw new IOException("No Container.Filename...");
+ }
+ }
+
+ HttpHeaders headers = new HttpHeaders();
+ headers.add("Software-Info", IAUtils.COMMONS_VERSION);
+ headers.addDateHeader("Extracted-Date", new Date());
+
+ // Dup out some useful headers from the incoming warcinfo
+ String warctype = JSONUtils.extractSingle(md, "Envelope.WARC-Header-Metadata.WARC-Type");
+ if (warctype != null && warctype.equals("warcinfo")) {
+ final String[] usefulHeaders = {"robots", "isPartOf", "operator", "description", "publisher"};
+
+ for (String header : usefulHeaders) {
+ String value = JSONUtils.extractSingle(md, "Envelope.Payload-Metadata.WARC-Info-Metadata." + header);
+ if (value != null) {
+ headers.add(header, value);
+ }
+ }
+ }
+
+
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ headers.write(baos);
+ recW.writeWARCInfoRecord(recOut, filename, baos.toByteArray());
+ }
+
+ private void writeWARC(OutputStream recOut, MetaData md, String textExtract) throws IOException {
+ String targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
+
+ String capDateString = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Date");
+ capDateString = transformWARCDate(capDateString);
+ String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID");
+ writeWARCMDRecord(recOut, targetURI, capDateString, recId, textExtract);
+ }
+
+ private void writeWARCMDRecord(OutputStream recOut, String targetURI, String capDateString, String recId,
+ String textExtract)
+ throws IOException {
+
+ Date capDate;
+ try {
+ capDate = DateUtils.getSecondsSinceEpoch(capDateString);
+
+ } catch (ParseException e) {
+ e.printStackTrace();
+ // TODO... not the right thing...
+ capDate = new Date();
+ }
+
+ recW.writeTextConversionRecord(recOut, textExtract.getBytes("UTF-8"), targetURI, capDate, recId);
+ }
+
+ private static String transformWARCDate(final String input) {
+
+ StringBuilder output = new StringBuilder(14);
+
+ output.append(input.substring(0,4));
+ output.append(input.substring(5,7));
+ output.append(input.substring(8,10));
+ output.append(input.substring(11,13));
+ output.append(input.substring(14,16));
+ output.append(input.substring(17,19));
+
+ return output.toString();
+ }
+}
diff --git a/src/main/java/org/archive/format/json/JSONUtils.java b/src/main/java/org/archive/format/json/JSONUtils.java
index 28f4f43e..946b633b 100644
--- a/src/main/java/org/archive/format/json/JSONUtils.java
+++ b/src/main/java/org/archive/format/json/JSONUtils.java
@@ -114,4 +114,14 @@ private static void extractRecursive(JSONObject json, String path[], int idx, Li
}
}
}
+    /**
+     * Removes the child named {@code node} from the JSON object found at the dotted
+     * {@code path} below {@code json}. Returns true only if a child was actually removed.
+     */
+    public static boolean removeObject(JSONObject json, String path, String node) {
+        // Honour the arguments: path and key used to be hard-coded here, ignoring both.
+        JSONObject obj = extractObject(json, path);
+        // As before, a missing parent or a null result from remove() means nothing was removed.
+        return obj != null && obj.remove(node) != null;
+    }
}
diff --git a/src/main/java/org/archive/format/warc/WARCConstants.java b/src/main/java/org/archive/format/warc/WARCConstants.java
index c9f6cbf3..6e625183 100644
--- a/src/main/java/org/archive/format/warc/WARCConstants.java
+++ b/src/main/java/org/archive/format/warc/WARCConstants.java
@@ -211,7 +211,9 @@ enum WARCRecordType {
"application/http; msgtype=request";
public static final String HTTP_RESPONSE_MIMETYPE =
"application/http; msgtype=response";
-
+ public static final String HTTP_RESPONSE_MIMETYPE_NS =
+ "application/http;msgtype=response"; // wget does this
+
public static final String FTP_CONTROL_CONVERSATION_MIMETYPE =
"text/x-ftp-control-conversation";
diff --git a/src/main/java/org/archive/format/warc/WARCRecordWriter.java b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
index 0aab83b7..ae6d8d67 100644
--- a/src/main/java/org/archive/format/warc/WARCRecordWriter.java
+++ b/src/main/java/org/archive/format/warc/WARCRecordWriter.java
@@ -2,18 +2,33 @@
import java.io.IOException;
import java.io.OutputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import java.util.Date;
import java.util.UUID;
import org.archive.format.http.HttpConstants;
import org.archive.format.http.HttpHeaders;
+import org.archive.util.Base32;
import org.archive.util.DateUtils;
public class WARCRecordWriter implements WARCConstants, HttpConstants
{
private static final String SCHEME = "urn:uuid";
private static final String SCHEME_COLON = SCHEME + ":";
-
+ private MessageDigest sha1;
+ private Base32 base32;
+
+ public WARCRecordWriter() {
+ try {
+ sha1 = MessageDigest.getInstance("SHA1");
+ } catch (NoSuchAlgorithmException e) {
+ throw new RuntimeException(e);
+ }
+
+ base32 = new Base32();
+ }
+
/**
* Write the headers and contents as a WARC record to the given
* output stream.
@@ -97,6 +112,29 @@ public void writeJSONMetadataRecord( OutputStream out,
writeRecord(out, headers, contents);
}
+ public void writeTextConversionRecord( OutputStream out,
+ byte[] contents,
+ String targetURI,
+ Date originalDate,
+ String origRecordId) throws IOException
+ {
+ HttpHeaders headers = new HttpHeaders();
+ headers.add(HEADER_KEY_TYPE, WARCRecordType.conversion.name());
+ headers.add(HEADER_KEY_URI, targetURI);
+ headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate));
+ headers.add(HEADER_KEY_ID, makeRecordId());
+ headers.add(HEADER_KEY_REFERS_TO, origRecordId);
+ headers.add(HEADER_KEY_BLOCK_DIGEST, contentHash(contents));
+
+ headers.add(CONTENT_TYPE, "text/plain");
+ writeRecord(out, headers, contents);
+ }
+
+ private String contentHash(byte[] content) {
+ sha1.reset();
+ return "sha1:" + base32.encode(sha1.digest(content));
+ }
+
private String makeRecordId()
{
StringBuilder recID = new StringBuilder();
diff --git a/src/main/java/org/archive/hadoop/ResourceRecordReader.java b/src/main/java/org/archive/hadoop/ResourceRecordReader.java
index 06d3ce2e..933c4f28 100644
--- a/src/main/java/org/archive/hadoop/ResourceRecordReader.java
+++ b/src/main/java/org/archive/hadoop/ResourceRecordReader.java
@@ -75,16 +75,16 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext context)
if(inputSplit instanceof FileSplit) {
FileSplit fs = (FileSplit) inputSplit;
Path fsPath = fs.getPath();
- FileSystem fSys = fsPath.getFileSystem(context.getConfiguration());
- FSDataInputStream fsdis = fSys.open(fsPath);
- String path = fsPath.getName();
- name = fsPath.getName();
- stream = new HDFSStream(fsdis);
- startOffset = fs.getStart();
+ FileSystem fSys = fsPath.getFileSystem(context.getConfiguration());
+ FSDataInputStream fsdis = fSys.open(fsPath);
+ String path = fsPath.getName();
+ name = fsPath.getName();
+ stream = new HDFSStream(fsdis);
+ startOffset = fs.getStart();
length = fs.getLength();
long endOffset = startOffset + length;
stream.setOffset(startOffset);
- series = new GZIPMemberSeries(stream, name, startOffset);
+ series = new GZIPMemberSeries(stream, name, startOffset);
GZIPResourceContainer prod =
new GZIPResourceContainer(series,endOffset);
ResourceProducer envelope;
@@ -95,8 +95,8 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext context)
} else {
throw new IOException("arguments must be arc.gz or warc.gz");
}
- ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
- producer = new ExtractingResourceProducer(envelope, mapper);
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ producer = new ExtractingResourceProducer(envelope, mapper);
} else {
throw new IOException("Need FileSplit input...");
diff --git a/src/main/java/org/archive/resource/ResourceConstants.java b/src/main/java/org/archive/resource/ResourceConstants.java
index dd04fcfe..ccc587e2 100644
--- a/src/main/java/org/archive/resource/ResourceConstants.java
+++ b/src/main/java/org/archive/resource/ResourceConstants.java
@@ -114,5 +114,5 @@ public interface ResourceConstants {
public static final String HTML_LINK_TAGS = "Link";
public static final String HTML_META_TAGS = "Metas";
public static final String HTML_SCRIPT_TAGS = "Scripts";
-
+ public static final String HTML_TEXT = "Text";
}
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index e1f57b55..6a1f02e9 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -11,14 +11,17 @@
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
+import org.htmlparser.util.Translate;
public class ExtractingParseObserver implements ParseObserver {
HTMLMetaData data;
Stack> openAnchors;
Stack openAnchorTexts;
+ StringBuffer textExtract;
String title = null;
boolean inTitle = false;
+ boolean inPre = false;
protected static String cssUrlPatString =
"url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)";
@@ -59,6 +62,7 @@ public ExtractingParseObserver(HTMLMetaData data) {
this.data = data;
openAnchors = new Stack>();
openAnchorTexts = new Stack();
+ textExtract = new StringBuffer(8192);
}
public void handleDocumentStart() {
@@ -66,7 +70,10 @@ public void handleDocumentStart() {
}
public void handleDocumentComplete() {
- // no-op
+ if (textExtract.length() > 0) {
+ data.setTextExtract(textExtract.toString());
+ textExtract = new StringBuffer(8192);
+ }
}
public void handleTagEmpty(TagNode tag) {
@@ -78,7 +85,10 @@ public void handleTagOpen(TagNode tag) {
if(name.equals("TITLE")) {
inTitle = !tag.isEmptyXmlTag();
return;
- }
+ } else if (name.equals("PRE")) {
+ inPre = true;
+ }
+
// first the global attributes:
// background
String v = tag.getAttribute("background");
@@ -101,6 +111,7 @@ public void handleTagClose(TagNode tag) {
// probably the right thing..
return;
}
+
// Only interesting if it's a :
if(tag.getTagName().equals("A")) {
if(openAnchors.size() > 0) {
@@ -122,13 +133,41 @@ public void handleTagClose(TagNode tag) {
data.addHref(vals);
}
}
- }
+ } else if (tag.getTagName().equals("PRE")) {
+ inPre = false;
+ }
}
public void handleTextNode(TextNode text) {
// TODO: OPTIMIZ: This can be a lot smarter, if StringBuilders are full,
// this result is thrown away.
- String t = text.getText().replaceAll("\\s+", " ");
+ //System.out.println("JDBUG: Got text from node: " + text.getText().toString());
+
+ String txt = text.getText();
+ if (!inPre) {
+ txt = Translate.decode(txt);
+ txt = txt.replace('\u00a0', ' ');
+
+ char c = ' ';
+ if (textExtract.length() > 0) {
+ c = textExtract.charAt(textExtract.length()-1);
+ }
+ for (int i = 0; i < txt.length(); i++) {
+ char c2 = txt.charAt(i);
+ // Translate so output is a bit cleaner
+ if (c2 == '\r') {
+ c2 = '\n';
+ }
+ if (!Character.isWhitespace(c) || !Character.isWhitespace(c2)) {
+ textExtract.append(c2);
+ }
+ c = c2;
+ }
+ }
+ else
+ textExtract.append(txt);
+
+ String t = text.getText().replaceAll("\\s+", " ");
if(t.length() > MAX_TEXT_LEN) {
t = t.substring(0,MAX_TEXT_LEN);
diff --git a/src/main/java/org/archive/resource/html/HTMLMetaData.java b/src/main/java/org/archive/resource/html/HTMLMetaData.java
index 024d9677..b9dcc7ea 100644
--- a/src/main/java/org/archive/resource/html/HTMLMetaData.java
+++ b/src/main/java/org/archive/resource/html/HTMLMetaData.java
@@ -69,6 +69,10 @@ public void addHref(String...a) {
appendObj2(this,HTML_LINKS,a);
}
+ public void setTextExtract(String textExtract) {
+ putUnlessNull(this,HTML_TEXT, textExtract);
+ }
+
private void appendObj2(JSONObject o, String arr, String... a) {
if(o == null) {
return;
diff --git a/src/main/java/org/archive/resource/http/HTTPResponseResourceFactory.java b/src/main/java/org/archive/resource/http/HTTPResponseResourceFactory.java
index 135691b6..c2eb7b05 100644
--- a/src/main/java/org/archive/resource/http/HTTPResponseResourceFactory.java
+++ b/src/main/java/org/archive/resource/http/HTTPResponseResourceFactory.java
@@ -23,7 +23,6 @@ public Resource getResource(InputStream is, MetaData metaData,
ResourceContainer container)
throws ResourceParseException, IOException {
try {
-
HttpResponse response = parser.parse(is);
metaData.putString(PAYLOAD_CONTENT_TYPE,
PAYLOAD_TYPE_HTTP_RESPONSE);
diff --git a/src/main/java/org/archive/resource/warc/WARCResourceFactory.java b/src/main/java/org/archive/resource/warc/WARCResourceFactory.java
index 14bd53e1..137ee0ff 100644
--- a/src/main/java/org/archive/resource/warc/WARCResourceFactory.java
+++ b/src/main/java/org/archive/resource/warc/WARCResourceFactory.java
@@ -23,7 +23,6 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
try {
-
HttpResponse response = parser.parse(is);
WARCResource r = new WARCResource(parentMetaData.createChild(ENVELOPE),
container, response);
From 347e6f133bffbf8c0fd7558141fedc0e67d7b8ae Mon Sep 17 00:00:00 2001
From: Jordan Mendelson
Date: Wed, 5 Feb 2014 11:04:51 -0800
Subject: [PATCH 02/83] Hack to limit CSS size to 100K.
---
.../resource/html/ExtractingParseObserver.java | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 6a1f02e9..23f52473 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -408,9 +408,18 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
- Matcher m = pattern.matcher(content);
+ String newcontent;
+ int contentLen = content.length();
+ if (contentLen > 100000) {
+ newcontent = content.substring(100000);
+ contentLen = newcontent.length();
+ } else {
+ newcontent = content;
+ }
+
+ Matcher m = pattern.matcher(newcontent);
int idx = 0;
- int contentLen = content.length();
+
while((idx < contentLen) && m.find(idx)) {
String url = m.group(1);
int origUrlLength = url.length();
From 04d556c9747679f9dc5304914cbbf13eb713431b Mon Sep 17 00:00:00 2001
From: Jordan Mendelson
Date: Wed, 19 Mar 2014 23:21:03 -0700
Subject: [PATCH 03/83] htmlparser bump to 2.1 to fix some serious bugs with
jsp parsing
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index 03b1240d..6860ed02 100644
--- a/pom.xml
+++ b/pom.xml
@@ -38,7 +38,7 @@
org.htmlparser
htmlparser
- 1.6
+ 2.1
From edc38202a347307595d6d6b6cc596974b09a96f3 Mon Sep 17 00:00:00 2001
From: Jordan Mendelson
Date: Thu, 26 Jun 2014 12:46:38 -0700
Subject: [PATCH 04/83] If there is an error parsing a resource, return the
version one level higher (i.e., HTTP response instead of HTML response)
---
.../extract/ExtractingResourceProducer.java | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/extract/ExtractingResourceProducer.java b/src/main/java/org/archive/extract/ExtractingResourceProducer.java
index de671bee..ccfd1ee6 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceProducer.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceProducer.java
@@ -37,8 +37,18 @@ public Resource getNext() throws ResourceParseException, IOException {
current.getClass().toString(),
f.getClass().toString()));
}
- current = f.getResource(current.getInputStream(),
- current.getMetaData(), current.getContainer());
+
+ Resource previous = current;
+ try {
+ current = f.getResource(current.getInputStream(),
+ current.getMetaData(), current.getContainer());
+ } catch (ResourceParseException e) {
+ if(LOG.isLoggable(Level.WARNING)) {
+ LOG.warning("Error creating resource, returning more generic version: " + e);
+ }
+ // If we end up with some kind of parse error, return the resource one level higher
+ return previous;
+ }
}
}
From 7eddd644160a30d174423f142b266f6b99c24864 Mon Sep 17 00:00:00 2001
From: Jordan Mendelson
Date: Thu, 26 Jun 2014 15:02:18 -0700
Subject: [PATCH 05/83] Remove the code that returns an easier version if there
was a parse error since it was caused by an S3 problem.
---
.../org/archive/extract/ExtractingResourceProducer.java | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/extract/ExtractingResourceProducer.java b/src/main/java/org/archive/extract/ExtractingResourceProducer.java
index ccfd1ee6..0e938579 100644
--- a/src/main/java/org/archive/extract/ExtractingResourceProducer.java
+++ b/src/main/java/org/archive/extract/ExtractingResourceProducer.java
@@ -37,7 +37,8 @@ public Resource getNext() throws ResourceParseException, IOException {
current.getClass().toString(),
f.getClass().toString()));
}
-
+
+ /*
Resource previous = current;
try {
current = f.getResource(current.getInputStream(),
@@ -49,6 +50,10 @@ public Resource getNext() throws ResourceParseException, IOException {
// If we end up with some kind of parse error, return the resource one level higher
return previous;
}
+ */
+ current = f.getResource(current.getInputStream(),
+ current.getMetaData(), current.getContainer());
+
}
}
From cd7b7d96c6a3fefeb58ecaffb5528824821426d6 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 5 Jul 2016 08:17:03 +0200
Subject: [PATCH 06/83] Support for file:/ URLs which are erroneously
interpreted as relative paths /current_dir/file:/path/file.warc.gz
---
src/main/java/org/archive/extract/ProducerUtils.java | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/extract/ProducerUtils.java b/src/main/java/org/archive/extract/ProducerUtils.java
index 666b0714..d8db9630 100644
--- a/src/main/java/org/archive/extract/ProducerUtils.java
+++ b/src/main/java/org/archive/extract/ProducerUtils.java
@@ -65,7 +65,15 @@ public static ResourceProducer getProducer(String path, long offset) throws IOEx
} else {
- if(!(file.exists() && file.canRead())) {
+ if(path.startsWith("file:/")) {
+ file = new File(new URL(path).getPath());
+ }
+
+ if(!file.exists()) {
+ System.err.println(path + ": file not found.");
+ return null;
+ }
+ if(!file.canRead()) {
System.err.println(path + " is not a readable file.");
return null;
}
From 52bd2599fd5f7d954908ed97651617e57610953c Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 5 Jul 2016 09:33:38 +0200
Subject: [PATCH 07/83] Fixed unit test doubleToString which fails on Java 8
OpenJDK because of a fix in the rounding of doubles, cf.
https://bugs.openjdk.java.net/browse/JDK-7131459 - the test double value is no longer
close to a tie (12.344 instead of 12.345)
---
src/test/java/org/archive/util/ArchiveUtilsTest.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/test/java/org/archive/util/ArchiveUtilsTest.java b/src/test/java/org/archive/util/ArchiveUtilsTest.java
index 8251615a..e74763b3 100644
--- a/src/test/java/org/archive/util/ArchiveUtilsTest.java
+++ b/src/test/java/org/archive/util/ArchiveUtilsTest.java
@@ -229,7 +229,7 @@ public void testByteArrayEquals() {
/** test doubleToString() */
public void testDoubleToString(){
- double test = 12.345;
+ double test = 12.344d;
assertTrue(
"cecking zero precision",
ArchiveUtils.doubleToString(test, 0).equals("12"));
@@ -238,7 +238,7 @@ public void testDoubleToString(){
ArchiveUtils.doubleToString(test, 2).equals("12.34"));
assertTrue(
"cecking precision higher then the double has",
- ArchiveUtils.doubleToString(test, 65).equals("12.345"));
+ ArchiveUtils.doubleToString(test, 65).equals("12.344"));
}
From adc1345e6c65c1cee0c2c5ec8fd3f547a12e1b5c Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 5 Jul 2016 10:03:10 +0200
Subject: [PATCH 08/83] Merge solution from iipc/webarchive-commons for double
rounding problem, see https://github.com/iipc/webarchive-commons/pull/33
---
.../java/org/archive/util/ArchiveUtilsTest.java | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/src/test/java/org/archive/util/ArchiveUtilsTest.java b/src/test/java/org/archive/util/ArchiveUtilsTest.java
index e74763b3..586a1821 100644
--- a/src/test/java/org/archive/util/ArchiveUtilsTest.java
+++ b/src/test/java/org/archive/util/ArchiveUtilsTest.java
@@ -229,16 +229,19 @@ public void testByteArrayEquals() {
/** test doubleToString() */
public void testDoubleToString(){
- double test = 12.344d;
- assertTrue(
+ double test = 12.121d;
+ assertEquals(
"cecking zero precision",
- ArchiveUtils.doubleToString(test, 0).equals("12"));
- assertTrue(
+ "12",
+ ArchiveUtils.doubleToString(test, 0));
+ assertEquals(
"cecking 2 character precision",
- ArchiveUtils.doubleToString(test, 2).equals("12.34"));
- assertTrue(
+ "12.12",
+ ArchiveUtils.doubleToString(test, 2));
+ assertEquals(
"cecking precision higher then the double has",
- ArchiveUtils.doubleToString(test, 65).equals("12.344"));
+ "12.121",
+ ArchiveUtils.doubleToString(test, 65));
}
From b9b9b8af43b1b33fa224486d98b110df859817ed Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 5 Jul 2016 10:46:28 +0200
Subject: [PATCH 09/83] Fix StringIndexOutOfBoundsException in WAT/WET
generation (fixes #1) - correct check for min. required URL length when
stripping 4 characters (2 at each end) - simplified code in method
patternCSSExtract - in case CSS is larger than 100 kB: process the first 100 kB
instead of everything except the first 100 kB (the latter was probably not the
original intention) - improved regular expression matching URLs in CSS: use
non-capturing groups
---
.../html/ExtractingParseObserver.java | 47 ++++++-------------
1 file changed, 15 insertions(+), 32 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 23f52473..b2fc99a6 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -26,7 +26,7 @@ public class ExtractingParseObserver implements ParseObserver {
protected static String cssUrlPatString =
"url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)";
protected static String cssImportNoUrlPatString =
- "@import\\s+(('[^']+')|(\"[^\"]+\")|(\\('[^']+'\\))|(\\(\"[^\"]+\"\\))|(\\([^)]+\\))|([a-z0-9_.:/\\\\-]+))\\s*;";
+ "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
protected static Pattern cssImportNoUrlPattern = Pattern
.compile(cssImportNoUrlPatString);
@@ -408,48 +408,31 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
- String newcontent;
- int contentLen = content.length();
- if (contentLen > 100000) {
- newcontent = content.substring(100000);
- contentLen = newcontent.length();
- } else {
- newcontent = content;
- }
-
- Matcher m = pattern.matcher(newcontent);
+ Matcher m = pattern.matcher(content);
int idx = 0;
-
- while((idx < contentLen) && m.find(idx)) {
+ int contentLen = content.length();
+ if (contentLen > 100000)
+ // extract URLs only from the first 100 kB
+ contentLen = 100000;
+ while((idx < contentLen) && m.find()) {
+ idx = m.end();
String url = m.group(1);
- int origUrlLength = url.length();
- int urlStart = m.start(1);
- int urlEnd = m.end(1);
- idx = urlEnd;
if(url.length() < 2) {
continue;
}
if ((url.charAt(0) == '(')
- && (url.charAt(origUrlLength-1) == ')')) {
- url = url.substring(1, origUrlLength - 1);
- urlStart += 1;
- origUrlLength -= 2;
+ && (url.charAt(url.length()-1) == ')')) {
+ url = url.substring(1, url.length() - 1);
}
- if (url.charAt(0) == '"') {
- url = url.substring(1, origUrlLength - 1);
- urlStart += 1;
- } else if (url.charAt(0) == '\'') {
- url = url.substring(1, origUrlLength - 1);
- urlStart += 1;
+ if (url.charAt(0) == '"' || url.charAt(0) == '\'') {
+ url = url.substring(1, url.length() - 1);
} else if (url.charAt(0) == '\\') {
- if(url.length() == 2)
+ if(url.length() <= 4) {
continue;
- url = url.substring(2, origUrlLength - 2);
- urlStart += 2;
+ }
+ url = url.substring(2, url.length() - 2);
}
- int urlLength = url.length();
data.addHref("path","STYLE/#text","href",url);
- idx += urlLength;
}
}
}
From fbb6dd024dcf5c83e6c937b28010f03bcc5b0b51 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 2 Aug 2016 17:31:16 +0200
Subject: [PATCH 10/83] pom.xml to build with CDH 5
---
pom-cdh5.xml | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 229 insertions(+)
create mode 100644 pom-cdh5.xml
diff --git a/pom-cdh5.xml b/pom-cdh5.xml
new file mode 100644
index 00000000..3619206d
--- /dev/null
+++ b/pom-cdh5.xml
@@ -0,0 +1,229 @@
+
+ 4.0.0
+
+ org.archive
+ ia-web-commons
+ 1.0-SNAPSHOT
+ jar
+
+ ia-web-commons
+ http://maven.apache.org
+
+
+ UTF-8
+ ${maven.build.timestamp}
+ yyyyMMddhhmmss
+
+
+
+
+ junit
+ junit
+ 3.8.1
+ test
+
+
+
+ com.google.guava
+ guava
+ 14.0.1
+
+
+
+ org.json
+ json
+ 20090211
+
+
+ org.htmlparser
+ htmlparser
+ 2.1
+
+
+
+ org.mozilla
+ juniversalchardet
+ 1.0.3
+
+
+
+ commons-httpclient
+ commons-httpclient
+ 3.1
+
+
+
+ org.apache.hadoop
+ hadoop-client
+ 2.6.0-cdh5.8.0
+
+
+ commons-httpclient
+ commons-httpclient
+
+
+ javax.servlet
+ servlet-api
+
+
+ javax.servlet.jsp
+ jsp-api
+
+
+ org.mortbay.jetty
+ jetty
+
+
+ org.mortbay.jetty
+ jetty-util
+
+
+ tomcat
+ jasper-runtime
+
+
+ tomcat
+ jasper-compiler
+
+
+
+
+ org.apache.hadoop
+ hadoop-common
+ 2.6.0-cdh5.8.0
+
+
+ org.apache.hadoop
+ hadoop-mapreduce-client-common
+ 2.6.0-cdh5.8.0
+
+
+ org.apache.hadoop
+ hadoop-mapreduce-client-core
+ 2.6.0-cdh5.8.0
+
+
+
+ org.apache.pig
+ pig
+ 0.11.1
+ provided
+
+
+
+ commons-lang
+ commons-lang
+ 2.5
+
+
+
+ commons-io
+ commons-io
+ 2.4
+
+
+
+ org.gnu.inet
+ libidn
+ 1.15
+
+
+ it.unimi.dsi
+ mg4j
+ 1.0.1
+ compile
+
+
+ org.apache.httpcomponents
+ httpcore
+ 4.3
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 2.3.2
+
+ 1.6
+ 1.6
+
+
+
+ maven-assembly-plugin
+ 2.4
+
+
+ jar-with-dependencies
+
+ ia-web-commons
+
+
+
+ package
+
+ single
+
+
+
+
+
+
+
+ src/main/resources
+ true
+
+
+
+
+
+
+ internetarchive
+ Internet Archive Maven Repository
+ http://builds.archive.org:8080/maven2
+ default
+
+
+ true
+ daily
+ warn
+
+
+ true
+ daily
+ warn
+
+
+
+
+ cloudera
+ Cloudera Hadoop
+ https://repository.cloudera.com/artifactory/cloudera-repos/
+ default
+
+
+ true
+ daily
+ warn
+
+
+ true
+ daily
+ warn
+
+
+
+
+
+
+
+ repository
+
+ ${repository.url}
+
+
+
+
From e379dcc038150ed866c6825f875e2f2a1ee47fb0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Sun, 7 Aug 2016 16:37:30 +0200
Subject: [PATCH 11/83] Make regular expression to extract URLs from CSS more
restrictive regarding leading and trailing quotes to avoid long-runners due
to heavy back-tracking. This closes #2.
---
.../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index b2fc99a6..6e56a12d 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -24,7 +24,7 @@ public class ExtractingParseObserver implements ParseObserver {
boolean inPre = false;
protected static String cssUrlPatString =
- "url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)";
+ "url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
protected static String cssImportNoUrlPatString =
"@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
From 3763ccbb2121a7d3745b0d5d1f381a5395658b6b Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 29 Sep 2016 11:44:18 +0200
Subject: [PATCH 12/83] Extract also `property` attributes of HTML meta
elements, this fixes #3
---
.../java/org/archive/resource/html/ExtractingParseObserver.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 6e56a12d..9e0f5c2f 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -388,7 +388,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
private static class MetaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- ArrayList l = getAttrList(node,"name","rel","content","http-equiv");
+ ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property");
if(l != null) {
data.addMeta(l);
}
From f4ce8828ccee9ff85d5f77d860fc0c3e068caf7e Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 24 Nov 2016 11:52:27 +0100
Subject: [PATCH 13/83] Use CharsetDetector to guess encoding of HTML document,
fixes #4
---
.../resource/html/HTMLResourceFactory.java | 32 +++++++++++++++++--
1 file changed, 30 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
index 935843f1..34062ed9 100644
--- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
+++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
@@ -1,9 +1,14 @@
package org.archive.resource.html;
+import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import org.archive.format.http.HttpHeaders;
+import org.archive.format.json.JSONUtils;
+import org.archive.format.text.charset.CharsetDetector;
+import org.archive.format.text.charset.StandardCharsetDetector;
import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.LexParser;
import org.archive.resource.MetaData;
@@ -13,17 +18,40 @@
import org.archive.resource.ResourceParseException;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
+import org.json.JSONException;
+import org.json.JSONObject;
public class HTMLResourceFactory implements ResourceFactory {
+ protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
+ protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";
+
+ protected CharsetDetector charSetDetector = new StandardCharsetDetector();
+
+
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException, IOException {
HTMLMetaData hmd = new HTMLMetaData(parentMetaData);
ExtractingParseObserver epo = new ExtractingParseObserver(hmd);
LexParser parser = new LexParser(epo);
CDATALexer lex = new CDATALexer();
- // TODO: figure out charset:
- String charset = "UTF-8";
+
+ // guess charset based on HTTP header and sniffed content chunk
+ is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
+ byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];
+ is.mark(0);
+ int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE);
+ is.reset();
+ JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
+ HttpHeaders httpHeaders = new HttpHeaders();
+ if (headers.has("Content-Type")) {
+ try {
+ httpHeaders.add("Content-Type", headers.getString("Content-Type"));
+ } catch (JSONException e) { }
+ }
+
+ String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
+
Page page;
try {
page = new Page(is, charset);
From a0427962f7d7996367f17232ee007775664fc4fa Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 24 Nov 2016 14:03:38 +0100
Subject: [PATCH 14/83] HTML encoding detection: fix errors with empty content
or empty charset values
---
.../format/text/charset/CharsetDetector.java | 2 ++
.../resource/html/HTMLResourceFactory.java | 25 +++++++++++++------
2 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
index d391aac3..f550e342 100644
--- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java
+++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
@@ -179,6 +179,8 @@ protected String getCharsetFromMeta(byte buffer[],int len) throws IOException {
private static String trimAttrValue(String value) {
String result = value;
+ if (result.isEmpty())
+ return result;
if (result.charAt(0) == '"') {
result = result.substring(1, result.length() - 1);
} else if (result.charAt(0) == '\'') {
diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
index 34062ed9..32ffc143 100644
--- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
+++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java
@@ -5,6 +5,8 @@
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.archive.format.http.HttpHeaders;
import org.archive.format.json.JSONUtils;
import org.archive.format.text.charset.CharsetDetector;
@@ -23,6 +25,8 @@
public class HTMLResourceFactory implements ResourceFactory {
+ public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class);
+
protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";
@@ -37,21 +41,28 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
CDATALexer lex = new CDATALexer();
// guess charset based on HTTP header and sniffed content chunk
+ String charset = "UTF-8";
is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];
is.mark(0);
int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE);
is.reset();
- JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
- HttpHeaders httpHeaders = new HttpHeaders();
- if (headers.has("Content-Type")) {
+ if (chunkSize > 0) {
+ JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
+ HttpHeaders httpHeaders = new HttpHeaders();
+ if (headers.has("Content-Type")) {
+ try {
+ httpHeaders.add("Content-Type", headers.getString("Content-Type"));
+ } catch (JSONException e) { }
+ }
try {
- httpHeaders.add("Content-Type", headers.getString("Content-Type"));
- } catch (JSONException e) { }
+ charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
+ LOG.info("Guessed charset: " + charset);
+ } catch (Exception e) {
+ LOG.error("Failed to guess charset: " + e.getMessage());
+ }
}
- String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
-
Page page;
try {
page = new Page(is, charset);
From 01d076a855e11af16175a9053ffedbe8f6a2aaa1 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 24 Nov 2016 14:07:21 +0100
Subject: [PATCH 15/83] Match http-equiv meta elements with unquoted attribute
values, e.g. http-equiv=Content-Type
---
.../org/archive/format/text/charset/CharsetDetector.java | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
index f550e342..1c0fd227 100644
--- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java
+++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
@@ -60,7 +60,8 @@ public abstract class CharsetDetector {
private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" +
META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" +
- META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
+ META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + META_CONTENT_TYPE + "|" +
+ ANY_ATTR_VALUE + ")(?:\\s|>)?";
@@ -180,7 +181,7 @@ protected String getCharsetFromMeta(byte buffer[],int len) throws IOException {
private static String trimAttrValue(String value) {
String result = value;
if (result.isEmpty())
- return result;
+ return result;
if (result.charAt(0) == '"') {
result = result.substring(1, result.length() - 1);
} else if (result.charAt(0) == '\'') {
@@ -229,7 +230,6 @@ public static String findMetaContentType(String pageSample) {
protected String getCharsetFromBytes(byte buffer[], int len)
throws IOException {
String charsetName = null;
-
UniversalDetector detector = new UniversalDetector(null);
detector.handleData(buffer, 0, len);
detector.dataEnd();
From da92adb1b7561d3c7fd29794375310436f4e750f Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 9 Dec 2016 15:35:10 +0100
Subject: [PATCH 16/83] Strip empty port, do not fail, fixes #5
---
src/main/java/org/archive/url/URLParser.java | 24 +++++++++++--------
.../archive/url/IAURLCanonicalizerTest.java | 1 +
.../archive/url/WaybackURLKeyMakerTest.java | 1 +
3 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/src/main/java/org/archive/url/URLParser.java b/src/main/java/org/archive/url/URLParser.java
index 98e4c1aa..83d3c386 100644
--- a/src/main/java/org/archive/url/URLParser.java
+++ b/src/main/java/org/archive/url/URLParser.java
@@ -246,16 +246,20 @@ public static HandyURL parse(String urlString) throws URISyntaxException {
colonPort = uriAuthority.substring(portColonIndex);
}
if(colonPort != null) {
- if(colonPort.startsWith(":")) {
- try {
- port = Integer.parseInt(colonPort.substring(1));
- } catch(NumberFormatException e) {
- throw new URISyntaxException(urlString, "bad port "
- + colonPort.substring(1));
- }
- } else {
- // XXX: what's happened?!
- }
+ if(colonPort.startsWith(":")) {
+ if (colonPort.length() == 1) {
+ // a bare colon (http://example.com:/), use default port
+ } else {
+ try {
+ port = Integer.parseInt(colonPort.substring(1));
+ } catch(NumberFormatException e) {
+ throw new URISyntaxException(urlString, "bad port "
+ + colonPort.substring(1));
+ }
+ }
+ } else {
+ // XXX: what's happened?!
+ }
}
if(userInfo != null) {
int passColonIndex = userInfo.indexOf(COLON);
diff --git a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
index 3263edc7..8a7a18eb 100644
--- a/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
+++ b/src/test/java/org/archive/url/IAURLCanonicalizerTest.java
@@ -12,6 +12,7 @@ public void testFull() throws URISyntaxException {
compCan(iaC,"https://www.archive.org:80/","https://archive.org:80/");
compCan(iaC,"http://www.archive.org:443/","http://archive.org:443/");
compCan(iaC,"https://www.archive.org:443/","https://archive.org/");
+ compCan(iaC,"http://www.archive.org:/","http://archive.org/");
compCan(iaC,"http://www.archive.org/big/","http://archive.org/big");
compCan(iaC,"dns:www.archive.org","dns:www.archive.org");
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
index 34bfe625..26161456 100644
--- a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
+++ b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -22,6 +22,7 @@ public void testMakeKey() throws URISyntaxException {
assertEquals("org,archive)/goo", km.makeKey("http://archive.org/goo/?"));
assertEquals("org,archive)/goo?a&b", km.makeKey("http://archive.org/goo/?b&a"));
assertEquals("org,archive)/goo?a=1&a=2&b", km.makeKey("http://archive.org/goo/?a=2&b&a=1"));
+ assertEquals("org,archive)/", km.makeKey("http://archive.org:/"));
}
}
From eb66fc448110fac39b3692b7843d7c84c8b35112 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 18 Jan 2017 16:18:55 +0100
Subject: [PATCH 17/83] Make regular expression to extract URLs from CSS more
restrictive: merged improvements from iipc/webarchive-commons#63
---
.../html/ExtractingParseObserver.java | 25 ++++-----
.../html/ExtractingParseObserverTest.java | 51 +++++++++++--------
2 files changed, 39 insertions(+), 37 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 9e0f5c2f..0fce1b2a 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -25,13 +25,18 @@ public class ExtractingParseObserver implements ParseObserver {
protected static String cssUrlPatString =
"url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
+ protected static String cssUrlTrimPatString =
+ "^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$";
protected static String cssImportNoUrlPatString =
- "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
+ "@import\\s+((?:'[^']+')|(?:\"[^\"]+\")|(?:\\('[^']+'\\))|(?:\\(\"[^\"]+\"\\))|(?:\\([^)]+\\))|(?:[a-z0-9_.:/\\\\-]+))\\s*;";
protected static Pattern cssImportNoUrlPattern = Pattern
.compile(cssImportNoUrlPatString);
protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString);
+
+ protected static Pattern cssUrlTrimPattern = Pattern.compile(cssUrlTrimPatString);
+
private final static int MAX_TEXT_LEN = 100;
// private static String GLOBAL_ATTR[] = {"background"};
@@ -417,22 +422,10 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
while((idx < contentLen) && m.find()) {
idx = m.end();
String url = m.group(1);
- if(url.length() < 2) {
- continue;
- }
- if ((url.charAt(0) == '(')
- && (url.charAt(url.length()-1) == ')')) {
- url = url.substring(1, url.length() - 1);
- }
- if (url.charAt(0) == '"' || url.charAt(0) == '\'') {
- url = url.substring(1, url.length() - 1);
- } else if (url.charAt(0) == '\\') {
- if(url.length() <= 4) {
- continue;
- }
- url = url.substring(2, url.length() - 2);
+ url = cssUrlTrimPattern.matcher(url).replaceAll("");
+ if (!url.isEmpty()) {
+ data.addHref("path","STYLE/#text","href", url);
}
- data.addHref("path","STYLE/#text","href",url);
}
}
}
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 24b6c18a..b052e375 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -19,7 +19,9 @@ public void testHandleStyleNodeExceptions() throws Exception {
"url (' ')",
"url('\")",
"url(')",
- "url('\"')"
+ "url('\"')",
+ "url('\\\"\"')",
+ "url(''''')"
};
boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
@@ -37,6 +39,7 @@ public void testHandleStyleNodeExceptions() throws Exception {
assertFalse(except);
}
}
+
public void testHandleStyleNode() throws Exception {
String[][] tests = {
{""},
@@ -45,31 +48,36 @@ public void testHandleStyleNode() throws Exception {
{"url(\"foo.gif\")","foo.gif"},
{"url(\\\"foo.gif\\\")","foo.gif"},
{"url(\\'foo.gif\\')","foo.gif"},
-
- };
+ {"url(''foo.gif'')","foo.gif"},
+ {"url( foo.gif )","foo.gif"},
+ {"url('''')"},
+ {"url('foo.gif'')","foo.gif"},
+ };
for(String[] testa : tests) {
checkExtract(testa);
}
- // boolean except = false;
-// HTMLMetaData md = new HTMLMetaData(new MetaData());
-// ExtractingParseObserver epo = new ExtractingParseObserver(md);
-// for(String css : tests) {
-// try {
-// TextNode tn = new TextNode(css);
-// epo.handleStyleNode(tn);
-// } catch(Exception e) {
-// System.err.format("And the winner is....(%s)\n", css);
-// e.printStackTrace();
-// except = true;
-// throw e;
-// }
-// assertFalse(except);
-// }
}
+
+ /**
+ * Test whether the pattern matcher does extract nothing and also does not
+ * not hang-up if an overlong CSS link is truncated.
+ */
+ public void testHandleStyleNodeNoHangupTruncated() throws Exception {
+ StringBuilder sb = new StringBuilder();
+ sb.append("url(");
+ for (int i = 0; i < 500000; i++)
+ sb.append('\'');
+ sb.append("foo.gif");
+ for (int i = 0; i < 499000; i++)
+ sb.append('\'');
+ String[] test = new String[1];
+ test[0] = sb.toString();
+ checkExtract(test);
+ }
+
private void checkExtract(String[] data) throws JSONException {
// System.err.format("CSS(%s) want[0](%s)\n",css,want[0]);
String css = data[0];
- boolean except = false;
HTMLMetaData md = new HTMLMetaData(new MetaData());
ExtractingParseObserver epo = new ExtractingParseObserver(md);
try {
@@ -87,10 +95,11 @@ private void checkExtract(String[] data) throws JSONException {
assertTrue(o instanceof JSONObject);
JSONObject jo = (JSONObject) o;
- assertEquals(data[i],jo.getString("href"));
+ assertEquals("CSS link extraction failed for <" + css + ">",
+ data[i], jo.getString("href"));
}
} else {
- assertNull(a);
+ assertNull("Expected no extracted link for <" + css + ">", a);
}
}
From bf22eecbb18c75fa34b00f3f4ed9f863c089ea03 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 27 Jan 2017 09:05:19 +0100
Subject: [PATCH 18/83] CharsetDetector: sync with iipc/webarchive-commons
---
.../org/archive/format/text/charset/CharsetDetector.java | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
index 1c0fd227..690f8b99 100644
--- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java
+++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java
@@ -179,9 +179,10 @@ protected String getCharsetFromMeta(byte buffer[],int len) throws IOException {
}
private static String trimAttrValue(String value) {
+ if (value.isEmpty()) {
+ return value;
+ }
String result = value;
- if (result.isEmpty())
- return result;
if (result.charAt(0) == '"') {
result = result.substring(1, result.length() - 1);
} else if (result.charAt(0) == '\'') {
From 7b9e81233512ff9754995e0aa83788e8b814b8dc Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Mon, 20 Feb 2017 11:54:59 +0100
Subject: [PATCH 19/83] fix indentation (use tab)
---
.../archive/extract/ResourceExtractor.java | 17 ++--
.../archive/extract/WATExtractorOutput.java | 31 +++---
.../html/ExtractingParseObserver.java | 96 +++++++++----------
3 files changed, 71 insertions(+), 73 deletions(-)
diff --git a/src/main/java/org/archive/extract/ResourceExtractor.java b/src/main/java/org/archive/extract/ResourceExtractor.java
index ff7d8c50..5d71bbd5 100644
--- a/src/main/java/org/archive/extract/ResourceExtractor.java
+++ b/src/main/java/org/archive/extract/ResourceExtractor.java
@@ -53,8 +53,7 @@ private static int USAGE(int exitCode) {
System.err.println("\t\t\t (note that column 1 is NOT standard Wayback canonicalized)\n");
System.err.println("\t\t-wat\tembed JSON output in a compressed WARC" +
"wrapper, for storage, or sharing.");
- System.err.println("\t\t-wet\tembed text extracts in a compressed WARC" +
- "wrapper, for storage, or sharing.");
+ System.err.println("\t\t-wet\tembed text extracts in a compressed WARC" + "wrapper, for storage, or sharing.");
return exitCode;
}
@@ -103,13 +102,13 @@ public int run(String[] args)
} else if(args[arg].equals("-wat")) {
path = args[arg+1];
out = new WATExtractorOutput(os);
- } else if(args[arg].equals("-wet")) {
- path = args[arg+1];
- out = new WETExtractorOutput(os);
- } else {
- String filter = args[arg+1];
- out = new JSONViewExtractorOutput(os, filter);
- }
+ } else if (args[arg].equals("-wet")) {
+ path = args[arg + 1];
+ out = new WETExtractorOutput(os);
+ } else {
+ String filter = args[arg + 1];
+ out = new JSONViewExtractorOutput(os, filter);
+ }
} else {
out = new DumpingExtractorOutput(os);
}
diff --git a/src/main/java/org/archive/extract/WATExtractorOutput.java b/src/main/java/org/archive/extract/WATExtractorOutput.java
index ee803672..7bb9fb88 100644
--- a/src/main/java/org/archive/extract/WATExtractorOutput.java
+++ b/src/main/java/org/archive/extract/WATExtractorOutput.java
@@ -29,17 +29,17 @@ public class WATExtractorOutput implements ExtractorOutput {
private static int DEFAULT_BUFFER_RAM = 1024 * 1024;
private int bufferRAM = DEFAULT_BUFFER_RAM;
private final static Charset UTF8 = Charset.forName("UTF-8");
- private String outFilename;
+ private String outFilename;
- public WATExtractorOutput(OutputStream out) {
- this(out, null);
- }
+ public WATExtractorOutput(OutputStream out) {
+ this(out, null);
+ }
public WATExtractorOutput(OutputStream out, String filename) {
gzW = new GZIPMemberWriter(out);
recW = new WARCRecordWriter();
wroteFirst = false;
- outFilename = filename;
+ outFilename = filename;
}
private CommitedOutputStream getOutput() {
@@ -62,9 +62,8 @@ public void output(Resource resource) throws IOException {
throw new IOException("Missing Envelope.Format");
}
- // remove the text extracts if it exists
- JSONUtils.removeObject(top, "Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata", "Text");
-
+ // remove the text extracts if it exists
+ JSONUtils.removeObject(top, "Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata", "Text");
cos = getOutput();
if(envelopeFormat.equals("ARC")) {
@@ -79,15 +78,15 @@ public void output(Resource resource) throws IOException {
}
private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
- String filename = outFilename;
+ String filename = outFilename;
- if (filename == null) {
- filename = JSONUtils.extractSingle(md, "Container.Filename");
+ if (filename == null) {
+ filename = JSONUtils.extractSingle(md, "Container.Filename");
- if(filename == null) {
- throw new IOException("No Container.Filename...");
- }
- }
+ if (filename == null) {
+ throw new IOException("No Container.Filename...");
+ }
+ }
HttpHeaders headers = new HttpHeaders();
headers.add("Software-Info", IAUtils.COMMONS_VERSION);
@@ -95,7 +94,7 @@ private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException
ByteArrayOutputStream baos = new ByteArrayOutputStream();
headers.write(baos);
- recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray());
+ recW.writeWARCInfoRecord(recOut, filename, baos.toByteArray());
}
private String extractOrIO(MetaData md, String path) throws IOException {
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 0fce1b2a..1ed61497 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -18,10 +18,10 @@ public class ExtractingParseObserver implements ParseObserver {
HTMLMetaData data;
Stack> openAnchors;
Stack openAnchorTexts;
- StringBuffer textExtract;
+ StringBuffer textExtract;
String title = null;
boolean inTitle = false;
- boolean inPre = false;
+ boolean inPre = false;
protected static String cssUrlPatString =
"url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)";
@@ -67,7 +67,7 @@ public ExtractingParseObserver(HTMLMetaData data) {
this.data = data;
openAnchors = new Stack>();
openAnchorTexts = new Stack();
- textExtract = new StringBuffer(8192);
+ textExtract = new StringBuffer(8192);
}
public void handleDocumentStart() {
@@ -75,10 +75,10 @@ public void handleDocumentStart() {
}
public void handleDocumentComplete() {
- if (textExtract.length() > 0) {
- data.setTextExtract(textExtract.toString());
- textExtract = new StringBuffer(8192);
- }
+ if (textExtract.length() > 0) {
+ data.setTextExtract(textExtract.toString());
+ textExtract = new StringBuffer(8192);
+ }
}
public void handleTagEmpty(TagNode tag) {
@@ -91,8 +91,8 @@ public void handleTagOpen(TagNode tag) {
inTitle = !tag.isEmptyXmlTag();
return;
} else if (name.equals("PRE")) {
- inPre = true;
- }
+ inPre = true;
+ }
// first the global attributes:
// background
@@ -139,59 +139,59 @@ public void handleTagClose(TagNode tag) {
}
}
} else if (tag.getTagName().equals("PRE")) {
- inPre = false;
- }
+ inPre = false;
+ }
}
public void handleTextNode(TextNode text) {
// TODO: OPTIMIZ: This can be a lot smarter, if StringBuilders are full,
- // this result is thrown away.
- //System.out.println("JDBUG: Got text from node: " + text.getText().toString());
-
- String txt = text.getText();
- if (!inPre) {
- txt = Translate.decode(txt);
- txt = txt.replace('\u00a0', ' ');
-
- char c = ' ';
- if (textExtract.length() > 0) {
- c = textExtract.charAt(textExtract.length()-1);
- }
- for (int i = 0; i < txt.length(); i++) {
- char c2 = txt.charAt(i);
- // Translate so output is a bit cleaner
- if (c2 == '\r') {
- c2 = '\n';
- }
- if (!Character.isWhitespace(c) || !Character.isWhitespace(c2)) {
- textExtract.append(c2);
- }
- c = c2;
- }
- }
- else
- textExtract.append(txt);
-
- String t = text.getText().replaceAll("\\s+", " ");
-
- if(t.length() > MAX_TEXT_LEN) {
- t = t.substring(0,MAX_TEXT_LEN);
+ // this result is thrown away.
+ // System.out.println("JDBUG: Got text from node: " +
+ // text.getText().toString());
+
+ String txt = text.getText();
+ if (!inPre) {
+ txt = Translate.decode(txt);
+ txt = txt.replace('\u00a0', ' ');
+
+ char c = ' ';
+ if (textExtract.length() > 0) {
+ c = textExtract.charAt(textExtract.length() - 1);
+ }
+ for (int i = 0; i < txt.length(); i++) {
+ char c2 = txt.charAt(i);
+ // Translate so output is a bit cleaner
+ if (c2 == '\r') {
+ c2 = '\n';
+ }
+ if (!Character.isWhitespace(c) || !Character.isWhitespace(c2)) {
+ textExtract.append(c2);
+ }
+ c = c2;
+ }
+ } else
+ textExtract.append(txt);
+
+ String t = text.getText().replaceAll("\\s+", " ");
+
+ if (t.length() > MAX_TEXT_LEN) {
+ t = t.substring(0, MAX_TEXT_LEN);
}
- if(inTitle) {
+ if (inTitle) {
title = t;
} else {
-
- for(StringBuilder s : openAnchorTexts) {
- if(s.length() >= MAX_TEXT_LEN) {
+
+ for (StringBuilder s : openAnchorTexts) {
+ if (s.length() >= MAX_TEXT_LEN) {
// if we are full, parents enclosing us should be too..
break;
}
- if(s.length() + t.length() < MAX_TEXT_LEN) {
+ if (s.length() + t.length() < MAX_TEXT_LEN) {
s.append(t);
} else {
// only add as much as we can:
- s.append(t.substring(0,MAX_TEXT_LEN - s.length()));
+ s.append(t.substring(0, MAX_TEXT_LEN - s.length()));
}
// BUGBUG: check now for multiple trailing spaces, and strip:
}
From 4a56f814ab0689562d7e2d0a51464a25b367be24 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Mon, 20 Feb 2017 11:55:38 +0100
Subject: [PATCH 20/83] upgrade to CDH 5.10.0
---
pom-cdh5.xml | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/pom-cdh5.xml b/pom-cdh5.xml
index 3619206d..f0c90ac2 100644
--- a/pom-cdh5.xml
+++ b/pom-cdh5.xml
@@ -51,12 +51,12 @@
commons-httpclient
commons-httpclient
3.1
-
+
org.apache.hadoop
hadoop-client
- 2.6.0-cdh5.8.0
+ 2.6.0-cdh5.10.0
commons-httpclient
@@ -85,8 +85,8 @@
tomcat
jasper-compiler
-
-
+
+
org.apache.hadoop
@@ -128,7 +128,7 @@
libidn
1.15
-
+
it.unimi.dsi
mg4j
1.0.1
From 5626c90ad5cfeed215a733c8cb756648d180ecde Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 21 Feb 2017 13:42:43 +0100
Subject: [PATCH 21/83] Improve HTML link extraction, fixes #9 - add extractors
for more elements which can take URLs as attribute values, complete
attributes - add unit test to verify link extraction
---
.../html/ExtractingParseObserver.java | 48 ++++++-
.../html/ExtractingParseObserverTest.java | 116 +++++++++++++++
.../resource/html/link-extraction-test.warc | 136 ++++++++++++++++++
3 files changed, 298 insertions(+), 2 deletions(-)
create mode 100644 src/test/resources/org/archive/resource/html/link-extraction-test.warc
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index 1ed61497..c97e0d42 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -60,6 +60,17 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("META", new MetaTagExtractor());
extractors.put("OBJECT", new ObjectTagExtractor());
extractors.put("SCRIPT", new ScriptTagExtractor());
+ extractors.put("Q", new QuotationLinkTagExtractor());
+ extractors.put("BLOCKQUOTE", new QuotationLinkTagExtractor());
+ extractors.put("DEL", new QuotationLinkTagExtractor());
+ extractors.put("INS", new QuotationLinkTagExtractor());
+ // HTML5:
+ extractors.put("BUTTON", new ButtonTagExtractor());
+ extractors.put("MENUITEM", new MenuitemTagExtractor());
+ extractors.put("VIDEO", new EmbedVideoTagExtractor());
+ extractors.put("AUDIO", new EmbedTagExtractor());
+ extractors.put("TRACK", new EmbedTagExtractor());
+ extractors.put("SOURCE", new EmbedTagExtractor());
}
@@ -335,12 +346,24 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ private static class ButtonTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"formaction");
+ }
+ }
+
private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
+ private static class EmbedVideoTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"src","poster");
+ }
+ }
+
private static class FormTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = new ArrayList();
@@ -368,21 +391,26 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
addBasicHrefs(data,node,"src");
}
}
+
private static class IFrameTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
+
private static class ImgTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addHrefWithAttrs(data,node,"src","alt","title");
+ addBasicHrefs(data,node,"longdesc");
}
}
+
private static class InputTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"src");
+ addBasicHrefs(data,node,"src","formaction");
}
}
+
private static class LinkTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrListUrl(node,"href","rel","type");
@@ -391,6 +419,13 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
+ private static class MenuitemTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"icon");
+ }
+ }
+
private static class MetaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrList(node,"name","rel","content","http-equiv","property");
@@ -399,11 +434,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
private static class ObjectTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"codebase","cdata");
+ addBasicHrefs(data,node,"codebase","cdata","data");
}
}
+
+ private static class QuotationLinkTagExtractor implements TagExtractor {
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ addBasicHrefs(data,node,"cite");
+ }
+ }
+
private static class ScriptTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrListUrl(node,"src","type");
@@ -412,6 +455,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
Matcher m = pattern.matcher(content);
int idx = 0;
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index b052e375..a8b5213b 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -1,15 +1,33 @@
package org.archive.resource.html;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.MetaData;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
import org.htmlparser.nodes.TextNode;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Multimap;
+
import junit.framework.TestCase;
public class ExtractingParseObserverTest extends TestCase {
+ private static final Logger LOG =
+ Logger.getLogger(ExtractingParseObserverTest.class.getName());
+
public void testHandleStyleNodeExceptions() throws Exception {
String[] tests = {
"some css",
@@ -103,5 +121,103 @@ private void checkExtract(String[] data) throws JSONException {
}
}
+ private void checkLink(Multimap links, String url, String path) {
+ assertTrue("Link with URL " + url + " not found", links.containsKey(url));
+ assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path));
+ }
+
+ private void checkLinks(Resource resource, String[][] expectedLinks) {
+ assertNotNull(resource);
+ assertTrue(resource instanceof HTMLResource);
+ MetaData md = resource.getMetaData();
+ LOG.info(md.toString());
+ Multimap links = ArrayListMultimap.create();
+ try {
+ //
+ String baseUrl = (String) md.getJSONObject("Head").opt("Base");
+ if (baseUrl != null) {
+ links.put(baseUrl, "__base__");
+ }
+ //
+ JSONArray metas = md.getJSONObject("Head").optJSONArray("Metas");
+ if (metas != null) {
+ for (int i = 0; i < metas.length(); i++) {
+ JSONObject o = (JSONObject) metas.optJSONObject(i);
+ if (o.getString("http-equiv").equals("Refresh")) {
+ String metaRefreshTarget = o.getString("content").replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", "");
+ LOG.info(metaRefreshTarget);
+ links.put(metaRefreshTarget, "__meta_refresh__");
+ }
+ }
+ }
+ } catch (JSONException e) {
+ fail("Failed to parse JSON: " + e.getMessage());
+ }
+ // extract outlinks
+ List linkArrays = new ArrayList();
+ if (md.optJSONArray("Links") != null) {
+ linkArrays.add(md.optJSONArray("Links"));
+ }
+ try {
+ if (md.getJSONObject("Head") != null && md.getJSONObject("Head").getJSONArray("Link") != null) {
+ linkArrays.add(md.getJSONObject("Head").getJSONArray("Link"));
+ }
+ } catch (JSONException e1) {
+ }
+ for (JSONArray ldata : linkArrays) {
+ for (int i = 0; i < ldata.length(); i++) {
+ JSONObject o = (JSONObject) ldata.optJSONObject(i);
+ try {
+ String url = o.getString("url");
+ links.put(url, o.getString("path"));
+ LOG.info(" found link: " + o.getString("url") + " " + o.getString("path"));
+ } catch (JSONException e) {
+ fail("Failed to extract URL from link: " + e.getMessage());
+ }
+ }
+ }
+ assertEquals("Unexpected number of links", expectedLinks.length, links.size());
+ for (String[] l : expectedLinks) {
+ checkLink(links, l[0], l[1]);
+ }
+ }
+
+ public void testLinkExtraction() throws ResourceParseException, IOException {
+ String testFileName = "link-extraction-test.warc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).toString());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor =
+ new ExtractingResourceProducer(producer, mapper);
+ extractor.getNext(); // skip warcinfo record
+ String[][] html4links = {
+ {"http://www.example.com/", "__base__"},
+ {"http://www.example.com/redirected.html", "__meta_refresh__"},
+ {"background.jpg", "BODY@/background"},
+ {"http://www.example.com/a-href.html", "A@/href"},
+ {"#anchor", "A@/href"},
+ {"image.png", "IMG@/src"},
+ {"image.gif", "IMG@/src"},
+ {"http://example.com/image-description.html#image.gif", "IMG@/longdesc"},
+ {"helloworld.swf", "OBJECT@/data"},
+ {"http://www.example.com/shakespeare.html", "Q@/cite"},
+ {"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
+ };
+ checkLinks(extractor.getNext(), html4links);
+ String[][] html5links = {
+ {"http:///www.example.com/video.html", "LINK@/href", "canonical"},
+ {"video.rss", "LINK@/href", "alternate"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
+ };
+ checkLinks(extractor.getNext(), html5links);
+ String[][] fbVideoLinks = {
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
+ {"https://www.facebook.com/facebook/", "A@/href"},
+ };
+ checkLinks(extractor.getNext(), fbVideoLinks);
+ }
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
new file mode 100644
index 00000000..aed76aad
--- /dev/null
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -0,0 +1,136 @@
+WARC/1.0
+WARC-Type: warcinfo
+Content-Type: application/warc-fields
+WARC-Date: 2017-02-20T14:00:56Z
+Content-Length: 128
+
+format: WARC File Format 1.0
+conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
+robots: classic
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2017-02-20T14:00:56Z
+WARC-Target-URI: http://www.example.com/html4.html
+Content-Type: application/http;msgtype=response
+Content-Length: 1243
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 14:00:56 GMT
+Content-Length: 1125
+Content-Type: application/xhtml+xml
+
+
+
+
+
+
+
+Test XHTML Link Extraction
+
+
+A@/href
+
+ anchor only
+
+
+
+
+
+ To be or not to be.
+
+
+To be, or not to be, that is the question:
+Whether 'tis nobler in the mind to suffer
+The slings and arrows of outrageous fortune, …
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/link-extraction-test-html5-video.html
+WARC-Date: 2017-02-20T21:35:03Z
+Content-Type: application/http;msgtype=response
+Content-Length: 890
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 21:35:03 GMT
+Content-Length: 789
+Content-Type: text/html
+
+
+
+
+Test HTML5 Video Tag
+
+
+
+
+
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/fb-video.html
+WARC-Date: 2017-02-20T16:58:50Z
+Content-Type: application/http;msgtype=response
+Content-Length: 1330
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 16:58:50 GMT
+Content-Length: 1194
+Content-Type: text/html
+
+
+
+
+ fb-video - Embedded Videos - Social Plugins
+
+
+
+
+
+
+
+
+
+
+
+
+
+
From 4b0deb414db6917611b4df054cd4cbcd87eb8c89 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 22 Feb 2017 10:15:04 +0100
Subject: [PATCH 22/83] Extract links/URLs from data-href and data-uri
attributes, fixes #7
---
.../html/ExtractingParseObserver.java | 30 ++-
.../html/ExtractingParseObserverTest.java | 63 +++++-
.../resource/html/link-extraction-test.warc | 184 ++++++++++++++++++
3 files changed, 261 insertions(+), 16 deletions(-)
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index c97e0d42..8821d4cd 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -2,12 +2,17 @@
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.Stack;
+import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.format.text.html.ParseObserver;
+import org.htmlparser.Attribute;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
@@ -39,11 +44,10 @@ public class ExtractingParseObserver implements ParseObserver {
private final static int MAX_TEXT_LEN = 100;
-// private static String GLOBAL_ATTR[] = {"background"};
-
private static final String PATH = "path";
private static final String PATH_SEPARATOR = "@/";
- private final static Map extractors;
+ private static final Map extractors;
+ private static final Set globalHrefAttributes;
static {
extractors = new HashMap();
extractors.put("A", new AnchorTagExtractor());
@@ -71,6 +75,11 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("AUDIO", new EmbedTagExtractor());
extractors.put("TRACK", new EmbedTagExtractor());
extractors.put("SOURCE", new EmbedTagExtractor());
+
+ globalHrefAttributes = new HashSet();
+ globalHrefAttributes.add("background");
+ globalHrefAttributes.add("data-href");
+ globalHrefAttributes.add("data-uri");
}
@@ -106,10 +115,17 @@ public void handleTagOpen(TagNode tag) {
}
// first the global attributes:
- // background
- String v = tag.getAttribute("background");
- if(v != null) {
- data.addHref(PATH,makePath(name,"background"),"url",v);
+ Vector attributes = tag.getAttributesEx();
+ for (Attribute a : attributes) {
+ String attrName = a.getName();
+ String attrValue = a.getValue();
+ if (attrName == null || attrValue == null) {
+ continue;
+ }
+ attrName = attrName.toLowerCase(Locale.ROOT);
+ if (globalHrefAttributes.contains(attrName)) {
+ data.addHref(PATH,makePath(name,attrName),"url",attrValue);
+ }
}
// TODO: style attribute, BASE(href) tag, Resolve URLs
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index a8b5213b..b1b800c2 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -132,26 +132,28 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
MetaData md = resource.getMetaData();
LOG.info(md.toString());
Multimap links = ArrayListMultimap.create();
- try {
+ JSONObject head = md.optJSONObject("Head");
+ if (head != null) {
//
- String baseUrl = (String) md.getJSONObject("Head").opt("Base");
+ String baseUrl = (String) head.opt("Base");
if (baseUrl != null) {
links.put(baseUrl, "__base__");
}
//
- JSONArray metas = md.getJSONObject("Head").optJSONArray("Metas");
+ JSONArray metas = head.optJSONArray("Metas");
if (metas != null) {
for (int i = 0; i < metas.length(); i++) {
JSONObject o = (JSONObject) metas.optJSONObject(i);
- if (o.getString("http-equiv").equals("Refresh")) {
- String metaRefreshTarget = o.getString("content").replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", "");
- LOG.info(metaRefreshTarget);
- links.put(metaRefreshTarget, "__meta_refresh__");
+ String httpEquiv = o.optString("http-equiv");
+ if (httpEquiv != null && httpEquiv.equalsIgnoreCase("Refresh")) {
+ String metaRefreshTarget = o.optString("content");
+ if (metaRefreshTarget != null) {
+ metaRefreshTarget = metaRefreshTarget.replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", "");
+ links.put(metaRefreshTarget, "__meta_refresh__");
+ }
}
}
}
- } catch (JSONException e) {
- fail("Failed to parse JSON: " + e.getMessage());
}
// extract outlinks
List linkArrays = new ArrayList();
@@ -212,12 +214,55 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
};
checkLinks(extractor.getNext(), html5links);
+ String[][] html5links2 = {
+ {"http://www.example.com/", "A@/href"},
+ };
+ checkLinks(extractor.getNext(), html5links2);
String[][] fbVideoLinks = {
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
{"https://www.facebook.com/facebook/", "A@/href"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbVideoLinks);
+ String[][] dataHrefLinks = {
+ {"standard.css", "LINK@/href", "stylesheet"},
+ {"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
+ {"https://www.facebook.com/facebook/", "A@/href"},
+ {"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"},
+ {"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"},
+ {"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"},
+ {"/content-page", "ARTICLE@/data-href"},
+ {"/content-page", "A@/href"},
+ {"/tags/content","A@/href"},
+ {"/tags/headlines", "A@/href"},
+ {"http://grabaperch.com", "DIV@/data-href"},
+ {"green.css", "LINK@/data-href"},
+ {"blue.css", "LINK@/data-href"},
+ {"http://codecanyon.net/user/CodingJack", "A@/data-href"},
+ {"jackbox/img/thumbs/4.jpg", "IMG@/src"},
+ {"//venobox-destination", "A@/data-href"},
+ {"#", "A@/href"},
+ {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"},
+ {"#", "A@/href"},
+ {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
+ };
+ checkLinks(extractor.getNext(), dataHrefLinks);
+ String[][] fbSocialLinks = {
+ {"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
+ {"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
+ {"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"},
+ {"https://www.facebook.com/zuck", "DIV@/data-href"},
+ {"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"},
+ {"https://www.facebook.com/facebook", "A@/href"},
+ {"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
+ };
+ checkLinks(extractor.getNext(), fbSocialLinks);
}
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
index aed76aad..1781168c 100644
--- a/src/test/resources/org/archive/resource/html/link-extraction-test.warc
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -90,6 +90,42 @@ Content-Type: text/html