diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index deb8c8c0..826851e0 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -2,12 +2,17 @@
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.Stack;
+import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.format.text.html.ParseObserver;
+import org.htmlparser.Attribute;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
@@ -36,11 +41,10 @@ public class ExtractingParseObserver implements ParseObserver {
private final static int MAX_TEXT_LEN = 100;
-// private static String GLOBAL_ATTR[] = {"background"};
-
private static final String PATH = "path";
private static final String PATH_SEPARATOR = "@/";
- private final static Map extractors;
+ private static final Map extractors;
+ private static final Set globalHrefAttributes;
static {
extractors = new HashMap();
extractors.put("A", new AnchorTagExtractor());
@@ -57,6 +61,22 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("META", new MetaTagExtractor());
extractors.put("OBJECT", new ObjectTagExtractor());
extractors.put("SCRIPT", new ScriptTagExtractor());
+ extractors.put("Q", new QuotationLinkTagExtractor());
+ extractors.put("BLOCKQUOTE", new QuotationLinkTagExtractor());
+ extractors.put("DEL", new QuotationLinkTagExtractor());
+ extractors.put("INS", new QuotationLinkTagExtractor());
+ // HTML5:
+ extractors.put("BUTTON", new ButtonTagExtractor());
+ extractors.put("MENUITEM", new MenuitemTagExtractor());
+ extractors.put("VIDEO", new EmbedVideoTagExtractor());
+ extractors.put("AUDIO", new EmbedTagExtractor());
+ extractors.put("TRACK", new EmbedTagExtractor());
+ extractors.put("SOURCE", new EmbedTagExtractor());
+
+ globalHrefAttributes = new HashSet();
+ globalHrefAttributes.add("background");
+ globalHrefAttributes.add("data-href");
+ globalHrefAttributes.add("data-uri");
}
@@ -84,11 +104,19 @@ public void handleTagOpen(TagNode tag) {
inTitle = !tag.isEmptyXmlTag();
return;
}
+
// first the global attributes:
- // background
- String v = tag.getAttribute("background");
- if(v != null) {
- data.addHref(PATH,makePath(name,"background"),"url",v);
+ Vector attributes = tag.getAttributesEx();
+ for (Attribute a : attributes) {
+ String attrName = a.getName();
+ String attrValue = a.getValue();
+ if (attrName == null || attrValue == null) {
+ continue;
+ }
+ attrName = attrName.toLowerCase(Locale.ROOT);
+ if (globalHrefAttributes.contains(attrName)) {
+ data.addHref(PATH,makePath(name,attrName),"url",attrValue);
+ }
}
// TODO: style attribute, BASE(href) tag, Resolve URLs
@@ -296,12 +324,24 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+    /**
+     * Tag extractor registered for BUTTON elements: hands the
+     * {@code formaction} attribute of the tag to {@code addBasicHrefs}.
+     */
+    private static class ButtonTagExtractor implements TagExtractor {
+        public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+            addBasicHrefs(data, node, "formaction");
+        }
+    }
+
+    /**
+     * Hands the {@code src} attribute of the tag to {@code addBasicHrefs}.
+     * The static initializer registers this extractor for AUDIO, TRACK and
+     * SOURCE; further registrations may exist outside the visible hunks.
+     */
private static class EmbedTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
+    /**
+     * Tag extractor registered for VIDEO elements: hands both the
+     * {@code src} and the {@code poster} attribute to {@code addBasicHrefs}.
+     */
+    private static class EmbedVideoTagExtractor implements TagExtractor {
+        public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+            addBasicHrefs(data, node, "src", "poster");
+        }
+    }
+
+
private static class FormTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = new ArrayList();
@@ -329,21 +369,26 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
addBasicHrefs(data,node,"src");
}
}
+
+    /** Hands the {@code src} attribute of the tag to {@code addBasicHrefs}. */
private static class IFrameTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}
+
+    /**
+     * Hands {@code src} together with {@code alt} and {@code title} to
+     * {@code addHrefWithAttrs}, then passes {@code longdesc} to
+     * {@code addBasicHrefs} as a separate call.
+     */
private static class ImgTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addHrefWithAttrs(data,node,"src","alt","title");
+ addBasicHrefs(data,node,"longdesc");
}
}
+
+    /**
+     * Hands the {@code src} and {@code formaction} attributes of the tag to
+     * {@code addBasicHrefs}.
+     */
private static class InputTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"src");
+ addBasicHrefs(data,node,"src","formaction");
}
}
+
private static class LinkTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrListUrl(node,"href","rel","type");
@@ -352,6 +397,13 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
+    /**
+     * Tag extractor registered for MENUITEM elements: hands the
+     * {@code icon} attribute of the tag to {@code addBasicHrefs}.
+     */
+    private static class MenuitemTagExtractor implements TagExtractor {
+        public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+            addBasicHrefs(data, node, "icon");
+        }
+    }
+
private static class MetaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrList(node,"name","rel","content","http-equiv");
@@ -360,11 +412,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
+    /**
+     * Hands the {@code codebase}, {@code cdata} and {@code data} attributes
+     * of the tag to {@code addBasicHrefs}.
+     * NOTE(review): "cdata" is not a standard OBJECT attribute; possibly
+     * "classid" was intended - confirm before relying on it.
+     */
private static class ObjectTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
- addBasicHrefs(data,node,"codebase","cdata");
+ addBasicHrefs(data,node,"codebase","cdata","data");
}
}
+
+    /**
+     * Tag extractor registered for Q, BLOCKQUOTE, DEL and INS elements:
+     * hands the {@code cite} attribute of the tag to {@code addBasicHrefs}.
+     */
+    private static class QuotationLinkTagExtractor implements TagExtractor {
+        public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+            addBasicHrefs(data, node, "cite");
+        }
+    }
+
private static class ScriptTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList l = getAttrListUrl(node,"src","type");
@@ -373,6 +433,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}
+
private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
Matcher m = pattern.matcher(content);
int idx = 0;
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index bfbd6f02..8f690a06 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -1,15 +1,33 @@
package org.archive.resource.html;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.archive.extract.ExtractingResourceFactoryMapper;
+import org.archive.extract.ExtractingResourceProducer;
+import org.archive.extract.ProducerUtils;
+import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.MetaData;
+import org.archive.resource.Resource;
+import org.archive.resource.ResourceParseException;
+import org.archive.resource.ResourceProducer;
import org.htmlparser.nodes.TextNode;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Multimap;
+
import junit.framework.TestCase;
public class ExtractingParseObserverTest extends TestCase {
+ private static final Logger LOG =
+ Logger.getLogger(ExtractingParseObserverTest.class.getName());
+
public void testHandleStyleNodeExceptions() throws Exception {
String[] tests = {
"some css",
@@ -103,5 +121,148 @@ private void checkExtract(String[] data) throws JSONException {
}
}
+    /**
+     * Asserts that {@code links} holds an entry for {@code url} and that
+     * {@code path} is among the paths recorded for it.
+     *
+     * @param links extracted links, keyed by URL with the extraction paths as values
+     * @param url   URL expected to have been extracted
+     * @param path  path (e.g. {@code A@/href}) expected for {@code url}
+     */
+    private void checkLink(Multimap<String, String> links, String url, String path) {
+        assertTrue("Link with URL " + url + " not found", links.containsKey(url));
+        assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path));
+    }
+
+    /**
+     * Collects all links found in the metadata of {@code resource} and checks
+     * them against {@code expectedLinks}.
+     *
+     * Links are gathered from four places: the "Base" entry of "Head"
+     * (recorded with path {@code __base__}), "Metas" entries of "Head" whose
+     * {@code http-equiv} is "Refresh" (path {@code __meta_refresh__}), the
+     * top-level "Links" array and the "Link" array of "Head". The test fails
+     * if the total number of collected links differs from the number of
+     * expected rows, or if any expected URL/path pair is missing.
+     *
+     * @param resource      parsed resource, must be an {@link HTMLResource}
+     * @param expectedLinks rows of {@code {url, path, ...}}; only the first
+     *                      two columns of each row are checked
+     */
+    private void checkLinks(Resource resource, String[][] expectedLinks) {
+        assertNotNull(resource);
+        assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
+        MetaData md = resource.getMetaData();
+        LOG.info(md.toString());
+        Multimap<String, String> links = ArrayListMultimap.create();
+        JSONObject head = md.optJSONObject("Head");
+        if (head != null) {
+            String baseUrl = (String) head.opt("Base");
+            if (baseUrl != null) {
+                links.put(baseUrl, "__base__");
+            }
+            JSONArray metas = head.optJSONArray("Metas");
+            if (metas != null) {
+                for (int i = 0; i < metas.length(); i++) {
+                    JSONObject meta = metas.optJSONObject(i);
+                    if (meta == null) {
+                        // not a JSON object, so it cannot describe a meta refresh
+                        continue;
+                    }
+                    // optString() yields "" for a missing key, never null
+                    if ("Refresh".equalsIgnoreCase(meta.optString("http-equiv"))) {
+                        // strip the optional "<seconds>;" delay and the "url=" prefix
+                        String metaRefreshTarget = meta.optString("content")
+                                .replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", "");
+                        links.put(metaRefreshTarget, "__meta_refresh__");
+                    }
+                }
+            }
+        }
+        // extract outlinks
+        List<JSONArray> linkArrays = new ArrayList<JSONArray>();
+        JSONArray bodyLinks = md.optJSONArray("Links");
+        if (bodyLinks != null) {
+            linkArrays.add(bodyLinks);
+        }
+        // opt* lookups return null for a missing "Head"/"Link", so no
+        // exception has to be caught (and silently dropped) here
+        JSONArray headLinks = (head != null) ? head.optJSONArray("Link") : null;
+        if (headLinks != null) {
+            linkArrays.add(headLinks);
+        }
+        for (JSONArray linkArray : linkArrays) {
+            for (int i = 0; i < linkArray.length(); i++) {
+                JSONObject link = linkArray.optJSONObject(i);
+                assertNotNull("Link entry " + i + " is not a JSON object", link);
+                try {
+                    String url = link.getString("url");
+                    String path = link.getString("path");
+                    links.put(url, path);
+                    LOG.info(" found link: " + url + " " + path);
+                } catch (JSONException e) {
+                    fail("Failed to extract URL from link: " + e.getMessage());
+                }
+            }
+        }
+        assertEquals("Unexpected number of links", expectedLinks.length, links.size());
+        for (String[] expected : expectedLinks) {
+            checkLink(links, expected[0], expected[1]);
+        }
+    }
+
+ public void testLinkExtraction() throws ResourceParseException, IOException {
+ String testFileName = "link-extraction-test.warc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor =
+ new ExtractingResourceProducer(producer, mapper);
+ extractor.getNext(); // skip warcinfo record
+ String[][] html4links = {
+ {"http://www.example.com/", "__base__"},
+ {"http://www.example.com/redirected.html", "__meta_refresh__"},
+ {"background.jpg", "BODY@/background"},
+ {"http://www.example.com/a-href.html", "A@/href"},
+ {"#anchor", "A@/href"},
+ {"image.png", "IMG@/src"},
+ {"image.gif", "IMG@/src"},
+ {"http://example.com/image-description.html#image.gif", "IMG@/longdesc"},
+ {"helloworld.swf", "OBJECT@/data"},
+ {"http://www.example.com/shakespeare.html", "Q@/cite"},
+ {"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
+ };
+ checkLinks(extractor.getNext(), html4links);
+ String[][] html5links = {
+ {"http:///www.example.com/video.html", "LINK@/href", "canonical"},
+ {"video.rss", "LINK@/href", "alternate"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
+ {"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
+ };
+ checkLinks(extractor.getNext(), html5links);
+ String[][] html5links2 = {
+ {"http://www.example.com/", "A@/href"},
+ };
+ checkLinks(extractor.getNext(), html5links2);
+ String[][] fbVideoLinks = {
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
+ {"https://www.facebook.com/facebook/", "A@/href"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
+ };
+ checkLinks(extractor.getNext(), fbVideoLinks);
+ String[][] dataHrefLinks = {
+ {"standard.css", "LINK@/href", "stylesheet"},
+ {"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
+ {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
+ {"https://www.facebook.com/facebook/", "A@/href"},
+ {"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"},
+ {"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"},
+ {"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"},
+ {"/content-page", "ARTICLE@/data-href"},
+ {"/content-page", "A@/href"},
+ {"/tags/content","A@/href"},
+ {"/tags/headlines", "A@/href"},
+ {"http://grabaperch.com", "DIV@/data-href"},
+ {"green.css", "LINK@/data-href"},
+ {"blue.css", "LINK@/data-href"},
+ {"http://codecanyon.net/user/CodingJack", "A@/data-href"},
+ {"jackbox/img/thumbs/4.jpg", "IMG@/src"},
+ {"//venobox-destination", "A@/data-href"},
+ {"#", "A@/href"},
+ {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"},
+ {"#", "A@/href"},
+ {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
+ };
+ checkLinks(extractor.getNext(), dataHrefLinks);
+ String[][] fbSocialLinks = {
+ {"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
+ {"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
+ {"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"},
+ {"https://www.facebook.com/zuck", "DIV@/data-href"},
+ {"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook", "DIV@/data-href"},
+ {"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"},
+ {"https://www.facebook.com/facebook", "A@/href"},
+ {"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
+ };
+ checkLinks(extractor.getNext(), fbSocialLinks);
+ }
}
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
new file mode 100644
index 00000000..ab0e54c8
--- /dev/null
+++ b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -0,0 +1,320 @@
+WARC/1.0
+WARC-Type: warcinfo
+Content-Type: application/warc-fields
+WARC-Date: 2017-02-20T14:00:56Z
+Content-Length: 128
+
+format: WARC File Format 1.0
+conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
+robots: classic
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2017-02-20T14:00:56Z
+WARC-Target-URI: http://www.example.com/html4.html
+Content-Type: application/http; msgtype=response
+Content-Length: 1243
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 14:00:56 GMT
+Content-Length: 1125
+Content-Type: application/xhtml+xml
+
+
+
+
+
+
+
+Test XHTML Link Extraction
+
+
+A@/href
+
+ anchor only
+
+
+
+
+
+ To be or not to be.
+
+
+To be, or not to be, that is the question:
+Whether 'tis nobler in the mind to suffer
+The slings and arrows of outrageous fortune, …
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/link-extraction-test-html5-video.html
+WARC-Date: 2017-02-20T21:35:03Z
+Content-Type: application/http; msgtype=response
+Content-Length: 890
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 21:35:03 GMT
+Content-Length: 789
+Content-Type: text/html
+
+
+
+
+Test HTML5 Video Tag
+
+
+
+
+
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/poor_html5.html
+WARC-Date: 2017-02-21T15:50:40Z
+Content-Type: application/http; msgtype=response
+Content-Length: 594
+
+HTTP/1.1 200 OK
+Date: Tue, 21 Feb 2017 15:50:40 GMT
+Content-Length: 486
+Content-Type: text/html
+
+
+Testing poor HTML5
+
+
+
+
+
+This is valid HTML5!
+
+
+
+
+
+headline
+
+paragraph one with link.
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/fb-video.html
+WARC-Date: 2017-02-20T16:58:50Z
+Content-Type: application/http; msgtype=response
+Content-Length: 1330
+
+HTTP/1.1 200 OK
+Date: Mon, 20 Feb 2017 16:58:50 GMT
+Content-Length: 1194
+Content-Type: text/html
+
+
+
+
+ fb-video - Embedded Videos - Social Plugins
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/data-href.examples.html
+WARC-Date: 2017-02-21T21:05:10Z
+Content-Type: application/http; msgtype=response
+Content-Length: 3160
+
+HTTP/1.1 200 OK
+Date: Tue, 21 Feb 2017 21:05:10 GMT
+Content-Length: 3057
+Content-Type: text/html
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ And here goes a bit of copy about the content of the article.
+ Tags: content, headlines
+
+
+
+
+
+
+
+
+
+
+
+venobox
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/fb-social-plugins.html
+WARC-Date: 2017-02-22T09:33:02Z
+Content-Type: application/http; msgtype=response
+Content-Length: 1870
+
+HTTP/1.1 200 OK
+Date: Wed, 22 Feb 2017 09:33:02 GMT
+Content-Length: 1767
+Content-Type: text/html
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+