Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.format.text.html.ParseObserver;
import org.htmlparser.Attribute;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
Expand Down Expand Up @@ -36,11 +41,10 @@ public class ExtractingParseObserver implements ParseObserver {

private static final int MAX_TEXT_LEN = 100;

/** First argument handed to {@code data.addHref} when a link path is recorded. */
private static final String PATH = "path";
// The expected link paths in the unit test have the form "A@/href", i.e.
// tag name and attribute name joined by this separator
// (presumably assembled by makePath -- confirm there).
private static final String PATH_SEPARATOR = "@/";
/**
 * Tag-specific link extractors. The static initializer registers them
 * under upper-case tag names ("A", "IMG", ...).
 */
private static final Map<String, TagExtractor> extractors;
/**
 * Attribute names that are checked for a URL on every tag, regardless of
 * the tag name. Names are stored in lower case; handleTagOpen lower-cases
 * attribute names before looking them up here.
 */
private static final Set<String> globalHrefAttributes;
static {
extractors = new HashMap<String,ExtractingParseObserver.TagExtractor>();
extractors.put("A", new AnchorTagExtractor());
Expand All @@ -57,6 +61,22 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("META", new MetaTagExtractor());
extractors.put("OBJECT", new ObjectTagExtractor());
extractors.put("SCRIPT", new ScriptTagExtractor());
extractors.put("Q", new QuotationLinkTagExtractor());
extractors.put("BLOCKQUOTE", new QuotationLinkTagExtractor());
extractors.put("DEL", new QuotationLinkTagExtractor());
extractors.put("INS", new QuotationLinkTagExtractor());
// HTML5:
extractors.put("BUTTON", new ButtonTagExtractor());
extractors.put("MENUITEM", new MenuitemTagExtractor());
extractors.put("VIDEO", new EmbedVideoTagExtractor());
extractors.put("AUDIO", new EmbedTagExtractor());
extractors.put("TRACK", new EmbedTagExtractor());
extractors.put("SOURCE", new EmbedTagExtractor());

globalHrefAttributes = new HashSet<String>();
globalHrefAttributes.add("background");
globalHrefAttributes.add("data-href");
globalHrefAttributes.add("data-uri");
}


Expand Down Expand Up @@ -84,11 +104,19 @@ public void handleTagOpen(TagNode tag) {
inTitle = !tag.isEmptyXmlTag();
return;
}

// first the global attributes:
// background
String v = tag.getAttribute("background");
if(v != null) {
data.addHref(PATH,makePath(name,"background"),"url",v);
Vector<Attribute> attributes = tag.getAttributesEx();
for (Attribute a : attributes) {
String attrName = a.getName();
String attrValue = a.getValue();
if (attrName == null || attrValue == null) {
continue;
}
attrName = attrName.toLowerCase(Locale.ROOT);
if (globalHrefAttributes.contains(attrName)) {
data.addHref(PATH,makePath(name,attrName),"url",attrValue);
}
}
// TODO: style attribute, BASE(href) tag, Resolve URLs

Expand Down Expand Up @@ -296,12 +324,24 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}

/**
 * Extractor registered for {@code BUTTON} tags in the static initializer.
 * Hands the {@code formaction} attribute of the tag to
 * {@code addBasicHrefs}.
 */
private static class ButtonTagExtractor implements TagExtractor {
// obs is part of the TagExtractor signature but is not used here
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"formaction");
}
}

/**
 * Extractor for tags that reference an embedded resource through a
 * {@code src} attribute. The static initializer registers it for (at
 * least) {@code AUDIO}, {@code TRACK} and {@code SOURCE}.
 */
private static class EmbedTagExtractor implements TagExtractor {
// obs is part of the TagExtractor signature but is not used here
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}

/**
 * Extractor registered for {@code VIDEO} tags in the static initializer.
 * Hands both the {@code src} and the {@code poster} attribute to
 * {@code addBasicHrefs}.
 */
private static class EmbedVideoTagExtractor implements TagExtractor {
// obs is part of the TagExtractor signature but is not used here
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src","poster");
}
}

private static class FormTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList<String> l = new ArrayList<String>();
Expand Down Expand Up @@ -329,21 +369,26 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
addBasicHrefs(data,node,"src");
}
}

/**
 * Extractor that hands the {@code src} attribute of a tag to
 * {@code addBasicHrefs}.
 * NOTE(review): presumably registered for IFRAME tags (the unit test
 * expects the path "IFRAME@/src"); the registration itself is not in
 * view here -- confirm in the static initializer.
 */
private static class IFrameTagExtractor implements TagExtractor {
// obs is part of the TagExtractor signature but is not used here
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"src");
}
}

/**
 * Extractor for image tags. The {@code src} attribute goes through
 * {@code addHrefWithAttrs} together with {@code alt} and {@code title};
 * the {@code longdesc} attribute goes through {@code addBasicHrefs}.
 * NOTE(review): presumably alt/title are stored as additional fields of
 * the src link -- confirm in addHrefWithAttrs.
 */
private static class ImgTagExtractor implements TagExtractor {
// obs is part of the TagExtractor signature but is not used here
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addHrefWithAttrs(data,node,"src","alt","title");
// longdesc holds a URL of its own (the unit test expects "IMG@/longdesc")
addBasicHrefs(data,node,"longdesc");
}
}

/**
 * Extractor for input tags. Hands the {@code src} and the
 * {@code formaction} attribute to {@code addBasicHrefs}.
 */
private static class InputTagExtractor implements TagExtractor {
    // obs is part of the TagExtractor signature but is not used here
    public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
        // One call covers both attributes; a separate call for "src" alone
        // would pass the same attribute to addBasicHrefs a second time.
        addBasicHrefs(data, node, "src", "formaction");
    }
}

private static class LinkTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList<String> l = getAttrListUrl(node,"href","rel","type");
Expand All @@ -352,6 +397,13 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}

/**
 * Extractor registered for {@code MENUITEM} tags in the static
 * initializer. Hands the {@code icon} attribute to {@code addBasicHrefs}.
 */
private static class MenuitemTagExtractor implements TagExtractor {
// obs is part of the TagExtractor signature but is not used here
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"icon");
}
}

private static class MetaTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList<String> l = getAttrList(node,"name","rel","content","http-equiv");
Expand All @@ -360,11 +412,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}

/**
 * Extractor registered for {@code OBJECT} tags in the static initializer.
 * Hands the {@code codebase}, {@code cdata} and {@code data} attributes
 * to {@code addBasicHrefs}.
 */
private static class ObjectTagExtractor implements TagExtractor {
    // obs is part of the TagExtractor signature but is not used here
    public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
        // One call covers all three attributes; a preceding call for
        // "codebase" and "cdata" alone would pass those two attributes to
        // addBasicHrefs a second time.
        addBasicHrefs(data, node, "codebase", "cdata", "data");
    }
}

/**
 * Extractor for tags that point to their source through a {@code cite}
 * attribute. The static initializer registers it for {@code Q},
 * {@code BLOCKQUOTE}, {@code DEL} and {@code INS}.
 */
private static class QuotationLinkTagExtractor implements TagExtractor {
// obs is part of the TagExtractor signature but is not used here
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
addBasicHrefs(data,node,"cite");
}
}

private static class ScriptTagExtractor implements TagExtractor {
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
ArrayList<String> l = getAttrListUrl(node,"src","type");
Expand All @@ -373,6 +433,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
}

private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
Matcher m = pattern.matcher(content);
int idx = 0;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,33 @@
package org.archive.resource.html;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;

import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ProducerUtils;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.resource.ResourceParseException;
import org.archive.resource.ResourceProducer;
import org.htmlparser.nodes.TextNode;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;

import junit.framework.TestCase;

public class ExtractingParseObserverTest extends TestCase {

private static final Logger LOG =
Logger.getLogger(ExtractingParseObserverTest.class.getName());

public void testHandleStyleNodeExceptions() throws Exception {
String[] tests = {
"some css",
Expand Down Expand Up @@ -103,5 +121,148 @@ private void checkExtract(String[] data) throws JSONException {
}
}

/**
 * Fails the test unless {@code links} has an entry for {@code url} and
 * {@code path} is among the paths stored for that URL.
 *
 * @param links extracted links, URL mapped to the paths it was found under
 * @param url   URL that must be present
 * @param path  path that must be recorded for {@code url}
 */
private void checkLink(Multimap<String,String> links, String url, String path) {
    String urlMissing = "Link with URL " + url + " not found";
    String pathMismatch = "Wrong path " + path + " for " + url;
    if (!links.containsKey(url)) {
        fail(urlMissing);
    }
    if (!links.get(url).contains(path)) {
        fail(pathMismatch);
    }
}

/**
 * Gathers all links found in the metadata of {@code resource} and compares
 * them with {@code expectedLinks}.
 * <p>
 * Links are taken from four places of the metadata: the {@code Head.Base}
 * URL (stored under the pseudo path {@code __base__}), {@code Head.Metas}
 * entries whose {@code http-equiv} is "Refresh" (pseudo path
 * {@code __meta_refresh__}), the top-level {@code Links} array and the
 * {@code Head.Link} array. The total number of collected (url, path) pairs
 * must equal {@code expectedLinks.length}, and every expected pair must be
 * present.
 *
 * @param resource      parsed resource, must be an {@code HTMLResource}
 * @param expectedLinks rows of expected links; element 0 is the URL and
 *                      element 1 the path, further elements are ignored
 */
private void checkLinks(Resource resource, String[][] expectedLinks) {
    assertNotNull(resource);
    assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
    MetaData md = resource.getMetaData();
    LOG.info(md.toString());
    Multimap<String, String> links = ArrayListMultimap.create();
    List<JSONArray> linkArrays = new ArrayList<JSONArray>();

    // outlinks recorded at the top level of the metadata
    JSONArray bodyLinks = md.optJSONArray("Links");
    if (bodyLinks != null) {
        linkArrays.add(bodyLinks);
    }

    JSONObject head = md.optJSONObject("Head");
    if (head != null) {
        // <base href="http://www.example.com/" />
        String baseUrl = (String) head.opt("Base");
        if (baseUrl != null) {
            links.put(baseUrl, "__base__");
        }
        // <meta http-equiv="Refresh" content="5; URL=http://www.example.com/redirected.html" />
        JSONArray metas = head.optJSONArray("Metas");
        if (metas != null) {
            for (int i = 0; i < metas.length(); i++) {
                JSONObject meta = metas.optJSONObject(i);
                if (meta == null) {
                    // not a JSON object, so it cannot describe a meta refresh
                    continue;
                }
                if ("Refresh".equalsIgnoreCase(meta.optString("http-equiv"))) {
                    String metaRefreshTarget = meta.optString("content");
                    if (metaRefreshTarget != null) {
                        // strip the optional "<seconds>;" delay and the "url=" prefix
                        metaRefreshTarget = metaRefreshTarget.replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", "");
                        links.put(metaRefreshTarget, "__meta_refresh__");
                    }
                }
            }
        }
        // <link> elements of the head; the opt* lookup returns null when the
        // array is absent, so no exception needs to be caught and ignored
        JSONArray headLinks = head.optJSONArray("Link");
        if (headLinks != null) {
            linkArrays.add(headLinks);
        }
    }

    for (JSONArray ldata : linkArrays) {
        for (int i = 0; i < ldata.length(); i++) {
            JSONObject link = ldata.optJSONObject(i);
            assertNotNull("Link entry " + i + " is not a JSON object", link);
            try {
                String url = link.getString("url");
                String path = link.getString("path");
                links.put(url, path);
                LOG.info(" found link: " + url + " " + path);
            } catch (JSONException e) {
                fail("Failed to extract URL from link: " + e.getMessage());
            }
        }
    }

    assertEquals("Unexpected number of links", expectedLinks.length, links.size());
    for (String[] l : expectedLinks) {
        checkLink(links, l[0], l[1]);
    }
}

/**
 * Runs the extracting producer over the records of
 * "link-extraction-test.warc" and checks, record by record, that the
 * extracted links match the expected {url, path} rows below.
 * <p>
 * The records are consumed strictly in file order through successive
 * {@code extractor.getNext()} calls, so the order of the checkLinks calls
 * must follow the order of the records in the WARC file. Because
 * checkLinks also compares the total number of links, a (url, path) pair
 * that is expected several times is listed once per occurrence.
 */
public void testLinkExtraction() throws ResourceParseException, IOException {
String testFileName = "link-extraction-test.warc";
// the WARC file is looked up as a resource relative to this test class
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
ExtractingResourceProducer extractor =
new ExtractingResourceProducer(producer, mapper);
extractor.getNext(); // skip warcinfo record
// "__base__" and "__meta_refresh__" are the pseudo paths checkLinks assigns
// to the Head.Base URL and to meta refresh targets
String[][] html4links = {
{"http://www.example.com/", "__base__"},
{"http://www.example.com/redirected.html", "__meta_refresh__"},
{"background.jpg", "BODY@/background"},
{"http://www.example.com/a-href.html", "A@/href"},
{"#anchor", "A@/href"},
{"image.png", "IMG@/src"},
{"image.gif", "IMG@/src"},
{"http://example.com/image-description.html#image.gif", "IMG@/longdesc"},
{"helloworld.swf", "OBJECT@/data"},
{"http://www.example.com/shakespeare.html", "Q@/cite"},
{"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
};
checkLinks(extractor.getNext(), html4links);
// A third element (the rel value of a LINK) is informational only:
// checkLinks reads just elements 0 and 1 of each row.
// NOTE(review): "http:///www.example.com/video.html" has three slashes --
// presumably verbatim from the test record; confirm against the WARC file.
String[][] html5links = {
{"http:///www.example.com/video.html", "LINK@/href", "canonical"},
{"video.rss", "LINK@/href", "alternate"},
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"},
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"},
{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
};
checkLinks(extractor.getNext(), html5links);
String[][] html5links2 = {
{"http://www.example.com/", "A@/href"},
};
checkLinks(extractor.getNext(), html5links2);
// links found via the global "data-href" attribute are reported under the
// tag that carries the attribute (here DIV)
String[][] fbVideoLinks = {
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
{"https://www.facebook.com/facebook/", "A@/href"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbVideoLinks);
// The expected URLs below contain literal "&amp;" sequences, i.e. the
// attribute values are expected without entity decoding.
String[][] dataHrefLinks = {
{"standard.css", "LINK@/href", "stylesheet"},
{"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
{"https://www.facebook.com/facebook/", "A@/href"},
{"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"},
{"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"},
{"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"},
{"/content-page", "ARTICLE@/data-href"},
{"/content-page", "A@/href"},
{"/tags/content","A@/href"},
{"/tags/headlines", "A@/href"},
{"http://grabaperch.com", "DIV@/data-href"},
{"green.css", "LINK@/data-href"},
{"blue.css", "LINK@/data-href"},
{"http://codecanyon.net/user/CodingJack", "A@/data-href"},
{"jackbox/img/thumbs/4.jpg", "IMG@/src"},
{"//venobox-destination", "A@/data-href"},
{"#", "A@/href"},
{"http://www.youtube.com/v/itTskyFLSS8&amp;rel=0&amp;autohide=1&amp;showinfo=0&amp;autoplay=1", "DIV@/data-href"},
{"#", "A@/href"},
{"http://www.youtube.com/v/itTskyFLSS8&amp;rel=0&amp;autohide=1&amp;showinfo=0", "IFRAME@/src"}
};
checkLinks(extractor.getNext(), dataHrefLinks);
// covers the global "data-uri" attribute in addition to "data-href"
String[][] fbSocialLinks = {
{"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
{"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
{"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"},
{"https://www.facebook.com/zuck", "DIV@/data-href"},
{"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"},
{"https://www.facebook.com/facebook", "DIV@/data-href"},
{"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"},
{"https://www.facebook.com/facebook", "A@/href"},
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
};
checkLinks(extractor.getNext(), fbSocialLinks);
}

}
Loading