Skip to content

Commit f12e86f

Browse files
committed
Merge branch 'wat-improved-link-extraction' of https://github.com/sebastian-nagel/webarchive-commons into sebastian-nagel-wat-improved-link-extraction
2 parents 59f8d26 + 11579c2 commit f12e86f

3 files changed

Lines changed: 551 additions & 9 deletions

File tree

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 70 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,17 @@
22

33
import java.util.ArrayList;
44
import java.util.HashMap;
5+
import java.util.HashSet;
6+
import java.util.Locale;
57
import java.util.Map;
8+
import java.util.Set;
69
import java.util.Stack;
10+
import java.util.Vector;
711
import java.util.regex.Matcher;
812
import java.util.regex.Pattern;
913

1014
import org.archive.format.text.html.ParseObserver;
15+
import org.htmlparser.Attribute;
1116
import org.htmlparser.nodes.RemarkNode;
1217
import org.htmlparser.nodes.TagNode;
1318
import org.htmlparser.nodes.TextNode;
@@ -36,11 +41,10 @@ public class ExtractingParseObserver implements ParseObserver {
3641

3742
private final static int MAX_TEXT_LEN = 100;
3843

39-
// private static String GLOBAL_ATTR[] = {"background"};
40-
4144
private static final String PATH = "path";
4245
private static final String PATH_SEPARATOR = "@/";
43-
private final static Map<String, TagExtractor> extractors;
46+
private static final Map<String, TagExtractor> extractors;
47+
private static final Set<String> globalHrefAttributes;
4448
static {
4549
extractors = new HashMap<String,ExtractingParseObserver.TagExtractor>();
4650
extractors.put("A", new AnchorTagExtractor());
@@ -57,6 +61,22 @@ public class ExtractingParseObserver implements ParseObserver {
5761
extractors.put("META", new MetaTagExtractor());
5862
extractors.put("OBJECT", new ObjectTagExtractor());
5963
extractors.put("SCRIPT", new ScriptTagExtractor());
64+
extractors.put("Q", new QuotationLinkTagExtractor());
65+
extractors.put("BLOCKQUOTE", new QuotationLinkTagExtractor());
66+
extractors.put("DEL", new QuotationLinkTagExtractor());
67+
extractors.put("INS", new QuotationLinkTagExtractor());
68+
// HTML5:
69+
extractors.put("BUTTON", new ButtonTagExtractor());
70+
extractors.put("MENUITEM", new MenuitemTagExtractor());
71+
extractors.put("VIDEO", new EmbedVideoTagExtractor());
72+
extractors.put("AUDIO", new EmbedTagExtractor());
73+
extractors.put("TRACK", new EmbedTagExtractor());
74+
extractors.put("SOURCE", new EmbedTagExtractor());
75+
76+
globalHrefAttributes = new HashSet<String>();
77+
globalHrefAttributes.add("background");
78+
globalHrefAttributes.add("data-href");
79+
globalHrefAttributes.add("data-uri");
6080
}
6181

6282

@@ -84,11 +104,19 @@ public void handleTagOpen(TagNode tag) {
84104
inTitle = !tag.isEmptyXmlTag();
85105
return;
86106
}
107+
87108
// first the global attributes:
88-
// background
89-
String v = tag.getAttribute("background");
90-
if(v != null) {
91-
data.addHref(PATH,makePath(name,"background"),"url",v);
109+
Vector<Attribute> attributes = tag.getAttributesEx();
110+
for (Attribute a : attributes) {
111+
String attrName = a.getName();
112+
String attrValue = a.getValue();
113+
if (attrName == null || attrValue == null) {
114+
continue;
115+
}
116+
attrName = attrName.toLowerCase(Locale.ROOT);
117+
if (globalHrefAttributes.contains(attrName)) {
118+
data.addHref(PATH,makePath(name,attrName),"url",attrValue);
119+
}
92120
}
93121
// TODO: style attribute, BASE(href) tag, Resolve URLs
94122

@@ -296,12 +324,24 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
296324
}
297325
}
298326

327+
private static class ButtonTagExtractor implements TagExtractor {
328+
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
329+
addBasicHrefs(data,node,"formaction");
330+
}
331+
}
332+
299333
private static class EmbedTagExtractor implements TagExtractor {
300334
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
301335
addBasicHrefs(data,node,"src");
302336
}
303337
}
304338

339+
private static class EmbedVideoTagExtractor implements TagExtractor {
340+
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
341+
addBasicHrefs(data,node,"src","poster");
342+
}
343+
}
344+
305345
private static class FormTagExtractor implements TagExtractor {
306346
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
307347
ArrayList<String> l = new ArrayList<String>();
@@ -329,21 +369,26 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
329369
addBasicHrefs(data,node,"src");
330370
}
331371
}
372+
332373
private static class IFrameTagExtractor implements TagExtractor {
333374
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
334375
addBasicHrefs(data,node,"src");
335376
}
336377
}
378+
337379
private static class ImgTagExtractor implements TagExtractor {
338380
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
339381
addHrefWithAttrs(data,node,"src","alt","title");
382+
addBasicHrefs(data,node,"longdesc");
340383
}
341384
}
385+
342386
private static class InputTagExtractor implements TagExtractor {
343387
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
344-
addBasicHrefs(data,node,"src");
388+
addBasicHrefs(data,node,"src","formaction");
345389
}
346390
}
391+
347392
private static class LinkTagExtractor implements TagExtractor {
348393
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
349394
ArrayList<String> l = getAttrListUrl(node,"href","rel","type");
@@ -352,6 +397,13 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
352397
}
353398
}
354399
}
400+
401+
private static class MenuitemTagExtractor implements TagExtractor {
402+
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
403+
addBasicHrefs(data,node,"icon");
404+
}
405+
}
406+
355407
private static class MetaTagExtractor implements TagExtractor {
356408
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
357409
ArrayList<String> l = getAttrList(node,"name","rel","content","http-equiv");
@@ -360,11 +412,19 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
360412
}
361413
}
362414
}
415+
363416
private static class ObjectTagExtractor implements TagExtractor {
364417
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
365-
addBasicHrefs(data,node,"codebase","cdata");
418+
addBasicHrefs(data,node,"codebase","cdata","data");
366419
}
367420
}
421+
422+
private static class QuotationLinkTagExtractor implements TagExtractor {
423+
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
424+
addBasicHrefs(data,node,"cite");
425+
}
426+
}
427+
368428
private static class ScriptTagExtractor implements TagExtractor {
369429
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
370430
ArrayList<String> l = getAttrListUrl(node,"src","type");
@@ -373,6 +433,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
373433
}
374434
}
375435
}
436+
376437
private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String content) {
377438
Matcher m = pattern.matcher(content);
378439
int idx = 0;

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,33 @@
11
package org.archive.resource.html;
22

3+
import java.io.IOException;
4+
import java.util.ArrayList;
5+
import java.util.List;
6+
import java.util.logging.Logger;
7+
8+
import org.archive.extract.ExtractingResourceFactoryMapper;
9+
import org.archive.extract.ExtractingResourceProducer;
10+
import org.archive.extract.ProducerUtils;
11+
import org.archive.extract.ResourceFactoryMapper;
312
import org.archive.resource.MetaData;
13+
import org.archive.resource.Resource;
14+
import org.archive.resource.ResourceParseException;
15+
import org.archive.resource.ResourceProducer;
416
import org.htmlparser.nodes.TextNode;
517
import org.json.JSONArray;
618
import org.json.JSONException;
719
import org.json.JSONObject;
820

21+
import com.google.common.collect.ArrayListMultimap;
22+
import com.google.common.collect.Multimap;
23+
924
import junit.framework.TestCase;
1025

1126
public class ExtractingParseObserverTest extends TestCase {
1227

28+
private static final Logger LOG =
29+
Logger.getLogger(ExtractingParseObserverTest.class.getName());
30+
1331
public void testHandleStyleNodeExceptions() throws Exception {
1432
String[] tests = {
1533
"some css",
@@ -103,5 +121,148 @@ private void checkExtract(String[] data) throws JSONException {
103121
}
104122
}
105123

124+
private void checkLink(Multimap<String,String> links, String url, String path) {
125+
assertTrue("Link with URL " + url + " not found", links.containsKey(url));
126+
assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path));
127+
}
128+
129+
private void checkLinks(Resource resource, String[][] expectedLinks) {
130+
assertNotNull(resource);
131+
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
132+
MetaData md = resource.getMetaData();
133+
LOG.info(md.toString());
134+
Multimap<String, String> links = ArrayListMultimap.create();
135+
JSONObject head = md.optJSONObject("Head");
136+
if (head != null) {
137+
// <base href="http://www.example.com/" />
138+
String baseUrl = (String) head.opt("Base");
139+
if (baseUrl != null) {
140+
links.put(baseUrl, "__base__");
141+
}
142+
// <meta http-equiv="Refresh" content="5; URL=http://www.example.com/redirected.html" />
143+
JSONArray metas = head.optJSONArray("Metas");
144+
if (metas != null) {
145+
for (int i = 0; i < metas.length(); i++) {
146+
JSONObject o = (JSONObject) metas.optJSONObject(i);
147+
String httpEquiv = o.optString("http-equiv");
148+
if (httpEquiv != null && httpEquiv.equalsIgnoreCase("Refresh")) {
149+
String metaRefreshTarget = o.optString("content");
150+
if (metaRefreshTarget != null) {
151+
metaRefreshTarget = metaRefreshTarget.replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", "");
152+
links.put(metaRefreshTarget, "__meta_refresh__");
153+
}
154+
}
155+
}
156+
}
157+
}
158+
// extract outlinks
159+
List<JSONArray> linkArrays = new ArrayList<JSONArray>();
160+
if (md.optJSONArray("Links") != null) {
161+
linkArrays.add(md.optJSONArray("Links"));
162+
}
163+
try {
164+
if (md.getJSONObject("Head") != null && md.getJSONObject("Head").getJSONArray("Link") != null) {
165+
linkArrays.add(md.getJSONObject("Head").getJSONArray("Link"));
166+
}
167+
} catch (JSONException e1) {
168+
}
169+
for (JSONArray ldata : linkArrays) {
170+
for (int i = 0; i < ldata.length(); i++) {
171+
JSONObject o = (JSONObject) ldata.optJSONObject(i);
172+
try {
173+
String url = o.getString("url");
174+
links.put(url, o.getString("path"));
175+
LOG.info(" found link: " + o.getString("url") + " " + o.getString("path"));
176+
} catch (JSONException e) {
177+
fail("Failed to extract URL from link: " + e.getMessage());
178+
}
179+
}
180+
}
181+
assertEquals("Unexpected number of links", expectedLinks.length, links.size());
182+
for (String[] l : expectedLinks) {
183+
checkLink(links, l[0], l[1]);
184+
}
185+
}
186+
187+
public void testLinkExtraction() throws ResourceParseException, IOException {
188+
String testFileName = "link-extraction-test.warc";
189+
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
190+
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
191+
ExtractingResourceProducer extractor =
192+
new ExtractingResourceProducer(producer, mapper);
193+
extractor.getNext(); // skip warcinfo record
194+
String[][] html4links = {
195+
{"http://www.example.com/", "__base__"},
196+
{"http://www.example.com/redirected.html", "__meta_refresh__"},
197+
{"background.jpg", "BODY@/background"},
198+
{"http://www.example.com/a-href.html", "A@/href"},
199+
{"#anchor", "A@/href"},
200+
{"image.png", "IMG@/src"},
201+
{"image.gif", "IMG@/src"},
202+
{"http://example.com/image-description.html#image.gif", "IMG@/longdesc"},
203+
{"helloworld.swf", "OBJECT@/data"},
204+
{"http://www.example.com/shakespeare.html", "Q@/cite"},
205+
{"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
206+
};
207+
checkLinks(extractor.getNext(), html4links);
208+
String[][] html5links = {
209+
{"http:///www.example.com/video.html", "LINK@/href", "canonical"},
210+
{"video.rss", "LINK@/href", "alternate"},
211+
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"},
212+
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"},
213+
{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
214+
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
215+
};
216+
checkLinks(extractor.getNext(), html5links);
217+
String[][] html5links2 = {
218+
{"http://www.example.com/", "A@/href"},
219+
};
220+
checkLinks(extractor.getNext(), html5links2);
221+
String[][] fbVideoLinks = {
222+
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
223+
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
224+
{"https://www.facebook.com/facebook/", "A@/href"},
225+
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
226+
};
227+
checkLinks(extractor.getNext(), fbVideoLinks);
228+
String[][] dataHrefLinks = {
229+
{"standard.css", "LINK@/href", "stylesheet"},
230+
{"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
231+
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"},
232+
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
233+
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
234+
{"https://www.facebook.com/facebook/", "A@/href"},
235+
{"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"},
236+
{"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"},
237+
{"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"},
238+
{"/content-page", "ARTICLE@/data-href"},
239+
{"/content-page", "A@/href"},
240+
{"/tags/content","A@/href"},
241+
{"/tags/headlines", "A@/href"},
242+
{"http://grabaperch.com", "DIV@/data-href"},
243+
{"green.css", "LINK@/data-href"},
244+
{"blue.css", "LINK@/data-href"},
245+
{"http://codecanyon.net/user/CodingJack", "A@/data-href"},
246+
{"jackbox/img/thumbs/4.jpg", "IMG@/src"},
247+
{"//venobox-destination", "A@/data-href"},
248+
{"#", "A@/href"},
249+
{"http://www.youtube.com/v/itTskyFLSS8&amp;rel=0&amp;autohide=1&amp;showinfo=0&amp;autoplay=1", "DIV@/data-href"},
250+
{"#", "A@/href"},
251+
{"http://www.youtube.com/v/itTskyFLSS8&amp;rel=0&amp;autohide=1&amp;showinfo=0", "IFRAME@/src"}
252+
};
253+
checkLinks(extractor.getNext(), dataHrefLinks);
254+
String[][] fbSocialLinks = {
255+
{"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
256+
{"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
257+
{"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"},
258+
{"https://www.facebook.com/zuck", "DIV@/data-href"},
259+
{"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"},
260+
{"https://www.facebook.com/facebook", "DIV@/data-href"},
261+
{"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"},
262+
{"https://www.facebook.com/facebook", "A@/href"},
263+
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
264+
};
265+
checkLinks(extractor.getNext(), fbSocialLinks);
266+
}
106267

107268
}

0 commit comments

Comments
 (0)