|
1 | 1 | package org.archive.resource.html; |
2 | 2 |
|
| 3 | +import java.io.IOException; |
| 4 | +import java.util.ArrayList; |
| 5 | +import java.util.List; |
| 6 | +import java.util.logging.Logger; |
| 7 | + |
| 8 | +import org.archive.extract.ExtractingResourceFactoryMapper; |
| 9 | +import org.archive.extract.ExtractingResourceProducer; |
| 10 | +import org.archive.extract.ProducerUtils; |
| 11 | +import org.archive.extract.ResourceFactoryMapper; |
3 | 12 | import org.archive.resource.MetaData; |
| 13 | +import org.archive.resource.Resource; |
| 14 | +import org.archive.resource.ResourceParseException; |
| 15 | +import org.archive.resource.ResourceProducer; |
4 | 16 | import org.htmlparser.nodes.TextNode; |
5 | 17 | import org.json.JSONArray; |
6 | 18 | import org.json.JSONException; |
7 | 19 | import org.json.JSONObject; |
8 | 20 |
|
| 21 | +import com.google.common.collect.ArrayListMultimap; |
| 22 | +import com.google.common.collect.Multimap; |
| 23 | + |
9 | 24 | import junit.framework.TestCase; |
10 | 25 |
|
11 | 26 | public class ExtractingParseObserverTest extends TestCase { |
12 | 27 |
|
| 28 | + private static final Logger LOG = |
| 29 | + Logger.getLogger(ExtractingParseObserverTest.class.getName()); |
| 30 | + |
13 | 31 | public void testHandleStyleNodeExceptions() throws Exception { |
14 | 32 | String[] tests = { |
15 | 33 | "some css", |
@@ -103,5 +121,148 @@ private void checkExtract(String[] data) throws JSONException { |
103 | 121 | } |
104 | 122 | } |
105 | 123 |
|
| 124 | + private void checkLink(Multimap<String,String> links, String url, String path) { |
| 125 | + assertTrue("Link with URL " + url + " not found", links.containsKey(url)); |
| 126 | + assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path)); |
| 127 | + } |
| 128 | + |
| 129 | + private void checkLinks(Resource resource, String[][] expectedLinks) { |
| 130 | + assertNotNull(resource); |
| 131 | + assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource); |
| 132 | + MetaData md = resource.getMetaData(); |
| 133 | + LOG.info(md.toString()); |
| 134 | + Multimap<String, String> links = ArrayListMultimap.create(); |
| 135 | + JSONObject head = md.optJSONObject("Head"); |
| 136 | + if (head != null) { |
| 137 | + // <base href="http://www.example.com/" /> |
| 138 | + String baseUrl = (String) head.opt("Base"); |
| 139 | + if (baseUrl != null) { |
| 140 | + links.put(baseUrl, "__base__"); |
| 141 | + } |
| 142 | + // <meta http-equiv="Refresh" content="5; URL=http://www.example.com/redirected.html" /> |
| 143 | + JSONArray metas = head.optJSONArray("Metas"); |
| 144 | + if (metas != null) { |
| 145 | + for (int i = 0; i < metas.length(); i++) { |
| 146 | + JSONObject o = (JSONObject) metas.optJSONObject(i); |
| 147 | + String httpEquiv = o.optString("http-equiv"); |
| 148 | + if (httpEquiv != null && httpEquiv.equalsIgnoreCase("Refresh")) { |
| 149 | + String metaRefreshTarget = o.optString("content"); |
| 150 | + if (metaRefreshTarget != null) { |
| 151 | + metaRefreshTarget = metaRefreshTarget.replaceFirst("(?i)(?:^\\d+\\s*;)?\\s*url=", ""); |
| 152 | + links.put(metaRefreshTarget, "__meta_refresh__"); |
| 153 | + } |
| 154 | + } |
| 155 | + } |
| 156 | + } |
| 157 | + } |
| 158 | + // extract outlinks |
| 159 | + List<JSONArray> linkArrays = new ArrayList<JSONArray>(); |
| 160 | + if (md.optJSONArray("Links") != null) { |
| 161 | + linkArrays.add(md.optJSONArray("Links")); |
| 162 | + } |
| 163 | + try { |
| 164 | + if (md.getJSONObject("Head") != null && md.getJSONObject("Head").getJSONArray("Link") != null) { |
| 165 | + linkArrays.add(md.getJSONObject("Head").getJSONArray("Link")); |
| 166 | + } |
| 167 | + } catch (JSONException e1) { |
| 168 | + } |
| 169 | + for (JSONArray ldata : linkArrays) { |
| 170 | + for (int i = 0; i < ldata.length(); i++) { |
| 171 | + JSONObject o = (JSONObject) ldata.optJSONObject(i); |
| 172 | + try { |
| 173 | + String url = o.getString("url"); |
| 174 | + links.put(url, o.getString("path")); |
| 175 | + LOG.info(" found link: " + o.getString("url") + " " + o.getString("path")); |
| 176 | + } catch (JSONException e) { |
| 177 | + fail("Failed to extract URL from link: " + e.getMessage()); |
| 178 | + } |
| 179 | + } |
| 180 | + } |
| 181 | + assertEquals("Unexpected number of links", expectedLinks.length, links.size()); |
| 182 | + for (String[] l : expectedLinks) { |
| 183 | + checkLink(links, l[0], l[1]); |
| 184 | + } |
| 185 | + } |
| 186 | + |
| 187 | + public void testLinkExtraction() throws ResourceParseException, IOException { |
| 188 | + String testFileName = "link-extraction-test.warc"; |
| 189 | + ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); |
| 190 | + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); |
| 191 | + ExtractingResourceProducer extractor = |
| 192 | + new ExtractingResourceProducer(producer, mapper); |
| 193 | + extractor.getNext(); // skip warcinfo record |
| 194 | + String[][] html4links = { |
| 195 | + {"http://www.example.com/", "__base__"}, |
| 196 | + {"http://www.example.com/redirected.html", "__meta_refresh__"}, |
| 197 | + {"background.jpg", "BODY@/background"}, |
| 198 | + {"http://www.example.com/a-href.html", "A@/href"}, |
| 199 | + {"#anchor", "A@/href"}, |
| 200 | + {"image.png", "IMG@/src"}, |
| 201 | + {"image.gif", "IMG@/src"}, |
| 202 | + {"http://example.com/image-description.html#image.gif", "IMG@/longdesc"}, |
| 203 | + {"helloworld.swf", "OBJECT@/data"}, |
| 204 | + {"http://www.example.com/shakespeare.html", "Q@/cite"}, |
| 205 | + {"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"} |
| 206 | + }; |
| 207 | + checkLinks(extractor.getNext(), html4links); |
| 208 | + String[][] html5links = { |
| 209 | + {"http:///www.example.com/video.html", "LINK@/href", "canonical"}, |
| 210 | + {"video.rss", "LINK@/href", "alternate"}, |
| 211 | + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"}, |
| 212 | + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"}, |
| 213 | + {"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"}, |
| 214 | + {"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"} |
| 215 | + }; |
| 216 | + checkLinks(extractor.getNext(), html5links); |
| 217 | + String[][] html5links2 = { |
| 218 | + {"http://www.example.com/", "A@/href"}, |
| 219 | + }; |
| 220 | + checkLinks(extractor.getNext(), html5links2); |
| 221 | + String[][] fbVideoLinks = { |
| 222 | + {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"}, |
| 223 | + {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"}, |
| 224 | + {"https://www.facebook.com/facebook/", "A@/href"}, |
| 225 | + {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"} |
| 226 | + }; |
| 227 | + checkLinks(extractor.getNext(), fbVideoLinks); |
| 228 | + String[][] dataHrefLinks = { |
| 229 | + {"standard.css", "LINK@/href", "stylesheet"}, |
| 230 | + {"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"}, |
| 231 | + {"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}, |
| 232 | + {"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"}, |
| 233 | + {"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"}, |
| 234 | + {"https://www.facebook.com/facebook/", "A@/href"}, |
| 235 | + {"//edge.flowplayer.org/bauhaus.webm", "SOURCE@/src"}, |
| 236 | + {"//edge.flowplayer.org/bauhaus.mp4", "SOURCE@/src"}, |
| 237 | + {"//edge.flowplayer.org/functional.webm", "BUTTON@/data-href"}, |
| 238 | + {"/content-page", "ARTICLE@/data-href"}, |
| 239 | + {"/content-page", "A@/href"}, |
| 240 | + {"/tags/content","A@/href"}, |
| 241 | + {"/tags/headlines", "A@/href"}, |
| 242 | + {"http://grabaperch.com", "DIV@/data-href"}, |
| 243 | + {"green.css", "LINK@/data-href"}, |
| 244 | + {"blue.css", "LINK@/data-href"}, |
| 245 | + {"http://codecanyon.net/user/CodingJack", "A@/data-href"}, |
| 246 | + {"jackbox/img/thumbs/4.jpg", "IMG@/src"}, |
| 247 | + {"//venobox-destination", "A@/data-href"}, |
| 248 | + {"#", "A@/href"}, |
| 249 | + {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"}, |
| 250 | + {"#", "A@/href"}, |
| 251 | + {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"} |
| 252 | + }; |
| 253 | + checkLinks(extractor.getNext(), dataHrefLinks); |
| 254 | + String[][] fbSocialLinks = { |
| 255 | + {"http://www.your-domain.com/your-page.html", "DIV@/data-uri"}, |
| 256 | + {"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"}, |
| 257 | + {"https://www.facebook.com/zuck/posts/10102735452532991?comment_id=1070233703036185", "DIV@/data-href"}, |
| 258 | + {"https://www.facebook.com/zuck", "DIV@/data-href"}, |
| 259 | + {"https://developers.facebook.com/docs/plugins/", "DIV@/data-href"}, |
| 260 | + {"https://www.facebook.com/facebook", "DIV@/data-href"}, |
| 261 | + {"https://www.facebook.com/facebook", "BLOCKQUOTE@/cite"}, |
| 262 | + {"https://www.facebook.com/facebook", "A@/href"}, |
| 263 | + {"http://www.your-domain.com/your-page.html", "DIV@/data-href"} |
| 264 | + }; |
| 265 | + checkLinks(extractor.getNext(), fbSocialLinks); |
| 266 | + } |
106 | 267 |
|
107 | 268 | } |
0 commit comments