Skip to content

Commit 26b1e7a

Browse files
ExtractingParseObserver: extract rel, hreflang and type attributes
- add "rel" attribute to A and AREA links
- add attributes "hreflang" and "type" (MIME type) to A@/href links
1 parent 336a49e commit 26b1e7a

1 file changed

Lines changed: 17 additions & 2 deletions

File tree

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -284,7 +284,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
284284
l.add(makePath("A","href"));
285285
l.add("url");
286286
l.add(url);
287-
for(String a : new String[] {"target","alt","title"}) {
287+
for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) {
288288
String v = node.getAttribute(a);
289289
if(v != null) {
290290
l.add(a);
@@ -311,7 +311,22 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
311311

312312
private static class AreaTagExtractor implements TagExtractor {
313313
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
314-
addBasicHrefs(data,node,"href");
314+
String url = node.getAttribute("href");
315+
if(url != null) {
316+
ArrayList<String> l = new ArrayList<String>();
317+
l.add(PATH);
318+
l.add(makePath("AREA","href"));
319+
l.add("url");
320+
l.add(url);
321+
for(String a : new String[] {"rel"}) {
322+
String v = node.getAttribute(a);
323+
if(v != null) {
324+
l.add(a);
325+
l.add(v);
326+
}
327+
}
328+
data.addHref(l);
329+
}
315330
}
316331
}
317332

0 commit comments

Comments
 (0)