From 5f223d60c365a53533b2ad7217deaa65b3a91667 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 24 Nov 2016 11:51:10 +0100 Subject: [PATCH 1/4] Use CharsetDetector to guess encoding of HTML document --- .../resource/html/HTMLResourceFactory.java | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index 935843f1..34062ed9 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -1,9 +1,14 @@ package org.archive.resource.html; +import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import org.archive.format.http.HttpHeaders; +import org.archive.format.json.JSONUtils; +import org.archive.format.text.charset.CharsetDetector; +import org.archive.format.text.charset.StandardCharsetDetector; import org.archive.format.text.html.CDATALexer; import org.archive.format.text.html.LexParser; import org.archive.resource.MetaData; @@ -13,17 +18,40 @@ import org.archive.resource.ResourceParseException; import org.htmlparser.lexer.Page; import org.htmlparser.util.ParserException; +import org.json.JSONException; +import org.json.JSONObject; public class HTMLResourceFactory implements ResourceFactory { + protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; + protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; + + protected CharsetDetector charSetDetector = new StandardCharsetDetector(); + + public Resource getResource(InputStream is, MetaData parentMetaData, ResourceContainer container) throws ResourceParseException, IOException { HTMLMetaData hmd = new HTMLMetaData(parentMetaData); ExtractingParseObserver epo = new ExtractingParseObserver(hmd); LexParser parser = new 
LexParser(epo); CDATALexer lex = new CDATALexer(); - // TODO: figure out charset: - String charset = "UTF-8"; + + // guess charset based on HTTP header and sniffed content chunk + is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); + byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; + is.mark(0); + int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE); + is.reset(); + JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); + HttpHeaders httpHeaders = new HttpHeaders(); + if (headers.has("Content-Type")) { + try { + httpHeaders.add("Content-Type", headers.getString("Content-Type")); + } catch (JSONException e) { } + } + + String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); + Page page; try { page = new Page(is, charset); From 607acaa734183b72c816359c588bbf157485d5ba Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 24 Nov 2016 12:44:53 +0100 Subject: [PATCH 2/4] HTML encoding detection: fix errors with empty content or empty charset values --- .../format/text/charset/CharsetDetector.java | 2 ++ .../resource/html/HTMLResourceFactory.java | 24 +++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index ae71b5fa..0534ff85 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -182,6 +182,8 @@ private static String trimAttrValue(String value) { return value; } String result = value; + if (result.isEmpty()) + return result; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); } else if (result.charAt(0) == '\'') { diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index 34062ed9..afb1c850 100644 --- 
a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -5,6 +5,8 @@ import java.io.InputStream; import java.io.UnsupportedEncodingException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.archive.format.http.HttpHeaders; import org.archive.format.json.JSONUtils; import org.archive.format.text.charset.CharsetDetector; @@ -23,6 +25,8 @@ public class HTMLResourceFactory implements ResourceFactory { + public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class); + protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; @@ -37,21 +41,27 @@ public Resource getResource(InputStream is, MetaData parentMetaData, CDATALexer lex = new CDATALexer(); // guess charset based on HTTP header and sniffed content chunk + String charset = "UTF-8"; is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; is.mark(0); int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE); is.reset(); - JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); - HttpHeaders httpHeaders = new HttpHeaders(); - if (headers.has("Content-Type")) { + if (chunkSize > 0) { + JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); + HttpHeaders httpHeaders = new HttpHeaders(); + if (headers.has("Content-Type")) { + try { + httpHeaders.add("Content-Type", headers.getString("Content-Type")); + } catch (JSONException e) { } + } try { - httpHeaders.add("Content-Type", headers.getString("Content-Type")); - } catch (JSONException e) { } + charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); + } catch (Exception e) { + LOG.error("Failed to guess charset: " + e.getMessage()); + } } - String charset = 
charSetDetector.getCharset(chunk, chunkSize, httpHeaders); - Page page; try { page = new Page(is, charset); From 824dd82f5f9c9e60392ece498f8e5d44a7e431b9 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 24 Nov 2016 14:05:55 +0100 Subject: [PATCH 3/4] Match http-equiv meta elements with unquoted attribute values, e.g. http-equiv=Content-Type --- .../org/archive/format/text/charset/CharsetDetector.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index 0534ff85..9b4c8523 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -60,7 +60,8 @@ public abstract class CharsetDetector { private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" + META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" + - META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; + META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + META_CONTENT_TYPE + "|" + + ANY_ATTR_VALUE + ")(?:\\s|>)?"; @@ -183,7 +184,7 @@ private static String trimAttrValue(String value) { } String result = value; if (result.isEmpty()) - return result; + return result; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); } else if (result.charAt(0) == '\'') { @@ -232,7 +233,6 @@ public static String findMetaContentType(String pageSample) { protected String getCharsetFromBytes(byte buffer[], int len) throws IOException { String charsetName = null; - UniversalDetector detector = new UniversalDetector(null); detector.handleData(buffer, 0, len); detector.dataEnd(); From 038402885f85a426601d5f85936e210e4f55636f Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 27 Jan 2017 08:59:25 +0100 Subject: [PATCH 4/4] CharsetDetector: remove 
unnecessary check for empty string (contributed by @ldko) --- .../java/org/archive/format/text/charset/CharsetDetector.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index 9b4c8523..690f8b99 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -183,8 +183,6 @@ private static String trimAttrValue(String value) { return value; } String result = value; - if (result.isEmpty()) - return result; if (result.charAt(0) == '"') { result = result.substring(1, result.length() - 1); } else if (result.charAt(0) == '\'') {