diff --git a/src/main/java/org/archive/format/text/charset/CharsetDetector.java b/src/main/java/org/archive/format/text/charset/CharsetDetector.java index ae71b5fa..690f8b99 100644 --- a/src/main/java/org/archive/format/text/charset/CharsetDetector.java +++ b/src/main/java/org/archive/format/text/charset/CharsetDetector.java @@ -60,7 +60,8 @@ public abstract class CharsetDetector { private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" + META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" + - META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; + META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + META_CONTENT_TYPE + "|" + + ANY_ATTR_VALUE + ")(?:\\s|>)?"; @@ -230,7 +231,6 @@ public static String findMetaContentType(String pageSample) { protected String getCharsetFromBytes(byte buffer[], int len) throws IOException { String charsetName = null; - UniversalDetector detector = new UniversalDetector(null); detector.handleData(buffer, 0, len); detector.dataEnd(); diff --git a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java index 935843f1..afb1c850 100644 --- a/src/main/java/org/archive/resource/html/HTMLResourceFactory.java +++ b/src/main/java/org/archive/resource/html/HTMLResourceFactory.java @@ -1,9 +1,16 @@ package org.archive.resource.html; +import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.archive.format.http.HttpHeaders; +import org.archive.format.json.JSONUtils; +import org.archive.format.text.charset.CharsetDetector; +import org.archive.format.text.charset.StandardCharsetDetector; import org.archive.format.text.html.CDATALexer; import org.archive.format.text.html.LexParser; import org.archive.resource.MetaData; @@ -13,17 +20,48 @@ import org.archive.resource.ResourceParseException; import org.htmlparser.lexer.Page; import org.htmlparser.util.ParserException; +import org.json.JSONException; +import org.json.JSONObject; public class HTMLResourceFactory implements ResourceFactory { + public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class); + + protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; + protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; + + protected CharsetDetector charSetDetector = new StandardCharsetDetector(); + + public Resource getResource(InputStream is, MetaData parentMetaData, ResourceContainer container) throws ResourceParseException, IOException { HTMLMetaData hmd = new HTMLMetaData(parentMetaData); ExtractingParseObserver epo = new ExtractingParseObserver(hmd); LexParser parser = new LexParser(epo); CDATALexer lex = new CDATALexer(); - // TODO: figure out charset: + + // guess charset based on HTTP header and sniffed content chunk String charset = "UTF-8"; + is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); + byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; + is.mark(0); + int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE); + is.reset(); + if (chunkSize > 0) { + JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); + HttpHeaders httpHeaders = new HttpHeaders(); + if (headers.has("Content-Type")) { + try { + httpHeaders.add("Content-Type", headers.getString("Content-Type")); + } catch (JSONException e) { } + } + try { + charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); + } catch (Exception e) { + LOG.error("Failed to guess charset: " + e.getMessage()); + } + } + Page page; try { page = new Page(is, charset);