Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ public abstract class CharsetDetector {
private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" +
META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" +
META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + META_CONTENT_TYPE + "|" +
ANY_ATTR_VALUE + ")(?:\\s|>)?";



Expand Down Expand Up @@ -230,7 +231,6 @@ public static String findMetaContentType(String pageSample) {
protected String getCharsetFromBytes(byte buffer[], int len)
throws IOException {
String charsetName = null;

UniversalDetector detector = new UniversalDetector(null);
detector.handleData(buffer, 0, len);
detector.dataEnd();
Expand Down
40 changes: 39 additions & 1 deletion src/main/java/org/archive/resource/html/HTMLResourceFactory.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
package org.archive.resource.html;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.format.http.HttpHeaders;
import org.archive.format.json.JSONUtils;
import org.archive.format.text.charset.CharsetDetector;
import org.archive.format.text.charset.StandardCharsetDetector;
import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.LexParser;
import org.archive.resource.MetaData;
Expand All @@ -13,17 +20,48 @@
import org.archive.resource.ResourceParseException;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
import org.json.JSONException;
import org.json.JSONObject;

public class HTMLResourceFactory implements ResourceFactory {

public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class);

protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";

protected CharsetDetector charSetDetector = new StandardCharsetDetector();


public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException, IOException {
HTMLMetaData hmd = new HTMLMetaData(parentMetaData);
ExtractingParseObserver epo = new ExtractingParseObserver(hmd);
LexParser parser = new LexParser(epo);
CDATALexer lex = new CDATALexer();
// TODO: figure out charset:

// Guess the charset from the HTTP Content-Type header and a sniffed chunk
// of the content. "UTF-8" stays in effect when nothing is read or when
// detection fails.
String charset = "UTF-8";
is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];
// The read limit has to cover the whole sniffed chunk: mark(readlimit) only
// guarantees that reset() succeeds while no more than readlimit bytes have
// been read since the mark. A limit of 0 merely happened to work because of
// how BufferedInputStream manages a buffer of the same size.
is.mark(CHARSET_GUESS_CHUNK_SIZE);
int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE);
// Rewind so the parser below sees the stream from its first byte again.
is.reset();
if (chunkSize > 0) {
    JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
    HttpHeaders httpHeaders = new HttpHeaders();
    // NOTE(review): extractObject presumably yields null when the header path
    // is absent from the metadata -- guard instead of risking an NPE; confirm.
    if (headers != null && headers.has("Content-Type")) {
        try {
            httpHeaders.add("Content-Type", headers.getString("Content-Type"));
        } catch (JSONException e) {
            // Best effort: detection continues from the content chunk alone.
            LOG.debug("Ignoring unreadable Content-Type header value", e);
        }
    }
    try {
        charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
    } catch (Exception e) {
        // Keep the default charset; pass the exception so the stack trace is logged.
        LOG.error("Failed to guess charset: " + e.getMessage(), e);
    }
}

Page page;
try {
page = new Page(is, charset);
Expand Down