Skip to content
This repository was archived by the owner on Nov 21, 2025. It is now read-only.

Commit ed33bae

Browse files
authored
Merge pull request iipc#68 from sebastian-nagel/HTMLResourceFactory_to_use_CharsetDetector
Use CharsetDetector to guess encoding of HTML documents
2 parents 1ee2529 + 0384028 commit ed33bae

2 files changed

Lines changed: 41 additions & 3 deletions

File tree

src/main/java/org/archive/format/text/charset/CharsetDetector.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,8 @@ public abstract class CharsetDetector {
6060
private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" +
6161
META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
6262
private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" +
63-
META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
63+
META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + META_CONTENT_TYPE + "|" +
64+
ANY_ATTR_VALUE + ")(?:\\s|>)?";
6465

6566

6667

@@ -230,7 +231,6 @@ public static String findMetaContentType(String pageSample) {
230231
protected String getCharsetFromBytes(byte buffer[], int len)
231232
throws IOException {
232233
String charsetName = null;
233-
234234
UniversalDetector detector = new UniversalDetector(null);
235235
detector.handleData(buffer, 0, len);
236236
detector.dataEnd();

src/main/java/org/archive/resource/html/HTMLResourceFactory.java

Lines changed: 39 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,16 @@
11
package org.archive.resource.html;
22

3+
import java.io.BufferedInputStream;
34
import java.io.IOException;
45
import java.io.InputStream;
56
import java.io.UnsupportedEncodingException;
67

8+
import org.apache.commons.logging.Log;
9+
import org.apache.commons.logging.LogFactory;
10+
import org.archive.format.http.HttpHeaders;
11+
import org.archive.format.json.JSONUtils;
12+
import org.archive.format.text.charset.CharsetDetector;
13+
import org.archive.format.text.charset.StandardCharsetDetector;
714
import org.archive.format.text.html.CDATALexer;
815
import org.archive.format.text.html.LexParser;
916
import org.archive.resource.MetaData;
@@ -13,17 +20,48 @@
1320
import org.archive.resource.ResourceParseException;
1421
import org.htmlparser.lexer.Page;
1522
import org.htmlparser.util.ParserException;
23+
import org.json.JSONException;
24+
import org.json.JSONObject;
1625

1726
public class HTMLResourceFactory implements ResourceFactory {
1827

28+
public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class);
29+
30+
protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
31+
protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";
32+
33+
protected CharsetDetector charSetDetector = new StandardCharsetDetector();
34+
35+
1936
public Resource getResource(InputStream is, MetaData parentMetaData,
2037
ResourceContainer container) throws ResourceParseException, IOException {
2138
HTMLMetaData hmd = new HTMLMetaData(parentMetaData);
2239
ExtractingParseObserver epo = new ExtractingParseObserver(hmd);
2340
LexParser parser = new LexParser(epo);
2441
CDATALexer lex = new CDATALexer();
25-
// TODO: figure out charset:
42+
43+
// guess charset based on HTTP header and sniffed content chunk
2644
String charset = "UTF-8";
45+
is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
46+
byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];
47+
is.mark(0);
48+
int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE);
49+
is.reset();
50+
if (chunkSize > 0) {
51+
JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
52+
HttpHeaders httpHeaders = new HttpHeaders();
53+
if (headers.has("Content-Type")) {
54+
try {
55+
httpHeaders.add("Content-Type", headers.getString("Content-Type"));
56+
} catch (JSONException e) { }
57+
}
58+
try {
59+
charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
60+
} catch (Exception e) {
61+
LOG.error("Failed to guess charset: " + e.getMessage());
62+
}
63+
}
64+
2765
Page page;
2866
try {
2967
page = new Page(is, charset);

0 commit comments

Comments
 (0)