Skip to content

Commit 607acaa

Browse files
HTML encoding detection: fix errors with empty content or empty charset values
1 parent 5f223d6 commit 607acaa

2 files changed

Lines changed: 19 additions & 7 deletions

File tree

src/main/java/org/archive/format/text/charset/CharsetDetector.java

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -182,6 +182,8 @@ private static String trimAttrValue(String value) {
182182
return value;
183183
}
184184
String result = value;
185+
if (result.isEmpty())
186+
return result;
185187
if (result.charAt(0) == '"') {
186188
result = result.substring(1, result.length() - 1);
187189
} else if (result.charAt(0) == '\'') {

src/main/java/org/archive/resource/html/HTMLResourceFactory.java

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@
55
import java.io.InputStream;
66
import java.io.UnsupportedEncodingException;
77

8+
import org.apache.commons.logging.Log;
9+
import org.apache.commons.logging.LogFactory;
810
import org.archive.format.http.HttpHeaders;
911
import org.archive.format.json.JSONUtils;
1012
import org.archive.format.text.charset.CharsetDetector;
@@ -23,6 +25,8 @@
2325

2426
public class HTMLResourceFactory implements ResourceFactory {
2527

28+
public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class);
29+
2630
protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
2731
protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";
2832

@@ -37,21 +41,27 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
3741
CDATALexer lex = new CDATALexer();
3842

3943
// guess charset based on HTTP header and sniffed content chunk
44+
String charset = "UTF-8";
4045
is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
4146
byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];
4247
is.mark(0);
4348
int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE);
4449
is.reset();
45-
JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
46-
HttpHeaders httpHeaders = new HttpHeaders();
47-
if (headers.has("Content-Type")) {
50+
if (chunkSize > 0) {
51+
JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
52+
HttpHeaders httpHeaders = new HttpHeaders();
53+
if (headers.has("Content-Type")) {
54+
try {
55+
httpHeaders.add("Content-Type", headers.getString("Content-Type"));
56+
} catch (JSONException e) { }
57+
}
4858
try {
49-
httpHeaders.add("Content-Type", headers.getString("Content-Type"));
50-
} catch (JSONException e) { }
59+
charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
60+
} catch (Exception e) {
61+
LOG.error("Failed to guess charset: " + e.getMessage());
62+
}
5163
}
5264

53-
String charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
54-
5565
Page page;
5666
try {
5767
page = new Page(is, charset);

0 commit comments

Comments
 (0)