|
1 | 1 | package org.archive.resource.html; |
2 | 2 |
|
| 3 | +import java.io.BufferedInputStream; |
3 | 4 | import java.io.IOException; |
4 | 5 | import java.io.InputStream; |
5 | 6 | import java.io.UnsupportedEncodingException; |
6 | 7 |
|
| 8 | +import org.apache.commons.logging.Log; |
| 9 | +import org.apache.commons.logging.LogFactory; |
| 10 | +import org.archive.format.http.HttpHeaders; |
| 11 | +import org.archive.format.json.JSONUtils; |
| 12 | +import org.archive.format.text.charset.CharsetDetector; |
| 13 | +import org.archive.format.text.charset.StandardCharsetDetector; |
7 | 14 | import org.archive.format.text.html.CDATALexer; |
8 | 15 | import org.archive.format.text.html.LexParser; |
9 | 16 | import org.archive.resource.MetaData; |
|
13 | 20 | import org.archive.resource.ResourceParseException; |
14 | 21 | import org.htmlparser.lexer.Page; |
15 | 22 | import org.htmlparser.util.ParserException; |
| 23 | +import org.json.JSONException; |
| 24 | +import org.json.JSONObject; |
16 | 25 |
|
17 | 26 | public class HTMLResourceFactory implements ResourceFactory { |
18 | 27 |
|
| 28 | + public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class); |
| 29 | + |
| 30 | + protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192; |
| 31 | + protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"; |
| 32 | + |
| 33 | + protected CharsetDetector charSetDetector = new StandardCharsetDetector(); |
| 34 | + |
| 35 | + |
19 | 36 | public Resource getResource(InputStream is, MetaData parentMetaData, |
20 | 37 | ResourceContainer container) throws ResourceParseException, IOException { |
21 | 38 | HTMLMetaData hmd = new HTMLMetaData(parentMetaData); |
22 | 39 | ExtractingParseObserver epo = new ExtractingParseObserver(hmd); |
23 | 40 | LexParser parser = new LexParser(epo); |
24 | 41 | CDATALexer lex = new CDATALexer(); |
25 | | - // TODO: figure out charset: |
| 42 | + |
| 43 | + // guess charset based on HTTP header and sniffed content chunk |
26 | 44 | String charset = "UTF-8"; |
| 45 | + is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE); |
| 46 | + byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE]; |
| 47 | + is.mark(0); |
| 48 | + int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE); |
| 49 | + is.reset(); |
| 50 | + if (chunkSize > 0) { |
| 51 | + JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH); |
| 52 | + HttpHeaders httpHeaders = new HttpHeaders(); |
| 53 | + if (headers.has("Content-Type")) { |
| 54 | + try { |
| 55 | + httpHeaders.add("Content-Type", headers.getString("Content-Type")); |
| 56 | + } catch (JSONException e) { } |
| 57 | + } |
| 58 | + try { |
| 59 | + charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders); |
| 60 | + } catch (Exception e) { |
| 61 | + LOG.error("Failed to guess charset: " + e.getMessage()); |
| 62 | + } |
| 63 | + } |
| 64 | + |
27 | 65 | Page page; |
28 | 66 | try { |
29 | 67 | page = new Page(is, charset); |
|
0 commit comments