55import java .io .InputStream ;
66import java .io .UnsupportedEncodingException ;
77
8+ import org .apache .commons .logging .Log ;
9+ import org .apache .commons .logging .LogFactory ;
810import org .archive .format .http .HttpHeaders ;
911import org .archive .format .json .JSONUtils ;
1012import org .archive .format .text .charset .CharsetDetector ;
2325
2426public class HTMLResourceFactory implements ResourceFactory {
2527
28+ public static final Log LOG = LogFactory .getLog (HTMLResourceFactory .class );
29+
2630 protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192 ;
2731 protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers" ;
2832
@@ -37,21 +41,27 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
3741 CDATALexer lex = new CDATALexer ();
3842
3943 // guess charset based on HTTP header and sniffed content chunk
44+ String charset = "UTF-8" ;
4045 is = new BufferedInputStream (is , CHARSET_GUESS_CHUNK_SIZE );
4146 byte [] chunk = new byte [CHARSET_GUESS_CHUNK_SIZE ];
4247 is .mark (0 );
4348 int chunkSize = is .read (chunk , 0 , CHARSET_GUESS_CHUNK_SIZE );
4449 is .reset ();
45- JSONObject headers = JSONUtils .extractObject (hmd .getTopMetaData (), HTTP_HEADER_PATH );
46- HttpHeaders httpHeaders = new HttpHeaders ();
47- if (headers .has ("Content-Type" )) {
50+ if (chunkSize > 0 ) {
51+ JSONObject headers = JSONUtils .extractObject (hmd .getTopMetaData (), HTTP_HEADER_PATH );
52+ HttpHeaders httpHeaders = new HttpHeaders ();
53+ if (headers .has ("Content-Type" )) {
54+ try {
55+ httpHeaders .add ("Content-Type" , headers .getString ("Content-Type" ));
56+ } catch (JSONException e ) { }
57+ }
4858 try {
49- httpHeaders .add ("Content-Type" , headers .getString ("Content-Type" ));
50- } catch (JSONException e ) { }
59+ charset = charSetDetector .getCharset (chunk , chunkSize , httpHeaders );
60+ } catch (Exception e ) {
61+ LOG .error ("Failed to guess charset: " + e .getMessage ());
62+ }
5163 }
5264
53- String charset = charSetDetector .getCharset (chunk , chunkSize , httpHeaders );
54-
5565 Page page ;
5666 try {
5767 page = new Page (is , charset );
0 commit comments