@@ -36,32 +36,25 @@ def reduce_by_key_func(a, b):
36
36
# sum values of tuple <term_frequency, document_frequency>
37
37
return ((a [0 ] + b [0 ]), (a [1 ] + b [1 ]))
38
38
39
- def html_to_text (self , page , record ):
40
- try :
41
- encoding = self .get_warc_header (record , 'WARC-Identified-Content-Charset' )
42
- if not encoding :
43
- for encoding in EncodingDetector (page , is_html = True ).encodings :
44
- # take the first detected encoding
45
- break
46
- soup = BeautifulSoup (page , 'lxml' , from_encoding = encoding )
47
- for script in soup (['script' , 'style' ]):
48
- script .extract ()
49
- return soup .get_text (' ' , strip = True )
50
- except Exception as e :
51
- self .get_logger ().error ("Error converting HTML to text for {}: {}" ,
52
- self .get_warc_header (record , 'WARC-Target-URI' ), e )
53
- self .records_parsing_failed .add (1 )
54
- return ''
55
-
56
39
def process_record (self , record ):
57
40
if not self .is_response_record (record ):
58
41
# skip over WARC request or metadata records
59
42
return
60
43
if not self .is_html (record ):
61
44
self .records_non_html .add (1 )
62
45
return
63
- page = self .get_payload_stream (record ).read ()
64
- text = self .html_to_text (page , record )
46
+
47
+ text = ""
48
+ try :
49
+ page = self .get_payload_stream (record ).read ()
50
+ encoding = self .get_warc_header (record , 'WARC-Identified-Content-Charset' )
51
+ parser = self .get_html_parser ()
52
+ html_tree = parser .get_html_tree (page , encoding = encoding )
53
+ text = parser .html_to_text (html_tree )
54
+ except Exception as e :
55
+ self .get_logger ().error ("Error converting HTML to text for {}: {}" ,
56
+ self .get_warc_header (record , 'WARC-Target-URI' ), e )
57
+ self .records_parsing_failed .add (1 )
65
58
words = map (lambda w : w .lower (),
66
59
WordCountJob .word_pattern .findall (text ))
67
60
for word , count in Counter (words ).items ():
0 commit comments