Skip to content

Commit 864241a

Browse files
committed
Modify the index word count to use the HTML parsers
1 parent c6b4ac9 commit 864241a

File tree

1 file changed

+12
-19
lines changed

1 file changed

+12
-19
lines changed

cc_index_word_count.py

Lines changed: 12 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -36,32 +36,25 @@ def reduce_by_key_func(a, b):
3636
# sum values of tuple <term_frequency, document_frequency>
3737
return ((a[0] + b[0]), (a[1] + b[1]))
3838

39-
def html_to_text(self, page, record):
40-
try:
41-
encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
42-
if not encoding:
43-
for encoding in EncodingDetector(page, is_html=True).encodings:
44-
# take the first detected encoding
45-
break
46-
soup = BeautifulSoup(page, 'lxml', from_encoding=encoding)
47-
for script in soup(['script', 'style']):
48-
script.extract()
49-
return soup.get_text(' ', strip=True)
50-
except Exception as e:
51-
self.get_logger().error("Error converting HTML to text for {}: {}",
52-
self.get_warc_header(record, 'WARC-Target-URI'), e)
53-
self.records_parsing_failed.add(1)
54-
return ''
55-
5639
def process_record(self, record):
5740
if not self.is_response_record(record):
5841
# skip over WARC request or metadata records
5942
return
6043
if not self.is_html(record):
6144
self.records_non_html.add(1)
6245
return
63-
page = self.get_payload_stream(record).read()
64-
text = self.html_to_text(page, record)
46+
47+
text = ""
48+
try:
49+
page = self.get_payload_stream(record).read()
50+
encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
51+
parser = self.get_html_parser()
52+
html_tree = parser.get_html_tree(page, encoding=encoding)
53+
text = parser.html_to_text(html_tree)
54+
except Exception as e:
55+
self.get_logger().error("Error converting HTML to text for {}: {}",
56+
self.get_warc_header(record, 'WARC-Target-URI'), e)
57+
self.records_parsing_failed.add(1)
6558
words = map(lambda w: w.lower(),
6659
WordCountJob.word_pattern.findall(text))
6760
for word, count in Counter(words).items():

0 commit comments

Comments
 (0)