Skip to content

Commit a44321c

Browse files
Improvements and fixes for processing HTML from WARC files
- encoding detection: use EncodingDetector, which tries the BOM, then the metadata charset, and otherwise detects the encoding from the byte content
- fix method is_html used in case WARC-Identified-Payload-Type is absent: use Content-Type from the HTTP header (not the WARC header) if present
- TagCountJob: use the improved method is_html
1 parent 9a3875b commit a44321c

File tree

3 files changed

+22
-15
lines changed

3 files changed

+22
-15
lines changed

cc_index_word_count.py

+15-9
Original file line numberDiff line numberDiff line change
@@ -38,22 +38,28 @@ def reduce_by_key_func(a, b):
3838
def html_to_text(self, page, record):
3939
try:
4040
encoding = record.rec_headers['WARC-Identified-Content-Charset']
41-
if encoding is None:
42-
encoding = EncodingDetector.find_declared_encoding(page,
43-
is_html=True)
44-
soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
45-
for script in soup(["script", "style"]):
41+
if not encoding:
42+
for encoding in EncodingDetector(page, is_html=True).encodings:
43+
# take the first detected encoding
44+
break
45+
soup = BeautifulSoup(page, 'lxml', from_encoding=encoding)
46+
for script in soup(['script', 'style']):
4647
script.extract()
47-
return soup.get_text(" ", strip=True)
48-
except:
48+
return soup.get_text(' ', strip=True)
49+
except Exception as e:
50+
self.get_logger().error("Error converting HTML to text for {}: {}",
51+
record.rec_headers['WARC-Target-URI'], e)
4952
self.records_parsing_failed.add(1)
50-
return ""
53+
return ''
5154

5255
def process_record(self, record):
53-
page = record.content_stream().read()
56+
if record.rec_type != 'response':
57+
# skip over WARC request or metadata records
58+
return
5459
if not self.is_html(record):
5560
self.records_non_html.add(1)
5661
return
62+
page = record.content_stream().read()
5763
text = self.html_to_text(page, record)
5864
words = map(lambda w: w.lower(),
5965
WordCountJob.word_pattern.findall(text))

html_tag_count.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,9 @@ class TagCountJob(CCSparkJob):
1515

1616
def process_record(self, record):
1717
if record.rec_type != 'response':
18-
# WARC request or metadata records
18+
# skip over WARC request or metadata records
1919
return
20-
content_type = record.http_headers.get_header('content-type', None)
21-
if content_type is None or 'html' not in content_type:
20+
if not self.is_html(record):
2221
# skip non-HTML or unknown content types
2322
return
2423
data = record.content_stream().read()

sparkcc.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -286,9 +286,11 @@ def is_html(record):
286286
(record.rec_headers['WARC-Identified-Payload-Type'] in
287287
html_types)):
288288
return True
289-
for html_type in html_types:
290-
if html_type in record.content_type:
291-
return True
289+
content_type = record.http_headers.get_header('content-type', None)
290+
if content_type:
291+
for html_type in html_types:
292+
if html_type in content_type:
293+
return True
292294
return False
293295

294296

0 commit comments

Comments
 (0)