Improvements and fixes for processing HTML from WARC files

sebastian-nagel · sebastian-nagel · commit a44321c7efa8 · 2021-02-07T22:03:53.000+01:00
- encoding detection: use EncodingDetector which tries BOM, metadata
  charset or detects the encoding from byte content
- fix method is_html used in case WARC-Identified-Payload-Type is absent:
  use Content-Type from HTTP header (not WARC header) if present
- TagCountJob: use improved method is_html
diff --git a/cc_index_word_count.py b/cc_index_word_count.py
@@ -38,22 +38,28 @@ def reduce_by_key_func(a, b):
     def html_to_text(self, page, record):
         try:
             encoding = record.rec_headers['WARC-Identified-Content-Charset']
-            if encoding is None:
-                encoding = EncodingDetector.find_declared_encoding(page,
-                                                                   is_html=True)
-            soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
-            for script in soup(["script", "style"]):
+            if not encoding:
+                for encoding in EncodingDetector(page, is_html=True).encodings:
+                    # take the first detected encoding
+                    break
+            soup = BeautifulSoup(page, 'lxml', from_encoding=encoding)
+            for script in soup(['script', 'style']):
                 script.extract()
-            return soup.get_text(" ", strip=True)
-        except:
+            return soup.get_text(' ', strip=True)
+        except Exception as e:
+            self.get_logger().error("Error converting HTML to text for {}: {}",
+                                    record.rec_headers['WARC-Target-URI'], e)
             self.records_parsing_failed.add(1)
-            return ""
+            return ''
 
     def process_record(self, record):
-        page = record.content_stream().read()
+        if record.rec_type != 'response':
+            # skip over WARC request or metadata records
+            return
         if not self.is_html(record):
             self.records_non_html.add(1)
             return
+        page = record.content_stream().read()
         text = self.html_to_text(page, record)
         words = map(lambda w: w.lower(),
                     WordCountJob.word_pattern.findall(text))
diff --git a/html_tag_count.py b/html_tag_count.py
@@ -15,10 +15,9 @@ class TagCountJob(CCSparkJob):
 
     def process_record(self, record):
         if record.rec_type != 'response':
-            # WARC request or metadata records
+            # skip over WARC request or metadata records
             return
-        content_type = record.http_headers.get_header('content-type', None)
-        if content_type is None or 'html' not in content_type:
+        if not self.is_html(record):
             # skip non-HTML or unknown content types
             return
         data = record.content_stream().read()
diff --git a/sparkcc.py b/sparkcc.py
@@ -286,9 +286,11 @@ def is_html(record):
             (record.rec_headers['WARC-Identified-Payload-Type'] in
              html_types)):
             return True
-        for html_type in html_types:
-            if html_type in record.content_type:
-                return True
+        content_type = record.http_headers.get_header('content-type', None)
+        if content_type:
+            for html_type in html_types:
+                if html_type in content_type:
+                    return True
         return False