Skip to content

Commit 5c44f97

Browse files
Merge pull request #47 from silentninja/html-parser
Add Configurable HTML Parser Wrappers for BeautifulSoup and Resiliparse
2 parents ccb8ee6 + 5ddbf5d commit 5c44f97

6 files changed

+122
-20
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ Some differences between the warcio and FastWARC APIs are hidden from the user i
207207

208208
However, it's recommended that you carefully verify that your custom job implementation works in combination with FastWARC. There are subtle differences between the warcio and FastWARC APIs, including the underlying classes (WARC/HTTP headers and stream implementations). In addition, FastWARC does not support legacy ARC files and does not automatically decode HTTP content and transfer encodings (see [Resiliparse HTTP Tools](https://resiliparse.chatnoir.eu/en/latest/man/parse/http.html#read-chunked-http-payloads)). While content and transfer encodings are already decoded in Common Crawl WARC files, this may not be the case for WARC files from other sources. See also [WARC 1.1 specification, http/https response records](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#http-and-https-schemes).
209209

210+
210211
## Credits
211212

212213
Examples are originally ported from Stephen Merity's [cc-mrjob](https://github.com/commoncrawl/cc-mrjob/) with the following changes and upgrades:

bs4_parser.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from bs4 import BeautifulSoup
2+
from bs4.dammit import EncodingDetector
3+
4+
5+
class HTMLParser(object):
    """
    HTML parser using BeautifulSoup4
    """

    def html_to_text(self, html_tree: BeautifulSoup) -> str:
        """
        Convert HTML content to plain text using BeautifulSoup4.

        Args:
            html_tree (BeautifulSoup): parsed HTML tree, as returned by
                get_html_tree()

        Returns:
            str: Extracted plain text with scripts and styles removed
        """
        # drop non-content elements so their text does not leak into the output
        for script in html_tree(['script', 'style']):
            script.extract()
        text = html_tree.get_text(' ', strip=True)
        return text

    def get_html_tree(self, page: bytes, encoding: str=None, features='lxml', **kwargs) -> BeautifulSoup:
        """
        Return the HTML tree object

        Args:
            page (bytes): Raw HTML content as bytes
            encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted
            features: Parser to be used (default='lxml'). Refer https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for supported parsers.
            **kwargs: Additional arguments passed to BeautifulSoup constructor.
                Refer here https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup for accepted arguments.

        Returns:
            BeautifulSoup: HTML tree object
        """
        if not encoding:
            # take the first detected encoding, or None if detection yields
            # nothing (BeautifulSoup then falls back to its own detection)
            encoding = next(
                iter(EncodingDetector(page, is_html=True).encodings), None)
        soup = BeautifulSoup(page, features, from_encoding=encoding, **kwargs)
        return soup

cc_index_word_count.py

+39-19
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,33 @@ class CCIndexWordCountJob(WordCountJob, CCIndexWarcSparkJob):
1616
records_parsing_failed = None
1717
records_non_html = None
1818

19+
def add_arguments(self, parser):
    """Register the --html_parser option on top of the inherited arguments."""
    super(CCIndexWordCountJob, self).add_arguments(parser)
    help_text = (
        "HTML parser: beautifulsoup or resiliparse."
        " Make sure to install the correct dependencies for the parser and "
        "include the correct parser module (bs4_parser.py for beautifulsoup or resiliparse_parser.py for resiliparse) to the cluster"
    )
    parser.add_argument("--html_parser",
                        default="beautifulsoup",
                        help=help_text)
27+
28+
def get_html_parser(self):
    """
    Instantiate the HTML parser selected via the --html_parser argument.

    Returns:
        HTMLParser: wrapper instance from bs4_parser (beautifulsoup)
            or resiliparse_parser (resiliparse)

    Raises:
        ValueError: if --html_parser names an unknown parser
        ImportError: if the selected parser module cannot be imported
            (e.g. not shipped to the cluster via `--py-files`)
    """
    try:
        if self.args.html_parser == 'beautifulsoup':
            from bs4_parser import HTMLParser
            return HTMLParser()
        elif self.args.html_parser == 'resiliparse':
            from resiliparse_parser import HTMLParser
            return HTMLParser()
        else:
            raise ValueError(
                "Unknown HTML parser: {}".format(self.args.html_parser)
            )
    except ImportError as e:
        # chain with `from e` so the original import failure (missing
        # dependency vs. missing module file) stays visible in the traceback
        raise ImportError(
            f"Failed to import HTML parser module '{self.args.html_parser}'."
            f" Please ensure the module is correctly added to PySpark cluster via `--py-files`: {str(e)}"
        ) from e
45+
1946
def init_accumulators(self, session):
2047
super(CCIndexWordCountJob, self).init_accumulators(session)
2148

@@ -36,32 +63,25 @@ def reduce_by_key_func(a, b):
3663
# sum values of tuple <term_frequency, document_frequency>
3764
return ((a[0] + b[0]), (a[1] + b[1]))
3865

39-
def html_to_text(self, page, record):
40-
try:
41-
encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
42-
if not encoding:
43-
for encoding in EncodingDetector(page, is_html=True).encodings:
44-
# take the first detected encoding
45-
break
46-
soup = BeautifulSoup(page, 'lxml', from_encoding=encoding)
47-
for script in soup(['script', 'style']):
48-
script.extract()
49-
return soup.get_text(' ', strip=True)
50-
except Exception as e:
51-
self.get_logger().error("Error converting HTML to text for {}: {}",
52-
self.get_warc_header(record, 'WARC-Target-URI'), e)
53-
self.records_parsing_failed.add(1)
54-
return ''
55-
5666
def process_record(self, record):
5767
if not self.is_response_record(record):
5868
# skip over WARC request or metadata records
5969
return
6070
if not self.is_html(record):
6171
self.records_non_html.add(1)
6272
return
63-
page = self.get_payload_stream(record).read()
64-
text = self.html_to_text(page, record)
73+
74+
text = ""
75+
try:
76+
page = self.get_payload_stream(record).read()
77+
encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
78+
parser = self.get_html_parser()
79+
html_tree = parser.get_html_tree(page, encoding=encoding)
80+
text = parser.html_to_text(html_tree)
81+
except Exception as e:
82+
self.get_logger().error("Error converting HTML to text for {}: {}",
83+
self.get_warc_header(record, 'WARC-Target-URI'), e)
84+
self.records_parsing_failed.add(1)
6585
words = map(lambda w: w.lower(),
6686
WordCountJob.word_pattern.findall(text))
6787
for word, count in Counter(words).items():

requirements.txt

+5
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,8 @@ lxml
1818
#fastwarc
1919
# (tested with)
2020
#fastwarc==0.14.1
21+
22+
# to parse HTML (used in cc_index_word_count.py) using Resiliparse (https://pypi.org/project/Resiliparse/). Resiliparse requires a compatible fastwarc version.
23+
#Resiliparse
24+
# (tested with)
25+
#Resiliparse==0.14.1

resiliparse_parser.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from resiliparse.extract.html2text import extract_plain_text
2+
from resiliparse.parse import detect_encoding
3+
from resiliparse.parse.html import HTMLTree
4+
5+
6+
class HTMLParser(object):
    """
    HTML parser using Resiliparse
    """

    def html_to_text(self, tree, **kwargs) -> str:
        """
        Convert HTML content to plain text using Resiliparse.

        Args:
            tree (HTMLTree): parsed HTML tree, as returned by get_html_tree()
            **kwargs: Additional arguments passed to extract_plain_text:
                Refer here https://resiliparse.chatnoir.eu/en/latest/api/extract/html2text.html#resiliparse.extract.html2text.extract_plain_text for accepted arguments.

        Returns:
            str: Extracted plain text with scripts and styles removed
        """
        text = extract_plain_text(tree, **kwargs)
        return text

    def get_html_tree(self, page: bytes, encoding: str=None, **kwargs) -> HTMLTree:
        """
        Get the HTML tree object

        Args:
            page (bytes): Raw HTML content as bytes
            encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted
            **kwargs: Additional arguments passed to HTMLTree.parse_from_bytes

        Returns:
            HTMLTree: the parsed HTML tree
        """
        if not encoding:
            encoding = detect_encoding(page)
        tree = HTMLTree.parse_from_bytes(page, encoding, **kwargs)
        return tree

sparkcc.py

-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ def parse_arguments(self):
7171

7272
arg_parser.add_argument("input", help=self.input_descr)
7373
arg_parser.add_argument("output", help=self.output_descr)
74-
7574
arg_parser.add_argument("--input_base_url",
7675
help="Base URL (prefix) used if paths to WARC/WAT/WET "
7776
"files are relative paths. Used to select the "

0 commit comments

Comments
 (0)