From 70bdd9320f6ec6e3d8fd33d9cdc22ca319363702 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 16 Oct 2025 15:48:54 +0200 Subject: [PATCH 01/33] well-formed example passes --- requirements.txt | 5 ++ sitemaps_from_robotstxt.py | 99 ++++++++++++++++++++++++++++ test/test_sitemaps_from_robotstxt.py | 62 +++++++++++++++++ 3 files changed, 166 insertions(+) create mode 100644 sitemaps_from_robotstxt.py create mode 100644 test/test_sitemaps_from_robotstxt.py diff --git a/requirements.txt b/requirements.txt index 8aca807..dd56089 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,8 @@ lxml #Resiliparse # (tested with) #Resiliparse==0.15.2 + +# testing +pytest +pytest-mock +pyspark==3.5.7 diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py new file mode 100644 index 0000000..e58ad4a --- /dev/null +++ b/sitemaps_from_robotstxt.py @@ -0,0 +1,99 @@ +import re +from urllib.parse import urlparse, urljoin + +from pyspark.sql.types import StructType, StructField, StringType, ArrayType +from warcio.recordloader import ArcWarcRecord + +from sparkcc import CCSparkJob + +class SitemapExtractorJob(CCSparkJob): + """Extract sitemap URLs (http://www.sitemaps.org/) from robots.txt WARC files.""" + + name = "SitemapExtractor" + + output_schema = StructType([ + StructField('key', StringType(), True), + StructField('val', StructType([ + StructField('hosts', ArrayType(StringType()), True) + ]), True) + ]) + + # rb: match on raw bytes so we can defer utf-8 decoding to the `sitemap:` line + sitemap_pattern = re.compile(rb'^sitemap:\s*(\S+)', re.I) + + robots_txt_processed = None + sitemap_urls_found = None + sitemap_url_invalid_encoding = None + robots_txt_invalid_url = None + robots_txt_announcing_sitemap = None + robots_txt_with_more_than_50_sitemaps = None + + + def init_accumulators(self, session): + super(SitemapExtractorJob, self).init_accumulators(session) + + sc = session.sparkContext + self.robots_txt_processed = sc.accumulator(0) + 
self.sitemap_urls_found = sc.accumulator(0) + self.sitemap_url_invalid_encoding = sc.accumulator(0) + self.robots_txt_invalid_url = sc.accumulator(0) + self.robots_txt_announcing_sitemap = sc.accumulator(0) + self.robots_txt_with_more_than_50_sitemaps = sc.accumulator(0) + + + def process_record(self, record: ArcWarcRecord): + """ emit: sitemap_url => [host] """ + if not self.is_response_record(record): + # we're only interested in the HTTP responses + return + + self.robots_txt_processed.add(1) + url = None + host = None + n_sitemaps = 0 + + for raw_line in self.get_payload_stream(record).readlines(): + raw_line = raw_line.strip() + + match = SitemapExtractorJob.sitemap_pattern.match(raw_line) + if match: + sitemap_url = match.group(1).strip() + self.sitemap_urls_found.add(1) + n_sitemaps += 1 + try: + sitemap_url = sitemap_url.decode("utf-8", "strict") + except UnicodeEncodeError: + # invalid encoding, ignore + self.sitemap_url_invalid_encoding.add(1) + continue + + if url is None: + # first sitemap found: set base URL and get host from URL + url = record['WARC-Target-URI'] + try: + host = urlparse(url).netloc.lower().lstrip('.') + except Exception as url_parse_error: + try: + self.get_logger().warning('Invalid robots.txt URL: %s - %s', + url, str(url_parse_error)) + except UnicodeEncodeError as unicode_error: + self.get_logger().warning('Invalid robots.txt URL (cannot be displayed) - %s - %', + str(url_parse_error), str(unicode_error)) + self.robots_txt_invalid_url.add(1) + # skip this robots.txt record + return None + + if not sitemap_url.startswith('http'): + sitemap_url = urljoin(url, sitemap_url) + + yield sitemap_url, [host] + + if n_sitemaps > 0: + self.robots_txt_announcing_sitemap.add(1) + + if n_sitemaps > 50: + self.robots_txt_with_more_than_50_sitemaps.add(1) + +if __name__ == '__main__': + job = SitemapExtractorJob() + job.run() diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py new file mode 100644 index 
0000000..03008a9 --- /dev/null +++ b/test/test_sitemaps_from_robotstxt.py @@ -0,0 +1,62 @@ +from io import BytesIO + +import pytest +from unittest.mock import MagicMock + +from pyspark.sql import SparkSession +from warcio.recordloader import ArcWarcRecord + +from sitemaps_from_robotstxt import SitemapExtractorJob + +@pytest.fixture(scope='session') +def spark(): + return SparkSession.builder.appName('test_session').getOrCreate() + + +def make_robots_txt_record(warc_target_uri: str, response_text: str) -> ArcWarcRecord: + record = MagicMock() + record.rec_type = 'response' + d = {'WARC-Target-URI': warc_target_uri} + record.__getitem__.side_effect = d.__getitem__ + record.content_stream = lambda: BytesIO(response_text.encode('utf-8')) + return record + + +def test_well_formed_record(spark): + record = make_robots_txt_record("http://ajedrezhoygol.blogspot.com.ar/robots.txt", + """User-agent: Mediapartners-Google +Disallow: + +User-agent: * +Disallow: /search +Allow: / + +Sitemap: http://ajedrezhoygol.blogspot.com/sitemap.xml +""") + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + results = list(job.process_record(record)) + assert len(results) == 1 + assert results[0][0] == 'http://ajedrezhoygol.blogspot.com/sitemap.xml' + assert results[0][1] == ["ajedrezhoygol.blogspot.com.ar"] + + +def test_empty_record(spark): + record = make_robots_txt_record("http://agencasinosbobet5.weebly.com/robots.txt", +"""Sitemap: http://agencasinosbobet5.weebly.com/sitemap.xml + +User-agent: NerdyBot +Disallow: / + +User-agent: * +Disallow: /ajax/ +Disallow: /apps/ +""") + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + results = list(job.process_record(record)) + assert len(results) == 1 + assert results[0][0] == 'http://agencasinosbobet5.weebly.com/sitemap.xml' + assert results[0][1] == [] + + From 92f30c6890b5b49d278bc1ec0299c9125a99d314 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 17 Oct 2025 15:09:47 +0200 Subject: [PATCH 02/33] 
wip (back to basics on pyspark) --- sitemaps_from_robotstxt.py | 43 +++++-- sparkcc.py | 21 +++- test/test_sitemaps_from_robotstxt.py | 175 ++++++++++++++++++++++++++- 3 files changed, 225 insertions(+), 14 deletions(-) diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index e58ad4a..4d8b6a1 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -12,34 +12,60 @@ class SitemapExtractorJob(CCSparkJob): name = "SitemapExtractor" output_schema = StructType([ - StructField('key', StringType(), True), - StructField('val', StructType([ - StructField('hosts', ArrayType(StringType()), True) - ]), True) + StructField('sitemap_url', StringType(), True), + StructField('hosts', ArrayType(StringType()), True) ]) + #merge_method = 'combineByKey' + # rb: match on raw bytes so we can defer utf-8 decoding to the `sitemap:` line sitemap_pattern = re.compile(rb'^sitemap:\s*(\S+)', re.I) robots_txt_processed = None sitemap_urls_found = None + sitemap_url_invalid = None sitemap_url_invalid_encoding = None robots_txt_invalid_url = None robots_txt_announcing_sitemap = None robots_txt_with_more_than_50_sitemaps = None - def init_accumulators(self, session): super(SitemapExtractorJob, self).init_accumulators(session) sc = session.sparkContext self.robots_txt_processed = sc.accumulator(0) self.sitemap_urls_found = sc.accumulator(0) + self.sitemap_url_invalid = sc.accumulator(0) self.sitemap_url_invalid_encoding = sc.accumulator(0) self.robots_txt_invalid_url = sc.accumulator(0) self.robots_txt_announcing_sitemap = sc.accumulator(0) self.robots_txt_with_more_than_50_sitemaps = sc.accumulator(0) + def reduce_grouped_by_key_func(self, kv: tuple): + """Map sitemap URL to cross-submit hosts: + sitemap_url => [host_1, ..., host_n]""" + key, values = kv + try: + sitemap_uri = urlparse(key) + except Exception as url_parse_error: + try: + self.get_logger().warn('Invalid sitemap URL: %s - %s', key, url_parse_error) + except UnicodeEncodeError as unicode_error: + 
self.get_logger().warn('Invalid sitemap URL (cannot display): %s - %s', url_parse_error, unicode_error) + self.sitemap_url_invalid.add(1) + return + + sitemap_host = sitemap_uri.netloc.lower().lstrip('.') + cross_submit_hosts = set() + + for robots_txt_hosts in values: + for robots_txt_host in robots_txt_hosts: + if robots_txt_host != sitemap_host: + cross_submit_hosts.add(robots_txt_host) + + self.get_logger().warn(f'Cross submit hosts: {key} {cross_submit_hosts}') + yield key, list(cross_submit_hosts) + def process_record(self, record: ArcWarcRecord): """ emit: sitemap_url => [host] """ @@ -52,7 +78,8 @@ def process_record(self, record: ArcWarcRecord): host = None n_sitemaps = 0 - for raw_line in self.get_payload_stream(record).readlines(): + data = self.get_payload_stream(record).read() + for raw_line in data.splitlines(): raw_line = raw_line.strip() match = SitemapExtractorJob.sitemap_pattern.match(raw_line) @@ -69,7 +96,7 @@ def process_record(self, record: ArcWarcRecord): if url is None: # first sitemap found: set base URL and get host from URL - url = record['WARC-Target-URI'] + url = record.rec_headers['WARC-Target-URI'] try: host = urlparse(url).netloc.lower().lstrip('.') except Exception as url_parse_error: @@ -81,7 +108,7 @@ def process_record(self, record: ArcWarcRecord): str(url_parse_error), str(unicode_error)) self.robots_txt_invalid_url.add(1) # skip this robots.txt record - return None + return if not sitemap_url.startswith('http'): sitemap_url = urljoin(url, sitemap_url) diff --git a/sparkcc.py b/sparkcc.py index 15d7a70..739583c 100644 --- a/sparkcc.py +++ b/sparkcc.py @@ -6,6 +6,7 @@ from io import BytesIO from tempfile import SpooledTemporaryFile, TemporaryFile +from typing import Literal import boto3 import botocore @@ -34,6 +35,8 @@ class CCSparkJob(object): StructField("val", LongType(), True) ]) + merge_method: Literal['reduceValues', 'reduceValuesWithKeys'] = 'reduceValues' + # description of input and output shown by --help input_descr = 
"Path to file listing input paths" output_descr = "Name of output table (saved in spark.sql.warehouse.dir)" @@ -207,12 +210,19 @@ def log_accumulators(self, session): def reduce_by_key_func(a, b): return a + b + def reduce_grouped_by_key_func(self, kv: tuple): + return kv + def run_job(self, session): input_data = session.sparkContext.textFile(self.args.input, minPartitions=self.args.num_input_partitions) - output = input_data.mapPartitionsWithIndex(self.process_warcs) \ - .reduceByKey(self.reduce_by_key_func) + output = input_data.mapPartitionsWithIndex(self.process_warcs) + #self.get_logger().warning("merge method:", self.merge_method) + if self.merge_method == 'reduceValuesWithKeys': + output = output.groupByKey().map(lambda kv: self.reduce_grouped_by_key_func(kv)) + else: + output = output.reduceByKey(self.reduce_by_key_func) session.createDataFrame(output, schema=self.output_schema) \ .coalesce(self.args.num_output_partitions) \ @@ -608,8 +618,11 @@ def run_job(self, session): columns.append('content_charset') warc_recs = sqldf.select(*columns).rdd - output = warc_recs.mapPartitions(self.fetch_process_warc_records) \ - .reduceByKey(self.reduce_by_key_func) + output = warc_recs.mapPartitions(self.fetch_process_warc_records) + if self.merge_method == 'reduceValuesWithKeys': + output = output.groupByKey().map(self.reduce_grouped_by_key_func) + else: + output = output.reduceByKey(self.reduce_by_key_func) session.createDataFrame(output, schema=self.output_schema) \ .coalesce(self.args.num_output_partitions) \ diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index 03008a9..bfce2e3 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -16,8 +16,7 @@ def spark(): def make_robots_txt_record(warc_target_uri: str, response_text: str) -> ArcWarcRecord: record = MagicMock() record.rec_type = 'response' - d = {'WARC-Target-URI': warc_target_uri} - record.__getitem__.side_effect = d.__getitem__ 
+ record.rec_headers = {'WARC-Target-URI': warc_target_uri} record.content_stream = lambda: BytesIO(response_text.encode('utf-8')) return record @@ -60,3 +59,175 @@ def test_empty_record(spark): assert results[0][1] == [] +def test_different_host_record(spark): + record = make_robots_txt_record( + 'http://177.52.3535.ru/robots.txt', +''' +User-agent: Yandex +Disallow: /bitrix/ +Disallow: /upload/ +Disallow: /detsad/ +Disallow: /videouroki/ +Disallow: /humor/ +Disallow: /radio/ +Disallow: /recepts/ +Disallow: /school_life/ +Disallow: /workgroups/ +Disallow: /institutions/ +Disallow: /kindergarten/ +Disallow: /unified-state-exam/ +Disallow: /ideas/ +Disallow: /documents/ +Disallow: /videosearch/ +Disallow: /auth/ +Disallow: /demotivators/ +Disallow: /additional-education/ +Disallow: /admission/ +Disallow: /random/ +Disallow: /horoscope/ +Disallow: /monitoring-in-the-sphere-of-education/ +Disallow: /votes/ +Disallow: /news/ +Disallow: /clips/ +Disallow: /preschool-education/ +Disallow: /movies/ +Disallow: /TV/ +Disallow: /dreambook/ +Disallow: /about/ +Disallow: /company/ +Disallow: /edit/ +Disallow: /com-docs/ +Disallow: /professional-education/ +Disallow: /vs.php +Disallow: /index-old.php +Disallow: /404.php +Disallow: /suz/ +Disallow: /school-education/ +Disallow: /municipal-education-authorities/ +Disallow: /com-about/ +Disallow: /parents/ +Disallow: /view/ +Disallow: /stat/ +Disallow: /quotes/ +Disallow: /region/ +Disallow: /students/ +Disallow: /graduates/ +Disallow: /job/ +Disallow: /auth.php +Disallow: /search/ +Disallow: /search/ +Disallow: /auth/ +Disallow: /auth.php +Disallow: /personal/ +Disallow: /*?print= +Disallow: /*&print= +Disallow: /*register=yes +Disallow: /*forgot_password=yes +Disallow: /*change_password=yes +Disallow: /*login=yes +Disallow: /*logout=yes +Disallow: /*auth=yes +Disallow: /*action=ADD_TO_COMPARE_LIST +Disallow: /*action=DELETE_FROM_COMPARE_LIST +Disallow: /*action=ADD2BASKET +Disallow: /*action=BUY +Disallow: /*bitrix_*= +Disallow: 
/*backurl=* +Disallow: /*BACKURL=* +Disallow: /*back_url=* +Disallow: /*BACK_URL=* +Disallow: /*back_url_admin=* +Disallow: /*index.php$ +Disallow: /*?* + + +User-agent: * +Disallow: /bitrix/ +Disallow: /upload/ +Disallow: /detsad/ +Disallow: /humor/ +Disallow: /videouroki/ +Disallow: /radio/ +Disallow: /recepts/ +Disallow: /school_life/ +Disallow: /workgroups/ +Disallow: /institutions/ +Disallow: /kindergarten/ +Disallow: /unified-state-exam/ +Disallow: /ideas/ +Disallow: /documents/ +Disallow: /videosearch/ +Disallow: /auth/ +Disallow: /demotivators/ +Disallow: /additional-education/ +Disallow: /admission/ +Disallow: /random/ +Disallow: /horoscope/ +Disallow: /monitoring-in-the-sphere-of-education/ +Disallow: /votes/ +Disallow: /news/ +Disallow: /clips/ +Disallow: /preschool-education/ +Disallow: /movies/ +Disallow: /TV/ +Disallow: /dreambook/ +Disallow: /about/ +Disallow: /company/ +Disallow: /edit/ +Disallow: /com-docs/ +Disallow: /professional-education/ +Disallow: /vs.php +Disallow: /index-old.php +Disallow: /404.php +Disallow: /suz/ +Disallow: /school-education/ +Disallow: /municipal-education-authorities/ +Disallow: /com-about/ +Disallow: /parents/ +Disallow: /view/ +Disallow: /stat/ +Disallow: /quotes/ +Disallow: /region/ +Disallow: /students/ +Disallow: /graduates/ +Disallow: /job/ +Disallow: /auth.php +Disallow: /search/ +Disallow: /search/ +Disallow: /auth/ +Disallow: /auth.php +Disallow: /personal/ +Disallow: /*?print= +Disallow: /*&print= +Disallow: /*register=yes +Disallow: /*forgot_password=yes +Disallow: /*change_password=yes +Disallow: /*login=yes +Disallow: /*logout=yes +Disallow: /*auth=yes +Disallow: /*action=ADD_TO_COMPARE_LIST +Disallow: /*action=DELETE_FROM_COMPARE_LIST +Disallow: /*action=ADD2BASKET +Disallow: /*action=BUY +Disallow: /*bitrix_*= +Disallow: /*backurl=* +Disallow: /*BACKURL=* +Disallow: /*back_url=* +Disallow: /*BACK_URL=* +Disallow: /*back_url_admin=* +Disallow: /*index.php$ +Disallow: /*?* +Host: 3535.ru + + +Sitemap: 
http://3535.ru/sitemap_000.xml +''' + ) + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + results = list(job.process_record(record)) + assert len(results) == 1 + assert results[0][0] == 'http://3535.ru/sitemap_000.xml' + assert results[0][1] == ['177.52.3535.ru'] + + From fed0b2c5e7de2167608ba0082d5b917a1ea80b84 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 17 Oct 2025 16:01:08 +0200 Subject: [PATCH 03/33] identical output with different format and sort order --- get-data.sh | 6 ++++ sitemaps_from_robotstxt.py | 59 +++++++++++++++++++++----------------- sparkcc.py | 21 ++++++++------ 3 files changed, 52 insertions(+), 34 deletions(-) diff --git a/get-data.sh b/get-data.sh index e8e71b6..de776a5 100755 --- a/get-data.sh +++ b/get-data.sh @@ -45,3 +45,9 @@ for data_type in warc wat wet; do done +echo "Downloading sample robots.txt file..." +ccfile='crawl-data/CC-MAIN-2017-30/segments/1500549423183.57/robotstxt/CC-MAIN-20170720121902-20170720141902-00000.warc.gz' +mkdir -p `dirname "$ccfile"` +wget --no-clobber https://data.commoncrawl.org/$ccfile -O $ccfile + + diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index 4d8b6a1..dce3a95 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -1,5 +1,6 @@ import re -from urllib.parse import urlparse, urljoin +from typing import Optional +from urllib.parse import urlparse, urljoin, ParseResult from pyspark.sql.types import StructType, StructField, StringType, ArrayType from warcio.recordloader import ArcWarcRecord @@ -16,7 +17,7 @@ class SitemapExtractorJob(CCSparkJob): StructField('hosts', ArrayType(StringType()), True) ]) - #merge_method = 'combineByKey' + merge_method = 'reduce_group_by_key' # rb: match on raw bytes so we can defer utf-8 decoding to the `sitemap:` line sitemap_pattern = re.compile(rb'^sitemap:\s*(\S+)', re.I) @@ -41,30 +42,24 @@ def init_accumulators(self, session): self.robots_txt_announcing_sitemap = sc.accumulator(0) 
self.robots_txt_with_more_than_50_sitemaps = sc.accumulator(0) - def reduce_grouped_by_key_func(self, kv: tuple): + @staticmethod + def reduce_group_by_key_func(kv: tuple): """Map sitemap URL to cross-submit hosts: sitemap_url => [host_1, ..., host_n]""" - key, values = kv + sitemap_uri, hosts_lists = kv try: - sitemap_uri = urlparse(key) - except Exception as url_parse_error: - try: - self.get_logger().warn('Invalid sitemap URL: %s - %s', key, url_parse_error) - except UnicodeEncodeError as unicode_error: - self.get_logger().warn('Invalid sitemap URL (cannot display): %s - %s', url_parse_error, unicode_error) - self.sitemap_url_invalid.add(1) - return + sitemap_host = urlparse(sitemap_uri).netloc.lower().lstrip('.') + except Exception as e: + raise RuntimeError("A sitemap URI somehow made it through the initial parsing phase, this shouldn't happen: " + repr(e)) - sitemap_host = sitemap_uri.netloc.lower().lstrip('.') cross_submit_hosts = set() - for robots_txt_hosts in values: + for robots_txt_hosts in hosts_lists: for robots_txt_host in robots_txt_hosts: if robots_txt_host != sitemap_host: cross_submit_hosts.add(robots_txt_host) - self.get_logger().warn(f'Cross submit hosts: {key} {cross_submit_hosts}') - yield key, list(cross_submit_hosts) + return sitemap_uri, list(cross_submit_hosts) def process_record(self, record: ArcWarcRecord): @@ -94,22 +89,21 @@ def process_record(self, record: ArcWarcRecord): self.sitemap_url_invalid_encoding.add(1) continue + if self._try_parse_url(sitemap_url, label_for_log='sitemap') is None: + self.sitemap_url_invalid.add(1) + continue + if url is None: # first sitemap found: set base URL and get host from URL url = record.rec_headers['WARC-Target-URI'] - try: - host = urlparse(url).netloc.lower().lstrip('.') - except Exception as url_parse_error: - try: - self.get_logger().warning('Invalid robots.txt URL: %s - %s', - url, str(url_parse_error)) - except UnicodeEncodeError as unicode_error: - self.get_logger().warning('Invalid 
robots.txt URL (cannot be displayed) - %s - %', - str(url_parse_error), str(unicode_error)) - self.robots_txt_invalid_url.add(1) + url_parsed = self._try_parse_url(url, label_for_log='robots.txt') + if url_parsed is None: # skip this robots.txt record + self.robots_txt_invalid_url.add(1) return + host = url_parsed.netloc.lower().lstrip('.') + if not sitemap_url.startswith('http'): sitemap_url = urljoin(url, sitemap_url) @@ -121,6 +115,19 @@ def process_record(self, record: ArcWarcRecord): if n_sitemaps > 50: self.robots_txt_with_more_than_50_sitemaps.add(1) + + def _try_parse_url(self, url, label_for_log) -> Optional[ParseResult]: + try: + return urlparse(url) + except Exception as url_parse_error: + try: + self.get_logger().warn('Invalid %s URL: %s - %s', label_for_log, url, url_parse_error) + except UnicodeEncodeError as unicode_error: + self.get_logger().warn('Invalid %s URL (cannot be displayed): %s - %s', + label_for_log, url_parse_error, unicode_error) + return None + + if __name__ == '__main__': job = SitemapExtractorJob() job.run() diff --git a/sparkcc.py b/sparkcc.py index 739583c..8dc55dc 100644 --- a/sparkcc.py +++ b/sparkcc.py @@ -35,7 +35,7 @@ class CCSparkJob(object): StructField("val", LongType(), True) ]) - merge_method: Literal['reduceValues', 'reduceValuesWithKeys'] = 'reduceValues' + merge_method: Literal['reduce_by_key', 'reduce_group_by_key'] = 'reduce_by_key' # description of input and output shown by --help input_descr = "Path to file listing input paths" @@ -210,7 +210,8 @@ def log_accumulators(self, session): def reduce_by_key_func(a, b): return a + b - def reduce_grouped_by_key_func(self, kv: tuple): + @staticmethod + def reduce_group_by_key_func(kv: tuple): return kv def run_job(self, session): @@ -219,10 +220,12 @@ def run_job(self, session): output = input_data.mapPartitionsWithIndex(self.process_warcs) #self.get_logger().warning("merge method:", self.merge_method) - if self.merge_method == 'reduceValuesWithKeys': - output = 
output.groupByKey().map(lambda kv: self.reduce_grouped_by_key_func(kv)) - else: + if self.merge_method == 'reduce_group_by_key': + output = output.groupByKey().map(self.reduce_group_by_key_func) + elif self.merge_method == 'reduce_by_key': output = output.reduceByKey(self.reduce_by_key_func) + else: + raise ValueError(f"Unknown merge method: {self.merge_method}") session.createDataFrame(output, schema=self.output_schema) \ .coalesce(self.args.num_output_partitions) \ @@ -619,10 +622,12 @@ def run_job(self, session): warc_recs = sqldf.select(*columns).rdd output = warc_recs.mapPartitions(self.fetch_process_warc_records) - if self.merge_method == 'reduceValuesWithKeys': - output = output.groupByKey().map(self.reduce_grouped_by_key_func) - else: + if self.merge_method == 'reduce_group_by_key': + output = output.groupByKey().map(self.reduce_group_by_key_func) + elif self.merge_method == 'reduce_by_key': output = output.reduceByKey(self.reduce_by_key_func) + else: + raise ValueError(f"Unknown merge method: {self.merge_method}") session.createDataFrame(output, schema=self.output_schema) \ .coalesce(self.args.num_output_partitions) \ From 9bb3f41a38692b93f9854c2b6cd3ec5d17cb25f5 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 17 Oct 2025 16:40:58 +0200 Subject: [PATCH 04/33] typo --- sitemaps_from_robotstxt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index dce3a95..fb5660b 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -50,7 +50,7 @@ def reduce_group_by_key_func(kv: tuple): try: sitemap_host = urlparse(sitemap_uri).netloc.lower().lstrip('.') except Exception as e: - raise RuntimeError("A sitemap URI somehow made it through the initial parsing phase, this shouldn't happen: " + repr(e)) + raise RuntimeError("A invalid sitemap URI somehow made it through the initial parsing phase, this shouldn't happen: " + repr(e)) cross_submit_hosts = set() From 
458cb2eca1dcbe6de96ac1146c2cc30cc8c01fff Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 17 Oct 2025 17:19:46 +0200 Subject: [PATCH 05/33] fix failing tests by spinning up a proper spark session and RDD --- test/test_sitemaps_from_robotstxt.py | 144 +++++++++++++++++++++++---- test/utils.py | 5 + 2 files changed, 131 insertions(+), 18 deletions(-) create mode 100644 test/utils.py diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index bfce2e3..8262655 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -7,6 +7,8 @@ from warcio.recordloader import ArcWarcRecord from sitemaps_from_robotstxt import SitemapExtractorJob +from utils import _process_jobs + @pytest.fixture(scope='session') def spark(): @@ -40,24 +42,6 @@ def test_well_formed_record(spark): assert results[0][1] == ["ajedrezhoygol.blogspot.com.ar"] -def test_empty_record(spark): - record = make_robots_txt_record("http://agencasinosbobet5.weebly.com/robots.txt", -"""Sitemap: http://agencasinosbobet5.weebly.com/sitemap.xml - -User-agent: NerdyBot -Disallow: / - -User-agent: * -Disallow: /ajax/ -Disallow: /apps/ -""") - job = SitemapExtractorJob() - job.init_accumulators(session=spark) - results = list(job.process_record(record)) - assert len(results) == 1 - assert results[0][0] == 'http://agencasinosbobet5.weebly.com/sitemap.xml' - assert results[0][1] == [] - def test_different_host_record(spark): record = make_robots_txt_record( @@ -231,3 +215,127 @@ def test_different_host_record(spark): assert results[0][1] == ['177.52.3535.ru'] + + +def test_host_accumulation_empty(spark): + """ + Test accumulation of hosts when sitemap url host and robots.txt url host match + Requires test/ on PYTHONPATH so utils._process_jobs can be imported + """ + + record = make_robots_txt_record("http://agencasinosbobet5.weebly.com/robots.txt", +"""Sitemap: http://agencasinosbobet5.weebly.com/sitemap.xml + +User-agent: NerdyBot +Disallow: / + 
+User-agent: * +Disallow: /ajax/ +Disallow: /apps/ +""") + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + + records = [record] + rdd = spark.sparkContext.parallelize(records) + _process_jobs_partial = lambda partition_index, records: _process_jobs(partition_index, records, job=job) + output = rdd.mapPartitionsWithIndex(_process_jobs_partial) + output = output.groupByKey().map(SitemapExtractorJob.reduce_group_by_key_func).collect() + + assert len(output) == 1 + assert output[0][0] == 'http://agencasinosbobet5.weebly.com/sitemap.xml' + assert output[0][1] == [] + + +def test_host_accumulation_multi(spark): + """ + Test accumulation of multiple hosts for same sitemap URL from different robots.txt records + Requires test/ on PYTHONPATH so utils._process_jobs can be imported + """ + + multi_robots_txt_data = [ + ( + "http://the-mayflower-hotel-autograph-collection-washington.ibooked.com.br/robots.txt", + """User-Agent: * +Allow: / +Disallow: /reviewpage/ +Disallow: /ajax/ +Disallow: /?page=stat +Disallow: /?page=hotel_ajax +Disallow: /?page=hotellist_json +Disallow: /reviewpage2/ + + +User-agent: Yandex +Host: nochi.com +Allow: / +Disallow: /reviewpage/ + +Disallow: /ajax/ +Disallow: /?page=stat +Disallow: /?page=hotel_ajax +Disallow: /?page=hotellist_json +Sitemap: http://nochi.com/data/sitemaps/ru_index.xml +""" + ), + ( + "http://the-rockies-condominiums-steamboat-springs.booked.net/robots.txt", + """User-Agent: * +Allow: / +Disallow: /reviewpage/ +Disallow: /ajax/ +Disallow: /?page=stat +Disallow: /?page=hotel_ajax +Disallow: /?page=hotellist_json +Disallow: /reviewpage2/ + + +User-agent: Yandex +Host: nochi.com +Allow: / +Disallow: /reviewpage/ + +Disallow: /ajax/ +Disallow: /?page=stat +Disallow: /?page=hotel_ajax +Disallow: /?page=hotellist_json +Sitemap: http://nochi.com/data/sitemaps/ru_index.xml +""", + ), + ( + "http://hotel-flora-venice.booked.kr/robots.txt", + """User-Agent: * +Allow: / +Disallow: /reviewpage/ +Disallow: /ajax/ 
+Disallow: /?page=stat +Disallow: /?page=hotel_ajax +Disallow: /?page=hotellist_json +Disallow: /reviewpage2/ + + +User-agent: Yandex +Host: nochi.com +Allow: / +Disallow: /reviewpage/ + +Disallow: /ajax/ +Disallow: /?page=stat +Disallow: /?page=hotel_ajax +Disallow: /?page=hotellist_json +Sitemap: http://nochi.com/data/sitemaps/ru_index.xml +""" + ) + ] + + records = [make_robots_txt_record(robots_txt_url, robots_txt_content) + for robots_txt_url, robots_txt_content in multi_robots_txt_data] + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + rdd = spark.sparkContext.parallelize(records) + _process_jobs_partial = lambda partition_index, records: _process_jobs(partition_index, records, job=job) + output = rdd.mapPartitionsWithIndex(_process_jobs_partial) + output = output.groupByKey().map(SitemapExtractorJob.reduce_group_by_key_func).collect() + assert len(output) == 1 + assert output[0][0] == 'http://nochi.com/data/sitemaps/ru_index.xml' + assert sorted(output[0][1]) == sorted(["the-mayflower-hotel-autograph-collection-washington.ibooked.com.br","the-rockies-condominiums-steamboat-springs.booked.net","hotel-flora-venice.booked.kr"]) diff --git a/test/utils.py b/test/utils.py new file mode 100644 index 0000000..3c16f02 --- /dev/null +++ b/test/utils.py @@ -0,0 +1,5 @@ + +def _process_jobs(partition_index, records, job): + for record in records: + for result in job.process_record(record): + yield result From ada147175119bfb7d7e0941ebd80f1f48e564cb6 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Mon, 20 Oct 2025 11:38:58 +0200 Subject: [PATCH 06/33] exactly match cc-mrjob output Signed-off-by: Damian Stewart --- sitemaps_from_robotstxt.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index fb5660b..7250e01 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -1,3 +1,4 @@ +import json import re from typing import Optional from 
urllib.parse import urlparse, urljoin, ParseResult @@ -14,7 +15,7 @@ class SitemapExtractorJob(CCSparkJob): output_schema = StructType([ StructField('sitemap_url', StringType(), True), - StructField('hosts', ArrayType(StringType()), True) + StructField('hosts', StringType(), True) ]) merge_method = 'reduce_group_by_key' @@ -59,7 +60,14 @@ def reduce_group_by_key_func(kv: tuple): if robots_txt_host != sitemap_host: cross_submit_hosts.add(robots_txt_host) - return sitemap_uri, list(cross_submit_hosts) + return sitemap_uri, json.dumps(list(cross_submit_hosts)) + + + def add_arguments(self, parser): + super(SitemapExtractorJob, self).add_arguments(parser) + # set output options to match old cc-mrjob output + parser.set_defaults(output_option=['sep=\t', 'escapeQuotes=false', 'header=false']) + parser.set_defaults(output_format='csv') def process_record(self, record: ArcWarcRecord): From 1f82d33d20fec2f0d8d857f1c575a6687554ab95 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Mon, 20 Oct 2025 12:19:22 +0200 Subject: [PATCH 07/33] wip content encoding and invalid url tests --- requirements.txt | 3 + sitemaps_from_robotstxt.py | 40 +++-- test/test_sitemaps_from_robotstxt.py | 216 ++++++++++++++++++++++++++- 3 files changed, 239 insertions(+), 20 deletions(-) diff --git a/requirements.txt b/requirements.txt index dd56089..ab8d7be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,9 @@ ujson orjson warcio +# for validating URLs in robots.txt: +validators + # for link extraction and webgraph construction also: idna diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index 7250e01..7d3ce33 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -1,8 +1,9 @@ import json import re from typing import Optional -from urllib.parse import urlparse, urljoin, ParseResult +from urllib.parse import urlparse, urljoin +import validators from pyspark.sql.types import StructType, StructField, StringType, ArrayType from warcio.recordloader 
import ArcWarcRecord @@ -92,25 +93,24 @@ def process_record(self, record: ArcWarcRecord): n_sitemaps += 1 try: sitemap_url = sitemap_url.decode("utf-8", "strict") - except UnicodeEncodeError: + except UnicodeDecodeError: # invalid encoding, ignore self.sitemap_url_invalid_encoding.add(1) continue - if self._try_parse_url(sitemap_url, label_for_log='sitemap') is None: + if not self._is_valid_url(sitemap_url, label_for_log='sitemap'): self.sitemap_url_invalid.add(1) continue if url is None: # first sitemap found: set base URL and get host from URL url = record.rec_headers['WARC-Target-URI'] - url_parsed = self._try_parse_url(url, label_for_log='robots.txt') - if url_parsed is None: + if not self._is_valid_url(url, label_for_log='robots.txt'): # skip this robots.txt record self.robots_txt_invalid_url.add(1) return - host = url_parsed.netloc.lower().lstrip('.') + host = urlparse(url).netloc.lower().lstrip('.') if not sitemap_url.startswith('http'): sitemap_url = urljoin(url, sitemap_url) @@ -124,16 +124,28 @@ def process_record(self, record: ArcWarcRecord): self.robots_txt_with_more_than_50_sitemaps.add(1) - def _try_parse_url(self, url, label_for_log) -> Optional[ParseResult]: + def _is_valid_url(self, url, label_for_log) -> bool: + """Validate URL using validators.url and log if invalid.""" try: - return urlparse(url) - except Exception as url_parse_error: + result = validators.url(url) + # validators.url returns True for valid URLs, ValidationError for invalid + if result is True: + return True + else: + # ValidationError object returned - convert to string for logging + try: + self.get_logger().warn('Invalid %s URL: %s - %s', label_for_log, url, str(result)) + except Exception: + # If logging fails, just continue without logging + pass + return False + except Exception as e: try: - self.get_logger().warn('Invalid %s URL: %s - %s', label_for_log, url, url_parse_error) - except UnicodeEncodeError as unicode_error: - self.get_logger().warn('Invalid %s URL (cannot be 
displayed): %s - %s', - label_for_log, url_parse_error, unicode_error) - return None + self.get_logger().warn('Invalid %s URL: %s - %s', label_for_log, url, str(e)) + except Exception: + # If logging fails, just continue without logging + pass + return False if __name__ == '__main__': diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index 8262655..4d6ed97 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -1,7 +1,8 @@ +import json from io import BytesIO import pytest -from unittest.mock import MagicMock +from unittest.mock import MagicMock, Mock from pyspark.sql import SparkSession from warcio.recordloader import ArcWarcRecord @@ -15,11 +16,17 @@ def spark(): return SparkSession.builder.appName('test_session').getOrCreate() -def make_robots_txt_record(warc_target_uri: str, response_text: str) -> ArcWarcRecord: +def make_robots_txt_record(warc_target_uri: str, response_text: str, + response_text_encoding='utf-8', + warc_target_uri_is_invalid=False) -> Mock: record = MagicMock() record.rec_type = 'response' - record.rec_headers = {'WARC-Target-URI': warc_target_uri} - record.content_stream = lambda: BytesIO(response_text.encode('utf-8')) + if warc_target_uri_is_invalid: + # Create an invalid URL that will cause urlparse to fail + record.rec_headers = {'WARC-Target-URI': warc_target_uri} + else: + record.rec_headers = {'WARC-Target-URI': warc_target_uri} + record.content_stream = lambda: BytesIO(response_text.encode(response_text_encoding)) return record @@ -244,7 +251,7 @@ def test_host_accumulation_empty(spark): assert len(output) == 1 assert output[0][0] == 'http://agencasinosbobet5.weebly.com/sitemap.xml' - assert output[0][1] == [] + assert output[0][1] == '[]' def test_host_accumulation_multi(spark): @@ -338,4 +345,201 @@ def test_host_accumulation_multi(spark): output = output.groupByKey().map(SitemapExtractorJob.reduce_group_by_key_func).collect() assert len(output) == 1 
assert output[0][0] == 'http://nochi.com/data/sitemaps/ru_index.xml' - assert sorted(output[0][1]) == sorted(["the-mayflower-hotel-autograph-collection-washington.ibooked.com.br","the-rockies-condominiums-steamboat-springs.booked.net","hotel-flora-venice.booked.kr"]) + assert sorted(json.loads(output[0][1])) == sorted(["the-mayflower-hotel-autograph-collection-washington.ibooked.com.br","the-rockies-condominiums-steamboat-springs.booked.net","hotel-flora-venice.booked.kr"]) + + +def test_wrong_encoding_utf16_record(spark): + record = make_robots_txt_record("http://ajedrezhoygol.blogspot.com.ar/robots.txt", + """User-agent: Mediapartners-Google +Disallow: + +User-agent: * +Disallow: /search +Allow: / + +Sitemap: http://ajedrezhoygol.blogspot.com/sitemap.xml +""", response_text_encoding='utf-16') + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + results = list(job.process_record(record)) + assert len(results) == 0 + + +def test_robots_txt_invalid_url_malformed(spark): + """Test that robots_txt_invalid_url increments when robots.txt URL causes urlparse to fail""" + # urlparse will raise AttributeError when given a non-string type in rec_headers + # We need to mock the record more carefully to trigger an actual exception + record = MagicMock() + record.rec_type = 'response' + # Pass an integer instead of string to cause AttributeError in urlparse + record.rec_headers = {'WARC-Target-URI': 12345} # Non-string type + record.content_stream = lambda: BytesIO(b"""User-agent: * +Disallow: / + +Sitemap: http://example.com/sitemap.xml +""") + + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + + # Mock the logger to avoid serialization issues with Spark's Java logger + job.get_logger = lambda: MagicMock() + + results = list(job.process_record(record)) + # Should return early due to invalid robots.txt URL + assert len(results) == 0 + assert job.robots_txt_invalid_url.value == 1 + + +def 
test_robots_txt_invalid_url_unparseable_netloc(spark): + """Test that robots_txt_invalid_url increments when robots.txt URL is a list""" + # Another way to trigger urlparse exception with a non-string type + record = MagicMock() + record.rec_type = 'response' + record.rec_headers = {'WARC-Target-URI': ['http://example.com']} # List will cause TypeError + record.content_stream = lambda: BytesIO(b"""User-agent: * +Disallow: /admin/ + +Sitemap: http://valid-example.com/sitemap.xml +Sitemap: http://valid-example.com/sitemap2.xml +""") + + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + + # Mock the logger to avoid serialization issues + job.get_logger = lambda: MagicMock() + + results = list(job.process_record(record)) + # Should return early due to invalid robots.txt URL + assert len(results) == 0 + assert job.robots_txt_invalid_url.value == 1 + + +def test_robots_txt_invalid_punycode_url(spark): + """Test handling of invalid punycode domain in robots.txt URL""" + # xn--foo is an invalid punycode domain (incomplete encoding) + # validators.url properly rejects it, so this tests that invalid punycode is caught + record = make_robots_txt_record("http://xn--foo/robots.txt", + """User-agent: * +Disallow: / + +Sitemap: http://example.com/sitemap.xml +""") + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + results = list(job.process_record(record)) + # validators.url properly detects invalid punycode and rejects it + # So the robots.txt URL is invalid and no results are returned + assert len(results) == 0 + assert job.robots_txt_invalid_url.value == 1 + + +def test_sitemap_url_invalid_encoding_latin1(spark): + """Test that sitemap_url_invalid_encoding increments for non-UTF8 sitemap URLs""" + # Create a robots.txt with a sitemap URL containing Latin-1 bytes that aren't valid UTF-8 + # The byte sequence \xe9 is é in Latin-1 but invalid in UTF-8 when standalone + robots_txt_bytes = b"""User-agent: * +Disallow: / + +Sitemap: 
http://example.com/sitemap_caf\xe9.xml +""" + record = MagicMock() + record.rec_type = 'response' + record.rec_headers = {'WARC-Target-URI': 'http://example.com/robots.txt'} + record.content_stream = lambda: BytesIO(robots_txt_bytes) + + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + results = list(job.process_record(record)) + assert len(results) == 0 + assert job.sitemap_url_invalid_encoding.value == 1 + + +def test_sitemap_url_invalid_encoding_mixed_bytes(spark): + """Test that sitemap_url_invalid_encoding increments for mixed invalid byte sequences""" + # Create a robots.txt with multiple sitemap URLs, one with invalid UTF-8 + # The byte sequence \xff\xfe is not valid UTF-8 + robots_txt_bytes = b"""User-agent: * +Disallow: /search + +Sitemap: http://example.com/good_sitemap.xml +Sitemap: http://example.com/bad\xff\xfe_sitemap.xml +Sitemap: http://example.com/another_good.xml +""" + record = MagicMock() + record.rec_type = 'response' + record.rec_headers = {'WARC-Target-URI': 'http://example.com/robots.txt'} + record.content_stream = lambda: BytesIO(robots_txt_bytes) + + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + results = list(job.process_record(record)) + # Should get 2 valid sitemaps and 1 invalid encoding + assert len(results) == 2 + assert job.sitemap_url_invalid_encoding.value == 1 + assert job.sitemap_urls_found.value == 3 # All 3 matched the pattern + + +def test_sitemap_url_invalid_malformed_url(spark): + """Test that sitemap_url_invalid increments when sitemap URL causes validation to fail""" + robots_txt_bytes = b"""User-agent: * +Disallow: / + +Sitemap: http://example.com/sitemap.xml +""" + record = MagicMock() + record.rec_type = 'response' + record.rec_headers = {'WARC-Target-URI': 'http://example.com/robots.txt'} + record.content_stream = lambda: BytesIO(robots_txt_bytes) + + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + + # Mock _is_valid_url to return False for sitemap URLs 
(simulating validation failure) + original_is_valid = job._is_valid_url + def mock_is_valid(url, label_for_log): + if label_for_log == 'sitemap': + return False # Simulate validation failure for sitemap URLs + return original_is_valid(url, label_for_log) + + job._is_valid_url = mock_is_valid + + results = list(job.process_record(record)) + assert len(results) == 0 + assert job.sitemap_url_invalid.value == 1 + assert job.sitemap_urls_found.value == 1 + + +def test_sitemap_url_invalid_unparseable_scheme(spark): + """Test that sitemap_url_invalid increments for multiple unparseable sitemap URLs""" + robots_txt_bytes = b"""User-agent: * +Disallow: /admin/ + +Sitemap: http://valid.com/sitemap1.xml +Sitemap: http://broken.com/sitemap.xml +Sitemap: http://valid.com/sitemap2.xml +""" + record = MagicMock() + record.rec_type = 'response' + record.rec_headers = {'WARC-Target-URI': 'http://example.com/robots.txt'} + record.content_stream = lambda: BytesIO(robots_txt_bytes) + + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + + # Mock _is_valid_url to return False for specific sitemap URLs + original_is_valid = job._is_valid_url + def mock_is_valid(url, label_for_log): + if label_for_log == 'sitemap' and 'broken.com' in url: + return False # Simulate validation failure for broken.com + return original_is_valid(url, label_for_log) + + job._is_valid_url = mock_is_valid + + results = list(job.process_record(record)) + # Should get 2 valid sitemaps and 1 invalid + assert len(results) == 2 + assert job.sitemap_url_invalid.value == 1 + assert job.sitemap_urls_found.value == 3 + From 35de5cac821c6da87ffa73b8d6bbe6f99145ff59 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Mon, 20 Oct 2025 14:54:20 +0200 Subject: [PATCH 08/33] Add test for >50 sitemaps; add accumulator checks to tests Signed-off-by: Damian Stewart --- test/test_sitemaps_from_robotstxt.py | 259 +++++++++++++++------------ 1 file changed, 147 insertions(+), 112 deletions(-) diff --git 
a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index 4d6ed97..2471ade 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -16,17 +16,16 @@ def spark(): return SparkSession.builder.appName('test_session').getOrCreate() -def make_robots_txt_record(warc_target_uri: str, response_text: str, - response_text_encoding='utf-8', - warc_target_uri_is_invalid=False) -> Mock: +def make_robots_txt_record(warc_target_uri, + response_bytes) -> Mock: + """ + Create a mock robots.txt WARC record for testing. + """ record = MagicMock() record.rec_type = 'response' - if warc_target_uri_is_invalid: - # Create an invalid URL that will cause urlparse to fail - record.rec_headers = {'WARC-Target-URI': warc_target_uri} - else: - record.rec_headers = {'WARC-Target-URI': warc_target_uri} - record.content_stream = lambda: BytesIO(response_text.encode(response_text_encoding)) + record.rec_headers = {'WARC-Target-URI': warc_target_uri} + record.content_stream = lambda: BytesIO(response_bytes) + return record @@ -40,14 +39,20 @@ def test_well_formed_record(spark): Allow: / Sitemap: http://ajedrezhoygol.blogspot.com/sitemap.xml -""") +""".encode('utf-8')) job = SitemapExtractorJob() job.init_accumulators(session=spark) results = list(job.process_record(record)) assert len(results) == 1 assert results[0][0] == 'http://ajedrezhoygol.blogspot.com/sitemap.xml' assert results[0][1] == ["ajedrezhoygol.blogspot.com.ar"] - + assert job.sitemap_urls_found.value == 1 + assert job.robots_txt_invalid_url.value == 0 + assert job.sitemap_url_invalid.value == 0 + assert job.robots_txt_invalid_url.value == 0 + assert job.robots_txt_announcing_sitemap.value == 1 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert job.robots_txt_processed.value == 1 def test_different_host_record(spark): @@ -212,7 +217,7 @@ def test_different_host_record(spark): Sitemap: http://3535.ru/sitemap_000.xml -''' +'''.encode('utf-8') ) job = 
SitemapExtractorJob() job.init_accumulators(session=spark) @@ -239,7 +244,7 @@ def test_host_accumulation_empty(spark): User-agent: * Disallow: /ajax/ Disallow: /apps/ -""") +""".encode('utf-8')) job = SitemapExtractorJob() job.init_accumulators(session=spark) @@ -252,6 +257,13 @@ def test_host_accumulation_empty(spark): assert len(output) == 1 assert output[0][0] == 'http://agencasinosbobet5.weebly.com/sitemap.xml' assert output[0][1] == '[]' + assert job.sitemap_urls_found.value == 1 + assert job.sitemap_url_invalid.value == 0 + assert job.sitemap_url_invalid_encoding.value == 0 + assert job.robots_txt_invalid_url.value == 0 + assert job.robots_txt_announcing_sitemap.value == 1 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert job.robots_txt_processed.value == 1 def test_host_accumulation_multi(spark): @@ -283,7 +295,7 @@ def test_host_accumulation_multi(spark): Disallow: /?page=hotel_ajax Disallow: /?page=hotellist_json Sitemap: http://nochi.com/data/sitemaps/ru_index.xml -""" +""".encode('utf-8') ), ( "http://the-rockies-condominiums-steamboat-springs.booked.net/robots.txt", @@ -307,7 +319,7 @@ def test_host_accumulation_multi(spark): Disallow: /?page=hotel_ajax Disallow: /?page=hotellist_json Sitemap: http://nochi.com/data/sitemaps/ru_index.xml -""", +""".encode('utf-8'), ), ( "http://hotel-flora-venice.booked.kr/robots.txt", @@ -331,7 +343,7 @@ def test_host_accumulation_multi(spark): Disallow: /?page=hotel_ajax Disallow: /?page=hotellist_json Sitemap: http://nochi.com/data/sitemaps/ru_index.xml -""" +""".encode('utf-8') ) ] @@ -346,6 +358,13 @@ def test_host_accumulation_multi(spark): assert len(output) == 1 assert output[0][0] == 'http://nochi.com/data/sitemaps/ru_index.xml' assert sorted(json.loads(output[0][1])) == sorted(["the-mayflower-hotel-autograph-collection-washington.ibooked.com.br","the-rockies-condominiums-steamboat-springs.booked.net","hotel-flora-venice.booked.kr"]) + assert job.sitemap_urls_found.value == 3 + assert 
job.sitemap_url_invalid.value == 0 + assert job.sitemap_url_invalid_encoding.value == 0 + assert job.robots_txt_invalid_url.value == 0 + assert job.robots_txt_announcing_sitemap.value == 3 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert job.robots_txt_processed.value == 3 def test_wrong_encoding_utf16_record(spark): @@ -358,51 +377,31 @@ def test_wrong_encoding_utf16_record(spark): Allow: / Sitemap: http://ajedrezhoygol.blogspot.com/sitemap.xml -""", response_text_encoding='utf-16') +""".encode('utf-16')) job = SitemapExtractorJob() job.init_accumulators(session=spark) results = list(job.process_record(record)) assert len(results) == 0 - - -def test_robots_txt_invalid_url_malformed(spark): - """Test that robots_txt_invalid_url increments when robots.txt URL causes urlparse to fail""" - # urlparse will raise AttributeError when given a non-string type in rec_headers - # We need to mock the record more carefully to trigger an actual exception - record = MagicMock() - record.rec_type = 'response' - # Pass an integer instead of string to cause AttributeError in urlparse - record.rec_headers = {'WARC-Target-URI': 12345} # Non-string type - record.content_stream = lambda: BytesIO(b"""User-agent: * -Disallow: / - -Sitemap: http://example.com/sitemap.xml -""") - - job = SitemapExtractorJob() - job.init_accumulators(session=spark) - - # Mock the logger to avoid serialization issues with Spark's Java logger - job.get_logger = lambda: MagicMock() - - results = list(job.process_record(record)) - # Should return early due to invalid robots.txt URL - assert len(results) == 0 - assert job.robots_txt_invalid_url.value == 1 + assert job.sitemap_urls_found.value == 0 + assert job.sitemap_url_invalid.value == 0 + assert job.sitemap_url_invalid_encoding.value == 0 + assert job.robots_txt_invalid_url.value == 0 + assert job.robots_txt_announcing_sitemap.value == 0 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert 
job.robots_txt_processed.value == 1 def test_robots_txt_invalid_url_unparseable_netloc(spark): - """Test that robots_txt_invalid_url increments when robots.txt URL is a list""" - # Another way to trigger urlparse exception with a non-string type - record = MagicMock() - record.rec_type = 'response' - record.rec_headers = {'WARC-Target-URI': ['http://example.com']} # List will cause TypeError - record.content_stream = lambda: BytesIO(b"""User-agent: * + """ Test malformed WARC-Target-URI """ + record = make_robots_txt_record( + warc_target_uri='http://[malformed::url]/robots.txt', + response_bytes="""User-agent: * Disallow: /admin/ -Sitemap: http://valid-example.com/sitemap.xml -Sitemap: http://valid-example.com/sitemap2.xml -""") +Sitemap: http://example.com/sitemap.xml +Sitemap: http://example.com/sitemap2.xml +""".encode('utf-8') + ) job = SitemapExtractorJob() job.init_accumulators(session=spark) @@ -411,135 +410,171 @@ def test_robots_txt_invalid_url_unparseable_netloc(spark): job.get_logger = lambda: MagicMock() results = list(job.process_record(record)) - # Should return early due to invalid robots.txt URL assert len(results) == 0 assert job.robots_txt_invalid_url.value == 1 + assert job.sitemap_urls_found.value == 1 + assert job.sitemap_url_invalid.value == 0 + assert job.sitemap_url_invalid_encoding.value == 0 + assert job.robots_txt_invalid_url.value == 1 + assert job.robots_txt_announcing_sitemap.value == 0 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert job.robots_txt_processed.value == 1 def test_robots_txt_invalid_punycode_url(spark): - """Test handling of invalid punycode domain in robots.txt URL""" - # xn--foo is an invalid punycode domain (incomplete encoding) - # validators.url properly rejects it, so this tests that invalid punycode is caught + """ Test invalid punycode in WARC-Target-URI """ record = make_robots_txt_record("http://xn--foo/robots.txt", """User-agent: * Disallow: / Sitemap: http://example.com/sitemap.xml 
-""") +""".encode('utf-8')) job = SitemapExtractorJob() job.init_accumulators(session=spark) results = list(job.process_record(record)) - # validators.url properly detects invalid punycode and rejects it - # So the robots.txt URL is invalid and no results are returned assert len(results) == 0 assert job.robots_txt_invalid_url.value == 1 + assert job.sitemap_urls_found.value == 1 + assert job.sitemap_url_invalid.value == 0 + assert job.sitemap_url_invalid_encoding.value == 0 + assert job.robots_txt_invalid_url.value == 1 + assert job.robots_txt_announcing_sitemap.value == 0 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert job.robots_txt_processed.value == 1 def test_sitemap_url_invalid_encoding_latin1(spark): - """Test that sitemap_url_invalid_encoding increments for non-UTF8 sitemap URLs""" - # Create a robots.txt with a sitemap URL containing Latin-1 bytes that aren't valid UTF-8 + """ Test incorrectly encoded sitemap URL - latin-1 bytes """ # The byte sequence \xe9 is é in Latin-1 but invalid in UTF-8 when standalone - robots_txt_bytes = b"""User-agent: * + record = make_robots_txt_record( + warc_target_uri='http://example.com/robots.txt', + response_bytes=b"""User-agent: * Disallow: / Sitemap: http://example.com/sitemap_caf\xe9.xml """ - record = MagicMock() - record.rec_type = 'response' - record.rec_headers = {'WARC-Target-URI': 'http://example.com/robots.txt'} - record.content_stream = lambda: BytesIO(robots_txt_bytes) + ) job = SitemapExtractorJob() job.init_accumulators(session=spark) results = list(job.process_record(record)) assert len(results) == 0 + assert job.robots_txt_invalid_url.value == 0 + assert job.sitemap_urls_found.value == 1 + assert job.sitemap_url_invalid.value == 0 assert job.sitemap_url_invalid_encoding.value == 1 + assert job.robots_txt_invalid_url.value == 0 + assert job.robots_txt_announcing_sitemap.value == 1 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert job.robots_txt_processed.value == 
1 def test_sitemap_url_invalid_encoding_mixed_bytes(spark): - """Test that sitemap_url_invalid_encoding increments for mixed invalid byte sequences""" - # Create a robots.txt with multiple sitemap URLs, one with invalid UTF-8 + """ Test incorrectly encoded sitemap URL - mixed UTF-8 invalid bytes """ # The byte sequence \xff\xfe is not valid UTF-8 - robots_txt_bytes = b"""User-agent: * + record = make_robots_txt_record( + warc_target_uri='http://example.com/robots.txt', + response_bytes=b"""User-agent: * Disallow: /search Sitemap: http://example.com/good_sitemap.xml Sitemap: http://example.com/bad\xff\xfe_sitemap.xml Sitemap: http://example.com/another_good.xml """ - record = MagicMock() - record.rec_type = 'response' - record.rec_headers = {'WARC-Target-URI': 'http://example.com/robots.txt'} - record.content_stream = lambda: BytesIO(robots_txt_bytes) + ) job = SitemapExtractorJob() job.init_accumulators(session=spark) results = list(job.process_record(record)) - # Should get 2 valid sitemaps and 1 invalid encoding assert len(results) == 2 + assert results == [ + ('http://example.com/good_sitemap.xml', ['example.com']), + ('http://example.com/another_good.xml', ['example.com']) + ] assert job.sitemap_url_invalid_encoding.value == 1 assert job.sitemap_urls_found.value == 3 # All 3 matched the pattern + assert job.robots_txt_invalid_url.value == 0 + assert job.sitemap_url_invalid.value == 0 + assert job.robots_txt_invalid_url.value == 0 + assert job.robots_txt_announcing_sitemap.value == 1 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert job.robots_txt_processed.value == 1 def test_sitemap_url_invalid_malformed_url(spark): - """Test that sitemap_url_invalid increments when sitemap URL causes validation to fail""" - robots_txt_bytes = b"""User-agent: * + """ Test invalid malformed sitemap URL """ + record = make_robots_txt_record( + warc_target_uri='http://example.com/robots.txt', + response_bytes=b"""User-agent: * Disallow: / -Sitemap: 
http://example.com/sitemap.xml +Sitemap: ht!tp://[malformed::url]/sitemap.xml """ - record = MagicMock() - record.rec_type = 'response' - record.rec_headers = {'WARC-Target-URI': 'http://example.com/robots.txt'} - record.content_stream = lambda: BytesIO(robots_txt_bytes) + ) job = SitemapExtractorJob() job.init_accumulators(session=spark) - - # Mock _is_valid_url to return False for sitemap URLs (simulating validation failure) - original_is_valid = job._is_valid_url - def mock_is_valid(url, label_for_log): - if label_for_log == 'sitemap': - return False # Simulate validation failure for sitemap URLs - return original_is_valid(url, label_for_log) - - job._is_valid_url = mock_is_valid - results = list(job.process_record(record)) assert len(results) == 0 assert job.sitemap_url_invalid.value == 1 assert job.sitemap_urls_found.value == 1 + assert job.robots_txt_invalid_url.value == 0 + assert job.robots_txt_announcing_sitemap.value == 1 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert job.robots_txt_processed.value == 1 -def test_sitemap_url_invalid_unparseable_scheme(spark): - """Test that sitemap_url_invalid increments for multiple unparseable sitemap URLs""" - robots_txt_bytes = b"""User-agent: * +def test_sitemap_url_invalid_malformed_url_multi(spark): + """ Test multiple sitemap URLs, one invalid """ + # http://xn--invalid is malformed punycode + record = make_robots_txt_record( + warc_target_uri='http://example.com/robots.txt', + response_bytes=b"""User-agent: * Disallow: /admin/ -Sitemap: http://valid.com/sitemap1.xml -Sitemap: http://broken.com/sitemap.xml -Sitemap: http://valid.com/sitemap2.xml +Sitemap: http://valid-site.com/sitemap1.xml +Sitemap: http://xn--invalid/sitemap.xml +Sitemap: http://another-valid-site.com/sitemap2.xml """ - record = MagicMock() - record.rec_type = 'response' - record.rec_headers = {'WARC-Target-URI': 'http://example.com/robots.txt'} - record.content_stream = lambda: BytesIO(robots_txt_bytes) + ) job = 
SitemapExtractorJob() job.init_accumulators(session=spark) - - # Mock _is_valid_url to return False for specific sitemap URLs - original_is_valid = job._is_valid_url - def mock_is_valid(url, label_for_log): - if label_for_log == 'sitemap' and 'broken.com' in url: - return False # Simulate validation failure for broken.com - return original_is_valid(url, label_for_log) - - job._is_valid_url = mock_is_valid - results = list(job.process_record(record)) - # Should get 2 valid sitemaps and 1 invalid assert len(results) == 2 + assert results == [ + ('http://valid-site.com/sitemap1.xml', ['example.com']), + ('http://another-valid-site.com/sitemap2.xml', ['example.com']) + ] assert job.sitemap_url_invalid.value == 1 assert job.sitemap_urls_found.value == 3 + assert job.robots_txt_invalid_url.value == 0 + assert job.robots_txt_announcing_sitemap.value == 1 + assert job.robots_txt_with_more_than_50_sitemaps.value == 0 + assert job.robots_txt_processed.value == 1 + +def test_50_sitemap_urls(spark): + """ Test multiple sitemap URLs, one invalid """ + # http://xn--invalid is malformed punycode + record = make_robots_txt_record( + warc_target_uri='http://example.com/robots.txt', + response_bytes=("""User-agent: * +Disallow: /admin/ + +""" + "\n".join(f"Sitemap: http://valid-site.com/sitemap{i}.xml" for i in range(1, 61)) + ).encode('utf-8')) + + job = SitemapExtractorJob() + job.init_accumulators(session=spark) + results = list(job.process_record(record)) + assert len(results) == 60 + for sitemap_url, host in results: + assert sitemap_url.startswith("http://valid-site.com/sitemap") + assert host == ["example.com"] + assert job.sitemap_url_invalid.value == 0 + assert job.sitemap_urls_found.value == 60 + assert job.robots_txt_invalid_url.value == 0 + assert job.robots_txt_announcing_sitemap.value == 1 + assert job.robots_txt_with_more_than_50_sitemaps.value == 1 + assert job.robots_txt_processed.value == 1 From 0bd0b4a41bd7b022788323c5f8325c7a31db4a3e Mon Sep 17 00:00:00 2001 
From: Damian Stewart Date: Mon, 20 Oct 2025 15:02:34 +0200 Subject: [PATCH 09/33] add fastwarc implementation --- sitemaps_from_robotstxt_fastwarc.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 sitemaps_from_robotstxt_fastwarc.py diff --git a/sitemaps_from_robotstxt_fastwarc.py b/sitemaps_from_robotstxt_fastwarc.py new file mode 100644 index 0000000..e15762d --- /dev/null +++ b/sitemaps_from_robotstxt_fastwarc.py @@ -0,0 +1,16 @@ +from fastwarc.warc import WarcRecordType + +from sparkcc_fastwarc import CCFastWarcSparkJob +from sitemaps_from_robotstxt import SitemapExtractorJob + + +class SitemapExtractorFastWarcJob(SitemapExtractorJob, CCFastWarcSparkJob): + """Extract sitemap URLs (http://www.sitemaps.org/) from robots.txt WARC files + using FastWARC to parse WARC files.""" + + name = "SitemapExtractorFastWarc" + + # process only WARC response and metadata (including WAT) records + fastwarc_record_filter = WarcRecordType.response + + # process_record is implemented by SitemapExtractorJob From d17683f2c5c7ae9e044cf7a8f7bf9b5eaeced9e2 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Mon, 20 Oct 2025 16:55:37 +0200 Subject: [PATCH 10/33] fix logging Signed-off-by: Damian Stewart --- sitemaps_from_robotstxt.py | 33 +++++++++++----------------- test/test_sitemaps_from_robotstxt.py | 4 ---- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index 7d3ce33..2cb5f96 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -4,6 +4,7 @@ from urllib.parse import urlparse, urljoin import validators +from py4j.protocol import Py4JError from pyspark.sql.types import StructType, StructField, StringType, ArrayType from warcio.recordloader import ArcWarcRecord @@ -93,8 +94,8 @@ def process_record(self, record: ArcWarcRecord): n_sitemaps += 1 try: sitemap_url = sitemap_url.decode("utf-8", "strict") - except UnicodeDecodeError: - # invalid encoding, ignore + except 
UnicodeDecodeError as e: + self.get_logger().warn(f'Invalid encoding of sitemap URL {sitemap_url}: {repr(e)}') self.sitemap_url_invalid_encoding.add(1) continue @@ -126,25 +127,17 @@ def process_record(self, record: ArcWarcRecord): def _is_valid_url(self, url, label_for_log) -> bool: """Validate URL using validators.url and log if invalid.""" - try: - result = validators.url(url) - # validators.url returns True for valid URLs, ValidationError for invalid - if result is True: - return True - else: - # ValidationError object returned - convert to string for logging - try: - self.get_logger().warn('Invalid %s URL: %s - %s', label_for_log, url, str(result)) - except Exception: - # If logging fails, just continue without logging - pass - return False - except Exception as e: + result = validators.url(url) + # validators.url returns True for valid URLs, ValidationError for invalid + if result is True: + return True + else: + validation_error = str(result) try: - self.get_logger().warn('Invalid %s URL: %s - %s', label_for_log, url, str(e)) - except Exception: - # If logging fails, just continue without logging - pass + self.get_logger().warn('Invalid {} URL: {} - {}'.format(label_for_log, url, validation_error)) + except Exception as e: + self.get_logger().warn('Invalid {} URL (cannot be displayed): {}'.format( + label_for_log, repr(e))) return False diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index 2471ade..631680b 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -405,10 +405,6 @@ def test_robots_txt_invalid_url_unparseable_netloc(spark): job = SitemapExtractorJob() job.init_accumulators(session=spark) - - # Mock the logger to avoid serialization issues - job.get_logger = lambda: MagicMock() - results = list(job.process_record(record)) assert len(results) == 0 assert job.robots_txt_invalid_url.value == 1 From 41a3d0b46c273031ce67853a885723df4cc50b15 Mon Sep 17 00:00:00 2001 From: 
Damian Stewart Date: Thu, 30 Oct 2025 15:14:45 +0100 Subject: [PATCH 11/33] simplify & address review comments Signed-off-by: Damian Stewart --- sitemaps_from_robotstxt.py | 98 ++++++++++------------------ sparkcc.py | 26 ++------ test/test_sitemaps_from_robotstxt.py | 57 +++++----------- 3 files changed, 54 insertions(+), 127 deletions(-) diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index 2cb5f96..d0447a7 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -17,60 +17,29 @@ class SitemapExtractorJob(CCSparkJob): output_schema = StructType([ StructField('sitemap_url', StringType(), True), - StructField('hosts', StringType(), True) + StructField('hosts', ArrayType(elementType=StringType()), True) ]) - merge_method = 'reduce_group_by_key' - # rb: match on raw bytes so we can defer utf-8 decoding to the `sitemap:` line sitemap_pattern = re.compile(rb'^sitemap:\s*(\S+)', re.I) robots_txt_processed = None sitemap_urls_found = None - sitemap_url_invalid = None sitemap_url_invalid_encoding = None - robots_txt_invalid_url = None robots_txt_announcing_sitemap = None robots_txt_with_more_than_50_sitemaps = None + def init_accumulators(self, session): super(SitemapExtractorJob, self).init_accumulators(session) sc = session.sparkContext self.robots_txt_processed = sc.accumulator(0) self.sitemap_urls_found = sc.accumulator(0) - self.sitemap_url_invalid = sc.accumulator(0) self.sitemap_url_invalid_encoding = sc.accumulator(0) - self.robots_txt_invalid_url = sc.accumulator(0) self.robots_txt_announcing_sitemap = sc.accumulator(0) self.robots_txt_with_more_than_50_sitemaps = sc.accumulator(0) - @staticmethod - def reduce_group_by_key_func(kv: tuple): - """Map sitemap URL to cross-submit hosts: - sitemap_url => [host_1, ..., host_n]""" - sitemap_uri, hosts_lists = kv - try: - sitemap_host = urlparse(sitemap_uri).netloc.lower().lstrip('.') - except Exception as e: - raise RuntimeError("A invalid sitemap URI somehow made it through the 
initial parsing phase, this shouldn't happen: " + repr(e)) - - cross_submit_hosts = set() - - for robots_txt_hosts in hosts_lists: - for robots_txt_host in robots_txt_hosts: - if robots_txt_host != sitemap_host: - cross_submit_hosts.add(robots_txt_host) - - return sitemap_uri, json.dumps(list(cross_submit_hosts)) - - - def add_arguments(self, parser): - super(SitemapExtractorJob, self).add_arguments(parser) - # set output options to match old cc-mrjob output - parser.set_defaults(output_option=['sep=\t', 'escapeQuotes=false', 'header=false']) - parser.set_defaults(output_format='csv') - def process_record(self, record: ArcWarcRecord): """ emit: sitemap_url => [host] """ @@ -79,8 +48,9 @@ def process_record(self, record: ArcWarcRecord): return self.robots_txt_processed.add(1) - url = None - host = None + # robots_txt url/host are lazily computed when we encounter the first valid sitemap URL + robots_txt_url = None + robots_txt_host = None n_sitemaps = 0 data = self.get_payload_stream(record).read() @@ -91,7 +61,6 @@ def process_record(self, record: ArcWarcRecord): if match: sitemap_url = match.group(1).strip() self.sitemap_urls_found.add(1) - n_sitemaps += 1 try: sitemap_url = sitemap_url.decode("utf-8", "strict") except UnicodeDecodeError as e: @@ -99,46 +68,45 @@ def process_record(self, record: ArcWarcRecord): self.sitemap_url_invalid_encoding.add(1) continue - if not self._is_valid_url(sitemap_url, label_for_log='sitemap'): - self.sitemap_url_invalid.add(1) - continue - - if url is None: + if robots_txt_url is None: # first sitemap found: set base URL and get host from URL - url = record.rec_headers['WARC-Target-URI'] - if not self._is_valid_url(url, label_for_log='robots.txt'): - # skip this robots.txt record - self.robots_txt_invalid_url.add(1) + robots_txt_url = record.rec_headers['WARC-Target-URI'] + robots_txt_host = self._try_parse_host(robots_txt_url, label_for_log='robots.txt') + if robots_txt_host is None: + # skip this entire robots.txt record return - 
host = urlparse(url).netloc.lower().lstrip('.') + if not (sitemap_url.startswith('http:') or sitemap_url.startswith('https:')): + # sitemap_url is relative; pass straight to urljoin which knows how to handle it correctly + sitemap_url = urljoin(robots_txt_url, sitemap_url) - if not sitemap_url.startswith('http'): - sitemap_url = urljoin(url, sitemap_url) + sitemap_host = self._try_parse_host(sitemap_url, label_for_log='sitemap') + if sitemap_host is None: + # skip this sitemap URL, continue processing others + continue - yield sitemap_url, [host] + if sitemap_host == robots_txt_host: + # optimization: same host, save us a fetch later + yield sitemap_url, [] + else: + yield sitemap_url, [robots_txt_host] + n_sitemaps += 1 if n_sitemaps > 0: self.robots_txt_announcing_sitemap.add(1) - - if n_sitemaps > 50: - self.robots_txt_with_more_than_50_sitemaps.add(1) + if n_sitemaps > 50: + self.robots_txt_with_more_than_50_sitemaps.add(1) - def _is_valid_url(self, url, label_for_log) -> bool: - """Validate URL using validators.url and log if invalid.""" - result = validators.url(url) - # validators.url returns True for valid URLs, ValidationError for invalid - if result is True: - return True - else: - validation_error = str(result) + def _try_parse_host(self, url: str, label_for_log: str) -> str|None: + try: + return urlparse(url).netloc.lower().lstrip('.') + except Exception as e: try: - self.get_logger().warn('Invalid {} URL: {} - {}'.format(label_for_log, url, validation_error)) - except Exception as e: - self.get_logger().warn('Invalid {} URL (cannot be displayed): {}'.format( - label_for_log, repr(e))) - return False + self.get_logger().warn(f'Invalid {label_for_log} URL: {url} - {repr(e)}') + except Exception as log_e: + self.get_logger().warn(f'Invalid {label_for_log} URL - {repr(e)} (cannot display: {repr(log_e)})') + return None if __name__ == '__main__': diff --git a/sparkcc.py b/sparkcc.py index 8dc55dc..15d7a70 100644 --- a/sparkcc.py +++ b/sparkcc.py @@ -6,7 
+6,6 @@ from io import BytesIO from tempfile import SpooledTemporaryFile, TemporaryFile -from typing import Literal import boto3 import botocore @@ -35,8 +34,6 @@ class CCSparkJob(object): StructField("val", LongType(), True) ]) - merge_method: Literal['reduce_by_key', 'reduce_group_by_key'] = 'reduce_by_key' - # description of input and output shown by --help input_descr = "Path to file listing input paths" output_descr = "Name of output table (saved in spark.sql.warehouse.dir)" @@ -210,22 +207,12 @@ def log_accumulators(self, session): def reduce_by_key_func(a, b): return a + b - @staticmethod - def reduce_group_by_key_func(kv: tuple): - return kv - def run_job(self, session): input_data = session.sparkContext.textFile(self.args.input, minPartitions=self.args.num_input_partitions) - output = input_data.mapPartitionsWithIndex(self.process_warcs) - #self.get_logger().warning("merge method:", self.merge_method) - if self.merge_method == 'reduce_group_by_key': - output = output.groupByKey().map(self.reduce_group_by_key_func) - elif self.merge_method == 'reduce_by_key': - output = output.reduceByKey(self.reduce_by_key_func) - else: - raise ValueError(f"Unknown merge method: {self.merge_method}") + output = input_data.mapPartitionsWithIndex(self.process_warcs) \ + .reduceByKey(self.reduce_by_key_func) session.createDataFrame(output, schema=self.output_schema) \ .coalesce(self.args.num_output_partitions) \ @@ -621,13 +608,8 @@ def run_job(self, session): columns.append('content_charset') warc_recs = sqldf.select(*columns).rdd - output = warc_recs.mapPartitions(self.fetch_process_warc_records) - if self.merge_method == 'reduce_group_by_key': - output = output.groupByKey().map(self.reduce_group_by_key_func) - elif self.merge_method == 'reduce_by_key': - output = output.reduceByKey(self.reduce_by_key_func) - else: - raise ValueError(f"Unknown merge method: {self.merge_method}") + output = warc_recs.mapPartitions(self.fetch_process_warc_records) \ + 
.reduceByKey(self.reduce_by_key_func) session.createDataFrame(output, schema=self.output_schema) \ .coalesce(self.args.num_output_partitions) \ diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index 631680b..d1ac3e5 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -8,6 +8,7 @@ from warcio.recordloader import ArcWarcRecord from sitemaps_from_robotstxt import SitemapExtractorJob +from sparkcc import CCSparkJob from utils import _process_jobs @@ -47,9 +48,6 @@ def test_well_formed_record(spark): assert results[0][0] == 'http://ajedrezhoygol.blogspot.com/sitemap.xml' assert results[0][1] == ["ajedrezhoygol.blogspot.com.ar"] assert job.sitemap_urls_found.value == 1 - assert job.robots_txt_invalid_url.value == 0 - assert job.sitemap_url_invalid.value == 0 - assert job.robots_txt_invalid_url.value == 0 assert job.robots_txt_announcing_sitemap.value == 1 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 1 @@ -252,15 +250,13 @@ def test_host_accumulation_empty(spark): rdd = spark.sparkContext.parallelize(records) _process_jobs_partial = lambda partition_index, records: _process_jobs(partition_index, records, job=job) output = rdd.mapPartitionsWithIndex(_process_jobs_partial) - output = output.groupByKey().map(SitemapExtractorJob.reduce_group_by_key_func).collect() + output = output.reduceByKey(CCSparkJob.reduce_by_key_func).collect() assert len(output) == 1 assert output[0][0] == 'http://agencasinosbobet5.weebly.com/sitemap.xml' - assert output[0][1] == '[]' + assert output[0][1] == [] assert job.sitemap_urls_found.value == 1 - assert job.sitemap_url_invalid.value == 0 assert job.sitemap_url_invalid_encoding.value == 0 - assert job.robots_txt_invalid_url.value == 0 assert job.robots_txt_announcing_sitemap.value == 1 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 1 @@ -354,14 +350,12 @@ 
def test_host_accumulation_multi(spark): rdd = spark.sparkContext.parallelize(records) _process_jobs_partial = lambda partition_index, records: _process_jobs(partition_index, records, job=job) output = rdd.mapPartitionsWithIndex(_process_jobs_partial) - output = output.groupByKey().map(SitemapExtractorJob.reduce_group_by_key_func).collect() + output = output.reduceByKey(CCSparkJob.reduce_by_key_func).collect() assert len(output) == 1 assert output[0][0] == 'http://nochi.com/data/sitemaps/ru_index.xml' - assert sorted(json.loads(output[0][1])) == sorted(["the-mayflower-hotel-autograph-collection-washington.ibooked.com.br","the-rockies-condominiums-steamboat-springs.booked.net","hotel-flora-venice.booked.kr"]) + assert sorted(output[0][1]) == sorted(["the-mayflower-hotel-autograph-collection-washington.ibooked.com.br","the-rockies-condominiums-steamboat-springs.booked.net","hotel-flora-venice.booked.kr"]) assert job.sitemap_urls_found.value == 3 - assert job.sitemap_url_invalid.value == 0 assert job.sitemap_url_invalid_encoding.value == 0 - assert job.robots_txt_invalid_url.value == 0 assert job.robots_txt_announcing_sitemap.value == 3 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 3 @@ -383,9 +377,7 @@ def test_wrong_encoding_utf16_record(spark): results = list(job.process_record(record)) assert len(results) == 0 assert job.sitemap_urls_found.value == 0 - assert job.sitemap_url_invalid.value == 0 assert job.sitemap_url_invalid_encoding.value == 0 - assert job.robots_txt_invalid_url.value == 0 assert job.robots_txt_announcing_sitemap.value == 0 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 1 @@ -407,11 +399,8 @@ def test_robots_txt_invalid_url_unparseable_netloc(spark): job.init_accumulators(session=spark) results = list(job.process_record(record)) assert len(results) == 0 - assert job.robots_txt_invalid_url.value == 1 assert job.sitemap_urls_found.value == 1 
- assert job.sitemap_url_invalid.value == 0 assert job.sitemap_url_invalid_encoding.value == 0 - assert job.robots_txt_invalid_url.value == 1 assert job.robots_txt_announcing_sitemap.value == 0 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 1 @@ -428,13 +417,10 @@ def test_robots_txt_invalid_punycode_url(spark): job = SitemapExtractorJob() job.init_accumulators(session=spark) results = list(job.process_record(record)) - assert len(results) == 0 - assert job.robots_txt_invalid_url.value == 1 + assert len(results) == 1 assert job.sitemap_urls_found.value == 1 - assert job.sitemap_url_invalid.value == 0 assert job.sitemap_url_invalid_encoding.value == 0 - assert job.robots_txt_invalid_url.value == 1 - assert job.robots_txt_announcing_sitemap.value == 0 + assert job.robots_txt_announcing_sitemap.value == 1 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 1 @@ -455,12 +441,9 @@ def test_sitemap_url_invalid_encoding_latin1(spark): job.init_accumulators(session=spark) results = list(job.process_record(record)) assert len(results) == 0 - assert job.robots_txt_invalid_url.value == 0 assert job.sitemap_urls_found.value == 1 - assert job.sitemap_url_invalid.value == 0 assert job.sitemap_url_invalid_encoding.value == 1 - assert job.robots_txt_invalid_url.value == 0 - assert job.robots_txt_announcing_sitemap.value == 1 + assert job.robots_txt_announcing_sitemap.value == 0 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 1 @@ -470,12 +453,13 @@ def test_sitemap_url_invalid_encoding_mixed_bytes(spark): # The byte sequence \xff\xfe is not valid UTF-8 record = make_robots_txt_record( warc_target_uri='http://example.com/robots.txt', + # improperly encoded UTF-8 byte sequence in second sitemap URL response_bytes=b"""User-agent: * Disallow: /search Sitemap: http://example.com/good_sitemap.xml Sitemap: 
http://example.com/bad\xff\xfe_sitemap.xml -Sitemap: http://example.com/another_good.xml +Sitemap: http://example2.com/another_good.xml """ ) @@ -484,14 +468,11 @@ def test_sitemap_url_invalid_encoding_mixed_bytes(spark): results = list(job.process_record(record)) assert len(results) == 2 assert results == [ - ('http://example.com/good_sitemap.xml', ['example.com']), - ('http://example.com/another_good.xml', ['example.com']) + ('http://example.com/good_sitemap.xml', []), + ('http://example2.com/another_good.xml', ['example.com']) ] assert job.sitemap_url_invalid_encoding.value == 1 assert job.sitemap_urls_found.value == 3 # All 3 matched the pattern - assert job.robots_txt_invalid_url.value == 0 - assert job.sitemap_url_invalid.value == 0 - assert job.robots_txt_invalid_url.value == 0 assert job.robots_txt_announcing_sitemap.value == 1 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 1 @@ -511,10 +492,8 @@ def test_sitemap_url_invalid_malformed_url(spark): job = SitemapExtractorJob() job.init_accumulators(session=spark) results = list(job.process_record(record)) - assert len(results) == 0 - assert job.sitemap_url_invalid.value == 1 + assert len(results) == 1 assert job.sitemap_urls_found.value == 1 - assert job.robots_txt_invalid_url.value == 0 assert job.robots_txt_announcing_sitemap.value == 1 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 1 @@ -537,14 +516,13 @@ def test_sitemap_url_invalid_malformed_url_multi(spark): job = SitemapExtractorJob() job.init_accumulators(session=spark) results = list(job.process_record(record)) - assert len(results) == 2 + assert len(results) == 3 assert results == [ ('http://valid-site.com/sitemap1.xml', ['example.com']), + ('http://xn--invalid/sitemap.xml', ['example.com']), ('http://another-valid-site.com/sitemap2.xml', ['example.com']) ] - assert job.sitemap_url_invalid.value == 1 assert job.sitemap_urls_found.value == 3 - 
assert job.robots_txt_invalid_url.value == 0 assert job.robots_txt_announcing_sitemap.value == 1 assert job.robots_txt_with_more_than_50_sitemaps.value == 0 assert job.robots_txt_processed.value == 1 @@ -558,7 +536,7 @@ def test_50_sitemap_urls(spark): response_bytes=("""User-agent: * Disallow: /admin/ -""" + "\n".join(f"Sitemap: http://valid-site.com/sitemap{i}.xml" for i in range(1, 61)) +""" + "\n".join(f"Sitemap: http://valid-site.com/sitemap{i}.xml" for i in range(60)) ).encode('utf-8')) job = SitemapExtractorJob() @@ -568,9 +546,8 @@ def test_50_sitemap_urls(spark): for sitemap_url, host in results: assert sitemap_url.startswith("http://valid-site.com/sitemap") assert host == ["example.com"] - assert job.sitemap_url_invalid.value == 0 assert job.sitemap_urls_found.value == 60 - assert job.robots_txt_invalid_url.value == 0 assert job.robots_txt_announcing_sitemap.value == 1 assert job.robots_txt_with_more_than_50_sitemaps.value == 1 assert job.robots_txt_processed.value == 1 + From d50a39a3510b67792ba582993943cc9852ee5897 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 30 Oct 2025 15:17:47 +0100 Subject: [PATCH 12/33] add exception handling for urljoin Signed-off-by: Damian Stewart --- sitemaps_from_robotstxt.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index d0447a7..444a3d5 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -78,7 +78,14 @@ def process_record(self, record: ArcWarcRecord): if not (sitemap_url.startswith('http:') or sitemap_url.startswith('https:')): # sitemap_url is relative; pass straight to urljoin which knows how to handle it correctly - sitemap_url = urljoin(robots_txt_url, sitemap_url) + try: + sitemap_url = urljoin(robots_txt_url, sitemap_url) + except Exception as e: + try: + self.get_logger().warn(f'Error joining sitemap URL {sitemap_url} with base {robots_txt_url}: {repr(e)}') + except Exception as log_e: + 
self.get_logger().warn(f'Error joining sitemap URL with base - {repr(e)} (cannot display: {repr(log_e)})') + continue sitemap_host = self._try_parse_host(sitemap_url, label_for_log='sitemap') if sitemap_host is None: From c1639592750dbf7a238f7e995c3e477295ee32b3 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 30 Oct 2025 15:23:12 +0100 Subject: [PATCH 13/33] add python unit test github workflow Signed-off-by: Damian Stewart --- .github/workflows/python_test.yaml | 40 ++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/python_test.yaml diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml new file mode 100644 index 0000000..31176b5 --- /dev/null +++ b/.github/workflows/python_test.yaml @@ -0,0 +1,40 @@ +name: Python Unit Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12', '3.13'] + fail-fast: false + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install uv + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + echo "$HOME/.cargo/bin" >> $GITHUB_PATH + + - name: Install dependencies + run: | + uv pip install -r requirements.txt + + - name: Run tests + run: | + uv run -m pytest . 
-v + env: + # pyspark needs this to find the test files + PYTHONPATH: test From 1ba160f024de5bb1e09fa1f96b438837f9b70081 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 30 Oct 2025 15:24:37 +0100 Subject: [PATCH 14/33] downgrade from uv --- .github/workflows/python_test.yaml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml index 31176b5..597f0b5 100644 --- a/.github/workflows/python_test.yaml +++ b/.github/workflows/python_test.yaml @@ -23,18 +23,13 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install uv - run: | - curl -LsSf https://astral.sh/uv/install.sh | sh - echo "$HOME/.cargo/bin" >> $GITHUB_PATH - - name: Install dependencies run: | - uv pip install -r requirements.txt + pip install -r requirements.txt - name: Run tests run: | - uv run -m pytest . -v + python -m pytest . -v env: # pyspark needs this to find the test files PYTHONPATH: test From e2e9646f58fc5b9bdc85914fcd796603e57e22e6 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 30 Oct 2025 15:29:26 +0100 Subject: [PATCH 15/33] remove same-host optimization --- sitemaps_from_robotstxt.py | 20 ++++++++------------ test/test_sitemaps_from_robotstxt.py | 6 +++--- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index 444a3d5..ad6d454 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -71,8 +71,13 @@ def process_record(self, record: ArcWarcRecord): if robots_txt_url is None: # first sitemap found: set base URL and get host from URL robots_txt_url = record.rec_headers['WARC-Target-URI'] - robots_txt_host = self._try_parse_host(robots_txt_url, label_for_log='robots.txt') - if robots_txt_host is None: + try: + robots_txt_host = urlparse(robots_txt_url).netloc.lower().lstrip('.') + except Exception as e1: + try: + self.get_logger().warn(f'Invalid robots.txt URL: {robots_txt_url} - 
{repr(e1)}') + except Exception as e2: + self.get_logger().warn(f'Invalid robots.txt URL - {repr(e1)} (cannot display: {repr(e2)})') # skip this entire robots.txt record return @@ -87,16 +92,7 @@ def process_record(self, record: ArcWarcRecord): self.get_logger().warn(f'Error joining sitemap URL with base - {repr(e)} (cannot display: {repr(log_e)})') continue - sitemap_host = self._try_parse_host(sitemap_url, label_for_log='sitemap') - if sitemap_host is None: - # skip this sitemap URL, continue processing others - continue - - if sitemap_host == robots_txt_host: - # optimization: same host, save us a fetch later - yield sitemap_url, [] - else: - yield sitemap_url, [robots_txt_host] + yield sitemap_url, [robots_txt_host] n_sitemaps += 1 if n_sitemaps > 0: diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index d1ac3e5..48ace06 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -227,7 +227,7 @@ def test_different_host_record(spark): -def test_host_accumulation_empty(spark): +def test_host_accumulation_same_host(spark): """ Test accumulation of hosts when sitemap url host and robots.txt url host match Requires test/ on PYTHONPATH so utils._process_jobs can be imported @@ -254,7 +254,7 @@ def test_host_accumulation_empty(spark): assert len(output) == 1 assert output[0][0] == 'http://agencasinosbobet5.weebly.com/sitemap.xml' - assert output[0][1] == [] + assert output[0][1] == ['agencasinosbobet5.weebly.com'] assert job.sitemap_urls_found.value == 1 assert job.sitemap_url_invalid_encoding.value == 0 assert job.robots_txt_announcing_sitemap.value == 1 @@ -468,7 +468,7 @@ def test_sitemap_url_invalid_encoding_mixed_bytes(spark): results = list(job.process_record(record)) assert len(results) == 2 assert results == [ - ('http://example.com/good_sitemap.xml', []), + ('http://example.com/good_sitemap.xml', ['example.com']), ('http://example2.com/another_good.xml', ['example.com']) ] assert 
job.sitemap_url_invalid_encoding.value == 1 From 94a98d757a4e2627a7eb41ff7c8de6e542ff024f Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 30 Oct 2025 17:29:55 +0100 Subject: [PATCH 16/33] split PySpark into its own requirements.txt; update README Signed-off-by: Damian Stewart --- .github/workflows/python_test.yaml | 1 + README.md | 49 ++++++++++++++++++++++-------- requirements-pyspark.txt | 1 + requirements.txt | 1 - 4 files changed, 38 insertions(+), 14 deletions(-) create mode 100644 requirements-pyspark.txt diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml index 597f0b5..2542bce 100644 --- a/.github/workflows/python_test.yaml +++ b/.github/workflows/python_test.yaml @@ -26,6 +26,7 @@ jobs: - name: Install dependencies run: | pip install -r requirements.txt + pip install -r requirements-pyspark.txt - name: Run tests run: | diff --git a/README.md b/README.md index caeea65..2796ef1 100644 --- a/README.md +++ b/README.md @@ -28,18 +28,32 @@ Extending the [CCSparkJob](./sparkcc.py) isn't difficult and for many use cases ## Setup -To develop and test locally, you will need to install -* Spark, see the [detailed instructions](https://spark.apache.org/docs/latest/), and -* all required Python modules by running -``` -pip install -r requirements.txt -``` -* (optionally, and only if you want to query the columnar index) [install S3 support libraries](#installation-of-s3-support-libraries) so that Spark can load the columnar index from S3 +To develop and test locally, you Python>=3.9 and Spark. + +#### If Spark is already installed: +(or if you want full control over your Spark cluster configuration), install only the Python dependencies: + + ```bash + pip install -r requirements.txt + ``` + + Ensure that `spark-submit` and `pyspark` are on your `$PATH`, or prepend `$SPARK_HOME/bin` when running eg `$SPARK_HOME/bin/spark-submit`. See the [Spark documentation](https://spark.apache.org/docs/latest/) for more information. 
+ +#### If you just want to get started quickly with local no-cluster development: + + ```bash + pip install -r requirements.txt + pip install -r requirements-pyspark.txt + ``` + This will install [the PySpark python package](https://spark.apache.org/docs/latest/api/python/getting_started/index.html), which includes a local/client-only version of Spark, and adds `spark-submit` and `pyspark` to your `$PATH`. + +#### To query the columnar index: +In addition to the above, [install S3 support libraries](#installation-of-s3-support-libraries) so that Spark can load the columnar index from S3. ## Compatibility and Requirements -Tested with with Spark 3.2.3, 3.3.2, 3.4.1, 3.5.5 in combination with Python 3.8, 3.9, 3.10, 3.12 and 3.13. See the branch [python-2.7](/commoncrawl/cc-pyspark/tree/python-2.7) if you want to run the job on Python 2.7 and older Spark versions. +Tested with Spark 3.2.3, 3.3.2, 3.4.1, 3.5.5 in combination with Python 3.8, 3.9, 3.10, 3.12 and 3.13. See the branch [python-2.7](/commoncrawl/cc-pyspark/tree/python-2.7) if you want to run the job on Python 2.7 and older Spark versions. ## Get Sample Data @@ -62,11 +76,10 @@ CC-PySpark reads the list of input files from a manifest file. Typically, these ### Running locally -First, point the environment variable `SPARK_HOME` to your Spark installation. -Then submit a job via +Spark jobs can be started using `spark-submit` (see [Setup](#setup) above if you have a manual installation of Spark): ``` -$SPARK_HOME/bin/spark-submit ./server_count.py \ +spark-submit ./server_count.py \ --num_output_partitions 1 --log_level WARN \ ./input/test_warc.txt servernames ``` @@ -76,7 +89,7 @@ This will count web server names sent in HTTP response headers for the sample WA The output table can be accessed via SparkSQL, e.g., ``` -$SPARK_HOME/bin/pyspark +$ pyspark >>> df = sqlContext.read.parquet("spark-warehouse/servernames") >>> for row in df.sort(df.val.desc()).take(10): print(row) ... 
@@ -92,12 +105,22 @@ Row(key=u'Apache/2.2.15 (CentOS)', val=827) Row(key=u'Apache-Coyote/1.1', val=790) ``` -But it's also possible to configure a different output format, for example CSV or JSON, see the command-line options. +But it's also possible to configure a different output format, for example CSV or JSON; pass `--help` on the command line for more details. See also * [running the Spark shell and submitting Spark jobs](https://spark.apache.org/docs/latest/#running-the-examples-and-shell) * [PySpark SQL API](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html) +#### Debugging in an IDE + +If the `.py` file for the job you want to debug is runnable (i.e. if it has a `if __name__ == "__main__":` line), you can invoke it directly as a Python script without needing to go through spark-submit: + +```bash +python server_count.py --num_output_partitions 1 ./input/test_warc.txt servernames` +```` + +Spark will complain if the output directory exists - you may want to add a preprocessing step that deletes the output folder under `spark-warehouse` before each run, eg `rm -rf wpark-warehouse/servernames`. 
+ ### Running in Spark cluster over large amounts of data diff --git a/requirements-pyspark.txt b/requirements-pyspark.txt new file mode 100644 index 0000000..3da099f --- /dev/null +++ b/requirements-pyspark.txt @@ -0,0 +1 @@ +pyspark==3.5.7 diff --git a/requirements.txt b/requirements.txt index ab8d7be..f62f564 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,4 +34,3 @@ lxml # testing pytest pytest-mock -pyspark==3.5.7 From 6a23d23cb08d324e233a89a7e7290fbc14223b97 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 30 Oct 2025 17:33:36 +0100 Subject: [PATCH 17/33] typo Signed-off-by: Damian Stewart --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2796ef1..bdaa170 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Extending the [CCSparkJob](./sparkcc.py) isn't difficult and for many use cases ## Setup -To develop and test locally, you Python>=3.9 and Spark. +To develop and test locally, you'll need Python>=3.9 and Spark. #### If Spark is already installed: (or if you want full control over your Spark cluster configuration), install only the Python dependencies: From 8339e6c4ea0a4244258586cf6b94dcbd786dcb4d Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 30 Oct 2025 17:34:52 +0100 Subject: [PATCH 18/33] typos Signed-off-by: Damian Stewart --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bdaa170..a677db3 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,9 @@ To develop and test locally, you'll need Python>=3.9 and Spark. pip install -r requirements.txt pip install -r requirements-pyspark.txt ``` - This will install [the PySpark python package](https://spark.apache.org/docs/latest/api/python/getting_started/index.html), which includes a local/client-only version of Spark, and adds `spark-submit` and `pyspark` to your `$PATH`. 
+ This will install [the PySpark python package](https://spark.apache.org/docs/latest/api/python/getting_started/index.html), which includes a local/client-only version of Spark and also adds `spark-submit` and `pyspark` to your `$PATH`. -#### To query the columnar index: +#### If you want to query the columnar index: In addition to the above, [install S3 support libraries](#installation-of-s3-support-libraries) so that Spark can load the columnar index from S3. From be9a0464beacad14a224a9316cc4ef475538a1e8 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 30 Oct 2025 17:36:12 +0100 Subject: [PATCH 19/33] polish Signed-off-by: Damian Stewart --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a677db3..1c49421 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ If the `.py` file for the job you want to debug is runnable (i.e. if it has a `i python server_count.py --num_output_partitions 1 ./input/test_warc.txt servernames` ```` -Spark will complain if the output directory exists - you may want to add a preprocessing step that deletes the output folder under `spark-warehouse` before each run, eg `rm -rf wpark-warehouse/servernames`. +Spark will complain if the output directory exists - you may want to add a preprocessing step that deletes the appropriate subdirectory under `spark-warehouse` before each run, eg `rm -rf wpark-warehouse/servernames`. 
### Running in Spark cluster over large amounts of data From 0210cd8ee23f2b64605c2bfe3780b10e0127e796 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 30 Oct 2025 17:37:19 +0100 Subject: [PATCH 20/33] cleanup remaining spark-submit invocations --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1c49421..33aea8c 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ As the Common Crawl dataset lives in the Amazon Public Datasets program, you can All examples show the available command-line options if called with the parameter `--help` or `-h`, e.g. ``` -$SPARK_HOME/bin/spark-submit ./server_count.py --help +spark-submit ./server_count.py --help ``` #### Overwriting Spark configuration properties @@ -149,7 +149,7 @@ There are many [Spark configuration properties](https://spark.apache.org/docs/la It's possible to overwrite Spark properties when [submitting the job](https://spark.apache.org/docs/latest/submitting-applications.html): ``` -$SPARK_HOME/bin/spark-submit \ +spark-submit \ --conf spark.sql.warehouse.dir=myWareHouseDir \ ... 
(other Spark options, flags, config properties) \ ./server_count.py \ @@ -193,7 +193,7 @@ Please also note that: Below an example call to count words in 10 WARC records host under the `.is` top-level domain using the `--packages` option: ``` -$SPARK_HOME/bin/spark-submit \ +spark-submit \ --packages org.apache.hadoop:hadoop-aws:3.3.2 \ ./cc_index_word_count.py \ --input_base_url s3://commoncrawl/ \ From cb9c4cab76d4ca8f337dc0dcfbdc2c4fe448f9a2 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 31 Oct 2025 14:35:29 +0100 Subject: [PATCH 21/33] update docs with expanded PySpark info --- .github/workflows/python_test.yaml | 4 +- README.md | 66 +++++++++++++++++++++++------- requirements-pyspark.txt | 1 - 3 files changed, 53 insertions(+), 18 deletions(-) delete mode 100644 requirements-pyspark.txt diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml index 2542bce..4921533 100644 --- a/.github/workflows/python_test.yaml +++ b/.github/workflows/python_test.yaml @@ -26,11 +26,11 @@ jobs: - name: Install dependencies run: | pip install -r requirements.txt - pip install -r requirements-pyspark.txt + pip install pyspark==3.5.7 - name: Run tests run: | python -m pytest . -v env: - # pyspark needs this to find the test files + # PySpark needs this to find the test files PYTHONPATH: test diff --git a/README.md b/README.md index 33aea8c..5061136 100644 --- a/README.md +++ b/README.md @@ -28,28 +28,44 @@ Extending the [CCSparkJob](./sparkcc.py) isn't difficult and for many use cases ## Setup -To develop and test locally, you'll need Python>=3.9 and Spark. +To develop and test locally, you'll need **Python>=3.9** and **Spark**. 
-#### If Spark is already installed: -(or if you want full control over your Spark cluster configuration), install only the Python dependencies: +### JRE - ```bash - pip install -r requirements.txt - ``` - - Ensure that `spark-submit` and `pyspark` are on your `$PATH`, or prepend `$SPARK_HOME/bin` when running eg `$SPARK_HOME/bin/spark-submit`. See the [Spark documentation](https://spark.apache.org/docs/latest/) for more information. +Spark requires a 64-bit Java JRE (v8, 11, or 17 for Spark 3.5.7). Install this first. If you have an Apple Silicon device, Azul Zulu JRE is recommended for native architecture support. Ensure that either `java` is on your `$PATH` or the `$JAVA_HOME` env var points to your JRE. -#### If you just want to get started quickly with local no-cluster development: +### Python dependencies - ```bash - pip install -r requirements.txt - pip install -r requirements-pyspark.txt - ``` - This will install [the PySpark python package](https://spark.apache.org/docs/latest/api/python/getting_started/index.html), which includes a local/client-only version of Spark and also adds `spark-submit` and `pyspark` to your `$PATH`. +Assuming you have Python already set up and a venv activated, install the `cc-pyspark` dependencies: + +``` +pip install -r requirements.txt +``` #### If you want to query the columnar index: In addition to the above, [install S3 support libraries](#installation-of-s3-support-libraries) so that Spark can load the columnar index from S3.
+### Spark + +There are two ways to obtain Spark: +* manual installation / preinstallation +* as a pip package with `pip install` + +#### For simple development or to get started quickly, the `pip install` route is recommended: + +```bash +pip install pyspark==3.5.7 +``` + +This will install v3.5.7 of [the PySpark python package](https://spark.apache.org/docs/latest/api/python/getting_started/index.html), which includes a local/client-only version of Spark and also adds `spark-submit` and `pyspark` to your `$PATH`. + +> If you need to interact with a remote Spark cluster, use a version of PySpark that matches the cluster version. + +#### If Spark is already installed or if you want full tooling to configure a local Spark cluster: + +Install Spark if needed (see the [Spark documentation](https://spark.apache.org/docs/latest/) for guidance). Then, ensure that `spark-submit` and `pyspark` are on your `$PATH`, or prepend `$SPARK_HOME/bin` when running eg `$SPARK_HOME/bin/spark-submit`. + +> Note: The PySpark package is required if you want to run the tests in `test/`. ## Compatibility and Requirements @@ -113,7 +129,7 @@ See also #### Debugging in an IDE -If the `.py` file for the job you want to debug is runnable (i.e. if it has a `if __name__ == "__main__":` line), you can invoke it directly as a Python script without needing to go through spark-submit: +If the `.py` file for the job you want to debug is runnable (i.e. if it has a `if __name__ == "__main__":` line), you can bypass `spark-submit` and run it directly as a Python script: ```bash python server_count.py --num_output_partitions 1 ./input/test_warc.txt servernames` @@ -121,6 +137,11 @@ python server_count.py --num_output_partitions 1 ./input/test_warc.txt servernam Spark will complain if the output directory exists - you may want to add a preprocessing step that deletes the appropriate subdirectory under `spark-warehouse` before each run, eg `rm -rf wpark-warehouse/servernames`.
+> If you have manually installed Spark you'll need to ensure the pyspark package is on your PYTHONPATH: +> ```bash +> PYTHONPATH=$SPARK_HOME/python python server_count.py --num_output_partitions 1 ./input/test_warc.txt servernames +? ``` + ### Running in Spark cluster over large amounts of data @@ -233,6 +254,21 @@ Some differences between the warcio and FastWARC APIs are hidden from the user i However, it's recommended that you carefully verify that your custom job implementation works in combination with FastWARC. There are subtle differences between the warcio and FastWARC APIs, including the underlying classes (WARC/HTTP headers and stream implementations). In addition, FastWARC does not support for legacy ARC files and does not automatically decode HTTP content and transfer encodings (see [Resiliparse HTTP Tools](https://resiliparse.chatnoir.eu/en/latest/man/parse/http.html#read-chunked-http-payloads)). While content and transfer encodings are already decoded in Common Crawl WARC files, this may not be the case for WARC files from other sources. See also [WARC 1.1 specification, http/https response records](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#http-and-https-schemes). 
+## Running the Tests + +To run the tests in `test/` you will need to add `.` and `test` to the PYTHONPATH: + +```bash +PYTHONPATH=.:test pytest -v test +``` + +or if you have a manual installation of Spark: + +```bash +PYTHONPATH=$SPARK_HOME/python:.:test pytest -v test +``` + + ## Credits Examples are originally ported from Stephen Merity's [cc-mrjob](https://github.com/commoncrawl/cc-mrjob/) with the following changes and upgrades: diff --git a/requirements-pyspark.txt b/requirements-pyspark.txt deleted file mode 100644 index 3da099f..0000000 --- a/requirements-pyspark.txt +++ /dev/null @@ -1 +0,0 @@ -pyspark==3.5.7 From 2dd22dc1f6be12bd63d124181316476d0455309b Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 31 Oct 2025 14:57:24 +0100 Subject: [PATCH 22/33] typo; clarify debugging limitations Signed-off-by: Damian Stewart --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5061136..97a5cdf 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,9 @@ Spark will complain if the output directory exists - you may want to add a prepr > If you have manually installed Spark you'll need to ensure the pyspark package is on your PYTHONPATH: > ```bash > PYTHONPATH=$SPARK_HOME/python python server_count.py --num_output_partitions 1 ./input/test_warc.txt servernames -? ``` +> ``` + +Note that the `run_job` code is still invoked by the Spark Java binary behind the scenes, which normally prevents a debugger from attaching. To debug the `run_job` internals, it's recommended to set up a unit test and debug that; see `test/test_sitemaps_from_robotstxt` for examples of single and batched job tests. 
### Running in Spark cluster over large amounts of data From 126b31b735b409eb111630cffff22a199a6f8391 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 31 Oct 2025 15:17:01 +0100 Subject: [PATCH 23/33] address review comments --- README.md | 6 +++--- requirements.txt | 3 --- sitemaps_from_robotstxt.py | 21 ++++++++++++++++----- sitemaps_from_robotstxt_fastwarc.py | 4 ++++ test/test_sitemaps_from_robotstxt.py | 3 ++- 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 97a5cdf..2abacd8 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ Spark will complain if the output directory exists - you may want to add a prepr > If you have manually installed Spark you'll need to ensure the pyspark package is on your PYTHONPATH: > ```bash -> PYTHONPATH=$SPARK_HOME/python python server_count.py --num_output_partitions 1 ./input/test_warc.txt servernames +> PYTHONPATH=$PYTHONPATH:$SPARK_HOME/python python server_count.py --num_output_partitions 1 ./input/test_warc.txt servernames > ``` Note that the `run_job` code is still invoked by the Spark Java binary behind the scenes, which normally prevents a debugger from attaching. To debug the `run_job` internals, it's recommended to set up a unit test and debug that; see `test/test_sitemaps_from_robotstxt` for examples of single and batched job tests. 
@@ -261,13 +261,13 @@ However, it's recommended that you carefully verify that your custom job impleme To run the tests in `test/` you will need to add `.` and `test` to the PYTHONPATH: ```bash -PYTHONPATH=.:test pytest -v test +PYTHONPATH=$PYTHONPATH:.:test pytest -v test ``` or if you have a manual installation of Spark: ```bash -PYTHONPATH=$SPARK_HOME/python:.:test pytest -v test +PYTHONPATH=$PYTHONPATH:$SPARK_HOME/python:.:test pytest -v test ``` diff --git a/requirements.txt b/requirements.txt index f62f564..9c2bbb9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,6 @@ ujson orjson warcio -# for validating URLs in robots.txt: -validators - # for link extraction and webgraph construction also: idna diff --git a/sitemaps_from_robotstxt.py b/sitemaps_from_robotstxt.py index ad6d454..55ee651 100644 --- a/sitemaps_from_robotstxt.py +++ b/sitemaps_from_robotstxt.py @@ -1,10 +1,6 @@ -import json import re -from typing import Optional from urllib.parse import urlparse, urljoin -import validators -from py4j.protocol import Py4JError from pyspark.sql.types import StructType, StructField, StringType, ArrayType from warcio.recordloader import ArcWarcRecord @@ -30,6 +26,21 @@ class SitemapExtractorJob(CCSparkJob): robots_txt_with_more_than_50_sitemaps = None + def log_accumulators(self, session): + super(SitemapExtractorJob, self).log_accumulators(session) + + self.log_accumulator(session, self.robots_txt_processed, + 'robots.txt successfully parsed = {}') + self.log_accumulator(session, self.sitemap_urls_found, + 'sitemap urls found = {}') + self.log_accumulator(session, self.sitemap_url_invalid_encoding, + 'sitemap urls with invalid utf-8 encoding = {}') + self.log_accumulator(session, self.robots_txt_announcing_sitemap, + 'robots.txt announcing at least 1 sitemap = {}') + self.log_accumulator(session, self.robots_txt_with_more_than_50_sitemaps, + 'robots.txt with more than 50 sitemaps = {}') + + def init_accumulators(self, session): 
super(SitemapExtractorJob, self).init_accumulators(session) @@ -70,7 +81,7 @@ def process_record(self, record: ArcWarcRecord): if robots_txt_url is None: # first sitemap found: set base URL and get host from URL - robots_txt_url = record.rec_headers['WARC-Target-URI'] + robots_txt_url = self.get_warc_header(record, 'WARC-Target-URI') try: robots_txt_host = urlparse(robots_txt_url).netloc.lower().lstrip('.') except Exception as e1: diff --git a/sitemaps_from_robotstxt_fastwarc.py b/sitemaps_from_robotstxt_fastwarc.py index e15762d..ce1d3a7 100644 --- a/sitemaps_from_robotstxt_fastwarc.py +++ b/sitemaps_from_robotstxt_fastwarc.py @@ -14,3 +14,7 @@ class SitemapExtractorFastWarcJob(SitemapExtractorJob, CCFastWarcSparkJob): fastwarc_record_filter = WarcRecordType.response # process_record is implemented by SitemapExtractorJob + +if __name__ == '__main__': + job = SitemapExtractorFastWarcJob() + job.run() diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index 48ace06..ba32527 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -24,7 +24,8 @@ def make_robots_txt_record(warc_target_uri, """ record = MagicMock() record.rec_type = 'response' - record.rec_headers = {'WARC-Target-URI': warc_target_uri} + # mock rec_headers.get_header('WARC-Target-URI') + record.rec_headers.get_header = lambda key, default: warc_target_uri if key == 'WARC-Target-URI' else default record.content_stream = lambda: BytesIO(response_bytes) return record From 3a7ba3b6fbfcce8ad1b05dae59ec67a7094220ff Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 31 Oct 2025 17:40:53 +0100 Subject: [PATCH 24/33] wip --- README.md | 22 ++++++------- requirements.txt | 33 ------------------- src/__init__.py | 0 src/cc_pyspark/__init__.py | 0 bs4_parser.py => src/cc_pyspark/bs4_parser.py | 0 .../cc_pyspark/cc_index_export.py | 0 .../cc_pyspark/cc_index_word_count.py | 0 .../cc_pyspark/hostlinks_extract_fastwarc.py | 0 
.../cc_pyspark/hostlinks_to_graph.py | 0 .../cc_pyspark/html_tag_count.py | 0 iana_tld.py => src/cc_pyspark/iana_tld.py | 0 .../cc_pyspark/json_importer.py | 0 .../cc_pyspark/linkmap2parquet.py | 0 .../cc_pyspark/resiliparse_parser.py | 0 .../cc_pyspark/server_count.py | 0 .../cc_pyspark/server_count_fastwarc.py | 0 .../cc_pyspark/server_ip_address.py | 0 .../cc_pyspark/sitemaps_from_robotstxt.py | 0 .../sitemaps_from_robotstxt_fastwarc.py | 0 sparkcc.py => src/cc_pyspark/sparkcc.py | 0 .../cc_pyspark/sparkcc_fastwarc.py | 0 .../cc_pyspark/wat_extract_links.py | 2 +- word_count.py => src/cc_pyspark/word_count.py | 0 test/test_sitemaps_from_robotstxt.py | 6 ++-- 24 files changed, 14 insertions(+), 49 deletions(-) delete mode 100644 requirements.txt create mode 100644 src/__init__.py create mode 100644 src/cc_pyspark/__init__.py rename bs4_parser.py => src/cc_pyspark/bs4_parser.py (100%) rename cc_index_export.py => src/cc_pyspark/cc_index_export.py (100%) rename cc_index_word_count.py => src/cc_pyspark/cc_index_word_count.py (100%) rename hostlinks_extract_fastwarc.py => src/cc_pyspark/hostlinks_extract_fastwarc.py (100%) rename hostlinks_to_graph.py => src/cc_pyspark/hostlinks_to_graph.py (100%) rename html_tag_count.py => src/cc_pyspark/html_tag_count.py (100%) rename iana_tld.py => src/cc_pyspark/iana_tld.py (100%) rename json_importer.py => src/cc_pyspark/json_importer.py (100%) rename linkmap2parquet.py => src/cc_pyspark/linkmap2parquet.py (100%) rename resiliparse_parser.py => src/cc_pyspark/resiliparse_parser.py (100%) rename server_count.py => src/cc_pyspark/server_count.py (100%) rename server_count_fastwarc.py => src/cc_pyspark/server_count_fastwarc.py (100%) rename server_ip_address.py => src/cc_pyspark/server_ip_address.py (100%) rename sitemaps_from_robotstxt.py => src/cc_pyspark/sitemaps_from_robotstxt.py (100%) rename sitemaps_from_robotstxt_fastwarc.py => src/cc_pyspark/sitemaps_from_robotstxt_fastwarc.py (100%) rename sparkcc.py => 
src/cc_pyspark/sparkcc.py (100%) rename sparkcc_fastwarc.py => src/cc_pyspark/sparkcc_fastwarc.py (100%) rename wat_extract_links.py => src/cc_pyspark/wat_extract_links.py (99%) rename word_count.py => src/cc_pyspark/word_count.py (100%) diff --git a/README.md b/README.md index 2abacd8..40c50e1 100644 --- a/README.md +++ b/README.md @@ -4,27 +4,27 @@ This project provides examples how to process the Common Crawl dataset with [Apache Spark](https://spark.apache.org/) and Python: -+ [count HTML tags](./html_tag_count.py) in Common Crawl's raw response data (WARC files) ++ [count HTML tags](src/cc_pyspark/html_tag_count.py) in Common Crawl's raw response data (WARC files) -+ [count web server names](./server_count.py) in Common Crawl's metadata (WAT files or WARC files) ++ [count web server names](src/cc_pyspark/server_count.py) in Common Crawl's metadata (WAT files or WARC files) -+ list host names and corresponding [IP addresses](./server_ip_address.py) (WAT files or WARC files) ++ list host names and corresponding [IP addresses](src/cc_pyspark/server_ip_address.py) (WAT files or WARC files) -+ [word count](./word_count.py) (term and document frequency) in Common Crawl's extracted text (WET files) ++ [word count](src/cc_pyspark/word_count.py) (term and document frequency) in Common Crawl's extracted text (WET files) -+ [extract links](./wat_extract_links.py) from WAT files and [construct the (host-level) web graph](./hostlinks_to_graph.py) – for further details about the web graphs see the project [cc-webgraph](https://github.com/commoncrawl/cc-webgraph) ++ [extract links](src/cc_pyspark/wat_extract_links.py) from WAT files and [construct the (host-level) web graph](src/cc_pyspark/hostlinks_to_graph.py) – for further details about the web graphs see the project [cc-webgraph](https://github.com/commoncrawl/cc-webgraph) + work with the [columnar URL index](https://commoncrawl.org/2018/03/index-to-warc-files-and-urls-in-columnar-format/) (see also 
[cc-index-table](https://github.com/commoncrawl/cc-index-table) and the notes about [querying the columnar index](#querying-the-columnar-index)): - - run a SQL query and [export the result as a table](./cc_index_export.py) + - run a SQL query and [export the result as a table](src/cc_pyspark/cc_index_export.py) - - select WARC records by a SQL query, parse the HTML, extract the text and [count words](./cc_index_word_count.py). Alternatively, the first step (query the columnar index) can be executed using Amazon Athena. The list of WARC record coordinates (CSV or a table created by a CTAS statement) is then passed via `--csv` or `--input_table_format`) to the Spark job. + - select WARC records by a SQL query, parse the HTML, extract the text and [count words](src/cc_pyspark/cc_index_word_count.py). Alternatively, the first step (query the columnar index) can be executed using Amazon Athena. The list of WARC record coordinates (CSV or a table created by a CTAS statement) is then passed via `--csv` or `--input_table_format`) to the Spark job. Further information about the examples and available options is shown via the [command-line option](#command-line-options) `--help`. ## Implementing a Custom Extractor -Extending the [CCSparkJob](./sparkcc.py) isn't difficult and for many use cases it is sufficient to override a single method (`process_record`). Have a look at one of the examples, e.g. to [count HTML tags](./html_tag_count.py). +Extending the [CCSparkJob](src/cc_pyspark/sparkcc.py) isn't difficult and for many use cases it is sufficient to override a single method (`process_record`). Have a look at one of the examples, e.g. to [count HTML tags](src/cc_pyspark/html_tag_count.py). ## Setup @@ -88,7 +88,7 @@ Note that the sample data is from an older crawl (`CC-MAIN-2017-13` run in March ## Process Common Crawl Data on Spark -CC-PySpark reads the list of input files from a manifest file. 
Typically, these are Common Crawl WARC, WAT or WET files, but it could be any other type of file, as long it is supported by the class implementing [CCSparkJob](./sparkcc.py). The files can be given as absolute URLs or as paths relative to a base URL (option `--input_base_url`). The URL cat point to a local file (`file://`), to a remote location (typically below `s3://commoncrawl/` resp. `https://data.commoncrawl.org/`). For development and testing, you'd start with local files. +CC-PySpark reads the list of input files from a manifest file. Typically, these are Common Crawl WARC, WAT or WET files, but it could be any other type of file, as long as it is supported by the class implementing [CCSparkJob](src/cc_pyspark/sparkcc.py). The files can be given as absolute URLs or as paths relative to a base URL (option `--input_base_url`). The URL can point to a local file (`file://`), to a remote location (typically below `s3://commoncrawl/` resp. `https://data.commoncrawl.org/`). For development and testing, you'd start with local files. ### Running locally @@ -248,10 +248,10 @@ Alternatively, it's possible configure the table schema explicitly: Replacing [FastWARC](https://resiliparse.chatnoir.eu/en/latest/man/fastwarc.html) can speed up job execution by 25% if little custom computations are done and most of the time is spent for parsing WARC files. To use FastWARC -- the job class must inherit from [CCFastWarcSparkJob](./sparkcc_fastwarc.py) instead of [CCSparkJob](./sparkcc.py). See [ServerCountFastWarcJob](./server_count_fastwarc.py) for an example. +- the job class must inherit from [CCFastWarcSparkJob](src/cc_pyspark/sparkcc_fastwarc.py) instead of [CCSparkJob](src/cc_pyspark/sparkcc.py). See [ServerCountFastWarcJob](src/cc_pyspark/server_count_fastwarc.py) for an example. - when running the job in a Spark cluster, `sparkcc_fastwarc.py` must be passed via `--py-files` in addition to `sparkcc.py` and further job-specific Python files.
See also [running in a Spark cluster](#running-in-spark-cluster-over-large-amounts-of-data). -Some differences between the warcio and FastWARC APIs are hidden from the user in methods implemented in [CCSparkJob](./sparkcc.py) and [CCFastWarcSparkJob](./sparkcc_fastwarc.py) respectively. These methods allow to access WARC or HTTP headers and the payload stream in a unique way, regardless of whether warcio or FastWARC are used. +Some differences between the warcio and FastWARC APIs are hidden from the user in methods implemented in [CCSparkJob](src/cc_pyspark/sparkcc.py) and [CCFastWarcSparkJob](src/cc_pyspark/sparkcc_fastwarc.py) respectively. These methods allow to access WARC or HTTP headers and the payload stream in a unique way, regardless of whether warcio or FastWARC are used. However, it's recommended that you carefully verify that your custom job implementation works in combination with FastWARC. There are subtle differences between the warcio and FastWARC APIs, including the underlying classes (WARC/HTTP headers and stream implementations). In addition, FastWARC does not support for legacy ARC files and does not automatically decode HTTP content and transfer encodings (see [Resiliparse HTTP Tools](https://resiliparse.chatnoir.eu/en/latest/man/parse/http.html#read-chunked-http-payloads)). While content and transfer encodings are already decoded in Common Crawl WARC files, this may not be the case for WARC files from other sources. See also [WARC 1.1 specification, http/https response records](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#http-and-https-schemes). 
diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9c2bbb9..0000000 --- a/requirements.txt +++ /dev/null @@ -1,33 +0,0 @@ -botocore -boto3 -requests -ujson -orjson -warcio - -# for link extraction and webgraph construction also: -idna - -# for parsing HTML (used in cc_index_word_count.py) -beautifulsoup4 -lxml - -# for HDFS support (requires environments variables JAVA_HOME and HADOOP_HOME): -#pydoop - -# to parse WARC/WAT/WET files using FastWARC (https://pypi.org/project/FastWARC/) -# cf. https://github.com/commoncrawl/cc-pyspark/issues/37 -#fastwarc -# (tested with) -#fastwarc==0.15.2 - -# to parse HTML (used in cc_index_word_count.py) using Resiliparse (https://pypi.org/project/Resiliparse/). -# Resiliparse requires compatible fastwarc version. -# cf. https://github.com/commoncrawl/cc-pyspark/issues/43 -#Resiliparse -# (tested with) -#Resiliparse==0.15.2 - -# testing -pytest -pytest-mock diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/cc_pyspark/__init__.py b/src/cc_pyspark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bs4_parser.py b/src/cc_pyspark/bs4_parser.py similarity index 100% rename from bs4_parser.py rename to src/cc_pyspark/bs4_parser.py diff --git a/cc_index_export.py b/src/cc_pyspark/cc_index_export.py similarity index 100% rename from cc_index_export.py rename to src/cc_pyspark/cc_index_export.py diff --git a/cc_index_word_count.py b/src/cc_pyspark/cc_index_word_count.py similarity index 100% rename from cc_index_word_count.py rename to src/cc_pyspark/cc_index_word_count.py diff --git a/hostlinks_extract_fastwarc.py b/src/cc_pyspark/hostlinks_extract_fastwarc.py similarity index 100% rename from hostlinks_extract_fastwarc.py rename to src/cc_pyspark/hostlinks_extract_fastwarc.py diff --git a/hostlinks_to_graph.py b/src/cc_pyspark/hostlinks_to_graph.py similarity index 100% rename from hostlinks_to_graph.py rename to 
src/cc_pyspark/hostlinks_to_graph.py diff --git a/html_tag_count.py b/src/cc_pyspark/html_tag_count.py similarity index 100% rename from html_tag_count.py rename to src/cc_pyspark/html_tag_count.py diff --git a/iana_tld.py b/src/cc_pyspark/iana_tld.py similarity index 100% rename from iana_tld.py rename to src/cc_pyspark/iana_tld.py diff --git a/json_importer.py b/src/cc_pyspark/json_importer.py similarity index 100% rename from json_importer.py rename to src/cc_pyspark/json_importer.py diff --git a/linkmap2parquet.py b/src/cc_pyspark/linkmap2parquet.py similarity index 100% rename from linkmap2parquet.py rename to src/cc_pyspark/linkmap2parquet.py diff --git a/resiliparse_parser.py b/src/cc_pyspark/resiliparse_parser.py similarity index 100% rename from resiliparse_parser.py rename to src/cc_pyspark/resiliparse_parser.py diff --git a/server_count.py b/src/cc_pyspark/server_count.py similarity index 100% rename from server_count.py rename to src/cc_pyspark/server_count.py diff --git a/server_count_fastwarc.py b/src/cc_pyspark/server_count_fastwarc.py similarity index 100% rename from server_count_fastwarc.py rename to src/cc_pyspark/server_count_fastwarc.py diff --git a/server_ip_address.py b/src/cc_pyspark/server_ip_address.py similarity index 100% rename from server_ip_address.py rename to src/cc_pyspark/server_ip_address.py diff --git a/sitemaps_from_robotstxt.py b/src/cc_pyspark/sitemaps_from_robotstxt.py similarity index 100% rename from sitemaps_from_robotstxt.py rename to src/cc_pyspark/sitemaps_from_robotstxt.py diff --git a/sitemaps_from_robotstxt_fastwarc.py b/src/cc_pyspark/sitemaps_from_robotstxt_fastwarc.py similarity index 100% rename from sitemaps_from_robotstxt_fastwarc.py rename to src/cc_pyspark/sitemaps_from_robotstxt_fastwarc.py diff --git a/sparkcc.py b/src/cc_pyspark/sparkcc.py similarity index 100% rename from sparkcc.py rename to src/cc_pyspark/sparkcc.py diff --git a/sparkcc_fastwarc.py b/src/cc_pyspark/sparkcc_fastwarc.py similarity index 
100% rename from sparkcc_fastwarc.py rename to src/cc_pyspark/sparkcc_fastwarc.py diff --git a/wat_extract_links.py b/src/cc_pyspark/wat_extract_links.py similarity index 99% rename from wat_extract_links.py rename to src/cc_pyspark/wat_extract_links.py index c09e103..f7c0b0f 100644 --- a/wat_extract_links.py +++ b/src/cc_pyspark/wat_extract_links.py @@ -7,7 +7,7 @@ from pyspark.sql.types import StructType, StructField, StringType from sparkcc import CCSparkJob -from json_importer import json +from src.cc_pyspark.json_importer import json class ExtractLinksJob(CCSparkJob): diff --git a/word_count.py b/src/cc_pyspark/word_count.py similarity index 100% rename from word_count.py rename to src/cc_pyspark/word_count.py diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index ba32527..385cf35 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -1,14 +1,12 @@ -import json from io import BytesIO import pytest from unittest.mock import MagicMock, Mock from pyspark.sql import SparkSession -from warcio.recordloader import ArcWarcRecord -from sitemaps_from_robotstxt import SitemapExtractorJob -from sparkcc import CCSparkJob +from cc_pyspark.sitemaps_from_robotstxt import SitemapExtractorJob +from cc_pyspark.sparkcc import CCSparkJob from utils import _process_jobs From 69d66a0ef14fd7f3bbeb6b5b168a019c3aa01780 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 31 Oct 2025 17:41:03 +0100 Subject: [PATCH 25/33] add pyproject.toml --- pyproject.toml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0ce8763 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "cc-pyspark" +version = "0.1.0" +description = "Common Crawl data processing examples for PySpark." 
+readme = "README.md" +license = {text = "MIT"} +requires-python = ">=3.9" +dependencies = [ + "beautifulsoup4>=4.14.2", + "boto3>=1.40.63", + "botocore>=1.40.63", + "idna>=3.11", + "lxml>=6.0.2", + "orjson>=3.11.4", + "pytest>=8.4.2", + "pytest-mock>=3.15.1", + "requests>=2.32.5", + "ujson>=5.11.0", + "warcio>=1.7.5", + "pyspark==3.5.7", +] +[tool.pytest.ini_options] +pythonpath = [ + "src" +] From 8fa2c046b02b19bd84425a921740b87e826e4c3c Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Mon, 3 Nov 2025 14:03:38 +0100 Subject: [PATCH 26/33] wip packaging --- README.md | 18 ++++---- pyproject.toml | 1 - script/package_and_run_job.sh | 38 ++++++++++++++++ script/run_job.py | 43 +++++++++++++++++++ src/cc_pyspark/jobs/__init__.py | 0 src/cc_pyspark/{ => jobs}/cc_index_export.py | 2 +- .../{ => jobs}/cc_index_word_count.py | 2 +- .../{ => jobs}/hostlinks_extract_fastwarc.py | 0 .../{ => jobs}/hostlinks_to_graph.py | 2 +- src/cc_pyspark/{ => jobs}/html_tag_count.py | 2 +- src/cc_pyspark/{ => jobs}/linkmap2parquet.py | 2 +- src/cc_pyspark/{ => jobs}/server_count.py | 4 +- .../{ => jobs}/server_count_fastwarc.py | 0 .../{ => jobs}/server_ip_address.py | 2 +- .../{ => jobs}/sitemaps_from_robotstxt.py | 2 +- .../sitemaps_from_robotstxt_fastwarc.py | 0 .../{ => jobs}/wat_extract_links.py | 2 +- src/cc_pyspark/{ => jobs}/word_count.py | 2 +- src/cc_pyspark/sparkcc_fastwarc.py | 2 +- test/test_sitemaps_from_robotstxt.py | 2 +- 20 files changed, 103 insertions(+), 23 deletions(-) create mode 100755 script/package_and_run_job.sh create mode 100644 script/run_job.py create mode 100644 src/cc_pyspark/jobs/__init__.py rename src/cc_pyspark/{ => jobs}/cc_index_export.py (81%) rename src/cc_pyspark/{ => jobs}/cc_index_word_count.py (98%) rename src/cc_pyspark/{ => jobs}/hostlinks_extract_fastwarc.py (100%) rename src/cc_pyspark/{ => jobs}/hostlinks_to_graph.py (99%) rename src/cc_pyspark/{ => jobs}/html_tag_count.py (95%) rename src/cc_pyspark/{ => jobs}/linkmap2parquet.py (96%) 
rename src/cc_pyspark/{ => jobs}/server_count.py (97%) rename src/cc_pyspark/{ => jobs}/server_count_fastwarc.py (100%) rename src/cc_pyspark/{ => jobs}/server_ip_address.py (98%) rename src/cc_pyspark/{ => jobs}/sitemaps_from_robotstxt.py (99%) rename src/cc_pyspark/{ => jobs}/sitemaps_from_robotstxt_fastwarc.py (100%) rename src/cc_pyspark/{ => jobs}/wat_extract_links.py (99%) rename src/cc_pyspark/{ => jobs}/word_count.py (96%) diff --git a/README.md b/README.md index 40c50e1..bff6e81 100644 --- a/README.md +++ b/README.md @@ -4,27 +4,27 @@ This project provides examples how to process the Common Crawl dataset with [Apache Spark](https://spark.apache.org/) and Python: -+ [count HTML tags](src/cc_pyspark/html_tag_count.py) in Common Crawl's raw response data (WARC files) ++ [count HTML tags](src/cc_pyspark/jobs/html_tag_count.py) in Common Crawl's raw response data (WARC files) -+ [count web server names](src/cc_pyspark/server_count.py) in Common Crawl's metadata (WAT files or WARC files) ++ [count web server names](src/cc_pyspark/jobs/server_count.py) in Common Crawl's metadata (WAT files or WARC files) -+ list host names and corresponding [IP addresses](src/cc_pyspark/server_ip_address.py) (WAT files or WARC files) ++ list host names and corresponding [IP addresses](src/cc_pyspark/jobs/server_ip_address.py) (WAT files or WARC files) -+ [word count](src/cc_pyspark/word_count.py) (term and document frequency) in Common Crawl's extracted text (WET files) ++ [word count](src/cc_pyspark/jobs/word_count.py) (term and document frequency) in Common Crawl's extracted text (WET files) -+ [extract links](src/cc_pyspark/wat_extract_links.py) from WAT files and [construct the (host-level) web graph](src/cc_pyspark/hostlinks_to_graph.py) – for further details about the web graphs see the project [cc-webgraph](https://github.com/commoncrawl/cc-webgraph) ++ [extract links](src/cc_pyspark/jobs/wat_extract_links.py) from WAT files and [construct the (host-level) web 
graph](src/cc_pyspark/jobs/hostlinks_to_graph.py) – for further details about the web graphs see the project [cc-webgraph](https://github.com/commoncrawl/cc-webgraph) + work with the [columnar URL index](https://commoncrawl.org/2018/03/index-to-warc-files-and-urls-in-columnar-format/) (see also [cc-index-table](https://github.com/commoncrawl/cc-index-table) and the notes about [querying the columnar index](#querying-the-columnar-index)): - - run a SQL query and [export the result as a table](src/cc_pyspark/cc_index_export.py) + - run a SQL query and [export the result as a table](src/cc_pyspark/jobs/cc_index_export.py) - - select WARC records by a SQL query, parse the HTML, extract the text and [count words](src/cc_pyspark/cc_index_word_count.py). Alternatively, the first step (query the columnar index) can be executed using Amazon Athena. The list of WARC record coordinates (CSV or a table created by a CTAS statement) is then passed via `--csv` or `--input_table_format`) to the Spark job. + - select WARC records by a SQL query, parse the HTML, extract the text and [count words](src/cc_pyspark/jobs/cc_index_word_count.py). Alternatively, the first step (query the columnar index) can be executed using Amazon Athena. The list of WARC record coordinates (CSV or a table created by a CTAS statement) is then passed via `--csv` or `--input_table_format`) to the Spark job. Further information about the examples and available options is shown via the [command-line option](#command-line-options) `--help`. ## Implementing a Custom Extractor -Extending the [CCSparkJob](src/cc_pyspark/sparkcc.py) isn't difficult and for many use cases it is sufficient to override a single method (`process_record`). Have a look at one of the examples, e.g. to [count HTML tags](src/cc_pyspark/html_tag_count.py). +Extending the [CCSparkJob](src/cc_pyspark/sparkcc.py) isn't difficult and for many use cases it is sufficient to override a single method (`process_record`). 
Have a look at one of the examples, e.g. to [count HTML tags](src/cc_pyspark/jobs/html_tag_count.py). ## Setup @@ -248,7 +248,7 @@ Alternatively, it's possible configure the table schema explicitly: Replacing [FastWARC](https://resiliparse.chatnoir.eu/en/latest/man/fastwarc.html) can speed up job execution by 25% if little custom computations are done and most of the time is spent for parsing WARC files. To use FastWARC -- the job class must inherit from [CCFastWarcSparkJob](src/cc_pyspark/sparkcc_fastwarc.py) instead of [CCSparkJob](src/cc_pyspark/sparkcc.py). See [ServerCountFastWarcJob](src/cc_pyspark/server_count_fastwarc.py) for an example. +- the job class must inherit from [CCFastWarcSparkJob](src/cc_pyspark/sparkcc_fastwarc.py) instead of [CCSparkJob](src/cc_pyspark/sparkcc.py). See [ServerCountFastWarcJob](src/cc_pyspark/jobs/server_count_fastwarc.py) for an example. - when running the job in a Spark cluster, `sparkcc_fastwarc.py` must be passed via `--py-files` in addition to `sparkcc.py` and further job-specific Python files. See also [running in a Spark cluster](#running-in-spark-cluster-over-large-amounts-of-data). Some differences between the warcio and FastWARC APIs are hidden from the user in methods implemented in [CCSparkJob](src/cc_pyspark/sparkcc.py) and [CCFastWarcSparkJob](src/cc_pyspark/sparkcc_fastwarc.py) respectively. These methods allow to access WARC or HTTP headers and the payload stream in a unique way, regardless of whether warcio or FastWARC are used. 
diff --git a/pyproject.toml b/pyproject.toml index 0ce8763..c0b32b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ dependencies = [ "requests>=2.32.5", "ujson>=5.11.0", "warcio>=1.7.5", - "pyspark==3.5.7", ] [tool.pytest.ini_options] pythonpath = [ diff --git a/script/package_and_run_job.sh b/script/package_and_run_job.sh new file mode 100755 index 0000000..066289e --- /dev/null +++ b/script/package_and_run_job.sh @@ -0,0 +1,38 @@ + +DIR=$(dirname "$0") +# if "-h" or "--help" is passed, or no args, show usage +if [ "$#" -eq 0 ] || [ "$1" = "-h" ] || [ "$1" = "--help" ]; then + echo "Submits a Spark job using cc-pyspark packaged in a virtual environment." + echo "Usage: $0 [job_args...]" + echo " : The job module to run (e.g., cc_pyspark.jobs.server_count)" + echo " [job_args...]: Additional arguments to pass to the job" + exit 1 +fi + +JOB_MODULE=$1 +shift 1 + +VENV_DIR=$(mktemp -d) +VENV_PACK_FILE=$(mktemp -u).tar.gz + +cleanup() { + rm -rf "$VENV_DIR" + echo rm -f "$VENV_PACK_FILE" + echo NOT DELETING "$VENV_PACK_FILE" "for debugging purposes" +} +trap cleanup EXIT + +set -e +set -x + +python -m venv $VENV_DIR +source $VENV_DIR/bin/activate +#pip install cc-pyspark +pip install . 
+pip install venv-pack +venv-pack -o $VENV_PACK_FILE +deactivate + +export PYSPARK_PYTHON=./environment/bin/python +spark-submit --archives "$VENV_PACK_FILE#environment" "$DIR"/run_job.py --job_module $JOB_MODULE "$@" + diff --git a/script/run_job.py b/script/run_job.py new file mode 100644 index 0000000..ca73c2b --- /dev/null +++ b/script/run_job.py @@ -0,0 +1,43 @@ +import argparse +import importlib +import inspect +import sys + +from cc_pyspark.sparkcc import CCSparkJob + + +def load_and_run_job(module_name: str): + job_module = importlib.import_module(module_name) + + # Find the job class in the module + job_class = None + for name, obj in inspect.getmembers(job_module, inspect.isclass): + if obj.__module__ == job_module.__name__ and issubclass(obj, CCSparkJob): + print("found job class:", obj) + job_class = obj + break + + if job_class is None: + raise ValueError(f"No CCSparkJob subclass found in module {module_name}") + + job_instance = job_class() + print("running job:", job_instance) + job_instance.run() + + +def main(): + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument('--job_module', type=str, required=True,) + + args, remaining = arg_parser.parse_known_args() + + # remove wrapper args from sys.argv so that job class can parse its own args cleanly + sys.argv = [sys.argv[0]] + remaining + + load_and_run_job(args.job_module) + + +if __name__ == '__main__': + main() + + diff --git a/src/cc_pyspark/jobs/__init__.py b/src/cc_pyspark/jobs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/cc_pyspark/cc_index_export.py b/src/cc_pyspark/jobs/cc_index_export.py similarity index 81% rename from src/cc_pyspark/cc_index_export.py rename to src/cc_pyspark/jobs/cc_index_export.py index 2a70344..4519528 100644 --- a/src/cc_pyspark/cc_index_export.py +++ b/src/cc_pyspark/jobs/cc_index_export.py @@ -1,4 +1,4 @@ -from sparkcc import CCIndexSparkJob +from cc_pyspark.sparkcc import CCIndexSparkJob class 
CCIndexExportJob(CCIndexSparkJob): diff --git a/src/cc_pyspark/cc_index_word_count.py b/src/cc_pyspark/jobs/cc_index_word_count.py similarity index 98% rename from src/cc_pyspark/cc_index_word_count.py rename to src/cc_pyspark/jobs/cc_index_word_count.py index bca69a5..1dd7e04 100644 --- a/src/cc_pyspark/cc_index_word_count.py +++ b/src/cc_pyspark/jobs/cc_index_word_count.py @@ -1,6 +1,6 @@ from collections import Counter -from sparkcc import CCIndexWarcSparkJob +from cc_pyspark.sparkcc import CCIndexWarcSparkJob from word_count import WordCountJob diff --git a/src/cc_pyspark/hostlinks_extract_fastwarc.py b/src/cc_pyspark/jobs/hostlinks_extract_fastwarc.py similarity index 100% rename from src/cc_pyspark/hostlinks_extract_fastwarc.py rename to src/cc_pyspark/jobs/hostlinks_extract_fastwarc.py diff --git a/src/cc_pyspark/hostlinks_to_graph.py b/src/cc_pyspark/jobs/hostlinks_to_graph.py similarity index 99% rename from src/cc_pyspark/hostlinks_to_graph.py rename to src/cc_pyspark/jobs/hostlinks_to_graph.py index 32d3833..776d077 100644 --- a/src/cc_pyspark/hostlinks_to_graph.py +++ b/src/cc_pyspark/jobs/hostlinks_to_graph.py @@ -2,7 +2,7 @@ import logging import os -from sparkcc import CCSparkJob +from cc_pyspark.sparkcc import CCSparkJob from pyspark.sql import functions as sqlf from pyspark.sql.types import BooleanType, LongType, StringType, StructField, StructType diff --git a/src/cc_pyspark/html_tag_count.py b/src/cc_pyspark/jobs/html_tag_count.py similarity index 95% rename from src/cc_pyspark/html_tag_count.py rename to src/cc_pyspark/jobs/html_tag_count.py index ae5f579..baec297 100644 --- a/src/cc_pyspark/html_tag_count.py +++ b/src/cc_pyspark/jobs/html_tag_count.py @@ -2,7 +2,7 @@ from collections import Counter -from sparkcc import CCSparkJob +from cc_pyspark.sparkcc import CCSparkJob class TagCountJob(CCSparkJob): diff --git a/src/cc_pyspark/linkmap2parquet.py b/src/cc_pyspark/jobs/linkmap2parquet.py similarity index 96% rename from 
src/cc_pyspark/linkmap2parquet.py rename to src/cc_pyspark/jobs/linkmap2parquet.py index e57ca5c..e69122f 100644 --- a/src/cc_pyspark/linkmap2parquet.py +++ b/src/cc_pyspark/jobs/linkmap2parquet.py @@ -1,4 +1,4 @@ -from sparkcc import CCSparkJob +from cc_pyspark.sparkcc import CCSparkJob from pyspark.sql.types import StructType, StructField, StringType diff --git a/src/cc_pyspark/server_count.py b/src/cc_pyspark/jobs/server_count.py similarity index 97% rename from src/cc_pyspark/server_count.py rename to src/cc_pyspark/jobs/server_count.py index a10ee76..c01f7b4 100644 --- a/src/cc_pyspark/server_count.py +++ b/src/cc_pyspark/jobs/server_count.py @@ -1,5 +1,5 @@ -from sparkcc import CCSparkJob -from json_importer import json +from cc_pyspark.sparkcc import CCSparkJob +from cc_pyspark.json_importer import json class ServerCountJob(CCSparkJob): diff --git a/src/cc_pyspark/server_count_fastwarc.py b/src/cc_pyspark/jobs/server_count_fastwarc.py similarity index 100% rename from src/cc_pyspark/server_count_fastwarc.py rename to src/cc_pyspark/jobs/server_count_fastwarc.py diff --git a/src/cc_pyspark/server_ip_address.py b/src/cc_pyspark/jobs/server_ip_address.py similarity index 98% rename from src/cc_pyspark/server_ip_address.py rename to src/cc_pyspark/jobs/server_ip_address.py index bf3f30c..9ca0019 100644 --- a/src/cc_pyspark/server_ip_address.py +++ b/src/cc_pyspark/jobs/server_ip_address.py @@ -4,7 +4,7 @@ from pyspark.sql.types import StructType, StructField, StringType, LongType -from sparkcc import CCSparkJob +from cc_pyspark.sparkcc import CCSparkJob class ServerIPAddressJob(CCSparkJob): diff --git a/src/cc_pyspark/sitemaps_from_robotstxt.py b/src/cc_pyspark/jobs/sitemaps_from_robotstxt.py similarity index 99% rename from src/cc_pyspark/sitemaps_from_robotstxt.py rename to src/cc_pyspark/jobs/sitemaps_from_robotstxt.py index 55ee651..ea86f1a 100644 --- a/src/cc_pyspark/sitemaps_from_robotstxt.py +++ b/src/cc_pyspark/jobs/sitemaps_from_robotstxt.py @@ -4,7 
+4,7 @@ from pyspark.sql.types import StructType, StructField, StringType, ArrayType from warcio.recordloader import ArcWarcRecord -from sparkcc import CCSparkJob +from cc_pyspark.sparkcc import CCSparkJob class SitemapExtractorJob(CCSparkJob): """Extract sitemap URLs (http://www.sitemaps.org/) from robots.txt WARC files.""" diff --git a/src/cc_pyspark/sitemaps_from_robotstxt_fastwarc.py b/src/cc_pyspark/jobs/sitemaps_from_robotstxt_fastwarc.py similarity index 100% rename from src/cc_pyspark/sitemaps_from_robotstxt_fastwarc.py rename to src/cc_pyspark/jobs/sitemaps_from_robotstxt_fastwarc.py diff --git a/src/cc_pyspark/wat_extract_links.py b/src/cc_pyspark/jobs/wat_extract_links.py similarity index 99% rename from src/cc_pyspark/wat_extract_links.py rename to src/cc_pyspark/jobs/wat_extract_links.py index f7c0b0f..91cd393 100644 --- a/src/cc_pyspark/wat_extract_links.py +++ b/src/cc_pyspark/jobs/wat_extract_links.py @@ -6,7 +6,7 @@ from pyspark.sql.types import StructType, StructField, StringType -from sparkcc import CCSparkJob +from cc_pyspark.sparkcc import CCSparkJob from src.cc_pyspark.json_importer import json diff --git a/src/cc_pyspark/word_count.py b/src/cc_pyspark/jobs/word_count.py similarity index 96% rename from src/cc_pyspark/word_count.py rename to src/cc_pyspark/jobs/word_count.py index 0b5d5a0..4651c8b 100644 --- a/src/cc_pyspark/word_count.py +++ b/src/cc_pyspark/jobs/word_count.py @@ -4,7 +4,7 @@ from pyspark.sql.types import StructType, StructField, StringType, LongType -from sparkcc import CCSparkJob +from cc_pyspark.sparkcc import CCSparkJob class WordCountJob(CCSparkJob): diff --git a/src/cc_pyspark/sparkcc_fastwarc.py b/src/cc_pyspark/sparkcc_fastwarc.py index ccf25ed..885f56f 100644 --- a/src/cc_pyspark/sparkcc_fastwarc.py +++ b/src/cc_pyspark/sparkcc_fastwarc.py @@ -2,7 +2,7 @@ from fastwarc.warc import WarcRecordType, WarcRecord from fastwarc.stream_io import FastWARCError -from sparkcc import CCSparkJob +from cc_pyspark.sparkcc import 
CCSparkJob class CCFastWarcSparkJob(CCSparkJob): diff --git a/test/test_sitemaps_from_robotstxt.py b/test/test_sitemaps_from_robotstxt.py index 385cf35..ce1f1b2 100644 --- a/test/test_sitemaps_from_robotstxt.py +++ b/test/test_sitemaps_from_robotstxt.py @@ -5,7 +5,7 @@ from pyspark.sql import SparkSession -from cc_pyspark.sitemaps_from_robotstxt import SitemapExtractorJob +from cc_pyspark.jobs.sitemaps_from_robotstxt import SitemapExtractorJob from cc_pyspark.sparkcc import CCSparkJob from utils import _process_jobs From 2daa4b76c1112d7cc1db0ab3a4084b9dc75abb83 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 7 Nov 2025 17:37:51 +0100 Subject: [PATCH 27/33] wip package_and_run_job --- script/package_and_run_job.sh | 65 +++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/script/package_and_run_job.sh b/script/package_and_run_job.sh index 066289e..23eebfb 100755 --- a/script/package_and_run_job.sh +++ b/script/package_and_run_job.sh @@ -1,16 +1,49 @@ DIR=$(dirname "$0") -# if "-h" or "--help" is passed, or no args, show usage -if [ "$#" -eq 0 ] || [ "$1" = "-h" ] || [ "$1" = "--help" ]; then - echo "Submits a Spark job using cc-pyspark packaged in a virtual environment." - echo "Usage: $0 [job_args...]" - echo " : The job module to run (e.g., cc_pyspark.jobs.server_count)" - echo " [job_args...]: Additional arguments to pass to the job" + +function usage() { + echo "Usage: $0 --job_module [--master ] [job_args...]" + echo "" + echo "Options:" + echo " --job_module The job module to run (e.g., cc_pyspark.jobs.server_count)" + echo " --master The Spark master URL (optional)" + echo " -h, --help Show this help message" + echo "" + echo "Positional Arguments:" + echo " job_args... Additional arguments to pass to the job" +} + +MASTER_URL= +JOB_MODULE= +while [ "$#" -gt 0 ]; do + case $1 in + --job_module) + JOB_MODULE=$2 + shift 2 + ;; + --master) + MASTER_URL=$2 + shift 2 + ;; + -h|--help|-?) 
+ usage + exit 0 + ;; + *) + # remaining args are job args + break + ;; + esac +done + +if [ -z "$JOB_MODULE" ]; then + echo "Error: --job_module is required" + usage exit 1 fi - -JOB_MODULE=$1 -shift 1 +if [ -z "$MASTER_URL" ]; then + MASTER_URL="local[*]" +fi VENV_DIR=$(mktemp -d) VENV_PACK_FILE=$(mktemp -u).tar.gz @@ -33,6 +66,18 @@ pip install venv-pack venv-pack -o $VENV_PACK_FILE deactivate +# if SPARK_HOME is not set, use `spark-submit` in path, otherwise use $SPARK_HOME/bin/spark-submit +if [ -z "$SPARK_HOME" ]; then + SPARK_SUBMIT="spark-submit" +else + SPARK_SUBMIT="$SPARK_HOME/bin/spark-submit" +fi + export PYSPARK_PYTHON=./environment/bin/python -spark-submit --archives "$VENV_PACK_FILE#environment" "$DIR"/run_job.py --job_module $JOB_MODULE "$@" +$SPARK_SUBMIT \ + --master $MASTER_URL \ + --archives "$VENV_PACK_FILE#environment" \ + "$DIR"/run_job.py \ + --job_module $JOB_MODULE \ + "$@" From 5c4b1c610ddef18d9368ea9e5eee6bebd54ee116 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 7 Nov 2025 17:38:14 +0100 Subject: [PATCH 28/33] update version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c0b32b0..de902c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "cc-pyspark" -version = "0.1.0" +version = "0.1.0-dev0" description = "Common Crawl data processing examples for PySpark." 
readme = "README.md" license = {text = "MIT"} From f6edfb4b7236b3157a727f242245d94ea1c82771 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Fri, 7 Nov 2025 17:43:12 +0100 Subject: [PATCH 29/33] add pypi publish workflow --- .github/workflows/python_publish.yaml | 73 +++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 .github/workflows/python_publish.yaml diff --git a/.github/workflows/python_publish.yaml b/.github/workflows/python_publish.yaml new file mode 100644 index 0000000..9d64f60 --- /dev/null +++ b/.github/workflows/python_publish.yaml @@ -0,0 +1,73 @@ +# This workflow will upload a Python Package to PyPI when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package to PyPI + +on: + workflow_dispatch + +#on: +# release: +# types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. + python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + runs-on: ubuntu-latest + needs: + - release-build + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + + # Dedicated environments with protections for publishing are strongly recommended. 
+ # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules + environment: + name: pypi + # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: + url: https://pypi.org/p/compel + # + # ALTERNATIVE: if your GitHub Release name is the PyPI project version string + # ALTERNATIVE: exactly, uncomment the following line instead: + # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }} + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ From 1ffe57d0c218c38ad8b63446976a8a0ca81e039e Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 13 Nov 2025 17:49:48 +0100 Subject: [PATCH 30/33] try and make the workflow run --- .github/workflows/python_publish.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python_publish.yaml b/.github/workflows/python_publish.yaml index 9d64f60..175443c 100644 --- a/.github/workflows/python_publish.yaml +++ b/.github/workflows/python_publish.yaml @@ -9,12 +9,15 @@ name: Upload Python Package to PyPI on: - workflow_dispatch + #workflow_dispatch + pull_request: + types: [opened, reopened] #on: # release: # types: [published] + permissions: contents: read From f29d614268ea23083750a18c82fa8cd6cda55424 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 13 Nov 2025 17:50:52 +0100 Subject: [PATCH 31/33] try and make the workflow run 2 --- .github/workflows/python_publish.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python_publish.yaml b/.github/workflows/python_publish.yaml index 175443c..bcdf0f8 100644 --- a/.github/workflows/python_publish.yaml +++ 
b/.github/workflows/python_publish.yaml @@ -9,9 +9,8 @@ name: Upload Python Package to PyPI on: - #workflow_dispatch + workflow_dispatch pull_request: - types: [opened, reopened] #on: # release: From 3618e7b14e3fd1b5943246b47c288b36affd5cdf Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 13 Nov 2025 17:51:57 +0100 Subject: [PATCH 32/33] try and make the workflow run 3 --- .github/workflows/python_publish.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_publish.yaml b/.github/workflows/python_publish.yaml index bcdf0f8..07af94a 100644 --- a/.github/workflows/python_publish.yaml +++ b/.github/workflows/python_publish.yaml @@ -9,7 +9,7 @@ name: Upload Python Package to PyPI on: - workflow_dispatch + workflow_dispatch: pull_request: #on: From 9c08fc318fa4e164604b47e938a00817d6f7962d Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 13 Nov 2025 17:54:16 +0100 Subject: [PATCH 33/33] fix pypi project url --- .github/workflows/python_publish.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_publish.yaml b/.github/workflows/python_publish.yaml index 07af94a..a6de921 100644 --- a/.github/workflows/python_publish.yaml +++ b/.github/workflows/python_publish.yaml @@ -56,7 +56,7 @@ jobs: environment: name: pypi # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: - url: https://pypi.org/p/compel + url: https://pypi.org/p/cc-pyspark # # ALTERNATIVE: if your GitHub Release name is the PyPI project version string # ALTERNATIVE: exactly, uncomment the following line instead: