From 870cca73351111643d0e0a90bf11999436f2bcbb Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Thu, 1 Dec 2016 09:47:32 +0100
Subject: [PATCH 01/66] Fix property to compress map output

---
 run_index_hadoop.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index a3749e5..8de345c 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -76,7 +76,7 @@ fi
 if [ -n "$REUSE_SPLIT_FILE" ]; then
     SPLIT_FILE="$REUSE_SPLIT_FILE"
 else
-    # mapreduce.output.fileoutputformat.compress=true
+    # mapreduce.map.output.compress=true
     #    must compress task output to avoid that the single reducer node fails with a full disk
     #    anyway, it may require 60 GB of local disk space on the reducer node
     # mapreduce.map.memory.mb=640
@@ -91,7 +91,7 @@ else
            --jobconf "mapreduce.map.java.opts=-Xmx512m" \
            --jobconf "mapreduce.reduce.memory.mb=1024" \
            --jobconf "mapreduce.reduce.java.opts=-Xmx512m" \
-           --jobconf "mapreduce.output.fileoutputformat.compress=true" \
+           --jobconf "mapreduce.map.output.compress=true" \
            -r hadoop $WARC_CDX
     mv splits.seq $(basename s3${SPLIT_FILE#s3a})
 

From cdf4603ee33c4c12c245fe5574904982f6166398 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Thu, 1 Dec 2016 13:54:45 +0100
Subject: [PATCH 02/66] Add script to publish the index on s3://commoncrawl/

---
 publish_index.sh | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100755 publish_index.sh

diff --git a/publish_index.sh b/publish_index.sh
new file mode 100755
index 0000000..4c165d2
--- /dev/null
+++ b/publish_index.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+YEARWEEK="$1"
+
+if [ -z "$YEARWEEK" ]; then
+    echo "$0 <year-week-of-crawl-archive>"
+    exit 1
+fi
+
+set -x
+set -e
+
+
+## Create the metadata (title) for the index on the website
+if ! [ -e $YEARWEEK-metadata.yaml ]; then
+    s3cmd get s3://commoncrawl/cc-index/collections/CC-MAIN-2015-18/metadata.yaml $YEARWEEK-metadata.yaml
+    echo "Please, edit $YEARWEEK-metadata.yaml"
+    exit 1
+fi
+s3cmd put $YEARWEEK-metadata.yaml s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/metadata.yaml --acl-public
+
+test -d cdx-$YEARWEEK || mkdir cdx-$YEARWEEK
+cd cdx-$YEARWEEK
+
+## create cluster index
+s3cmd get --force s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/part-*
+cat part-* | awk '{printf "%s\t%s\n",$0,NR}' > cluster.idx
+export LC_ALL=C
+sort -c ./cluster.idx
+rm ./part-00*
+s3cmd put ./cluster.idx s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx --acl-public
+
+## set public permissions where needed (technically only the cdx-* need to be public)
+s3cmd setacl s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --acl-private --recursive --exclude='*' --include='part-*' --include='_SUCCESS'
+s3cmd setacl s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --acl-public --recursive --exclude='*' --include='cdx-*.gz' --include='cluster.idx' --include='metadata.yaml'
+
+# make *.cdx.gz files (to be deleted later) private
+s3cmd setacl s3://commoncrawl/cc-index/CC-MAIN-$YEARWEEK/segments/ --acl-private --recursive

From 9977d92919960f266164fe7684df06fc67bb8ff7 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Thu, 15 Dec 2016 15:09:10 +0100
Subject: [PATCH 03/66] proper error message if WARC file is not found

---
 indexwarcsjob.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index bf48fa2..7b7d71f 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -83,6 +83,10 @@ def _conv_warc_to_cdx_path(self, warc_path):
     def _load_and_index(self, warc_path):
         warckey = self.warc_bucket.get_key(warc_path)
 
+        if warckey is None:
+            sys.stderr.write('WARC not found: {}\n'.format(warc_path))
+            return
+
         cdx_path = self._conv_warc_to_cdx_path(warc_path)
 
         if self.options.skip_existing:

From 0afe5bfc54188dfdbe94866b5a4a500121df00ec Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Thu, 15 Dec 2016 15:22:43 +0100
Subject: [PATCH 04/66] Explicitly do not compress output of sample job, fixes
 #2, add comments on how to write sequence file from text file

---
 run_index_hadoop.sh | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 8de345c..2e56603 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -12,6 +12,7 @@ Create a Common Crawl index for a monthly crawl. All steps are run on Hadoop.
 
   <path-to-warc-file-list>  list of WARC file objects to be indexed, e.g, the WARC list
                                s3://commoncrawl/crawl-data/CC-MAIN-2016-44/warc.paths.gz
+                         or any subset or union of multiple WARC lists (incl. robots.txt WARCs).
                          Paths in the list must be keys/objects in the Common Crawl bucket.
                          The path to the list must be a valid and complete HDFS or S3A URL,
                          e.g. hdfs://hdfs-master.example.com/user/hadoop-user/CC-MAIN-2016-44.paths
@@ -42,10 +43,14 @@ echo
 echo WARC_MANIFEST="$WARC_MANIFEST"
 echo
 
+# glob pattern to match all CDX files generated in step 1 (indexwarcsjob.py)
+# (filesystem protocol must be supported by the used Hadoop version)
 export WARC_CDX="s3a://commoncrawl/cc-index/cdx/CC-MAIN-$YEARWEEK/segments/*/*/*.cdx.gz"
 
+# AWS S3 bucket to hold CDX files
 export WARC_CDX_BUCKET="commoncrawl"
 
+# path to index files
 export ZIPNUM_CLUSTER_DIR="s3a://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/"
 
 # SPLIT_FILE could be reused from previous crawl with similar distribution of URLs, see REUSE_SPLIT_FILE
@@ -60,7 +65,7 @@ set -x
 
 if [ -n "$WARC_MANIFEST" ]; then
     python indexwarcsjob.py \
-           --cdx_bucket=$WARC_CDX_BUCKET \
+       --cdx_bucket=$WARC_CDX_BUCKET \
        --no-output \
        --cleanup NONE \
        --skip-existing \
@@ -81,6 +86,12 @@ else
     #    anyway, it may require 60 GB of local disk space on the reducer node
     # mapreduce.map.memory.mb=640
     #    mappers read only small cdx files: minimal memory requirements
+    # mapreduce.output.fileoutputformat.compress=false
+    #    must not compress output, even if this is the default, because it may not
+    #    be readable from Python via seqfileutils.py. Alternatively, compress
+    #    and decompress the data explicitely.
+    test -e splits.txt && rm splits.txt
+    test -e splits.seq && rm splits.seq
     python dosample.py \
            --verbose \
            --shards=300 \
@@ -92,7 +103,21 @@ else
            --jobconf "mapreduce.reduce.memory.mb=1024" \
            --jobconf "mapreduce.reduce.java.opts=-Xmx512m" \
            --jobconf "mapreduce.map.output.compress=true" \
+           --jobconf "mapreduce.output.fileoutputformat.compress=false \
            -r hadoop $WARC_CDX
+
+	# in case, the sequence file wasn't written:
+	# 1. verify the content
+	#      less splits.txt
+	#    or (in case it's compressed)
+	#      hadoop fs -text file:$PWD/splits.txt >splits.tmp
+	#      less splits.tmp
+	# 2. convert splits.txt (or the decompressed splits.tmp) into a sequence file
+	#      python seqfileutils.py --copyfrom splits.txt splits.seq
+	#      python seqfileutils.py --copyfrom splits.tmp splits.seq
+	# 3. verify the sequence file
+	#      hadoop fs -text file:$PWD/splits.seq | less
+
     mv splits.seq $(basename s3${SPLIT_FILE#s3a})
 
     if s3cmd info s3${SPLIT_FILE#s3a}; then

From 1cf422db51bebd7c0e73155804490030b37e8fb2 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Fri, 3 Mar 2017 16:39:27 +0100
Subject: [PATCH 05/66] fix syntax error

---
 run_index_hadoop.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 2e56603..1f11a06 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -103,7 +103,7 @@ else
            --jobconf "mapreduce.reduce.memory.mb=1024" \
            --jobconf "mapreduce.reduce.java.opts=-Xmx512m" \
            --jobconf "mapreduce.map.output.compress=true" \
-           --jobconf "mapreduce.output.fileoutputformat.compress=false \
+           --jobconf "mapreduce.output.fileoutputformat.compress=false" \
            -r hadoop $WARC_CDX
 
 	# in case, the sequence file wasn't written:

From 377d72c05d27727c2ce91244880c9f61378e6568 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 3 Jul 2017 16:45:45 +0200
Subject: [PATCH 06/66] Improved script to publish index - allow publishing
 without interaction - remove intermediate files

---
 publish_index.sh | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/publish_index.sh b/publish_index.sh
index 4c165d2..fffb962 100755
--- a/publish_index.sh
+++ b/publish_index.sh
@@ -1,10 +1,11 @@
 #!/bin/bash
 
 YEARWEEK="$1"
+MONTH="$2"
 
 if [ -z "$YEARWEEK" ]; then
-    echo "$0 <year-week-of-crawl-archive>"
-    exit 1
+	echo "$0 <year-week-of-crawl-archive> [<month>]"
+	exit 1
 fi
 
 set -x
@@ -13,9 +14,14 @@ set -e
 
 ## Create the metadata (title) for the index on the website
 if ! [ -e $YEARWEEK-metadata.yaml ]; then
-    s3cmd get s3://commoncrawl/cc-index/collections/CC-MAIN-2015-18/metadata.yaml $YEARWEEK-metadata.yaml
-    echo "Please, edit $YEARWEEK-metadata.yaml"
-    exit 1
+	if [ -n "$MONTH" ]; then
+		YEAR=${YEARWEEK%%-*}
+		echo "title: '${MONTH^} $YEAR Index'" >$YEARWEEK-metadata.yaml
+	else
+		s3cmd get s3://commoncrawl/cc-index/collections/CC-MAIN-2015-18/metadata.yaml $YEARWEEK-metadata.yaml
+		echo "Please, edit $YEARWEEK-metadata.yaml"
+		exit 1
+	fi
 fi
 s3cmd put $YEARWEEK-metadata.yaml s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/metadata.yaml --acl-public
 
@@ -27,12 +33,17 @@ s3cmd get --force s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexe
 cat part-* | awk '{printf "%s\t%s\n",$0,NR}' > cluster.idx
 export LC_ALL=C
 sort -c ./cluster.idx
-rm ./part-00*
+#rm ./part-00*
 s3cmd put ./cluster.idx s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx --acl-public
 
 ## set public permissions where needed (technically only the cdx-* need to be public)
-s3cmd setacl s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --acl-private --recursive --exclude='*' --include='part-*' --include='_SUCCESS'
 s3cmd setacl s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --acl-public --recursive --exclude='*' --include='cdx-*.gz' --include='cluster.idx' --include='metadata.yaml'
 
+# remove obsolete data from bucket
+#  - map-reduce _SUCCESS file/marker
+aws s3 rm s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/_SUCCESS
+#  - part-00* files concatenated to cluster.idx
+aws s3 rm --recursive s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --exclude "*" --include "part-00*"
+
 # make *.cdx.gz files (to be deleted later) private
-s3cmd setacl s3://commoncrawl/cc-index/CC-MAIN-$YEARWEEK/segments/ --acl-private --recursive
+#s3cmd setacl s3://commoncrawl/cc-index/CC-MAIN-$YEARWEEK/segments/ --acl-private --recursive

From ce88b5586f2040801665467cb79e1587f0819c71 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Thu, 28 Sep 2017 15:24:18 +0200
Subject: [PATCH 07/66] Increase task memory (use default) for sample jobs

---
 run_index_hadoop.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 1f11a06..da8859c 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -79,6 +79,7 @@ fi
 
 
 if [ -n "$REUSE_SPLIT_FILE" ]; then
+    echo "Reusing SPLIT_FILE $REUSE_SPLIT_FILE"
     SPLIT_FILE="$REUSE_SPLIT_FILE"
 else
     # mapreduce.map.output.compress=true
@@ -86,6 +87,10 @@ else
     #    anyway, it may require 60 GB of local disk space on the reducer node
     # mapreduce.map.memory.mb=640
     #    mappers read only small cdx files: minimal memory requirements
+    # mapreduce.reduce.memory.mb (use default)
+    #    reducer needs enough memory to hold all data during the shuffle phase
+    #      --jobconf "mapreduce.reduce.memory.mb=2730" \
+    #      --jobconf "mapreduce.reduce.java.opts=-Xmx2252m" \
     # mapreduce.output.fileoutputformat.compress=false
     #    must not compress output, even if this is the default, because it may not
     #    be readable from Python via seqfileutils.py. Alternatively, compress
@@ -100,8 +105,6 @@ else
            --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
            --jobconf "mapreduce.map.memory.mb=640" \
            --jobconf "mapreduce.map.java.opts=-Xmx512m" \
-           --jobconf "mapreduce.reduce.memory.mb=1024" \
-           --jobconf "mapreduce.reduce.java.opts=-Xmx512m" \
            --jobconf "mapreduce.map.output.compress=true" \
            --jobconf "mapreduce.output.fileoutputformat.compress=false" \
            -r hadoop $WARC_CDX

From 7a76abb40df560e335481e49102f759ac45eb924 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 9 Jan 2018 10:25:03 +0100
Subject: [PATCH 08/66] Upgrade mrjob option processing (from optparse to
 argparse), fixes #4

---
 indexwarcsjob.py    | 18 +++++++++---------
 samplecdxjob.py     | 30 +++++++++++++++---------------
 zipnumclusterjob.py | 36 ++++++++++++++++++------------------
 3 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index 7b7d71f..8379863 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -36,17 +36,17 @@ def configure_options(self):
         """Custom command line options for indexing"""
         super(IndexWARCJob, self).configure_options()
 
-        self.add_passthrough_option('--warc_bucket', dest='warc_bucket',
-                                    default='commoncrawl',
-                                    help='source bucket for warc paths, if input is a relative path (S3 Only)')
+        self.add_passthru_arg('--warc_bucket', dest='warc_bucket',
+                              default='commoncrawl',
+                              help='source bucket for warc paths, if input is a relative path (S3 Only)')
 
-        self.add_passthrough_option('--cdx_bucket', dest='cdx_bucket',
-                                    default='my_cdx_bucket',
-                                    help='destination bucket for cdx (S3 Only)')
+        self.add_passthru_arg('--cdx_bucket', dest='cdx_bucket',
+                              default='my_cdx_bucket',
+                              help='destination bucket for cdx (S3 Only)')
 
-        self.add_passthrough_option('--skip-existing', dest='skip_existing', action='store_true',
-                                    help='skip processing files that already have CDX',
-                                    default=True)
+        self.add_passthru_arg('--skip-existing', dest='skip_existing', action='store_true',
+                              help='skip processing files that already have CDX',
+                              default=True)
 
     def mapper_init(self):
         # Note: this assumes that credentials are set via
diff --git a/samplecdxjob.py b/samplecdxjob.py
index 9091ee7..98bc425 100644
--- a/samplecdxjob.py
+++ b/samplecdxjob.py
@@ -34,21 +34,21 @@ def configure_options(self):
         """Custom command line options for indexing"""
         super(SampleCDXJob, self).configure_options()
 
-        self.add_passthrough_option('--shards', dest='shards',
-                                    type=int,
-                                    default=300,
-                                    help='Number of shards in output ' +
-                                         '(create shards-1 splits')
-
-        self.add_passthrough_option('--scaler', dest='scaler',
-                                    type=int,
-                                    default=100,
-                                    help='Scaler for sample size: ' +
-                                         'Sample size = shards * scaler')
-
-        self.add_passthrough_option('--splitfile', dest='splitfile',
-                                    help='Split file output dest, ' +
-                                         'will contain shards-1 splits')
+        self.add_passthru_arg('--shards', dest='shards',
+                              type=int,
+                              default=300,
+                              help='Number of shards in output ' +
+                                   '(create shards-1 splits')
+
+        self.add_passthru_arg('--scaler', dest='scaler',
+                              type=int,
+                              default=100,
+                              help='Scaler for sample size: ' +
+                                   'Sample size = shards * scaler')
+
+        self.add_passthru_arg('--splitfile', dest='splitfile',
+                              help='Split file output dest, ' +
+                                   'will contain shards-1 splits')
 
     def mapper_init(self):
         self.N = self.options.shards * self.options.scaler
diff --git a/zipnumclusterjob.py b/zipnumclusterjob.py
index 8fcfe7b..411b165 100644
--- a/zipnumclusterjob.py
+++ b/zipnumclusterjob.py
@@ -36,24 +36,24 @@ def configure_options(self):
         """Custom command line options for indexing"""
         super(ZipNumClusterJob, self).configure_options()
 
-        self.add_passthrough_option('--numlines', dest='numlines',
-                                    type=int,
-                                    default=3000,
-                                    help='Number of lines per gzipped block')
-
-        self.add_passthrough_option('--splitfile', dest='splitfile',
-                                    help='Split file to use for CDX shard split')
-
-        self.add_passthrough_option('--convert', dest='convert',
-                                    action='store_true',
-                                    default=False,
-                                    help='Convert CDX through _convert_line() function')
-
-        self.add_passthrough_option('--shards', dest='shards',
-                                    type=int,
-                                    help='Num ZipNum Shards to create, ' +
-                                         '= num of entries in splits + 1' +
-                                         '= num of reducers used')
+        self.add_passthru_arg('--numlines', dest='numlines',
+                              type=int,
+                              default=3000,
+                              help='Number of lines per gzipped block')
+
+        self.add_passthru_arg('--splitfile', dest='splitfile',
+                              help='Split file to use for CDX shard split')
+
+        self.add_passthru_arg('--convert', dest='convert',
+                              action='store_true',
+                              default=False,
+                              help='Convert CDX through _convert_line() function')
+
+        self.add_passthru_arg('--shards', dest='shards',
+                              type=int,
+                              help='Num ZipNum Shards to create, ' +
+                                   '= num of entries in splits + 1' +
+                                   '= num of reducers used')
 
     def jobconf(self):
         orig_jobconf = super(ZipNumClusterJob, self).jobconf()

From 4b797ad6ed1fd1ad6ae710510fca4d99712e260b Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 29 Jan 2018 15:06:29 +0100
Subject: [PATCH 09/66] Replace s3cmd by AWS CLI, set public-read ACL when
 creating objects

---
 publish_index.sh    | 26 ++++++++++++++++++--------
 run_index_hadoop.sh |  5 +++--
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/publish_index.sh b/publish_index.sh
index fffb962..1e62ba6 100755
--- a/publish_index.sh
+++ b/publish_index.sh
@@ -18,26 +18,36 @@ if ! [ -e $YEARWEEK-metadata.yaml ]; then
 		YEAR=${YEARWEEK%%-*}
 		echo "title: '${MONTH^} $YEAR Index'" >$YEARWEEK-metadata.yaml
 	else
-		s3cmd get s3://commoncrawl/cc-index/collections/CC-MAIN-2015-18/metadata.yaml $YEARWEEK-metadata.yaml
+		aws s3 cp s3://commoncrawl/cc-index/collections/CC-MAIN-2015-18/metadata.yaml $YEARWEEK-metadata.yaml
 		echo "Please, edit $YEARWEEK-metadata.yaml"
 		exit 1
 	fi
 fi
-s3cmd put $YEARWEEK-metadata.yaml s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/metadata.yaml --acl-public
+aws s3 cp $YEARWEEK-metadata.yaml s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/metadata.yaml --acl public-read
 
 test -d cdx-$YEARWEEK || mkdir cdx-$YEARWEEK
 cd cdx-$YEARWEEK
 
 ## create cluster index
-s3cmd get --force s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/part-*
-cat part-* | awk '{printf "%s\t%s\n",$0,NR}' > cluster.idx
-export LC_ALL=C
-sort -c ./cluster.idx
+aws s3 cp --recursive --exclude '*' --include 'part-*' s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ ./
+cat part-* | awk '{printf "%s\t%s\n",$0,NR}' >cluster.idx
+LC_ALL=C sort -c ./cluster.idx
 #rm ./part-00*
-s3cmd put ./cluster.idx s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx --acl-public
+aws s3 cp ./cluster.idx s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx --acl public-read
 
 ## set public permissions where needed (technically only the cdx-* need to be public)
-s3cmd setacl s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --acl-public --recursive --exclude='*' --include='cdx-*.gz' --include='cluster.idx' --include='metadata.yaml'
+## this should be already done, if not run:
+#     s3cmd setacl s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --acl-public --recursive --exclude='*' --include='cdx-*.gz' --include='cluster.idx' --include='metadata.yaml'
+## or:
+# aws s3 cp \
+#     --exclude='*' \
+#     --include='cdx-*.gz' \
+#     --include='cluster.idx' \
+#     --include='metadata.yaml' \
+#     --recursive \
+#     s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ \
+#     s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ \
+#     --acl public-read
 
 # remove obsolete data from bucket
 #  - map-reduce _SUCCESS file/marker
diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index da8859c..7e50844 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -123,11 +123,11 @@ else
 
     mv splits.seq $(basename s3${SPLIT_FILE#s3a})
 
-    if s3cmd info s3${SPLIT_FILE#s3a}; then
+    if aws s3 ls s3${SPLIT_FILE#s3a}; then
         echo "Ok, split file was upload"
     else
         echo "Uploading split file ..."
-        s3cmd put $(basename s3${SPLIT_FILE#s3a}) s3${SPLIT_FILE#s3a}
+        aws s3 cp $(basename s3${SPLIT_FILE#s3a}) s3${SPLIT_FILE#s3a}
     fi
 fi
 
@@ -143,5 +143,6 @@ python zipnumclusterjob.py \
        --jobconf "mapreduce.map.java.opts=-Xmx512m" \
        --jobconf "mapreduce.reduce.memory.mb=1536" \
        --jobconf "mapreduce.reduce.java.opts=-Xmx1024m" \
+       --jobconf "fs.s3a.acl.default=PublicRead" \
        -r hadoop $WARC_CDX
 

From fa2fc8b5ccc2eed8c18a1b970a7655217946652d Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 9 Jan 2018 10:25:03 +0100
Subject: [PATCH 10/66] Upgrade to use boto3: - transition of index warcs (step
 1) - more logging - log with timestamps (to address performance issues) -
 NOTE: testing on Hadoop showed that         - S3 access is robust (no errors)
         - but some tasks are very slow fetching large WARC files

---
 indexwarcsjob.py    | 87 +++++++++++++++++++++++++++++++++------------
 run_index_hadoop.sh | 10 +++++-
 2 files changed, 74 insertions(+), 23 deletions(-)

diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index 8379863..9a039c4 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -1,6 +1,10 @@
-import boto
 import sys
 
+from datetime import datetime
+
+import boto3
+import botocore
+
 from mrjob.job import MRJob
 from mrjob.protocol import RawValueProtocol
 
@@ -48,16 +52,31 @@ def configure_options(self):
                               help='skip processing files that already have CDX',
                               default=True)
 
+        self.add_passthru_arg("--s3_local_temp_dir", dest='s3_local_temp_dir',
+                              help='Local temporary directory to buffer content from S3',
+                              default=None)
+
     def mapper_init(self):
         # Note: this assumes that credentials are set via
         # AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env variables
-        self.conn = boto.connect_s3()
+        self.boto_config = botocore.client.Config(
+            read_timeout=180,
+            retries={'max_attempts' : 20})
+        s3client = boto3.client('s3', config=self.boto_config)
 
-        self.warc_bucket = self.conn.lookup(self.options.warc_bucket)
-        assert(self.warc_bucket)
+        try:
+            s3client.head_bucket(Bucket=self.options.warc_bucket)
+        except botocore.exceptions.ClientError as e:
+            self._log('Failed to access bucket {}: {}'.format(
+                self.options.warc_bucket, e), 'ERROR')
+            return
 
-        self.cdx_bucket = self.conn.lookup(self.options.cdx_bucket)
-        assert(self.cdx_bucket)
+        try:
+            s3client.head_bucket(Bucket=self.options.cdx_bucket)
+        except botocore.exceptions.ClientError as e:
+            self._log('Failed to access bucket {}: {}'.format(
+                self.options.cdx_bucket, e), 'ERROR')
+            return
 
         self.index_options = {
             'surt_ordered': True,
@@ -71,9 +90,13 @@ def mapper(self, _, line):
         try:
             self._load_and_index(warc_path)
         except Exception as exc:
-            sys.stderr.write(warc_path + '\n')
+            self._log('Failed to index ' + warc_path, 'ERROR')
             raise
 
+    def _log(self, msg, level='INFO'):
+        dt = datetime.now()
+        sys.stderr.write('{:%y/%m/%d %H:%M:%S} {} {}\n'.format(dt, level, msg))
+
     def _conv_warc_to_cdx_path(self, warc_path):
         # set cdx path
         cdx_path = warc_path.replace('crawl-data', 'cc-index/cdx')
@@ -81,36 +104,56 @@ def _conv_warc_to_cdx_path(self, warc_path):
         return cdx_path
 
     def _load_and_index(self, warc_path):
-        warckey = self.warc_bucket.get_key(warc_path)
-
-        if warckey is None:
-            sys.stderr.write('WARC not found: {}\n'.format(warc_path))
-            return
 
         cdx_path = self._conv_warc_to_cdx_path(warc_path)
 
-        if self.options.skip_existing:
-            cdxkey = self.cdx_bucket.get_key(cdx_path)
+        self._log('Indexing WARC: ' + warc_path)
+        s3client = boto3.client('s3', config=self.boto_config)
 
-            if cdxkey:
-                sys.stderr.write('Already Exists: {}\n'.format(cdx_path))
+        if self.options.skip_existing:
+            try:
+                s3client.head_object(Bucket=self.options.cdx_bucket,
+                                          Key=cdx_path)
+                self._log('Already Exists: ' + cdx_path)
                 return
+            except botocore.client.ClientError as exception:
+                pass # ok, not found
+
+        try:
+            s3client.head_object(Bucket=self.options.warc_bucket,
+                                      Key=warc_path)
+        except botocore.client.ClientError as exception:
+            self._log('WARC not found: ' + warc_path, 'ERROR')
+            return
 
-        with TemporaryFile(mode='w+b') as warctemp:
-            warckey.get_file(warctemp, override_num_retries=10)
+        with TemporaryFile(mode='w+b',
+                           dir=self.options.s3_local_temp_dir) as warctemp:
+            self._log('Fetching WARC: ' + warc_path)
+            try:
+                s3client.download_fileobj(self.options.warc_bucket, warc_path, warctemp)
+            except botocore.client.ClientError as exception:
+                self._log('Failed to download {}: {}'.format(warc_path, exception), 'ERROR')
+                return
 
             warctemp.seek(0)
+            self._log('Successfully fetched WARC: ' + warc_path)
 
-            with TemporaryFile(mode='w+b') as cdxtemp:
+            with TemporaryFile(mode='w+b',
+                               dir=self.options.s3_local_temp_dir) as cdxtemp:
                 with GzipFile(fileobj=cdxtemp, mode='w+b') as cdxfile:
                     # Index to temp
                     write_cdx_index(cdxfile, warctemp, warc_path, **self.index_options)
 
                 # Upload temp
-                cdxkey = self.cdx_bucket.new_key(cdx_path)
                 cdxtemp.flush()
-
-                cdxkey.set_contents_from_file(cdxtemp, rewind=True)
+                cdxtemp.seek(0)
+                self._log('Uploading CDX: ' + cdx_path)
+                try:
+                    s3client.upload_fileobj(cdxtemp, self.options.cdx_bucket, cdx_path)
+                except botocore.client.ClientError as exception:
+                    self._log('Failed to upload {}: {}'.format(cdx_path, exception), 'ERROR')
+                    return
+                self._log('Successfully uploaded CDX: ' + cdx_path)
 
 
 if __name__ == "__main__":
diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 7e50844..d374c39 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -56,6 +56,13 @@ export ZIPNUM_CLUSTER_DIR="s3a://commoncrawl/cc-index/collections/CC-MAIN-$YEARW
 # SPLIT_FILE could be reused from previous crawl with similar distribution of URLs, see REUSE_SPLIT_FILE
 export SPLIT_FILE="s3a://cc-cdx-index/${YEARWEEK}_splits.seq"
 
+# configure S3 buffer directory
+if [ -n "$S3_LOCAL_TEMP_DIR" ]; then
+	S3_LOCAL_TEMP_DIR="--s3_local_temp_dir=$S3_LOCAL_TEMP_DIR"
+else
+	S3_LOCAL_TEMP_DIR=""
+fi
+
 
 export LC_ALL=C
 
@@ -74,6 +81,7 @@ if [ -n "$WARC_MANIFEST" ]; then
        -r hadoop \
        --jobconf "mapreduce.map.memory.mb=800" \
        --jobconf "mapreduce.map.java.opts=-Xmx512m" \
+       $S3_LOCAL_TEMP_DIR \
        $WARC_MANIFEST
 fi
 
@@ -94,7 +102,7 @@ else
     # mapreduce.output.fileoutputformat.compress=false
     #    must not compress output, even if this is the default, because it may not
     #    be readable from Python via seqfileutils.py. Alternatively, compress
-    #    and decompress the data explicitely.
+    #    and decompress the data explicitly.
     test -e splits.txt && rm splits.txt
     test -e splits.seq && rm splits.seq
     python dosample.py \

From 56a03b8d9fea5ffb378fc26fffdbfaf5b00bce93 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Fri, 23 Feb 2018 11:53:29 +0100
Subject: [PATCH 11/66] Upgrade to use boto3: - upgrade zipnumcluster job (step
 3) - improved logging configuration - upgrade requirements/bootstrap

---
 bootstrap.sh        |  9 ++-------
 indexwarcsjob.py    | 39 +++++++++++++++++++++------------------
 requirements.txt    |  2 +-
 zipnumclusterjob.py | 33 ++++++++++++++++++++++++---------
 4 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/bootstrap.sh b/bootstrap.sh
index 91929a9..544c29c 100644
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,10 +1,5 @@
 #!/bin/bash
 
 # bootstrap commands
-sudo yum install -y python27 python27-devel gcc-c++ git libffi-devel
-sudo curl -o /tmp/get-pip.py https://bootstrap.pypa.io/get-pip.py
-sudo python2.7 /tmp/get-pip.py
-# http://superuser.com/questions/762185/python2-7-pip2-7-install-in-centos6-root-does-not-see-usr-local-bin
-sudo /usr/local/bin/pip2.7 install boto mrjob simplejson
-sudo /usr/local/bin/pip2.7 install pywb
-#sudo pip2.7 install -e "git+git://github.com/ikreymer/pywb.git@develop#egg=pywb-0.9.7-dev"
+sudo yum install -y python27 python27-devel python27-pip gcc-c++ git libffi-devel
+sudo pip2.7 install boto3 mrjob simplejson pywb
diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index 9a039c4..0e932b9 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -1,3 +1,4 @@
+import logging
 import sys
 
 from datetime import datetime
@@ -7,12 +8,18 @@
 
 from mrjob.job import MRJob
 from mrjob.protocol import RawValueProtocol
+from mrjob.util import log_to_stream
 
 from tempfile import TemporaryFile
 from pywb.warc.cdxindexer import write_cdx_index
 from gzip import GzipFile
 
 
+LOG = logging.getLogger('IndexWARCJob')
+log_to_stream(format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+              name='IndexWARCJob')
+
+
 #=============================================================================
 class IndexWARCJob(MRJob):
     """ This job receives as input a manifest of WARC/ARC files and produces
@@ -67,15 +74,15 @@ def mapper_init(self):
         try:
             s3client.head_bucket(Bucket=self.options.warc_bucket)
         except botocore.exceptions.ClientError as e:
-            self._log('Failed to access bucket {}: {}'.format(
-                self.options.warc_bucket, e), 'ERROR')
+            LOG.error('Failed to access bucket {}: {}'.format(
+                self.options.warc_bucket, e))
             return
 
         try:
             s3client.head_bucket(Bucket=self.options.cdx_bucket)
         except botocore.exceptions.ClientError as e:
-            self._log('Failed to access bucket {}: {}'.format(
-                self.options.cdx_bucket, e), 'ERROR')
+            LOG.error('Failed to access bucket {}: {}'.format(
+                self.options.cdx_bucket, e))
             return
 
         self.index_options = {
@@ -90,13 +97,9 @@ def mapper(self, _, line):
         try:
             self._load_and_index(warc_path)
         except Exception as exc:
-            self._log('Failed to index ' + warc_path, 'ERROR')
+            LOG.error('Failed to index ' + warc_path)
             raise
 
-    def _log(self, msg, level='INFO'):
-        dt = datetime.now()
-        sys.stderr.write('{:%y/%m/%d %H:%M:%S} {} {}\n'.format(dt, level, msg))
-
     def _conv_warc_to_cdx_path(self, warc_path):
         # set cdx path
         cdx_path = warc_path.replace('crawl-data', 'cc-index/cdx')
@@ -107,14 +110,14 @@ def _load_and_index(self, warc_path):
 
         cdx_path = self._conv_warc_to_cdx_path(warc_path)
 
-        self._log('Indexing WARC: ' + warc_path)
+        LOG.info('Indexing WARC: ' + warc_path)
         s3client = boto3.client('s3', config=self.boto_config)
 
         if self.options.skip_existing:
             try:
                 s3client.head_object(Bucket=self.options.cdx_bucket,
                                           Key=cdx_path)
-                self._log('Already Exists: ' + cdx_path)
+                LOG.info('Already Exists: ' + cdx_path)
                 return
             except botocore.client.ClientError as exception:
                 pass # ok, not found
@@ -123,20 +126,20 @@ def _load_and_index(self, warc_path):
             s3client.head_object(Bucket=self.options.warc_bucket,
                                       Key=warc_path)
         except botocore.client.ClientError as exception:
-            self._log('WARC not found: ' + warc_path, 'ERROR')
+            LOG.error('WARC not found: ' + warc_path)
             return
 
         with TemporaryFile(mode='w+b',
                            dir=self.options.s3_local_temp_dir) as warctemp:
-            self._log('Fetching WARC: ' + warc_path)
+            LOG.info('Fetching WARC: ' + warc_path)
             try:
                 s3client.download_fileobj(self.options.warc_bucket, warc_path, warctemp)
             except botocore.client.ClientError as exception:
-                self._log('Failed to download {}: {}'.format(warc_path, exception), 'ERROR')
+                LOG.error('Failed to download {}: {}'.format(warc_path, exception))
                 return
 
             warctemp.seek(0)
-            self._log('Successfully fetched WARC: ' + warc_path)
+            LOG.info('Successfully fetched WARC: ' + warc_path)
 
             with TemporaryFile(mode='w+b',
                                dir=self.options.s3_local_temp_dir) as cdxtemp:
@@ -147,13 +150,13 @@ def _load_and_index(self, warc_path):
                 # Upload temp
                 cdxtemp.flush()
                 cdxtemp.seek(0)
-                self._log('Uploading CDX: ' + cdx_path)
+                LOG.info('Uploading CDX: ' + cdx_path)
                 try:
                     s3client.upload_fileobj(cdxtemp, self.options.cdx_bucket, cdx_path)
                 except botocore.client.ClientError as exception:
-                    self._log('Failed to upload {}: {}'.format(cdx_path, exception), 'ERROR')
+                    LOG.error('Failed to upload {}: {}'.format(cdx_path, exception))
                     return
-                self._log('Successfully uploaded CDX: ' + cdx_path)
+                LOG.info('Successfully uploaded CDX: ' + cdx_path)
 
 
 if __name__ == "__main__":
diff --git a/requirements.txt b/requirements.txt
index 84f7ad7..b2a3750 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 mrjob
-boto
+boto3
 pywb
 #-e git+https://github.com/matteobertozzi/Hadoop.git#egg=hadoop&subdirectory=python-hadoop
 -e git+https://github.com/commoncrawl/python-hadoop.git#egg=master
diff --git a/zipnumclusterjob.py b/zipnumclusterjob.py
index 411b165..ede45d5 100644
--- a/zipnumclusterjob.py
+++ b/zipnumclusterjob.py
@@ -1,3 +1,4 @@
+import logging
 import shutil
 import sys
 import os
@@ -11,6 +12,11 @@
 from mrjob.job import MRJob
 from mrjob.conf import combine_dicts
 from mrjob.protocol import RawProtocol, RawValueProtocol
+from mrjob.util import log_to_stream
+
+
+LOG = logging.getLogger('ZipNumClusterJob')
+log_to_stream(format="%(asctime)s %(levelname)s %(name)s: %(message)s",name='ZipNumClusterJob')
 
 
 #=============================================================================
@@ -125,21 +131,30 @@ def reducer_final(self):
 
     def _do_upload(self):
         self.gzip_temp.flush()
+        self.gzip_temp.seek(0)
         #TODO: move to generalized put() function
         if self.output_dir.startswith('s3://') or self.output_dir.startswith('s3a://'):
-            import boto
-            conn = boto.connect_s3()
-            parts = urlparse.urlsplit(self.output_dir)
+            import boto3
+            import botocore
+            boto_config = botocore.client.Config(
+                read_timeout=180,
+                retries={'max_attempts' : 20})
+            s3client = boto3.client('s3', config=boto_config)
 
-            bucket = conn.lookup(parts.netloc)
-
-            cdxkey = bucket.new_key(parts.path + '/' + self.part_name)
-            cdxkey.set_contents_from_file(self.gzip_temp, rewind=True)
+            parts = urlparse.urlsplit(self.output_dir)
+            s3key = parts.path.strip('/') + '/' + self.part_name
+            s3url = parts.scheme + '://' + parts.netloc + '/' + s3key
+
+            LOG.info('Uploading index to ' + s3url)
+            try:
+                s3client.upload_fileobj(self.gzip_temp, parts.netloc, s3key)
+            except botocore.client.ClientError as exception:
+                LOG.error('Failed to upload {}: {}'.format(s3url, exception))
+                return
+            LOG.info('Successfully uploaded index file: ' + s3url)
         else:
             path = os.path.join(self.output_dir, self.part_name)
 
-            self.gzip_temp.seek(0)
-
             with open(path, 'w+b') as target:
                 shutil.copyfileobj(self.gzip_temp, target)
 

From bcd78be7e0d923af9bf12759682c731fcf22711a Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Fri, 23 Feb 2018 13:40:36 +0100
Subject: [PATCH 12/66] Upgrade for use with PyWB 2.0

---
 README.md        | 2 +-
 indexwarcsjob.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 6b11366..55ad27b 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@ and output directory of `/cdx/`, the following will be created:
 
 This is a map only job, and a single mapper is created per input file by default.
 
-The `pywb.warc.cdxindexer.write_cdx_index`, the same used by the pywb `cdx-indexer` app is used to create the index.
+The `pywb.indexer.cdxindexer.write_cdx_index`, the same used by the pywb `cdx-indexer` app is used to create the index.
 Refer to `cdx-indexer -h` for list of possible options.
 
 ### Sampling CDXs to Create Split File ###
diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index 0e932b9..e8df4f5 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -11,7 +11,7 @@
 from mrjob.util import log_to_stream
 
 from tempfile import TemporaryFile
-from pywb.warc.cdxindexer import write_cdx_index
+from pywb.indexer.cdxindexer import write_cdx_index
 from gzip import GzipFile
 
 
@@ -25,7 +25,7 @@ class IndexWARCJob(MRJob):
     """ This job receives as input a manifest of WARC/ARC files and produces
     a CDX index per file
 
-    The pywb.warc.cdxindexer is used to create the index, with a fixed set of options
+    The pywb.indexer.cdxindexer is used to create the index, with a fixed set of options
     TODO: add way to customized indexing options.
 
     """

From 5a995734342d6b23ee8ba37cc274559993d9bfc9 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Wed, 2 May 2018 14:40:30 +0200
Subject: [PATCH 13/66] - make adjusting S3 access control / permissions mandatory -
 temporary CDX files are moved away from s3://commoncrawl/ bucket

---
 publish_index.sh    | 23 ++++++++++++++---------
 run_index_hadoop.sh | 10 +++++-----
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/publish_index.sh b/publish_index.sh
index 1e62ba6..f5b9787 100755
--- a/publish_index.sh
+++ b/publish_index.sh
@@ -35,9 +35,20 @@ LC_ALL=C sort -c ./cluster.idx
 #rm ./part-00*
 aws s3 cp ./cluster.idx s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx --acl public-read
 
+# remove obsolete data from bucket
+#  - map-reduce _SUCCESS file/marker
+aws s3 rm s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/_SUCCESS
+#  - part-00* files concatenated to cluster.idx
+aws s3 rm --recursive s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --exclude "*" --include "part-00*"
+
+## TODO:
+##   check why setting public-read permissions via
+##     --jobconf "fs.s3a.acl.default=PublicRead"
+##   does not work?
+##
 ## set public permissions where needed (technically only the cdx-* need to be public)
 ## this should be already done, if not run:
-#     s3cmd setacl s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --acl-public --recursive --exclude='*' --include='cdx-*.gz' --include='cluster.idx' --include='metadata.yaml'
+s3cmd setacl s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/ --acl-public --recursive --exclude='*' --include='cdx-*.gz' --include='cluster.idx' --include='metadata.yaml'
 ## or:
 # aws s3 cp \
 #     --exclude='*' \
@@ -45,15 +56,9 @@ aws s3 cp ./cluster.idx s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/
 #     --include='cluster.idx' \
 #     --include='metadata.yaml' \
 #     --recursive \
-#     s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ \
-#     s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ \
+#     s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/ \
+#     s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/ \
 #     --acl public-read
 
-# remove obsolete data from bucket
-#  - map-reduce _SUCCESS file/marker
-aws s3 rm s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/_SUCCESS
-#  - part-00* files concatenated to cluster.idx
-aws s3 rm --recursive s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --exclude "*" --include "part-00*"
-
 # make *.cdx.gz files (to be deleted later) private
 #s3cmd setacl s3://commoncrawl/cc-index/CC-MAIN-$YEARWEEK/segments/ --acl-private --recursive
diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index d374c39..38a1279 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -45,7 +45,7 @@ echo
 
 # glob pattern to match all CDX files generated in step 1 (indexwarcsjob.py)
 # (filesystem protocol must be supported by the used Hadoop version)
-export WARC_CDX="s3a://commoncrawl/cc-index/cdx/CC-MAIN-$YEARWEEK/segments/*/*/*.cdx.gz"
+export WARC_CDX="s3a://commoncrawl-index-temp/CC-MAIN-$YEARWEEK/cdx/segments/*/*/*.cdx.gz"
 
 # AWS S3 bucket to hold CDX files
 export WARC_CDX_BUCKET="commoncrawl"
@@ -54,7 +54,7 @@ export WARC_CDX_BUCKET="commoncrawl"
 export ZIPNUM_CLUSTER_DIR="s3a://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/"
 
 # SPLIT_FILE could be reused from previous crawl with similar distribution of URLs, see REUSE_SPLIT_FILE
-export SPLIT_FILE="s3a://cc-cdx-index/${YEARWEEK}_splits.seq"
+export SPLIT_FILE="s3a://commoncrawl-index-temp/CC-MAIN-${YEARWEEK}/splits.seq"
 
 # configure S3 buffer directory
 if [ -n "$S3_LOCAL_TEMP_DIR" ]; then
@@ -129,13 +129,13 @@ else
 	# 3. verify the sequence file
 	#      hadoop fs -text file:$PWD/splits.seq | less
 
-    mv splits.seq $(basename s3${SPLIT_FILE#s3a})
+    mv splits.seq CC-MAIN-${YEARWEEK}-splits.seq
 
     if aws s3 ls s3${SPLIT_FILE#s3a}; then
-        echo "Ok, split file was upload"
+        echo "Ok, split file has been upload"
     else
         echo "Uploading split file ..."
-        aws s3 cp $(basename s3${SPLIT_FILE#s3a}) s3${SPLIT_FILE#s3a}
+        aws s3 cp CC-MAIN-${YEARWEEK}-splits.seq s3${SPLIT_FILE#s3a}
     fi
 fi
 

From a0431b3c3fadae6007b8359a65a6c0646de33835 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Wed, 16 Jan 2019 18:16:35 +0100
Subject: [PATCH 14/66] Improve script to create index on Hadoop

---
 run_index_hadoop.sh | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 38a1279..008daf4 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -11,11 +11,14 @@ Create a Common Crawl index for a monthly crawl. All steps are run on Hadoop.
                              s3://commoncrawl/cc-index/collections/CC-MAIN-2016-44/...
 
   <path-to-warc-file-list>  list of WARC file objects to be indexed, e.g, the WARC list
-                               s3://commoncrawl/crawl-data/CC-MAIN-2016-44/warc.paths.gz
+                               s3://commoncrawl/crawl-data/CC-MAIN-2016-44/warc.paths
                          or any subset or union of multiple WARC lists (incl. robots.txt WARCs).
                          Paths in the list must be keys/objects in the Common Crawl bucket.
                          The path to the list must be a valid and complete HDFS or S3A URL,
                          e.g. hdfs://hdfs-master.example.com/user/hadoop-user/CC-MAIN-2016-44.paths
+                         The list must not be compressed to allow that the paths list is split into
+                         multiple tasks (see mapreduce.input.lineinputformat.linespermap).
+
                          The "index warcs" step is skipped if an empty string is passed as argument.
 
   <split_file>           Optional split file to be reused from previous crawl with similar distribution of URLs.
@@ -38,23 +41,25 @@ YEARWEEK="$1"
 WARC_MANIFEST="$2"
 REUSE_SPLIT_FILE="$3"
 
-echo "Generating cc-index for $YEARWEEK"
+CRAWL="CC-MAIN-$YEARWEEK"
+
+echo "Generating cc-index for $CRAWL"
 echo
 echo WARC_MANIFEST="$WARC_MANIFEST"
 echo
 
-# glob pattern to match all CDX files generated in step 1 (indexwarcsjob.py)
-# (filesystem protocol must be supported by the used Hadoop version)
-export WARC_CDX="s3a://commoncrawl-index-temp/CC-MAIN-$YEARWEEK/cdx/segments/*/*/*.cdx.gz"
+# final path to index files
+export ZIPNUM_CLUSTER_DIR="s3a://commoncrawl/cc-index/collections/$CRAWL/indexes/"
 
 # AWS S3 bucket to hold CDX files
-export WARC_CDX_BUCKET="commoncrawl"
+export WARC_CDX_BUCKET="commoncrawl-index-temp"
 
-# path to index files
-export ZIPNUM_CLUSTER_DIR="s3a://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/"
+# glob pattern to match all CDX files generated in step 1 (indexwarcsjob.py)
+# (filesystem protocol must be supported by the used Hadoop version)
+export WARC_CDX="s3a://$WARC_CDX_BUCKET/$CRAWL/cdx/segments/*/*/*.cdx.gz"
 
 # SPLIT_FILE could be reused from previous crawl with similar distribution of URLs, see REUSE_SPLIT_FILE
-export SPLIT_FILE="s3a://commoncrawl-index-temp/CC-MAIN-${YEARWEEK}/splits.seq"
+export SPLIT_FILE="s3a://$WARC_CDX_BUCKET/$CRAWL/splits.seq"
 
 # configure S3 buffer directory
 if [ -n "$S3_LOCAL_TEMP_DIR" ]; then
@@ -129,13 +134,13 @@ else
 	# 3. verify the sequence file
 	#      hadoop fs -text file:$PWD/splits.seq | less
 
-    mv splits.seq CC-MAIN-${YEARWEEK}-splits.seq
+    mv splits.seq ${CRAWL}-splits.seq
 
     if aws s3 ls s3${SPLIT_FILE#s3a}; then
         echo "Ok, split file has been upload"
     else
         echo "Uploading split file ..."
-        aws s3 cp CC-MAIN-${YEARWEEK}-splits.seq s3${SPLIT_FILE#s3a}
+        aws s3 cp ${CRAWL}-splits.seq s3${SPLIT_FILE#s3a}
     fi
 fi
 

From eef0f5d20dd44fa62aa7ca0272f986a52c14beaf Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Thu, 28 Feb 2019 17:07:51 +0100
Subject: [PATCH 15/66] SampleCDXJob: - fix logging when running on Hadoop -
 use only a subset of segments for sampling

---
 run_index_hadoop.sh | 9 +++++++--
 samplecdxjob.py     | 5 +++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 008daf4..84b9341 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -57,6 +57,11 @@ export WARC_CDX_BUCKET="commoncrawl-index-temp"
 # glob pattern to match all CDX files generated in step 1 (indexwarcsjob.py)
 # (filesystem protocol must be supported by the used Hadoop version)
 export WARC_CDX="s3a://$WARC_CDX_BUCKET/$CRAWL/cdx/segments/*/*/*.cdx.gz"
+export WARC_CDX_SAMPLE="$WARC_CDX"
+# if URLs are randomly distributed over WARC/CDX files,
+# a sample (20-30%) is enough to determine the splits of the final CDX shards
+# (simple approach: take 30% of segments)
+export WARC_CDX_SAMPLE="s3a://$WARC_CDX_BUCKET/$CRAWL/cdx/segments/*[358]/*/*.cdx.gz"
 
 # SPLIT_FILE could be reused from previous crawl with similar distribution of URLs, see REUSE_SPLIT_FILE
 export SPLIT_FILE="s3a://$WARC_CDX_BUCKET/$CRAWL/splits.seq"
@@ -120,7 +125,7 @@ else
            --jobconf "mapreduce.map.java.opts=-Xmx512m" \
            --jobconf "mapreduce.map.output.compress=true" \
            --jobconf "mapreduce.output.fileoutputformat.compress=false" \
-           -r hadoop $WARC_CDX
+           -r hadoop $WARC_CDX_SAMPLE
 
 	# in case, the sequence file wasn't written:
 	# 1. verify the content
@@ -137,7 +142,7 @@ else
     mv splits.seq ${CRAWL}-splits.seq
 
     if aws s3 ls s3${SPLIT_FILE#s3a}; then
-        echo "Ok, split file has been upload"
+        echo "Ok, split file has been uploaded"
     else
         echo "Uploading split file ..."
         aws s3 cp ${CRAWL}-splits.seq s3${SPLIT_FILE#s3a}
diff --git a/samplecdxjob.py b/samplecdxjob.py
index 98bc425..340ef06 100644
--- a/samplecdxjob.py
+++ b/samplecdxjob.py
@@ -3,6 +3,11 @@
 
 from mrjob.job import MRJob
 from mrjob.protocol import RawValueProtocol
+from mrjob.util import log_to_stream
+
+
+LOG = logging.getLogger('SampleCDXJob')
+log_to_stream(format="%(asctime)s %(levelname)s %(name)s: %(message)s",name='SampleCDXJob')
 
 
 #=============================================================================

From 7a3d1bffc49de3393509ed09863277e6fe70ee8c Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 16 Apr 2019 11:25:27 +0200
Subject: [PATCH 16/66] Add missing import

---
 samplecdxjob.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/samplecdxjob.py b/samplecdxjob.py
index 340ef06..121edec 100644
--- a/samplecdxjob.py
+++ b/samplecdxjob.py
@@ -1,3 +1,4 @@
+import logging
 import random
 from heapq import heappush, heapreplace
 

From 1fe10d462bb6220806724fb70705a2e3df1fb435 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 29 Oct 2019 15:44:27 +0100
Subject: [PATCH 17/66] Replace s3cmd by AWS CLI

---
 publish_index.sh | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/publish_index.sh b/publish_index.sh
index f5b9787..7e170d0 100755
--- a/publish_index.sh
+++ b/publish_index.sh
@@ -41,24 +41,16 @@ aws s3 rm s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/_SUCCE
 #  - part-00* files concatenated to cluster.idx
 aws s3 rm --recursive s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --exclude "*" --include "part-00*"
 
+## set public read permissions
+##
 ## TODO:
 ##   check why setting public-read permissions via
 ##     --jobconf "fs.s3a.acl.default=PublicRead"
 ##   does not work?
 ##
-## set public permissions where needed (technically only the cdx-* need to be public)
-## this should be already done, if not run:
-s3cmd setacl s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/ --acl-public --recursive --exclude='*' --include='cdx-*.gz' --include='cluster.idx' --include='metadata.yaml'
-## or:
-# aws s3 cp \
-#     --exclude='*' \
-#     --include='cdx-*.gz' \
-#     --include='cluster.idx' \
-#     --include='metadata.yaml' \
-#     --recursive \
-#     s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/ \
-#     s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/ \
-#     --acl public-read
-
-# make *.cdx.gz files (to be deleted later) private
-#s3cmd setacl s3://commoncrawl/cc-index/CC-MAIN-$YEARWEEK/segments/ --acl-private --recursive
+for i in $(seq 0 299); do
+    aws s3api put-object-acl \
+        --acl public-read \
+        --bucket commoncrawl \
+        --key cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cdx-$(printf "%05i" $i).gz
+done

From 23a8dafbc26a34bffbaf9251333585e511d6415c Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 25 Nov 2019 15:16:41 +0100
Subject: [PATCH 18/66] Set public-read ACL during cdx-*.gz upload

---
 publish_index.sh    | 13 -------------
 run_index_hadoop.sh |  1 +
 zipnumclusterjob.py |  9 ++++++++-
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/publish_index.sh b/publish_index.sh
index 7e170d0..c30f47f 100755
--- a/publish_index.sh
+++ b/publish_index.sh
@@ -41,16 +41,3 @@ aws s3 rm s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/_SUCCE
 #  - part-00* files concatenated to cluster.idx
 aws s3 rm --recursive s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --exclude "*" --include "part-00*"
 
-## set public read permissions
-##
-## TODO:
-##   check why setting public-read permissions via
-##     --jobconf "fs.s3a.acl.default=PublicRead"
-##   does not work?
-##
-for i in $(seq 0 299); do
-    aws s3api put-object-acl \
-        --acl public-read \
-        --bucket commoncrawl \
-        --key cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cdx-$(printf "%05i" $i).gz
-done
diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 84b9341..269277f 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -154,6 +154,7 @@ python zipnumclusterjob.py \
        --shards=300 \
        --splitfile=$SPLIT_FILE \
        --output-dir="$ZIPNUM_CLUSTER_DIR" \
+       --s3-upload-acl="public-read" \
        --no-output \
        --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
        --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
diff --git a/zipnumclusterjob.py b/zipnumclusterjob.py
index ede45d5..a46f617 100644
--- a/zipnumclusterjob.py
+++ b/zipnumclusterjob.py
@@ -61,6 +61,9 @@ def configure_options(self):
                                    '= num of entries in splits + 1' +
                                    '= num of reducers used')
 
+        self.add_passthru_arg('--s3-upload-acl', dest='s3acl',
+                              help='S3 access permissions (ACL) to be applied to CDX files')
+
     def jobconf(self):
         orig_jobconf = super(ZipNumClusterJob, self).jobconf()
         custom_jobconf = {'mapreduce.job.reduces': self.options.shards,
@@ -140,6 +143,9 @@ def _do_upload(self):
                 read_timeout=180,
                 retries={'max_attempts' : 20})
             s3client = boto3.client('s3', config=boto_config)
+            s3args = None
+            if self.options.s3acl:
+                s3args = {'ACL': self.options.s3acl}
 
             parts = urlparse.urlsplit(self.output_dir)
             s3key = parts.path.strip('/') + '/' + self.part_name
@@ -147,7 +153,8 @@ def _do_upload(self):
 
             LOG.info('Uploading index to ' + s3url)
             try:
-                s3client.upload_fileobj(self.gzip_temp, parts.netloc, s3key)
+                s3client.upload_fileobj(self.gzip_temp, parts.netloc, s3key,
+                                        ExtraArgs=s3args)
             except botocore.client.ClientError as exception:
                 LOG.error('Failed to upload {}: {}'.format(s3url, exception))
                 return

From af341bb8c0c7cc1556622beca7cfa6f54eab4696 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 17 Dec 2019 12:29:18 +0100
Subject: [PATCH 19/66] Upgrade to mrjob 0.7.0 - replace removed/renamed
 deprecated methods

---
 dosample.py         | 2 +-
 indexwarcsjob.py    | 4 ++--
 samplecdxjob.py     | 6 +++---
 zipnumclusterjob.py | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/dosample.py b/dosample.py
index 55db3cd..77dcd64 100644
--- a/dosample.py
+++ b/dosample.py
@@ -21,7 +21,7 @@ def run_sample_job():
 
         # dump streaming output to file
         with open(SPL_FILE, 'wb') as fh:
-            for x in runner.stream_output():
+            for x in runner.cat_output():
                 fh.write(x)
             fh.close()
 
diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index e8df4f5..6f782a5 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -43,9 +43,9 @@ class IndexWARCJob(MRJob):
                 'mapreduce.input.lineinputformat.linespermap': 2,
                 }
 
-    def configure_options(self):
+    def configure_args(self):
         """Custom command line options for indexing"""
-        super(IndexWARCJob, self).configure_options()
+        super(IndexWARCJob, self).configure_args()
 
         self.add_passthru_arg('--warc_bucket', dest='warc_bucket',
                               default='commoncrawl',
diff --git a/samplecdxjob.py b/samplecdxjob.py
index 121edec..591ac31 100644
--- a/samplecdxjob.py
+++ b/samplecdxjob.py
@@ -30,15 +30,15 @@ class SampleCDXJob(MRJob):
                 'mapreduce.job.jvm.numtasks': '-1',
 
                 # the output should not be compressed even if the default is to compress output,
-                # otherwise reading from MRJobRunner.stream_output() needs decompression on the fly
+                # otherwise reading from MRJobRunner.cat_output() needs decompression on the fly
                 'mapreduce.output.fileoutputformat.compress': 'false',
 
                 'mapreduce.job.reduces': '1'
                }
 
-    def configure_options(self):
+    def configure_args(self):
         """Custom command line options for indexing"""
-        super(SampleCDXJob, self).configure_options()
+        super(SampleCDXJob, self).configure_args()
 
         self.add_passthru_arg('--shards', dest='shards',
                               type=int,
diff --git a/zipnumclusterjob.py b/zipnumclusterjob.py
index a46f617..ae78295 100644
--- a/zipnumclusterjob.py
+++ b/zipnumclusterjob.py
@@ -38,9 +38,9 @@ class ZipNumClusterJob(MRJob):
                 'mapreduce.job.jvm.numtasks': '-1'
                }
 
-    def configure_options(self):
+    def configure_args(self):
         """Custom command line options for indexing"""
-        super(ZipNumClusterJob, self).configure_options()
+        super(ZipNumClusterJob, self).configure_args()
 
         self.add_passthru_arg('--numlines', dest='numlines',
                               type=int,

From d72d62135666595f0996edcd613c185bbbf1c43d Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 27 Sep 2021 14:26:18 +0200
Subject: [PATCH 20/66] Upgrade to Python 3: - run 2to3-2.7 - upgrade
 dependency python-hadoop (require branch "main") - improve log methods calls:
 use params instead of   string concatenation

---
 indexwarcsjob.py    | 32 ++++++++++++++++----------------
 requirements.txt    |  3 +--
 zipnumclusterjob.py |  6 +++---
 3 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index 6f782a5..894e34a 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -74,15 +74,15 @@ def mapper_init(self):
         try:
             s3client.head_bucket(Bucket=self.options.warc_bucket)
         except botocore.exceptions.ClientError as e:
-            LOG.error('Failed to access bucket {}: {}'.format(
-                self.options.warc_bucket, e))
+            LOG.error('Failed to access bucket %s: %s',
+                      self.options.warc_bucket, e)
             return
 
         try:
             s3client.head_bucket(Bucket=self.options.cdx_bucket)
         except botocore.exceptions.ClientError as e:
-            LOG.error('Failed to access bucket {}: {}'.format(
-                self.options.cdx_bucket, e))
+            LOG.error('Failed to access bucket %s: %s',
+                      self.options.cdx_bucket, e)
             return
 
         self.index_options = {
@@ -97,7 +97,7 @@ def mapper(self, _, line):
         try:
             self._load_and_index(warc_path)
         except Exception as exc:
-            LOG.error('Failed to index ' + warc_path)
+            LOG.error('Failed to index %s', warc_path)
             raise
 
     def _conv_warc_to_cdx_path(self, warc_path):
@@ -110,36 +110,36 @@ def _load_and_index(self, warc_path):
 
         cdx_path = self._conv_warc_to_cdx_path(warc_path)
 
-        LOG.info('Indexing WARC: ' + warc_path)
+        LOG.info('Indexing WARC: %s', warc_path)
         s3client = boto3.client('s3', config=self.boto_config)
 
         if self.options.skip_existing:
             try:
                 s3client.head_object(Bucket=self.options.cdx_bucket,
-                                          Key=cdx_path)
-                LOG.info('Already Exists: ' + cdx_path)
+                                     Key=cdx_path)
+                LOG.info('Already Exists: %s', cdx_path)
                 return
             except botocore.client.ClientError as exception:
                 pass # ok, not found
 
         try:
             s3client.head_object(Bucket=self.options.warc_bucket,
-                                      Key=warc_path)
+                                 Key=warc_path)
         except botocore.client.ClientError as exception:
-            LOG.error('WARC not found: ' + warc_path)
+            LOG.error('WARC not found: %s', warc_path)
             return
 
         with TemporaryFile(mode='w+b',
                            dir=self.options.s3_local_temp_dir) as warctemp:
-            LOG.info('Fetching WARC: ' + warc_path)
+            LOG.info('Fetching WARC: %s', warc_path)
             try:
                 s3client.download_fileobj(self.options.warc_bucket, warc_path, warctemp)
             except botocore.client.ClientError as exception:
-                LOG.error('Failed to download {}: {}'.format(warc_path, exception))
+                LOG.error('Failed to download %s: %s', warc_path, exception)
                 return
 
             warctemp.seek(0)
-            LOG.info('Successfully fetched WARC: ' + warc_path)
+            LOG.info('Successfully fetched WARC: %s', warc_path)
 
             with TemporaryFile(mode='w+b',
                                dir=self.options.s3_local_temp_dir) as cdxtemp:
@@ -150,13 +150,13 @@ def _load_and_index(self, warc_path):
                 # Upload temp
                 cdxtemp.flush()
                 cdxtemp.seek(0)
-                LOG.info('Uploading CDX: ' + cdx_path)
+                LOG.info('Uploading CDX: %s', cdx_path)
                 try:
                     s3client.upload_fileobj(cdxtemp, self.options.cdx_bucket, cdx_path)
                 except botocore.client.ClientError as exception:
-                    LOG.error('Failed to upload {}: {}'.format(cdx_path, exception))
+                    LOG.error('Failed to upload %s: %s', cdx_path, exception)
                     return
-                LOG.info('Successfully uploaded CDX: ' + cdx_path)
+                LOG.info('Successfully uploaded CDX: %s', cdx_path)
 
 
 if __name__ == "__main__":
diff --git a/requirements.txt b/requirements.txt
index b2a3750..84605b9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 mrjob
 boto3
 pywb
-#-e git+https://github.com/matteobertozzi/Hadoop.git#egg=hadoop&subdirectory=python-hadoop
--e git+https://github.com/commoncrawl/python-hadoop.git#egg=master
+-e git+https://github.com/commoncrawl/python-hadoop.git@main#egg=hadoop
diff --git a/zipnumclusterjob.py b/zipnumclusterjob.py
index ae78295..ece7c49 100644
--- a/zipnumclusterjob.py
+++ b/zipnumclusterjob.py
@@ -4,7 +4,7 @@
 import os
 
 import zlib
-import urlparse
+import urllib.parse
 import json
 
 from tempfile import TemporaryFile
@@ -147,7 +147,7 @@ def _do_upload(self):
             if self.options.s3acl:
                 s3args = {'ACL': self.options.s3acl}
 
-            parts = urlparse.urlsplit(self.output_dir)
+            parts = urllib.parse.urlsplit(self.output_dir)
             s3key = parts.path.strip('/') + '/' + self.part_name
             s3url = parts.scheme + '://' + parts.netloc + '/' + s3key
 
@@ -172,7 +172,7 @@ def _write_part(self):
 
         offset = self.gzip_temp.tell()
 
-        buff = '\n'.join(self.curr_lines) + '\n'
+        buff = ('\n'.join(self.curr_lines) + '\n').encode('utf-8')
         self.curr_lines = []
 
         buff = z.compress(buff)

From 2e0e210ba57a356bfca3cc37c01df18cc0436827 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 27 Sep 2021 14:55:22 +0200
Subject: [PATCH 21/66] Upgrade to Python 3: - install Python 3 in bootstrap.sh
 - use "python3" in shell scripts to avoid errors in case Python 2   is still
 installed

---
 bootstrap.sh        | 4 ++--
 run_index_hadoop.sh | 6 +++---
 runindexwarcs.sh    | 2 +-
 runsample.sh        | 2 +-
 runzipcluster.sh    | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/bootstrap.sh b/bootstrap.sh
index 544c29c..2e379c6 100644
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 
 # bootstrap commands
-sudo yum install -y python27 python27-devel python27-pip gcc-c++ git libffi-devel
-sudo pip2.7 install boto3 mrjob simplejson pywb
+sudo yum install -y python3 python3-devel python3-pip gcc-c++ git libffi-devel
+sudo pip3 install boto3 mrjob simplejson pywb
diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 269277f..13f9900 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -81,7 +81,7 @@ set -x
 
 
 if [ -n "$WARC_MANIFEST" ]; then
-    python indexwarcsjob.py \
+    python3 indexwarcsjob.py \
        --cdx_bucket=$WARC_CDX_BUCKET \
        --no-output \
        --cleanup NONE \
@@ -115,7 +115,7 @@ else
     #    and decompress the data explicitly.
     test -e splits.txt && rm splits.txt
     test -e splits.seq && rm splits.seq
-    python dosample.py \
+    python3 dosample.py \
            --verbose \
            --shards=300 \
            --splitfile=$SPLIT_FILE \
@@ -150,7 +150,7 @@ else
 fi
 
 
-python zipnumclusterjob.py \
+python3 zipnumclusterjob.py \
        --shards=300 \
        --splitfile=$SPLIT_FILE \
        --output-dir="$ZIPNUM_CLUSTER_DIR" \
diff --git a/runindexwarcs.sh b/runindexwarcs.sh
index b4a0ecb..cbd6627 100755
--- a/runindexwarcs.sh
+++ b/runindexwarcs.sh
@@ -2,7 +2,7 @@
 
 source ./index_env.sh
 
-python indexwarcsjob.py \
+python3 indexwarcsjob.py \
 --conf-path ./mrjob.conf \
 --cdx_bucket=$WARC_CDX_BUCKET \
 --no-output \
diff --git a/runsample.sh b/runsample.sh
index 138e774..65a5813 100755
--- a/runsample.sh
+++ b/runsample.sh
@@ -2,7 +2,7 @@
 
 source ./index_env.sh
 
-python dosample.py \
+python3 dosample.py \
 --shards=300 \
 --splitfile=$SPLIT_FILE \
 --conf-path ./mrjob.conf \
diff --git a/runzipcluster.sh b/runzipcluster.sh
index 1e23e86..787133a 100755
--- a/runzipcluster.sh
+++ b/runzipcluster.sh
@@ -2,7 +2,7 @@
 
 source ./index_env.sh
 
-python zipnumclusterjob.py \
+python3 zipnumclusterjob.py \
 --shards=300 \
 --splitfile=$SPLIT_FILE \
 --output-dir="$ZIPNUM_CLUSTER_DIR" \

From 347228bcdede2ac93207983b428900626d304d95 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 28 Sep 2021 17:21:56 +0200
Subject: [PATCH 22/66] Make logging work for dosample.py - remove --verbose to
 avoid that the log file is flooded by DEBUG   messages now after logging is
 working

---
 dosample.py         | 20 ++++++++++++++++----
 run_index_hadoop.sh |  1 -
 samplecdxjob.py     |  5 +++--
 zipnumclusterjob.py |  3 ++-
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/dosample.py b/dosample.py
index 77dcd64..5396c52 100644
--- a/dosample.py
+++ b/dosample.py
@@ -1,16 +1,28 @@
-from samplecdxjob import SampleCDXJob
-from seqfileutils import make_text_null_seq
-
+import logging
+import os
 import sys
 import tempfile
-import os
+
+from mrjob.util import log_to_stream
+
+from samplecdxjob import SampleCDXJob
+from seqfileutils import make_text_null_seq
 
 SEQ_FILE = 'splits.seq'
 SPL_FILE = 'splits.txt'
 
+LOG = logging.getLogger('SampleCDXJob')
+LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s: %(message)s"
+
+
 def run_sample_job():
     job = SampleCDXJob(args=sys.argv[1:])
 
+    verbose = '--verbose' in sys.argv[1:]
+    log_to_stream(format=LOG_FORMAT, name='SampleCDXJob', debug=verbose)
+    log_to_stream(format=LOG_FORMAT, name='mrjob', debug=verbose)
+    log_to_stream(format=LOG_FORMAT, name='__main__', debug=verbose)
+
     with job.make_runner() as runner:
         runner.run()
 
diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 13f9900..2378a39 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -116,7 +116,6 @@ else
     test -e splits.txt && rm splits.txt
     test -e splits.seq && rm splits.seq
     python3 dosample.py \
-           --verbose \
            --shards=300 \
            --splitfile=$SPLIT_FILE \
            --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
diff --git a/samplecdxjob.py b/samplecdxjob.py
index 591ac31..4d2d305 100644
--- a/samplecdxjob.py
+++ b/samplecdxjob.py
@@ -8,14 +8,15 @@
 
 
 LOG = logging.getLogger('SampleCDXJob')
-log_to_stream(format="%(asctime)s %(levelname)s %(name)s: %(message)s",name='SampleCDXJob')
+log_to_stream(format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+              name='SampleCDXJob')
 
 
 #=============================================================================
 class SampleCDXJob(MRJob):
     """ Sample CDX key space using reservoir sampling
     MR algorithm adapted:
-    http://had00b.blogspot.com/2013/07/random-subset-in-mapreduce.html
+    https://had00b.blogspot.com/2013/07/random-subset-in-mapreduce.html
     """
 
     HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.CombineTextInputFormat'
diff --git a/zipnumclusterjob.py b/zipnumclusterjob.py
index ece7c49..eacd0e8 100644
--- a/zipnumclusterjob.py
+++ b/zipnumclusterjob.py
@@ -16,7 +16,8 @@
 
 
 LOG = logging.getLogger('ZipNumClusterJob')
-log_to_stream(format="%(asctime)s %(levelname)s %(name)s: %(message)s",name='ZipNumClusterJob')
+log_to_stream(format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+              name='ZipNumClusterJob')
 
 
 #=============================================================================

From ce45503d99e5e46ed4181cd45cd6c2601e3935dc Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Wed, 29 Sep 2021 11:20:32 +0200
Subject: [PATCH 23/66] Upgrade to Python 3: build_local_zipnum.py

---
 build_local_zipnum.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/build_local_zipnum.py b/build_local_zipnum.py
index d39f44d..7518f45 100644
--- a/build_local_zipnum.py
+++ b/build_local_zipnum.py
@@ -1,11 +1,13 @@
-import os
 import glob
-from argparse import ArgumentParser
-from zipnumclusterjob import ZipNumClusterJob
-from mrjob.launch import MRJobLauncher
 import logging
+import os
 import sys
 
+from argparse import ArgumentParser
+
+from mrjob.job import MRJob
+
+from zipnumclusterjob import ZipNumClusterJob
 
 log = logging.getLogger(__name__)
 
@@ -55,8 +57,8 @@ def build_summary_and_loc(output_dir):
             with open(filein, 'r+b') as partfh:
                 for line in partfh:
                     line = line.rstrip()
-                    line += '\t' + str(count)
-                    fh.write(line + '\n')
+                    line += b'\t' + str(count).encode('utf-8')
+                    fh.write(line + b'\n')
                     count += 1
 
     # Write loc file
@@ -68,7 +70,7 @@ def build_summary_and_loc(output_dir):
     print('Building Loc File: ' + loc_file)
     with open(loc_file, 'w+b') as fh:
         for filename in inputs:
-            fh.write(os.path.basename(filename) + '\t' + filename + '\n')
+            fh.write((os.path.basename(filename) + '\t' + filename + '\n').encode('utf-8'))
 
 
 def main():
@@ -82,13 +84,13 @@ def main():
                         help='Number of lines per gzip block (default 3000)')
 
     parser.add_argument('-p', '--parallel', action='store_true',
-                        help='Run in parllel (multiple maps/reducer processes)')
+                        help='Run in parallel (multiple maps/reducer processes)')
 
     r = parser.parse_args()
 
-    MRJobLauncher.set_up_logging(quiet=False,
-                                 verbose=False,
-                                 stream=sys.stderr)
+    MRJob.set_up_logging(quiet=False,
+                         verbose=False,
+                         stream=sys.stderr)
 
     log.setLevel(logging.INFO)
     compat_log = logging.getLogger('mrjob.compat')

From 09fdeb2704e19c016d6f127ed0664adfb9195f54 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Wed, 29 Sep 2021 11:39:28 +0200
Subject: [PATCH 24/66] Update README

---
 README.md | 63 +++++++++++++++++++++++++------------------------------
 1 file changed, 28 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index 55ad27b..4bc75e1 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,19 @@
-WebArchive Url Indexing
+WebArchive URL Indexing
 =======================
 
-This project contains several scripts (MapReduce jobs) for generating url indexes of web archive collections, ususally containing large number of of WARC (or ARC) files. The scripts are designed to ran in Hadoop or Amazon EMR to process terabytes or even petabytes of web archive content. Additionally, thanks to flexibility of the MRJob library,
-the scripts can also run on a local machine to build an index cluster.
+This project contains several scripts (MapReduce jobs) for generating URL indexes of web archive collections, usually containing a large number of WARC (or ARC) files. The scripts are designed to run on Hadoop or Amazon EMR to process terabytes or even petabytes of web archive content. Additionally, thanks to the flexibility of the MRJob library, the scripts can also run on a local machine to build an index cluster.
 
 ## Initial Setup and Usage
 
-These tools use the MRJob Python library for Hadoop/EMR, and are a pure-python solution to web archive indexing.
+Python 3 is required - see the branch `python-2.7` for a previous version running on Python 2.7 (not maintained anymore).
+
+These tools use the MRJob Python library for Hadoop/EMR, and are a pure-Python solution to web archive indexing.
 
 To install [dependencies](#dependencies): `pip install -r requirements.txt`
 
 #### Remote - EMR/Hadoop
 
-*Note: At this time, the scripts have been tested with the CommonCrawl data set on EMR (AMI 3.9.0 + Hadoop 2.4.0) and on CDH 5.8.0.*
+*Note: At this time, the scripts have been tested with the Common Crawl data set on Apache Bigtop 1.5.0 and with Python 2.7 on CDH 6.3.2 and on EMR (AMI 3.9.0 + Hadoop 2.4.0).*
 
 To run with MRJob library on EMR, a system-specific `mrjob.conf` needs to be configured. The file contains all the settings necessary to specify your EMR cluster or to configure non-default settings on other Hadoop clusters. Refer to the [MRJob documentation for details](https://pythonhosted.org/mrjob/guides/configs-basics.html). The shell scripts to launch the tools are supposed to be run on EMR, for other Hadoop clusters replace `-r emr` by `-r hadoop`.
 
@@ -20,7 +21,9 @@ In addition, a bash script `index_env.sh` is used to specify all the relevant pa
 
 You can simply run `cp index_env.sample.sh index_env.sh` to copy the provided sample. Please refer to the file for more details and to fill in the actual paths. Note that on EMR paths to AWS S3 have to be given as `s3://bucket/path` while on Hadoop (no EMR) paths must start with `s3a://`.
 
-Requirements have to be installed on all nodes of the cluster. The script `bootstrap.sh` installs everything needed on EMR, including Python and packages necessary to compile the requirements.
+Requirements have to be installed on all nodes of the cluster. The script [bootstrap.sh](./bootstrap.sh) installs everything needed on EMR, including Python and packages necessary to compile the requirements.
+
+The script [run_index_hadoop.sh](./run_index_hadoop.sh) runs all steps necessary to create the CDX index of a monthly Common Crawl.
 
 #### Local
 
@@ -34,8 +37,7 @@ This repository provides three Hadoop MapReduce jobs to create [a shared url ind
 2. [Sampling CDXs to Create Split File](#sampling-cdxs-to-create-split-file)
 3. [Generating a ZipNum CDX Cluster](#generating-a-zipnum-cdx-cluster)
 
-Each step is a MapReduce job, run with the Python MRJob library. The first step may be omitted if you already have
-indexes for the WARCs.
+Each step is a MapReduce job, run with the Python MRJob library. The first step may be omitted if you already have indexes for the WARCs.
 
 If you have a small number of local cdx files, you also use these scripts to [build a local cluster](#building-a-local-cluster)
 
@@ -44,7 +46,6 @@ If you have a small number of local cdx files, you also use these scripts to [bu
 
 ## Indexing Individual ARC/WARCs to CDX Files ##
 
-
 *Note: If you already have .cdx files for each of your WARC/ARCS, you may skip this step*
 
 The job can be started by running:
@@ -53,13 +54,13 @@ The job can be started by running:
 runindexwarcs.sh
 ```
 
-This boostraps the `indexwarcsjobs.py` script, which will start a map-reduce job to create a cdx file for each WARC/ARC file in the input.
+This bootstraps the [indexwarcsjob.py](./indexwarcsjob.py) script, which will start a MapReduce job to create a cdx file for each WARC/ARC file in the input.
 
 **Input:** A manifest file of WARC/ARCs to be indexed
 
-**Output:** A compressed cdx file (.cdx.gz) for each WARC/ARC processed.
+**Output:** A compressed cdx file (`.cdx.gz`) for each WARC/ARC processed.
 
-The path of each input is kept and the extension is replaced with .cdx.gz.
+The path of each input is kept and the extension is replaced with `.cdx.gz`.
 
 Thus, for inputs:
 
@@ -87,15 +88,15 @@ This job can be started by running:
 runsample.sh
 ```
 
-The actual job, defined in  `samplecdxjob.py` determines *split points* for the cluster for an arbitrary number of splits. The final job will sort all the lines from all the CDX files into N parts (determined by number of reducers), however, in order to do so, it is necessary to determine a rough distribution of the url space.
+The actual job, defined in [samplecdxjob.py](./samplecdxjob.py) determines *split points* for the cluster for an arbitrary number of splits. The final job will sort all the lines from all the CDX files into N parts (determined by number of reducers), however, in order to do so, it is necessary to determine a rough distribution of the url space.
 
 **Input:** A path to per-WARC CDX files (created in step 1)
 
-**Output:** A file containing split points to split CDX space into N shards (in hadoop SequenceFile format) to 
+**Output:** A file containing split points to split CDX space into N shards (in Hadoop SequenceFile format).
 
 *Note: This step is generally only necessary the first time a cluster is created. If a subsequent cluster with similar distribution is created, it is possible to reuse an existing split file. Additionally, it will be possible to create a more accurate split file directly from an existing cluster (TODO)*
 
-To create the split file, all the CDX files are sampled using [a reservoir sampling technique](http://had00b.blogspot.com/2013/07/random-subset-in-mapreduce.html) (This technique may need some refinement but only an *approximate* distribution is needed).
+To create the split file, all the CDX files are sampled using [a reservoir sampling technique](https://had00b.blogspot.com/2013/07/random-subset-in-mapreduce.html) (This technique may need some refinement but only an *approximate* distribution is needed).
 
 The output of this job will be a single file with N-1 split points (for N parts/shards/reducers).
 
@@ -104,7 +105,7 @@ The job creates a plain text file with N-1 lines.
 #### Converting to SequenceFile
 
 However, to be used with the final job, the file needs to be in a Hadoop `SequenceFile<Text, NullWritable>` format.
-Fortunatelly, the `python-hadoop` library provides an easy way to convert a text file to a Hadoop SequenceFile of this format. The `dosample.py` script combines the map-reduce job with the SequenceFile conversion and then uploads the file sequencefile to final destination (currently S3 path).
+Fortunately, the [python-hadoop](https://github.com/commoncrawl/python-hadoop) library provides an easy way to convert a text file to a Hadoop SequenceFile of this format. The [dosample.py](./dosample.py) script combines the MapReduce job with the SequenceFile conversion and then uploads the resulting SequenceFile to its final destination (currently an S3 path).
 
 ### Generating a ZipNum CDX Cluster
 
@@ -114,7 +115,7 @@ The final job can be started by running:
 runzipcluster.sh
 ```
 
-The corresponding script, `zipnumclusterjob.py`, creates the [ZipNum Sharded CDX Cluster](#zipnum-sharded-cdx-cluster) from the individual CDX files (created in the first job) using the split file (created in the second job).
+The corresponding script, [zipnumclusterjob.py](./zipnumclusterjob.py), creates the [ZipNum Sharded CDX Cluster](#zipnum-sharded-cdx-cluster) from the individual CDX files (created in the first job) using the split file (created in the second job).
 
 **Input:** Per-WARC CDX files and split points file (from previous two steps)
 
@@ -136,16 +137,15 @@ This index can then be used with existing tools, such as pywb and OpenWayback, w
 
 Thanks to the flexibility of the MRJob library, it is also possible to build a local ZipNum cluster, no Hadoop or EMR required! (MRJob automatically computes even split points when running locally, so the split file computation step is not necessary).
 
-If you have a number of [CDX](#cdx-file-format) files on disk, you can use the `build_local_zipnum.py` script to directly build a cluster locally on your machine.
+If you have a number of [CDX](#cdx-file-format) files on disk, you can use the [build_local_zipnum.py](./build_local_zipnum.py) script to directly build a cluster locally on your machine.
 
-For example, the following will be a cluster of 25 shards. 
+For example, the following command creates a cluster of 25 shards.
 
 ```
 python build_local_zipnum.py /path/to/zipnum/ -s 25 -p /path/to/cdx/*.cdx.gz
 ```
 
-(The `-p` flag will specify if parallel processes wil be created
-for each map/reduce task, or (if absent) all tasks will be created sequentially).
+(The `-p` flag creates parallel processes for each map/reduce task; if it is absent, all tasks are run sequentially.)
 
 After the script runs, the following files will be created:
 ```
@@ -155,18 +155,17 @@ After the script runs, the following files will be created:
 /path/to/zipnum/cluster.loc
 ```
 
-The `cluster.summary` and `cluster.loc` files may be used with index ZipNum cluster support in the wayback machine, including
-pywb and OpenWayback.
+The `cluster.summary` and `cluster.loc` files may be used with index ZipNum cluster support in the wayback machine, including pywb and OpenWayback.
 
 
 ### Dependencies
 
 These tools depend on the following libraries/tools. If using Hadoop, they need to be installed on the cluster.
-If Using EMR, the MRJob library can do this automatically when starting a new cluster, and a bootstrap script is also provided for easy installation seperate in a persistant EMR job flow.
+If using EMR, the MRJob library can do this automatically when starting a new cluster, and a bootstrap script is also provided for easy separate installation in a persistent EMR job flow.
 
 - [pywb web replay tools](https://github.com/ikreymer/pywb) for creating CDX indexes from WARCs and ARCs
 - [MRJob](https://pythonhosted.org/mrjob/) MapReduce library for running MapReduce jobs on Hadoop, Amazon EMR or locally.
-- [python-hadoop](https://github.com/matteobertozzi/Hadoop/tree/master/python-hadoop) - A python hadoop utility library for creating a hadoop SequenceFile in pure Python. (for generating split point SequenceFile)
+- [python-hadoop](https://github.com/commoncrawl/python-hadoop) - A Python Hadoop utility library for creating a Hadoop SequenceFile in pure Python (used to define splits required for total-order sorting). This project is forked from Matteo Bertozzi's [Hadoop](https://github.com/matteobertozzi/Hadoop/tree/master/python-hadoop) and ported to Python 3.
 
 
 ## Additional Info
@@ -190,24 +189,18 @@ The distributed indexing job uses this tool to build an index for each file in p
 
 ### CDX File Format
 
-An index for a web archive (WARC or ARC) file is often referred to as a CDX file, probably from **C**apture/**C**rawl 
-in**D**e**X** **(CDX)**. A CDX file is typically a sorted plain-text file (optionally gzip-compressed) format, with each line
-representing info about a single capture in an archive. The CDX contains multiple fields, typically the url and where to
-find the archived contents of that url. Unfortunately, no standardized format for CDX files exists, and there have been
-many formats, usually with varying number of space-seperated fields. Here is an old reference for [CDX File](https://archive.org/web/researcher/cdx_file_format.php) (from Internet Archive). In practice, CDX files typically contain a subset of the possible fields.
+An index for a web archive (WARC or ARC) file is often referred to as a CDX file, probably from **C**apture/**C**rawl in**D**e**X** **(CDX)**. A CDX file is typically a sorted plain-text file (optionally gzip-compressed), with each line representing info about a single capture in an archive. The CDX contains multiple fields, typically the url and where to find the archived contents of that url. Unfortunately, no standardized format for CDX files exists, and there have been many formats, usually with a varying number of space-separated fields. Here is an old reference for [CDX File](https://archive.org/web/researcher/cdx_file_format.php) (from Internet Archive). In practice, CDX files typically contain a subset of the possible fields.
 
-While there are no required fields, in practice, the following 6 fields
-are needed to identify a record: `url search key`, `url timestamp`, `original url`, `archive file`, `archive offset`, `archive length`. The search key is often the url transformed and 'canonicalized' in a way to make it easier for lexigraphic seaching.
+While there are no required fields, in practice, the following 6 fields are needed to identify a record: `url search key`, `url timestamp`, `original url`, `archive file`, `archive offset`, `archive length`. The search key is often the url transformed and 'canonicalized' in a way to make it easier for lexicographic searching.
 A common transformation is to reverse subdomains `example.com` -> `com,example,)/` to allow for searching by domain, then subdomains.
 
-The indexing job uses the flexible pywb `cdx-indexer` to create indexs of a certain format. However, the other jobs are compatible with any existing CDX format as well. Other indexing tools can be used also but require seperate integration.
+The indexing job uses the flexible pywb `cdx-indexer` to create indexes of a certain format. However, the other jobs are compatible with any existing CDX format as well. Other indexing tools can be used also but require separate integration.
 
 ### ZipNum Sharded CDX Cluster
 
 A CDX file is generally accessed by doing a simple binary search through the file. This scales well to very large (multi-gigabyte) CDX files. However, for very large archives (many terabytes or petabytes), binary search across a single file has its limits.
 
 A more scalable alternative to a single CDX file is gzip compressed chunked cluster, with a binary searchable index.
-In this format, sometimes called the ZipNum or Ziplines cluster (for some X number of cdx lines zipped together), all actual
-CDX lines are gzipped compressed an concatenated together. To allow for random access, the lines are gzipped in groups of X lines (often 3000, but can be anything). This allows for the full index to be spread over N number of gzipped files, but has the overhead of requiring N lines to be read for each lookup. Generally, this overhead is negligible when looking up large indexes, and non-existent when doing a range query across many CDX lines.
+In this format, sometimes called the ZipNum or Ziplines cluster (for some X number of cdx lines zipped together), all actual CDX lines are gzip-compressed and concatenated together. To allow for random access, the lines are gzipped in groups of X lines (often 3000, but can be anything). This allows for the full index to be spread over N number of gzipped files, but has the overhead of requiring N lines to be read for each lookup. Generally, this overhead is negligible when looking up large indexes, and non-existent when doing a range query across many CDX lines.
 
 The goal of the last job is to create such a index, split into a number of arbitrary shards. For each shard, there is an index file and a secondary index file. At the end, the secondary index is concatenated to form the final, binary searchable index. The number of shards is variable and is equal to the number of reducers used.

From d55a75673a34189b11d4172b79b8711b2031ab8e Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Wed, 29 Sep 2021 15:57:03 +0200
Subject: [PATCH 25/66] Script to build Common Crawl index on Hadoop: -
 increase task memory to avoid potential OOM errors

---
 run_index_hadoop.sh | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 2378a39..598733b 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -89,8 +89,8 @@ if [ -n "$WARC_MANIFEST" ]; then
        --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
        --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
        -r hadoop \
-       --jobconf "mapreduce.map.memory.mb=800" \
-       --jobconf "mapreduce.map.java.opts=-Xmx512m" \
+       --jobconf "mapreduce.map.memory.mb=1600" \
+       --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
        $S3_LOCAL_TEMP_DIR \
        $WARC_MANIFEST
 fi
@@ -120,8 +120,8 @@ else
            --splitfile=$SPLIT_FILE \
            --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
            --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-           --jobconf "mapreduce.map.memory.mb=640" \
-           --jobconf "mapreduce.map.java.opts=-Xmx512m" \
+           --jobconf "mapreduce.map.memory.mb=1600" \
+           --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
            --jobconf "mapreduce.map.output.compress=true" \
            --jobconf "mapreduce.output.fileoutputformat.compress=false" \
            -r hadoop $WARC_CDX_SAMPLE
@@ -157,10 +157,10 @@ python3 zipnumclusterjob.py \
        --no-output \
        --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
        --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-       --jobconf "mapreduce.map.memory.mb=640" \
-       --jobconf "mapreduce.map.java.opts=-Xmx512m" \
-       --jobconf "mapreduce.reduce.memory.mb=1536" \
-       --jobconf "mapreduce.reduce.java.opts=-Xmx1024m" \
+       --jobconf "mapreduce.map.memory.mb=1600" \
+       --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
+       --jobconf "mapreduce.reduce.memory.mb=3072" \
+       --jobconf "mapreduce.reduce.java.opts=-Xmx2048m" \
        --jobconf "fs.s3a.acl.default=PublicRead" \
        -r hadoop $WARC_CDX
 

From 1a362dfc30e96e124da36fb4b1d1cb1dbb4b7a04 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Fri, 20 May 2022 17:05:58 +0200
Subject: [PATCH 26/66] Improve authenticated S3 access configuration: - rely
 on IAM roles for authentication - do not pass credentials via environment
 variables which   may leak into log files etc. - drop setting object-level
 ACLs (use bucket-level ACLs) - update configuration accordingly

---
 README.md           | 13 +++++++++----
 index_env.sample.sh |  2 --
 indexwarcsjob.py    |  6 ++++--
 publish_index.sh    |  4 ++--
 run_index_hadoop.sh | 16 ----------------
 runindexwarcs.sh    |  2 --
 runsample.sh        |  2 --
 runzipcluster.sh    |  2 --
 8 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 4bc75e1..c7ebb6e 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ These tools use the MRJob Python library for Hadoop/EMR, and are a pure-Python s
 
 To install [dependencies](#dependencies): `pip install -r requirements.txt`
 
-#### Remote - EMR/Hadoop
+### Remote - EMR/Hadoop
 
 *Note: At this time, the scripts have been tested with the Common Crawl data set on Apache Bigtop 1.5.0 and with Python 2.7 on CDH 6.3.2 and on EMR (AMI 3.9.0 + Hadoop 2.4.0).*
 
@@ -25,11 +25,16 @@ Requirements have to be installed on all nodes of the cluster. The script [boots
 
 The script [run_index_hadoop.sh](./run_index_hadoop.sh) runs all steps necessary to create the CDX index of a monthly Common Crawl.
 
-#### Local
+### Local
 
 No additional setup is necessary. See [building a local cluster](#building-a-local-cluster).
 
-### Tools Provided
+### S3 Read and Write Permissions
+
+Permissions to read from and write to the involved locations on S3 need to be granted – in "remote" mode to all nodes of the cluster. This is best done by attaching IAM roles to the EC2 cluster instances, see [boto3 configuring credentials](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials). In local mode, a credentials file or environment variables could also be an option.
+
+
+## Tools Provided
 
 This repository provides three Hadoop MapReduce jobs to create [a shared url index](#zipnum-sharded-cdx-cluster) from an input list of WARC/ARC files. This process can be split into three jobs.
 
@@ -44,7 +49,7 @@ If you have a small number of local cdx files, you also use these scripts to [bu
 [Additional background info on indexing and the formats used](#additional-info).
 
 
-## Indexing Individual ARC/WARCs to CDX Files ##
+### Indexing Individual ARC/WARCs to CDX Files ###
 
 *Note: If you already have .cdx files for each of your WARC/ARCS, you may skip this step*
 
diff --git a/index_env.sample.sh b/index_env.sample.sh
index 9c37687..a25a341 100644
--- a/index_env.sample.sh
+++ b/index_env.sample.sh
@@ -1,6 +1,4 @@
 #!/bin/bash
-export AWS_ACCESS_KEY_ID="<s3 id>"
-export AWS_SECRET_ACCESS_KEY="<s3 secret>"
 
 export WARC_MANIFEST="<warc input file>"
 export WARC_CDX_BUCKET="<cdx output bucket>"
diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index 894e34a..7fbacdf 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -64,8 +64,10 @@ def configure_args(self):
                               default=None)
 
     def mapper_init(self):
-        # Note: this assumes that credentials are set via
-        # AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env variables
+        # Note: this assumes that credentials are properly configured, see
+        #   https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials
+        # best via IAM roles:
+        #   https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#best-practices-for-configuring-credentials
         self.boto_config = botocore.client.Config(
             read_timeout=180,
             retries={'max_attempts' : 20})
diff --git a/publish_index.sh b/publish_index.sh
index c30f47f..578ba89 100755
--- a/publish_index.sh
+++ b/publish_index.sh
@@ -23,7 +23,7 @@ if ! [ -e $YEARWEEK-metadata.yaml ]; then
 		exit 1
 	fi
 fi
-aws s3 cp $YEARWEEK-metadata.yaml s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/metadata.yaml --acl public-read
+aws s3 cp $YEARWEEK-metadata.yaml s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/metadata.yaml
 
 test -d cdx-$YEARWEEK || mkdir cdx-$YEARWEEK
 cd cdx-$YEARWEEK
@@ -33,7 +33,7 @@ aws s3 cp --recursive --exclude '*' --include 'part-*' s3://commoncrawl/cc-index
 cat part-* | awk '{printf "%s\t%s\n",$0,NR}' >cluster.idx
 LC_ALL=C sort -c ./cluster.idx
 #rm ./part-00*
-aws s3 cp ./cluster.idx s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx --acl public-read
+aws s3 cp ./cluster.idx s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx
 
 # remove obsolete data from bucket
 #  - map-reduce _SUCCESS file/marker
diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 598733b..884a16c 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -24,18 +24,10 @@ Create a Common Crawl index for a monthly crawl. All steps are run on Hadoop.
   <split_file>           Optional split file to be reused from previous crawl with similar distribution of URLs.
                          If not given, splits are calculated and saved on the default split file path.
 
-Environment variables depend upon:
-  AWS_ACCESS_KEY_ID      - AWS credentials used by Boto to access the bucket (read and write)
-  AWS_SECRET_ACCESS_KEY
 EOF
     exit 1
 fi
 
-if [ -z "$AWS_ACCESS_KEY_ID" ] || [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
-    echo "AWS credentials must passed to Boto via environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY!"
-    exit 1
-fi
-
 
 YEARWEEK="$1"
 WARC_MANIFEST="$2"
@@ -86,8 +78,6 @@ if [ -n "$WARC_MANIFEST" ]; then
        --no-output \
        --cleanup NONE \
        --skip-existing \
-       --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-       --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
        -r hadoop \
        --jobconf "mapreduce.map.memory.mb=1600" \
        --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
@@ -118,8 +108,6 @@ else
     python3 dosample.py \
            --shards=300 \
            --splitfile=$SPLIT_FILE \
-           --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-           --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
            --jobconf "mapreduce.map.memory.mb=1600" \
            --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
            --jobconf "mapreduce.map.output.compress=true" \
@@ -153,14 +141,10 @@ python3 zipnumclusterjob.py \
        --shards=300 \
        --splitfile=$SPLIT_FILE \
        --output-dir="$ZIPNUM_CLUSTER_DIR" \
-       --s3-upload-acl="public-read" \
        --no-output \
-       --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-       --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
        --jobconf "mapreduce.map.memory.mb=1600" \
        --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
        --jobconf "mapreduce.reduce.memory.mb=3072" \
        --jobconf "mapreduce.reduce.java.opts=-Xmx2048m" \
-       --jobconf "fs.s3a.acl.default=PublicRead" \
        -r hadoop $WARC_CDX
 
diff --git a/runindexwarcs.sh b/runindexwarcs.sh
index cbd6627..f9bef87 100755
--- a/runindexwarcs.sh
+++ b/runindexwarcs.sh
@@ -6,7 +6,5 @@ python3 indexwarcsjob.py \
 --conf-path ./mrjob.conf \
 --cdx_bucket=$WARC_CDX_BUCKET \
 --no-output \
---cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
---cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
 -r emr $WARC_MANIFEST &> /tmp/emrrun.log &
 
diff --git a/runsample.sh b/runsample.sh
index 65a5813..6aca0f7 100755
--- a/runsample.sh
+++ b/runsample.sh
@@ -6,7 +6,5 @@ python3 dosample.py \
 --shards=300 \
 --splitfile=$SPLIT_FILE \
 --conf-path ./mrjob.conf \
---cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
---cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
 -r emr $WARC_CDX &> /tmp/emrrun.log &
 
diff --git a/runzipcluster.sh b/runzipcluster.sh
index 787133a..27b4682 100755
--- a/runzipcluster.sh
+++ b/runzipcluster.sh
@@ -8,7 +8,5 @@ python3 zipnumclusterjob.py \
 --output-dir="$ZIPNUM_CLUSTER_DIR" \
 --no-output \
 --conf-path ./mrjob.conf \
---cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
---cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
 -r emr $WARC_CDX &> /tmp/emrrun2.log &
 

From bcd9c0f804d868d0e246728e7566e9347e95affe Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Fri, 31 Mar 2023 16:42:14 +0200
Subject: [PATCH 27/66] Improve shell scripts: quote variables potentially
 containing white space

---
 publish_index.sh    | 12 ++++++------
 run_index_hadoop.sh | 18 +++++++++---------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/publish_index.sh b/publish_index.sh
index 578ba89..441a6bd 100755
--- a/publish_index.sh
+++ b/publish_index.sh
@@ -13,20 +13,20 @@ set -e
 
 
 ## Create the metadata (title) for the index on the website
-if ! [ -e $YEARWEEK-metadata.yaml ]; then
+if ! [ -e "$YEARWEEK-metadata.yaml" ]; then
 	if [ -n "$MONTH" ]; then
 		YEAR=${YEARWEEK%%-*}
-		echo "title: '${MONTH^} $YEAR Index'" >$YEARWEEK-metadata.yaml
+		echo "title: '${MONTH^} $YEAR Index'" >"$YEARWEEK-metadata.yaml"
 	else
-		aws s3 cp s3://commoncrawl/cc-index/collections/CC-MAIN-2015-18/metadata.yaml $YEARWEEK-metadata.yaml
+		aws s3 cp s3://commoncrawl/cc-index/collections/CC-MAIN-2015-18/metadata.yaml "$YEARWEEK-metadata.yaml"
 		echo "Please, edit $YEARWEEK-metadata.yaml"
 		exit 1
 	fi
 fi
-aws s3 cp $YEARWEEK-metadata.yaml s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/metadata.yaml
+aws s3 cp "$YEARWEEK-metadata.yaml" "s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/metadata.yaml"
 
-test -d cdx-$YEARWEEK || mkdir cdx-$YEARWEEK
-cd cdx-$YEARWEEK
+test -d "cdx-$YEARWEEK" || mkdir "cdx-$YEARWEEK"
+cd "cdx-$YEARWEEK"
 
 ## create cluster index
 aws s3 cp --recursive --exclude '*' --include 'part-*' s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ ./
diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 884a16c..131995b 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -81,8 +81,8 @@ if [ -n "$WARC_MANIFEST" ]; then
        -r hadoop \
        --jobconf "mapreduce.map.memory.mb=1600" \
        --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
-       $S3_LOCAL_TEMP_DIR \
-       $WARC_MANIFEST
+       "$S3_LOCAL_TEMP_DIR" \
+       "$WARC_MANIFEST"
 fi
 
 
@@ -107,12 +107,12 @@ else
     test -e splits.seq && rm splits.seq
     python3 dosample.py \
            --shards=300 \
-           --splitfile=$SPLIT_FILE \
+           --splitfile="$SPLIT_FILE" \
            --jobconf "mapreduce.map.memory.mb=1600" \
            --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
            --jobconf "mapreduce.map.output.compress=true" \
            --jobconf "mapreduce.output.fileoutputformat.compress=false" \
-           -r hadoop $WARC_CDX_SAMPLE
+           -r hadoop "$WARC_CDX_SAMPLE"
 
 	# in case, the sequence file wasn't written:
 	# 1. verify the content
@@ -126,25 +126,25 @@ else
 	# 3. verify the sequence file
 	#      hadoop fs -text file:$PWD/splits.seq | less
 
-    mv splits.seq ${CRAWL}-splits.seq
+    mv splits.seq "${CRAWL}-splits.seq"
 
-    if aws s3 ls s3${SPLIT_FILE#s3a}; then
+    if aws s3 ls "s3${SPLIT_FILE#s3a}"; then
         echo "Ok, split file has been uploaded"
     else
         echo "Uploading split file ..."
-        aws s3 cp ${CRAWL}-splits.seq s3${SPLIT_FILE#s3a}
+        aws s3 cp "${CRAWL}-splits.seq" "s3${SPLIT_FILE#s3a}"
     fi
 fi
 
 
 python3 zipnumclusterjob.py \
        --shards=300 \
-       --splitfile=$SPLIT_FILE \
+       --splitfile="$SPLIT_FILE" \
        --output-dir="$ZIPNUM_CLUSTER_DIR" \
        --no-output \
        --jobconf "mapreduce.map.memory.mb=1600" \
        --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
        --jobconf "mapreduce.reduce.memory.mb=3072" \
        --jobconf "mapreduce.reduce.java.opts=-Xmx2048m" \
-       -r hadoop $WARC_CDX
+       -r hadoop "$WARC_CDX"
 

From a14805422c455c963ca261ea387b2580c7a0ef18 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Fri, 31 Mar 2023 16:42:37 +0200
Subject: [PATCH 28/66] ZipNumClusterJob not to write temporary output to
 s3://commoncrawl/ (implements #5) - add option --zipnum-dir defining the
 location (upload path)   for ZipNum CDX files (cdx-nnnnn.gz), independent of
 the output   directory (--output-dir) where part-nnnnn files are written to, 
  later concatenated to the cluster.idx file - point --output-dir to an internal
 bucket holding temporary   index data - remove obsolete option
 `--s3-upload-acl` - simplify index publication script (no clean-up on
 publication   bucket required)

---
 publish_index.sh    | 12 +++---------
 run_index_hadoop.sh |  7 ++++++-
 zipnumclusterjob.py | 22 +++++++---------------
 3 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/publish_index.sh b/publish_index.sh
index 441a6bd..78d2c8f 100755
--- a/publish_index.sh
+++ b/publish_index.sh
@@ -29,15 +29,9 @@ test -d "cdx-$YEARWEEK" || mkdir "cdx-$YEARWEEK"
 cd "cdx-$YEARWEEK"
 
 ## create cluster index
-aws s3 cp --recursive --exclude '*' --include 'part-*' s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ ./
+
+aws s3 cp --recursive --exclude '*' --include 'part-*' "s3://commoncrawl-index-temp/CC-MAIN-$YEARWEEK/indexes/" ./
 cat part-* | awk '{printf "%s\t%s\n",$0,NR}' >cluster.idx
 LC_ALL=C sort -c ./cluster.idx
-#rm ./part-00*
-aws s3 cp ./cluster.idx s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx
-
-# remove obsolete data from bucket
-#  - map-reduce _SUCCESS file/marker
-aws s3 rm s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/_SUCCESS
-#  - part-00* files concatenated to cluster.idx
-aws s3 rm --recursive s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/ --exclude "*" --include "part-00*"
 
+aws s3 cp ./cluster.idx "s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx"
diff --git a/run_index_hadoop.sh b/run_index_hadoop.sh
index 131995b..1edb354 100755
--- a/run_index_hadoop.sh
+++ b/run_index_hadoop.sh
@@ -58,6 +58,9 @@ export WARC_CDX_SAMPLE="s3a://$WARC_CDX_BUCKET/$CRAWL/cdx/segments/*[358]/*/*.cd
 # SPLIT_FILE could be reused from previous crawl with similar distribution of URLs, see REUSE_SPLIT_FILE
 export SPLIT_FILE="s3a://$WARC_CDX_BUCKET/$CRAWL/splits.seq"
 
+# output path of part-n files of the zipnum job, later concatenated into the cluster.idx
+export ZIPNUM_OUTPUT_DIR="s3a://$WARC_CDX_BUCKET/$CRAWL/indexes/"
+
 # configure S3 buffer directory
 if [ -n "$S3_LOCAL_TEMP_DIR" ]; then
 	S3_LOCAL_TEMP_DIR="--s3_local_temp_dir=$S3_LOCAL_TEMP_DIR"
@@ -140,11 +143,13 @@ fi
 python3 zipnumclusterjob.py \
        --shards=300 \
        --splitfile="$SPLIT_FILE" \
-       --output-dir="$ZIPNUM_CLUSTER_DIR" \
+       --zipnum-dir="$ZIPNUM_CLUSTER_DIR" \
+       --output-dir="$ZIPNUM_OUTPUT_DIR" \
        --no-output \
        --jobconf "mapreduce.map.memory.mb=1600" \
        --jobconf "mapreduce.map.java.opts=-Xmx1024m" \
        --jobconf "mapreduce.reduce.memory.mb=3072" \
        --jobconf "mapreduce.reduce.java.opts=-Xmx2048m" \
+       --jobconf "mapreduce.fileoutputcommitter.cleanup-failures.ignored=true" \
        -r hadoop "$WARC_CDX"
 
diff --git a/zipnumclusterjob.py b/zipnumclusterjob.py
index eacd0e8..751782d 100644
--- a/zipnumclusterjob.py
+++ b/zipnumclusterjob.py
@@ -62,8 +62,8 @@ def configure_args(self):
                                    '= num of entries in splits + 1' +
                                    '= num of reducers used')
 
-        self.add_passthru_arg('--s3-upload-acl', dest='s3acl',
-                              help='S3 access permissions (ACL) to be applied to CDX files')
+        self.add_passthru_arg('--zipnum-dir', dest='zipnum_dir',
+                              help='Upload path / directory to place zipnum CDX files')
 
     def jobconf(self):
         orig_jobconf = super(ZipNumClusterJob, self).jobconf()
@@ -106,11 +106,7 @@ def reducer_init(self):
 
         self.part_name = 'cdx-%05d.gz' % int(self.part_num)
 
-        self.output_dir = self._get_prop(['mapreduce_output_fileoutputformat_outputdir',
-                                          'mapred.output.dir',
-                                          'mapred_work_output_dir'])
-
-        assert(self.output_dir)
+        assert(self.options.zipnum_dir)
         self.gzip_temp = TemporaryFile(mode='w+b')
 
     def reducer(self, key, values):
@@ -137,31 +133,27 @@ def _do_upload(self):
         self.gzip_temp.flush()
         self.gzip_temp.seek(0)
         #TODO: move to generalized put() function
-        if self.output_dir.startswith('s3://') or self.output_dir.startswith('s3a://'):
+        if self.options.zipnum_dir.startswith('s3://') or self.options.zipnum_dir.startswith('s3a://'):
             import boto3
             import botocore
             boto_config = botocore.client.Config(
                 read_timeout=180,
                 retries={'max_attempts' : 20})
             s3client = boto3.client('s3', config=boto_config)
-            s3args = None
-            if self.options.s3acl:
-                s3args = {'ACL': self.options.s3acl}
 
-            parts = urllib.parse.urlsplit(self.output_dir)
+            parts = urllib.parse.urlsplit(self.options.zipnum_dir)
             s3key = parts.path.strip('/') + '/' + self.part_name
             s3url = parts.scheme + '://' + parts.netloc + '/' + s3key
 
             LOG.info('Uploading index to ' + s3url)
             try:
-                s3client.upload_fileobj(self.gzip_temp, parts.netloc, s3key,
-                                        ExtraArgs=s3args)
+                s3client.upload_fileobj(self.gzip_temp, parts.netloc, s3key)
             except botocore.client.ClientError as exception:
                 LOG.error('Failed to upload {}: {}'.format(s3url, exception))
                 return
             LOG.info('Successfully uploaded index file: ' + s3url)
         else:
-            path = os.path.join(self.output_dir, self.part_name)
+            path = os.path.join(self.options.zipnum_dir, self.part_name)
 
             with open(path, 'w+b') as target:
                 shutil.copyfileobj(self.gzip_temp, target)

From 8cff8a0ac8e04fe1f6b2609965ac7672062eff9b Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Fri, 14 Jun 2024 19:14:30 -0500
Subject: [PATCH 29/66] added replacements for wet/wat paths so we get cdx
 filenames out

---
 .gitignore       | 1 +
 indexwarcsjob.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 20b9ebc..ecfe406 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,7 @@ nosetests.xml
 
 # Other
 mrjob.conf
+mrjob*.conf
 index_env.sh
 
 .vagrant
diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index 7fbacdf..5a94ada 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -106,6 +106,8 @@ def _conv_warc_to_cdx_path(self, warc_path):
         # set cdx path
         cdx_path = warc_path.replace('crawl-data', 'cc-index/cdx')
         cdx_path = cdx_path.replace('.warc.gz', '.cdx.gz')
+        cdx_path = cdx_path.replace('.wet.gz', '.wet.cdx.gz')
+        cdx_path = cdx_path.replace('.wat.gz', '.wat.cdx.gz')
         return cdx_path
 
     def _load_and_index(self, warc_path):

From efaa6f1dfcbdb5192a8c78b0993c0eade6202208 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Mon, 17 Jun 2024 11:05:56 -0500
Subject: [PATCH 30/66] made name matching more specific as Sebastian suggested

---
 indexwarcsjob.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/indexwarcsjob.py b/indexwarcsjob.py
index 5a94ada..5a6de8b 100644
--- a/indexwarcsjob.py
+++ b/indexwarcsjob.py
@@ -106,8 +106,8 @@ def _conv_warc_to_cdx_path(self, warc_path):
         # set cdx path
         cdx_path = warc_path.replace('crawl-data', 'cc-index/cdx')
         cdx_path = cdx_path.replace('.warc.gz', '.cdx.gz')
-        cdx_path = cdx_path.replace('.wet.gz', '.wet.cdx.gz')
-        cdx_path = cdx_path.replace('.wat.gz', '.wat.cdx.gz')
+        cdx_path = cdx_path.replace('.warc.wet.gz', '.wet.cdx.gz')
+        cdx_path = cdx_path.replace('.warc.wat.gz', '.wat.cdx.gz')
         return cdx_path
 
     def _load_and_index(self, warc_path):

From 48a3672474ed0c59cdbf33121dbfcb9a062a5361 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Mon, 9 Sep 2024 20:06:06 -0500
Subject: [PATCH 31/66] Add an alternate spark version of indexwarcsjob
 (without mrjob)

---
 indexwarcsjob-ccpyspark.py | 76 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 indexwarcsjob-ccpyspark.py

diff --git a/indexwarcsjob-ccpyspark.py b/indexwarcsjob-ccpyspark.py
new file mode 100644
index 0000000..8dd5ed6
--- /dev/null
+++ b/indexwarcsjob-ccpyspark.py
@@ -0,0 +1,76 @@
+import logging
+from sparkcc import CCFileProcessorSparkJob
+
+from tempfile import TemporaryFile
+from pywb.indexer.cdxindexer import write_cdx_index
+from gzip import GzipFile
+
+
+LOG = logging.getLogger('IndexWARCJob')
+
+
+#=============================================================================
+class IndexWARCJob(CCFileProcessorSparkJob):
+    """ This job receives as input a manifest of WARC/ARC files and produces
+    a CDX index per file
+
+    The pywb.indexer.cdxindexer is used to create the index, with a fixed set of options
+    """
+
+    name = 'IndexWARCJob'
+
+    index_options = {
+        'surt_ordered': True,
+        'sort': True,
+        'cdxj': True,
+        #'minimal': True
+    }
+
+    def add_arguments(self, parser):
+        super(CCFileProcessorSparkJob, self).add_arguments(parser)
+        parser.add_argument("--output_base_url", required=False,
+                            default='my_cdx_bucket',
+                            help="destination for cdx output")
+        
+        parser.add_argument("--skip-existing", dest='skip_existing', action='store_true',
+                            help="skip processing files that already have CDX")
+    
+    def _conv_warc_to_cdx_path(self, warc_path):
+        # set cdx path
+        cdx_path = warc_path.replace('crawl-data', 'cc-index/cdx')
+        cdx_path = cdx_path.replace('.warc.gz', '.cdx.gz')
+        cdx_path = cdx_path.replace('.warc.wet.gz', '.wet.cdx.gz')
+        cdx_path = cdx_path.replace('.warc.wat.gz', '.wat.cdx.gz')
+        return cdx_path
+
+    def process_file(self, warc_path, tempfd):
+        
+        cdx_path = self._conv_warc_to_cdx_path(warc_path)
+
+        LOG.info('Indexing WARC: %s', warc_path)
+
+        if self.args.skip_existing and \
+            self.check_for_output_file(cdx_path,self.args.output_base_url):
+            LOG.info('Already Exists: %s', cdx_path)
+            yield cdx_path, 0
+            return
+
+        with TemporaryFile(mode='w+b',
+                            dir=self.args.local_temp_dir) as cdxtemp:
+            with GzipFile(fileobj=cdxtemp, mode='w+b') as cdxfile:
+                # Index to temp
+                write_cdx_index(cdxfile, tempfd, warc_path, **self.index_options)
+
+            # Upload temp
+            cdxtemp.flush()
+            cdxtemp.seek(0)
+
+            self.write_output_file(cdx_path, cdxtemp, self.args.output_base_url)
+            LOG.info('Successfully uploaded CDX: %s', cdx_path)
+
+            yield cdx_path, 1
+
+
+if __name__ == "__main__":
+    job = IndexWARCJob()
+    job.run()

From 406b63f903c79bd76c976928cf819f6158ffdb72 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Thu, 7 Nov 2024 20:29:34 -0600
Subject: [PATCH 32/66] fix so we can properly skip fully failed files and log
 them.

---
 indexwarcsjob-ccpyspark.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/indexwarcsjob-ccpyspark.py b/indexwarcsjob-ccpyspark.py
index 8dd5ed6..91e1f72 100644
--- a/indexwarcsjob-ccpyspark.py
+++ b/indexwarcsjob-ccpyspark.py
@@ -57,18 +57,27 @@ def process_file(self, warc_path, tempfd):
 
         with TemporaryFile(mode='w+b',
                             dir=self.args.local_temp_dir) as cdxtemp:
+            
+            success = False
             with GzipFile(fileobj=cdxtemp, mode='w+b') as cdxfile:
                 # Index to temp
-                write_cdx_index(cdxfile, tempfd, warc_path, **self.index_options)
+                try:
+                    write_cdx_index(cdxfile, tempfd, warc_path, **self.index_options)
+                    success = True
+                except Exception as exc:
+                    # log detailed stack trace
+                    LOG.error('Failed to index %s: %s', warc_path, exc)
 
             # Upload temp
             cdxtemp.flush()
             cdxtemp.seek(0)
 
-            self.write_output_file(cdx_path, cdxtemp, self.args.output_base_url)
-            LOG.info('Successfully uploaded CDX: %s', cdx_path)
-
-            yield cdx_path, 1
+            if success:
+                self.write_output_file(cdx_path, cdxtemp, self.args.output_base_url)
+                LOG.info('Successfully uploaded CDX: %s', cdx_path)
+                yield cdx_path, 1
+            else:
+                yield cdx_path, -1
 
 
 if __name__ == "__main__":

From 65bf452c197ff885658f17ff6af9927869bde121 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Thu, 7 Nov 2024 20:30:48 -0600
Subject: [PATCH 33/66] feat: draft version of zipnum cluster job as spark
 (untested)

---
 zipnumcluster-ccpyspark.py | 157 +++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 zipnumcluster-ccpyspark.py

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
new file mode 100644
index 0000000..ffe3509
--- /dev/null
+++ b/zipnumcluster-ccpyspark.py
@@ -0,0 +1,157 @@
+import logging
+from sparkcc import CCSparkJob
+import os
+from pyspark.sql.functions import row_number, concat, lit, col
+import gzip
+from typing import Iterator, Tuple, List
+from pyspark.sql.types import StringType, LongType, StructType, StructField
+import zlib
+from pyspark.sql.window import Window
+
+LOG = logging.getLogger('IndexWARCJob')
+
+class ZipNumClusterCdx(CCSparkJob):
+    name = 'ZipNumClusterCdx'
+
+    def add_arguments(self, parser):
+        super().add_arguments(parser)
+        parser.add_argument("--output_base_url", required=False,
+                            default='my_cdx_bucket',
+                            help="destination for output")
+        parser.add_argument("--num_lines", type=int, required=False,
+                            default=3000,
+                            help="number of lines to compress in each chunk")
+        parser.add_argument("--num_output_partitions", type=int, required=False,
+                            default=300,
+                            help="number of partitions/shards")
+    
+    def get_partition_id(self, surt_key: str, num_partitions: int) -> int:
+        """
+        Determine partition based on SURT key structure.
+        Handles special cases like common TLD prefixes.
+        """
+        # Split SURT key into components
+        parts = surt_key.split(',')
+        
+        # Handle special cases for domain-based SURT keys
+        if len(parts) > 1:
+            # Skip common TLDs for better distribution
+            if parts[0] in {'com', 'org', 'net', 'edu', 'gov'}:
+                key_for_hash = parts[1]
+            else:
+                key_for_hash = parts[0]
+        else:
+            # Handle non-domain SURT keys (like IP addresses)
+            key_for_hash = parts[0]
+        
+        # Take first 3 meaningful characters for distribution
+        prefix = key_for_hash[:3].ljust(3)
+        
+        # Create a number from the characters that preserves ordering
+        # This ensures similar prefixes go to nearby partitions
+        value = (ord(prefix[0]) << 16) + (ord(prefix[1]) << 8) + ord(prefix[2])
+        
+        return value % num_partitions
+    
+    def run_job(self, session):
+        os.makedirs(self.args.output_base_url, exist_ok=True)
+        input = self.args.input_base_url + self.args.input
+        num_partitions = self.args.num_output_partitions
+
+        def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str, str]]]) -> Iterator[Tuple[str, int, int, int]]:
+            """Process partition with chunked compression"""
+            z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+            output_filename = f"cdx{partition_id}.gz"
+            output_file = f"{self.args.output_base_url}/{output_filename}"
+            index_entries = []
+            current_offset = 0
+            chunk_size = self.args.num_lines
+            
+            # Sort partition contents
+            partition_data = sorted(partition_iter, key=lambda x: x[0])
+            
+            current_chunk = []
+            chunk_records = []  # Store full record info
+            
+            with open(output_file, 'wb') as f:
+                for _, (surt_key, timestamp, json_data) in partition_data:
+                    line = f"{surt_key} {timestamp} {json_data}\n"
+                    current_chunk.append(line)
+                    chunk_records.append((surt_key, timestamp))  # Store both surt_key and timestamp
+                    
+                    if len(current_chunk) >= chunk_size:
+                        # Compress and write chunk
+                        chunk_data = ''.join(current_chunk).encode('utf-8')
+                        compressed = z.compress(chunk_data)
+                        chunk_length = len(compressed)
+                        f.write(compressed)
+                        
+                        # Create single index entry per record
+                        for sk, ts in chunk_records:
+                            index_entries.append((sk, ts, partition_id, current_offset, chunk_length))
+                        
+                        current_offset += chunk_length
+                        current_chunk = []
+                        chunk_records = []
+                    
+                # Handle final chunk
+                if current_chunk:
+                    chunk_data = ''.join(current_chunk).encode('utf-8')
+                    compressed = z.compress(chunk_data) + z.flush()
+                    chunk_length = len(compressed)
+                    f.write(compressed)
+                    
+                    for sk, ts in chunk_records:
+                        index_entries.append((sk, ts, partition_id, current_offset, chunk_length))
+            
+            return index_entries
+
+        # Single pass processing with fixed-width partitioning
+        rdd = session.sparkContext.textFile(input) \
+            .map(lambda line: tuple(line.strip().split(" ", 2))) \
+            .keyBy(lambda x: x[0]) \
+            .partitionBy(num_partitions, 
+                        partitionFunc=lambda key: self.get_partition_id(key, num_partitions)) \
+            .mapPartitionsWithIndex(process_partition)
+
+        # Create index
+        index_schema = StructType([
+            StructField("surt_key", StringType(), False),
+            StructField("timestamp", StringType(), False),
+            StructField("partition_id", LongType(), False),
+            StructField("offset", LongType(), False),
+            StructField("length", LongType(), False)
+        ])
+        
+        w = Window.orderBy("surt_key")
+        # Create index with correct filename formatting
+        index_df = session.createDataFrame(rdd, index_schema)\
+            .withColumn("sequence_number", row_number().over(w))
+
+        index_df = index_df\
+            .withColumn("output_filename", concat(lit("cdx"), col("partition_id").cast(StringType()), lit(".gz")))\
+            .select("surt_key", "timestamp", "output_filename", "offset", "length", "sequence_number")
+        
+        # Save main index, sorted by surt_key for binary search
+        index_df.sort("surt_key").coalesce(1).write \
+        .option("sep", "\t").csv(
+            f"{self.args.output_base_url}/index.idx", 
+            header=False,
+            mode="overwrite"
+        )
+
+        # Create secondary index for partition boundaries
+        partition_bounds = index_df.groupBy("output_filename") \
+            .agg({"surt_key": "min", "surt_key": "max"}) \
+            .sort("output_filename")
+        
+        partition_bounds.coalesce(1).write \
+        .option("sep", "\t").csv(
+            f"{self.args.output_base_url}/secondary_index.idx",
+            header=False,
+            mode="overwrite"
+        )
+
+if __name__ == "__main__":
+    job = ZipNumClusterCdx()
+    job.run()
\ No newline at end of file

From fcb5695bae1cacc023a293c6b6a3cac755eaae4e Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Thu, 7 Nov 2024 20:36:15 -0600
Subject: [PATCH 34/66] docs: adding note about zipnum testing and behaviour

---
 zipnumcluster-ccpyspark.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index ffe3509..5e2ba28 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -10,6 +10,10 @@
 
 LOG = logging.getLogger('IndexWARCJob')
 
+# note: this is LESS strict about partitioning than the original
+# based on my read of the zipnum clustering code, this shoudl be just fine
+# but so far, it's untested. I plan to test it with the index server we use (locally)
+
 class ZipNumClusterCdx(CCSparkJob):
     name = 'ZipNumClusterCdx'
 

From e6bb313e4b59218d98fb65f59aa56aa151cf0af9 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 20 Nov 2024 13:15:27 -0600
Subject: [PATCH 35/66] feat: reservoir sampled method of zipnum cluster job

---
 zipnumcluster-ccpyspark.py | 173 ++++++++++++++++++++-----------------
 1 file changed, 96 insertions(+), 77 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 5e2ba28..d610e6c 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -7,13 +7,10 @@
 from pyspark.sql.types import StringType, LongType, StructType, StructField
 import zlib
 from pyspark.sql.window import Window
+import random
 
 LOG = logging.getLogger('IndexWARCJob')
 
-# note: this is LESS strict about partitioning than the original
-# based on my read of the zipnum clustering code, this shoudl be just fine
-# but so far, it's untested. I plan to test it with the index server we use (locally)
-
 class ZipNumClusterCdx(CCSparkJob):
     name = 'ZipNumClusterCdx'
 
@@ -29,94 +26,116 @@ def add_arguments(self, parser):
                             default=300,
                             help="number of partitions/shards")
     
-    def get_partition_id(self, surt_key: str, num_partitions: int) -> int:
-        """
-        Determine partition based on SURT key structure.
-        Handles special cases like common TLD prefixes.
-        """
-        # Split SURT key into components
-        parts = surt_key.split(',')
+    def get_partition_boundaries(self, session, input_path: str, num_partitions: int) -> List[str]:
+        """Use reservoir sampling to determine partition boundaries"""
+        def reservoir_sample(iterator: Iterator[str], k: int) -> List[str]:
+            sample = []
+            for i, item in enumerate(iterator):
+                if i < k:
+                    sample.append(item)
+                else:
+                    j = random.randint(0, i)
+                    if j < k:
+                        sample[j] = item
+            return sample
+
+        # Collect samples and sort them
+        samples = session.sparkContext.textFile(input_path) \
+            .map(lambda line: line.split(" ", 1)[0]) \
+            .mapPartitions(lambda x: reservoir_sample(x, 100)) \
+            .collect()
         
-        # Handle special cases for domain-based SURT keys
-        if len(parts) > 1:
-            # Skip common TLDs for better distribution
-            if parts[0] in {'com', 'org', 'net', 'edu', 'gov'}:
-                key_for_hash = parts[1]
-            else:
-                key_for_hash = parts[0]
-        else:
-            # Handle non-domain SURT keys (like IP addresses)
-            key_for_hash = parts[0]
+        samples.sort()
         
-        # Take first 3 meaningful characters for distribution
-        prefix = key_for_hash[:3].ljust(3)
+        # Select evenly spaced samples as boundaries
+        step = len(samples) // (num_partitions - 1)
+        return [samples[i] for i in range(0, len(samples), step)][:num_partitions-1]
+
+    def get_partition_id(self, key: str, boundaries: List[str]) -> int:
+        """Determine partition based on range boundaries"""
+        for i, boundary in enumerate(boundaries):
+            if key < boundary:
+                return i
+        return len(boundaries)
+
+    def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str, str]]]) -> Iterator[Tuple[str, str, int, int, int]]:
+        """Process partition with chunked compression and first-entry-only indexing"""
+        z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+        output_filename = f"cdx{partition_id}.gz"
+        output_file = f"{self.args.output_base_url}/{output_filename}"
+        index_entries = []
+        current_offset = 0
+        chunk_size = self.args.num_lines
         
-        # Create a number from the characters that preserves ordering
-        # This ensures similar prefixes go to nearby partitions
-        value = (ord(prefix[0]) << 16) + (ord(prefix[1]) << 8) + ord(prefix[2])
+        # Sort partition contents
+        partition_data = sorted(partition_iter, key=lambda x: x[0])
         
-        return value % num_partitions
-    
-    def run_job(self, session):
-        os.makedirs(self.args.output_base_url, exist_ok=True)
-        input = self.args.input_base_url + self.args.input
-        num_partitions = self.args.num_output_partitions
-
-        def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str, str]]]) -> Iterator[Tuple[str, int, int, int]]:
-            """Process partition with chunked compression"""
-            z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-            output_filename = f"cdx{partition_id}.gz"
-            output_file = f"{self.args.output_base_url}/{output_filename}"
-            index_entries = []
-            current_offset = 0
-            chunk_size = self.args.num_lines
-            
-            # Sort partition contents
-            partition_data = sorted(partition_iter, key=lambda x: x[0])
-            
-            current_chunk = []
-            chunk_records = []  # Store full record info
-            
-            with open(output_file, 'wb') as f:
-                for _, (surt_key, timestamp, json_data) in partition_data:
-                    line = f"{surt_key} {timestamp} {json_data}\n"
-                    current_chunk.append(line)
-                    chunk_records.append((surt_key, timestamp))  # Store both surt_key and timestamp
-                    
-                    if len(current_chunk) >= chunk_size:
-                        # Compress and write chunk
-                        chunk_data = ''.join(current_chunk).encode('utf-8')
-                        compressed = z.compress(chunk_data)
-                        chunk_length = len(compressed)
-                        f.write(compressed)
-                        
-                        # Create single index entry per record
-                        for sk, ts in chunk_records:
-                            index_entries.append((sk, ts, partition_id, current_offset, chunk_length))
-                        
-                        current_offset += chunk_length
-                        current_chunk = []
-                        chunk_records = []
-                    
-                # Handle final chunk
-                if current_chunk:
+        current_chunk = []
+        first_record = None
+        
+        with open(output_file, 'wb') as f:
+            for _, (surt_key, timestamp, json_data) in partition_data:
+                line = f"{surt_key} {timestamp} {json_data}\n"
+                if not first_record:
+                    first_record = (surt_key, timestamp)
+                current_chunk.append(line)
+                
+                if len(current_chunk) >= chunk_size:
+                    # Compress and write chunk
                     chunk_data = ''.join(current_chunk).encode('utf-8')
+                    z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
                     compressed = z.compress(chunk_data) + z.flush()
                     chunk_length = len(compressed)
                     f.write(compressed)
                     
-                    for sk, ts in chunk_records:
-                        index_entries.append((sk, ts, partition_id, current_offset, chunk_length))
+                    # Only index the first entry of the chunk
+                    if first_record:
+                        index_entries.append((
+                            first_record[0],  # surt_key
+                            first_record[1],  # timestamp
+                            partition_id,
+                            current_offset,
+                            chunk_length
+                        ))
+                    
+                    current_offset += chunk_length
+                    current_chunk = []
+                    first_record = None
             
-            return index_entries
+            # Handle final chunk
+            if current_chunk:
+                chunk_data = ''.join(current_chunk).encode('utf-8')
+                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+                compressed = z.compress(chunk_data) + z.flush()
+                chunk_length = len(compressed)
+                f.write(compressed)
+                
+                if first_record:
+                    index_entries.append((
+                        first_record[0],
+                        first_record[1],
+                        partition_id,
+                        current_offset,
+                        chunk_length
+                    ))
+        
+        return index_entries
+
+    def run_job(self, session):
+        os.makedirs(self.args.output_base_url, exist_ok=True)
+        input = self.args.input_base_url + self.args.input
+        num_partitions = self.args.num_output_partitions
+
+        # Get partition boundaries using reservoir sampling
+        boundaries = self.get_partition_boundaries(session, input, num_partitions)
 
-        # Single pass processing with fixed-width partitioning
+        # Process with range partitioning
         rdd = session.sparkContext.textFile(input) \
             .map(lambda line: tuple(line.strip().split(" ", 2))) \
             .keyBy(lambda x: x[0]) \
             .partitionBy(num_partitions, 
-                        partitionFunc=lambda key: self.get_partition_id(key, num_partitions)) \
-            .mapPartitionsWithIndex(process_partition)
+                        partitionFunc=lambda key: self.get_partition_id(key, boundaries)) \
+            .mapPartitionsWithIndex(self.process_partition)
 
         # Create index
         index_schema = StructType([

From 4a99403b31a44505fc1265976b4ed3f6de2d7c9e Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 20 Nov 2024 14:19:53 -0600
Subject: [PATCH 36/66] fix: simpler ordered partition id

---
 zipnumcluster-ccpyspark.py | 25 +++++--------------------
 1 file changed, 5 insertions(+), 20 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 5e2ba28..f742a82 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -34,26 +34,11 @@ def get_partition_id(self, surt_key: str, num_partitions: int) -> int:
         Determine partition based on SURT key structure.
         Handles special cases like common TLD prefixes.
         """
-        # Split SURT key into components
-        parts = surt_key.split(',')
-        
-        # Handle special cases for domain-based SURT keys
-        if len(parts) > 1:
-            # Skip common TLDs for better distribution
-            if parts[0] in {'com', 'org', 'net', 'edu', 'gov'}:
-                key_for_hash = parts[1]
-            else:
-                key_for_hash = parts[0]
-        else:
-            # Handle non-domain SURT keys (like IP addresses)
-            key_for_hash = parts[0]
-        
-        # Take first 3 meaningful characters for distribution
-        prefix = key_for_hash[:3].ljust(3)
-        
-        # Create a number from the characters that preserves ordering
-        # This ensures similar prefixes go to nearby partitions
-        value = (ord(prefix[0]) << 16) + (ord(prefix[1]) << 8) + ord(prefix[2])
+        if len(surt_key) < 3:
+            surt_key = surt_key.ljust(3, 'a')
+        value = 0
+        for i, c in enumerate(surt_key):
+            value = (value << 8) + ord(c)
         
         return value % num_partitions
     

From e8dcf78cea6d53af08aebe316d65f4c6e4e83adf Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 20 Nov 2024 14:21:38 -0600
Subject: [PATCH 37/66] fix: re-init and flush gz

---
 zipnumcluster-ccpyspark.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index f742a82..4062d4e 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -71,7 +71,8 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
                     if len(current_chunk) >= chunk_size:
                         # Compress and write chunk
                         chunk_data = ''.join(current_chunk).encode('utf-8')
-                        compressed = z.compress(chunk_data)
+                        z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+                        compressed = z.compress(chunk_data) + z.flush()
                         chunk_length = len(compressed)
                         f.write(compressed)
                         
@@ -86,6 +87,7 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
                 # Handle final chunk
                 if current_chunk:
                     chunk_data = ''.join(current_chunk).encode('utf-8')
+                    z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
                     compressed = z.compress(chunk_data) + z.flush()
                     chunk_length = len(compressed)
                     f.write(compressed)

From 2a1de315d24cd910d3f79979c2ce3312605edf2a Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 20 Nov 2024 15:42:09 -0600
Subject: [PATCH 38/66] fix: bugfix for edge conditions

---
 zipnumcluster-ccpyspark.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index d610e6c..abfe52c 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -48,8 +48,14 @@ def reservoir_sample(iterator: Iterator[str], k: int) -> List[str]:
         samples.sort()
         
         # Select evenly spaced samples as boundaries
-        step = len(samples) // (num_partitions - 1)
-        return [samples[i] for i in range(0, len(samples), step)][:num_partitions-1]
+        if num_partitions > 1:
+            if len(samples) < num_partitions:
+                # If we have fewer samples than requested partitions, use all samples
+                return samples[:-1]  # exclude last sample to ensure num_partitions-1 boundaries
+            step = max(1, len(samples) // (num_partitions - 1))
+            return [samples[i] for i in range(0, len(samples), step)][:num_partitions-1]
+        else:
+            return samples
 
     def get_partition_id(self, key: str, boundaries: List[str]) -> int:
         """Determine partition based on range boundaries"""

From 951ede474c5f4bcbd493dec95927ce5386c9aa84 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 20 Nov 2024 16:03:25 -0600
Subject: [PATCH 39/66] fix: getting order and structure of final index proper,
 and in single file

---
 zipnumcluster-ccpyspark.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index abfe52c..6b8ec07 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -1,7 +1,7 @@
 import logging
 from sparkcc import CCSparkJob
 import os
-from pyspark.sql.functions import row_number, concat, lit, col
+from pyspark.sql.functions import row_number, concat, lit, col, min as min_, max as max_
 import gzip
 from typing import Iterator, Tuple, List
 from pyspark.sql.types import StringType, LongType, StructType, StructField
@@ -159,7 +159,7 @@ def run_job(self, session):
 
         index_df = index_df\
             .withColumn("output_filename", concat(lit("cdx"), col("partition_id").cast(StringType()), lit(".gz")))\
-            .select("surt_key", "timestamp", "output_filename", "offset", "length", "sequence_number")
+            .select("surt_key", "timestamp", "output_filename", "offset", "length", "sequence_number", "partition_id")
         
         # Save main index, sorted by surt_key for binary search
         index_df.sort("surt_key").coalesce(1).write \
@@ -170,16 +170,23 @@ def run_job(self, session):
         )
 
         # Create secondary index for partition boundaries
-        partition_bounds = index_df.groupBy("output_filename") \
-            .agg({"surt_key": "min", "surt_key": "max"}) \
-            .sort("output_filename")
+        partition_bounds = index_df.groupBy("partition_id") \
+            .agg(
+                min_("surt_key").alias("min_surt_key"),
+                max_("surt_key").alias("max_surt_key")
+            ) \
+            .select("partition_id", "min_surt_key", "max_surt_key") \
+            .sort("partition_id")
         
-        partition_bounds.coalesce(1).write \
-        .option("sep", "\t").csv(
-            f"{self.args.output_base_url}/secondary_index.idx",
-            header=False,
-            mode="overwrite"
-        )
+        # Write the partition boundaries to a single text file
+        secondary_index_path = f"{self.args.output_base_url}/secondary_index.idx"
+        with open(secondary_index_path, 'w') as f:
+            for row in partition_bounds.collect():
+                filename = f"cdx{row['partition_id']}.gz"
+                # Write min entry
+                f.write(f"{filename}\tmin\t{row['min_surt_key']}\n")
+                # Write max entry
+                f.write(f"{filename}\tmax\t{row['max_surt_key']}\n")
 
 if __name__ == "__main__":
     job = ZipNumClusterCdx()

From d3f5ee9a52e70dc6f4d42efeebef0229a62fd9b9 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 20 Nov 2024 16:14:28 -0600
Subject: [PATCH 40/66] fix: fix cdx filenames

---
 zipnumcluster-ccpyspark.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 6b8ec07..353ae8f 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -67,7 +67,7 @@ def get_partition_id(self, key: str, boundaries: List[str]) -> int:
     def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str, str]]]) -> Iterator[Tuple[str, str, int, int, int]]:
         """Process partition with chunked compression and first-entry-only indexing"""
         z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-        output_filename = f"cdx{partition_id}.gz"
+        output_filename = f"cdx-{partition_id:05d}.gz"
         output_file = f"{self.args.output_base_url}/{output_filename}"
         index_entries = []
         current_offset = 0
@@ -162,12 +162,12 @@ def run_job(self, session):
             .select("surt_key", "timestamp", "output_filename", "offset", "length", "sequence_number", "partition_id")
         
         # Save main index, sorted by surt_key for binary search
-        index_df.sort("surt_key").coalesce(1).write \
-        .option("sep", "\t").csv(
-            f"{self.args.output_base_url}/index.idx", 
-            header=False,
-            mode="overwrite"
-        )
+        #index_df.sort("surt_key").coalesce(1).write \
+        #.option("sep", "\t").csv(
+        #    f"{self.args.output_base_url}/index.idx", 
+        #    header=False,
+        #    mode="overwrite"
+        #)
 
         # Create secondary index for partition boundaries
         partition_bounds = index_df.groupBy("partition_id") \

From 3f876a85e074005e5c7b11f1413c222b6d384d5d Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 20 Nov 2024 16:32:03 -0600
Subject: [PATCH 41/66] fix: final cluster.idx logic

---
 zipnumcluster-ccpyspark.py | 104 +++++++++++++++----------------------
 1 file changed, 43 insertions(+), 61 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 353ae8f..ca28bdd 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -64,9 +64,8 @@ def get_partition_id(self, key: str, boundaries: List[str]) -> int:
                 return i
         return len(boundaries)
 
-    def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str, str]]]) -> Iterator[Tuple[str, str, int, int, int]]:
-        """Process partition with chunked compression and first-entry-only indexing"""
-        z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+    def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str, str]]]) -> Iterator[Tuple[str, str, str, str, int, int, int]]:
+        """Process partition with chunked compression and chunk boundary tracking"""
         output_filename = f"cdx-{partition_id:05d}.gz"
         output_file = f"{self.args.output_base_url}/{output_filename}"
         index_entries = []
@@ -77,13 +76,15 @@ def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[st
         partition_data = sorted(partition_iter, key=lambda x: x[0])
         
         current_chunk = []
-        first_record = None
+        chunk_min_surt = None
+        chunk_max_surt = None
         
         with open(output_file, 'wb') as f:
             for _, (surt_key, timestamp, json_data) in partition_data:
                 line = f"{surt_key} {timestamp} {json_data}\n"
-                if not first_record:
-                    first_record = (surt_key, timestamp)
+                if chunk_min_surt is None:
+                    chunk_min_surt = surt_key
+                chunk_max_surt = surt_key  # Will end up as max since data is sorted
                 current_chunk.append(line)
                 
                 if len(current_chunk) >= chunk_size:
@@ -94,19 +95,20 @@ def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[st
                     chunk_length = len(compressed)
                     f.write(compressed)
                     
-                    # Only index the first entry of the chunk
-                    if first_record:
-                        index_entries.append((
-                            first_record[0],  # surt_key
-                            first_record[1],  # timestamp
-                            partition_id,
-                            current_offset,
-                            chunk_length
-                        ))
+                    # Index entry with chunk boundaries
+                    index_entries.append((
+                        chunk_min_surt,  # min surt
+                        chunk_max_surt,  # max surt
+                        output_filename,  # filename
+                        partition_id,
+                        current_offset,
+                        chunk_length,
+                        len(current_chunk)  # number of records in chunk
+                    ))
                     
                     current_offset += chunk_length
                     current_chunk = []
-                    first_record = None
+                    chunk_min_surt = None
             
             # Handle final chunk
             if current_chunk:
@@ -116,14 +118,15 @@ def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[st
                 chunk_length = len(compressed)
                 f.write(compressed)
                 
-                if first_record:
-                    index_entries.append((
-                        first_record[0],
-                        first_record[1],
-                        partition_id,
-                        current_offset,
-                        chunk_length
-                    ))
+                index_entries.append((
+                    chunk_min_surt,
+                    chunk_max_surt,
+                    output_filename,
+                    partition_id,
+                    current_offset,
+                    chunk_length,
+                    len(current_chunk)
+                ))
         
         return index_entries
 
@@ -143,50 +146,29 @@ def run_job(self, session):
                         partitionFunc=lambda key: self.get_partition_id(key, boundaries)) \
             .mapPartitionsWithIndex(self.process_partition)
 
-        # Create index
+        # Update schema for new index format
         index_schema = StructType([
-            StructField("surt_key", StringType(), False),
-            StructField("timestamp", StringType(), False),
+            StructField("min_surt", StringType(), False),
+            StructField("max_surt", StringType(), False),
+            StructField("filename", StringType(), False),
             StructField("partition_id", LongType(), False),
             StructField("offset", LongType(), False),
-            StructField("length", LongType(), False)
+            StructField("length", LongType(), False),
+            StructField("num_records", LongType(), False)
         ])
         
-        w = Window.orderBy("surt_key")
-        # Create index with correct filename formatting
-        index_df = session.createDataFrame(rdd, index_schema)\
-            .withColumn("sequence_number", row_number().over(w))
-
-        index_df = index_df\
-            .withColumn("output_filename", concat(lit("cdx"), col("partition_id").cast(StringType()), lit(".gz")))\
-            .select("surt_key", "timestamp", "output_filename", "offset", "length", "sequence_number", "partition_id")
-        
-        # Save main index, sorted by surt_key for binary search
-        #index_df.sort("surt_key").coalesce(1).write \
-        #.option("sep", "\t").csv(
-        #    f"{self.args.output_base_url}/index.idx", 
-        #    header=False,
-        #    mode="overwrite"
-        #)
-
-        # Create secondary index for partition boundaries
-        partition_bounds = index_df.groupBy("partition_id") \
-            .agg(
-                min_("surt_key").alias("min_surt_key"),
-                max_("surt_key").alias("max_surt_key")
-            ) \
-            .select("partition_id", "min_surt_key", "max_surt_key") \
-            .sort("partition_id")
+        index_df = session.createDataFrame(rdd, index_schema).orderBy("min_surt")
         
-        # Write the partition boundaries to a single text file
-        secondary_index_path = f"{self.args.output_base_url}/secondary_index.idx"
-        with open(secondary_index_path, 'w') as f:
-            for row in partition_bounds.collect():
-                filename = f"cdx{row['partition_id']}.gz"
+        # Write chunk-level index
+        chunk_index_path = f"{self.args.output_base_url}/cluster.idx"
+        with open(chunk_index_path, 'w') as f:
+            seq = 1
+            for row in index_df.collect():
                 # Write min entry
-                f.write(f"{filename}\tmin\t{row['min_surt_key']}\n")
-                # Write max entry
-                f.write(f"{filename}\tmax\t{row['max_surt_key']}\n")
+                f.write(f"{row['min_surt']}\t{row['filename']}\t{row['offset']}\t{row['length']}\t{seq}\n")
+                # Write max entry (was just for testing, we don't really need this in final index I don't think...)
+                # f.write(f"{row['max_surt']}\t{row['filename']}\t{row['offset']}\t{row['length']}\t{row['sequence_number']}\n")
+                seq += 1
 
 if __name__ == "__main__":
     job = ZipNumClusterCdx()

From 688e8799b14e4ad66997fc0a07d84c6fddc6c67f Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 27 Nov 2024 09:56:18 -0600
Subject: [PATCH 42/66] fix: get zipnumcluster-pyspark working with s3/emr

---
 zipnumcluster-ccpyspark.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index ca28bdd..b28131e 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -1,5 +1,5 @@
 import logging
-from sparkcc import CCSparkJob
+from sparkcc import CCFileProcessorSparkJob
 import os
 from pyspark.sql.functions import row_number, concat, lit, col, min as min_, max as max_
 import gzip
@@ -11,11 +11,14 @@
 
 LOG = logging.getLogger('IndexWARCJob')
 
-class ZipNumClusterCdx(CCSparkJob):
+# TODO: WE USE CCFileProcessorSparkJob here only for write_output_file, we should probably move write_output_file to CCSparkJob instead.
+# It's OK for this one, because we override the entire run_job method, but it's not ideal, because we're not really doing "file-wise" processing here...
+
+class ZipNumClusterCdx(CCFileProcessorSparkJob):
     name = 'ZipNumClusterCdx'
 
     def add_arguments(self, parser):
-        super().add_arguments(parser)
+        super(CCFileProcessorSparkJob,self).add_arguments(parser)
         parser.add_argument("--output_base_url", required=False,
                             default='my_cdx_bucket',
                             help="destination for output")
@@ -67,7 +70,6 @@ def get_partition_id(self, key: str, boundaries: List[str]) -> int:
     def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str, str]]]) -> Iterator[Tuple[str, str, str, str, int, int, int]]:
         """Process partition with chunked compression and chunk boundary tracking"""
         output_filename = f"cdx-{partition_id:05d}.gz"
-        output_file = f"{self.args.output_base_url}/{output_filename}"
         index_entries = []
         current_offset = 0
         chunk_size = self.args.num_lines
@@ -79,7 +81,7 @@ def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[st
         chunk_min_surt = None
         chunk_max_surt = None
         
-        with open(output_file, 'wb') as f:
+        with open(output_filename, 'wb') as f:
             for _, (surt_key, timestamp, json_data) in partition_data:
                 line = f"{surt_key} {timestamp} {json_data}\n"
                 if chunk_min_surt is None:
@@ -128,6 +130,11 @@ def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[st
                     len(current_chunk)
                 ))
         
+        with open(output_filename, 'rb') as fd:
+            self.write_output_file(output_filename, fd, self.args.output_base_url)
+        
+        os.unlink(output_filename)
+
         return index_entries
 
     def run_job(self, session):
@@ -160,7 +167,7 @@ def run_job(self, session):
         index_df = session.createDataFrame(rdd, index_schema).orderBy("min_surt")
         
         # Write chunk-level index
-        chunk_index_path = f"{self.args.output_base_url}/cluster.idx"
+        chunk_index_path = f"cluster.idx"
         with open(chunk_index_path, 'w') as f:
             seq = 1
             for row in index_df.collect():
@@ -170,6 +177,12 @@ def run_job(self, session):
                 # f.write(f"{row['max_surt']}\t{row['filename']}\t{row['offset']}\t{row['length']}\t{row['sequence_number']}\n")
                 seq += 1
 
+        with open(chunk_index_path, 'rb') as fd:
+            self.write_output_file(chunk_index_path, fd, self.args.output_base_url)
+        
+        os.unlink(chunk_index_path)
+        
+
 if __name__ == "__main__":
     job = ZipNumClusterCdx()
     job.run()
\ No newline at end of file

From 72996935603013e06dddfaaa8fa3235f3274f76a Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 27 Nov 2024 13:05:10 -0600
Subject: [PATCH 43/66] chore: merged latest normal spark solution upwards

---
 zipnumcluster-ccpyspark.py | 178 ++++++++++++++++++++++---------------
 1 file changed, 105 insertions(+), 73 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 4062d4e..9b706d2 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -7,6 +7,9 @@
 from pyspark.sql.types import StringType, LongType, StructType, StructField
 import zlib
 from pyspark.sql.window import Window
+import random
+import pickle
+from pyspark import StorageLevel
 
 LOG = logging.getLogger('IndexWARCJob')
 
@@ -22,70 +25,43 @@ def add_arguments(self, parser):
         parser.add_argument("--output_base_url", required=False,
                             default='my_cdx_bucket',
                             help="destination for output")
+        parser.add_argument("--partition_boundries_file", required=False,
+                            help="Full path to a file containing partition boundaries. if specified, and does not exist, will be created, otherwise, will be used.")
         parser.add_argument("--num_lines", type=int, required=False,
                             default=3000,
                             help="number of lines to compress in each chunk")
         parser.add_argument("--num_output_partitions", type=int, required=False,
                             default=300,
                             help="number of partitions/shards")
-    
-    def get_partition_id(self, surt_key: str, num_partitions: int) -> int:
-        """
-        Determine partition based on SURT key structure.
-        Handles special cases like common TLD prefixes.
-        """
-        if len(surt_key) < 3:
-            surt_key = surt_key.ljust(3, 'a')
-        value = 0
-        for i, c in enumerate(surt_key):
-            value = (value << 8) + ord(c)
-        
-        return value % num_partitions
-    
-    def run_job(self, session):
-        os.makedirs(self.args.output_base_url, exist_ok=True)
-        input = self.args.input_base_url + self.args.input
-        num_partitions = self.args.num_output_partitions
 
-        def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str, str]]]) -> Iterator[Tuple[str, int, int, int]]:
-            """Process partition with chunked compression"""
-            z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-            output_filename = f"cdx{partition_id}.gz"
-            output_file = f"{self.args.output_base_url}/{output_filename}"
-            index_entries = []
-            current_offset = 0
-            chunk_size = self.args.num_lines
-            
-            # Sort partition contents
-            partition_data = sorted(partition_iter, key=lambda x: x[0])
-            
-            current_chunk = []
-            chunk_records = []  # Store full record info
-            
-            with open(output_file, 'wb') as f:
-                for _, (surt_key, timestamp, json_data) in partition_data:
-                    line = f"{surt_key} {timestamp} {json_data}\n"
-                    current_chunk.append(line)
-                    chunk_records.append((surt_key, timestamp))  # Store both surt_key and timestamp
-                    
-                    if len(current_chunk) >= chunk_size:
-                        # Compress and write chunk
-                        chunk_data = ''.join(current_chunk).encode('utf-8')
-                        z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-                        compressed = z.compress(chunk_data) + z.flush()
-                        chunk_length = len(compressed)
-                        f.write(compressed)
-                        
-                        # Create single index entry per record
-                        for sk, ts in chunk_records:
-                            index_entries.append((sk, ts, partition_id, current_offset, chunk_length))
-                        
-                        current_offset += chunk_length
-                        current_chunk = []
-                        chunk_records = []
-                    
-                # Handle final chunk
-                if current_chunk:
+    def get_partition_id(self, key: str, boundaries: List[str]) -> int:
+        """Determine partition based on range boundaries"""
+        for i, boundary in enumerate(boundaries):
+            if key < boundary:
+                return i
+        return len(boundaries)
+
+    def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str]]]) -> Iterator[Tuple[str, str, str, str, int, int, int]]:
+        """Process partition with chunked compression and chunk boundary tracking"""
+        output_filename = f"cdx-{partition_id:05d}.gz"
+        index_entries = []
+        current_offset = 0
+        chunk_size = self.args.num_lines
+        
+        current_chunk = []
+        chunk_min_surt = None
+        chunk_max_surt = None
+        
+        with open(output_filename, 'wb') as f:
+            for surt_key, (timestamp, json_data) in partition_iter:
+                line = f"{surt_key} {timestamp} {json_data}\n"
+                if chunk_min_surt is None:
+                    chunk_min_surt = surt_key
+                chunk_max_surt = surt_key  # Will end up as max since data is sorted
+                current_chunk.append(line)
+                
+                if len(current_chunk) >= chunk_size:
+                    # Compress and write chunk
                     chunk_data = ''.join(current_chunk).encode('utf-8')
                     z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
                     compressed = z.compress(chunk_data) + z.flush()
@@ -95,15 +71,79 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
                     for sk, ts in chunk_records:
                         index_entries.append((sk, ts, partition_id, current_offset, chunk_length))
             
-            return index_entries
+            # Handle final chunk
+            if current_chunk:
+                chunk_data = ''.join(current_chunk).encode('utf-8')
+                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+                compressed = z.compress(chunk_data) + z.flush()
+                chunk_length = len(compressed)
+                f.write(compressed)
+                
+                index_entries.append((
+                    chunk_min_surt,
+                    chunk_max_surt,
+                    output_filename,
+                    partition_id,
+                    current_offset,
+                    chunk_length,
+                    len(current_chunk)
+                ))
+        
+        with open(output_filename, 'rb') as fd:
+            self.write_output_file(output_filename, fd, self.args.output_base_url)
+        
+        os.unlink(output_filename)
 
-        # Single pass processing with fixed-width partitioning
-        rdd = session.sparkContext.textFile(input) \
-            .map(lambda line: tuple(line.strip().split(" ", 2))) \
-            .keyBy(lambda x: x[0]) \
-            .partitionBy(num_partitions, 
-                        partitionFunc=lambda key: self.get_partition_id(key, num_partitions)) \
-            .mapPartitionsWithIndex(process_partition)
+        return index_entries
+
+    def parse_line(_, line):
+        try:
+            parts = line.split(' ', 2)
+            if len(parts) != 3:
+                return None
+            surt_key, timestamp, json_str = parts
+            return (surt_key, (timestamp, json_str))
+        except:
+            return None
+    
+    def run_job(self, session):
+        os.makedirs(self.args.output_base_url, exist_ok=True)
+        input = self.args.input_base_url + self.args.input
+        num_partitions = self.args.num_output_partitions
+        boundries_file_uri = self.args.partition_boundries_file
+
+
+        rdd = session.sparkContext.textFile(input).map(self.parse_line).filter(lambda x: x is not None)
+
+        # Cache the RDD with MEMORY_AND_DISK storage level
+        rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
+
+        boundaries = None
+        if boundries_file_uri and self.check_for_output_file(boundries_file_uri):
+            with self.fetch_file(boundries_file_uri) as f:
+                boundaries = pickle.load(f)
+        else:
+            samples = rdd.keys().sample(False, 0.1).collect()
+            samples.sort()
+            step = len(samples) // num_partitions
+            boundaries = samples[::step][:num_partitions-1]
+            
+            temp_file_name = 'temp_range_boundaries.pkl'
+            with open(temp_file_name, 'wb') as f:
+                pickle.dump(boundaries, f)
+            
+            with open(temp_file_name, 'rb') as f:
+                self.write_output_file(boundries_file_uri, f)
+
+            os.unlink(temp_file_name)
+        
+        # Process with range partitioning
+        rdd = rdd.keyBy(lambda x: x[0]) \
+            .partitionBy(num_partitions, lambda k: self.get_partition_id(k, boundaries)) \
+            .sortByKey() \
+            .values()
+        
+        rdd = rdd.mapPartitionsWithIndex(self.process_partition)
 
         # Create index
         index_schema = StructType([
@@ -122,14 +162,6 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
         index_df = index_df\
             .withColumn("output_filename", concat(lit("cdx"), col("partition_id").cast(StringType()), lit(".gz")))\
             .select("surt_key", "timestamp", "output_filename", "offset", "length", "sequence_number")
-        
-        # Save main index, sorted by surt_key for binary search
-        index_df.sort("surt_key").coalesce(1).write \
-        .option("sep", "\t").csv(
-            f"{self.args.output_base_url}/index.idx", 
-            header=False,
-            mode="overwrite"
-        )
 
         # Create secondary index for partition boundaries
         partition_bounds = index_df.groupBy("output_filename") \

From 9c0f47bdf8d36a16d887b6632d4fc9275d63ba85 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 27 Nov 2024 13:47:31 -0600
Subject: [PATCH 44/66] fix: numerous bugfixes from complex merge issues

---
 zipnumcluster-ccpyspark.py | 82 ++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 9945c75..391ceb6 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -33,12 +33,7 @@ def add_arguments(self, parser):
                             default=300,
                             help="number of partitions/shards")
 
-    def get_partition_id(self, key: str, boundaries: List[str]) -> int:
-        """Determine partition based on range boundaries"""
-        for i, boundary in enumerate(boundaries):
-            if key < boundary:
-                return i
-        return len(boundaries)
+
 
     def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str]]]) -> Iterator[Tuple[str, str, str, str, int, int, int]]:
         """Process partition with chunked compression and chunk boundary tracking"""
@@ -67,8 +62,18 @@ def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[st
                     chunk_length = len(compressed)
                     f.write(compressed)
                     
-                    for sk, ts in chunk_records:
-                        index_entries.append((sk, ts, partition_id, current_offset, chunk_length))
+                    index_entries.append((
+                        chunk_min_surt,
+                        chunk_max_surt,
+                        output_filename,
+                        partition_id,
+                        current_offset,
+                        chunk_length,
+                        len(current_chunk)
+                    ))
+                    current_offset += chunk_length
+                    current_chunk = []
+                    chunk_min_surt = None
             
             # Handle final chunk
             if current_chunk:
@@ -87,6 +92,7 @@ def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[st
                     chunk_length,
                     len(current_chunk)
                 ))
+                current_chunk = []
         
         with open(output_filename, 'rb') as fd:
             self.write_output_file(output_filename, fd, self.args.output_base_url)
@@ -115,7 +121,8 @@ def run_job(self, session):
         rdd = session.sparkContext.textFile(input).map(self.parse_line).filter(lambda x: x is not None)
 
         # Cache the RDD with MEMORY_AND_DISK storage level
-        rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
+        #rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
+        #rdd = rdd.cache()
 
         boundaries = None
         if boundries_file_uri and self.check_for_output_file(boundries_file_uri):
@@ -136,13 +143,19 @@ def run_job(self, session):
 
             os.unlink(temp_file_name)
         
-        # Process with range partitioning
-        rdd = rdd.keyBy(lambda x: x[0]) \
-            .partitionBy(num_partitions, lambda k: self.get_partition_id(k, boundaries)) \
-            .sortByKey() \
-            .values()
+        def get_partition_id(key: str) -> int:
+            """Determine partition based on range boundaries"""
+            for i, boundary in enumerate(boundaries):
+                if key < boundary:
+                    return i
+            return len(boundaries)
         
-        rdd = rdd.mapPartitionsWithIndex(self.process_partition)
+        # Process with range partitioning
+        rdd = rdd.repartitionAndSortWithinPartitions(
+            numPartitions=num_partitions,
+            partitionFunc=lambda k: get_partition_id(k),
+            keyfunc=lambda x: x[0]) \
+        .mapPartitionsWithIndex(self.process_partition)
 
         # Update schema for new index format
         index_schema = StructType([
@@ -151,29 +164,28 @@ def run_job(self, session):
             StructField("filename", StringType(), False),
             StructField("partition_id", LongType(), False),
             StructField("offset", LongType(), False),
-            StructField("length", LongType(), False)
+            StructField("length", LongType(), False),
+            StructField("chunk_record_count", LongType(), False)
         ])
         
-        w = Window.orderBy("surt_key")
-        # Create index with correct filename formatting
-        index_df = session.createDataFrame(rdd, index_schema)\
-            .withColumn("sequence_number", row_number().over(w))
-
-        index_df = index_df\
-            .withColumn("output_filename", concat(lit("cdx"), col("partition_id").cast(StringType()), lit(".gz")))\
-            .select("surt_key", "timestamp", "output_filename", "offset", "length", "sequence_number")
-
-        # Create secondary index for partition boundaries
-        partition_bounds = index_df.groupBy("output_filename") \
-            .agg({"surt_key": "min", "surt_key": "max"}) \
-            .sort("output_filename")
+        index_df = session.createDataFrame(rdd, index_schema).orderBy("min_surt")
+        
+        # Write chunk-level index
+        chunk_index_path = f"cluster.idx"
+        with open(chunk_index_path, 'w') as f:
+            seq = 1
+            for row in index_df.collect():
+                # Write min entry
+                f.write(f"{row['min_surt']}\t{row['filename']}\t{row['offset']}\t{row['length']}\t{seq}\n")
+                # Write max entry (was just for testing, we don't really need this in final index I don't think...)
+                # f.write(f"{row['max_surt']}\t{row['filename']}\t{row['offset']}\t{row['length']}\t{row['sequence_number']}\n")
+                seq += 1
+
+        with open(chunk_index_path, 'rb') as fd:
+            self.write_output_file(chunk_index_path, fd, self.args.output_base_url)
+        
+        os.unlink(chunk_index_path)
         
-        partition_bounds.coalesce(1).write \
-        .option("sep", "\t").csv(
-            f"{self.args.output_base_url}/secondary_index.idx",
-            header=False,
-            mode="overwrite"
-        )
 
 if __name__ == "__main__":
     job = ZipNumClusterCdx()

From 1d9a85781c02a8eb916201de981b76781ab22f94 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Wed, 27 Nov 2024 17:06:25 -0600
Subject: [PATCH 45/66] fix: more emr bugfixes and tweaks

---
 zipnumcluster-ccpyspark.py | 90 +++++++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 39 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 391ceb6..d83bc34 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -1,21 +1,39 @@
+import os
+import zlib
+import json
 import logging
+from typing import Iterator, Tuple, List
 from sparkcc import CCFileProcessorSparkJob
-import os
+from pyspark import StorageLevel
 from pyspark.sql.functions import row_number, concat, lit, col, min as min_, max as max_
-import gzip
-from typing import Iterator, Tuple, List
 from pyspark.sql.types import StringType, LongType, StructType, StructField
-import zlib
 from pyspark.sql.window import Window
-import random
-import pickle
-from pyspark import StorageLevel
 
 LOG = logging.getLogger('IndexWARCJob')
 
-# TODO: WE USE CCFileProcessorSparkJob here only for write_output_file, we should probably move write_output_file to CCSparkJob instead.
-# It's OK for this one, because we override the entire run_job method, but it's not ideal, because we're not really doing "file-wise" processing here...
+# note: this is LESS strict about partitioning than the original
+# based on my read of the zipnum clustering code, this shoudl be just fine
+# but so far, it's untested. I plan to test it with the index server we use (locally)
 
+# some of these functions need to be serialized by spark, so, keep them outside of the class
+# so we don't have issues with EMR serialization
+def parse_line(line):
+    try:
+        parts = line.split(' ', 2)
+        if len(parts) != 3:
+            return None
+        surt_key, timestamp, json_str = parts
+        return (surt_key, (timestamp, json_str))
+    except:
+        return None
+
+def get_partition_id(key: str, boundaries_data) -> int:
+    """Determine partition based on range boundaries"""
+    for i, boundary in enumerate(boundaries_data):
+        if key < boundary:
+            return i
+    return len(boundaries_data)
+    
 class ZipNumClusterCdx(CCFileProcessorSparkJob):
     name = 'ZipNumClusterCdx'
 
@@ -25,7 +43,7 @@ def add_arguments(self, parser):
                             default='my_cdx_bucket',
                             help="destination for output")
         parser.add_argument("--partition_boundries_file", required=False,
-                            help="Full path to a file containing partition boundaries. if specified, and does not exist, will be created, otherwise, will be used.")
+                            help="Full path to a json file containing partition boundaries. if specified, and does not exist, will be created, otherwise, will be used.")
         parser.add_argument("--num_lines", type=int, required=False,
                             default=3000,
                             help="number of lines to compress in each chunk")
@@ -62,15 +80,17 @@ def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[st
                     chunk_length = len(compressed)
                     f.write(compressed)
                     
+                    # Index entry with chunk boundaries
                     index_entries.append((
-                        chunk_min_surt,
-                        chunk_max_surt,
-                        output_filename,
+                        chunk_min_surt,  # min surt
+                        chunk_max_surt,  # max surt
+                        output_filename,  # filename
                         partition_id,
                         current_offset,
                         chunk_length,
-                        len(current_chunk)
+                        len(current_chunk)  # number of records in chunk
                     ))
+                    
                     current_offset += chunk_length
                     current_chunk = []
                     chunk_min_surt = None
@@ -100,16 +120,6 @@ def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[st
         os.unlink(output_filename)
 
         return index_entries
-
-    def parse_line(_, line):
-        try:
-            parts = line.split(' ', 2)
-            if len(parts) != 3:
-                return None
-            surt_key, timestamp, json_str = parts
-            return (surt_key, (timestamp, json_str))
-        except:
-            return None
     
     def run_job(self, session):
         os.makedirs(self.args.output_base_url, exist_ok=True)
@@ -118,45 +128,47 @@ def run_job(self, session):
         boundries_file_uri = self.args.partition_boundries_file
 
 
-        rdd = session.sparkContext.textFile(input).map(self.parse_line).filter(lambda x: x is not None)
+        rdd = session.sparkContext.textFile(input).map(parse_line).filter(lambda x: x is not None)
 
         # Cache the RDD with MEMORY_AND_DISK storage level
         #rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
         #rdd = rdd.cache()
 
         boundaries = None
+        ##logging.info(f"Boundaries file: {boundries_file_uri}")
         if boundries_file_uri and self.check_for_output_file(boundries_file_uri):
+            ##logging.info(f"Boundaries file found, using it: {boundries_file_uri}")
             with self.fetch_file(boundries_file_uri) as f:
-                boundaries = pickle.load(f)
+                boundaries = json.load(f)
         else:
+            ##logging.info(f"NO Boundaries file found, creating it: {boundries_file_uri}")
             samples = rdd.keys().sample(False, 0.1).collect()
             samples.sort()
             step = len(samples) // num_partitions
             boundaries = samples[::step][:num_partitions-1]
             
-            temp_file_name = 'temp_range_boundaries.pkl'
-            with open(temp_file_name, 'wb') as f:
-                pickle.dump(boundaries, f)
+            temp_file_name = 'temp_range_boundaries.json'
+            with open(temp_file_name, 'w') as f:
+                json.dump(boundaries, f)
             
             with open(temp_file_name, 'rb') as f:
                 self.write_output_file(boundries_file_uri, f)
 
             os.unlink(temp_file_name)
         
-        def get_partition_id(key: str) -> int:
-            """Determine partition based on range boundaries"""
-            for i, boundary in enumerate(boundaries):
-                if key < boundary:
-                    return i
-            return len(boundaries)
+        
         
         # Process with range partitioning
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
-            partitionFunc=lambda k: get_partition_id(k),
+            partitionFunc=lambda k: get_partition_id(k,boundaries),
             keyfunc=lambda x: x[0]) \
-        .mapPartitionsWithIndex(self.process_partition)
-
+        .mapPartitionsWithIndex(self.process_partition) \
+        .values()
+        
+        # EMR has issues with this...
+        # rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
+        
         # Update schema for new index format
         index_schema = StructType([
             StructField("min_surt", StringType(), False),
@@ -165,7 +177,7 @@ def get_partition_id(key: str) -> int:
             StructField("partition_id", LongType(), False),
             StructField("offset", LongType(), False),
             StructField("length", LongType(), False),
-            StructField("chunk_record_count", LongType(), False)
+            StructField("num_records", LongType(), False)
         ])
         
         index_df = session.createDataFrame(rdd, index_schema).orderBy("min_surt")

From 55f907c8c235fbc27b4d81ed2b8524cd38f05af5 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Mon, 2 Dec 2024 10:51:01 -0600
Subject: [PATCH 46/66] fix: temporary working version of zipnum job using
 normal spark features (need to refactor ccpyspark back into it now)

---
 zipnumcluster-ccpyspark.py | 193 ++++++++++++++++++++++---------------
 1 file changed, 113 insertions(+), 80 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index d83bc34..93208cf 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -8,8 +8,13 @@
 from pyspark.sql.functions import row_number, concat, lit, col, min as min_, max as max_
 from pyspark.sql.types import StringType, LongType, StructType, StructField
 from pyspark.sql.window import Window
+import boto3
+import botocore
+import re
 
 LOG = logging.getLogger('IndexWARCJob')
+data_url_pattern = re.compile('^(s3|https?|file|hdfs|s3a|s3n):(?://([^/]*))?/(.*)')
+
 
 # note: this is LESS strict about partitioning than the original
 # based on my read of the zipnum clustering code, this shoudl be just fine
@@ -33,6 +38,109 @@ def get_partition_id(key: str, boundaries_data) -> int:
         if key < boundary:
             return i
     return len(boundaries_data)
+
+def write_output_file(filename: str, fd, base_url: str):
+    uri = os.path.join(base_url, filename)
+
+    (scheme, netloc, path) = (None, None, None)
+    uri_match = data_url_pattern.match(uri)
+    if not uri_match and base_uri:
+        # relative input URI (path) and base URI defined
+        uri = base_uri + uri
+        uri_match = data_url_pattern.match(uri)
+    if uri_match:
+        (scheme, netloc, path) = uri_match.groups()
+    else:
+        # keep local file paths as is
+        path = uri
+    
+    if scheme in ['s3', 's3a', 's3n']:
+        bucketname = netloc
+        output_path = path
+        try:
+            client = boto3.client('s3', use_ssl=False)
+            client.upload_fileobj(fd, bucketname, path)
+        except botocore.client.ClientError as exception:
+            LOG.error('Failed to write to S3 {}: {}'.format(output_path, exception))
+    else:
+        LOG.info('Writing local file {}'.format(uri))
+        if scheme == 'file':
+            # must be an absolute path
+            uri = os.path.join('/', path)
+        else:
+            base_dir = os.path.abspath(os.path.dirname(__file__))
+            uri = os.path.join(base_dir, uri)
+        os.makedirs(os.path.dirname(uri), exist_ok=True)
+        with open(uri, 'wb') as f:
+            f.write(fd.read())
+
+def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str]]], 
+                     num_lines: int, output_base_url: str) -> Iterator[Tuple[str, str, str, str, int, int, int]]:
+    """Process partition with chunked compression and chunk boundary tracking"""
+    output_filename = f"cdx-{partition_id:05d}.gz"
+    index_entries = []
+    current_offset = 0
+    chunk_size = num_lines
+    
+    current_chunk = []
+    chunk_min_surt = None
+    chunk_max_surt = None
+    
+    with open(output_filename, 'wb') as f:
+        for surt_key, (timestamp, json_data) in partition_iter:
+            line = f"{surt_key} {timestamp} {json_data}\n"
+            if chunk_min_surt is None:
+                chunk_min_surt = surt_key
+            chunk_max_surt = surt_key  # Will end up as max since data is sorted
+            current_chunk.append(line)
+            
+            if len(current_chunk) >= chunk_size:
+                # Compress and write chunk
+                chunk_data = ''.join(current_chunk).encode('utf-8')
+                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+                compressed = z.compress(chunk_data) + z.flush()
+                chunk_length = len(compressed)
+                f.write(compressed)
+                
+                # Index entry with chunk boundaries
+                index_entries.append((
+                    chunk_min_surt,  # min surt
+                    chunk_max_surt,  # max surt
+                    output_filename,  # filename
+                    partition_id,
+                    current_offset,
+                    chunk_length,
+                    len(current_chunk)  # number of records in chunk
+                ))
+                
+                current_offset += chunk_length
+                current_chunk = []
+                chunk_min_surt = None
+        
+        # Handle final chunk
+        if current_chunk:
+            chunk_data = ''.join(current_chunk).encode('utf-8')
+            z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+            compressed = z.compress(chunk_data) + z.flush()
+            chunk_length = len(compressed)
+            f.write(compressed)
+            
+            index_entries.append((
+                chunk_min_surt,
+                chunk_max_surt,
+                output_filename,
+                partition_id,
+                current_offset,
+                chunk_length,
+                len(current_chunk)
+            ))
+            current_chunk = []
+    
+    with open(output_filename, 'rb') as fd:
+        write_output_file(output_filename, fd, output_base_url)
+    
+    os.unlink(output_filename)
+    return index_entries
     
 class ZipNumClusterCdx(CCFileProcessorSparkJob):
     name = 'ZipNumClusterCdx'
@@ -51,83 +159,13 @@ def add_arguments(self, parser):
                             default=300,
                             help="number of partitions/shards")
 
-
-
-    def process_partition(self, partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str]]]) -> Iterator[Tuple[str, str, str, str, int, int, int]]:
-        """Process partition with chunked compression and chunk boundary tracking"""
-        output_filename = f"cdx-{partition_id:05d}.gz"
-        index_entries = []
-        current_offset = 0
-        chunk_size = self.args.num_lines
-        
-        current_chunk = []
-        chunk_min_surt = None
-        chunk_max_surt = None
-        
-        with open(output_filename, 'wb') as f:
-            for surt_key, (timestamp, json_data) in partition_iter:
-                line = f"{surt_key} {timestamp} {json_data}\n"
-                if chunk_min_surt is None:
-                    chunk_min_surt = surt_key
-                chunk_max_surt = surt_key  # Will end up as max since data is sorted
-                current_chunk.append(line)
-                
-                if len(current_chunk) >= chunk_size:
-                    # Compress and write chunk
-                    chunk_data = ''.join(current_chunk).encode('utf-8')
-                    z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-                    compressed = z.compress(chunk_data) + z.flush()
-                    chunk_length = len(compressed)
-                    f.write(compressed)
-                    
-                    # Index entry with chunk boundaries
-                    index_entries.append((
-                        chunk_min_surt,  # min surt
-                        chunk_max_surt,  # max surt
-                        output_filename,  # filename
-                        partition_id,
-                        current_offset,
-                        chunk_length,
-                        len(current_chunk)  # number of records in chunk
-                    ))
-                    
-                    current_offset += chunk_length
-                    current_chunk = []
-                    chunk_min_surt = None
-            
-            # Handle final chunk
-            if current_chunk:
-                chunk_data = ''.join(current_chunk).encode('utf-8')
-                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-                compressed = z.compress(chunk_data) + z.flush()
-                chunk_length = len(compressed)
-                f.write(compressed)
-                
-                index_entries.append((
-                    chunk_min_surt,
-                    chunk_max_surt,
-                    output_filename,
-                    partition_id,
-                    current_offset,
-                    chunk_length,
-                    len(current_chunk)
-                ))
-                current_chunk = []
-        
-        with open(output_filename, 'rb') as fd:
-            self.write_output_file(output_filename, fd, self.args.output_base_url)
-        
-        os.unlink(output_filename)
-
-        return index_entries
-    
     def run_job(self, session):
         os.makedirs(self.args.output_base_url, exist_ok=True)
         input = self.args.input_base_url + self.args.input
         num_partitions = self.args.num_output_partitions
         boundries_file_uri = self.args.partition_boundries_file
-
-
+        num_lines = self.args.num_lines
+        output_base_url = self.args.output_base_url
         rdd = session.sparkContext.textFile(input).map(parse_line).filter(lambda x: x is not None)
 
         # Cache the RDD with MEMORY_AND_DISK storage level
@@ -156,18 +194,13 @@ def run_job(self, session):
 
             os.unlink(temp_file_name)
         
-        
-        
-        # Process with range partitioning
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
             partitionFunc=lambda k: get_partition_id(k,boundaries),
             keyfunc=lambda x: x[0]) \
-        .mapPartitionsWithIndex(self.process_partition) \
-        .values()
-        
-        # EMR has issues with this...
-        # rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
+        .mapPartitionsWithIndex(lambda idx, iter: process_partition(idx, iter, num_lines, output_base_url)) \
+        .map(lambda kv: kv[1]) \
+        .collect()
         
         # Update schema for new index format
         index_schema = StructType([

From c0c454b0ab4f630556a7e8b9920ba8f2f6c17dcf Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Tue, 3 Dec 2024 11:17:55 -0600
Subject: [PATCH 47/66] chore: going back to using CCFileProcessorSparkJob

---
 zipnumcluster-ccpyspark.py | 193 ++++++++++++++++---------------------
 1 file changed, 81 insertions(+), 112 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 93208cf..2f7dc24 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -8,13 +8,8 @@
 from pyspark.sql.functions import row_number, concat, lit, col, min as min_, max as max_
 from pyspark.sql.types import StringType, LongType, StructType, StructField
 from pyspark.sql.window import Window
-import boto3
-import botocore
-import re
 
 LOG = logging.getLogger('IndexWARCJob')
-data_url_pattern = re.compile('^(s3|https?|file|hdfs|s3a|s3n):(?://([^/]*))?/(.*)')
-
 
 # note: this is LESS strict about partitioning than the original
 # based on my read of the zipnum clustering code, this shoudl be just fine
@@ -38,109 +33,6 @@ def get_partition_id(key: str, boundaries_data) -> int:
         if key < boundary:
             return i
     return len(boundaries_data)
-
-def write_output_file(filename: str, fd, base_url: str):
-    uri = os.path.join(base_url, filename)
-
-    (scheme, netloc, path) = (None, None, None)
-    uri_match = data_url_pattern.match(uri)
-    if not uri_match and base_uri:
-        # relative input URI (path) and base URI defined
-        uri = base_uri + uri
-        uri_match = data_url_pattern.match(uri)
-    if uri_match:
-        (scheme, netloc, path) = uri_match.groups()
-    else:
-        # keep local file paths as is
-        path = uri
-    
-    if scheme in ['s3', 's3a', 's3n']:
-        bucketname = netloc
-        output_path = path
-        try:
-            client = boto3.client('s3', use_ssl=False)
-            client.upload_fileobj(fd, bucketname, path)
-        except botocore.client.ClientError as exception:
-            LOG.error('Failed to write to S3 {}: {}'.format(output_path, exception))
-    else:
-        LOG.info('Writing local file {}'.format(uri))
-        if scheme == 'file':
-            # must be an absolute path
-            uri = os.path.join('/', path)
-        else:
-            base_dir = os.path.abspath(os.path.dirname(__file__))
-            uri = os.path.join(base_dir, uri)
-        os.makedirs(os.path.dirname(uri), exist_ok=True)
-        with open(uri, 'wb') as f:
-            f.write(fd.read())
-
-def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str]]], 
-                     num_lines: int, output_base_url: str) -> Iterator[Tuple[str, str, str, str, int, int, int]]:
-    """Process partition with chunked compression and chunk boundary tracking"""
-    output_filename = f"cdx-{partition_id:05d}.gz"
-    index_entries = []
-    current_offset = 0
-    chunk_size = num_lines
-    
-    current_chunk = []
-    chunk_min_surt = None
-    chunk_max_surt = None
-    
-    with open(output_filename, 'wb') as f:
-        for surt_key, (timestamp, json_data) in partition_iter:
-            line = f"{surt_key} {timestamp} {json_data}\n"
-            if chunk_min_surt is None:
-                chunk_min_surt = surt_key
-            chunk_max_surt = surt_key  # Will end up as max since data is sorted
-            current_chunk.append(line)
-            
-            if len(current_chunk) >= chunk_size:
-                # Compress and write chunk
-                chunk_data = ''.join(current_chunk).encode('utf-8')
-                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-                compressed = z.compress(chunk_data) + z.flush()
-                chunk_length = len(compressed)
-                f.write(compressed)
-                
-                # Index entry with chunk boundaries
-                index_entries.append((
-                    chunk_min_surt,  # min surt
-                    chunk_max_surt,  # max surt
-                    output_filename,  # filename
-                    partition_id,
-                    current_offset,
-                    chunk_length,
-                    len(current_chunk)  # number of records in chunk
-                ))
-                
-                current_offset += chunk_length
-                current_chunk = []
-                chunk_min_surt = None
-        
-        # Handle final chunk
-        if current_chunk:
-            chunk_data = ''.join(current_chunk).encode('utf-8')
-            z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-            compressed = z.compress(chunk_data) + z.flush()
-            chunk_length = len(compressed)
-            f.write(compressed)
-            
-            index_entries.append((
-                chunk_min_surt,
-                chunk_max_surt,
-                output_filename,
-                partition_id,
-                current_offset,
-                chunk_length,
-                len(current_chunk)
-            ))
-            current_chunk = []
-    
-    with open(output_filename, 'rb') as fd:
-        write_output_file(output_filename, fd, output_base_url)
-    
-    os.unlink(output_filename)
-    return index_entries
     
 class ZipNumClusterCdx(CCFileProcessorSparkJob):
     name = 'ZipNumClusterCdx'
@@ -159,13 +51,84 @@ def add_arguments(self, parser):
                             default=300,
                             help="number of partitions/shards")
 
+
+
+    def process_partition(partition_id: int, chunk_size: int, output_base_url: str,  write_output_file, partition_iter: Iterator[Tuple[str, Tuple[str, str]]]) -> Iterator[Tuple[str, str, str, str, int, int, int]]:
+        """Process partition with chunked compression and chunk boundary tracking"""
+        output_filename = f"cdx-{partition_id:05d}.gz"
+        index_entries = []
+        current_offset = 0
+        
+        current_chunk = []
+        chunk_min_surt = None
+        chunk_max_surt = None
+        
+        with open(output_filename, 'wb') as f:
+            for surt_key, (timestamp, json_data) in partition_iter:
+                line = f"{surt_key} {timestamp} {json_data}\n"
+                if chunk_min_surt is None:
+                    chunk_min_surt = surt_key
+                chunk_max_surt = surt_key  # Will end up as max since data is sorted
+                current_chunk.append(line)
+                
+                if len(current_chunk) >= chunk_size:
+                    # Compress and write chunk
+                    chunk_data = ''.join(current_chunk).encode('utf-8')
+                    z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+                    compressed = z.compress(chunk_data) + z.flush()
+                    chunk_length = len(compressed)
+                    f.write(compressed)
+                    
+                    # Index entry with chunk boundaries
+                    index_entries.append((
+                        chunk_min_surt,  # min surt
+                        chunk_max_surt,  # max surt
+                        output_filename,  # filename
+                        partition_id,
+                        current_offset,
+                        chunk_length,
+                        len(current_chunk)  # number of records in chunk
+                    ))
+                    
+                    current_offset += chunk_length
+                    current_chunk = []
+                    chunk_min_surt = None
+            
+            # Handle final chunk
+            if current_chunk:
+                chunk_data = ''.join(current_chunk).encode('utf-8')
+                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+                compressed = z.compress(chunk_data) + z.flush()
+                chunk_length = len(compressed)
+                f.write(compressed)
+                
+                index_entries.append((
+                    chunk_min_surt,
+                    chunk_max_surt,
+                    output_filename,
+                    partition_id,
+                    current_offset,
+                    chunk_length,
+                    len(current_chunk)
+                ))
+                current_chunk = []
+        
+        with open(output_filename, 'rb') as fd:
+            write_output_file(output_filename, fd, output_base_url)
+        
+        os.unlink(output_filename)
+
+        return index_entries
+    
     def run_job(self, session):
         os.makedirs(self.args.output_base_url, exist_ok=True)
         input = self.args.input_base_url + self.args.input
         num_partitions = self.args.num_output_partitions
         boundries_file_uri = self.args.partition_boundries_file
+        write_output_file = self.write_output_file
         num_lines = self.args.num_lines
         output_base_url = self.args.output_base_url
+
         rdd = session.sparkContext.textFile(input).map(parse_line).filter(lambda x: x is not None)
 
         # Cache the RDD with MEMORY_AND_DISK storage level
@@ -194,13 +157,19 @@ def run_job(self, session):
 
             os.unlink(temp_file_name)
         
+
+        process_partition = lambda x: ZipNumClusterCdx.process_partition(x, num_lines, output_base_url, write_output_file)
+        
+        # Process with range partitioning
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
             partitionFunc=lambda k: get_partition_id(k,boundaries),
             keyfunc=lambda x: x[0]) \
-        .mapPartitionsWithIndex(lambda idx, iter: process_partition(idx, iter, num_lines, output_base_url)) \
-        .map(lambda kv: kv[1]) \
-        .collect()
+        .mapPartitionsWithIndex(process_partition) \
+        .values()
+        
+        # EMR has issues with this...
+        # rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
         
         # Update schema for new index format
         index_schema = StructType([
@@ -227,7 +196,7 @@ def run_job(self, session):
                 seq += 1
 
         with open(chunk_index_path, 'rb') as fd:
-            self.write_output_file(chunk_index_path, fd, self.args.output_base_url)
+            write_output_file(chunk_index_path, fd, self.args.output_base_url)
         
         os.unlink(chunk_index_path)
         

From 0f0b6d4613d01ff0309a59c588b54b4f9f680951 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Mon, 16 Dec 2024 12:59:05 -0600
Subject: [PATCH 48/66] fix: bugfixes found from integration testing

---
 zipnumcluster-ccpyspark.py | 293 ++++++++++++++++++++++---------------
 1 file changed, 175 insertions(+), 118 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 2f7dc24..334f794 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -8,8 +8,13 @@
 from pyspark.sql.functions import row_number, concat, lit, col, min as min_, max as max_
 from pyspark.sql.types import StringType, LongType, StructType, StructField
 from pyspark.sql.window import Window
+import boto3
+import botocore
+import re
 
 LOG = logging.getLogger('IndexWARCJob')
+data_url_pattern = re.compile('^(s3|https?|file|hdfs|s3a|s3n):(?://([^/]*))?/(.*)')
+
 
 # note: this is LESS strict about partitioning than the original
 # based on my read of the zipnum clustering code, this shoudl be just fine
@@ -29,11 +34,152 @@ def parse_line(line):
 
 def get_partition_id(key: str, boundaries_data) -> int:
     """Determine partition based on range boundaries"""
-    for i, boundary in enumerate(boundaries_data):
-        if key < boundary:
-            return i
-    return len(boundaries_data)
+    if not boundaries_data:
+        return 0
+    
+    # Binary search to find the right partition
+    left = 0
+    right = len(boundaries_data)
+    
+    while left < right:
+        mid = (left + right) // 2
+        if mid == len(boundaries_data):
+            return mid
+        if key <= boundaries_data[mid]:
+            right = mid
+        else:
+            left = mid + 1
+            
+    return left
+
+def write_output_file(filename: str, fd, base_uri: str = None):
+    uri = os.path.join(base_uri, filename)
+
+    (scheme, netloc, path) = (None, None, None)
+    uri_match = data_url_pattern.match(uri)
+    if not uri_match and base_uri:
+        # relative input URI (path) and base URI defined
+        uri = base_uri + uri
+        uri_match = data_url_pattern.match(uri)
+    if uri_match:
+        (scheme, netloc, path) = uri_match.groups()
+    else:
+        # keep local file paths as is
+        path = uri
+    
+    if scheme in ['s3', 's3a', 's3n']:
+        bucketname = netloc
+        output_path = path
+        try:
+            client = boto3.client('s3', use_ssl=False)
+            client.upload_fileobj(fd, bucketname, path)
+        except botocore.client.ClientError as exception:
+            LOG.error('Failed to write to S3 {}: {}'.format(output_path, exception))
+    else:
+        LOG.info('Writing local file {}'.format(uri))
+        if scheme == 'file':
+            # must be an absolute path
+            uri = os.path.join('/', path)
+        else:
+            base_dir = os.path.abspath(os.path.dirname(__file__))
+            uri = os.path.join(base_dir, uri)
+        os.makedirs(os.path.dirname(uri), exist_ok=True)
+        with open(uri, 'wb') as f:
+            f.write(fd.read())
+
+def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str]]], 
+                     num_lines: int, output_base_url: str) -> Iterator[Tuple[str, str, str, int, int, int, int]]:
+    """Process partition with chunked compression and chunk boundary tracking"""
+    output_filename = f"cdx-{partition_id:05d}.gz"
+    index_entries = []
+    current_offset = 0
+    chunk_size = num_lines
+    
+    current_chunk = []
+    chunk_min_surt = None
+    chunk_max_surt = None
+    
+    with open(output_filename, 'wb') as f:
+        for surt_key, (timestamp, json_data) in partition_iter:
+            line = f"{surt_key} {timestamp} {json_data}\n"
+            if chunk_min_surt is None:
+                chunk_min_surt = surt_key
+            chunk_max_surt = surt_key  # Will end up as max since data is sorted
+            current_chunk.append(line)
+            
+            if len(current_chunk) >= chunk_size:
+                # Compress and write chunk
+                chunk_data = ''.join(current_chunk).encode('utf-8')
+                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+                compressed = z.compress(chunk_data) + z.flush()
+                chunk_length = len(compressed)
+                f.write(compressed)
+                
+                # Index entry with chunk boundaries
+                index_entries.append((
+                    str(chunk_min_surt),  # min surt
+                    str(chunk_max_surt),  # max surt
+                    str(output_filename),  # filename
+                    int(partition_id),     # explicit integer conversion
+                    int(current_offset),   # explicit integer conversion
+                    int(chunk_length),     # explicit integer conversion
+                    int(len(current_chunk))  # number of records in chunk
+                ))
+                
+                current_offset += chunk_length
+                current_chunk = []
+                chunk_min_surt = None
+        
+        # Handle final chunk
+        if current_chunk:
+            chunk_data = ''.join(current_chunk).encode('utf-8')
+            z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+            compressed = z.compress(chunk_data) + z.flush()
+            chunk_length = len(compressed)
+            f.write(compressed)
+            
+            index_entries.append((
+                str(chunk_min_surt),  # min surt
+                str(chunk_max_surt),  # max surt
+                str(output_filename),
+                int(partition_id),
+                int(current_offset),
+                int(chunk_length),
+                int(len(current_chunk))
+            ))
+            current_chunk = []
+    
+    with open(output_filename, 'rb') as fd:
+        write_output_file(output_filename, fd, output_base_url)
+    
+    os.unlink(output_filename)
+
+
+    final_files = write_partition_with_global_seq(partition_id, index_entries, num_lines, output_base_url)
+
+    return final_files
+
+def write_partition_with_global_seq(idx, partition_iter, records_per_partition=None, output_base_url=None):
+    partition_idx_file = f"idx-{idx:05d}.idx"
+    
+    # Calculate starting sequence number for this partition
+    start_seq = (idx * records_per_partition) + 1 if records_per_partition else 1
+    
+    with open(partition_idx_file, 'w') as f:
+        seq = start_seq
+        for record in partition_iter:
+            min_surt, _, filename, _, offset, length, _ = record
+            timestamp = "20240522010826" # TODO: what timestamp should this be????
+            f.write(f"{min_surt} {timestamp}\t{filename}\t{offset}\t{length}\t{seq}\n")
+            seq += 1
+    
+    with open(partition_idx_file, 'rb') as fd:
+        write_output_file(partition_idx_file, fd, output_base_url)
     
+    os.unlink(partition_idx_file)
+
+    return [(partition_idx_file,True)]
+
 class ZipNumClusterCdx(CCFileProcessorSparkJob):
     name = 'ZipNumClusterCdx'
 
@@ -51,84 +197,13 @@ def add_arguments(self, parser):
                             default=300,
                             help="number of partitions/shards")
 
-
-
-    def process_partition(partition_id: int, chunk_size: int, output_base_url: str,  write_output_file, partition_iter: Iterator[Tuple[str, Tuple[str, str]]]) -> Iterator[Tuple[str, str, str, str, int, int, int]]:
-        """Process partition with chunked compression and chunk boundary tracking"""
-        output_filename = f"cdx-{partition_id:05d}.gz"
-        index_entries = []
-        current_offset = 0
-        
-        current_chunk = []
-        chunk_min_surt = None
-        chunk_max_surt = None
-        
-        with open(output_filename, 'wb') as f:
-            for surt_key, (timestamp, json_data) in partition_iter:
-                line = f"{surt_key} {timestamp} {json_data}\n"
-                if chunk_min_surt is None:
-                    chunk_min_surt = surt_key
-                chunk_max_surt = surt_key  # Will end up as max since data is sorted
-                current_chunk.append(line)
-                
-                if len(current_chunk) >= chunk_size:
-                    # Compress and write chunk
-                    chunk_data = ''.join(current_chunk).encode('utf-8')
-                    z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-                    compressed = z.compress(chunk_data) + z.flush()
-                    chunk_length = len(compressed)
-                    f.write(compressed)
-                    
-                    # Index entry with chunk boundaries
-                    index_entries.append((
-                        chunk_min_surt,  # min surt
-                        chunk_max_surt,  # max surt
-                        output_filename,  # filename
-                        partition_id,
-                        current_offset,
-                        chunk_length,
-                        len(current_chunk)  # number of records in chunk
-                    ))
-                    
-                    current_offset += chunk_length
-                    current_chunk = []
-                    chunk_min_surt = None
-            
-            # Handle final chunk
-            if current_chunk:
-                chunk_data = ''.join(current_chunk).encode('utf-8')
-                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-                compressed = z.compress(chunk_data) + z.flush()
-                chunk_length = len(compressed)
-                f.write(compressed)
-                
-                index_entries.append((
-                    chunk_min_surt,
-                    chunk_max_surt,
-                    output_filename,
-                    partition_id,
-                    current_offset,
-                    chunk_length,
-                    len(current_chunk)
-                ))
-                current_chunk = []
-        
-        with open(output_filename, 'rb') as fd:
-            write_output_file(output_filename, fd, output_base_url)
-        
-        os.unlink(output_filename)
-
-        return index_entries
-    
     def run_job(self, session):
         os.makedirs(self.args.output_base_url, exist_ok=True)
         input = self.args.input_base_url + self.args.input
         num_partitions = self.args.num_output_partitions
         boundries_file_uri = self.args.partition_boundries_file
-        write_output_file = self.write_output_file
         num_lines = self.args.num_lines
         output_base_url = self.args.output_base_url
-
         rdd = session.sparkContext.textFile(input).map(parse_line).filter(lambda x: x is not None)
 
         # Cache the RDD with MEMORY_AND_DISK storage level
@@ -142,11 +217,18 @@ def run_job(self, session):
             with self.fetch_file(boundries_file_uri) as f:
                 boundaries = json.load(f)
         else:
-            ##logging.info(f"NO Boundaries file found, creating it: {boundries_file_uri}")
-            samples = rdd.keys().sample(False, 0.1).collect()
+            # this percent needs to be pretty small, since this collect brings data back to driver...
+            # 1/2 percent should be fine
+            samples = rdd.keys().sample(False, 0.005).collect()
             samples.sort()
-            step = len(samples) // num_partitions
-            boundaries = samples[::step][:num_partitions-1]
+            
+            # Ensure more even distribution by using quantiles
+            total_samples = len(samples)
+            boundaries = []
+            for i in range(1, num_partitions):
+                idx = (i * total_samples) // num_partitions
+                if idx < len(samples):
+                    boundaries.append(samples[idx])
             
             temp_file_name = 'temp_range_boundaries.json'
             with open(temp_file_name, 'w') as f:
@@ -157,50 +239,25 @@ def run_job(self, session):
 
             os.unlink(temp_file_name)
         
-
-        process_partition = lambda x: ZipNumClusterCdx.process_partition(x, num_lines, output_base_url, write_output_file)
-        
-        # Process with range partitioning
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
             partitionFunc=lambda k: get_partition_id(k,boundaries),
             keyfunc=lambda x: x[0]) \
-        .mapPartitionsWithIndex(process_partition) \
-        .values()
-        
-        # EMR has issues with this...
-        # rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
-        
-        # Update schema for new index format
-        index_schema = StructType([
-            StructField("min_surt", StringType(), False),
-            StructField("max_surt", StringType(), False),
-            StructField("filename", StringType(), False),
-            StructField("partition_id", LongType(), False),
-            StructField("offset", LongType(), False),
-            StructField("length", LongType(), False),
-            StructField("num_records", LongType(), False)
-        ])
-        
-        index_df = session.createDataFrame(rdd, index_schema).orderBy("min_surt")
-        
-        # Write chunk-level index
-        chunk_index_path = f"cluster.idx"
-        with open(chunk_index_path, 'w') as f:
-            seq = 1
-            for row in index_df.collect():
-                # Write min entry
-                f.write(f"{row['min_surt']}\t{row['filename']}\t{row['offset']}\t{row['length']}\t{seq}\n")
-                # Write max entry (was just for testing, we don't really need this in final index I don't think...)
-                # f.write(f"{row['max_surt']}\t{row['filename']}\t{row['offset']}\t{row['length']}\t{row['sequence_number']}\n")
-                seq += 1
-
-        with open(chunk_index_path, 'rb') as fd:
-            write_output_file(chunk_index_path, fd, self.args.output_base_url)
-        
-        os.unlink(chunk_index_path)
-        
+        .mapPartitionsWithIndex(lambda idx, iter: process_partition(idx, iter, num_lines, output_base_url)) \
+        .collect()
+    
+        # loop over the output files and concatenate them into a single final file
+        with open('cluster.idx', 'wb') as f:
+            for idx_file,_ in rdd:
+                with self.fetch_file(output_base_url + idx_file) as idx_fd:
+                    for line in idx_fd:
+                        f.write(line)
 
+        with open('cluster.idx', 'rb') as f:
+            self.write_output_file('cluster.idx', f, output_base_url)
+
+        os.unlink('cluster.idx')
+        
 if __name__ == "__main__":
     job = ZipNumClusterCdx()
     job.run()
\ No newline at end of file

From 30b3ce7bb93ea59ffe6682f794c85bc332bd501b Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Fri, 20 Dec 2024 16:27:51 -0600
Subject: [PATCH 49/66] fix: spelling of boundaries

---
 zipnumcluster-ccpyspark.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 334f794..0a3ee58 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -188,7 +188,7 @@ def add_arguments(self, parser):
         parser.add_argument("--output_base_url", required=False,
                             default='my_cdx_bucket',
                             help="destination for output")
-        parser.add_argument("--partition_boundries_file", required=False,
+        parser.add_argument("--partition_boundaries_file", required=False,
                             help="Full path to a json file containing partition boundaries. if specified, and does not exist, will be created, otherwise, will be used.")
         parser.add_argument("--num_lines", type=int, required=False,
                             default=3000,
@@ -201,7 +201,7 @@ def run_job(self, session):
         os.makedirs(self.args.output_base_url, exist_ok=True)
         input = self.args.input_base_url + self.args.input
         num_partitions = self.args.num_output_partitions
-        boundries_file_uri = self.args.partition_boundries_file
+        boundaries_file_uri = self.args.partition_boundaries_file
         num_lines = self.args.num_lines
         output_base_url = self.args.output_base_url
         rdd = session.sparkContext.textFile(input).map(parse_line).filter(lambda x: x is not None)
@@ -211,10 +211,10 @@ def run_job(self, session):
         #rdd = rdd.cache()
 
         boundaries = None
-        ##logging.info(f"Boundaries file: {boundries_file_uri}")
-        if boundries_file_uri and self.check_for_output_file(boundries_file_uri):
-            ##logging.info(f"Boundaries file found, using it: {boundries_file_uri}")
-            with self.fetch_file(boundries_file_uri) as f:
+        ##logging.info(f"Boundaries file: {boundaries_file_uri}")
+        if boundaries_file_uri and self.check_for_output_file(boundaries_file_uri):
+            ##logging.info(f"Boundaries file found, using it: {boundaries_file_uri}")
+            with self.fetch_file(boundaries_file_uri) as f:
                 boundaries = json.load(f)
         else:
             # this percent needs to be pretty small, since this collect brings data back to driver...
@@ -235,7 +235,7 @@ def run_job(self, session):
                 json.dump(boundaries, f)
             
             with open(temp_file_name, 'rb') as f:
-                self.write_output_file(boundries_file_uri, f)
+                self.write_output_file(boundaries_file_uri, f)
 
             os.unlink(temp_file_name)
         

From 91a18e7c98866b087fcbb49df90ca7d1cad072bb Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Fri, 20 Dec 2024 16:28:49 -0600
Subject: [PATCH 50/66] fix: marking args as required

---
 zipnumcluster-ccpyspark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 0a3ee58..619fe10 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -185,10 +185,10 @@ class ZipNumClusterCdx(CCFileProcessorSparkJob):
 
     def add_arguments(self, parser):
         super(CCFileProcessorSparkJob,self).add_arguments(parser)
-        parser.add_argument("--output_base_url", required=False,
+        parser.add_argument("--output_base_url", required=True,
                             default='my_cdx_bucket',
                             help="destination for output")
-        parser.add_argument("--partition_boundaries_file", required=False,
+        parser.add_argument("--partition_boundaries_file", required=True,
                             help="Full path to a json file containing partition boundaries. if specified, and does not exist, will be created, otherwise, will be used.")
         parser.add_argument("--num_lines", type=int, required=False,
                             default=3000,

From fcd02cfa0ce52c493b4fcd7b98fbc98d8a3be232 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Fri, 20 Dec 2024 16:29:41 -0600
Subject: [PATCH 51/66] fix: remove keyfunc, it isn't needed (and was causing
 the sort to compare only the first character of each key)

---
 zipnumcluster-ccpyspark.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 619fe10..eeffdce 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -241,8 +241,7 @@ def run_job(self, session):
         
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
-            partitionFunc=lambda k: get_partition_id(k,boundaries),
-            keyfunc=lambda x: x[0]) \
+            partitionFunc=lambda k: get_partition_id(k,boundaries)) \
         .mapPartitionsWithIndex(lambda idx, iter: process_partition(idx, iter, num_lines, output_base_url)) \
         .collect()
     

From 3f60a21de0870f9595b578b25a1085e8003d0142 Mon Sep 17 00:00:00 2001
From: Jason Grey <jason@commoncrawl.org>
Date: Mon, 27 Jan 2025 16:43:04 -0600
Subject: [PATCH 52/66] chore: adding TODO for upcoming tasks

---
 zipnumcluster-ccpyspark.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index eeffdce..0901f3d 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -251,11 +251,16 @@ def run_job(self, session):
                 with self.fetch_file(output_base_url + idx_file) as idx_fd:
                     for line in idx_fd:
                         f.write(line)
+                    # TODO: remove the idx file...
 
         with open('cluster.idx', 'rb') as f:
             self.write_output_file('cluster.idx', f, output_base_url)
 
         os.unlink('cluster.idx')
+
+        # These todo's will remove most of the need for any post processing...
+        # TODO: create metadata.yml andput it to output_base_url
+        # TODO: remove the "*.idx" files from the output_base_url
         
 if __name__ == "__main__":
     job = ZipNumClusterCdx()

From a1f85534dbaa66855cb6c997353823052e986610 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Thu, 13 Nov 2025 22:21:51 +0100
Subject: [PATCH 53/66] fix: code formatting and comments

---
 indexwarcsjob-ccpyspark.py |  6 +-----
 zipnumcluster-ccpyspark.py | 29 +++++++++++++++++++----------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/indexwarcsjob-ccpyspark.py b/indexwarcsjob-ccpyspark.py
index 91e1f72..5bb89a8 100644
--- a/indexwarcsjob-ccpyspark.py
+++ b/indexwarcsjob-ccpyspark.py
@@ -36,7 +36,6 @@ def add_arguments(self, parser):
                             help="skip processing files that already have CDX")
     
     def _conv_warc_to_cdx_path(self, warc_path):
-        # set cdx path
         cdx_path = warc_path.replace('crawl-data', 'cc-index/cdx')
         cdx_path = cdx_path.replace('.warc.gz', '.cdx.gz')
         cdx_path = cdx_path.replace('.warc.wet.gz', '.wet.cdx.gz')
@@ -56,19 +55,16 @@ def process_file(self, warc_path, tempfd):
             return
 
         with TemporaryFile(mode='w+b',
-                            dir=self.args.local_temp_dir) as cdxtemp:
+                           dir=self.args.local_temp_dir) as cdxtemp:
             
             success = False
             with GzipFile(fileobj=cdxtemp, mode='w+b') as cdxfile:
-                # Index to temp
                 try:
                     write_cdx_index(cdxfile, tempfd, warc_path, **self.index_options)
                     success = True
                 except Exception as exc:
-                    # log detailed stack trace
                     LOG.error('Failed to index %s: %s', warc_path, exc)
 
-            # Upload temp
             cdxtemp.flush()
             cdxtemp.seek(0)
 
diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 0901f3d..5b73955 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -2,24 +2,31 @@
 import zlib
 import json
 import logging
+
 from typing import Iterator, Tuple, List
-from sparkcc import CCFileProcessorSparkJob
+
+import boto3
+import botocore
+import re
+
 from pyspark import StorageLevel
 from pyspark.sql.functions import row_number, concat, lit, col, min as min_, max as max_
 from pyspark.sql.types import StringType, LongType, StructType, StructField
 from pyspark.sql.window import Window
-import boto3
-import botocore
-import re
 
-LOG = logging.getLogger('IndexWARCJob')
+from sparkcc import CCFileProcessorSparkJob
+
+
+LOG = logging.getLogger('ZipNumClusterCdx')
 data_url_pattern = re.compile('^(s3|https?|file|hdfs|s3a|s3n):(?://([^/]*))?/(.*)')
 
 
+# TODO: remove comment lines after testing
 # note: this is LESS strict about partitioning than the original
-# based on my read of the zipnum clustering code, this shoudl be just fine
+# based on my read of the zipnum clustering code, this should be just fine
 # but so far, it's untested. I plan to test it with the index server we use (locally)
 
+# TODO: try to move the functions below into ZipNumClusterCdx class
 # some of these functions need to be serialized by spark, so, keep them outside of the class
 # so we don't have issues with EMR serialization
 def parse_line(line):
@@ -52,6 +59,7 @@ def get_partition_id(key: str, boundaries_data) -> int:
             
     return left
 
+# TODO: this duplicates code defined in CCFileProcessorSparkJob
 def write_output_file(filename: str, fd, base_uri: str = None):
     uri = os.path.join(base_uri, filename)
 
@@ -178,7 +186,7 @@ def write_partition_with_global_seq(idx, partition_iter, records_per_partition=N
     
     os.unlink(partition_idx_file)
 
-    return [(partition_idx_file,True)]
+    return [(partition_idx_file, True)]
 
 class ZipNumClusterCdx(CCFileProcessorSparkJob):
     name = 'ZipNumClusterCdx'
@@ -206,6 +214,7 @@ def run_job(self, session):
         output_base_url = self.args.output_base_url
         rdd = session.sparkContext.textFile(input).map(parse_line).filter(lambda x: x is not None)
 
+        # TODO
         # Cache the RDD with MEMORY_AND_DISK storage level
         #rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
         #rdd = rdd.cache()
@@ -242,12 +251,12 @@ def run_job(self, session):
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
             partitionFunc=lambda k: get_partition_id(k,boundaries)) \
-        .mapPartitionsWithIndex(lambda idx, iter: process_partition(idx, iter, num_lines, output_base_url)) \
-        .collect()
+            .mapPartitionsWithIndex(lambda idx, iter: process_partition(idx, iter, num_lines, output_base_url)) \
+            .collect()
     
         # loop over the output files and concatenate them into a single final file
         with open('cluster.idx', 'wb') as f:
-            for idx_file,_ in rdd:
+            for idx_file, _ in rdd:
                 with self.fetch_file(output_base_url + idx_file) as idx_fd:
                     for line in idx_fd:
                         f.write(line)

From 014d702fe2e0c841f4801007188924670e3a6c80 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Fri, 14 Nov 2025 15:34:57 +0100
Subject: [PATCH 54/66] fix: sort by SURT key *and* capture timestamp

---
 zipnumcluster-ccpyspark.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster-ccpyspark.py
index 5b73955..7e73926 100644
--- a/zipnumcluster-ccpyspark.py
+++ b/zipnumcluster-ccpyspark.py
@@ -35,7 +35,7 @@ def parse_line(line):
         if len(parts) != 3:
             return None
         surt_key, timestamp, json_str = parts
-        return (surt_key, (timestamp, json_str))
+        return ((surt_key, timestamp), json_str)
     except:
         return None
 
@@ -108,7 +108,7 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
     chunk_max_surt = None
     
     with open(output_filename, 'wb') as f:
-        for surt_key, (timestamp, json_data) in partition_iter:
+        for (surt_key, timestamp), json_data in partition_iter:
             line = f"{surt_key} {timestamp} {json_data}\n"
             if chunk_min_surt is None:
                 chunk_min_surt = surt_key
@@ -250,7 +250,7 @@ def run_job(self, session):
         
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
-            partitionFunc=lambda k: get_partition_id(k,boundaries)) \
+            partitionFunc=lambda k: get_partition_id(k, boundaries)) \
             .mapPartitionsWithIndex(lambda idx, iter: process_partition(idx, iter, num_lines, output_base_url)) \
             .collect()
     

From 0ce27c8e18112039d69f64fc1fab82fb556dc931 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Fri, 14 Nov 2025 15:45:34 +0100
Subject: [PATCH 55/66] refactor: rename Python files to valid module names

---
 indexwarcsjob-ccpyspark.py => indexwarcs_cc_pyspark.py    | 1 -
 zipnumcluster-ccpyspark.py => zipnumcluster_cc_pyspark.py | 0
 2 files changed, 1 deletion(-)
 rename indexwarcsjob-ccpyspark.py => indexwarcs_cc_pyspark.py (97%)
 rename zipnumcluster-ccpyspark.py => zipnumcluster_cc_pyspark.py (100%)

diff --git a/indexwarcsjob-ccpyspark.py b/indexwarcs_cc_pyspark.py
similarity index 97%
rename from indexwarcsjob-ccpyspark.py
rename to indexwarcs_cc_pyspark.py
index 5bb89a8..a17ca60 100644
--- a/indexwarcsjob-ccpyspark.py
+++ b/indexwarcs_cc_pyspark.py
@@ -9,7 +9,6 @@
 LOG = logging.getLogger('IndexWARCJob')
 
 
-#=============================================================================
 class IndexWARCJob(CCFileProcessorSparkJob):
     """ This job receives as input a manifest of WARC/ARC files and produces
     a CDX index per file
diff --git a/zipnumcluster-ccpyspark.py b/zipnumcluster_cc_pyspark.py
similarity index 100%
rename from zipnumcluster-ccpyspark.py
rename to zipnumcluster_cc_pyspark.py

From 7779a47885bc1e92a211e1974ee63b60be90c608 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 17 Nov 2025 14:01:29 +0100
Subject: [PATCH 56/66] refactor: move functions out of global scope

- make all functions in the global scope static methods
  of the class ZipNumClusterCdx. Static methods are
  required by Spark serialization if methods are used
  in RDD methods (map, partition, etc.)

fix: make methods requiring serialization static
---
 zipnumcluster_cc_pyspark.py | 340 +++++++++++++++++++-----------------
 1 file changed, 175 insertions(+), 165 deletions(-)

diff --git a/zipnumcluster_cc_pyspark.py b/zipnumcluster_cc_pyspark.py
index 7e73926..61ae2ea 100644
--- a/zipnumcluster_cc_pyspark.py
+++ b/zipnumcluster_cc_pyspark.py
@@ -26,167 +26,6 @@
 # based on my read of the zipnum clustering code, this should be just fine
 # but so far, it's untested. I plan to test it with the index server we use (locally)
 
-# TODO: try to move the functions below into ZipNumClusterCdx class
-# some of these functions need to be serialized by spark, so, keep them outside of the class
-# so we don't have issues with EMR serialization
-def parse_line(line):
-    try:
-        parts = line.split(' ', 2)
-        if len(parts) != 3:
-            return None
-        surt_key, timestamp, json_str = parts
-        return ((surt_key, timestamp), json_str)
-    except:
-        return None
-
-def get_partition_id(key: str, boundaries_data) -> int:
-    """Determine partition based on range boundaries"""
-    if not boundaries_data:
-        return 0
-    
-    # Binary search to find the right partition
-    left = 0
-    right = len(boundaries_data)
-    
-    while left < right:
-        mid = (left + right) // 2
-        if mid == len(boundaries_data):
-            return mid
-        if key <= boundaries_data[mid]:
-            right = mid
-        else:
-            left = mid + 1
-            
-    return left
-
-# TODO: this duplicates code defined in CCFileProcessorSparkJob
-def write_output_file(filename: str, fd, base_uri: str = None):
-    uri = os.path.join(base_uri, filename)
-
-    (scheme, netloc, path) = (None, None, None)
-    uri_match = data_url_pattern.match(uri)
-    if not uri_match and base_uri:
-        # relative input URI (path) and base URI defined
-        uri = base_uri + uri
-        uri_match = data_url_pattern.match(uri)
-    if uri_match:
-        (scheme, netloc, path) = uri_match.groups()
-    else:
-        # keep local file paths as is
-        path = uri
-    
-    if scheme in ['s3', 's3a', 's3n']:
-        bucketname = netloc
-        output_path = path
-        try:
-            client = boto3.client('s3', use_ssl=False)
-            client.upload_fileobj(fd, bucketname, path)
-        except botocore.client.ClientError as exception:
-            LOG.error('Failed to write to S3 {}: {}'.format(output_path, exception))
-    else:
-        LOG.info('Writing local file {}'.format(uri))
-        if scheme == 'file':
-            # must be an absolute path
-            uri = os.path.join('/', path)
-        else:
-            base_dir = os.path.abspath(os.path.dirname(__file__))
-            uri = os.path.join(base_dir, uri)
-        os.makedirs(os.path.dirname(uri), exist_ok=True)
-        with open(uri, 'wb') as f:
-            f.write(fd.read())
-
-def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str]]], 
-                     num_lines: int, output_base_url: str) -> Iterator[Tuple[str, str, str, int, int, int, int]]:
-    """Process partition with chunked compression and chunk boundary tracking"""
-    output_filename = f"cdx-{partition_id:05d}.gz"
-    index_entries = []
-    current_offset = 0
-    chunk_size = num_lines
-    
-    current_chunk = []
-    chunk_min_surt = None
-    chunk_max_surt = None
-    
-    with open(output_filename, 'wb') as f:
-        for (surt_key, timestamp), json_data in partition_iter:
-            line = f"{surt_key} {timestamp} {json_data}\n"
-            if chunk_min_surt is None:
-                chunk_min_surt = surt_key
-            chunk_max_surt = surt_key  # Will end up as max since data is sorted
-            current_chunk.append(line)
-            
-            if len(current_chunk) >= chunk_size:
-                # Compress and write chunk
-                chunk_data = ''.join(current_chunk).encode('utf-8')
-                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-                compressed = z.compress(chunk_data) + z.flush()
-                chunk_length = len(compressed)
-                f.write(compressed)
-                
-                # Index entry with chunk boundaries
-                index_entries.append((
-                    str(chunk_min_surt),  # min surt
-                    str(chunk_max_surt),  # max surt
-                    str(output_filename),  # filename
-                    int(partition_id),     # explicit integer conversion
-                    int(current_offset),   # explicit integer conversion
-                    int(chunk_length),     # explicit integer conversion
-                    int(len(current_chunk))  # number of records in chunk
-                ))
-                
-                current_offset += chunk_length
-                current_chunk = []
-                chunk_min_surt = None
-        
-        # Handle final chunk
-        if current_chunk:
-            chunk_data = ''.join(current_chunk).encode('utf-8')
-            z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-            compressed = z.compress(chunk_data) + z.flush()
-            chunk_length = len(compressed)
-            f.write(compressed)
-            
-            index_entries.append((
-                str(chunk_min_surt),  # min surt
-                str(chunk_max_surt),  # max surt
-                str(output_filename),
-                int(partition_id),
-                int(current_offset),
-                int(chunk_length),
-                int(len(current_chunk))
-            ))
-            current_chunk = []
-    
-    with open(output_filename, 'rb') as fd:
-        write_output_file(output_filename, fd, output_base_url)
-    
-    os.unlink(output_filename)
-
-
-    final_files = write_partition_with_global_seq(partition_id, index_entries, num_lines, output_base_url)
-
-    return final_files
-
-def write_partition_with_global_seq(idx, partition_iter, records_per_partition=None, output_base_url=None):
-    partition_idx_file = f"idx-{idx:05d}.idx"
-    
-    # Calculate starting sequence number for this partition
-    start_seq = (idx * records_per_partition) + 1 if records_per_partition else 1
-    
-    with open(partition_idx_file, 'w') as f:
-        seq = start_seq
-        for record in partition_iter:
-            min_surt, _, filename, _, offset, length, _ = record
-            timestamp = "20240522010826" # TODO: what timestamp should this be????
-            f.write(f"{min_surt} {timestamp}\t{filename}\t{offset}\t{length}\t{seq}\n")
-            seq += 1
-    
-    with open(partition_idx_file, 'rb') as fd:
-        write_output_file(partition_idx_file, fd, output_base_url)
-    
-    os.unlink(partition_idx_file)
-
-    return [(partition_idx_file, True)]
 
 class ZipNumClusterCdx(CCFileProcessorSparkJob):
     name = 'ZipNumClusterCdx'
@@ -205,6 +44,176 @@ def add_arguments(self, parser):
                             default=300,
                             help="number of partitions/shards")
 
+    @staticmethod
+    def parse_line(line):
+        """Split a CDX line "<SURT key> <timestamp> <JSON>" at the first two
+        spaces into ((surt_key, timestamp), json_str), None if malformed."""
+        try:
+            surt_key, timestamp, json_str = line.split(' ', 2)
+        except (AttributeError, TypeError, ValueError):
+            # not a str, or less than three space-separated fields
+            return None
+        return ((surt_key, timestamp), json_str)
+
+    @staticmethod
+    def get_partition_id(key: str, boundaries_data) -> int:
+        """Determine partition based on range boundaries
+
+        boundaries_data is expected to be the sorted list of (inclusive)
+        upper partition boundaries. Returns the index of the first boundary
+        which is greater than or equal to key, or len(boundaries_data) if
+        key sorts after all boundaries. Without boundaries, everything goes
+        into partition 0.
+        """
+        # function-scope import: bisect is not imported at module level
+        from bisect import bisect_left
+
+        if not boundaries_data:
+            return 0
+
+        # bisect_left(a, x) is the first index i with x <= a[i], exactly what
+        # the former hand-written binary search computed (its extra check
+        # "mid == len(boundaries_data)" could never be true)
+        return bisect_left(boundaries_data, key)
+
+    @staticmethod
+    def write_output_file(uri, fd, base_uri=None):
+        """
+        Write data from stream fd to output file location defined per URI.
+        A static variant of CCFileProcessorSparkJob.write_output_file(...)
+
+        uri: absolute URI (s3/s3a/s3n/file) or a path; a path not matching
+             DATA_URL_PATTERN is prefixed with base_uri (if given) or else
+             written to the local file system.
+        fd: readable binary stream providing the content to write.
+        Raises ValueError for http(s) and NotImplementedError for hdfs.
+        """
+        uri_match = ZipNumClusterCdx.DATA_URL_PATTERN.match(uri)
+        if not uri_match and base_uri:
+            # relative input URI (path) and base URI defined
+            uri = base_uri + uri
+            uri_match = ZipNumClusterCdx.DATA_URL_PATTERN.match(uri)
+        # keep local file paths as is (scheme is None then, not unbound)
+        (scheme, netloc, path) = uri_match.groups() if uri_match else (None, None, uri)
+
+        if scheme in ['s3', 's3a', 's3n']:
+            try:
+                boto3.client('s3').upload_fileobj(fd, netloc, path)
+            except botocore.client.ClientError as exception:
+                ZipNumClusterCdx.LOG.error('Failed to write to S3 {}: {}'.format(path, exception))
+
+        elif scheme == 'http' or scheme == 'https':
+            raise ValueError('HTTP/HTTPS output not supported')
+
+        elif scheme == 'hdfs':
+            raise NotImplementedError('HDFS output not implemented')
+
+        else:
+            ZipNumClusterCdx.LOG.info('Writing local file {}'.format(uri))
+            if scheme == 'file':
+                # must be an absolute path
+                uri = os.path.join('/', path)
+            else:
+                base_dir = os.path.abspath(os.path.dirname(__file__))
+                uri = os.path.join(base_dir, uri)
+            os.makedirs(os.path.dirname(uri), exist_ok=True)
+            with open(uri, 'wb') as f:
+                f.write(fd.read())
+
+    @staticmethod
+    def write_partition_with_global_seq(idx, partition_iter, records_per_partition=None, output_base_url=None):
+        """Write the secondary index "idx-NNNNN.idx" for partition idx (one line per
+        chunk record in partition_iter), pass it to write_output_file(), delete
+        the local copy and return [(file name, True)]."""
+        idx_name = f"idx-{idx:05d}.idx"
+        # TODO: what timestamp should this be????
+        timestamp = "20240522010826"
+        # first sequence number of this partition; numbering starts at 1
+        first_seq = 1
+        if records_per_partition:
+            first_seq = (idx * records_per_partition) + 1
+
+        with open(idx_name, 'w') as out:
+            for seq, (min_surt, _, filename, _, offset, length, _) in enumerate(partition_iter, first_seq):
+                out.write(f"{min_surt} {timestamp}\t{filename}\t{offset}\t{length}\t{seq}\n")
+
+        with open(idx_name, 'rb') as fd:
+            ZipNumClusterCdx.write_output_file(idx_name, fd, output_base_url)
+        os.unlink(idx_name)
+        return [(idx_name, True)]
+
+    @staticmethod
+    def process_partition(partition_id: int, partition_iter: Iterator[Tuple[Tuple[str, str], str]],
+                          num_lines: int, output_base_url: str) -> list:
+        """Write partition as gzip chunks of num_lines records plus its chunk index, return [(idx file, True)]"""
+        output_filename = f"cdx-{partition_id:05d}.gz"
+        index_entries = []
+        current_offset = 0
+        chunk_size = num_lines
+
+        current_chunk = []
+        chunk_min_surt = None
+        chunk_max_surt = None
+
+        with open(output_filename, 'wb') as f:
+            for (surt_key, timestamp), json_data in partition_iter:
+                line = f"{surt_key} {timestamp} {json_data}\n"
+                if chunk_min_surt is None:
+                    chunk_min_surt = surt_key
+                chunk_max_surt = surt_key  # Will end up as max since data is sorted
+                current_chunk.append(line)
+
+                if len(current_chunk) >= chunk_size:
+                    # Compress and write chunk
+                    chunk_data = ''.join(current_chunk).encode('utf-8')
+                    z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+                    compressed = z.compress(chunk_data) + z.flush()
+                    chunk_length = len(compressed)
+                    f.write(compressed)
+
+                    # Index entry with chunk boundaries
+                    index_entries.append((
+                        str(chunk_min_surt),  # min surt
+                        str(chunk_max_surt),  # max surt
+                        str(output_filename),  # filename
+                        int(partition_id),     # explicit integer conversion
+                        int(current_offset),   # explicit integer conversion
+                        int(chunk_length),     # explicit integer conversion
+                        int(len(current_chunk))  # number of records in chunk
+                    ))
+
+                    current_offset += chunk_length
+                    current_chunk = []
+                    chunk_min_surt = None
+
+            # Handle final chunk
+            if current_chunk:
+                chunk_data = ''.join(current_chunk).encode('utf-8')
+                z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
+                compressed = z.compress(chunk_data) + z.flush()
+                chunk_length = len(compressed)
+                f.write(compressed)
+
+                index_entries.append((
+                    str(chunk_min_surt),  # min surt
+                    str(chunk_max_surt),  # max surt
+                    str(output_filename),
+                    int(partition_id),
+                    int(current_offset),
+                    int(chunk_length),
+                    int(len(current_chunk))
+                ))
+                current_chunk = []
+
+        with open(output_filename, 'rb') as fd:
+            ZipNumClusterCdx.write_output_file(output_filename, fd, output_base_url)
+
+        os.unlink(output_filename)
+
+        final_files = ZipNumClusterCdx.write_partition_with_global_seq(partition_id, index_entries, num_lines, output_base_url)
+
+        return final_files
+
     def run_job(self, session):
         os.makedirs(self.args.output_base_url, exist_ok=True)
         input = self.args.input_base_url + self.args.input
@@ -212,7 +221,7 @@ def run_job(self, session):
         boundaries_file_uri = self.args.partition_boundaries_file
         num_lines = self.args.num_lines
         output_base_url = self.args.output_base_url
-        rdd = session.sparkContext.textFile(input).map(parse_line).filter(lambda x: x is not None)
+        rdd = session.sparkContext.textFile(input).map(self.parse_line).filter(lambda x: x is not None)
 
         # TODO
         # Cache the RDD with MEMORY_AND_DISK storage level
@@ -250,8 +259,9 @@ def run_job(self, session):
         
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
-            partitionFunc=lambda k: get_partition_id(k, boundaries)) \
-            .mapPartitionsWithIndex(lambda idx, iter: process_partition(idx, iter, num_lines, output_base_url)) \
+            partitionFunc=lambda k: ZipNumClusterCdx.get_partition_id(k, boundaries)) \
+            .mapPartitionsWithIndex(
+                lambda idx, iter: ZipNumClusterCdx.process_partition(idx, iter, num_lines, output_base_url)) \
             .collect()
     
         # loop over the output files and concatenate them into a single final file
@@ -268,7 +278,7 @@ def run_job(self, session):
         os.unlink('cluster.idx')
 
         # These todo's will remove most of the need for any post processing...
-        # TODO: create metadata.yml andput it to output_base_url
+        # TODO: create metadata.yml and put it to output_base_url
         # TODO: remove the "*.idx" files from the output_base_url
         
 if __name__ == "__main__":

From 64a2e11a0ca244506c6535e0ffe13e00fefacb57 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 17 Nov 2025 14:20:55 +0100
Subject: [PATCH 57/66] fix: do not create output directory tree in local
 filesystem

---
 zipnumcluster_cc_pyspark.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/zipnumcluster_cc_pyspark.py b/zipnumcluster_cc_pyspark.py
index 61ae2ea..894fe41 100644
--- a/zipnumcluster_cc_pyspark.py
+++ b/zipnumcluster_cc_pyspark.py
@@ -215,7 +215,6 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
         return final_files
 
     def run_job(self, session):
-        os.makedirs(self.args.output_base_url, exist_ok=True)
         input = self.args.input_base_url + self.args.input
         num_partitions = self.args.num_output_partitions
         boundaries_file_uri = self.args.partition_boundaries_file

From 136f1ae0b77a18889c89e64070de5a4e96ffc97e Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 17 Nov 2025 15:02:16 +0100
Subject: [PATCH 58/66] fix: type error when using partition boundary file
 passed per --partition_boundaries_file

---
 zipnumcluster_cc_pyspark.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/zipnumcluster_cc_pyspark.py b/zipnumcluster_cc_pyspark.py
index 894fe41..5eac01c 100644
--- a/zipnumcluster_cc_pyspark.py
+++ b/zipnumcluster_cc_pyspark.py
@@ -36,7 +36,8 @@ def add_arguments(self, parser):
                             default='my_cdx_bucket',
                             help="destination for output")
         parser.add_argument("--partition_boundaries_file", required=True,
-                            help="Full path to a json file containing partition boundaries. if specified, and does not exist, will be created, otherwise, will be used.")
+                            help="Full path to a JSON file containing partition boundaries. "
+                            "If specified, and does not exist, will be created, otherwise, will be used.")
         parser.add_argument("--num_lines", type=int, required=False,
                             default=3000,
                             help="number of lines to compress in each chunk")
@@ -228,11 +229,12 @@ def run_job(self, session):
         #rdd = rdd.cache()
 
         boundaries = None
-        ##logging.info(f"Boundaries file: {boundaries_file_uri}")
+        logging.info(f"Boundaries file: {boundaries_file_uri}")
         if boundaries_file_uri and self.check_for_output_file(boundaries_file_uri):
-            ##logging.info(f"Boundaries file found, using it: {boundaries_file_uri}")
+            logging.info(f"Boundaries file found, using it: {boundaries_file_uri}")
             with self.fetch_file(boundaries_file_uri) as f:
-                boundaries = json.load(f)
+                boundaries = list(map(lambda l: tuple(l), json.load(f)))
+
         else:
             # this percent needs to be pretty small, since this collect brings data back to driver...
             # 1/2 percent should be fine
@@ -255,6 +257,8 @@ def run_job(self, session):
                 self.write_output_file(boundaries_file_uri, f)
 
             os.unlink(temp_file_name)
+
+            logging.info(f"Boundaries file created: {boundaries_file_uri}")
         
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,

From 7b0f5cc4db8977f506d9f8cd5a68a3e77e9f9ef5 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 17 Nov 2025 15:07:43 +0100
Subject: [PATCH 59/66] refactor: code documentation, formatting and cleanup

---
 indexwarcs_cc_pyspark.py    | 14 +++++------
 zipnumcluster_cc_pyspark.py | 48 ++++++++++++++++++-------------------
 2 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/indexwarcs_cc_pyspark.py b/indexwarcs_cc_pyspark.py
index a17ca60..92fa770 100644
--- a/indexwarcs_cc_pyspark.py
+++ b/indexwarcs_cc_pyspark.py
@@ -27,13 +27,11 @@ class IndexWARCJob(CCFileProcessorSparkJob):
 
     def add_arguments(self, parser):
         super(CCFileProcessorSparkJob, self).add_arguments(parser)
-        parser.add_argument("--output_base_url", required=False,
-                            default='my_cdx_bucket',
-                            help="destination for cdx output")
-        
+        parser.add_argument("--output_base_url", required=True,
+                            help="Destination for CDX output.")
         parser.add_argument("--skip-existing", dest='skip_existing', action='store_true',
-                            help="skip processing files that already have CDX")
-    
+                            help="Skip processing files for which the output CDX file already exists.")
+
     def _conv_warc_to_cdx_path(self, warc_path):
         cdx_path = warc_path.replace('crawl-data', 'cc-index/cdx')
         cdx_path = cdx_path.replace('.warc.gz', '.cdx.gz')
@@ -42,7 +40,7 @@ def _conv_warc_to_cdx_path(self, warc_path):
         return cdx_path
 
     def process_file(self, warc_path, tempfd):
-        
+
         cdx_path = self._conv_warc_to_cdx_path(warc_path)
 
         LOG.info('Indexing WARC: %s', warc_path)
@@ -55,7 +53,7 @@ def process_file(self, warc_path, tempfd):
 
         with TemporaryFile(mode='w+b',
                            dir=self.args.local_temp_dir) as cdxtemp:
-            
+
             success = False
             with GzipFile(fileobj=cdxtemp, mode='w+b') as cdxfile:
                 try:
diff --git a/zipnumcluster_cc_pyspark.py b/zipnumcluster_cc_pyspark.py
index 5eac01c..9a60a3c 100644
--- a/zipnumcluster_cc_pyspark.py
+++ b/zipnumcluster_cc_pyspark.py
@@ -1,34 +1,33 @@
-import os
-import zlib
 import json
 import logging
+import os
+import re
+import zlib
 
-from typing import Iterator, Tuple, List
+from typing import Iterator, Tuple
 
 import boto3
 import botocore
-import re
 
-from pyspark import StorageLevel
-from pyspark.sql.functions import row_number, concat, lit, col, min as min_, max as max_
-from pyspark.sql.types import StringType, LongType, StructType, StructField
-from pyspark.sql.window import Window
 
 from sparkcc import CCFileProcessorSparkJob
 
 
-LOG = logging.getLogger('ZipNumClusterCdx')
-data_url_pattern = re.compile('^(s3|https?|file|hdfs|s3a|s3n):(?://([^/]*))?/(.*)')
+class ZipNumClusterCdx(CCFileProcessorSparkJob):
+    """Spark job to create a ZipNum Sharded CDX index, see
+    <https://github.com/webrecorder/pywb/wiki/CDX-Index-Format#zipnum-sharded-cdx>.
+    The index is sharded over multiple partitions (default = 300). Each partition file
+    is compressed using gzip, but in chunks of 3000 lines (a configurable number).
+    Every chunk can be read separately, a jump index allows to find the right chunk
+    for a given key in a binary search.
+    """
 
+    name = 'ZipNumClusterCdx'
 
-# TODO: remove comment lines after testing
-# note: this is LESS strict about partitioning than the original
-# based on my read of the zipnum clustering code, this should be just fine
-# but so far, it's untested. I plan to test it with the index server we use (locally)
+    LOG = logging.getLogger('ZipNumClusterCdx')
 
+    DATA_URL_PATTERN = re.compile('^(s3|https?|file|hdfs|s3a|s3n):(?://([^/]*))?/(.*)')
 
-class ZipNumClusterCdx(CCFileProcessorSparkJob):
-    name = 'ZipNumClusterCdx'
 
     def add_arguments(self, parser):
         super(CCFileProcessorSparkJob,self).add_arguments(parser)
@@ -174,8 +173,8 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
 
                     # Index entry with chunk boundaries
                     index_entries.append((
-                        str(chunk_min_surt),  # min surt
-                        str(chunk_max_surt),  # max surt
+                        str(chunk_min_surt),   # min surt
+                        str(chunk_max_surt),   # max surt
                         str(output_filename),  # filename
                         int(partition_id),     # explicit integer conversion
                         int(current_offset),   # explicit integer conversion
@@ -240,7 +239,7 @@ def run_job(self, session):
             # 1/2 percent should be fine
             samples = rdd.keys().sample(False, 0.005).collect()
             samples.sort()
-            
+
             # Ensure more even distribution by using quantiles
             total_samples = len(samples)
             boundaries = []
@@ -248,25 +247,25 @@ def run_job(self, session):
                 idx = (i * total_samples) // num_partitions
                 if idx < len(samples):
                     boundaries.append(samples[idx])
-            
+
             temp_file_name = 'temp_range_boundaries.json'
             with open(temp_file_name, 'w') as f:
                 json.dump(boundaries, f)
-            
+
             with open(temp_file_name, 'rb') as f:
                 self.write_output_file(boundaries_file_uri, f)
 
             os.unlink(temp_file_name)
 
             logging.info(f"Boundaries file created: {boundaries_file_uri}")
-        
+
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
             partitionFunc=lambda k: ZipNumClusterCdx.get_partition_id(k, boundaries)) \
             .mapPartitionsWithIndex(
                 lambda idx, iter: ZipNumClusterCdx.process_partition(idx, iter, num_lines, output_base_url)) \
             .collect()
-    
+
         # loop over the output files and concatenate them into a single final file
         with open('cluster.idx', 'wb') as f:
             for idx_file, _ in rdd:
@@ -283,7 +282,8 @@ def run_job(self, session):
         # These todo's will remove most of the need for any post processing...
         # TODO: create metadata.yml and put it to output_base_url
         # TODO: remove the "*.idx" files from the output_base_url
-        
+
+
 if __name__ == "__main__":
     job = ZipNumClusterCdx()
     job.run()
\ No newline at end of file

From 8394c74459fbc8bbc07081fdd40d7962cef881c6 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 17 Nov 2025 23:08:40 +0100
Subject: [PATCH 60/66] fix: use correct timestamp in cluster.idx

---
 zipnumcluster_cc_pyspark.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/zipnumcluster_cc_pyspark.py b/zipnumcluster_cc_pyspark.py
index 9a60a3c..04314a9 100644
--- a/zipnumcluster_cc_pyspark.py
+++ b/zipnumcluster_cc_pyspark.py
@@ -130,9 +130,8 @@ def write_partition_with_global_seq(idx, partition_iter, records_per_partition=N
         with open(partition_idx_file, 'w') as f:
             seq = start_seq
             for record in partition_iter:
-                min_surt, _, filename, _, offset, length, _ = record
-                timestamp = "20240522010826" # TODO: what timestamp should this be????
-                f.write(f"{min_surt} {timestamp}\t{filename}\t{offset}\t{length}\t{seq}\n")
+                min_surt, _, min_surt_timestamp, filename, _, offset, length, _ = record
+                f.write(f"{min_surt} {min_surt_timestamp}\t{filename}\t{offset}\t{length}\t{seq}\n")
                 seq += 1
 
         with open(partition_idx_file, 'rb') as fd:
@@ -160,6 +159,7 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
                 line = f"{surt_key} {timestamp} {json_data}\n"
                 if chunk_min_surt is None:
                     chunk_min_surt = surt_key
+                    chunk_min_timestamp = timestamp
                 chunk_max_surt = surt_key  # Will end up as max since data is sorted
                 current_chunk.append(line)
 
@@ -175,6 +175,7 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
                     index_entries.append((
                         str(chunk_min_surt),   # min surt
                         str(chunk_max_surt),   # max surt
+                        str(chunk_min_timestamp), # capture time
                         str(output_filename),  # filename
                         int(partition_id),     # explicit integer conversion
                         int(current_offset),   # explicit integer conversion
@@ -197,6 +198,7 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
                 index_entries.append((
                     str(chunk_min_surt),  # min surt
                     str(chunk_max_surt),  # max surt
+                    str(chunk_min_timestamp), # capture time
                     str(output_filename),
                     int(partition_id),
                     int(current_offset),

From 02be7575baaa69c9bd85a8edb5bffd65acfabc1b Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Mon, 17 Nov 2025 23:14:37 +0100
Subject: [PATCH 61/66] refactor: persistence and temporary files

- remove commented-out code to persist RDD

- write per-shard indexes to temporary location,
  only the cluster.idx is written to the final location
---
 zipnumcluster_cc_pyspark.py | 39 ++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/zipnumcluster_cc_pyspark.py b/zipnumcluster_cc_pyspark.py
index 04314a9..079485f 100644
--- a/zipnumcluster_cc_pyspark.py
+++ b/zipnumcluster_cc_pyspark.py
@@ -32,17 +32,18 @@ class ZipNumClusterCdx(CCFileProcessorSparkJob):
     def add_arguments(self, parser):
         super(CCFileProcessorSparkJob,self).add_arguments(parser)
         parser.add_argument("--output_base_url", required=True,
-                            default='my_cdx_bucket',
-                            help="destination for output")
+                            help="Output destination.")
         parser.add_argument("--partition_boundaries_file", required=True,
                             help="Full path to a JSON file containing partition boundaries."
                             "If specified, and does not exist, will be created, otherwise, will be used.")
+        parser.add_argument("--temporary_output_base_url", required=True,
+                            help="Temporary output location for per-shard cluster indexes.")
         parser.add_argument("--num_lines", type=int, required=False,
                             default=3000,
-                            help="number of lines to compress in each chunk")
+                            help="Number of lines to compress in each chunk")
         parser.add_argument("--num_output_partitions", type=int, required=False,
                             default=300,
-                            help="number of partitions/shards")
+                            help="Number of partitions/shards")
 
     @staticmethod
     def parse_line(line):
@@ -100,7 +101,8 @@ def write_output_file(uri, fd, base_uri=None):
                 client = boto3.client('s3')
                 client.upload_fileobj(fd, bucketname, path)
             except botocore.client.ClientError as exception:
-                ZipNumClusterCdx.LOG.error('Failed to write to S3 {}: {}'.format(output_path, exception))
+                ZipNumClusterCdx.LOG.error(
+                    'Failed to write to S3 {}: {}'.format(output_path, exception))
 
         elif scheme == 'http' or scheme == 'https':
             raise ValueError('HTTP/HTTPS output not supported')
@@ -121,7 +123,8 @@ def write_output_file(uri, fd, base_uri=None):
                 f.write(fd.read())
 
     @staticmethod
-    def write_partition_with_global_seq(idx, partition_iter, records_per_partition=None, output_base_url=None):
+    def write_partition_with_global_seq(idx: int, partition_iter: list,
+                                        records_per_partition: int, output_base_url: str):
         partition_idx_file = f"idx-{idx:05d}.idx"
 
         # Calculate starting sequence number for this partition
@@ -143,7 +146,8 @@ def write_partition_with_global_seq(idx, partition_iter, records_per_partition=N
 
     @staticmethod
     def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tuple[str, str]]],
-                          num_lines: int, output_base_url: str) -> Iterator[Tuple[str, str, str, int, int, int, int]]:
+                          num_lines: int, output_base_url: str, temporary_output_base_url: str) \
+                          -> Iterator[Tuple[str, str, str, str, int, int, int, int]]:
         """Process partition with chunked compression and chunk boundary tracking"""
         output_filename = f"cdx-{partition_id:05d}.gz"
         index_entries = []
@@ -212,7 +216,8 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
 
         os.unlink(output_filename)
 
-        final_files = ZipNumClusterCdx.write_partition_with_global_seq(partition_id, index_entries, num_lines, output_base_url)
+        final_files = ZipNumClusterCdx.write_partition_with_global_seq(
+            partition_id, index_entries, num_lines, temporary_output_base_url)
 
         return final_files
 
@@ -222,12 +227,10 @@ def run_job(self, session):
         boundaries_file_uri = self.args.partition_boundaries_file
         num_lines = self.args.num_lines
         output_base_url = self.args.output_base_url
-        rdd = session.sparkContext.textFile(input).map(self.parse_line).filter(lambda x: x is not None)
+        temporary_output_base_url = self.args.temporary_output_base_url
 
-        # TODO
-        # Cache the RDD with MEMORY_AND_DISK storage level
-        #rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
-        #rdd = rdd.cache()
+        rdd = session.sparkContext.textFile(input).map(
+            self.parse_line).filter(lambda x: x is not None)
 
         boundaries = None
         logging.info(f"Boundaries file: {boundaries_file_uri}")
@@ -265,26 +268,22 @@ def run_job(self, session):
             numPartitions=num_partitions,
             partitionFunc=lambda k: ZipNumClusterCdx.get_partition_id(k, boundaries)) \
             .mapPartitionsWithIndex(
-                lambda idx, iter: ZipNumClusterCdx.process_partition(idx, iter, num_lines, output_base_url)) \
+                lambda idx, iter: ZipNumClusterCdx.process_partition(
+                    idx, iter, num_lines, output_base_url, temporary_output_base_url)) \
             .collect()
 
         # loop over the output files and concatenate them into a single final file
         with open('cluster.idx', 'wb') as f:
             for idx_file, _ in rdd:
-                with self.fetch_file(output_base_url + idx_file) as idx_fd:
+                with self.fetch_file(temporary_output_base_url + idx_file) as idx_fd:
                     for line in idx_fd:
                         f.write(line)
-                    # TODO: remove the idx file...
 
         with open('cluster.idx', 'rb') as f:
             self.write_output_file('cluster.idx', f, output_base_url)
 
         os.unlink('cluster.idx')
 
-        # These todo's will remove most of the need for any post processing...
-        # TODO: create metadata.yml and put it to output_base_url
-        # TODO: remove the "*.idx" files from the output_base_url
-
 
 if __name__ == "__main__":
     job = ZipNumClusterCdx()

From beac6c8af6c37d22f38090efb6a4ed8fc2f4c96e Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 18 Nov 2025 12:16:10 +0100
Subject: [PATCH 62/66] fix: unify logging, address Pylint warnings

---
 indexwarcs_cc_pyspark.py    | 30 ++++++++++++++++++-----------
 zipnumcluster_cc_pyspark.py | 38 ++++++++++++++++++++-----------------
 2 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/indexwarcs_cc_pyspark.py b/indexwarcs_cc_pyspark.py
index 92fa770..2eec9ef 100644
--- a/indexwarcs_cc_pyspark.py
+++ b/indexwarcs_cc_pyspark.py
@@ -1,12 +1,10 @@
-import logging
-from sparkcc import CCFileProcessorSparkJob
-
+from gzip import GzipFile
 from tempfile import TemporaryFile
+
 from pywb.indexer.cdxindexer import write_cdx_index
-from gzip import GzipFile
 
+from sparkcc import CCFileProcessorSparkJob
 
-LOG = logging.getLogger('IndexWARCJob')
 
 
 class IndexWARCJob(CCFileProcessorSparkJob):
@@ -18,6 +16,15 @@ class IndexWARCJob(CCFileProcessorSparkJob):
 
     name = 'IndexWARCJob'
 
+    # description of input and output shown by --help
+    input_descr = "Path to file listing input paths (WARC/WAT/WET/ARC)"
+    output_descr = """Table containing the output CDX files
+(in spark.sql.warehouse.dir) and the indexing status:
+   1 successfully created,
+   0 already exists,
+  -1 processing failed"""
+
+    # PyWB index options
     index_options = {
         'surt_ordered': True,
         'sort': True,
@@ -29,8 +36,9 @@ def add_arguments(self, parser):
         super(CCFileProcessorSparkJob, self).add_arguments(parser)
         parser.add_argument("--output_base_url", required=True,
                             help="Destination for CDX output.")
-        parser.add_argument("--skip-existing", dest='skip_existing', action='store_true',
-                            help="Skip processing files for which the output CDX file already exists.")
+        parser.add_argument("--skip_existing", action='store_true',
+                            help="Skip processing files for which "
+                            "the output CDX file already exists.")
 
     def _conv_warc_to_cdx_path(self, warc_path):
         cdx_path = warc_path.replace('crawl-data', 'cc-index/cdx')
@@ -43,11 +51,11 @@ def process_file(self, warc_path, tempfd):
 
         cdx_path = self._conv_warc_to_cdx_path(warc_path)
 
-        LOG.info('Indexing WARC: %s', warc_path)
+        self.get_logger().info('Indexing WARC: %s', warc_path)
 
         if self.args.skip_existing and \
             self.check_for_output_file(cdx_path,self.args.output_base_url):
-            LOG.info('Already Exists: %s', cdx_path)
+            self.get_logger().info('Already Exists: %s', cdx_path)
             yield cdx_path, 0
             return
 
@@ -60,14 +68,14 @@ def process_file(self, warc_path, tempfd):
                     write_cdx_index(cdxfile, tempfd, warc_path, **self.index_options)
                     success = True
                 except Exception as exc:
-                    LOG.error('Failed to index %s: %s', warc_path, exc)
+                    self.get_logger().error('Failed to index %s: %s', warc_path, exc)
 
             cdxtemp.flush()
             cdxtemp.seek(0)
 
             if success:
                 self.write_output_file(cdx_path, cdxtemp, self.args.output_base_url)
-                LOG.info('Successfully uploaded CDX: %s', cdx_path)
+                self.get_logger().info('Successfully uploaded CDX: %s', cdx_path)
                 yield cdx_path, 1
             else:
                 yield cdx_path, -1
diff --git a/zipnumcluster_cc_pyspark.py b/zipnumcluster_cc_pyspark.py
index 079485f..f0ac197 100644
--- a/zipnumcluster_cc_pyspark.py
+++ b/zipnumcluster_cc_pyspark.py
@@ -24,8 +24,6 @@ class ZipNumClusterCdx(CCFileProcessorSparkJob):
 
     name = 'ZipNumClusterCdx'
 
-    LOG = logging.getLogger('ZipNumClusterCdx')
-
     DATA_URL_PATTERN = re.compile('^(s3|https?|file|hdfs|s3a|s3n):(?://([^/]*))?/(.*)')
 
 
@@ -34,8 +32,9 @@ def add_arguments(self, parser):
         parser.add_argument("--output_base_url", required=True,
                             help="Output destination.")
         parser.add_argument("--partition_boundaries_file", required=True,
-                            help="Full path to a JSON file containing partition boundaries."
-                            "If specified, and does not exist, will be created, otherwise, will be used.")
+                            help="Full path to a JSON file containing partition boundaries. "
+                            "If specified, and does not exist, will be created, otherwise, "
+                            "it will be used.")
         parser.add_argument("--temporary_output_base_url", required=True,
                             help="Temporary output location for per-shard cluster indexes.")
         parser.add_argument("--num_lines", type=int, required=False,
@@ -93,25 +92,27 @@ def write_output_file(uri, fd, base_uri=None):
         else:
             # keep local file paths as is
             path = uri
+            scheme = 'file'
+            netloc = None
 
-        if scheme in ['s3', 's3a', 's3n']:
+        if scheme in {'s3', 's3a', 's3n'}:
             bucketname = netloc
             output_path = path
             try:
                 client = boto3.client('s3')
                 client.upload_fileobj(fd, bucketname, path)
             except botocore.client.ClientError as exception:
-                ZipNumClusterCdx.LOG.error(
+                logging.error(
                     'Failed to write to S3 {}: {}'.format(output_path, exception))
 
-        elif scheme == 'http' or scheme == 'https':
+        elif scheme in {'http', 'https'}:
             raise ValueError('HTTP/HTTPS output not supported')
 
         elif scheme == 'hdfs':
             raise NotImplementedError('HDFS output not implemented')
 
         else:
-            ZipNumClusterCdx.LOG.info('Writing local file {}'.format(uri))
+            logging.info('Writing local file {}'.format(uri))
             if scheme == 'file':
                 # must be an absolute path
                 uri = os.path.join('/', path)
@@ -130,7 +131,7 @@ def write_partition_with_global_seq(idx: int, partition_iter: list,
         # Calculate starting sequence number for this partition
         start_seq = (idx * records_per_partition) + 1 if records_per_partition else 1
 
-        with open(partition_idx_file, 'w') as f:
+        with open(partition_idx_file, 'w', encoding="utf-8") as f:
             seq = start_seq
             for record in partition_iter:
                 min_surt, _, min_surt_timestamp, filename, _, offset, length, _ = record
@@ -157,6 +158,7 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
         current_chunk = []
         chunk_min_surt = None
         chunk_max_surt = None
+        chunk_min_timestamp = None
 
         with open(output_filename, 'wb') as f:
             for (surt_key, timestamp), json_data in partition_iter:
@@ -222,25 +224,26 @@ def process_partition(partition_id: int, partition_iter: Iterator[Tuple[str, Tup
         return final_files
 
     def run_job(self, session):
-        input = self.args.input_base_url + self.args.input
+        input_url = self.args.input_base_url + self.args.input
         num_partitions = self.args.num_output_partitions
         boundaries_file_uri = self.args.partition_boundaries_file
         num_lines = self.args.num_lines
         output_base_url = self.args.output_base_url
         temporary_output_base_url = self.args.temporary_output_base_url
 
-        rdd = session.sparkContext.textFile(input).map(
+        rdd = session.sparkContext.textFile(input_url).map(
             self.parse_line).filter(lambda x: x is not None)
 
         boundaries = None
-        logging.info(f"Boundaries file: {boundaries_file_uri}")
+        self.get_logger(session).info(f"Boundaries file: {boundaries_file_uri}")
         if boundaries_file_uri and self.check_for_output_file(boundaries_file_uri):
-            logging.info(f"Boundaries file found, using it: {boundaries_file_uri}")
+            self.get_logger(session).info(f"Boundaries file found, using it: {boundaries_file_uri}")
             with self.fetch_file(boundaries_file_uri) as f:
                 boundaries = list(map(lambda l: tuple(l), json.load(f)))
 
         else:
-            # this percent needs to be pretty small, since this collect brings data back to driver...
+            # The percentage needs to be pretty small, since the collect
+            # brings data back to the driver...
             # 1/2 percent should be fine
             samples = rdd.keys().sample(False, 0.005).collect()
             samples.sort()
@@ -254,7 +257,7 @@ def run_job(self, session):
                     boundaries.append(samples[idx])
 
             temp_file_name = 'temp_range_boundaries.json'
-            with open(temp_file_name, 'w') as f:
+            with open(temp_file_name, 'w', encoding="utf-8") as f:
                 json.dump(boundaries, f)
 
             with open(temp_file_name, 'rb') as f:
@@ -262,7 +265,8 @@ def run_job(self, session):
 
             os.unlink(temp_file_name)
 
-            logging.info(f"Boundaries file created: {boundaries_file_uri}")
+            self.get_logger(session).info(
+                f"Boundaries file created: {boundaries_file_uri}")
 
         rdd = rdd.repartitionAndSortWithinPartitions(
             numPartitions=num_partitions,
@@ -287,4 +291,4 @@ def run_job(self, session):
 
 if __name__ == "__main__":
     job = ZipNumClusterCdx()
-    job.run()
\ No newline at end of file
+    job.run()

From a9ff183a829ddbb1e800b1a898bfb0d7f698af2a Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 18 Nov 2025 14:21:11 +0100
Subject: [PATCH 63/66] fix: suppress ignored CLI options

---
 zipnumcluster_cc_pyspark.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/zipnumcluster_cc_pyspark.py b/zipnumcluster_cc_pyspark.py
index f0ac197..4508652 100644
--- a/zipnumcluster_cc_pyspark.py
+++ b/zipnumcluster_cc_pyspark.py
@@ -1,3 +1,4 @@
+import argparse
 import json
 import logging
 import os
@@ -24,6 +25,10 @@ class ZipNumClusterCdx(CCFileProcessorSparkJob):
 
     name = 'ZipNumClusterCdx'
 
+    input_descr = """Glob pattern of input CDX files, e.g., file:///path/*/*.cdx.gz
+(HDFS-compatible filesystems only: hdfs://, s3a://, file://)."""
+    output_descr = "Ignored but required (can be empty, no output table is produced)."
+
     DATA_URL_PATTERN = re.compile('^(s3|https?|file|hdfs|s3a|s3n):(?://([^/]*))?/(.*)')
 
 
@@ -43,6 +48,10 @@ def add_arguments(self, parser):
         parser.add_argument("--num_output_partitions", type=int, required=False,
                             default=300,
                             help="Number of partitions/shards")
+        # suppress help for ignored arguments
+        parser.add_argument("--output_format", help=argparse.SUPPRESS)
+        parser.add_argument("--output_compression", help=argparse.SUPPRESS)
+        parser.add_argument("--output_option", help=argparse.SUPPRESS)
 
     @staticmethod
     def parse_line(line):

From af10f62257074d5fffad04777a670dafb23a2a86 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 18 Nov 2025 22:01:45 +0100
Subject: [PATCH 64/66] Spark jobs to build index: README, add shell script and
 requirements

---
 README.md                  |   4 +
 requirements_ccpyspark.txt |   3 +
 run_index_ccpyspark.sh     | 187 +++++++++++++++++++++++++++++++++++++
 3 files changed, 194 insertions(+)
 create mode 100644 requirements_ccpyspark.txt
 create mode 100755 run_index_ccpyspark.sh

diff --git a/README.md b/README.md
index c7ebb6e..420ca3f 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,10 @@ WebArchive URL Indexing
 
 This project contains several scripts (MapReduce jobs) for generating URL indexes of web archive collections, usually containing large number of WARC (or ARC) files. The scripts are designed to ran on Hadoop or Amazon EMR to process terabytes or even petabytes of web archive content. Additionally, thanks to flexibility of the MRJob library, the scripts can also run on a local machine to build an index cluster.
 
+The indexer was originally implemented based on [mrjob](//github.com/Yelp/mrjob). Because mrjob is no longer maintained, it was ported to PySpark (based on [cc-pyspark](//github.com/commoncrawl/cc-pyspark/)) in 2024/2025. For running the Spark jobs, see [run_index_ccpyspark.sh](run_index_ccpyspark.sh) and [requirements_ccpyspark.txt](requirements_ccpyspark.txt). See also the documentation of [cc-pyspark](//github.com/commoncrawl/cc-pyspark/).
+
+The description below documents how to run the MapReduce jobs.
+
 ## Initial Setup and Usage
 
 Python 3 is required - see the branch `python-2.7` for a previous version running on Python 2.7 (not maintained anymore).
diff --git a/requirements_ccpyspark.txt b/requirements_ccpyspark.txt
new file mode 100644
index 0000000..9d3d21f
--- /dev/null
+++ b/requirements_ccpyspark.txt
@@ -0,0 +1,3 @@
+boto3
+# PyWB is required to index WARC files, running indexwarcs_cc_pyspark.py
+pywb
diff --git a/run_index_ccpyspark.sh b/run_index_ccpyspark.sh
new file mode 100755
index 0000000..7f2d0a4
--- /dev/null
+++ b/run_index_ccpyspark.sh
@@ -0,0 +1,187 @@
+#!/bin/bash
+
+if [ $# -lt 2 ]; then
+    cat <<"EOF"
+$0 <year-week-of-crawl> <path-to-warc-file-list> [<split_file>]
+
+Create a Common Crawl CDX ZipNum index for a monthly crawl. All steps are run on Spark.
+
+  <year-week-of-crawl>   Year and week of the monthly crawl to be indexed, e.g. 2016-44
+                         used to determine the final location of the index
+                             s3://commoncrawl/cc-index/collections/CC-MAIN-2016-44/...
+                         Also locations for temporary files include the crawl name.
+
+  <path-to-warc-file-list>  List of WARC file objects to be indexed, e.g, the WARC/WAT/WET list
+                               s3://commoncrawl/crawl-data/CC-MAIN-2016-44/warc.paths
+                         or any subset or union of multiple WARC listings (incl. robots.txt WARCs).
+                         Paths in the list must be keys/objects in the Common Crawl bucket
+                         or another bucket configured in this script (WARC_PREFIX).
+                         The path to the list must be an absolute URL on HDFS or S3A.
+
+                         The "index warcs" step is skipped if an empty string is passed as argument.
+                         Since 2018 the per-WARC CDX files are written directly by the Fetcher
+                         and include index fields combined from the WARC response and metadata record.
+                         The latter holds the detected language and charset.
+
+  <split_file>           Optional split file to be reused from previous crawl with similar distribution of URLs.
+                         If not given, splits are calculated and saved on the default split file path.
+
+EOF
+    exit 1
+fi
+
+
+YEARWEEK="$1"
+WARC_MANIFEST="$2"
+REUSE_SPLIT_FILE="$3"
+
+CRAWL="CC-MAIN-$YEARWEEK"
+
+echo "Generating cc-index for $CRAWL"
+echo
+echo WARC_MANIFEST="$WARC_MANIFEST"
+echo
+
+# Path prefix of WARC/WAT/WET files listed in WARC_MANIFEST
+WARC_PREFIX="s3://commoncrawl/"
+
+# AWS S3 bucket to hold CDX files
+WARC_CDX_BUCKET="commoncrawl-index-temp"
+WARC_CDX_PREFIX="s3://$WARC_CDX_BUCKET/"
+
+# Location of the CDX status table
+SPARK_SQL_WAREHOUSE="s3a://$WARC_CDX_BUCKET/$CRAWL"
+CDX_STATUS_TABLE="cdx_status"
+
+
+# glob pattern to match all CDX files generated in step 1 (indexwarcs_cc_pyspark.py)
+# or available otherwise. The URI scheme must be supported by Hadoop / HDFS.
+WARC_CDX="s3a://$WARC_CDX_BUCKET/$CRAWL/cdx/segments/*/*/*.cdx.gz"
+
+
+### ZipNum definitions
+ZIPNUM_N_LINES=3000
+ZIPNUM_N_PARTITIONS=300
+
+# SPLIT_FILE could be reused from previous crawl with similar distribution of URLs, see REUSE_SPLIT_FILE
+SPLIT_FILE="s3a://$WARC_CDX_BUCKET/$CRAWL/partition_boundaries.json"
+# if explicitly configured
+if [ -n "$REUSE_SPLIT_FILE" ]; then
+    echo "Reusing SPLIT_FILE $REUSE_SPLIT_FILE"
+    SPLIT_FILE="$REUSE_SPLIT_FILE"
+fi
+
+# temporary output path of part-n files of the zipnum job, concatenated into the cluster.idx
+ZIPNUM_TEMP_DIR="s3://$WARC_CDX_BUCKET/$CRAWL/indexes/"
+
+# final path to ZipNum index files
+ZIPNUM_CLUSTER_DIR="s3://commoncrawl/cc-index/collections/$CRAWL/indexes/"
+
+
+# configure S3 buffer directory
+# - must exist on task/compute nodes for buffering data
+# - should provide several GBs of free space to hold temporarily
+#   the downloaded data (WARC, WAT, WET files to be indexed),
+#   only relevant for the indexwarcs_cc_pyspark job.
+if [ -n "$S3_LOCAL_TEMP_DIR" ]; then
+	S3_LOCAL_TEMP_DIR="--local_temp_dir=$S3_LOCAL_TEMP_DIR"
+else
+	S3_LOCAL_TEMP_DIR=""
+fi
+
+
+
+### PySpark definitions
+export PYSPARK_PYTHON="python"  # or "python3"
+
+# Python dependencies (for simplicity, include all Python files: cc-pyspark/*.py)
+PYFILES=sparkcc.py
+
+### Spark configuration
+
+SPARK_ON_YARN="--master yarn"
+SPARK_HADOOP_OPTS=""
+SPARK_EXTRA_OPTS=""
+
+# defines SPARK_HOME, SPARK_HADOOP_OPTS and HADOOP_CONF_DIR
+. spark_env.sh
+
+NUM_EXECUTORS=${NUM_EXECUTORS:-1}
+EXECUTOR_CORES=${EXECUTOR_CORES:-2}
+# input partitions for the WARC-to-CDX stop
+NUM_WARC_INPUT_PARTITIONS=${NUM_WARC_INPUT_PARTITIONS:-10}
+
+export LC_ALL=C
+
+set -e
+set -x
+
+
+if [ -n "$WARC_MANIFEST" ]; then
+    # Index WARC files in the manifest, write one CDX file per WARC
+    EXECUTOR_MEM=${EXECUTOR_MEM:-2g}
+    if [[ $NUM_WARC_INPUT_PARTITIONS -lt $((NUM_EXECUTORS*EXECUTOR_CORES)) ]]; then
+        echo "The number of input partitions is too low to utilize all executor cores"
+        exit 1
+    fi
+    $SPARK_HOME/bin/spark-submit \
+        $SPARK_ON_YARN \
+        $SPARK_HADOOP_OPTS \
+        --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+        --conf spark.task.maxFailures=5 \
+        --conf spark.executor.memory=$EXECUTOR_MEM \
+        --conf spark.driver.memory=3g \
+        --conf spark.core.connection.ack.wait.timeout=600s \
+        --conf spark.network.timeout=300s \
+        --conf spark.shuffle.io.maxRetries=50 \
+        --conf spark.shuffle.io.retryWait=600s \
+        --conf spark.locality.wait=1s \
+        --conf spark.executorEnv.LD_LIBRARY_PATH=/usr/lib/hadoop/lib/native \
+        --num-executors $NUM_EXECUTORS \
+        --executor-cores $EXECUTOR_CORES \
+        --executor-memory $EXECUTOR_MEM \
+        --conf spark.sql.warehouse.dir="$SPARK_SQL_WAREHOUSE" \
+        --py-files $PYFILES \
+        indexwarcs_cc_pyspark.py \
+        --input_base_url="$WARC_PREFIX" \
+        --output_base_url="$WARC_CDX_PREFIX" \
+        $S3_LOCAL_TEMP_DIR \
+        --num_input_partitions=$NUM_WARC_INPUT_PARTITIONS \
+        --num_output_partitions=1 \
+        "$WARC_MANIFEST" "$CDX_STATUS_TABLE"
+fi
+
+
+### Create ZipNum index
+EXECUTOR_MEM=${EXECUTOR_MEM:-3g}
+
+$SPARK_HOME/bin/spark-submit \
+    $SPARK_ON_YARN \
+    $SPARK_HADOOP_OPTS \
+    --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+    --conf spark.task.maxFailures=5 \
+    --conf spark.executor.memory=$EXECUTOR_MEM \
+    --conf spark.driver.memory=3g \
+    --conf spark.core.connection.ack.wait.timeout=600s \
+    --conf spark.network.timeout=300s \
+    --conf spark.shuffle.io.maxRetries=50 \
+    --conf spark.shuffle.io.retryWait=600s \
+    --conf spark.locality.wait=1s \
+    --conf spark.io.compression.codec=zstd \
+    --conf spark.checkpoint.compress=true \
+    --conf spark.executorEnv.LD_LIBRARY_PATH=/usr/lib/hadoop/lib/native \
+    --num-executors $NUM_EXECUTORS \
+    --executor-cores $EXECUTOR_CORES \
+    --executor-memory $EXECUTOR_MEM \
+    --py-files $PYFILES \
+    zipnumcluster_cc_pyspark.py \
+    $S3_LOCAL_TEMP_DIR \
+    --input_base_url="" \
+    --output_base_url="$ZIPNUM_CLUSTER_DIR" \
+    --temporary_output_base_url="$ZIPNUM_TEMP_DIR" \
+    --partition_boundaries_file="$SPLIT_FILE" \
+    --num_lines=$ZIPNUM_N_LINES \
+    --num_output_partitions=$ZIPNUM_N_PARTITIONS \
+    "$WARC_CDX" ""
+
+

From d995a58b790c1d4ab8caaff279c51e93fa86b7c1 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Sat, 10 Jan 2026 21:05:12 +0100
Subject: [PATCH 65/66] refactor: adapt publish_index.sh for Spark jobs

---
 publish_index.sh       | 10 ++++++++++
 run_index_ccpyspark.sh |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/publish_index.sh b/publish_index.sh
index 78d2c8f..bd4b772 100755
--- a/publish_index.sh
+++ b/publish_index.sh
@@ -25,6 +25,16 @@ if ! [ -e "$YEARWEEK-metadata.yaml" ]; then
 fi
 aws s3 cp "$YEARWEEK-metadata.yaml" "s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/metadata.yaml"
 
+
+echo "Prepare and install cluster.idx if not yet done"
+# Note: This step is required for the mrjob-based implementation, but not for the one based on
+#       cc-pyspark: the job zipnumcluster_cc_pyspark.py already concatenates the 300 per-partition
+#       *.idx files into cluster.idx. If cluster.idx is already on S3, there is nothing left to do.
+if aws s3 ls "s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/cluster.idx"; then
+    echo "cluster.idx already exists on s3://commoncrawl/cc-index/collections/CC-MAIN-$YEARWEEK/indexes/"
+    exit 0
+fi
+
 test -d "cdx-$YEARWEEK" || mkdir "cdx-$YEARWEEK"
 cd "cdx-$YEARWEEK"
 
diff --git a/run_index_ccpyspark.sh b/run_index_ccpyspark.sh
index 7f2d0a4..f7a564e 100755
--- a/run_index_ccpyspark.sh
+++ b/run_index_ccpyspark.sh
@@ -108,7 +108,7 @@ SPARK_EXTRA_OPTS=""
 
 NUM_EXECUTORS=${NUM_EXECUTORS:-1}
 EXECUTOR_CORES=${EXECUTOR_CORES:-2}
-# input partitions for the WARC-to-CDX stop
+# input partitions for the WARC-to-CDX step
 NUM_WARC_INPUT_PARTITIONS=${NUM_WARC_INPUT_PARTITIONS:-10}
 
 export LC_ALL=C

From 8a404af6dc54aa7443113a6a3498030aca97a4aa Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian@commoncrawl.org>
Date: Tue, 27 Jan 2026 18:11:35 +0100
Subject: [PATCH 66/66] refactor: increase driver memory and max. result size

Increase Spark driver memory to 6 GiB and
max. result size to 4 GiB in order to
address potential out-of-memory issues
when sampled URL index keys (SURT and timestamp)
are sent to the driver.
---
 run_index_ccpyspark.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/run_index_ccpyspark.sh b/run_index_ccpyspark.sh
index f7a564e..2484eec 100755
--- a/run_index_ccpyspark.sh
+++ b/run_index_ccpyspark.sh
@@ -161,7 +161,8 @@ $SPARK_HOME/bin/spark-submit \
     --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
     --conf spark.task.maxFailures=5 \
     --conf spark.executor.memory=$EXECUTOR_MEM \
-    --conf spark.driver.memory=3g \
+    --conf spark.driver.memory=6g \
+    --conf spark.driver.maxResultSize=4g \
     --conf spark.core.connection.ack.wait.timeout=600s \
     --conf spark.network.timeout=300s \
     --conf spark.shuffle.io.maxRetries=50 \