From 5793911de1c8ddabb691f4d39b44f383597604e3 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 16 Apr 2026 13:40:39 +0200 Subject: [PATCH 1/4] Add IP address column (#30) Add WARC-Record-ID column (#42) Add two new columns `warc_record_id` and `warc_ip_address` holding the WARC-Record-ID resp. WARC-IP-Address as STRING logical type. --- .../org/commoncrawl/spark/CCIndex2Table.java | 8 ++++++- .../schema/cc-index-schema-flat.json | 22 +++++++++++++++++++ .../schema/cc-index-schema-nested.json | 22 +++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/commoncrawl/spark/CCIndex2Table.java b/src/main/java/org/commoncrawl/spark/CCIndex2Table.java index c78614a..6bbee5a 100644 --- a/src/main/java/org/commoncrawl/spark/CCIndex2Table.java +++ b/src/main/java/org/commoncrawl/spark/CCIndex2Table.java @@ -43,6 +43,7 @@ protected static class CdxLine extends IndexTable.CdxLine { String redirect; String digest; String mime, mimeDetected; + String recordid, ipaddress; String filename; int offset, length; short status; @@ -60,6 +61,8 @@ public CdxLine(String line) throws IOException { mime = getString("mime"); mimeDetected = getString("mime-detected"); + recordid = getString("recordid"); + ipaddress = getString("ipaddress"); filename = getString("filename"); offset = getInt("offset"); length = getInt("length"); @@ -102,7 +105,7 @@ public static Row convertCdxLine(String line) { RowFactory.create(cdx.timestamp, cdx.status, cdx.redirect), // RowFactory .create(cdx.digest, cdx.mime, cdx.mimeDetected, cdx.charset, cdx.languages, cdx.truncated), // - RowFactory.create(cdx.filename, cdx.offset, cdx.length, cdx.segment), // + RowFactory.create(cdx.recordid, cdx.ipaddress, cdx.filename, cdx.offset, cdx.length, cdx.segment), // cdx.crawl, cdx.subset); } else { @@ -142,6 +145,9 @@ public static Row convertCdxLine(String line) { cdx.languages, // content (WARC record payload) truncated (since CC-MAIN-2019-47) cdx.truncated, + // WARC 
record headers + cdx.recordid, + cdx.ipaddress, // WARC record location cdx.filename, cdx.offset, diff --git a/src/main/resources/schema/cc-index-schema-flat.json b/src/main/resources/schema/cc-index-schema-flat.json index f89d663..f0e91f3 100644 --- a/src/main/resources/schema/cc-index-schema-flat.json +++ b/src/main/resources/schema/cc-index-schema-flat.json @@ -253,6 +253,28 @@ "fromCDX": "warc-truncated" } }, + { + "name": "warc_record_id", + "type": "string", + "nullable": true, + "metadata": { + "description": "UUID of the WARC record (WARC-Record-ID)", + "example": "019d6d22-5cf4-7ad9-8a24-81690cf43c7d", + "since": "CC-MAIN-2026-21", + "fromCDX": "recordid" + } + }, + { + "name": "warc_ip_address", + "type": "string", + "nullable": true, + "metadata": { + "description": "Numeric IP address contacted to retrieve the content (WARC-IP-Address)", + "example": "198.202.211.1 or 2620:cb:2000::1", + "since": "CC-MAIN-2026-21", + "fromCDX": "ipaddress" + } + }, { "name": "warc_filename", "type": "string", diff --git a/src/main/resources/schema/cc-index-schema-nested.json b/src/main/resources/schema/cc-index-schema-nested.json index c0c5732..f262245 100644 --- a/src/main/resources/schema/cc-index-schema-nested.json +++ b/src/main/resources/schema/cc-index-schema-nested.json @@ -298,6 +298,28 @@ "type": { "type": "struct", "fields": [ + { + "name": "record_id", + "type": "string", + "nullable": true, + "metadata": { + "description": "UUID of the WARC record (WARC-Record-ID)", + "example": "019d6d22-5cf4-7ad9-8a24-81690cf43c7d", + "since": "CC-MAIN-2026-21", + "fromCDX": "recordid" + } + }, + { + "name": "ip_address", + "type": "string", + "nullable": true, + "metadata": { + "description": "Numeric IP address contacted to retrieve the content (WARC-IP-Address)", + "example": "198.202.211.1 or 2620:cb:2000::1", + "since": "CC-MAIN-2026-21", + "fromCDX": "ipaddress" + } + }, { "name": "filename", "type": "string", From e6cbfa1590c6a3920c5280144372c9657849de54 Mon Sep 17 
00:00:00 2001 From: Sebastian Nagel Date: Fri, 17 Apr 2026 16:45:52 +0200 Subject: [PATCH 2/4] Add WARC-Record-ID column (#42) Configure column `warc_record_id` as not applicable for Parquet dictionary encoding, because all values are unique. --- src/script/convert_url_index.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/script/convert_url_index.sh b/src/script/convert_url_index.sh index 60bfa63..5852894 100755 --- a/src/script/convert_url_index.sh +++ b/src/script/convert_url_index.sh @@ -55,6 +55,7 @@ $SPARK_HOME/bin/spark-submit \ --executor-cores $EXECUTOR_CORES \ --executor-memory $EXECUTOR_MEM \ --conf spark.hadoop.parquet.enable.dictionary=true \ + --conf 'spark.hadoop.parquet.enable.dictionary#warc_record_id=false' \ --conf spark.sql.parquet.filterPushdown=true \ --conf spark.sql.parquet.mergeSchema=false \ --conf spark.sql.hive.metastorePartitionPruning=true \ From 2ab0b83546778045ae0dbac16b5e6548b91afd33 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 25 Apr 2026 13:38:35 +0200 Subject: [PATCH 3/4] Code formatting: sort imports --- src/main/java/org/commoncrawl/spark/CCIndex2Table.java | 2 +- .../java/org/commoncrawl/spark/util/NullOutputCommitter.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/commoncrawl/spark/CCIndex2Table.java b/src/main/java/org/commoncrawl/spark/CCIndex2Table.java index 6bbee5a..d0cee79 100644 --- a/src/main/java/org/commoncrawl/spark/CCIndex2Table.java +++ b/src/main/java/org/commoncrawl/spark/CCIndex2Table.java @@ -24,8 +24,8 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.commoncrawl.spark.util.CCWarcFilenameParser; -import org.commoncrawl.spark.util.CCWarcFilenameParser.FilenameParts; import org.commoncrawl.spark.util.CCWarcFilenameParser.FilenameParseError; +import org.commoncrawl.spark.util.CCWarcFilenameParser.FilenameParts; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git 
a/src/main/java/org/commoncrawl/spark/util/NullOutputCommitter.java b/src/main/java/org/commoncrawl/spark/util/NullOutputCommitter.java index bc6b659..09f7600 100644 --- a/src/main/java/org/commoncrawl/spark/util/NullOutputCommitter.java +++ b/src/main/java/org/commoncrawl/spark/util/NullOutputCommitter.java @@ -16,12 +16,12 @@ */ package org.commoncrawl.spark.util; +import java.io.IOException; + import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.OutputCommitter; import org.apache.hadoop.mapreduce.TaskAttemptContext; -import java.io.IOException; - public class NullOutputCommitter extends OutputCommitter { @Override From e651f9c5dff9d51fa5d75424ecd192f2675a16fc Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 25 Apr 2026 13:56:26 +0200 Subject: [PATCH 4/4] Add IP address column (#30) Add WARC-Record-ID column (#42) Make the columns `warc_record_id` and `warc_ip_address` hold binary data (primitive type BYTE_ARRAY). --- .../org/commoncrawl/spark/CCIndex2Table.java | 21 ++++++++++++++++--- .../schema/cc-index-schema-flat.json | 4 ++-- .../schema/cc-index-schema-nested.json | 4 ++-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/commoncrawl/spark/CCIndex2Table.java b/src/main/java/org/commoncrawl/spark/CCIndex2Table.java index d0cee79..9553f45 100644 --- a/src/main/java/org/commoncrawl/spark/CCIndex2Table.java +++ b/src/main/java/org/commoncrawl/spark/CCIndex2Table.java @@ -17,6 +17,8 @@ package org.commoncrawl.spark; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.UUID; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; @@ -29,6 +31,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.net.InetAddresses; + /** * Convert Common Crawl's URL index into a tabular format.
*/ @@ -43,7 +47,7 @@ protected static class CdxLine extends IndexTable.CdxLine { String redirect; String digest; String mime, mimeDetected; - String recordid, ipaddress; + byte[] recordid, ipaddress; String filename; int offset, length; short status; @@ -61,8 +65,19 @@ public CdxLine(String line) throws IOException { mime = getString("mime"); mimeDetected = getString("mime-detected"); - recordid = getString("recordid"); - ipaddress = getString("ipaddress"); + recordid = null; + String id = getString("recordid"); + if (id != null) { + UUID uuid = UUID.fromString(id); + recordid = new byte[16]; + ByteBuffer.wrap(recordid) + .putLong(uuid.getMostSignificantBits()) + .putLong(uuid.getLeastSignificantBits()); + } + String ip = getString("ipaddress"); + if (ip != null) { + ipaddress = InetAddresses.forString(ip).getAddress(); + } filename = getString("filename"); offset = getInt("offset"); length = getInt("length"); diff --git a/src/main/resources/schema/cc-index-schema-flat.json b/src/main/resources/schema/cc-index-schema-flat.json index f0e91f3..59081e2 100644 --- a/src/main/resources/schema/cc-index-schema-flat.json +++ b/src/main/resources/schema/cc-index-schema-flat.json @@ -255,7 +255,7 @@ }, { "name": "warc_record_id", - "type": "string", + "type": "binary", "nullable": true, "metadata": { "description": "UUID of the WARC record (WARC-Record-ID)", @@ -266,7 +266,7 @@ }, { "name": "warc_ip_address", - "type": "string", + "type": "binary", "nullable": true, "metadata": { "description": "Numeric IP address contacted to retrieve the content (WARC-IP-Address)", diff --git a/src/main/resources/schema/cc-index-schema-nested.json b/src/main/resources/schema/cc-index-schema-nested.json index f262245..8393567 100644 --- a/src/main/resources/schema/cc-index-schema-nested.json +++ b/src/main/resources/schema/cc-index-schema-nested.json @@ -300,7 +300,7 @@ "fields": [ { "name": "record_id", - "type": "string", + "type": "binary", "nullable": true, "metadata": { "description": 
"UUID of the WARC record (WARC-Record-ID)", @@ -311,7 +311,7 @@ }, { "name": "ip_address", - "type": "string", + "type": "binary", "nullable": true, "metadata": { "description": "Numeric IP address contacted to retrieve the content (WARC-IP-Address)",