Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/main/java/org/commoncrawl/spark/CCIndex2Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ protected static class CdxLine extends IndexTable.CdxLine {
String digest;
String mime, mimeDetected;
String filename;
int offset, length;
long offset, length;
short status;
String crawl, segment, subset;
String charset, languages;
Expand All @@ -61,8 +61,8 @@ public CdxLine(String line) throws IOException {
mimeDetected = getString("mime-detected");

filename = getString("filename");
offset = getInt("offset");
length = getInt("length");
offset = getLong("offset");
length = getLong("length");
status = getHttpStatus("status");

try {
Expand Down
15 changes: 13 additions & 2 deletions src/main/java/org/commoncrawl/spark/util/CCWarcFilenameParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,14 @@ public class CCWarcFilenameParser {
*/
protected static final Pattern filenameAnalyzer = Pattern.compile(
"^(?:common-crawl/)?crawl-data/([^/]+)/segments/([^/]+)/(crawldiagnostics|robotstxt|warc|wat|wet)/");

/**
 * Supplemental crawl filename pattern:
 * <code>projects/PROJECT/CC-SUPPLEMENTAL-YYYY-WW[-...]/segments/SEGMENT/SUBSET/*.warc.gz</code>
 * e.g.
 * <code>projects/cc-open-athena-test/CC-SUPPLEMENTAL-2026-22-bis/segments/20260522101936/warc/CC-SUPPLEMENTAL-2026-22-bis-20260522102012-20260522152012-00000.warc.gz</code>
 * <p>
 * The pattern is anchored at the start of the path and expects exactly two
 * leading path components before the crawl name, followed by the literal
 * <code>segments/</code> directory. Capturing groups: (1) the crawl name
 * starting with <code>CC-SUPPLEMENTAL-</code>, (2) the segment, digits only,
 * (3) the subset, one of <code>crawldiagnostics</code>,
 * <code>robotstxt</code> or <code>warc</code>.
 */
protected static final Pattern supplementalFilenameAnalyzer = Pattern.compile(
"^[^/]+/[^/]+/(CC-SUPPLEMENTAL-[^/]+)/segments/(\\d+)/(crawldiagnostics|robotstxt|warc)/");
/**
* News crawl filename pattern:
* <code>s3://commoncrawl/crawl-data/CC-NEWS/YYYY/MM/*.warc.gz</code> e.g.
Expand Down Expand Up @@ -67,11 +74,15 @@ public static FilenameParts getParts(String filename) throws FilenameParseError
if (m.find()) {
return new FilenameParts(m.group(1), m.group(2), m.group(3));
}
m = supplementalFilenameAnalyzer.matcher(filename);
if (m.find()) {
return new FilenameParts(m.group(1), m.group(2), m.group(3));
}
m = newsFilenameAnalyzer.matcher(filename);
if (m.find()) {
String crawl = String.format("CC-NEWS-%s-%s", m.group(1), m.group(2));
return new FilenameParts(crawl, m.group(3), "news-warc");
}
throw new FilenameParseError("Filename not parseable (tried main and news): " + filename);
throw new FilenameParseError("Filename not parseable (tried main, supplemental and news): " + filename);
}
}
4 changes: 2 additions & 2 deletions src/main/resources/schema/cc-index-schema-flat.json
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@
},
{
"name": "warc_record_offset",
"type": "integer",
"type": "long",
"nullable": false,
"metadata": {
"description": "Offset of the WARC record",
Expand All @@ -275,7 +275,7 @@
},
{
"name": "warc_record_length",
"type": "integer",
"type": "long",
"nullable": false,
"metadata": {
"description": "Length of the WARC record",
Expand Down
4 changes: 2 additions & 2 deletions src/main/resources/schema/cc-index-schema-nested.json
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@
},
{
"name": "record_offset",
"type": "integer",
"type": "long",
"nullable": false,
"metadata": {
"description": "Offset of the WARC record",
Expand All @@ -320,7 +320,7 @@
},
{
"name": "record_length",
"type": "integer",
"type": "long",
"nullable": false,
"metadata": {
"description": "Length of the WARC record",
Expand Down
4 changes: 2 additions & 2 deletions src/main/resources/schema/index-schema-simple-nested.json
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@
},
{
"name": "record_offset",
"type": "integer",
"type": "long",
"nullable": false,
"metadata": {
"description": "Offset of the WARC record",
Expand All @@ -297,7 +297,7 @@
},
{
"name": "record_length",
"type": "integer",
"type": "long",
"nullable": false,
"metadata": {
"description": "Length of the WARC record",
Expand Down
4 changes: 2 additions & 2 deletions src/main/resources/schema/index-schema-simple.json
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@
},
{
"name": "warc_record_offset",
"type": "integer",
"type": "long",
"nullable": false,
"metadata": {
"description": "Offset of the WARC record",
Expand All @@ -249,7 +249,7 @@
},
{
"name": "warc_record_length",
"type": "integer",
"type": "long",
"nullable": false,
"metadata": {
"description": "Length of the WARC record",
Expand Down
18 changes: 18 additions & 0 deletions src/test/java/org/commoncrawl/spark/TestCCWarcFilenameParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,24 @@ public void testMainWarcFilename() throws FilenameParseError {
assertEquals("warc", parts.subset);
}

/**
 * A supplemental crawl path in the <code>crawldiagnostics</code> subset must
 * be split into its crawl name, segment and subset.
 */
@Test
public void testSupplementalWarcFilenameCrawlDiagnostics() throws FilenameParseError {
    final String warcPath = "projects/cc-open-athena-test/CC-SUPPLEMENTAL-2026-22/segments/20260522204839/crawldiagnostics/CC-SUPPLEMENTAL-2026-22-20260522204937-20260523014937-00119.warc.gz";

    final FilenameParts parsed = CCWarcFilenameParser.getParts(warcPath);

    // crawl name, segment and subset are checked in path order
    assertEquals("CC-SUPPLEMENTAL-2026-22", parsed.crawl);
    assertEquals("20260522204839", parsed.segment);
    assertEquals("crawldiagnostics", parsed.subset);
}

/**
 * A supplemental crawl path in the <code>robotstxt</code> subset must be
 * split into its crawl name, segment and subset.
 */
@Test
public void testSupplementalWarcFilenameRobotsTxt() throws FilenameParseError {
    final String warcPath = "projects/cc-open-athena-test/CC-SUPPLEMENTAL-2026-22/segments/20260522204839/robotstxt/CC-SUPPLEMENTAL-2026-22-20260522204937-20260523014937-00119.warc.gz";

    final FilenameParts parsed = CCWarcFilenameParser.getParts(warcPath);

    // crawl name, segment and subset are checked in path order
    assertEquals("CC-SUPPLEMENTAL-2026-22", parsed.crawl);
    assertEquals("20260522204839", parsed.segment);
    assertEquals("robotstxt", parsed.subset);
}

@Test
public void testMainWat() throws FilenameParseError {
String filename = "crawl-data/CC-MAIN-2018-47/segments/1542039741324.15/wat/CC-MAIN-20181113153141-20181113174452-00011.warc.wat.gz";
Expand Down