From 3881d2cefb72c215edfe56ca8a744edd2348ff43 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 Apr 2026 21:17:36 +0200 Subject: [PATCH 01/10] feat: integrate the CC0 library from @belief-driven-design blog-uuidv7 --- src/java/org/commoncrawl/util/UUIDv7.java | 238 +++++++++++ src/java/org/commoncrawl/util/WarcWriter.java | 2 +- .../org/commoncrawl/util/UUIDv7Tests.java | 380 ++++++++++++++++++ 3 files changed, 619 insertions(+), 1 deletion(-) create mode 100644 src/java/org/commoncrawl/util/UUIDv7.java create mode 100644 src/test/org/commoncrawl/util/UUIDv7Tests.java diff --git a/src/java/org/commoncrawl/util/UUIDv7.java b/src/java/org/commoncrawl/util/UUIDv7.java new file mode 100644 index 0000000000..f71c983083 --- /dev/null +++ b/src/java/org/commoncrawl/util/UUIDv7.java @@ -0,0 +1,238 @@ +package org.commoncrawl.util; + +import java.security.SecureRandom; +import java.time.Clock; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Thread-safe implementation of UUIDv7 (RFC 9562). + *

+ * Layout (128 bits total): + *

+ * MSB (Most Significant Bits - 64 bits):
+ *   [48 bits] Unix epoch timestamp (milliseconds)
+ *   [ 4 bits] Version = 7 (0111)
+ *   [12 bits] Sub-millisecond sequence counter (for monotonicity)
+ *
+ * LSB (Least Significant Bits - 64 bits):
+ *   [ 2 bits] Variant = 2 (10 binary - IETF RFC variant)
+ *   [62 bits] Random bits (collision resistance)
+ * 
+ *

+ * Monotonicity Guarantee: + *

+ * + *

Thread Safety:

+ * Uses lock-free atomic operations (CAS) for high-performance concurrent UUID generation. + * State is packed into a single AtomicLong for efficient atomic updates. + * + * @spec https://www.rfc-editor.org/info/rfc9562 + * RFC 9562: Universally Unique IDentifiers (UUIDs) + * + * NOTE: This class was borrowed from https://github.com/belief-driven-design/blog-uuidv7 + * Copyright to the original author, and licensed as CC0. + */ +public final class UUIDv7 { + + /** + * Singleton that ensures process-wide monotonicity for all UUIDs generated + * via the public API. + */ + private final static UUIDv7 SHARED = new UUIDv7(Clock.systemUTC()); + + /** + * Time source for generating timestamps. + * Injectable for testing purposes. + */ + private final Clock clock; + + /** + * Packed state containing both timestamp and sequence counter. + * + *

Layout of the 64-bit long: + *

+     *   [52 bits] timestamp (only lower 48 bits used, upper 4 bits ignored)
+     *   [12 bits] sequence counter
+     * 
+ */ + private final AtomicLong state = new AtomicLong(0L); + + /** + * Source of cryptographic randomness for the lower 62 bits of the UUID. + * + * @implNote SecureRandom is used to prevent predictability, which is + * important if UUIDs might be exposed in URLs or logs. + * For pure collision resistance without security requirements, a faster + * RandomGenerator could be substituted. + */ + private final SecureRandom random = new SecureRandom(); + + /** + * Package-private constructor for dependency injection during testing. + */ + UUIDv7(Clock clock) { + this.clock = clock; + } + + /** + * Generates a new UUIDv7 with monotonic ordering guarantees. + *

+ * Lock-free algorithm using Compare-And-Swap (CAS) to ensure thread-safety + * and monotonicity even under high concurrency. + * + * @implSpec + * Algorithm Steps: + *

+ * 1. Read current packed state (timestamp + sequence)
+ * 2. Read current wall-clock time
+ * 3. Calculate next state based on time progression
+ * 4. Atomically update state (retry on CAS failure)
+ * 5. Build and return UUID from confirmed state
+ *
+ * Monotonicity Rules: + * + * + * @return UUIDv7 that is guaranteed to be lexicographically greater than + * all previously generated UUIDs from this instance + */ + UUID generate() { + while (true) { + // STEP 1: Read current generator state. + // Format: [52-bit timestamp][12-bit sequence]) + long currentState = this.state.get(); + + // Unsigned right shift extracts upper 52 bits + long prevTimestamp = currentState >>> 12; + // Mask extracts lower 12 bits + int prevSequence = (int) (currentState & 0xFFFL); + + // STEP 2: Read current epoch value + long timestamp = this.clock.millis(); + + // STEP 3: Calculate next state + long sequence; + + if (timestamp > prevTimestamp) { + // SCENARIO 1: Clock advanced to a new millisecond. + // Reset sequence counter to 0. + sequence = 0; + } + else { + // SCENARIO 2: Either same millisecond OR clock moved backward (NTP adjustment). + // In both cases, preserve monotonicity by: + // 1. Using the previous timestamp (ignore backward movement) + // 2. Incrementing the sequence counter + timestamp = prevTimestamp; + sequence = prevSequence + 1; + + // Boundary Check: Sequence counter is only 12 bits (0-4095) + if (sequence > 0xFFF) { + // If we've generated 4096 UUIDs in the same millisecond, + // we use Spin-wait until the clock advances. + // Thread.onSpinWait() hints to the CPU that we're in a busy-wait loop + while (this.clock.millis() <= prevTimestamp) { + Thread.onSpinWait(); + } + + // Retry with the new timestamp + continue; + } + } + + // STEP 4: Pack the new state. + // Format: [52-bit timestamp][12-bit sequence]) + long nextState = (timestamp << 12) | sequence; + + // STEP 5: Update State atomically. + // compareAndSet ensures that if another thread modified state between + // our read and this update, we'll retry the entire operation. + if (this.state.compareAndSet(currentState, nextState)) { + // STEP 5: Encode values into UUID + return buildUUID(timestamp, sequence); + } + + // At this point CAS failed. 
+ // Retry with the new state + } + } + + /** + * Encodes timestamp and sequence counter into a UUIDv7. + *

+ * Constructs the 128-bit UUID according to RFC 9562 layout: + * + *

+     * MSB (64 bits):
+     *   Bits  0-47: Unix timestamp (milliseconds)
+     *   Bits 48-51: Version = 7 (0111 binary)
+     *   Bits 52-63: Sequence counter (12 bits)
+     *
+     * LSB (64 bits):
+     *   Bits  0- 1: Variant = 2 (10 binary, meaning bits are 10xxxxxx...)
+     *   Bits  2-63: Random bits (62 bits)
+     * 
+ * + * @param timestamp the Unix epoch millisecond timestamp (only lower 48 bits used) + * @param sequence the sub-millisecond sequence counter (0-4095) + * @return a properly formatted UUIDv7 + */ + private UUID buildUUID(long timestamp, long sequence) { + + // STEP 1: HIGH BITS CONSTRUCTION (Most Significant Bits) + + // Start with timestamp in the leftmost 48 bits + // Mask ensures we only use 48 bits: 0xFFFFFFFFFFFF = 48 set bits + // Left shift by 16 to make room for version (4 bits) + sequence (12 bits) + long msb = (timestamp & 0xFFFFFFFFFFFFL) << 16; + + // OR in the version field: 7 in binary is 0111 + // 0x7000 = 0111 0000 0000 0000 in binary (version 7 in correct position) + msb |= 0x7000L; + + // OR in the sequence counter in the lowest 12 bits + // Mask ensures sequence fits in 12 bits: 0xFFF = 0000 1111 1111 1111 + msb |= (sequence & 0xFFFL); + + // STEP 2: LOW BITS CONSTRUCTION (Least Significant Bits) + + long randomBits = this.random.nextLong(); + + // Set variant bits: must be 10 (binary) per RFC 9562 + // 0x3FFFFFFFFFFFFFFF clears top 2 bits: 00111111... + // 0x8000000000000000 sets top bit to 1: 10000000... + // Result: 10xxxxxx... where x = random bits + long lsb = (randomBits & 0x3FFFFFFFFFFFFFFFL) | 0x8000000000000000L; + + // STEP 3: Construct the UUID from the two 64-bit longs + return new UUID(msb, lsb); + } + + /** + * Static factory to generate a new UUIDv7. + *

+ * This method provides a drop-in replacement for {@link UUID#randomUUID()} + * with the added benefits of time-ordering and monotonicity. + *

+ * All UUIDs generated through this method share the same monotonic sequence, + * ensuring process-wide ordering guarantees. + * + * @return a new UUIDv7 + */ + public static UUID randomUUID() { + return SHARED.generate(); + } + +} diff --git a/src/java/org/commoncrawl/util/WarcWriter.java b/src/java/org/commoncrawl/util/WarcWriter.java index aa9b20ba3b..48f270cd2f 100644 --- a/src/java/org/commoncrawl/util/WarcWriter.java +++ b/src/java/org/commoncrawl/util/WarcWriter.java @@ -457,7 +457,7 @@ protected static void writeWarcKeyValue(StringBuilder sb, String key, } private String getUUID() { - return UUID.randomUUID().toString(); + return UUIDv7.randomUUID().toString(); } public URI getRecordId() { diff --git a/src/test/org/commoncrawl/util/UUIDv7Tests.java b/src/test/org/commoncrawl/util/UUIDv7Tests.java new file mode 100644 index 0000000000..8d6d083024 --- /dev/null +++ b/src/test/org/commoncrawl/util/UUIDv7Tests.java @@ -0,0 +1,380 @@ +package org.commoncrawl.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.time.Clock; +import java.time.Instant; +import java.time.ZoneOffset; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Executors; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.jupiter.api.Test; + +/** + * This class was borrowed from https://github.com/belief-driven-design/blog-uuidv7 + * Copyright to the original author, and licensed as CC0. 
+ */ +class UUIDv7Tests { + + /** + * Validates that generated UUIDs have correct version (7) and variant (2) fields + * as required by RFC 9562 for proper UUID classification. + */ + @Test + void versionAndVariantAreCorrect() { + UUID result = UUIDv7.randomUUID(); + + assertEquals(7, result.version(), "must be UUIDv7"); + assertEquals(2, result.variant(), "must be IETF variant"); + } + + /** + * Verifies strict monotonic ordering in a single-threaded context. + *

+ * Tests both code paths: + *

+ */ + @Test + void idsAreStrictlyIncreasingInSingleThread() { + MutableClock clock = new MutableClock(1_234_567_890_123L); + UUIDv7 generator = new UUIDv7(clock); + + UUID prev = generator.generate(); + + for (int i = 0; i < 10_000; i++) { + UUID next = generator.generate(); + + assertTrue(compareUnsignedLex(prev, next) < 0, "not monotonic: prev=" + prev + ", next=" + next); + + prev = next; + + // Occasionally advance time so both code paths are exercised + // (same millisecond vs. new millisecond) + if ((i % 257) == 0) { + clock.addMillis(1); + } + } + } + + /** + * Validates that clock regression (e.g., NTP drift) doesn't break monotonicity. + */ + @Test + void clockRollbackDoesNotBreakMonotonicity() { + MutableClock clock = new MutableClock(10_000L); + UUIDv7 generator = new UUIDv7(clock); + + UUID a = generator.generate(); + clock.addMillis(1); + UUID b = generator.generate(); + + // Simulate wall-clock going backwards (e.g., NTP correction) + clock.setMillis(9_000L); + UUID c = generator.generate(); + + assertTrue(compareUnsignedLex(a, b) < 0); + assertTrue(compareUnsignedLex(b, c) < 0, "must not go backward when clock regresses"); + } + + /** + * Validates uniqueness under high concurrency. + *

Spawns multiple threads that simultaneously generate UUIDs and verifies: + *

+ */ + @Test + void concurrentGenerationProducesUniqueIds() throws Exception { + int threads = Runtime.getRuntime().availableProcessors() - 2; + int perThread = 5_000; + + Set all = ConcurrentHashMap.newKeySet(threads * perThread); + + ExecutorService pool = Executors.newFixedThreadPool(threads); + CountDownLatch start = new CountDownLatch(1); + + for (int t = 0; t < threads; t++) { + pool.submit(() -> { + start.await(); + for (int i = 0; i < perThread; i++) { + all.add(UUIDv7.randomUUID()); + } + return null; + }); + } + + start.countDown(); + pool.shutdown(); + assertTrue(pool.awaitTermination(5, TimeUnit.SECONDS)); + + assertEquals(threads * perThread, all.size(), "no duplicates allowed"); + } + + /** + * Tests sequence overflow behavior when generating 4096+ UUIDs in a single millisecond. + *

+ * This test verifies: + *

+ * • Generator correctly handles sequence overflow
+ * • All UUIDs remain unique and monotonic
+ * • Generator waits for clock advancement rather than wrapping the counter
+ */ + @Test + void sequenceOverflowCausesClockWait() { + MutableClock clock = new MutableClock(123_456_789L); + UUIDv7 generator = new UUIDv7(clock); + + List batch = new ArrayList<>(); + + // Generate exactly 4096 UUIDs (fills the sequence counter: 0-4095) + for (int i = 0; i < 4096; i++) { + batch.add(generator.generate()); + } + + // Next generation should trigger overflow handling. + // Start a thread that will generate the 4097th UUID. + // We need to do this in a separate thread to not call Thread.spinWait() + // on the main thread. + + // This is the synchronization signal between the main test thread and the overflow thread. + AtomicInteger finalizedCount = new AtomicInteger(0); + Thread overflowThread = new Thread(() -> { + UUID overflow = generator.generate(); + batch.add(overflow); + finalizedCount.incrementAndGet(); + }); + + overflowThread.start(); + + // Give the thread time to enter the spin-wait loop + try { + Thread.sleep(50); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + + // Thread should be waiting, not completed yet + assertEquals(0, finalizedCount.get(), "should be waiting for clock to advance"); + + // Advance the clock to release the waiting thread + clock.addMillis(1); + + // Wait for the thread to complete + try { + overflowThread.join(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + + assertEquals(1, finalizedCount.get(), "should have completed after clock advance"); + assertEquals(4097, batch.size(), "should have all UUIDs"); + + // Verify all UUIDs are unique + assertEquals(4097, new HashSet<>(batch).size(), "all UUIDs must be unique"); + + // Verify strict monotonicity + for (int i = 1; i < batch.size(); i++) { + assertTrue(compareUnsignedLex(batch.get(i - 1), batch.get(i)) < 0, + "UUIDs must be strictly increasing"); + } + } + + /** + * Validates the bit-level structure of generated UUIDs according to RFC 9562. + *

+ * Extracts and validates individual fields: + *

+ * • Timestamp: MSB bits 0-47
+ * • Version (7): MSB bits 48-51
+ * • Sequence: MSB bits 52-63
+ * • Variant (2): top 2 bits of LSB
+ */ + @Test + void bitLevelStructureIsCorrect() { + long clockMs = 1_234_567_890_000L; + MutableClock clock = new MutableClock(clockMs); + UUIDv7 generator = new UUIDv7(clock); + + UUID first = generator.generate(); + UUID second = generator.generate(); + + // Extract fields from MSB + long msb1 = first.getMostSignificantBits(); + long timestamp1 = msb1 >>> 16; // Top 48 bits + int version1 = (int) ((msb1 >>> 12) & 0xFL); // 4 bits after timestamp + int sequence1 = (int) (msb1 & 0xFFFL); // Bottom 12 bits + + long msb2 = second.getMostSignificantBits(); + long timestamp2 = msb2 >>> 16; + int version2 = (int) ((msb2 >>> 12) & 0xFL); + int sequence2 = (int) (msb2 & 0xFFFL); + + // Validate version + assertEquals(7, version1, "version must be 7"); + assertEquals(7, version2, "version must be 7"); + + // Validate timestamp matches clock + assertEquals(clockMs, timestamp1, "timestamp should match clock"); + assertEquals(clockMs, timestamp2, "timestamp should match clock"); + + // Validate sequence increments (same millisecond) + assertEquals(0, sequence1, "first UUID should have sequence 0"); + assertEquals(1, sequence2, "second UUID should have sequence 1"); + + // Extract and validate variant from LSB + long lsb1 = first.getLeastSignificantBits(); + long lsb2 = second.getLeastSignificantBits(); + + // Variant bits should be 10 (binary), meaning the top bit is 1 and second bit is 0 + // In hex, this means the first nibble of LSB is in range [8, 9, a, b] + assertTrue((lsb1 & 0x8000000000000000L) != 0, "variant bit 0 must be 1"); + assertTrue((lsb1 & 0x4000000000000000L) == 0, "variant bit 1 must be 0"); + assertTrue((lsb2 & 0x8000000000000000L) != 0, "variant bit 0 must be 1"); + assertTrue((lsb2 & 0x4000000000000000L) == 0, "variant bit 1 must be 0"); + + // Validate random bits are different between UUIDs + // Mask out the variant bits and compare + long random1 = lsb1 & 0x3FFFFFFFFFFFFFFFL; + long random2 = lsb2 & 0x3FFFFFFFFFFFFFFFL; + assertNotEquals(random1, 
random2, "random bits should differ between UUIDs"); + } + + /** + * Stress test with extreme concurrency to validate robustness. + */ + @Test + void stressTestWithHighConcurrency() throws Exception { + int threads = Runtime.getRuntime().availableProcessors() * 2; + int perThread = 50_000; + + Set all = ConcurrentHashMap.newKeySet(threads * perThread); + ExecutorService pool = Executors.newFixedThreadPool(threads); + CountDownLatch start = new CountDownLatch(1); + CountDownLatch done = new CountDownLatch(threads); + + for (int t = 0; t < threads; t++) { + pool.submit(() -> { + try { + start.await(); + for (int i = 0; i < perThread; i++) { + all.add(UUIDv7.randomUUID()); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } finally { + done.countDown(); + } + }); + } + + start.countDown(); + assertTrue(done.await(30, TimeUnit.SECONDS), "stress test should complete in reasonable time"); + pool.shutdown(); + + assertEquals(threads * perThread, all.size(), "all UUIDs must be unique under stress"); + } + + /** + * Validates that UUIDs generated at different times have increasing timestamps. + */ + @Test + void timestampIncreasesOverTime() { + MutableClock clock = new MutableClock(1_000_000L); + UUIDv7 generator = new UUIDv7(clock); + + UUID first = generator.generate(); + clock.addMillis(100); + UUID second = generator.generate(); + clock.addMillis(200); + UUID third = generator.generate(); + + long ts1 = first.getMostSignificantBits() >>> 16; + long ts2 = second.getMostSignificantBits() >>> 16; + long ts3 = third.getMostSignificantBits() >>> 16; + + assertEquals(1_000_000L, ts1); + assertEquals(1_000_100L, ts2); + assertEquals(1_000_300L, ts3); + + assertTrue(ts1 < ts2); + assertTrue(ts2 < ts3); + } + + // HELPERS + + /** + * Compares two UUIDs lexicographically using unsigned comparison. + *

+ * Java's {@link UUID#compareTo(UUID)} uses signed long comparison, + * which doesn't match the lexicographic byte ordering required by UUIDv7. + * + * @param a first UUID + * @param b second UUID + * @return negative if a < b, zero if a == b, positive if a > b (lexicographically) + */ + private static int compareUnsignedLex(UUID a, UUID b) { + int msb = Long.compareUnsigned(a.getMostSignificantBits(), b.getMostSignificantBits()); + if (msb != 0) { + return msb; + } + return Long.compareUnsigned(a.getLeastSignificantBits(), b.getLeastSignificantBits()); + } + + /** + * Mutable Clock implementation for testing. + */ + private static final class MutableClock extends Clock { + + private volatile long millis; + + MutableClock(long initialMillis) { + this.millis = initialMillis; + } + + void setMillis(long ms) { + this.millis = ms; + } + + void addMillis(long delta) { + this.millis += delta; + } + + @Override + public ZoneOffset getZone() { + return ZoneOffset.UTC; + } + + @Override + public Clock withZone(java.time.ZoneId zone) { + return this; + } + + @Override + public long millis() { + return this.millis; + } + + @Override + public Instant instant() { + return Instant.ofEpochMilli(this.millis); + } + } +} \ No newline at end of file From 15ba592a336c310c202a6c38e553d46fde7291e2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 Apr 2026 21:30:44 +0200 Subject: [PATCH 02/10] feat: use the timestamp for generating UUID of type 7 --- src/java/org/commoncrawl/util/UUIDv7.java | 21 +++++++++ src/java/org/commoncrawl/util/WarcWriter.java | 25 ++++++++--- .../org/commoncrawl/util/UUIDv7Tests.java | 44 +++++++++++++++++-- 3 files changed, 80 insertions(+), 10 deletions(-) diff --git a/src/java/org/commoncrawl/util/UUIDv7.java b/src/java/org/commoncrawl/util/UUIDv7.java index f71c983083..1e9d449934 100644 --- a/src/java/org/commoncrawl/util/UUIDv7.java +++ b/src/java/org/commoncrawl/util/UUIDv7.java @@ -235,4 +235,25 @@ public static UUID randomUUID() { return 
SHARED.generate(); } + /** + * Creates a UUIDv7 from an explicit Unix epoch millisecond timestamp. + *

+ * This is a stateless factory method: it does not participate in the + * monotonic sequence maintained by {@link #randomUUID()}. The caller + * is responsible for ensuring timestamp ordering. + *

+ * The sequence counter is set to 0 and the lower 62 bits are filled + * with cryptographically secure random data. + * + * @param timestamp Unix epoch milliseconds (must fit in 48 bits) + * @return a UUIDv7 embedding the given timestamp + * @throws IllegalArgumentException if timestamp is negative or >= 2^48 + */ + public static UUID fromTimestamp(long timestamp) { + if ((timestamp >> 48) != 0) { + throw new IllegalArgumentException( + "Timestamp does not fit in 48 bits: " + timestamp); + } + return SHARED.buildUUID(timestamp, 0); + } } diff --git a/src/java/org/commoncrawl/util/WarcWriter.java b/src/java/org/commoncrawl/util/WarcWriter.java index 48f270cd2f..f3d55d5ec5 100644 --- a/src/java/org/commoncrawl/util/WarcWriter.java +++ b/src/java/org/commoncrawl/util/WarcWriter.java @@ -197,7 +197,7 @@ public URI writeWarcinfoRecord(String filename, String hostname, writeWarcKeyValue(sb, settings); byte[] ba = sb.toString().getBytes(StandardCharsets.UTF_8); - URI recordId = getRecordId(); + URI recordId = getRecordId(date.getTime()); writeRecord(WARC_INFO, date, CONTENT_TYPE_METADATA, recordId, extra, new ByteArrayInputStream(ba), ba.length); @@ -222,7 +222,7 @@ public URI writeWarcRequestRecord(final URI targetUri, final String ip, } } - URI recordId = getRecordId(); + URI recordId = getRecordId(date.getTime()); writeRecord(WARC_REQUEST, date, "application/http; msgtype=request", recordId, extra, block); return recordId; @@ -265,7 +265,7 @@ public URI writeWarcResponseRecord(final URI targetUri, final String ip, extra.put(WARC_IDENTIFIED_PAYLOAD_TYPE, content.getContentType()); - URI recordId = getRecordId(); + URI recordId = getRecordId(date.getTime()); writeRecord(WARC_RESPONSE, date, CONTENT_TYPE_RESPONSE, recordId, extra, block); return recordId; } @@ -305,7 +305,7 @@ public URI writeWarcRevisitRecord(final URI targetUri, final String ip, extra.put(WARC_BLOCK_DIGEST, blockDigest); } - URI recordId = getRecordId(); + URI recordId = getRecordId(date.getTime()); 
writeRecord(WARC_REVISIT, date, CONTENT_TYPE_RESPONSE, recordId, extra, block); return recordId; } @@ -322,7 +322,7 @@ public URI writeWarcMetadataRecord(final URI targetUri, final Date date, extra.put(WARC_BLOCK_DIGEST, blockDigest); } - URI recordId = getRecordId(); + URI recordId = getRecordId(date.getTime()); writeRecord(WARC_METADATA, date, CONTENT_TYPE_METADATA, recordId, extra, block); return recordId; } @@ -339,7 +339,7 @@ public URI writeWarcConversionRecord(final URI targetUri, final Date date, extra.put(WARC_BLOCK_DIGEST, blockDigest); } - URI recordId = getRecordId(); + URI recordId = getRecordId(date.getTime()); writeRecord(WARC_CONVERSION, date, contentType, recordId, extra, block); return recordId; } @@ -460,6 +460,11 @@ private String getUUID() { return UUIDv7.randomUUID().toString(); } + private String getUUID(long timestamp) { + return UUIDv7.fromTimestamp(timestamp).toString(); + } + + public URI getRecordId() { try { return new URI("urn:uuid:" + getUUID()); @@ -468,6 +473,14 @@ public URI getRecordId() { } } + public URI getRecordId(long timestamp) { + try { + return new URI("urn:uuid:" + getUUID(timestamp)); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } + protected static String getMeta(Metadata metadata, String name) { String value = metadata.get(name); if (value == null) { diff --git a/src/test/org/commoncrawl/util/UUIDv7Tests.java b/src/test/org/commoncrawl/util/UUIDv7Tests.java index 8d6d083024..b5e9f68632 100644 --- a/src/test/org/commoncrawl/util/UUIDv7Tests.java +++ b/src/test/org/commoncrawl/util/UUIDv7Tests.java @@ -21,10 +21,6 @@ import org.junit.jupiter.api.Test; -/** - * This class was borrowed from https://github.com/belief-driven-design/blog-uuidv7 - * Copyright to the original author, and licensed as CC0. 
- */ class UUIDv7Tests { /** @@ -318,6 +314,46 @@ void timestampIncreasesOverTime() { assertTrue(ts2 < ts3); } + // --- fromTimestamp tests --- + + @Test + void fromTimestampHasCorrectVersionAndVariant() { + UUID result = UUIDv7.fromTimestamp(1_234_567_890_000L); + + assertEquals(7, result.version(), "must be UUIDv7"); + assertEquals(2, result.variant(), "must be IETF variant"); + } + + @Test + void fromTimestampEmbedsCorrectTimestamp() { + long clockMs = 1_234_567_890_000L; + UUID result = UUIDv7.fromTimestamp(clockMs); + + long extracted = result.getMostSignificantBits() >>> 16; + assertEquals(clockMs, extracted, "timestamp should match input"); + } + + @Test + void fromTimestampHasSequenceZero() { + UUID result = UUIDv7.fromTimestamp(1_234_567_890_000L); + + int sequence = (int) (result.getMostSignificantBits() & 0xFFFL); + assertEquals(0, sequence, "sequence should be 0"); + } + + @Test + void fromTimestampRejectsInvalidTimestamps() { + // Negative + org.junit.jupiter.api.Assertions.assertThrows( + IllegalArgumentException.class, + () -> UUIDv7.fromTimestamp(-1L)); + + // Exceeds 48 bits + org.junit.jupiter.api.Assertions.assertThrows( + IllegalArgumentException.class, + () -> UUIDv7.fromTimestamp(1L << 48)); + } + // HELPERS /** From 77e9c294779286e39dd14928389ff1008869566e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Apr 2026 11:02:42 +0200 Subject: [PATCH 03/10] feat: overload method supplying the sequence --- src/java/org/commoncrawl/util/UUIDv7.java | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/java/org/commoncrawl/util/UUIDv7.java b/src/java/org/commoncrawl/util/UUIDv7.java index 1e9d449934..763431022f 100644 --- a/src/java/org/commoncrawl/util/UUIDv7.java +++ b/src/java/org/commoncrawl/util/UUIDv7.java @@ -256,4 +256,30 @@ public static UUID fromTimestamp(long timestamp) { } return SHARED.buildUUID(timestamp, 0); } + + /** + * Creates a UUIDv7 from an explicit Unix epoch millisecond timestamp + * and sequence 
counter. + *

+ * This is a stateless factory method: it does not participate in the + * monotonic sequence maintained by {@link #randomUUID()}. The caller + * is responsible for ensuring timestamp and sequence ordering. + * + * @param timestamp Unix epoch milliseconds (must fit in 48 bits) + * @param sequence sub-millisecond sequence counter (0-4095) + * @return a UUIDv7 embedding the given timestamp and sequence + * @throws IllegalArgumentException if timestamp is negative or >= 2^48, + * or if sequence is outside the range 0-4095 + */ + public static UUID fromTimestamp(long timestamp, int sequence) { + if ((timestamp >> 48) != 0) { + throw new IllegalArgumentException( + "Timestamp does not fit in 48 bits: " + timestamp); + } + if (sequence < 0 || sequence > 0xFFF) { + throw new IllegalArgumentException( + "Sequence must be in range 0-4095: " + sequence); + } + return SHARED.buildUUID(timestamp, sequence); + } } From 969ea44510bbe2fd63d9e6a1a407b0e305a4ff25 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Apr 2026 11:03:12 +0200 Subject: [PATCH 04/10] feat: unit tests --- .../org/commoncrawl/util/UUIDv7Tests.java | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/src/test/org/commoncrawl/util/UUIDv7Tests.java b/src/test/org/commoncrawl/util/UUIDv7Tests.java index b5e9f68632..7df3240063 100644 --- a/src/test/org/commoncrawl/util/UUIDv7Tests.java +++ b/src/test/org/commoncrawl/util/UUIDv7Tests.java @@ -354,6 +354,92 @@ void fromTimestampRejectsInvalidTimestamps() { () -> UUIDv7.fromTimestamp(1L << 48)); } + // --- fromTimestamp(long, int) tests --- + + @Test + void fromTimestampWithSequenceHasCorrectVersionAndVariant() { + UUID result = UUIDv7.fromTimestamp(1_234_567_890_000L, 42); + + assertEquals(7, result.version(), "must be UUIDv7"); + assertEquals(2, result.variant(), "must be IETF variant"); + } + + @Test + void fromTimestampWithSequenceEmbedsCorrectTimestamp() { + long clockMs = 1_234_567_890_000L; + UUID result = 
UUIDv7.fromTimestamp(clockMs, 10); + + long extracted = result.getMostSignificantBits() >>> 16; + assertEquals(clockMs, extracted, "timestamp should match input"); + } + + @Test + void fromTimestampWithSequenceEmbedsCorrectSequence() { + UUID result = UUIDv7.fromTimestamp(1_234_567_890_000L, 99); + + int sequence = (int) (result.getMostSignificantBits() & 0xFFFL); + assertEquals(99, sequence, "sequence should match input"); + } + + @Test + void fromTimestampWithSequenceProducesOrderedUUIDs() { + long ts = 1_234_567_890_000L; + UUID first = UUIDv7.fromTimestamp(ts, 0); + UUID second = UUIDv7.fromTimestamp(ts, 1); + UUID third = UUIDv7.fromTimestamp(ts, 2); + + // Same timestamp, increasing sequence → MSB should be strictly ordered + assertTrue(Long.compareUnsigned( + first.getMostSignificantBits(), + second.getMostSignificantBits()) < 0, + "sequence 0 < sequence 1"); + assertTrue(Long.compareUnsigned( + second.getMostSignificantBits(), + third.getMostSignificantBits()) < 0, + "sequence 1 < sequence 2"); + } + + @Test + void fromTimestampWithSequenceRejectsInvalidSequence() { + long ts = 1_234_567_890_000L; + + // Negative sequence + org.junit.jupiter.api.Assertions.assertThrows( + IllegalArgumentException.class, + () -> UUIDv7.fromTimestamp(ts, -1)); + + // Exceeds 12 bits + org.junit.jupiter.api.Assertions.assertThrows( + IllegalArgumentException.class, + () -> UUIDv7.fromTimestamp(ts, 4096)); + } + + @Test + void fromTimestampWithSequenceRejectsInvalidTimestamp() { + // Negative timestamp + org.junit.jupiter.api.Assertions.assertThrows( + IllegalArgumentException.class, + () -> UUIDv7.fromTimestamp(-1L, 0)); + + // Exceeds 48 bits + org.junit.jupiter.api.Assertions.assertThrows( + IllegalArgumentException.class, + () -> UUIDv7.fromTimestamp(1L << 48, 0)); + } + + @Test + void fromTimestampWithSequenceBoundaryValues() { + // Minimum valid values + UUID min = UUIDv7.fromTimestamp(0L, 0); + assertEquals(0L, min.getMostSignificantBits() >>> 16); + assertEquals(0, 
(int) (min.getMostSignificantBits() & 0xFFFL)); + + // Maximum valid timestamp (2^48 - 1) and sequence (4095) + UUID max = UUIDv7.fromTimestamp((1L << 48) - 1, 4095); + assertEquals((1L << 48) - 1, max.getMostSignificantBits() >>> 16); + assertEquals(4095, (int) (max.getMostSignificantBits() & 0xFFFL)); + } + // HELPERS /** From 3c04c744197905b77768e37fa708890bc6ee418f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Apr 2026 11:44:20 +0200 Subject: [PATCH 05/10] fix: unit tests naming convention --- .../org/commoncrawl/util/{UUIDv7Tests.java => TestUUIDv7.java} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename src/test/org/commoncrawl/util/{UUIDv7Tests.java => TestUUIDv7.java} (99%) diff --git a/src/test/org/commoncrawl/util/UUIDv7Tests.java b/src/test/org/commoncrawl/util/TestUUIDv7.java similarity index 99% rename from src/test/org/commoncrawl/util/UUIDv7Tests.java rename to src/test/org/commoncrawl/util/TestUUIDv7.java index 7df3240063..7395f8f17d 100644 --- a/src/test/org/commoncrawl/util/UUIDv7Tests.java +++ b/src/test/org/commoncrawl/util/TestUUIDv7.java @@ -21,7 +21,7 @@ import org.junit.jupiter.api.Test; -class UUIDv7Tests { +class TestUUIDv7 { /** * Validates that generated UUIDs have correct version (7) and variant (2) fields From cf7331ea6eeeeca395499a3bff66d4ef027b8599 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Apr 2026 11:44:50 +0200 Subject: [PATCH 06/10] fix: deprecate non-timestamp provided UUID generated methods --- src/java/org/commoncrawl/util/WarcWriter.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/java/org/commoncrawl/util/WarcWriter.java b/src/java/org/commoncrawl/util/WarcWriter.java index f3d55d5ec5..b42287dec1 100644 --- a/src/java/org/commoncrawl/util/WarcWriter.java +++ b/src/java/org/commoncrawl/util/WarcWriter.java @@ -456,6 +456,13 @@ protected static void writeWarcKeyValue(StringBuilder sb, String key, 
sb.append(key).append(COLONSP).append(value).append(CRLF); } + /** + * This method is deprecated with the introduction of UUID version 7, which embeds a timestamp + * component. We use the capture timestamp for composing the UUID. + * + * @see #getUUID(long) + */ + @Deprecated private String getUUID() { return UUIDv7.randomUUID().toString(); } @@ -464,7 +471,13 @@ private String getUUID(long timestamp) { return UUIDv7.fromTimestamp(timestamp).toString(); } - + /** + * This method is deprecated with the introduction of UUID version 7, which embeds a timestamp + * component. We use the capture timestamp for composing the UUID. + * + * @see #getRecordId(long) + */ + @Deprecated public URI getRecordId() { try { return new URI("urn:uuid:" + getUUID()); From c6a462e2e19d9121d2aafe30ab442e0b45b17279 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Apr 2026 12:39:23 +0200 Subject: [PATCH 07/10] feat: update unit tests --- src/test/org/commoncrawl/util/TestWarcWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 4f7344010d..5117ccd9ca 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -52,8 +52,8 @@ public void testWriteRevisitRecordContentType() throws Exception { int httpStatusCode = 304; Date date = HttpDateFormat.toDate(metadata.get("date")); - URI warcinfoId = writer.getRecordId(); - URI relatedId = writer.getRecordId(); + URI warcinfoId = writer.getRecordId(date.getTime()); + URI relatedId = writer.getRecordId(date.getTime()); String warcProfile = WarcWriter.PROFILE_REVISIT_IDENTICAL_DIGEST; Date refersToDate = new Date(System.currentTimeMillis() - 3600000); String payloadDigest = "sha1:abc123"; String blockDigest = "sha1:def456"; From 8031e8dfa9e71ff65a162f6cdd3bdc59c8eda900 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Apr 2026 
12:39:39 +0200 Subject: [PATCH 08/10] fix: make unit tests work locally --- src/test/org/commoncrawl/util/TestWarcWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index 5117ccd9ca..d2b5baaa3b 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -39,7 +39,7 @@ public void testWriteRevisitRecordContentType() throws Exception { ByteArrayOutputStream bos = new ByteArrayOutputStream(); WarcWriter writer = new WarcWriter(bos); - File segmentDir = new File(System.getProperty("test.build.data", "."), "test-segments/20260224170658-revisit"); + File segmentDir = new File(System.getProperty("test.build.data", "src/testresources"), "test-segments/20260224170658-revisit"); assertNotNull(segmentDir, "Missing segment resource"); String segmentPath = segmentDir.getAbsolutePath(); String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"; From 3e537154ec0c5987f7033f1f2446857b7c54420a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Apr 2026 13:58:18 +0200 Subject: [PATCH 09/10] fix: use random sequence --- src/java/org/commoncrawl/util/UUIDv7.java | 5 ++++- src/test/org/commoncrawl/util/TestUUIDv7.java | 18 +++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/java/org/commoncrawl/util/UUIDv7.java b/src/java/org/commoncrawl/util/UUIDv7.java index 763431022f..02896728f6 100644 --- a/src/java/org/commoncrawl/util/UUIDv7.java +++ b/src/java/org/commoncrawl/util/UUIDv7.java @@ -254,7 +254,10 @@ public static UUID fromTimestamp(long timestamp) { throw new IllegalArgumentException( "Timestamp does not fit in 48 bits: " + timestamp); } - return SHARED.buildUUID(timestamp, 0); + // RFC 9562 Section 6.2 Method 1: fill the 12-bit sub-millisecond + // field with random data. 
+ int random12 = SHARED.random.nextInt(0x1000); + return SHARED.buildUUID(timestamp, random12); } /** diff --git a/src/test/org/commoncrawl/util/TestUUIDv7.java b/src/test/org/commoncrawl/util/TestUUIDv7.java index 7395f8f17d..493fb7d18a 100644 --- a/src/test/org/commoncrawl/util/TestUUIDv7.java +++ b/src/test/org/commoncrawl/util/TestUUIDv7.java @@ -334,11 +334,19 @@ void fromTimestampEmbedsCorrectTimestamp() { } @Test - void fromTimestampHasSequenceZero() { - UUID result = UUIDv7.fromTimestamp(1_234_567_890_000L); - - int sequence = (int) (result.getMostSignificantBits() & 0xFFFL); - assertEquals(0, sequence, "sequence should be 0"); + void fromTimestampUsesRandom12Bits() { + long ts = 1_234_567_890_000L; + Set<Integer> seen = new HashSet<>(); + for (int i = 0; i < 100; i++) { + UUID result = UUIDv7.fromTimestamp(ts); + int rand12 = (int) (result.getMostSignificantBits() & 0xFFFL); + seen.add(rand12); + } + // With 100 draws from 4096 values, a truly random field produces only + // one distinct value (e.g. all zeros) with probability (1/4096)^99, so + // seeing fewer than 2 distinct values is astronomically unlikely. 
+ assertTrue(seen.size() > 1, + "sub-millisecond field should be random, not fixed"); } @Test From 7bc68cc04ff8270adf9bb0c90d14a8a4a72ac564 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 2 Apr 2026 13:58:40 +0200 Subject: [PATCH 10/10] tests: add test on segment for warc writer --- .../org/commoncrawl/util/TestWarcWriter.java | 165 +++++++++++++----- .../util/test/SegmenterRecordReader.java | 1 - 2 files changed, 119 insertions(+), 47 deletions(-) diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java index d2b5baaa3b..2ec6478dd9 100644 --- a/src/test/org/commoncrawl/util/TestWarcWriter.java +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -26,58 +26,131 @@ import java.io.ByteArrayOutputStream; import java.io.File; import java.net.URI; +import java.util.ArrayList; import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.UUID; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; public class TestWarcWriter { - @Test - public void testWriteRevisitRecordContentType() throws Exception { - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - WarcWriter writer = new WarcWriter(bos); - - File segmentDir = new File(System.getProperty("test.build.data", "src/testresources"), "test-segments/20260224170658-revisit"); - assertNotNull(segmentDir, "Missing segment resource"); - String segmentPath = segmentDir.getAbsolutePath(); - String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"; - - Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); - URI targetUri = new URI(content.getUrl()); - - Metadata 
metadata = content.getMetadata(); - String ip = content.getMetadata().get("_ip_"); - int httpStatusCode = 304; - - Date date = HttpDateFormat.toDate(metadata.get("date")); - URI warcinfoId = writer.getRecordId(date.getTime()); - URI relatedId = writer.getRecordId(date.getTime()); - String warcProfile = WarcWriter.PROFILE_REVISIT_IDENTICAL_DIGEST; - Date refersToDate = new Date(System.currentTimeMillis() - 3600000); - String payloadDigest = "sha1:abc123"; - String blockDigest = "sha1:def456"; - - writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, - warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest, - blockDigest, null, null, content.getContent(), content); - - byte[] compressed = bos.toByteArray(); - ByteArrayInputStream bis = new ByteArrayInputStream(compressed); - GZIPInputStream gis = new GZIPInputStream(bis); - ByteArrayOutputStream decompressed = new ByteArrayOutputStream(); - gis.transferTo(decompressed); - - String warcOutput = decompressed.toString(); - - assertTrue(warcOutput.contains("WARC-Type: revisit"), - "WARC record should have WARC-Type: revisit"); - assertTrue(warcOutput.contains("Content-Type: application/http; msgtype=response"), - "WARC revisit record should have Content-Type: application/http; msgtype=response"); - assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"), - "WARC record should have WARC-Refers-To-Target-URI header"); - assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile), - "WARC record should have WARC-Profile header"); - } + @Test + public void testWriteRevisitRecordContentType() throws Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + WarcWriter writer = new WarcWriter(bos); + + File segmentDir = new File(System.getProperty("test.build.data", "src/testresources"), "test-segments/20260224170658-revisit"); + assertNotNull(segmentDir, "Missing segment resource"); + String segmentPath = 
segmentDir.getAbsolutePath(); + String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"; + + Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); + assertThat("Revisit record should not have any payload or content", + content.getContent(), is(new byte[]{})); + URI targetUri = new URI(content.getUrl()); + + Metadata metadata = content.getMetadata(); + String ip = content.getMetadata().get("_ip_"); + int httpStatusCode = 304; + + Date date = HttpDateFormat.toDate(metadata.get("date")); + URI warcinfoId = writer.getRecordId(date.getTime()); + URI relatedId = writer.getRecordId(date.getTime()); + String warcProfile = WarcWriter.PROFILE_REVISIT_IDENTICAL_DIGEST; + Date refersToDate = new Date(System.currentTimeMillis() - 3600000); + String payloadDigest = "sha1:abc123"; + String blockDigest = "sha1:def456"; + + writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, + warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest, + blockDigest, null, null, content.getContent(), content); + + byte[] compressed = bos.toByteArray(); + ByteArrayInputStream bis = new ByteArrayInputStream(compressed); + GZIPInputStream gis = new GZIPInputStream(bis); + ByteArrayOutputStream decompressed = new ByteArrayOutputStream(); + gis.transferTo(decompressed); + + String warcOutput = decompressed.toString(); + + assertTrue(warcOutput.contains("WARC-Type: revisit"), + "WARC record should have WARC-Type: revisit"); + assertTrue(warcOutput.contains("Content-Type: application/http; msgtype=response"), + "WARC revisit record should have Content-Type: application/http; msgtype=response"); + assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"), + "WARC record should have WARC-Refers-To-Target-URI header"); + assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile), + "WARC record should have WARC-Profile header"); + } + + @Test + public void testWriteRecordWithUUID7() throws 
Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + WarcWriter writer = new WarcWriter(bos); + + File segmentDir = new File(System.getProperty("test.build.data", "src/testresources"), "test-segments/20150309101656"); + assertNotNull(segmentDir, "Missing segment resource"); + String segmentPath = segmentDir.getAbsolutePath(); + + String url = "http://avro.apache.org/"; + Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); + + URI targetUri = new URI(content.getUrl()); + Metadata metadata = content.getMetadata(); + String ip = content.getMetadata().get("_ip_"); + String contentType = content.getMetadata().get("Content-Type"); + int httpStatusCode = 200; + + Date date = HttpDateFormat.toDate(metadata.get("Date")); + URI warcinfoId = writer.getRecordId(date.getTime()); + String payloadDigest = "sha1:abc123"; + String blockDigest = "sha1:def456"; + + URI recordId = writer.getRecordId(date.getTime()); + + writer.writeWarcinfoRecord("output", "avro.apache.org", "CCF", "CCF", "CCBot", warcinfoId.toString(), "blablabal", date); + writer.writeWarcRequestRecord(targetUri, ip, date, warcinfoId, null, null, content.getContent()); + writer.writeWarcResponseRecord(targetUri, ip, httpStatusCode, date, warcinfoId, recordId, payloadDigest, blockDigest, "False", null, null, content.getContent(), content); + + byte[] compressed = bos.toByteArray(); + ByteArrayInputStream bis = new ByteArrayInputStream(compressed); + StringBuilder allRecords = new StringBuilder(); + while (bis.available() > 0) { + GZIPInputStream gis = new GZIPInputStream(bis); + ByteArrayOutputStream decompressed = new ByteArrayOutputStream(); + gis.transferTo(decompressed); + allRecords.append(decompressed.toString()); + } + String warcOutput = allRecords.toString(); + + Pattern pattern = Pattern.compile("WARC-Record-ID: <urn:uuid:([^>]+)>"); + Matcher matcher = pattern.matcher(warcOutput); + List<UUID> recordIds = new ArrayList<>(); + while (matcher.find()) { + 
recordIds.add(UUID.fromString(matcher.group(1))); + } + + assertEquals(3, recordIds.size(), "should have 3 WARC-Record-IDs"); + + assertEquals(3, new HashSet<>(recordIds).size(), + "all record IDs must be unique"); + + long expectedTs = date.getTime(); + for (UUID uuid : recordIds) { + assertEquals(7, uuid.version(), "must be UUIDv7"); + assertEquals(2, uuid.variant(), "must be IETF variant"); + long embedded = uuid.getMostSignificantBits() >>> 16; + assertEquals(expectedTs, embedded, + "timestamp must match capture date"); + } + } } diff --git a/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java index 62057f4e17..5570ac54bf 100644 --- a/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java +++ b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java @@ -28,7 +28,6 @@ private int run(String path, String url) throws Exception { Content c = new Content(); readers[0].get(k, c); assert (c.getUrl().equals(url)); - assert (c.getContent() == null || c.getContent().length == 0); this.content = c; return 0;