commoncrawl · malteos · Jun 3, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/README.md b/README.md
@@ -21,6 +21,12 @@ need an extra:
 uv sync --extra notebooks
 ```
 
+`ccoa tokenize` needs the HuggingFace `transformers` stack:
+
+```bash
+uv sync --extra tokenize
+```
+
 ## CLI
 
 ```bash
@@ -189,6 +195,59 @@ when a worker dies on adversarial HTML the pool drops the suspect file
 and continues; a follow-up resume run will retry the dropped files.
 
 
+### Tokenize
+
+`ccoa tokenize` reads the per-WARC text-extraction cache produced by
+`ccoa classify-warc --cache-dir <uri>`, tokenizes each record with a
+fast HuggingFace tokenizer, and writes a parquet file, one row per record:
+
+```
+cache_path: string, record_index: int32, n_tokens: int32, token_ids: list<int32>
+```
+
+It also writes a sidecar `<output>.summary.csv` with run metadata and a
+token-count distribution (count/min/max/mean/median/p10..p99/total),
+mirroring the `classify-warc` summary shape.
+
+```bash
+uv sync --extra tokenize
+export HF_TOKEN=<your token with the model's license accepted>
+uv run ccoa tokenize \
+  --cache-paths 's3://commoncrawl-dev/cc-focus-tools/warc-text-extract-cache/s3/commoncrawl/crawl-data/CC-MAIN-2025-51/segments/*/warc/*.warc.gz.jsonl.gz' \
+  --files-limit 1 --records-per-file-limit 100 \
+  --workers 4 --progress-every 25 \
+  --output /tmp/tokens.parquet
+```
+
+`--cache-paths` accepts one or more URIs or globs; matches must be
+gzipped-JSONL cache files (`{"index": N, "text": "..."}` per line) as
+produced by `classify-warc --cache-dir`. Each cache file maps 1:1 to a
+source WARC and is the unit of work for `--workers` parallelism.
+
+`--tokenizer` defaults to `meta-llama/Llama-2-7b`, which is gated —
+accept the license on HuggingFace, then set `HF_TOKEN` (or run
+`huggingface-cli login`). Override with any HuggingFace repo id; the
+tokenizer must resolve to a fast (Rust) variant for thread-mode safety.
+
+`--workers-mode thread` (default) shares one tokenizer instance across
+worker threads — HF fast tokenizers release the GIL and are
+thread-safe. `--workers-mode process` loads a separate tokenizer per
+worker process; pick it if you must use a slow tokenizer.
+
+`--batch-size N` (default 64) controls how many texts are handed to the
+tokenizer per call (fast tokenizers vectorize internally — bigger is
+faster up to a point). `--progress-every N` logs a per-file heartbeat
+every N tokenized records; each file completion always logs a
+`progress — files=K/M elapsed=... eta=~...` line, as in `classify-warc`.
+
+`--output` accepts a local path or any fsspec URI (e.g.
+`s3://bucket/key.parquet`). To overwrite an existing output, pass
+`--overwrite`.
+
+The cache JSONL stores `index` + `text` only — no URL. The parquet's
+`cache_path` is the source JSONL URI; downstream code can reverse it to
+a WARC URI if the `--cache-dir` prefix is known.
+
 ## Development
 
 ```bash

diff --git a/pyproject.toml b/pyproject.toml
@@ -13,17 +13,21 @@ dependencies = [
     "huggingface-hub>=0.24",
     "fsspec>=2024.0",
     "pandas",
+    "pyarrow>=15",
 ]
 
 [project.optional-dependencies]
 notebooks = ["jupyter", "matplotlib", "scipy", "nbconvert"]
 # Required for reading/writing `s3://...` URIs (WARC inputs, classify-warc
 # output, extraction cache, resume-from-output).
 s3 = ["s3fs"]
+# Required for `ccoa tokenize` (HuggingFace tokenizer over the text cache).
+tokenize = ["transformers>=4.40", "sentencepiece", "protobuf"]
 # Convenience: install every optional dependency at once.
 all = [
     "ccoa[notebooks]",
     "ccoa[s3]",
+    "ccoa[tokenize]",
 ]
 
 [project.scripts]

diff --git a/src/ccoa/cli.py b/src/ccoa/cli.py
@@ -7,6 +7,7 @@
 import sys
 
 from ccoa.commands.classify_warc import ClassifyWarcCommand
+from ccoa.commands.tokenize import TokenizeCommand
 
 LOG_LEVELS = ["debug", "info", "warning", "error", "critical"]
 
@@ -22,7 +23,7 @@ def main(argv: list[str] | None = None) -> int:
     )
     subparsers = parser.add_subparsers(dest="command", required=True)
 
-    commands = [ClassifyWarcCommand()]
+    commands = [ClassifyWarcCommand(), TokenizeCommand()]
     command_map: dict[str, object] = {}
     for cmd in commands:
         sub = subparsers.add_parser(cmd.name, help=cmd.help, description=cmd.help)