diff --git a/README.md b/README.md index 134198b..457f620 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,12 @@ need an extra: uv sync --extra notebooks ``` +`ccoa tokenize` needs the HuggingFace `transformers` stack: + +```bash +uv sync --extra tokenize +``` + ## CLI ```bash @@ -189,6 +195,59 @@ when a worker dies on adversarial HTML the pool drops the suspect file and continues; a follow-up resume run will retry the dropped files. +### Tokenize + +`ccoa tokenize` reads the per-WARC text-extraction cache produced by +`ccoa classify-warc --cache-dir <DIR>`, tokenizes each record with a +fast HuggingFace tokenizer, and writes a per-record parquet: + +``` +cache_path: string, record_index: int32, n_tokens: int32, token_ids: list<int32> +``` + +Plus a sidecar `<output-stem>.summary.csv` with run metadata and a token-count +distribution (count/min/max/mean/median/p10..p99/total) mirroring the +`classify-warc` summary shape. + +```bash +uv sync --extra tokenize +export HF_TOKEN=<your-hf-token> +uv run ccoa tokenize \ + --cache-paths 's3://commoncrawl-dev/cc-focus-tools/warc-text-extract-cache/s3/commoncrawl/crawl-data/CC-MAIN-2025-51/segments/*/warc/*.warc.gz.jsonl.gz' \ + --files-limit 1 --records-per-file-limit 100 \ + --workers 4 --progress-every 25 \ + --output /tmp/tokens.parquet +``` + +`--cache-paths` accepts one or more URIs or globs; matches must be +gzipped-JSONL cache files (`{"index": N, "text": "..."}` per line) as +produced by `classify-warc --cache-dir`. Each cache file maps 1:1 to a +source WARC and is the unit of work for `--workers` parallelism. + +`--tokenizer` defaults to `meta-llama/Llama-2-7b`, which is gated — +accept the license on HuggingFace, then set `HF_TOKEN` (or run +`huggingface-cli login`). Override with any HuggingFace repo id; the +tokenizer must resolve to a fast (Rust) variant for thread-mode safety. + +`--workers-mode thread` (default) shares one tokenizer instance across +worker threads — HF fast tokenizers release the GIL and are +thread-safe. 
`--workers-mode process` loads a separate tokenizer per +worker process (heavier RAM); slow tokenizers are rejected in both modes. + +`--batch-size N` (default 64) controls how many texts are handed to the +tokenizer per call (fast tokenizers vectorize internally — bigger is +faster up to a point). `--progress-every N` logs a per-file heartbeat +every N tokenized records; per-file completion lines always log +`progress — files=K/M elapsed=... eta=~...` like classify-warc. + +`--output` accepts a local path or any fsspec URI (e.g. +`s3://bucket/key.parquet`). To overwrite an existing output, pass +`--overwrite`. + +The cache JSONL stores `index` + `text` only — no URL. The parquet's +`cache_path` is the source JSONL URI; downstream code can reverse it to +a WARC URI if the `--cache-dir` prefix is known. + ## Development ```bash diff --git a/pyproject.toml b/pyproject.toml index 43fb291..94b7d69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "huggingface-hub>=0.24", "fsspec>=2024.0", "pandas", + "pyarrow>=15", ] [project.optional-dependencies] @@ -20,10 +21,13 @@ notebooks = ["jupyter", "matplotlib", "scipy", "nbconvert"] # Required for reading/writing `s3://...` URIs (WARC inputs, classify-warc # output, extraction cache, resume-from-output). s3 = ["s3fs"] +# Required for `ccoa tokenize` (HuggingFace tokenizer over the text cache). +tokenize = ["transformers>=4.40", "sentencepiece", "protobuf"] # Convenience: install every optional dependency at once. 
all = [ "ccoa[notebooks]", "ccoa[s3]", + "ccoa[tokenize]", ] [project.scripts] diff --git a/src/ccoa/cli.py b/src/ccoa/cli.py index 1ff57d2..9d27db9 100644 --- a/src/ccoa/cli.py +++ b/src/ccoa/cli.py @@ -7,6 +7,7 @@ import sys from ccoa.commands.classify_warc import ClassifyWarcCommand +from ccoa.commands.tokenize import TokenizeCommand LOG_LEVELS = ["debug", "info", "warning", "error", "critical"] @@ -22,7 +23,7 @@ def main(argv: list[str] | None = None) -> int: ) subparsers = parser.add_subparsers(dest="command", required=True) - commands = [ClassifyWarcCommand()] + commands = [ClassifyWarcCommand(), TokenizeCommand()] command_map: dict[str, object] = {} for cmd in commands: sub = subparsers.add_parser(cmd.name, help=cmd.help, description=cmd.help) diff --git a/src/ccoa/commands/tokenize.py b/src/ccoa/commands/tokenize.py new file mode 100644 index 0000000..ae9d0de --- /dev/null +++ b/src/ccoa/commands/tokenize.py @@ -0,0 +1,535 @@ +"""Tokenize text-cache records with a HuggingFace tokenizer. + +Reads the per-WARC text-extraction cache produced by +`ccoa classify-warc --cache-dir ` (gzipped JSONL with one +`{"index": N, "text": "..."}` line per response record), tokenizes each +text with a fast HuggingFace tokenizer, and writes a per-record parquet +of `(cache_path, record_index, n_tokens, token_ids)`. A sidecar +`.summary.csv` captures CLI args, counters, and a token-count +distribution. + +Example: + ```bash + uv sync --extra tokenize + export HF_TOKEN= + uv run ccoa tokenize \\ + --cache-paths 's3://commoncrawl-dev/cc-focus-tools/warc-text-extract-cache/s3/.../*.warc.gz.jsonl.gz' \\ + --files-limit 1 --records-per-file-limit 100 \\ + --workers 4 --progress-every 25 \\ + --output /tmp/tokens.parquet + ``` + +Each input cache file maps 1:1 to a WARC, so per-file parallelism is the +natural unit of work. 
HuggingFace Fast tokenizers (Rust) release the GIL +and are thread-safe — the default `--workers-mode thread` shares one +loaded tokenizer instance with no per-call lock. +""" + +from __future__ import annotations + +import argparse +import datetime as _dt +import logging +import os +import threading +import time +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from concurrent.futures.process import BrokenProcessPool +from dataclasses import dataclass, field + +import pyarrow as pa + +from ccoa.commands import BaseCommand +from ccoa.extraction.cache import load_extraction_cache +from ccoa.tokenizer.hf import DEFAULT_TOKENIZER_REPO, load_tokenizer, tokenize_batch +from ccoa.utils.fs.paths import ( + output_exists, + resolve_warc_paths, + s3_storage_options, +) +from ccoa.utils.io.parquet import TOKENIZE_SCHEMA, open_parquet_writer +from ccoa.utils.reporting.summary import ( + format_duration, + log_tokens_summary, + write_tokenize_summary, +) + +logger = logging.getLogger(__name__) + + +# Per-process tokenizer state. In thread mode the parent loads the +# tokenizer once and passes it to each worker; in process mode each +# worker rebuilds `_WORKER_TOKENIZER` via the pool initializer. +_WORKER_TOKENIZER: object | None = None + + +@dataclass +class FileResult: + """Aggregated per-file outcome returned by `process_one_file`.""" + + cache_uri: str + rows: list[tuple[str, int, int, list[int]]] = field(default_factory=list) + processed: int = 0 + skipped_empty: int = 0 + t_load: float = 0.0 + t_tokenize: float = 0.0 + + +def process_one_file( + cache_uri: str, + tokenizer: object, + args: argparse.Namespace, +) -> FileResult: + """Tokenize all records in one cache file and return its `FileResult`. + + Loads the gzipped-JSONL cache into memory, applies + `args.records_per_file_limit` (if set), drops empty texts, then + batches the surviving texts through `tokenize_batch`. Records are + emitted in ascending record-index order for determinism. 
+ """ + result = FileResult(cache_uri=cache_uri) + storage_options = s3_storage_options(cache_uri, args.anonymous_s3, args.s3_requester_pays) + + logger.info("Loading cache %s", cache_uri) + t0 = time.perf_counter() + entries = load_extraction_cache(cache_uri, storage_options) + result.t_load = time.perf_counter() - t0 + + if not entries: + logger.warning("Cache %s yielded no entries; skipping.", cache_uri) + return result + + indices = sorted(entries) + if args.records_per_file_limit > 0: + indices = indices[: args.records_per_file_limit] + + # Drop empties up front so the per-batch arrays line up with the + # tokenizer output. + surviving: list[int] = [] + for idx in indices: + text = entries[idx] + if not text or not text.strip(): + result.skipped_empty += 1 + continue + surviving.append(idx) + + progress_every = getattr(args, "progress_every", 0) or 0 + batch_size = max(1, args.batch_size) + + for i in range(0, len(surviving), batch_size): + batch_idx = surviving[i : i + batch_size] + batch_text = [entries[j] for j in batch_idx] + t0 = time.perf_counter() + ids_list = tokenize_batch(tokenizer, batch_text) + result.t_tokenize += time.perf_counter() - t0 + for j, ids in zip(batch_idx, ids_list, strict=True): + result.rows.append((cache_uri, j, len(ids), ids)) + result.processed += 1 + if progress_every and result.processed % progress_every == 0: + logger.info( + " %s: processed=%d skipped_empty=%d", + cache_uri, + result.processed, + result.skipped_empty, + ) + + return result + + +def _tokenize_summary_path(output: str) -> str: + """Return the sidecar summary path for `output`. + + Always emits `.summary.csv` (the sidecar is CSV regardless of the + main output's extension — typically `.parquet` for this command). 
+ """ + root, _ = os.path.splitext(output) + return f"{root}.summary.csv" + + +def _process_pool_initializer(tokenizer_repo: str) -> None: + """Per-worker setup for `ProcessPoolExecutor`: load the tokenizer once.""" + global _WORKER_TOKENIZER # noqa: PLW0603 + _WORKER_TOKENIZER = load_tokenizer(tokenizer_repo) + + +def _process_pool_worker( + payload: tuple[str, argparse.Namespace], +) -> FileResult: + """Top-level pickleable adapter that runs `process_one_file` in a worker.""" + cache_uri, args = payload + return process_one_file(cache_uri, _WORKER_TOKENIZER, args) + + +def _rows_to_table(rows: list[tuple[str, int, int, list[int]]]) -> pa.Table: + """Convert `FileResult.rows` into a `pyarrow.Table` matching `TOKENIZE_SCHEMA`.""" + cache_paths = [r[0] for r in rows] + record_indices = [r[1] for r in rows] + n_tokens = [r[2] for r in rows] + token_ids = [r[3] for r in rows] + return pa.table( + { + "cache_path": pa.array(cache_paths, type=pa.string()), + "record_index": pa.array(record_indices, type=pa.int32()), + "n_tokens": pa.array(n_tokens, type=pa.int32()), + "token_ids": pa.array(token_ids, type=pa.list_(pa.int32())), + }, + schema=TOKENIZE_SCHEMA, + ) + + +class TokenizeCommand(BaseCommand): + """Tokenize text-cache records with a HuggingFace tokenizer.""" + + name = "tokenize" + help = "Tokenize text-cache records with a HuggingFace tokenizer; write parquet." + + def add_arguments(self, parser: argparse.ArgumentParser) -> None: + """Register CLI flags for the tokenize subcommand.""" + parser.add_argument( + "--cache-paths", + nargs="+", + required=True, + metavar="URI", + help=( + "One or more text-cache URIs or glob patterns " + "(e.g. 's3://my-bucket/cache/s3/commoncrawl/.../*.warc.gz.jsonl.gz'). " + "Each match is a gzipped-JSONL cache file as produced by " + "`classify-warc --cache-dir`. Globs are expanded via fsspec." 
+ ), + ) + parser.add_argument( + "--tokenizer", + default=DEFAULT_TOKENIZER_REPO, + metavar="REPO", + help=( + "HuggingFace repo id of the tokenizer " + f"(default: {DEFAULT_TOKENIZER_REPO}). Must resolve to a fast " + "(Rust) tokenizer for thread-mode safety. Gated repos require " + "HF_TOKEN env var or `huggingface-cli login`." + ), + ) + parser.add_argument( + "--records-limit", + type=int, + default=0, + help=( + "Max number of records to tokenize across all selected files " + "(0 = unlimited). Incompatible with --workers > 1." + ), + ) + parser.add_argument( + "--records-per-file-limit", + type=int, + default=0, + help="Max number of records per cache file (0 = unlimited).", + ) + parser.add_argument( + "--files-limit", + type=int, + default=0, + help="Max number of input cache files after glob expansion and shuffle (0 = unlimited).", + ) + parser.add_argument( + "--shuffle-files", + action="store_true", + help="Shuffle the resolved file list (deterministic via --seed) before --files-limit.", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Seed for --shuffle-files (default: 42).", + ) + parser.add_argument( + "--workers", + type=int, + default=1, + help=( + "Number of cache files tokenized concurrently (default: 1). " + "Incompatible with --records-limit; use --records-per-file-limit " + "for a per-file cap." + ), + ) + parser.add_argument( + "--workers-mode", + choices=["thread", "process"], + default="thread", + help=( + "How to parallelise across files when --workers > 1. 'thread' " + "(default) shares one fast tokenizer in this process; HF fast " + "tokenizers release the GIL and are thread-safe. 'process' " + "loads a separate tokenizer per worker process (heavier RAM)." + ), + ) + parser.add_argument( + "--max-pool-restarts", + type=int, + default=10, + metavar="N", + help=( + "In --workers-mode process, restart the pool up to N times when " + "a worker dies (BrokenProcessPool). The suspect file is dropped " + "from this run. 
0 = unlimited; default 10." + ), + ) + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help=( + "Number of texts handed to the tokenizer per call (default: 64). " + "HF fast tokenizers vectorize batches internally." + ), + ) + parser.add_argument( + "--progress-every", + type=int, + default=1000, + metavar="N", + help=( + "Log a per-file progress heartbeat every N tokenized records " + "(default: 1000; 0 disables)." + ), + ) + parser.add_argument( + "--output", + required=True, + metavar="PATH", + help=( + "Output parquet path. Local path or any fsspec URL " + "(e.g. s3://bucket/key.parquet). Suggest the .parquet extension." + ), + ) + parser.add_argument( + "--overwrite", + action="store_true", + help=( + "Overwrite an existing --output (and its `.summary` sidecar) " + "instead of failing fast. Off by default to protect prior runs." + ), + ) + parser.add_argument( + "--anonymous-s3", + action="store_true", + help="Force anonymous S3 access (only for buckets that allow it).", + ) + parser.add_argument( + "--s3-requester-pays", + action="store_true", + help="Set RequestPayer=requester for S3 access.", + ) + + def run(self, args: argparse.Namespace) -> int: + """Resolve cache files, tokenize each record, write parquet + summary.""" + logger.info("Running tokenize") + + if args.workers > 1 and args.records_limit > 0: + logger.error( + "--records-limit cannot be combined with --workers > 1; " + "use --records-per-file-limit (works in parallel) or set --workers 1." 
+ ) + return 2 + + output_storage_options = s3_storage_options( + args.output, args.anonymous_s3, args.s3_requester_pays + ) + summary_uri = _tokenize_summary_path(args.output) + summary_storage_options = s3_storage_options( + summary_uri, args.anonymous_s3, args.s3_requester_pays + ) + for path, opts in ( + (args.output, output_storage_options), + (summary_uri, summary_storage_options), + ): + if output_exists(path, opts): + if args.overwrite: + logger.warning("Overwriting existing output: %s", path) + else: + logger.error( + "Output already exists: %s. Refusing to overwrite; " + "delete the file(s), pick a fresh --output path, or " + "pass --overwrite to replace them.", + path, + ) + return 2 + + logger.info("Resolving cache paths: %s", args.cache_paths) + resolved = resolve_warc_paths( + args.cache_paths, + anonymous=args.anonymous_s3, + requester_pays=args.s3_requester_pays, + shuffle=args.shuffle_files, + seed=args.seed, + files_limit=args.files_limit, + ) + logger.info( + "Resolved %d cache files (shuffle=%s files_limit=%d)", + len(resolved), + args.shuffle_files, + args.files_limit, + ) + for uri in resolved: + logger.debug(" selected %s", uri) + + if not resolved: + logger.warning("No cache files matched --cache-paths; nothing to tokenize.") + log_tokens_summary([]) + return 0 + + logger.info("Loading tokenizer %s", args.tokenizer) + try: + parent_tokenizer = load_tokenizer(args.tokenizer) + except Exception as exc: + logger.error( + "Failed to load tokenizer %s: %s: %s", args.tokenizer, type(exc).__name__, exc + ) + return 2 + + use_process_pool = args.workers > 1 and args.workers_mode == "process" + if use_process_pool: + logger.info( + "Worker mode 'process'; freeing parent-side tokenizer and deferring " + "load to each of %d worker processes.", + args.workers, + ) + # Workers reload via the pool initializer; drop the parent copy. 
+ parent_tokenizer = None # noqa: F841 + + n_tokens_all: list[int] = [] + processed = 0 + skipped_empty = 0 + files_done = 0 + files_total = len(resolved) + limit = args.records_limit + t_load_total = 0.0 + t_tokenize_total = 0.0 + + logger.info("Writing output to %s (workers=%d)", args.output, args.workers) + logger.info("Summary will be written to %s", summary_uri) + + started_at = _dt.datetime.now(_dt.UTC).isoformat() + t_processing_start = time.perf_counter() + + writer_lock = threading.Lock() + # Inner-scope state so `_aggregate` can capture `writer` and the + # totals via `nonlocal` (mirrors classify-warc's structure). + with open_parquet_writer(args.output, TOKENIZE_SCHEMA, output_storage_options) as writer: + + def _aggregate(result: FileResult) -> None: + """Append `result` to the parquet and fold its counters into totals.""" + nonlocal processed, skipped_empty, t_load_total, t_tokenize_total + nonlocal files_done + if result.rows: + table = _rows_to_table(result.rows) + with writer_lock: + writer.write_table(table) + n_tokens_all.extend(r[2] for r in result.rows) + processed += result.processed + skipped_empty += result.skipped_empty + t_load_total += result.t_load + t_tokenize_total += result.t_tokenize + files_done += 1 + logger.info( + "Finished %s — processed=%d skipped_empty=%d", + result.cache_uri, + processed, + skipped_empty, + ) + elapsed = time.perf_counter() - t_processing_start + mean_per_file = elapsed / files_done + eta = mean_per_file * (files_total - files_done) + logger.info( + "progress — files=%d/%d elapsed=%s eta=~%s", + files_done, + files_total, + format_duration(elapsed), + format_duration(eta), + ) + + if args.workers == 1: + for uri in resolved: + _aggregate(process_one_file(uri, parent_tokenizer, args)) + if limit and processed >= limit: + logger.info("Reached --records-limit %d; stopping.", limit) + break + elif args.workers_mode == "process": + remaining_uris = list(resolved) + restarts = 0 + max_restarts = 
args.max_pool_restarts + while remaining_uris: + completed_in_pool = 0 + try: + with ProcessPoolExecutor( + max_workers=args.workers, + initializer=_process_pool_initializer, + initargs=(args.tokenizer,), + ) as pool: + payloads = [(u, args) for u in remaining_uris] + for result in pool.map(_process_pool_worker, payloads): + _aggregate(result) + completed_in_pool += 1 + break + except BrokenProcessPool: + suspect_idx = completed_in_pool + if suspect_idx >= len(remaining_uris): + logger.error( + "Process pool died after consuming all results; " + "aborting (cannot identify a culprit to drop)." + ) + raise + suspect = remaining_uris[suspect_idx] + restarts += 1 + if max_restarts and restarts > max_restarts: + logger.error( + "Process pool died %d times (limit %d); aborting. " + "Last suspect: %s.", + restarts, + max_restarts, + suspect, + ) + raise + logger.error( + "Process pool worker died after %d files in this pool; " + "dropping suspect %s and restarting (restart %d/%s).", + completed_in_pool, + suspect, + restarts, + "∞" if max_restarts == 0 else str(max_restarts), + ) + remaining_uris = remaining_uris[suspect_idx + 1 :] + else: + with ThreadPoolExecutor(max_workers=args.workers) as pool: + for result in pool.map( + lambda u: process_one_file(u, parent_tokenizer, args), + resolved, + ): + _aggregate(result) + + t_processing = time.perf_counter() - t_processing_start + + logger.info( + "Total records tokenized: %d (skipped %d empty)", + processed, + skipped_empty, + ) + log_tokens_summary(n_tokens_all) + + finished_at = _dt.datetime.now(_dt.UTC).isoformat() + write_tokenize_summary( + summary_uri, + summary_storage_options, + args=args, + resolved_count=len(resolved), + n_tokens=n_tokens_all, + processed=processed, + skipped_empty=skipped_empty, + t_processing=t_processing, + t_load_total=t_load_total, + t_tokenize_total=t_tokenize_total, + started_at=started_at, + finished_at=finished_at, + ) + logger.info("Wrote summary %s", summary_uri) + + return 0 diff 
--git a/src/ccoa/tokenizer/__init__.py b/src/ccoa/tokenizer/__init__.py new file mode 100644 index 0000000..b38288f --- /dev/null +++ b/src/ccoa/tokenizer/__init__.py @@ -0,0 +1 @@ +"""HuggingFace tokenizer wrappers (currently: AutoTokenizer-based loader).""" diff --git a/src/ccoa/tokenizer/hf.py b/src/ccoa/tokenizer/hf.py new file mode 100644 index 0000000..86bee4a --- /dev/null +++ b/src/ccoa/tokenizer/hf.py @@ -0,0 +1,57 @@ +"""Load and apply a HuggingFace tokenizer to plain text. + +Only the *fast* (Rust-backed) `PreTrainedTokenizerFast` variants are +supported. Fast tokenizers are thread-safe and release the GIL during +encoding, so the `tokenize` command can share a single tokenizer +instance across worker threads without a lock. + +Gated repos (e.g. `meta-llama/Llama-2-7b`) require a HuggingFace access +token; the underlying `huggingface_hub` library picks up `HF_TOKEN` from +the environment or `~/.cache/huggingface/token` automatically. Accept +the model's license on the HuggingFace web UI first, then either +`export HF_TOKEN=...` or run `huggingface-cli login`. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + +DEFAULT_TOKENIZER_REPO = "meta-llama/Llama-2-7b" + + +def load_tokenizer(repo: str = DEFAULT_TOKENIZER_REPO) -> PreTrainedTokenizerBase: + """Download (cached) and load a fast HuggingFace tokenizer from `repo`. + + Raises `RuntimeError` if the resolved tokenizer is the slow Python + variant — those are not thread-safe and the `tokenize` command's + thread mode would race. + """ + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(repo, use_fast=True) + if not getattr(tokenizer, "is_fast", False): + raise RuntimeError( + f"Tokenizer for {repo!r} resolved to the slow Python variant " + f"({type(tokenizer).__name__}); fast tokenizers are required for " + f"thread-mode safety. 
Install the matching fast-tokenizer extra " + f"for this model, or run with --workers-mode process." + ) + return tokenizer + + +def tokenize_batch(tokenizer: PreTrainedTokenizerBase, texts: list[str]) -> list[list[int]]: + """Tokenize `texts` and return one list of token ids per input. + + No special tokens, no padding, no truncation — the caller wants raw + token sequences for counting and length analysis. + """ + encoded = tokenizer( + texts, + add_special_tokens=False, + padding=False, + truncation=False, + ) + return [list(ids) for ids in encoded["input_ids"]] diff --git a/src/ccoa/utils/io/__init__.py b/src/ccoa/utils/io/__init__.py new file mode 100644 index 0000000..7a5045a --- /dev/null +++ b/src/ccoa/utils/io/__init__.py @@ -0,0 +1 @@ +"""I/O helpers (parquet writers, etc.) used by CLI commands.""" diff --git a/src/ccoa/utils/io/parquet.py b/src/ccoa/utils/io/parquet.py new file mode 100644 index 0000000..8e7a03c --- /dev/null +++ b/src/ccoa/utils/io/parquet.py @@ -0,0 +1,50 @@ +"""Streaming parquet writer over an fsspec-backed file handle. + +Used by `ccoa tokenize` to write per-record token output to a local +path or `s3://...` URI in a single pass, without buffering the full +dataset in memory. +""" + +from __future__ import annotations + +import contextlib +from collections.abc import Iterator +from typing import TYPE_CHECKING + +import fsspec +import pyarrow as pa +import pyarrow.parquet as pq + +if TYPE_CHECKING: + pass + +TOKENIZE_SCHEMA = pa.schema( + [ + ("cache_path", pa.string()), + ("record_index", pa.int32()), + ("n_tokens", pa.int32()), + ("token_ids", pa.list_(pa.int32())), + ] +) + + +@contextlib.contextmanager +def open_parquet_writer( + uri: str, + schema: pa.Schema, + storage_options: dict[str, object], + *, + compression: str = "zstd", +) -> Iterator[pq.ParquetWriter]: + """Yield a `pyarrow.parquet.ParquetWriter` bound to `uri`. 
+ + The underlying file handle is opened via `fsspec.open(..., 'wb')`, so + local paths and any fsspec-supported URI (`s3://`, etc.) both work. + Both the writer and the file handle are closed on exit. + """ + with fsspec.open(uri, "wb", **storage_options) as raw: + writer = pq.ParquetWriter(raw, schema, compression=compression) + try: + yield writer + finally: + writer.close() diff --git a/src/ccoa/utils/reporting/summary.py b/src/ccoa/utils/reporting/summary.py index 05d1008..1c10180 100644 --- a/src/ccoa/utils/reporting/summary.py +++ b/src/ccoa/utils/reporting/summary.py @@ -250,3 +250,118 @@ def _format_arg_value(value: object) -> str: if isinstance(value, list | tuple): return ";".join(_format_arg_value(item) for item in value) return str(value) + + +def log_tokens_summary(n_tokens: list[int]) -> None: + """Log one INFO line summarising the token-count distribution.""" + if not n_tokens: + logger.warning("No records tokenized.") + return + stats = compute_score_stats([float(n) for n in n_tokens]) + logger.info( + "tokens stats — count=%d min=%s p10=%s p25=%s p50=%s p75=%s p90=%s p95=%s p99=%s " + "max=%s mean=%s median=%s stdev=%s total=%d", + stats["count"], + format_score(stats["min"]), + format_score(stats["p10"]), + format_score(stats["p25"]), + format_score(stats["p50"]), + format_score(stats["p75"]), + format_score(stats["p90"]), + format_score(stats["p95"]), + format_score(stats["p99"]), + format_score(stats["max"]), + format_score(stats["mean"]), + format_score(stats["median"]), + format_score(stats["stdev"]), + sum(n_tokens), + ) + + +def write_tokenize_summary( + summary_uri: str, + storage_options: dict[str, object], + *, + args: argparse.Namespace, + resolved_count: int, + n_tokens: list[int], + processed: int, + skipped_empty: int, + t_processing: float, + t_load_total: float, + t_tokenize_total: float, + started_at: str, + finished_at: str, +) -> None: + """Write a sidecar two-column CSV summarising a `ccoa tokenize` run. 
+ + Sections (by key prefix): `run.*` (cli + timestamps), `arg.*` (every + CLI flag), `input.*` (resolved files), `count.*` (record counters + + total tokens), `tokens.*` (per-record token-count stats), `time.*` + (wall-clock / load / tokenize). Parses back with `csv.reader` or + `pandas.read_csv`. + """ + rows: list[tuple[str, str]] = [] + + rows.append(("run.cli", shlex.join(sys.argv))) + rows.append(("run.started_at", started_at)) + rows.append(("run.finished_at", finished_at)) + + for key in sorted(vars(args)): + rows.append((f"arg.{key}", _format_arg_value(getattr(args, key)))) + + rows.append(("input.resolved_count", str(resolved_count))) + + rows.append(("count.processed", str(processed))) + rows.append(("count.skipped_empty", str(skipped_empty))) + rows.append(("count.total_tokens", str(sum(n_tokens)))) + + stats = compute_score_stats([float(n) for n in n_tokens]) + for key in ( + "count", + "min", + "p10", + "p25", + "p50", + "p75", + "p90", + "p95", + "p99", + "max", + "mean", + "median", + "stdev", + ): + if key in stats: + value = stats[key] + rows.append( + ( + f"tokens.{key}", + format_score(value) if key != "count" else str(value), + ) + ) + + rows.append(("time.total_seconds", f"{t_processing:.6f}")) + rows.append(("time.load_total_seconds", f"{t_load_total:.6f}")) + rows.append(("time.tokenize_total_seconds", f"{t_tokenize_total:.6f}")) + if processed > 0: + rows.append( + ( + "time.throughput_docs_per_sec", + f"{processed / t_processing:.6f}" if t_processing > 0 else "inf", + ) + ) + rows.append(("time.load_mean_seconds", f"{t_load_total / processed:.6f}")) + rows.append(("time.tokenize_mean_seconds", f"{t_tokenize_total / processed:.6f}")) + + with fsspec.open( + summary_uri, + mode="w", + newline="", + encoding="utf-8", + **storage_options, + ) as sink: + writer = csv.writer(sink) + writer.writerow(["key", "value"]) + for key, value in rows: + writer.writerow([key, value]) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py new file 
mode 100644 index 0000000..77ab155 --- /dev/null +++ b/tests/test_tokenize.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +import argparse +import csv +import gzip +import json +import logging + +import pyarrow.parquet as pq +import pytest + +from ccoa.commands import tokenize as tok_mod +from ccoa.commands.tokenize import TokenizeCommand, process_one_file +from ccoa.utils.io.parquet import TOKENIZE_SCHEMA + + +class _FakeTokenizer: + """Fast-tokenizer stand-in: token IDs are deterministic per word.""" + + is_fast = True + + def __call__(self, texts, **_): # noqa: D401 - mirrors HF tokenizer signature + return {"input_ids": [[hash(w) % 50_000 for w in t.split()] for t in texts]} + + +def _write_cache(path, entries): + """Write a gzipped-JSONL extraction cache at `path` from `{index: text}`.""" + with gzip.open(path, "wb") as gz: + for idx in sorted(entries): + gz.write((json.dumps({"index": idx, "text": entries[idx]}) + "\n").encode("utf-8")) + + +def _default_args(**overrides): + """Build an argparse.Namespace matching TokenizeCommand defaults.""" + base = { + "cache_paths": [], + "tokenizer": "fake-tokenizer", + "records_limit": 0, + "records_per_file_limit": 0, + "files_limit": 0, + "shuffle_files": False, + "seed": 42, + "workers": 1, + "workers_mode": "thread", + "max_pool_restarts": 10, + "batch_size": 64, + "progress_every": 0, + "output": "", + "overwrite": False, + "anonymous_s3": False, + "s3_requester_pays": False, + } + base.update(overrides) + return argparse.Namespace(**base) + + +def _patch_tokenizer(monkeypatch): + monkeypatch.setattr(tok_mod, "load_tokenizer", lambda repo: _FakeTokenizer()) + + +def test_process_one_file_emits_one_row_per_non_empty(tmp_path): + """process_one_file reads cache, batches, skips empties, returns ordered rows.""" + cache = tmp_path / "foo.warc.gz.jsonl.gz" + _write_cache( + cache, + {0: "alpha beta", 1: " ", 2: "gamma delta epsilon", 3: ""}, + ) + args = _default_args(batch_size=2) + result = 
process_one_file(str(cache), _FakeTokenizer(), args) + + assert result.processed == 2 + assert result.skipped_empty == 2 + assert [r[1] for r in result.rows] == [0, 2] + assert [r[2] for r in result.rows] == [2, 3] + assert all( + isinstance(ids, list) and len(ids) == n + for *_, n, ids in [(r[0], r[1], r[2], r[3]) for r in result.rows] + ) + assert all(r[0] == str(cache) for r in result.rows) + + +def test_process_one_file_respects_per_file_limit(tmp_path): + """records_per_file_limit caps the records taken from one cache file.""" + cache = tmp_path / "foo.warc.gz.jsonl.gz" + _write_cache(cache, {i: f"word{i} more" for i in range(10)}) + args = _default_args(records_per_file_limit=3) + result = process_one_file(str(cache), _FakeTokenizer(), args) + assert result.processed == 3 + assert [r[1] for r in result.rows] == [0, 1, 2] + + +def test_progress_every_emits_heartbeat(tmp_path, caplog): + """--progress-every N logs once every Nth record.""" + cache = tmp_path / "foo.warc.gz.jsonl.gz" + _write_cache(cache, {i: "x y z" for i in range(5)}) + args = _default_args(progress_every=2, batch_size=1) + caplog.set_level(logging.INFO, logger="ccoa.commands.tokenize") + process_one_file(str(cache), _FakeTokenizer(), args) + progress_lines = [r.getMessage() for r in caplog.records if "processed=" in r.getMessage()] + assert any("processed=2" in m for m in progress_lines) + assert any("processed=4" in m for m in progress_lines) + + +def test_cli_rejects_records_limit_with_workers(tmp_path, monkeypatch, caplog): + """--records-limit + --workers > 1 errors fast with exit 2.""" + _patch_tokenizer(monkeypatch) + args = _default_args( + cache_paths=[str(tmp_path / "no-match-*.jsonl.gz")], + output=str(tmp_path / "out.parquet"), + records_limit=10, + workers=2, + ) + caplog.set_level(logging.ERROR, logger="ccoa.commands.tokenize") + rc = TokenizeCommand().run(args) + assert rc == 2 + assert any("--records-limit" in r.getMessage() for r in caplog.records) + + +def 
test_cli_writes_parquet_with_expected_schema(tmp_path, monkeypatch): + """The end-to-end CLI run produces a parquet matching TOKENIZE_SCHEMA.""" + _patch_tokenizer(monkeypatch) + cache = tmp_path / "alpha.warc.gz.jsonl.gz" + _write_cache(cache, {0: "one two three", 1: "four", 2: "five six"}) + out = tmp_path / "tokens.parquet" + + args = _default_args(cache_paths=[str(cache)], output=str(out)) + rc = TokenizeCommand().run(args) + assert rc == 0 + assert out.exists() + + table = pq.read_table(str(out)) + assert table.schema.equals(TOKENIZE_SCHEMA) + assert table.num_rows == 3 + rows = table.to_pylist() + assert [r["record_index"] for r in rows] == [0, 1, 2] + assert [r["n_tokens"] for r in rows] == [3, 1, 2] + assert all(r["cache_path"] == str(cache) for r in rows) + assert all(len(r["token_ids"]) == r["n_tokens"] for r in rows) + + +def test_cli_overwrite_protects_then_replaces(tmp_path, monkeypatch, caplog): + """Existing output errors without --overwrite; succeeds with it.""" + _patch_tokenizer(monkeypatch) + cache = tmp_path / "alpha.warc.gz.jsonl.gz" + _write_cache(cache, {0: "hello world"}) + out = tmp_path / "tokens.parquet" + + args = _default_args(cache_paths=[str(cache)], output=str(out)) + assert TokenizeCommand().run(args) == 0 + + caplog.set_level(logging.ERROR, logger="ccoa.commands.tokenize") + rc = TokenizeCommand().run(args) + assert rc == 2 + assert any("already exists" in r.getMessage() for r in caplog.records) + + args.overwrite = True + assert TokenizeCommand().run(args) == 0 + + +def test_cli_writes_summary_sidecar(tmp_path, monkeypatch): + """Sidecar CSV captures args, counters, and tokens.* distribution rows.""" + _patch_tokenizer(monkeypatch) + cache = tmp_path / "alpha.warc.gz.jsonl.gz" + _write_cache(cache, {0: "alpha", 1: "alpha beta", 2: "alpha beta gamma"}) + out = tmp_path / "tokens.parquet" + summary = tmp_path / "tokens.summary.csv" + + args = _default_args(cache_paths=[str(cache)], output=str(out)) + assert 
TokenizeCommand().run(args) == 0 + assert summary.exists() + + rows = list(csv.reader(summary.open())) + keys = {r[0] for r in rows[1:]} + assert "run.cli" in keys + assert "arg.tokenizer" in keys + assert "count.processed" in keys + assert "count.total_tokens" in keys + assert "tokens.count" in keys + assert "tokens.mean" in keys + assert "time.total_seconds" in keys + + by_key = {r[0]: r[1] for r in rows[1:]} + assert by_key["arg.tokenizer"] == "fake-tokenizer" + assert by_key["count.processed"] == "3" + assert by_key["count.total_tokens"] == "6" # 1 + 2 + 3 + assert by_key["tokens.count"] == "3" + + +def test_load_tokenizer_rejects_slow(monkeypatch): + """load_tokenizer raises when AutoTokenizer returns a slow tokenizer.""" + from ccoa.tokenizer import hf as hf_mod + + class _SlowTok: + is_fast = False + + class _FakeAuto: + @staticmethod + def from_pretrained(*_, **__): + return _SlowTok() + + import types + + fake = types.SimpleNamespace(AutoTokenizer=_FakeAuto) + monkeypatch.setitem(__import__("sys").modules, "transformers", fake) + with pytest.raises(RuntimeError, match="slow Python variant"): + hf_mod.load_tokenizer("dummy/repo") + + +@pytest.mark.real_model +def test_tokenize_end_to_end_gpt2(tmp_path): + """End-to-end with the public `gpt2` tokenizer. 
Skipped without --run-real.""" + cache = tmp_path / "real.warc.gz.jsonl.gz" + _write_cache( + cache, + {0: "Hello, world!", 1: "The quick brown fox jumps over the lazy dog."}, + ) + out = tmp_path / "real-tokens.parquet" + args = _default_args( + cache_paths=[str(cache)], + output=str(out), + tokenizer="gpt2", + ) + rc = TokenizeCommand().run(args) + assert rc == 0 + table = pq.read_table(str(out)) + assert table.num_rows == 2 + n_tokens = table.column("n_tokens").to_pylist() + assert all(n > 0 for n in n_tokens) + token_ids = table.column("token_ids").to_pylist() + assert all(len(ids) == n for ids, n in zip(token_ids, n_tokens, strict=True)) diff --git a/uv.lock b/uv.lock index ed40661..7c46ca4 100644 --- a/uv.lock +++ b/uv.lock @@ -320,6 +320,7 @@ dependencies = [ { name = "fsspec" }, { name = "huggingface-hub" }, { name = "pandas" }, + { name = "pyarrow" }, { name = "trafilatura" }, { name = "warcio", extra = ["s3"] }, ] @@ -329,8 +330,11 @@ all = [ { name = "jupyter" }, { name = "matplotlib" }, { name = "nbconvert" }, + { name = "protobuf" }, { name = "s3fs" }, { name = "scipy" }, + { name = "sentencepiece" }, + { name = "transformers" }, ] notebooks = [ { name = "jupyter" }, @@ -341,6 +345,11 @@ notebooks = [ s3 = [ { name = "s3fs" }, ] +tokenize = [ + { name = "protobuf" }, + { name = "sentencepiece" }, + { name = "transformers" }, +] [package.dev-dependencies] dev = [ @@ -352,6 +361,7 @@ dev = [ requires-dist = [ { name = "ccoa", extras = ["notebooks"], marker = "extra == 'all'" }, { name = "ccoa", extras = ["s3"], marker = "extra == 'all'" }, + { name = "ccoa", extras = ["tokenize"], marker = "extra == 'all'" }, { name = "fasttext-numpy2-wheel" }, { name = "fsspec", specifier = ">=2024.0" }, { name = "huggingface-hub", specifier = ">=0.24" }, @@ -359,12 +369,16 @@ requires-dist = [ { name = "matplotlib", marker = "extra == 'notebooks'" }, { name = "nbconvert", marker = "extra == 'notebooks'" }, { name = "pandas" }, + { name = "protobuf", marker = "extra == 
'tokenize'" }, + { name = "pyarrow", specifier = ">=15" }, { name = "s3fs", marker = "extra == 's3'" }, { name = "scipy", marker = "extra == 'notebooks'" }, + { name = "sentencepiece", marker = "extra == 'tokenize'" }, { name = "trafilatura", specifier = ">=1.8.0,<1.12.0" }, + { name = "transformers", marker = "extra == 'tokenize'", specifier = ">=4.40" }, { name = "warcio", extras = ["s3"], specifier = ">=1.7" }, ] -provides-extras = ["notebooks", "s3", "all"] +provides-extras = ["notebooks", "s3", "tokenize", "all"] [package.metadata.requires-dev] dev = [ @@ -2286,6 +2300,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/ed/1cdcab6ba3d6ab7feca11fc14f0eeea80755bb53ef4e892079f31b10a25f/propcache-0.5.2-py3-none-any.whl", hash = "sha256:be1ddfcbb376e3de5d2e2db1d58d6d67463e6b4f9f040c000de8e300295465fe", size = 14036, upload-time = "2026-05-08T21:02:10.673Z" }, ] +[[package]] +name = "protobuf" +version = "7.35.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/60/fd/5b1491d9e4b586d621c54f4c36b888714164b6875f8d6afa3f9072906a51/protobuf-7.35.0.tar.gz", hash = "sha256:a2efd84605f41e559f1881b0912b44099d0a2ac9bf46b3474823f10fb393b0e6", size = 458677, upload-time = "2026-05-19T23:02:29.197Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/ee/93d06e358a4aa32280b00e722d3ea0a1f25fc3cc5778d80581c9cca2c10e/protobuf-7.35.0-cp310-abi3-macosx_10_9_universal2.whl", hash = "sha256:66be6c513931c794fa92c080ffee41671390da3d79da219cf9c0c0907f035dda", size = 433225, upload-time = "2026-05-19T23:02:19.884Z" }, + { url = "https://files.pythonhosted.org/packages/8b/39/1c76c2da93f3c507e958e0aecee2391cc44d4625de6c728bbc555195b5a8/protobuf-7.35.0-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:fcbe42a4ac09d3ec9c987ddfcd956afd0b15f1ff613bd8371bde9405ffd5c8e5", size = 328847, upload-time = "2026-05-19T23:02:22.3Z" }, + { url = 
"https://files.pythonhosted.org/packages/91/1a/39f7ce90a238c1a987a4d81ec26379e02ca0aff367de68e4a1fa474215b9/protobuf-7.35.0-cp310-abi3-manylinux2014_s390x.whl", hash = "sha256:4cbf5cc286130e06a6c9bbefac442431173906dfcc979712183d4adcc01b37ee", size = 344030, upload-time = "2026-05-19T23:02:23.591Z" }, + { url = "https://files.pythonhosted.org/packages/70/5b/6baf9008817964454055ff3fe65f1de0b5f1e26c80c82f7fb108b7cd4ea3/protobuf-7.35.0-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:6c0f98f10c8a05ea30f8993dfef2de093d27b490fdae78bb60c8343795d55011", size = 327130, upload-time = "2026-05-19T23:02:24.637Z" }, + { url = "https://files.pythonhosted.org/packages/8e/e5/e46adb0badc388bfb84877a5f9f026aff63f60e611016cf64dbe77e05446/protobuf-7.35.0-cp310-abi3-win32.whl", hash = "sha256:4c4617b83ade0e279d1d2bfe04025a1adb87f9ed657de038620dc0ff959357f6", size = 428946, upload-time = "2026-05-19T23:02:25.741Z" }, + { url = "https://files.pythonhosted.org/packages/a7/ab/547fbd9e16d879dd13c167478f8ae0a83a428008ca07a5e06acdc23ad473/protobuf-7.35.0-cp310-abi3-win_amd64.whl", hash = "sha256:f05bcadf9a2a6b8dda047007075135fb7d08c73d9177aabc067e1be46881a201", size = 439996, upload-time = "2026-05-19T23:02:26.808Z" }, + { url = "https://files.pythonhosted.org/packages/b8/ef/50433d346c56657a70d27f156c7b349ac59a068b01de4eb796e747eecc43/protobuf-7.35.0-py3-none-any.whl", hash = "sha256:c13f325cf242bad135c350629eeb5d54b24228eb472fb3e2e9ebbd4c5dc20ca0", size = 171659, upload-time = "2026-05-19T23:02:27.842Z" }, +] + [[package]] name = "psutil" version = "7.2.2" @@ -2332,6 +2361,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, ] +[[package]] +name = "pyarrow" +version = "24.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { 
url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" }, + { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" }, + { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" }, + { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" }, + { url = "https://files.pythonhosted.org/packages/6f/d3/a1abf004482026ddc17f4503db227787fa3cfe41ec5091ff20e4fea55e57/pyarrow-24.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:02b001b3ed4723caa44f6cd1af2d5c86aa2cf9971dacc2ffa55b21237713dfba", size = 34976759, upload-time = "2026-04-21T10:48:07.258Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4a/34f0a36d28a2dd32225301b79daad44e243dc1a2bb77d43b60749be255c4/pyarrow-24.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:04920d6a71aabd08a0417709efce97d45ea8e6fb733d9ca9ecffb13c67839f68", size = 36658471, upload-time = "2026-04-21T10:48:13.347Z" }, + { url = "https://files.pythonhosted.org/packages/1f/78/543b94712ae8bb1a6023bcc1acf1a740fbff8286747c289cd9468fced2a5/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a964266397740257f16f7bb2e4f08a0c81454004beab8ff59dd531b73610e9f2", size = 45675981, upload-time = "2026-04-21T10:48:20.201Z" }, + { url = "https://files.pythonhosted.org/packages/84/9f/8fb7c222b100d314137fa40ec050de56cd8c6d957d1cfff685ce72f15b17/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6f066b179d68c413374294bc1735f68475457c933258df594443bb9d88ddc2a0", size = 48859172, upload-time = "2026-04-21T10:48:27.541Z" }, + { url = 
"https://files.pythonhosted.org/packages/a7/d3/1ea72538e6c8b3b475ed78d1049a2c518e655761ea50fe1171fc855fcab7/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1183baeb14c5f587b1ec52831e665718ce632caab84b7cd6b85fd44f96114495", size = 49385733, upload-time = "2026-04-21T10:48:34.7Z" }, + { url = "https://files.pythonhosted.org/packages/c3/be/c3d8b06a1ba35f2260f8e1f771abbee7d5e345c0937aab90675706b1690a/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:806f24b4085453c197a5078218d1ee08783ebbba271badd153d1ae22a3ee804f", size = 51934335, upload-time = "2026-04-21T10:48:42.099Z" }, + { url = "https://files.pythonhosted.org/packages/9c/62/89e07a1e7329d2cde3e3c6994ba0839a24977a2beda8be6005ea3d860b99/pyarrow-24.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e4505fc6583f7b05ab854934896bcac8253b04ac1171a77dfb73efef92076d91", size = 27271748, upload-time = "2026-04-21T10:49:42.532Z" }, + { url = "https://files.pythonhosted.org/packages/17/1a/cff3a59f80b5b1658549d46611b67163f65e0664431c076ad728bf9d5af4/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:1a4e45017efbf115032e4475ee876d525e0e36c742214fbe405332480ecd6275", size = 35238554, upload-time = "2026-04-21T10:48:48.526Z" }, + { url = "https://files.pythonhosted.org/packages/a8/99/cce0f42a327bfef2c420fb6078a3eb834826e5d6697bf3009fe11d2ad051/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:7986f1fa71cee060ad00758bcc79d3a93bab8559bf978fab9e53472a2e25a17b", size = 36782301, upload-time = "2026-04-21T10:48:55.181Z" }, + { url = "https://files.pythonhosted.org/packages/2a/66/8e560d5ff6793ca29aca213c53eec0dd482dd46cb93b2819e5aab52e4252/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:d3e0b61e8efb24ed38898e5cdc5fffa9124be480008d401a1f8071500494ae42", size = 45721929, upload-time = "2026-04-21T10:49:03.676Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/0c/a26e25505d030716e078d9f16eb74973cbf0b33b672884e9f9da1c83b871/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:55a3bc1e3df3b5567b7d27ef551b2283f0c68a5e86f1cd56abc569da4f31335b", size = 48825365, upload-time = "2026-04-21T10:49:11.714Z" }, + { url = "https://files.pythonhosted.org/packages/5f/eb/771f9ecb0c65e73fe9dccdd1717901b9594f08c4515d000c7c62df573811/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:641f795b361874ac9da5294f8f443dfdbee355cf2bd9e3b8d97aaac2306b9b37", size = 49451819, upload-time = "2026-04-21T10:49:21.474Z" }, + { url = "https://files.pythonhosted.org/packages/48/da/61ae89a88732f5a785646f3ec6125dbb640fa98a540eb2b9889caa561403/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8adc8e6ce5fccf5dc707046ae4914fd537def529709cc0d285d37a7f9cd442ca", size = 51909252, upload-time = "2026-04-21T10:49:31.164Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1a/8dd5cafab7b66573fa91c03d06d213356ad4edd71813aa75e08ce2b3a844/pyarrow-24.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:9b18371ad2f44044b81a8d23bc2d8a9b6a6226dca775e8e16cfee640473d6c5d", size = 27388127, upload-time = "2026-04-21T10:49:37.334Z" }, + { url = "https://files.pythonhosted.org/packages/ad/80/d022a34ff05d2cbedd8ccf841fc1f532ecfa9eb5ed1711b56d0e0ea71fc9/pyarrow-24.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:1cc9057f0319e26333b357e17f3c2c022f1a83739b48a88b25bfd5fa2dc18838", size = 35007997, upload-time = "2026-04-21T10:49:48.796Z" }, + { url = "https://files.pythonhosted.org/packages/1a/ff/f01485fda6f4e5d441afb8dd5e7681e4db18826c1e271852f5d3957d6a80/pyarrow-24.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e6f1278ee4785b6db21229374a1c9e54ec7c549de5d1efc9630b6207de7e170b", size = 36678720, upload-time = "2026-04-21T10:49:55.858Z" }, + { url = 
"https://files.pythonhosted.org/packages/9e/c2/2d2d5fea814237923f71b36495211f20b43a1576f9a4d6da7e751a64ec6f/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:adbbedc55506cbdabb830890444fb856bfb0060c46c6f8026c6c2f2cf86ae795", size = 45741852, upload-time = "2026-04-21T10:50:04.624Z" }, + { url = "https://files.pythonhosted.org/packages/8e/3a/28ba9c1c1ebdbb5f1b94dfebb46f207e52e6a554b7fe4132540fde29a3a0/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ae8a1145af31d903fa9bb166824d7abe9b4681a000b0159c9fb99c11bc11ad26", size = 48889852, upload-time = "2026-04-21T10:50:12.293Z" }, + { url = "https://files.pythonhosted.org/packages/df/51/4a389acfd31dca009f8fb82d7f510bb4130f2b3a8e18cf00194d0687d8ac/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d7027eba1df3b2069e2e8d80f644fa0918b68c46432af3d088ddd390d063ecde", size = 49445207, upload-time = "2026-04-21T10:50:20.677Z" }, + { url = "https://files.pythonhosted.org/packages/19/4b/0bab2b23d2ae901b1b9a03c0efd4b2d070256f8ce3fc43f6e58c167b2081/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e56a1ffe9bf7b727432b89104cc0849c21582949dd7bdcb34f17b2001a351a76", size = 51954117, upload-time = "2026-04-21T10:50:29.14Z" }, + { url = "https://files.pythonhosted.org/packages/29/88/f4e9145da0417b3d2c12035a8492b35ff4a3dbc653e614fcfb51d9dedb38/pyarrow-24.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:38be1808cdd068605b787e6ca9119b27eb275a0234e50212c3492331680c3b1e", size = 28001155, upload-time = "2026-04-21T10:51:22.337Z" }, + { url = "https://files.pythonhosted.org/packages/79/4f/46a49a63f43526da895b1a45bbb51d5baf8e4d77159f8528fc3e5490007f/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:418e48ce50a45a6a6c73c454677203a9c75c966cb1e92ca3370959185f197a05", size = 35250387, upload-time = "2026-04-21T10:50:35.552Z" }, + { url = 
"https://files.pythonhosted.org/packages/a0/da/d5e0cd5ef00796922404806d5f00325cdadc3441ce2c13fe7115f2df9a64/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:2f16197705a230a78270cdd4ea8a1d57e86b2fdcbc34a1f6aebc72e65c986f9a", size = 36797102, upload-time = "2026-04-21T10:50:42.417Z" }, + { url = "https://files.pythonhosted.org/packages/34/c7/5904145b0a593a05236c882933d439b5720f0a145381179063722fbfc123/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:fb24ac194bfc5e86839d7dcd52092ee31e5fe6733fe11f5e3b06ef0812b20072", size = 45745118, upload-time = "2026-04-21T10:50:49.324Z" }, + { url = "https://files.pythonhosted.org/packages/13/d3/cca42fe166d1c6e4d5b80e530b7949104d10e17508a90ae202dac205ce2a/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9700ebd9a51f5895ce75ff4ac4b3c47a7d4b42bc618be8e713e5d56bacf5f931", size = 48844765, upload-time = "2026-04-21T10:50:55.579Z" }, + { url = "https://files.pythonhosted.org/packages/b0/49/942c3b79878ba928324d1e17c274ed84581db8c0a749b24bcf4cbdf15bd3/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d8ddd2768da81d3ee08cfea9b597f4abb4e8e1dc8ae7e204b608d23a0d3ab699", size = 49471890, upload-time = "2026-04-21T10:51:02.439Z" }, + { url = "https://files.pythonhosted.org/packages/76/97/ff71431000a75d84135a1ace5ca4ba11726a231a8007bbb320a4c54075d5/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:61a3d7eaa97a14768b542f3d284dc6400dd2470d9f080708b13cd46b6ae18136", size = 51932250, upload-time = "2026-04-21T10:51:10.576Z" }, + { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282, upload-time = "2026-04-21T10:51:16.815Z" }, +] + [[package]] name = "pybind11" version = "3.0.4" @@ -2804,6 +2876,28 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/5d/a4/9d1ea10ebc9e028a289a72fec84da170689549a8102c8aacfcad26bc5035/s3fs-2026.4.0-py3-none-any.whl", hash = "sha256:de0d2a1f33cdf03831fd2382d278c6e4e31fe57c3bf2f703c61f8aec6b703e2a", size = 32392, upload-time = "2026-04-29T20:52:50.295Z" }, ] +[[package]] +name = "safetensors" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/29/9c/6e74567782559a63bd040a236edca26fd71bc7ba88de2ef35d75df3bca5e/safetensors-0.7.0.tar.gz", hash = "sha256:07663963b67e8bd9f0b8ad15bb9163606cd27cc5a1b96235a50d8369803b96b0", size = 200878, upload-time = "2025-11-19T15:18:43.199Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/47/aef6c06649039accf914afef490268e1067ed82be62bcfa5b7e886ad15e8/safetensors-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c82f4d474cf725255d9e6acf17252991c3c8aac038d6ef363a4bf8be2f6db517", size = 467781, upload-time = "2025-11-19T15:18:35.84Z" }, + { url = "https://files.pythonhosted.org/packages/e8/00/374c0c068e30cd31f1e1b46b4b5738168ec79e7689ca82ee93ddfea05109/safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94fd4858284736bb67a897a41608b5b0c2496c9bdb3bf2af1fa3409127f20d57", size = 447058, upload-time = "2025-11-19T15:18:34.416Z" }, + { url = "https://files.pythonhosted.org/packages/f1/06/578ffed52c2296f93d7fd2d844cabfa92be51a587c38c8afbb8ae449ca89/safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e07d91d0c92a31200f25351f4acb2bc6aff7f48094e13ebb1d0fb995b54b6542", size = 491748, upload-time = "2025-11-19T15:18:09.79Z" }, + { url = "https://files.pythonhosted.org/packages/ae/33/1debbbb70e4791dde185edb9413d1fe01619255abb64b300157d7f15dddd/safetensors-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8469155f4cb518bafb4acf4865e8bb9d6804110d2d9bdcaa78564b9fd841e104", size = 503881, upload-time = "2025-11-19T15:18:16.145Z" }, + { url = 
"https://files.pythonhosted.org/packages/8e/1c/40c2ca924d60792c3be509833df711b553c60effbd91da6f5284a83f7122/safetensors-0.7.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54bef08bf00a2bff599982f6b08e8770e09cc012d7bba00783fc7ea38f1fb37d", size = 623463, upload-time = "2025-11-19T15:18:21.11Z" }, + { url = "https://files.pythonhosted.org/packages/9b/3a/13784a9364bd43b0d61eef4bea2845039bc2030458b16594a1bd787ae26e/safetensors-0.7.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42cb091236206bb2016d245c377ed383aa7f78691748f3bb6ee1bfa51ae2ce6a", size = 532855, upload-time = "2025-11-19T15:18:25.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/60/429e9b1cb3fc651937727befe258ea24122d9663e4d5709a48c9cbfceecb/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac7252938f0696ddea46f5e855dd3138444e82236e3be475f54929f0c510d48", size = 507152, upload-time = "2025-11-19T15:18:33.023Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a8/4b45e4e059270d17af60359713ffd83f97900d45a6afa73aaa0d737d48b6/safetensors-0.7.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d060c70284127fa805085d8f10fbd0962792aed71879d00864acda69dbab981", size = 541856, upload-time = "2025-11-19T15:18:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/06/87/d26d8407c44175d8ae164a95b5a62707fcc445f3c0c56108e37d98070a3d/safetensors-0.7.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:cdab83a366799fa730f90a4ebb563e494f28e9e92c4819e556152ad55e43591b", size = 674060, upload-time = "2025-11-19T15:18:37.211Z" }, + { url = "https://files.pythonhosted.org/packages/11/f5/57644a2ff08dc6325816ba7217e5095f17269dada2554b658442c66aed51/safetensors-0.7.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:672132907fcad9f2aedcb705b2d7b3b93354a2aec1b2f706c4db852abe338f85", size = 771715, upload-time = "2025-11-19T15:18:38.689Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/31/17883e13a814bd278ae6e266b13282a01049b0c81341da7fd0e3e71a80a3/safetensors-0.7.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5d72abdb8a4d56d4020713724ba81dac065fedb7f3667151c4a637f1d3fb26c0", size = 714377, upload-time = "2025-11-19T15:18:40.162Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, +] + [[package]] name = "scipy" version = "1.17.1" @@ -2874,6 +2968,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1c/78/504fdd027da3b84ff1aecd9f6957e65f35134534ccc6da8628eb71e76d3f/send2trash-2.1.0-py3-none-any.whl", hash = "sha256:0da2f112e6d6bb22de6aa6daa7e144831a4febf2a87261451c4ad849fe9a873c", size = 17610, upload-time = "2026-01-14T06:27:35.218Z" }, ] +[[package]] +name = "sentencepiece" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/15/15/2e7a025fc62d764b151ae6d0f2a92f8081755ebe8d4a64099accc6f77ba6/sentencepiece-0.2.1.tar.gz", hash = "sha256:8138cec27c2f2282f4a34d9a016e3374cd40e5c6e9cb335063db66a0a3b71fad", size = 3228515, upload-time = "2025-08-12T07:00:51.718Z" } 
+wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/be/32ce495aa1d0e0c323dcb1ba87096037358edee539cac5baf8755a6bd396/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:57cae326c8727de58c85977b175af132a7138d84c764635d7e71bbee7e774133", size = 1943152, upload-time = "2025-08-12T06:59:40.048Z" }, + { url = "https://files.pythonhosted.org/packages/88/7e/ff23008899a58678e98c6ff592bf4d368eee5a71af96d0df6b38a039dd4f/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:56dd39a3c4d6493db3cdca7e8cc68c6b633f0d4195495cbadfcf5af8a22d05a6", size = 1325651, upload-time = "2025-08-12T06:59:41.536Z" }, + { url = "https://files.pythonhosted.org/packages/19/84/42eb3ce4796777a1b5d3699dfd4dca85113e68b637f194a6c8d786f16a04/sentencepiece-0.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d9381351182ff9888cc80e41c632e7e274b106f450de33d67a9e8f6043da6f76", size = 1253645, upload-time = "2025-08-12T06:59:42.903Z" }, + { url = "https://files.pythonhosted.org/packages/89/fa/d3d5ebcba3cb9e6d3775a096251860c41a6bc53a1b9461151df83fe93255/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99f955df238021bf11f0fc37cdb54fd5e5b5f7fd30ecc3d93fb48b6815437167", size = 1316273, upload-time = "2025-08-12T06:59:44.476Z" }, + { url = "https://files.pythonhosted.org/packages/04/88/14f2f4a2b922d8b39be45bf63d79e6cd3a9b2f248b2fcb98a69b12af12f5/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0cdfecef430d985f1c2bcbfff3defd1d95dae876fbd0173376012d2d7d24044b", size = 1387881, upload-time = "2025-08-12T06:59:46.09Z" }, + { url = "https://files.pythonhosted.org/packages/fd/b8/903e5ccb77b4ef140605d5d71b4f9e0ad95d456d6184688073ed11712809/sentencepiece-0.2.1-cp312-cp312-win32.whl", hash = "sha256:a483fd29a34c3e34c39ac5556b0a90942bec253d260235729e50976f5dba1068", size = 999540, upload-time = "2025-08-12T06:59:48.023Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/81/92df5673c067148c2545b1bfe49adfd775bcc3a169a047f5a0e6575ddaca/sentencepiece-0.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4cdc7c36234fda305e85c32949c5211faaf8dd886096c7cea289ddc12a2d02de", size = 1054671, upload-time = "2025-08-12T06:59:49.895Z" }, + { url = "https://files.pythonhosted.org/packages/fe/02/c5e3bc518655d714622bec87d83db9cdba1cd0619a4a04e2109751c4f47f/sentencepiece-0.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:daeb5e9e9fcad012324807856113708614d534f596d5008638eb9b40112cd9e4", size = 1033923, upload-time = "2025-08-12T06:59:51.952Z" }, + { url = "https://files.pythonhosted.org/packages/ba/4a/85fbe1706d4d04a7e826b53f327c4b80f849cf1c7b7c5e31a20a97d8f28b/sentencepiece-0.2.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dcd8161eee7b41aae57ded06272905dbd680a0a04b91edd0f64790c796b2f706", size = 1943150, upload-time = "2025-08-12T06:59:53.588Z" }, + { url = "https://files.pythonhosted.org/packages/c2/83/4cfb393e287509fc2155480b9d184706ef8d9fa8cbf5505d02a5792bf220/sentencepiece-0.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c6c8f42949f419ff8c7e9960dbadcfbc982d7b5efc2f6748210d3dd53a7de062", size = 1325651, upload-time = "2025-08-12T06:59:55.073Z" }, + { url = "https://files.pythonhosted.org/packages/8d/de/5a007fb53b1ab0aafc69d11a5a3dd72a289d5a3e78dcf2c3a3d9b14ffe93/sentencepiece-0.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:097f3394e99456e9e4efba1737c3749d7e23563dd1588ce71a3d007f25475fff", size = 1253641, upload-time = "2025-08-12T06:59:56.562Z" }, + { url = "https://files.pythonhosted.org/packages/2c/d2/f552be5928105588f4f4d66ee37dd4c61460d8097e62d0e2e0eec41bc61d/sentencepiece-0.2.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7b670879c370d350557edabadbad1f6561a9e6968126e6debca4029e5547820", size = 1316271, upload-time = "2025-08-12T06:59:58.109Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/df/0cfe748ace5485be740fed9476dee7877f109da32ed0d280312c94ec259f/sentencepiece-0.2.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c7f0fd2f2693309e6628aeeb2e2faf6edd221134dfccac3308ca0de01f8dab47", size = 1387882, upload-time = "2025-08-12T07:00:00.701Z" }, + { url = "https://files.pythonhosted.org/packages/ac/dd/f7774d42a881ced8e1739f393ab1e82ece39fc9abd4779e28050c2e975b5/sentencepiece-0.2.1-cp313-cp313-win32.whl", hash = "sha256:92b3816aa2339355fda2c8c4e021a5de92180b00aaccaf5e2808972e77a4b22f", size = 999541, upload-time = "2025-08-12T07:00:02.709Z" }, + { url = "https://files.pythonhosted.org/packages/dd/e9/932b9eae6fd7019548321eee1ab8d5e3b3d1294df9d9a0c9ac517c7b636d/sentencepiece-0.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:10ed3dab2044c47f7a2e7b4969b0c430420cdd45735d78c8f853191fa0e3148b", size = 1054669, upload-time = "2025-08-12T07:00:04.915Z" }, + { url = "https://files.pythonhosted.org/packages/c9/3a/76488a00ea7d6931689cda28726a1447d66bf1a4837943489314593d5596/sentencepiece-0.2.1-cp313-cp313-win_arm64.whl", hash = "sha256:ac650534e2251083c5f75dde4ff28896ce7c8904133dc8fef42780f4d5588fcd", size = 1033922, upload-time = "2025-08-12T07:00:06.496Z" }, + { url = "https://files.pythonhosted.org/packages/4a/b6/08fe2ce819e02ccb0296f4843e3f195764ce9829cbda61b7513f29b95718/sentencepiece-0.2.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:8dd4b477a7b069648d19363aad0cab9bad2f4e83b2d179be668efa672500dc94", size = 1946052, upload-time = "2025-08-12T07:00:08.136Z" }, + { url = "https://files.pythonhosted.org/packages/ab/d9/1ea0e740591ff4c6fc2b6eb1d7510d02f3fb885093f19b2f3abd1363b402/sentencepiece-0.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0c0f672da370cc490e4c59d89e12289778310a0e71d176c541e4834759e1ae07", size = 1327408, upload-time = "2025-08-12T07:00:09.572Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/7e/1fb26e8a21613f6200e1ab88824d5d203714162cf2883248b517deb500b7/sentencepiece-0.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ad8493bea8432dae8d6830365352350f3b4144415a1d09c4c8cb8d30cf3b6c3c", size = 1254857, upload-time = "2025-08-12T07:00:11.021Z" }, + { url = "https://files.pythonhosted.org/packages/bc/85/c72fd1f3c7a6010544d6ae07f8ddb38b5e2a7e33bd4318f87266c0bbafbf/sentencepiece-0.2.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b81a24733726e3678d2db63619acc5a8dccd074f7aa7a54ecd5ca33ca6d2d596", size = 1315722, upload-time = "2025-08-12T07:00:12.989Z" }, + { url = "https://files.pythonhosted.org/packages/4a/e8/661e5bd82a8aa641fd6c1020bd0e890ef73230a2b7215ddf9c8cd8e941c2/sentencepiece-0.2.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0a81799d0a68d618e89063fb423c3001a034c893069135ffe51fee439ae474d6", size = 1387452, upload-time = "2025-08-12T07:00:15.088Z" }, + { url = "https://files.pythonhosted.org/packages/99/5e/ae66c361023a470afcbc1fbb8da722c72ea678a2fcd9a18f1a12598c7501/sentencepiece-0.2.1-cp313-cp313t-win32.whl", hash = "sha256:89a3ea015517c42c0341d0d962f3e6aaf2cf10d71b1932d475c44ba48d00aa2b", size = 1002501, upload-time = "2025-08-12T07:00:16.966Z" }, + { url = "https://files.pythonhosted.org/packages/c1/03/d332828c4ff764e16c1b56c2c8f9a33488bbe796b53fb6b9c4205ddbf167/sentencepiece-0.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:33f068c9382dc2e7c228eedfd8163b52baa86bb92f50d0488bf2b7da7032e484", size = 1057555, upload-time = "2025-08-12T07:00:18.573Z" }, + { url = "https://files.pythonhosted.org/packages/88/14/5aee0bf0864df9bd82bd59e7711362908e4935e3f9cdc1f57246b5d5c9b9/sentencepiece-0.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:b3616ad246f360e52c85781e47682d31abfb6554c779e42b65333d4b5f44ecc0", size = 1036042, upload-time = "2025-08-12T07:00:20.209Z" }, + { url = 
"https://files.pythonhosted.org/packages/24/9c/89eb8b2052f720a612478baf11c8227dcf1dc28cd4ea4c0c19506b5af2a2/sentencepiece-0.2.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:5d0350b686c320068702116276cfb26c066dc7e65cfef173980b11bb4d606719", size = 1943147, upload-time = "2025-08-12T07:00:21.809Z" }, + { url = "https://files.pythonhosted.org/packages/82/0b/a1432bc87f97c2ace36386ca23e8bd3b91fb40581b5e6148d24b24186419/sentencepiece-0.2.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c7f54a31cde6fa5cb030370566f68152a742f433f8d2be458463d06c208aef33", size = 1325624, upload-time = "2025-08-12T07:00:23.289Z" }, + { url = "https://files.pythonhosted.org/packages/ea/99/bbe054ebb5a5039457c590e0a4156ed073fb0fe9ce4f7523404dd5b37463/sentencepiece-0.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c83b85ab2d6576607f31df77ff86f28182be4a8de6d175d2c33ca609925f5da1", size = 1253670, upload-time = "2025-08-12T07:00:24.69Z" }, + { url = "https://files.pythonhosted.org/packages/19/ad/d5c7075f701bd97971d7c2ac2904f227566f51ef0838dfbdfdccb58cd212/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1855f57db07b51fb51ed6c9c452f570624d2b169b36f0f79ef71a6e6c618cd8b", size = 1316247, upload-time = "2025-08-12T07:00:26.435Z" }, + { url = "https://files.pythonhosted.org/packages/fb/03/35fbe5f3d9a7435eebd0b473e09584bd3cc354ce118b960445b060d33781/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01e6912125cb45d3792f530a4d38f8e21bf884d6b4d4ade1b2de5cf7a8d2a52b", size = 1387894, upload-time = "2025-08-12T07:00:28.339Z" }, + { url = "https://files.pythonhosted.org/packages/dc/aa/956ef729aafb6c8f9c443104c9636489093bb5c61d6b90fc27aa1a865574/sentencepiece-0.2.1-cp314-cp314-win32.whl", hash = "sha256:c415c9de1447e0a74ae3fdb2e52f967cb544113a3a5ce3a194df185cbc1f962f", size = 1096698, upload-time = "2025-08-12T07:00:29.764Z" }, + { url = 
"https://files.pythonhosted.org/packages/b8/cb/fe400d8836952cc535c81a0ce47dc6875160e5fedb71d2d9ff0e9894c2a6/sentencepiece-0.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:881b2e44b14fc19feade3cbed314be37de639fc415375cefaa5bc81a4be137fd", size = 1155115, upload-time = "2025-08-12T07:00:32.865Z" }, + { url = "https://files.pythonhosted.org/packages/32/89/047921cf70f36c7b6b6390876b2399b3633ab73b8d0cb857e5a964238941/sentencepiece-0.2.1-cp314-cp314-win_arm64.whl", hash = "sha256:2005242a16d2dc3ac5fe18aa7667549134d37854823df4c4db244752453b78a8", size = 1133890, upload-time = "2025-08-12T07:00:34.763Z" }, + { url = "https://files.pythonhosted.org/packages/a1/11/5b414b9fae6255b5fb1e22e2ed3dc3a72d3a694e5703910e640ac78346bb/sentencepiece-0.2.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:a19adcec27c524cb7069a1c741060add95f942d1cbf7ad0d104dffa0a7d28a2b", size = 1946081, upload-time = "2025-08-12T07:00:36.97Z" }, + { url = "https://files.pythonhosted.org/packages/77/eb/7a5682bb25824db8545f8e5662e7f3e32d72a508fdce086029d89695106b/sentencepiece-0.2.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:e37e4b4c4a11662b5db521def4e44d4d30ae69a1743241412a93ae40fdcab4bb", size = 1327406, upload-time = "2025-08-12T07:00:38.669Z" }, + { url = "https://files.pythonhosted.org/packages/03/b0/811dae8fb9f2784e138785d481469788f2e0d0c109c5737372454415f55f/sentencepiece-0.2.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:477c81505db072b3ab627e7eab972ea1025331bd3a92bacbf798df2b75ea86ec", size = 1254846, upload-time = "2025-08-12T07:00:40.611Z" }, + { url = "https://files.pythonhosted.org/packages/ef/23/195b2e7ec85ebb6a547969f60b723c7aca5a75800ece6cc3f41da872d14e/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:010f025a544ef770bb395091d57cb94deb9652d8972e0d09f71d85d5a0816c8c", size = 1315721, upload-time = "2025-08-12T07:00:42.914Z" }, + { url = 
"https://files.pythonhosted.org/packages/7e/aa/553dbe4178b5f23eb28e59393dddd64186178b56b81d9b8d5c3ff1c28395/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:733e59ff1794d26db706cd41fc2d7ca5f6c64a820709cb801dc0ea31780d64ab", size = 1387458, upload-time = "2025-08-12T07:00:44.56Z" }, + { url = "https://files.pythonhosted.org/packages/66/7c/08ff0012507297a4dd74a5420fdc0eb9e3e80f4e88cab1538d7f28db303d/sentencepiece-0.2.1-cp314-cp314t-win32.whl", hash = "sha256:d3233770f78e637dc8b1fda2cd7c3b99ec77e7505041934188a4e7fe751de3b0", size = 1099765, upload-time = "2025-08-12T07:00:46.058Z" }, + { url = "https://files.pythonhosted.org/packages/91/d5/2a69e1ce15881beb9ddfc7e3f998322f5cedcd5e4d244cb74dade9441663/sentencepiece-0.2.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5e4366c97b68218fd30ea72d70c525e6e78a6c0a88650f57ac4c43c63b234a9d", size = 1157807, upload-time = "2025-08-12T07:00:47.673Z" }, + { url = "https://files.pythonhosted.org/packages/f3/16/54f611fcfc2d1c46cbe3ec4169780b2cfa7cf63708ef2b71611136db7513/sentencepiece-0.2.1-cp314-cp314t-win_arm64.whl", hash = "sha256:105e36e75cbac1292642045458e8da677b2342dcd33df503e640f0b457cb6751", size = 1136264, upload-time = "2025-08-12T07:00:49.485Z" }, +] + [[package]] name = "setuptools" version = "82.0.1" @@ -2959,6 +3101,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/90/39a85a4b63c84213e78b3c17d22e1bf45328acf8ebb33ef93be30d0a3911/tld-0.13.2-py2.py3-none-any.whl", hash = "sha256:9b8fdbdb880e7ba65b216a4937f2c94c49a7226723783d5838fc958ac76f4e0c", size = 296743, upload-time = "2026-03-06T23:50:32.465Z" }, ] +[[package]] +name = "tokenizers" +version = "0.22.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/73/6f/f80cfef4a312e1fb34baf7d85c72d4411afde10978d4657f8cdd811d3ccc/tokenizers-0.22.2.tar.gz", hash = 
"sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917", size = 372115, upload-time = "2026-01-05T10:45:15.988Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/92/97/5dbfabf04c7e348e655e907ed27913e03db0923abb5dfdd120d7b25630e1/tokenizers-0.22.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:544dd704ae7238755d790de45ba8da072e9af3eea688f698b137915ae959281c", size = 3100275, upload-time = "2026-01-05T10:41:02.158Z" }, + { url = "https://files.pythonhosted.org/packages/2e/47/174dca0502ef88b28f1c9e06b73ce33500eedfac7a7692108aec220464e7/tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1e418a55456beedca4621dbab65a318981467a2b188e982a23e117f115ce5001", size = 2981472, upload-time = "2026-01-05T10:41:00.276Z" }, + { url = "https://files.pythonhosted.org/packages/d6/84/7990e799f1309a8b87af6b948f31edaa12a3ed22d11b352eaf4f4b2e5753/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2249487018adec45d6e3554c71d46eb39fa8ea67156c640f7513eb26f318cec7", size = 3290736, upload-time = "2026-01-05T10:40:32.165Z" }, + { url = "https://files.pythonhosted.org/packages/78/59/09d0d9ba94dcd5f4f1368d4858d24546b4bdc0231c2354aa31d6199f0399/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b85325d0815e86e0bac263506dd114578953b7b53d7de09a6485e4a160a7dd", size = 3168835, upload-time = "2026-01-05T10:40:38.847Z" }, + { url = "https://files.pythonhosted.org/packages/47/50/b3ebb4243e7160bda8d34b731e54dd8ab8b133e50775872e7a434e524c28/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfb88f22a209ff7b40a576d5324bf8286b519d7358663db21d6246fb17eea2d5", size = 3521673, upload-time = "2026-01-05T10:40:56.614Z" }, + { url = "https://files.pythonhosted.org/packages/e0/fa/89f4cb9e08df770b57adb96f8cbb7e22695a4cb6c2bd5f0c4f0ebcf33b66/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:1c774b1276f71e1ef716e5486f21e76333464f47bece56bbd554485982a9e03e", size = 3724818, upload-time = "2026-01-05T10:40:44.507Z" }, + { url = "https://files.pythonhosted.org/packages/64/04/ca2363f0bfbe3b3d36e95bf67e56a4c88c8e3362b658e616d1ac185d47f2/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df6c4265b289083bf710dff49bc51ef252f9d5be33a45ee2bed151114a56207b", size = 3379195, upload-time = "2026-01-05T10:40:51.139Z" }, + { url = "https://files.pythonhosted.org/packages/2e/76/932be4b50ef6ccedf9d3c6639b056a967a86258c6d9200643f01269211ca/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:369cc9fc8cc10cb24143873a0d95438bb8ee257bb80c71989e3ee290e8d72c67", size = 3274982, upload-time = "2026-01-05T10:40:58.331Z" }, + { url = "https://files.pythonhosted.org/packages/1d/28/5f9f5a4cc211b69e89420980e483831bcc29dade307955cc9dc858a40f01/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:29c30b83d8dcd061078b05ae0cb94d3c710555fbb44861139f9f83dcca3dc3e4", size = 9478245, upload-time = "2026-01-05T10:41:04.053Z" }, + { url = "https://files.pythonhosted.org/packages/6c/fb/66e2da4704d6aadebf8cb39f1d6d1957df667ab24cff2326b77cda0dcb85/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:37ae80a28c1d3265bb1f22464c856bd23c02a05bb211e56d0c5301a435be6c1a", size = 9560069, upload-time = "2026-01-05T10:45:10.673Z" }, + { url = "https://files.pythonhosted.org/packages/16/04/fed398b05caa87ce9b1a1bb5166645e38196081b225059a6edaff6440fac/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:791135ee325f2336f498590eb2f11dc5c295232f288e75c99a36c5dbce63088a", size = 9899263, upload-time = "2026-01-05T10:45:12.559Z" }, + { url = "https://files.pythonhosted.org/packages/05/a1/d62dfe7376beaaf1394917e0f8e93ee5f67fea8fcf4107501db35996586b/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = 
"sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5", size = 10033429, upload-time = "2026-01-05T10:45:14.333Z" }, + { url = "https://files.pythonhosted.org/packages/fd/18/a545c4ea42af3df6effd7d13d250ba77a0a86fb20393143bbb9a92e434d4/tokenizers-0.22.2-cp39-abi3-win32.whl", hash = "sha256:a6bf3f88c554a2b653af81f3204491c818ae2ac6fbc09e76ef4773351292bc92", size = 2502363, upload-time = "2026-01-05T10:45:20.593Z" }, + { url = "https://files.pythonhosted.org/packages/65/71/0670843133a43d43070abeb1949abfdef12a86d490bea9cd9e18e37c5ff7/tokenizers-0.22.2-cp39-abi3-win_amd64.whl", hash = "sha256:c9ea31edff2968b44a88f97d784c2f16dc0729b8b143ed004699ebca91f05c48", size = 2747786, upload-time = "2026-01-05T10:45:18.411Z" }, + { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" }, +] + [[package]] name = "tornado" version = "6.5.5" @@ -3015,6 +3183,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/98/a9937a969d018a23badfea0b381f66783649d48e0ea6c41923265c3cbeb3/traitlets-5.15.0-py3-none-any.whl", hash = "sha256:fb36a18867a6803deab09f3c5e0fa81bb7b26a5c9e82501c9933f759166eff40", size = 85877, upload-time = "2026-05-06T08:05:55.853Z" }, ] +[[package]] +name = "transformers" +version = "5.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, + { name = "typer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/58/7f843608f2e8421f86bb97060b54649be6239ec612b82bf9d41e65c26c00/transformers-5.9.0.tar.gz", hash = 
"sha256:25997cb8fa6053533171634b6162d7df54346530ec2aa9b42bb834e63668c842", size = 8642240, upload-time = "2026-05-20T14:50:49.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/ca/2eaa5359f2ccb8c2e1656bc26305ad0cf438aa392ce4b29ae67a315c186e/transformers-5.9.0-py3-none-any.whl", hash = "sha256:1d19509bcff7028ebc6b277d71caa712e8353778463d38764237d14b42b52788", size = 10787648, upload-time = "2026-05-20T14:50:45.337Z" }, +] + [[package]] name = "typer" version = "0.25.1"