2 changes: 1 addition & 1 deletion .github/workflows/lint.yaml
@@ -20,7 +20,7 @@ jobs:
           python-version: '3.12'
 
       - name: Install Requirements
-        run: pip install .[dev]
+        run: make install
 
       - name: Lint Code
         run: ruff check .
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
@@ -20,7 +20,7 @@ jobs:
           python-version: '3.12'
 
      - name: Install Requirements
-        run: pip install .[dev]
+        run: make install
 
      - name: Test Code
        run: pytest .
4 changes: 3 additions & 1 deletion .gitignore
@@ -2,4 +2,6 @@
 build/
 text_to_tokenized_video.egg-info/
 **/__pycache__/
-.env
+.env
+checkpoints
+.claude
8 changes: 8 additions & 0 deletions Makefile
@@ -0,0 +1,8 @@
install:
	pip install git+https://github.com/nvidia-cosmos/cosmos-predict1.git --no-deps
	pip install ".[dev,cosmos]"

download_checkpoints:
	hf auth login
	hf download nvidia/Cosmos-Tokenize1-DV8x16x16-720p --local-dir checkpoints/Cosmos-Tokenize1-DV8x16x16-720p
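The `install` target pulls `cosmos-predict1` with `--no-deps` and then supplies the runtime dependencies this project actually uses via the `cosmos` extra in `pyproject.toml`. A quick sanity check after `make install` is importing the class that `encode_video.py` relies on; this is an illustrative sketch, not part of the Makefile:

```python
# Sanity check for `make install`: import the tokenizer class used by
# text_to_tokenized_video/tokenizer/encode_video.py.
from cosmos_predict1.tokenizer.inference.video_lib import CausalVideoTokenizer  # noqa: F401

print("cosmos-predict1 tokenizer import OK")
```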

43 changes: 39 additions & 4 deletions README.md
@@ -3,16 +3,51 @@
 ## Usage
 
 Installation:
-```bash
-pip install ".[dev]"
+
+```shell
+# Install dependencies
+make install
+# Download necessary checkpoints
+make download_checkpoints
 ```
 
 Lint:
-```bash
+
+```shell
 ruff check . --fix
 ```
 
 Test:
-```bash
+
+```shell
 pytest .
 ```
+
+### Download the Phoenix Dataset
+
+```shell
+# Download the PHOENIX-2014T v3 release
+wget https://www-i6.informatik.rwth-aachen.de/ftp/pub/rwth-phoenix/2016/phoenix-2014-T.v3.tar.gz
+
+# Extract the dataset archive
+tar -xvzf phoenix-2014-T.v3.tar.gz
+```
+
+### Tokenizer Example
+
+Encode a video:
+
+```shell
+python -m text_to_tokenized_video.tokenizer.encode_video \
+    --video=assets/example.mp4 \
+    --checkpoint-enc=checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/encoder.jit \
+    --output-path=test.pt
+```
+
+Encode a dataset:
+
+```shell
+export PHOENIX="PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px"
+export TOKENS_DIR="tokens"
+./text_to_tokenized_video/tokenizer/tokenize_dataset.sh
+```
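The saved output can be inspected with plain `torch.load`; a minimal sketch, assuming the `test.pt` path from the encode-a-video example. The structure of the saved object is whatever `CausalVideoTokenizer.encode` returns, so the tuple handling below is defensive:

```python
import torch

tokens = torch.load("test.pt")  # output path from the example above
print(type(tokens))
# encode() may return a tuple of tensors, so handle both cases
if isinstance(tokens, (tuple, list)):
    for t in tokens:
        print(getattr(t, "shape", t), getattr(t, "dtype", ""))
else:
    print(tokens.shape, tokens.dtype)
```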
Binary file added assets/example.mp4
8 changes: 8 additions & 0 deletions pyproject.toml
@@ -8,13 +8,20 @@ authors = [
 ]
 readme = "README.md"
 dependencies = [
+    "numpy",
+    "mediapy",
+    "torch",
 ]
 
 [project.optional-dependencies]
 dev = [
     "pytest",
     "ruff",
 ]
+cosmos = [
+    "loguru",
+    "einops",
+]
 
 [tool.ruff]
 line-length = 120
@@ -38,6 +45,7 @@ select = [
 [tool.setuptools]
 packages = [
     "text_to_tokenized_video",
+    "text_to_tokenized_video.tokenizer",
 ]
 
 [tool.pytest.ini_options]
Empty file.
84 changes: 84 additions & 0 deletions text_to_tokenized_video/tokenizer/encode_video.py
@@ -0,0 +1,84 @@
from functools import cache
from pathlib import Path

import mediapy
import numpy as np
import torch
import torch.nn.functional as F
from cosmos_predict1.tokenizer.inference.video_lib import CausalVideoTokenizer


def read_video(directory_or_file: Path):
    """Read a video file, or a directory of PNG frames, as an RGB array of shape (T, H, W, 3)."""
    if directory_or_file.is_dir():
        # Sort the frames so they are stacked in temporal order
        images = sorted(directory_or_file.glob("*.png"))
        frames = [mediapy.read_image(f) for f in images]
        return np.stack(frames, axis=0)

    return mediapy.read_video(directory_or_file)


@cache
def load_tokenizer(checkpoint_enc: Path | None = None, checkpoint_dec: Path | None = None, device=None):
    """Load (and memoize) the Cosmos tokenizer, defaulting to CUDA when available."""
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    return CausalVideoTokenizer(
        checkpoint_enc=checkpoint_enc,
        checkpoint_dec=checkpoint_dec,
        device=device,
    )


def resize_videos(videos, target_height=128, target_width=128):
    """Resize videos to the target resolution, rounding dimensions down to multiples of 16."""
    B, T, H, W, C = videos.shape

    # Round down to the nearest multiple of 16 (matching the 16x16 spatial compression)
    target_height = (target_height // 16) * 16
    target_width = (target_width // 16) * 16

    # Bilinear interpolation is not defined for uint8 tensors, so convert to float first
    videos = videos.float()
    videos = videos.view(B * T, H, W, C).permute(0, 3, 1, 2)  # [B*T, C, H, W]
    videos = F.interpolate(videos, size=(target_height, target_width), mode="bilinear", align_corners=False)
    # permute() yields a non-contiguous tensor, so reshape() rather than view()
    return videos.permute(0, 2, 3, 1).reshape(B, T, target_height, target_width, C)


@torch.no_grad()
def encode_video(video, checkpoint_enc, device=None):
    video_np = read_video(video)[..., :3]  # (T, H, W, 3); drop any alpha channel
    video_tensor = torch.from_numpy(video_np)

    # Add batch dimension: (1, T, H, W, 3)
    video_tensor = video_tensor.unsqueeze(0)
    video_tensor = resize_videos(video_tensor)  # also converts to float

    # Scale the uint8 range [0, 255] to [-1, 1]
    video_tensor = video_tensor / 127.5 - 1

    # Rearrange to the Bx3xTxHxW layout expected by the tokenizer
    video_tensor = video_tensor.permute(0, 4, 1, 2, 3)  # (B, C, T, H, W)

    tokenizer = load_tokenizer(checkpoint_enc=checkpoint_enc, device=device)
    return tokenizer.encode(video_tensor)  # returns tokens


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--video", type=Path, required=True)
    parser.add_argument("--checkpoint-enc", type=Path, required=True)
    parser.add_argument("--output-path", type=Path, required=True)
    args = parser.parse_args()

    tokens = encode_video(
        video=args.video,
        checkpoint_enc=args.checkpoint_enc,
    )

    torch.save(tokens, args.output_path)
    print(f"Saved tokens to {args.output_path}")
    print(tokens)
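For intuition about the token count: the checkpoint name `DV8x16x16` indicates 8x temporal and 16x16 spatial compression, so a back-of-the-envelope estimate for the 128x128 frames produced by `resize_videos` looks like the sketch below. The causal `(T - 1) // 8 + 1` temporal formula is an assumption based on the tokenizer being causal, not something verified against its output:

```python
# Rough token-grid arithmetic for Cosmos-Tokenize1-DV8x16x16 on resized input.
T, H, W = 49, 128, 128   # example frame count plus the resize target from resize_videos()
t = (T - 1) // 8 + 1     # assumed causal temporal compression: first frame, then 8x
h, w = H // 16, W // 16  # 16x16 spatial compression
print(t, h, w)           # 7 8 8 -> 7 * 8 * 8 = 448 token positions
```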
35 changes: 35 additions & 0 deletions text_to_tokenized_video/tokenizer/tokenize_dataset.sh
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
set -euo pipefail

CHECKPOINT="checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/encoder.jit"
SPLITS=("train" "dev" "test")

mkdir -p "$TOKENS_DIR"

for split in "${SPLITS[@]}"; do
    # Process substitution (rather than piping find into the loop) keeps the loop
    # in the current shell, so the background jobs stay visible to the final `wait`
    while read -r seq_dir; do
        seq_id=$(basename "$seq_dir")
        out_path="$TOKENS_DIR/$split/$seq_id.pt"
        mkdir -p "$(dirname "$out_path")"

        # Skip if already encoded, which makes interrupted runs resumable
        if [[ -f "$out_path" ]]; then
            echo "Skipping $split/$seq_id (already exists)"
            continue
        fi

        echo "Encoding $split/$seq_id..."
        python -m text_to_tokenized_video.tokenizer.encode_video \
            --video="$seq_dir" \
            --checkpoint-enc="$CHECKPOINT" \
            --output-path="$out_path" &

        # Limit concurrency to 8 parallel encoders (wait -n needs bash >= 4.3)
        if (( $(jobs -r | wc -l) >= 8 )); then
            wait -n
        fi
    done < <(find "$PHOENIX/$split" -mindepth 1 -maxdepth 1 -type d)
done

wait
echo "✅ All done."
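Because already-encoded sequences are skipped, the script can simply be re-run to resume after an interruption. A small progress check, assuming the `tokens` output directory used in the README:

```python
from pathlib import Path

# Count encoded sequences per split under the tokens/ directory
for split in ("train", "dev", "test"):
    n = len(list(Path("tokens", split).glob("*.pt")))
    print(f"{split}: {n} sequences encoded")
```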