2 changes: 1 addition & 1 deletion .github/workflows/lint.yaml
@@ -20,7 +20,7 @@ jobs:
           python-version: '3.12'
 
       - name: Install Requirements
-        run: pip install .[dev]
+        run: make install
 
       - name: Lint Code
         run: ruff check .
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
@@ -20,7 +20,7 @@ jobs:
           python-version: '3.12'
 
      - name: Install Requirements
-        run: pip install .[dev]
+        run: make install
 
      - name: Test Code
        run: pytest .
4 changes: 3 additions & 1 deletion .gitignore
@@ -2,4 +2,6 @@
 build/
 text_to_tokenized_video.egg-info/
 **/__pycache__/
-.env
+.env
+checkpoints
+.claude
8 changes: 8 additions & 0 deletions Makefile
@@ -0,0 +1,8 @@
install:
	pip install git+https://github.com/nvidia-cosmos/cosmos-predict1.git --no-deps
	pip install ".[dev,cosmos]"

download_checkpoints:
	hf auth login
	hf download nvidia/Cosmos-Tokenize1-DV8x16x16-720p --local-dir checkpoints/Cosmos-Tokenize1-DV8x16x16-720p
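The `install` target pulls `cosmos-predict1` with `--no-deps` and then supplies the runtime dependencies this project actually uses via the `cosmos` extra in `pyproject.toml`. A quick sanity check after `make install` is importing the class that `encode_video.py` relies on; this is an illustrative sketch, not part of the Makefile:

```python
# Sanity check for `make install`: import the tokenizer class used by
# text_to_tokenized_video/tokenizer/encode_video.py.
from cosmos_predict1.tokenizer.inference.video_lib import CausalVideoTokenizer  # noqa: F401

print("cosmos-predict1 tokenizer import OK")
```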

43 changes: 39 additions & 4 deletions README.md
@@ -3,16 +3,51 @@
 ## Usage
 
 Installation:
-```bash
-pip install ".[dev]"
+
+```shell
+# Install dependencies
+make install
+# Download necessary checkpoints
+make download_checkpoints
 ```
 
 Lint:
-```bash
+
+```shell
 ruff check . --fix
 ```
 
 Test:
-```bash
+
+```shell
 pytest .
 ```
+
+### Download the Phoenix Dataset
+
+```shell
+# Download the PHOENIX-2014T v3 release
+wget https://www-i6.informatik.rwth-aachen.de/ftp/pub/rwth-phoenix/2016/phoenix-2014-T.v3.tar.gz
+
+# Extract the dataset archive
+tar -xvzf phoenix-2014-T.v3.tar.gz
+```
+
+### Tokenizer Example
+
+Encode a video:
+
+```shell
+python -m text_to_tokenized_video.tokenizer.encode_video \
+    --video=assets/example.mp4 \
+    --checkpoint-enc=checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/encoder.jit \
+    --output-path=test.pt
+```
+
+Encode a dataset:
+
+```shell
+export PHOENIX="PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px"
+export TOKENS_DIR="tokens"
+./text_to_tokenized_video/tokenizer/tokenize_dataset.sh
+```
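The saved output can be inspected with plain `torch.load`; a minimal sketch, assuming the `test.pt` path from the encode-a-video example. The structure of the saved object is whatever `CausalVideoTokenizer.encode` returns, so the tuple handling below is defensive:

```python
import torch

tokens = torch.load("test.pt")  # output path from the example above
print(type(tokens))
# encode() may return a tuple of tensors, so handle both cases
if isinstance(tokens, (tuple, list)):
    for t in tokens:
        print(getattr(t, "shape", t), getattr(t, "dtype", ""))
else:
    print(tokens.shape, tokens.dtype)
```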
Binary file added assets/example.mp4
8 changes: 8 additions & 0 deletions pyproject.toml
@@ -8,13 +8,20 @@ authors = [
 ]
 readme = "README.md"
 dependencies = [
+    "numpy",
+    "mediapy",
+    "torch",
 ]
 
 [project.optional-dependencies]
 dev = [
     "pytest",
     "ruff",
 ]
+cosmos = [
+    "loguru",
+    "einops",
+]
 
 [tool.ruff]
 line-length = 120
@@ -38,6 +45,7 @@ select = [
 [tool.setuptools]
 packages = [
     "text_to_tokenized_video",
+    "text_to_tokenized_video.tokenizer",
 ]
 
 [tool.pytest.ini_options]
Empty file.
84 changes: 84 additions & 0 deletions text_to_tokenized_video/tokenizer/encode_video.py
@@ -0,0 +1,84 @@
from functools import cache
from pathlib import Path

import mediapy
import numpy as np
import torch
import torch.nn.functional as F
from cosmos_predict1.tokenizer.inference.video_lib import CausalVideoTokenizer


def read_video(directory_or_file: Path):
    """Read a video file, or a directory of PNG frames, as an RGB array of shape (T, H, W, 3)."""
    if directory_or_file.is_dir():
        # Sort the frames so they are stacked in temporal order
        images = sorted(directory_or_file.glob("*.png"))
        frames = [mediapy.read_image(f) for f in images]
        return np.stack(frames, axis=0)

    return mediapy.read_video(directory_or_file)


@cache
def load_tokenizer(checkpoint_enc: Path | None = None, checkpoint_dec: Path | None = None, device=None):
    """Load (and memoize) the Cosmos tokenizer, defaulting to CUDA when available."""
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    return CausalVideoTokenizer(
        checkpoint_enc=checkpoint_enc,
        checkpoint_dec=checkpoint_dec,
        device=device,
    )


def resize_videos(videos, target_height=128, target_width=128):
    """Resize videos to the target resolution, rounding dimensions down to multiples of 16."""
    B, T, H, W, C = videos.shape

    # Round down to the nearest multiple of 16 (matching the 16x16 spatial compression)
    target_height = (target_height // 16) * 16
    target_width = (target_width // 16) * 16

    # Bilinear interpolation is not defined for uint8 tensors, so convert to float first
    videos = videos.float()
    videos = videos.view(B * T, H, W, C).permute(0, 3, 1, 2)  # [B*T, C, H, W]
    videos = F.interpolate(videos, size=(target_height, target_width), mode="bilinear", align_corners=False)
    # permute() yields a non-contiguous tensor, so reshape() rather than view()
    return videos.permute(0, 2, 3, 1).reshape(B, T, target_height, target_width, C)


@torch.no_grad()
def encode_video(video, checkpoint_enc, device=None):
    video_np = read_video(video)[..., :3]  # (T, H, W, 3); drop any alpha channel
    video_tensor = torch.from_numpy(video_np)

    # Add batch dimension: (1, T, H, W, 3)
    video_tensor = video_tensor.unsqueeze(0)
    video_tensor = resize_videos(video_tensor)  # also converts to float

    # Scale the uint8 range [0, 255] to [-1, 1]
    video_tensor = video_tensor / 127.5 - 1

    # Rearrange to the Bx3xTxHxW layout expected by the tokenizer
    video_tensor = video_tensor.permute(0, 4, 1, 2, 3)  # (B, C, T, H, W)

    tokenizer = load_tokenizer(checkpoint_enc=checkpoint_enc, device=device)
    return tokenizer.encode(video_tensor)  # returns tokens


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--video", type=Path, required=True)
    parser.add_argument("--checkpoint-enc", type=Path, required=True)
    parser.add_argument("--output-path", type=Path, required=True)
    args = parser.parse_args()

    tokens = encode_video(
        video=args.video,
        checkpoint_enc=args.checkpoint_enc,
    )

    torch.save(tokens, args.output_path)
    print(f"Saved tokens to {args.output_path}")
    print(tokens)
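For intuition about the token count: the checkpoint name `DV8x16x16` indicates 8x temporal and 16x16 spatial compression, so a back-of-the-envelope estimate for the 128x128 frames produced by `resize_videos` looks like the sketch below. The causal `(T - 1) // 8 + 1` temporal formula is an assumption based on the tokenizer being causal, not something verified against its output:

```python
# Rough token-grid arithmetic for Cosmos-Tokenize1-DV8x16x16 on resized input.
T, H, W = 49, 128, 128   # example frame count plus the resize target from resize_videos()
t = (T - 1) // 8 + 1     # assumed causal temporal compression: first frame, then 8x
h, w = H // 16, W // 16  # 16x16 spatial compression
print(t, h, w)           # 7 8 8 -> 7 * 8 * 8 = 448 token positions
```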
35 changes: 35 additions & 0 deletions text_to_tokenized_video/tokenizer/tokenize_dataset.sh
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
set -euo pipefail

CHECKPOINT="checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/encoder.jit"
SPLITS=("train" "dev" "test")

mkdir -p "$TOKENS_DIR"

for split in "${SPLITS[@]}"; do
    # Process substitution (rather than piping find into the loop) keeps the loop
    # in the current shell, so the background jobs stay visible to the final `wait`
    while read -r seq_dir; do
        seq_id=$(basename "$seq_dir")
        out_path="$TOKENS_DIR/$split/$seq_id.pt"
        mkdir -p "$(dirname "$out_path")"

        # Skip if already encoded, which makes interrupted runs resumable
        if [[ -f "$out_path" ]]; then
            echo "Skipping $split/$seq_id (already exists)"
            continue
        fi

        echo "Encoding $split/$seq_id..."
        python -m text_to_tokenized_video.tokenizer.encode_video \
            --video="$seq_dir" \
            --checkpoint-enc="$CHECKPOINT" \
            --output-path="$out_path" &

        # Limit concurrency to 8 parallel encoders (wait -n needs bash >= 4.3)
        if (( $(jobs -r | wc -l) >= 8 )); then
            wait -n
        fi
    done < <(find "$PHOENIX/$split" -mindepth 1 -maxdepth 1 -type d)
done

wait
echo "✅ All done."
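Because already-encoded sequences are skipped, the script can simply be re-run to resume after an interruption. A small progress check, assuming the `tokens` output directory used in the README:

```python
from pathlib import Path

# Count encoded sequences per split under the tokens/ directory
for split in ("train", "dev", "test"):
    n = len(list(Path("tokens", split).glob("*.pt")))
    print(f"{split}: {n} sequences encoded")
```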