diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index a3aee75..2c19d4a 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -20,7 +20,7 @@ jobs:
         python-version: '3.12'
 
     - name: Install Requirements
-      run: pip install .[dev]
+      run: make install
 
     - name: Lint Code
       run: ruff check .
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 2075581..0e7df69 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -20,7 +20,7 @@ jobs:
         python-version: '3.12'
 
    - name: Install Requirements
-      run: pip install .[dev]
+      run: make install
 
    - name: Test Code
      run: pytest .
diff --git a/.gitignore b/.gitignore
index 06babd8..467f841 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,6 @@
 build/
 text_to_tokenized_video.egg-info/
 **/__pycache__/
-.env
\ No newline at end of file
+.env
+checkpoints
+.claude
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9ab864e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,8 @@
+install:
+	pip install git+https://github.com/nvidia-cosmos/cosmos-predict1.git --no-deps
+	pip install ".[dev,cosmos]"
+
+download_checkpoints:
+	hf auth login
+	hf download nvidia/Cosmos-Tokenize1-DV8x16x16-720p --local-dir checkpoints/Cosmos-Tokenize1-DV8x16x16-720p
+
diff --git a/README.md b/README.md
index be226a2..0e23aa0 100644
--- a/README.md
+++ b/README.md
@@ -3,16 +3,51 @@
 ## Usage
 
 Installation:
-```bash
-pip install ".[dev]"
+
+```shell
+# Install dependencies
+make install
+# Download necessary checkpoints
+make download_checkpoints
 ```
 
 Lint:
-```bash
+
+```shell
 ruff check . --fix
 ```
 
 Test:
-```bash
+
+```shell
 pytest .
+```
+
+### Download the Phoenix Dataset
+
+```shell
+# Download the PHOENIX-2014T v3 release
+wget https://www-i6.informatik.rwth-aachen.de/ftp/pub/rwth-phoenix/2016/phoenix-2014-T.v3.tar.gz
+
+# Extract the dataset archive
+tar -xvzf phoenix-2014-T.v3.tar.gz
+```
+
+### Tokenizer Example
+
+Encode a video:
+
+```shell
+python -m text_to_tokenized_video.tokenizer.encode_video \
+    --video=assets/example.mp4 \
+    --checkpoint-enc=checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/encoder.jit \
+    --output-path=test.pt
+```
+
+Encode a dataset:
+
+```shell
+export PHOENIX="PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px"
+export TOKENS_DIR="tokens"
+./text_to_tokenized_video/tokenizer/tokenize_dataset.sh
 ```
\ No newline at end of file
diff --git a/assets/example.mp4 b/assets/example.mp4
new file mode 100644
index 0000000..734a6fe
Binary files /dev/null and b/assets/example.mp4 differ
diff --git a/pyproject.toml b/pyproject.toml
index 8417ec5..7d2469b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,6 +8,9 @@ authors = [
 ]
 readme = "README.md"
 dependencies = [
+    "numpy",
+    "mediapy",
+    "torch",
 ]
 
 [project.optional-dependencies]
@@ -15,6 +18,10 @@ dev = [
     "pytest",
     "ruff",
 ]
+cosmos = [
+    "loguru",
+    "einops",
+]
 
 [tool.ruff]
 line-length = 120
@@ -38,6 +45,7 @@ select = [
 [tool.setuptools]
 packages = [
     "text_to_tokenized_video",
+    "text_to_tokenized_video.tokenizer",
 ]
 
 [tool.pytest.ini_options]
diff --git a/text_to_tokenized_video/tokenizer/__init__.py b/text_to_tokenized_video/tokenizer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/text_to_tokenized_video/tokenizer/encode_video.py b/text_to_tokenized_video/tokenizer/encode_video.py
new file mode 100644
index 0000000..42b0095
--- /dev/null
+++ b/text_to_tokenized_video/tokenizer/encode_video.py
@@ -0,0 +1,84 @@
+from functools import cache
+from pathlib import Path
+
+import mediapy
+import numpy as np
+import torch
+import torch.nn.functional as F
+from cosmos_predict1.tokenizer.inference.video_lib import CausalVideoTokenizer
+
+
+def read_video(directory_or_file: Path):
+    """Read an .mp4 file or a directory of PNG frames; returns an RGB array of shape (T, H, W, 3)."""
+    if directory_or_file.is_dir():
+        # Sort the frame paths so the video is assembled in frame order
+        images = sorted(directory_or_file.glob("*.png"))
+        frames = [mediapy.read_image(f) for f in images]
+        return np.stack(frames, axis=0)
+
+    return mediapy.read_video(directory_or_file)
+
+
+@cache
+def load_tokenizer(checkpoint_enc: Path | None = None, checkpoint_dec: Path | None = None, device=None):
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    return CausalVideoTokenizer(
+        checkpoint_enc=checkpoint_enc,
+        checkpoint_dec=checkpoint_dec,
+        device=device,
+    )
+
+
+def resize_videos(videos, target_height=128, target_width=128):
+    """Resize videos to the target resolution, ensuring dimensions are divisible by 16."""
+    B, T, H, W, C = videos.shape
+
+    # Round the target dimensions down to a multiple of 16
+    target_height = (target_height // 16) * 16
+    target_width = (target_width // 16) * 16
+
+    # Cast to float because bilinear interpolation does not support uint8 input
+    videos = videos.reshape(B * T, H, W, C).permute(0, 3, 1, 2).float()  # [B*T, C, H, W]
+    videos = F.interpolate(videos, size=(target_height, target_width), mode="bilinear", align_corners=False)
+    # reshape rather than view: the permuted tensor is no longer contiguous
+    return videos.permute(0, 2, 3, 1).reshape(B, T, target_height, target_width, C)
+
+
+@torch.no_grad()
+def encode_video(video, checkpoint_enc, device=None):
+    video_np = read_video(video)[..., :3]  # (T, H, W, 3), dropping any alpha channel
+    video_tensor = torch.from_numpy(video_np)
+
+    # Add batch dimension: (1, T, H, W, 3)
+    video_tensor = video_tensor.unsqueeze(0)
+    video_tensor = resize_videos(video_tensor)
+
+    # Normalize from [0, 255] to [-1, 1]
+    video_tensor = video_tensor.to(torch.float) / 127.5 - 1
+
+    # Rearrange to the Bx3xTxHxW layout expected by the tokenizer
+    video_tensor = video_tensor.permute(0, 4, 1, 2, 3)  # (B, C, T, H, W)
+
+    tokenizer = load_tokenizer(checkpoint_enc=checkpoint_enc, device=device)
+    return tokenizer.encode(video_tensor)  # returns tokens
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--video", type=Path, required=True)
+    parser.add_argument("--checkpoint-enc", type=Path, required=True)
+    parser.add_argument("--output-path", type=Path, required=True)
+    args = parser.parse_args()
+
+    tokens = encode_video(
+        video=args.video,
+        checkpoint_enc=args.checkpoint_enc,
+    )
+
+    torch.save(tokens, args.output_path)
+    print(f"Saved tokens to {args.output_path}")
+    print(tokens)
diff --git a/text_to_tokenized_video/tokenizer/tokenize_dataset.sh b/text_to_tokenized_video/tokenizer/tokenize_dataset.sh
new file mode 100644
index 0000000..ed06af0
--- /dev/null
+++ b/text_to_tokenized_video/tokenizer/tokenize_dataset.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+CHECKPOINT="checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/encoder.jit"
+SPLITS=("train" "dev" "test")
+
+mkdir -p "$TOKENS_DIR"
+
+for split in "${SPLITS[@]}"; do
+    # Process substitution (rather than a pipe) keeps the loop in the current shell,
+    # so the background encode jobs are visible to `jobs` and the final `wait`.
+    while read -r seq_dir; do
+        seq_id=$(basename "$seq_dir")
+        out_path="$TOKENS_DIR/$split/$seq_id.pt"
+        mkdir -p "$(dirname "$out_path")"
+
+        # Skip sequences that were already encoded
+        if [[ -f "$out_path" ]]; then
+            echo "Skipping $split/$seq_id (already exists)"
+            continue
+        fi
+
+        echo "Encoding $split/$seq_id..."
+        # Each PHOENIX sequence directory holds per-frame PNGs; encode_video reads them directly
+        python -m text_to_tokenized_video.tokenizer.encode_video \
+            --video="$seq_dir" \
+            --checkpoint-enc="$CHECKPOINT" \
+            --output-path="$out_path" &
+
+        # Keep at most 8 encode jobs in flight
+        if (( $(jobs -r | wc -l) >= 8 )); then
+            wait -n
+        fi
+    done < <(find "$PHOENIX/$split" -mindepth 1 -maxdepth 1 -type d)
+done
+
+wait
+echo "✅ All done."
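
For a quick sanity check of an encoded sequence, the tensor written by `encode_video.py` can be reloaded with `torch.load`. A minimal sketch, assuming the DV tokenizer's `encode` returns either a tensor of token indices or a tuple whose first element is that tensor (the exact return type depends on the installed cosmos-predict1 version); the output path matches the README example:

```python
# Sketch: inspect the tokens saved by encode_video.py (output path taken from the README example).
import torch

saved = torch.load("test.pt", map_location="cpu")

# Assumption: a discrete (DV) tokenizer returns (indices, ...) or a bare index tensor.
indices = saved[0] if isinstance(saved, (tuple, list)) else saved

print(type(saved))
print(indices.shape)  # roughly (1, 1 + T // 8, H // 16, W // 16) for the DV8x16x16 tokenizer
print(indices.dtype)
```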