
MemoryError During Video Processing: depth_splatting_inference.py #4

enoky opened this issue Dec 28, 2024 · 3 comments

enoky commented Dec 28, 2024

Hi,

While using StereoCrafter, I encountered a MemoryError when processing large videos, specifically during the write_video step in process_video. The script attempts to allocate a massive array (~24.7 GiB) for video_grid with shape (180, 1600, 3840, 3) and float64 data type, which exceeds typical system memory limits.
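For reference, the figure follows directly from the array shape and dtype, since each float64 value takes 8 bytes:

    # 180 frames x 1600 x 3840 pixels x 3 channels x 8 bytes per float64
    180 * 1600 * 3840 * 3 * 8 / 2**30  # ~24.72 GiB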

Suggested Fix

To address this, I implemented the following optimizations:

  1. Data Type Reduction: Convert arrays to float32 or uint8 before writing:

    video_grid = np.clip(video_grid * 255.0, 0, 255).astype(np.uint8)  # clip to avoid uint8 wrap-around
  2. Incremental Frame Processing: Instead of creating a massive video_grid, process and write frames incrementally using OpenCV:

    import cv2
    import numpy as np

    # Assumes `frames` holds float RGB frames in [0, 1] and that
    # output_path, fps, width, and height match the source video
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    for frame in frames:
        processed_frame = np.clip(frame * 255.0, 0, 255).astype(np.uint8)
        out.write(cv2.cvtColor(processed_frame, cv2.COLOR_RGB2BGR))
    out.release()
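Writing frame by frame means only a single output frame is buffered at a time; for the grid size above, one 1600 × 3840 × 3 uint8 frame is roughly 18 MB, versus ~24.7 GiB for the full grid.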

These changes significantly reduced memory usage and resolved the crash for larger videos.

Request

Could you consider integrating these optimizations into the codebase? Thanks for your excellent work on StereoCrafter!

@xiaoyu258 (Contributor)

Thanks for your contribution. This is helpful!

I will update the codebase according to these optimizations.

@enoky (Author) commented Dec 29, 2024

Here is a complete implementation for depth_splatting_inference.py.

import gc
import cv2
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.io import write_video

from diffusers.training_utils import set_seed
from fire import Fire
from decord import VideoReader, cpu

from dependency.DepthCrafter.depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from dependency.DepthCrafter.depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
from dependency.DepthCrafter.depthcrafter.utils import vis_sequence_depth, read_video_frames

from Forward_Warp import forward_warp

DEFAULT_INPUT_FOLDER = "./input_videos"
DEFAULT_OUTPUT_FOLDER = "./output_splatted"
DEFAULT_PRE_TRAINED_PATH = "./weights/stable-video-diffusion-img2vid-xt-1-1"
DEFAULT_UNET_PATH = "./weights/DepthCrafter"

class DepthCrafterDemo:
    def __init__(
            self,
            unet_path: str,
            pre_trained_path: str,
            cpu_offload: str = "model",
    ):
        unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
            unet_path,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
        )
        self.pipe = DepthCrafterPipeline.from_pretrained(
            pre_trained_path,
            unet=unet,
            torch_dtype=torch.float16,
            variant="fp16",
        )

        if cpu_offload is not None:
            if cpu_offload == "sequential":
                self.pipe.enable_sequential_cpu_offload()
            elif cpu_offload == "model":
                self.pipe.enable_model_cpu_offload()
            else:
                raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
        else:
            self.pipe.to("cuda")

        try:
            self.pipe.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(e)
            print("Xformers is not enabled")
        self.pipe.enable_attention_slicing()

    def infer(
            self,
            input_video_path: str,
            output_video_path: str,
            process_length: int = -1,
            num_denoising_steps: int = 8,
            guidance_scale: float = 1.2,
            window_size: int = 70,
            overlap: int = 25,
            max_res: int = 960,
            dataset: str = "open",
            target_fps: int = -1,
            seed: int = 42,
            track_time: bool = False,
            save_depth: bool = False,
    ):
        set_seed(seed)

        frames, target_fps, original_height, original_width = read_video_frames(
            input_video_path,
            process_length,
            target_fps,
            max_res,
            dataset,
        )

        with torch.inference_mode():
            res = self.pipe(
                frames,
                height=frames.shape[1],
                width=frames.shape[2],
                output_type="np",
                guidance_scale=guidance_scale,
                num_inference_steps=num_denoising_steps,
                window_size=window_size,
                overlap=overlap,
                track_time=track_time,
            ).frames[0]

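        # Average the three output channels into a single-channel depth map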
        res = res.sum(-1) / res.shape[-1]

        tensor_res = torch.tensor(res).unsqueeze(1).float().contiguous().cuda()
        res = F.interpolate(tensor_res, size=(original_height, original_width), mode='bilinear', align_corners=False)
        res = res.cpu().numpy()[:, 0, :, :]

        res = (res - res.min()) / (res.max() - res.min())
        vis = vis_sequence_depth(res)

        save_path = os.path.join(
            os.path.dirname(output_video_path), os.path.splitext(os.path.basename(output_video_path))[0]
        )

        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        if save_depth:
            np.savez_compressed(save_path + ".npz", depth=res)
            write_video(save_path + "_depth_vis.mp4", vis * 255.0, fps=target_fps, video_codec="h264",
                        options={"crf": "16"})

        return res, vis

class ForwardWarpStereo(nn.Module):
    def __init__(self, eps=1e-6, occlu_map=False):
        super(ForwardWarpStereo, self).__init__()
        self.eps = eps
        self.occlu_map = occlu_map
        self.fw = forward_warp()

    def forward(self, im, disp):
        im = im.contiguous()
        disp = disp.contiguous()
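        # Soft z-buffer: exponential weights let larger-disparity (nearer) pixels
        # dominate wherever multiple source pixels splat onto the same target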
        weights_map = disp - disp.min()
        weights_map = (1.414) ** weights_map
        flow = -disp.squeeze(1)
        dummy_flow = torch.zeros_like(flow, requires_grad=False)
        flow = torch.stack((flow, dummy_flow), dim=-1)
        res_accum = self.fw(im * weights_map, flow)
        mask = self.fw(weights_map, flow)
        mask.clamp_(min=self.eps)
        res = res_accum / mask
        if not self.occlu_map:
            return res
        else:
            ones = torch.ones_like(disp, requires_grad=False)
            occlu_map = self.fw(ones, flow)
            occlu_map.clamp_(0.0, 1.0)
            occlu_map = 1.0 - occlu_map
            return res, occlu_map

def process_video(input_video_path, output_video_path, depthcrafter_demo, max_disp=20.0, process_length=-1, batch_size=10):
    video_depth, depth_vis = depthcrafter_demo.infer(
        input_video_path,
        output_video_path,
        process_length,
    )

    vid_reader = VideoReader(input_video_path, ctx=cpu(0))
    original_fps = vid_reader.get_avg_fps()
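    # Note: the whole clip is still decoded into one float array here; the
    # memory savings below come from writing the output frames incrementally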
    input_frames = vid_reader[:].asnumpy() / 255.0

    if process_length != -1 and process_length < len(input_frames):
        input_frames = input_frames[:process_length]
        video_depth = video_depth[:process_length]
        depth_vis = depth_vis[:process_length]

    stereo_projector = ForwardWarpStereo(occlu_map=True).cuda()

    num_frames = len(input_frames)
    height, width, _ = input_frames[0].shape

    # Initialize OpenCV VideoWriter
    out = cv2.VideoWriter(
        output_video_path, 
        cv2.VideoWriter_fourcc(*'avc1'),  # h264 codec
        original_fps, 
        (width * 2, height * 2)
    )

    for i in range(0, num_frames, batch_size):
        batch_frames = input_frames[i:i+batch_size]
        batch_depth = video_depth[i:i+batch_size]
        batch_depth_vis = depth_vis[i:i+batch_size]

        left_video = torch.from_numpy(batch_frames).permute(0, 3, 1, 2).float().cuda()
        disp_map = torch.from_numpy(batch_depth).unsqueeze(1).float().cuda()
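        # Map normalized depth in [0, 1] to signed disparity in [-max_disp, max_disp]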
        disp_map = disp_map * 2.0 - 1.0
        disp_map = disp_map * max_disp

        with torch.no_grad():
            right_video, occlusion_mask = stereo_projector(left_video, disp_map)

        right_video = right_video.cpu().permute(0, 2, 3, 1).numpy()
        occlusion_mask = occlusion_mask.cpu().permute(0, 2, 3, 1).numpy().repeat(3, axis=-1)

        for j in range(len(batch_frames)):
            video_grid_top = np.concatenate([batch_frames[j], batch_depth_vis[j]], axis=1)
            video_grid_bottom = np.concatenate([occlusion_mask[j], right_video[j]], axis=1)
            video_grid = np.concatenate([video_grid_top, video_grid_bottom], axis=0)

            video_grid_uint8 = np.clip(video_grid * 255.0, 0, 255).astype(np.uint8)
            video_grid_bgr = cv2.cvtColor(video_grid_uint8, cv2.COLOR_RGB2BGR)
            out.write(video_grid_bgr)

        # Free up GPU memory
        del left_video, disp_map, right_video, occlusion_mask
        torch.cuda.empty_cache()
        gc.collect()

    out.release()

def main(
        input_folder: str = DEFAULT_INPUT_FOLDER,
        output_folder: str = DEFAULT_OUTPUT_FOLDER,
        unet_path: str = DEFAULT_UNET_PATH,
        pre_trained_path: str = DEFAULT_PRE_TRAINED_PATH,
        max_disp: float = 20.0,
        process_length: int = -1,
        batch_size: int = 10,  # frames processed per GPU batch
):
    os.makedirs(output_folder, exist_ok=True)
    depthcrafter_demo = DepthCrafterDemo(
        unet_path=unet_path,
        pre_trained_path=pre_trained_path
    )

    for filename in os.listdir(input_folder):
        if filename.endswith(".mp4") or filename.endswith(".avi"):
            input_video_path = os.path.join(input_folder, filename)
            output_video_path = os.path.join(output_folder, filename)
            print(f"Processing: {input_video_path}")
            process_video(input_video_path, output_video_path, depthcrafter_demo, max_disp, process_length, batch_size)

if __name__ == "__main__":
    Fire(main)
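Since the script wraps main with Fire, its keyword parameters become command-line flags; a typical invocation (using the default paths defined at the top of the script) would be:

    python depth_splatting_inference.py --input_folder ./input_videos --output_folder ./output_splatted --max_disp 20.0 --batch_size 10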

@xiaoyu258 (Contributor)

Hi enoky, I have updated the code according to your suggestions; it now requires much less GPU memory to run.

Your implementation is very helpful for me to improve the project, thanks a lot.
