
MemoryError During Video Processing: depth_splatting_inference.py #4

enoky opened this issue Dec 28, 2024 · 3 comments

enoky commented Dec 28, 2024

Hi,

While using StereoCrafter, I encountered a MemoryError when processing large videos, specifically during the write_video step in process_video. The script attempts to allocate a massive array (~24.7 GiB) for video_grid with shape (180, 1600, 3840, 3) and float64 data type, which exceeds typical system memory limits.
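For reference, the figure follows directly from the array shape and dtype, since each float64 value takes 8 bytes:

    # 180 frames x 1600 x 3840 pixels x 3 channels x 8 bytes per float64
    180 * 1600 * 3840 * 3 * 8 / 2**30  # ~24.72 GiB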

Suggested Fix

To address this, I implemented the following optimizations:

  1. Data Type Reduction: Convert arrays to float32 or uint8 before writing:

    video_grid = np.clip(video_grid * 255.0, 0, 255).astype(np.uint8)  # clip to avoid uint8 wrap-around
  2. Incremental Frame Processing: Instead of creating a massive video_grid, process and write frames incrementally using OpenCV:

    import cv2
    import numpy as np

    # Assumes `frames` holds float RGB frames in [0, 1] and that
    # output_path, fps, width, and height match the source video
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    for frame in frames:
        processed_frame = np.clip(frame * 255.0, 0, 255).astype(np.uint8)
        out.write(cv2.cvtColor(processed_frame, cv2.COLOR_RGB2BGR))
    out.release()
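Writing frame by frame means only a single output frame is buffered at a time; for the grid size above, one 1600 × 3840 × 3 uint8 frame is roughly 18 MB, versus ~24.7 GiB for the full grid.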

These changes significantly reduced memory usage and resolved the crash for larger videos.

Request

Could you consider integrating these optimizations into the codebase? Thanks for your excellent work on StereoCrafter!

@xiaoyu258 (Contributor)

Thanks for your contribution. This is helpful!

I will update the codebase according to these optimizations.

@enoky (Author) commented Dec 29, 2024

Here is a complete implementation for depth_splatting_inference.py.

import gc
import cv2
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.io import write_video

from diffusers.training_utils import set_seed
from fire import Fire
from decord import VideoReader, cpu

from dependency.DepthCrafter.depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from dependency.DepthCrafter.depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
from dependency.DepthCrafter.depthcrafter.utils import vis_sequence_depth, read_video_frames

from Forward_Warp import forward_warp

DEFAULT_INPUT_FOLDER = "./input_videos"
DEFAULT_OUTPUT_FOLDER = "./output_splatted"
DEFAULT_PRE_TRAINED_PATH = "./weights/stable-video-diffusion-img2vid-xt-1-1"
DEFAULT_UNET_PATH = "./weights/DepthCrafter"

class DepthCrafterDemo:
    def __init__(
            self,
            unet_path: str,
            pre_trained_path: str,
            cpu_offload: str = "model",
    ):
        unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
            unet_path,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
        )
        self.pipe = DepthCrafterPipeline.from_pretrained(
            pre_trained_path,
            unet=unet,
            torch_dtype=torch.float16,
            variant="fp16",
        )

        if cpu_offload is not None:
            if cpu_offload == "sequential":
                self.pipe.enable_sequential_cpu_offload()
            elif cpu_offload == "model":
                self.pipe.enable_model_cpu_offload()
            else:
                raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
        else:
            self.pipe.to("cuda")

        try:
            self.pipe.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(e)
            print("Xformers is not enabled")
        self.pipe.enable_attention_slicing()

    def infer(
            self,
            input_video_path: str,
            output_video_path: str,
            process_length: int = -1,
            num_denoising_steps: int = 8,
            guidance_scale: float = 1.2,
            window_size: int = 70,
            overlap: int = 25,
            max_res: int = 960,
            dataset: str = "open",
            target_fps: int = -1,
            seed: int = 42,
            track_time: bool = False,
            save_depth: bool = False,
    ):
        set_seed(seed)

        frames, target_fps, original_height, original_width = read_video_frames(
            input_video_path,
            process_length,
            target_fps,
            max_res,
            dataset,
        )

        with torch.inference_mode():
            res = self.pipe(
                frames,
                height=frames.shape[1],
                width=frames.shape[2],
                output_type="np",
                guidance_scale=guidance_scale,
                num_inference_steps=num_denoising_steps,
                window_size=window_size,
                overlap=overlap,
                track_time=track_time,
            ).frames[0]

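        # Average the three output channels into a single-channel depth map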
        res = res.sum(-1) / res.shape[-1]

        tensor_res = torch.tensor(res).unsqueeze(1).float().contiguous().cuda()
        res = F.interpolate(tensor_res, size=(original_height, original_width), mode='bilinear', align_corners=False)
        res = res.cpu().numpy()[:, 0, :, :]

        res = (res - res.min()) / (res.max() - res.min())
        vis = vis_sequence_depth(res)

        save_path = os.path.join(
            os.path.dirname(output_video_path), os.path.splitext(os.path.basename(output_video_path))[0]
        )

        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        if save_depth:
            np.savez_compressed(save_path + ".npz", depth=res)
            write_video(save_path + "_depth_vis.mp4", vis * 255.0, fps=target_fps, video_codec="h264",
                        options={"crf": "16"})

        return res, vis

class ForwardWarpStereo(nn.Module):
    def __init__(self, eps=1e-6, occlu_map=False):
        super(ForwardWarpStereo, self).__init__()
        self.eps = eps
        self.occlu_map = occlu_map
        self.fw = forward_warp()

    def forward(self, im, disp):
        im = im.contiguous()
        disp = disp.contiguous()
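        # Soft z-buffer: exponential weights let larger-disparity (nearer) pixels
        # dominate wherever multiple source pixels splat onto the same target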
        weights_map = disp - disp.min()
        weights_map = (1.414) ** weights_map
        flow = -disp.squeeze(1)
        dummy_flow = torch.zeros_like(flow, requires_grad=False)
        flow = torch.stack((flow, dummy_flow), dim=-1)
        res_accum = self.fw(im * weights_map, flow)
        mask = self.fw(weights_map, flow)
        mask.clamp_(min=self.eps)
        res = res_accum / mask
        if not self.occlu_map:
            return res
        else:
            ones = torch.ones_like(disp, requires_grad=False)
            occlu_map = self.fw(ones, flow)
            occlu_map.clamp_(0.0, 1.0)
            occlu_map = 1.0 - occlu_map
            return res, occlu_map

def process_video(input_video_path, output_video_path, depthcrafter_demo, max_disp=20.0, process_length=-1, batch_size=10):
    video_depth, depth_vis = depthcrafter_demo.infer(
        input_video_path,
        output_video_path,
        process_length,
    )

    vid_reader = VideoReader(input_video_path, ctx=cpu(0))
    original_fps = vid_reader.get_avg_fps()
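    # Note: the whole clip is still decoded into one float array here; the
    # memory savings below come from writing the output frames incrementally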
    input_frames = vid_reader[:].asnumpy() / 255.0

    if process_length != -1 and process_length < len(input_frames):
        input_frames = input_frames[:process_length]
        video_depth = video_depth[:process_length]
        depth_vis = depth_vis[:process_length]

    stereo_projector = ForwardWarpStereo(occlu_map=True).cuda()

    num_frames = len(input_frames)
    height, width, _ = input_frames[0].shape

    # Initialize OpenCV VideoWriter
    out = cv2.VideoWriter(
        output_video_path, 
        cv2.VideoWriter_fourcc(*'avc1'),  # h264 codec
        original_fps, 
        (width * 2, height * 2)
    )

    for i in range(0, num_frames, batch_size):
        batch_frames = input_frames[i:i+batch_size]
        batch_depth = video_depth[i:i+batch_size]
        batch_depth_vis = depth_vis[i:i+batch_size]

        left_video = torch.from_numpy(batch_frames).permute(0, 3, 1, 2).float().cuda()
        disp_map = torch.from_numpy(batch_depth).unsqueeze(1).float().cuda()
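        # Map normalized depth in [0, 1] to signed disparity in [-max_disp, max_disp]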
        disp_map = disp_map * 2.0 - 1.0
        disp_map = disp_map * max_disp

        with torch.no_grad():
            right_video, occlusion_mask = stereo_projector(left_video, disp_map)

        right_video = right_video.cpu().permute(0, 2, 3, 1).numpy()
        occlusion_mask = occlusion_mask.cpu().permute(0, 2, 3, 1).numpy().repeat(3, axis=-1)

        for j in range(len(batch_frames)):
            video_grid_top = np.concatenate([batch_frames[j], batch_depth_vis[j]], axis=1)
            video_grid_bottom = np.concatenate([occlusion_mask[j], right_video[j]], axis=1)
            video_grid = np.concatenate([video_grid_top, video_grid_bottom], axis=0)

            video_grid_uint8 = np.clip(video_grid * 255.0, 0, 255).astype(np.uint8)
            video_grid_bgr = cv2.cvtColor(video_grid_uint8, cv2.COLOR_RGB2BGR)
            out.write(video_grid_bgr)

        # Free up GPU memory
        del left_video, disp_map, right_video, occlusion_mask
        torch.cuda.empty_cache()
        gc.collect()

    out.release()

def main(
        input_folder: str = DEFAULT_INPUT_FOLDER,
        output_folder: str = DEFAULT_OUTPUT_FOLDER,
        unet_path: str = DEFAULT_UNET_PATH,
        pre_trained_path: str = DEFAULT_PRE_TRAINED_PATH,
        max_disp: float = 20.0,
        process_length: int = -1,
        batch_size: int = 10,  # frames processed per GPU batch
):
    os.makedirs(output_folder, exist_ok=True)
    depthcrafter_demo = DepthCrafterDemo(
        unet_path=unet_path,
        pre_trained_path=pre_trained_path
    )

    for filename in os.listdir(input_folder):
        if filename.endswith(".mp4") or filename.endswith(".avi"):
            input_video_path = os.path.join(input_folder, filename)
            output_video_path = os.path.join(output_folder, filename)
            print(f"Processing: {input_video_path}")
            process_video(input_video_path, output_video_path, depthcrafter_demo, max_disp, process_length, batch_size)

if __name__ == "__main__":
    Fire(main)
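Since the script wraps main with Fire, its keyword parameters become command-line flags; a typical invocation (using the default paths defined at the top of the script) would be:

    python depth_splatting_inference.py --input_folder ./input_videos --output_folder ./output_splatted --max_disp 20.0 --batch_size 10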

@xiaoyu258 (Contributor)

Hi enoky, I have updated the code according to your suggestions; it now requires much less GPU memory to run.

Your implementation is very helpful for me to improve the project, thanks a lot.
