MemoryError During Video Processing: depth_splatting_inference.py #4

Hi,

While using StereoCrafter, I encountered a MemoryError when processing large videos, specifically during the `write_video` step in `process_video`. The script attempts to allocate a massive array (~24.7 GiB) for `video_grid` with shape `(180, 1600, 3840, 3)` and `float64` data type, which exceeds typical system memory limits.

Suggested Fix

To address this, I implemented the following optimizations:

1. Data type reduction: convert arrays to `float32` or `uint8` before writing (reconstructed in the first sketch at the end of this post).
2. Incremental frame processing: instead of creating one massive `video_grid`, process and write frames incrementally using OpenCV (see the second sketch below).

These changes significantly reduced memory usage and resolved the crash for larger videos.

Request

Could you consider integrating these optimizations into the codebase? Thanks for your excellent work on StereoCrafter!
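The code snippets for the two optimizations did not survive the page export. Below are minimal reconstructions distilled from the complete implementation posted in the comments; the helper names (`to_uint8`, `write_frames_incrementally`) are illustrative, not from the repository. As a sanity check on the reported figure: 180 × 1600 × 3840 × 3 `float64` values at 8 bytes each is about 26.5 GB, i.e. roughly 24.7 GiB.

```python
import numpy as np

def to_uint8(frames_float: np.ndarray) -> np.ndarray:
    """Convert float frames in [0, 1] to uint8 in [0, 255] before encoding.

    uint8 stores 1 byte per value instead of float64's 8 (an 8x saving),
    and video encoders expect 8-bit frames anyway.
    """
    return np.clip(frames_float * 255.0, 0, 255).astype(np.uint8)
```

And the incremental-writing idea, assuming a sequence of RGB float frames in [0, 1]:

```python
import cv2
import numpy as np

def write_frames_incrementally(frames, path, fps):
    """Encode one frame at a time instead of building a giant (T, H, W, 3) array."""
    height, width = frames[0].shape[:2]
    writer = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
    for frame in frames:
        frame_u8 = np.clip(frame * 255.0, 0, 255).astype(np.uint8)
        writer.write(cv2.cvtColor(frame_u8, cv2.COLOR_RGB2BGR))  # OpenCV expects BGR
    writer.release()
```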
Comments
Thanks for your contribution. This is helpful! I will update the codebase according to these optimizations.
Here is a complete implementation:

```python
import gc
import os

import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.io import write_video
from diffusers.training_utils import set_seed
from fire import Fire
from decord import VideoReader, cpu

from dependency.DepthCrafter.depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from dependency.DepthCrafter.depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
from dependency.DepthCrafter.depthcrafter.utils import vis_sequence_depth, read_video_frames
from Forward_Warp import forward_warp

DEFAULT_INPUT_FOLDER = "./input_videos"
DEFAULT_OUTPUT_FOLDER = "./output_splatted"
DEFAULT_PRE_TRAINED_PATH = "./weights/stable-video-diffusion-img2vid-xt-1-1"
DEFAULT_UNET_PATH = "./weights/DepthCrafter"


class DepthCrafterDemo:
    def __init__(
        self,
        unet_path: str,
        pre_trained_path: str,
        cpu_offload: str = "model",
    ):
        unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
            unet_path,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
        )
        self.pipe = DepthCrafterPipeline.from_pretrained(
            pre_trained_path,
            unet=unet,
            torch_dtype=torch.float16,
            variant="fp16",
        )
        if cpu_offload is not None:
            if cpu_offload == "sequential":
                self.pipe.enable_sequential_cpu_offload()
            elif cpu_offload == "model":
                self.pipe.enable_model_cpu_offload()
            else:
                raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
        else:
            self.pipe.to("cuda")
        try:
            self.pipe.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(e)
            print("Xformers is not enabled")
        self.pipe.enable_attention_slicing()

    def infer(
        self,
        input_video_path: str,
        output_video_path: str,
        process_length: int = -1,
        num_denoising_steps: int = 8,
        guidance_scale: float = 1.2,
        window_size: int = 70,
        overlap: int = 25,
        max_res: int = 960,
        dataset: str = "open",
        target_fps: int = -1,
        seed: int = 42,
        track_time: bool = False,
        save_depth: bool = False,
    ):
        set_seed(seed)
        frames, target_fps, original_height, original_width = read_video_frames(
            input_video_path,
            process_length,
            target_fps,
            max_res,
            dataset,
        )
        with torch.inference_mode():
            res = self.pipe(
                frames,
                height=frames.shape[1],
                width=frames.shape[2],
                output_type="np",
                guidance_scale=guidance_scale,
                num_inference_steps=num_denoising_steps,
                window_size=window_size,
                overlap=overlap,
                track_time=track_time,
            ).frames[0]
        # Average the channels into a single-channel depth map.
        res = res.sum(-1) / res.shape[-1]
        tensor_res = torch.tensor(res).unsqueeze(1).float().contiguous().cuda()
        res = F.interpolate(tensor_res, size=(original_height, original_width), mode="bilinear", align_corners=False)
        res = res.cpu().numpy()[:, 0, :, :]
        # Normalize depth to [0, 1].
        res = (res - res.min()) / (res.max() - res.min())
        vis = vis_sequence_depth(res)
        save_path = os.path.join(
            os.path.dirname(output_video_path), os.path.splitext(os.path.basename(output_video_path))[0]
        )
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        if save_depth:
            np.savez_compressed(save_path + ".npz", depth=res)
            write_video(save_path + "_depth_vis.mp4", vis * 255.0, fps=target_fps, video_codec="h264",
                        options={"crf": "16"})
        return res, vis


class ForwardWarpStereo(nn.Module):
    def __init__(self, eps=1e-6, occlu_map=False):
        super(ForwardWarpStereo, self).__init__()
        self.eps = eps
        self.occlu_map = occlu_map
        self.fw = forward_warp()

    def forward(self, im, disp):
        im = im.contiguous()
        disp = disp.contiguous()
        # Weight pixels by disparity so nearer content wins where warped pixels collide.
        weights_map = disp - disp.min()
        weights_map = (1.414) ** weights_map
        flow = -disp.squeeze(1)
        dummy_flow = torch.zeros_like(flow, requires_grad=False)
        flow = torch.stack((flow, dummy_flow), dim=-1)
        res_accum = self.fw(im * weights_map, flow)
        mask = self.fw(weights_map, flow)
        mask.clamp_(min=self.eps)
        res = res_accum / mask
        if not self.occlu_map:
            return res
        else:
            ones = torch.ones_like(disp, requires_grad=False)
            occlu_map = self.fw(ones, flow)
            occlu_map.clamp_(0.0, 1.0)
            occlu_map = 1.0 - occlu_map
            return res, occlu_map


def process_video(input_video_path, output_video_path, depthcrafter_demo, max_disp=20.0, process_length=-1, batch_size=10):
    video_depth, depth_vis = depthcrafter_demo.infer(
        input_video_path,
        output_video_path,
        process_length,
    )
    vid_reader = VideoReader(input_video_path, ctx=cpu(0))
    original_fps = vid_reader.get_avg_fps()
    # float32 instead of numpy's default float64 halves host-memory usage.
    input_frames = vid_reader[:].asnumpy().astype(np.float32) / 255.0
    if process_length != -1 and process_length < len(input_frames):
        input_frames = input_frames[:process_length]
        video_depth = video_depth[:process_length]
        depth_vis = depth_vis[:process_length]
    stereo_projector = ForwardWarpStereo(occlu_map=True).cuda()
    num_frames = len(input_frames)
    height, width, _ = input_frames[0].shape
    # Initialize OpenCV VideoWriter
    out = cv2.VideoWriter(
        output_video_path,
        cv2.VideoWriter_fourcc(*"avc1"),  # h264 codec
        original_fps,
        (width * 2, height * 2),
    )
    for i in range(0, num_frames, batch_size):
        batch_frames = input_frames[i:i + batch_size]
        batch_depth = video_depth[i:i + batch_size]
        batch_depth_vis = depth_vis[i:i + batch_size]
        left_video = torch.from_numpy(batch_frames).permute(0, 3, 1, 2).float().cuda()
        disp_map = torch.from_numpy(batch_depth).unsqueeze(1).float().cuda()
        # Map normalized depth [0, 1] to signed disparity [-max_disp, max_disp].
        disp_map = disp_map * 2.0 - 1.0
        disp_map = disp_map * max_disp
        with torch.no_grad():
            right_video, occlusion_mask = stereo_projector(left_video, disp_map)
        right_video = right_video.cpu().permute(0, 2, 3, 1).numpy()
        occlusion_mask = occlusion_mask.cpu().permute(0, 2, 3, 1).numpy().repeat(3, axis=-1)
        for j in range(len(batch_frames)):
            # 2x2 grid: input | depth vis on top, occlusion mask | right view below.
            video_grid_top = np.concatenate([batch_frames[j], batch_depth_vis[j]], axis=1)
            video_grid_bottom = np.concatenate([occlusion_mask[j], right_video[j]], axis=1)
            video_grid = np.concatenate([video_grid_top, video_grid_bottom], axis=0)
            video_grid_uint8 = np.clip(video_grid * 255.0, 0, 255).astype(np.uint8)
            video_grid_bgr = cv2.cvtColor(video_grid_uint8, cv2.COLOR_RGB2BGR)
            out.write(video_grid_bgr)
        # Free up GPU memory
        del left_video, disp_map, right_video, occlusion_mask
        torch.cuda.empty_cache()
        gc.collect()
    out.release()


def main(
    input_folder: str = DEFAULT_INPUT_FOLDER,
    output_folder: str = DEFAULT_OUTPUT_FOLDER,
    unet_path: str = DEFAULT_UNET_PATH,
    pre_trained_path: str = DEFAULT_PRE_TRAINED_PATH,
    max_disp: float = 20.0,
    process_length: int = -1,
    batch_size: int = 10,  # Added batch_size parameter
):
    os.makedirs(output_folder, exist_ok=True)
    depthcrafter_demo = DepthCrafterDemo(
        unet_path=unet_path,
        pre_trained_path=pre_trained_path,
    )
    for filename in os.listdir(input_folder):
        if filename.endswith(".mp4") or filename.endswith(".avi"):
            input_video_path = os.path.join(input_folder, filename)
            output_video_path = os.path.join(output_folder, filename)
            print(f"Processing: {input_video_path}")
            process_video(input_video_path, output_video_path, depthcrafter_demo, max_disp, process_length, batch_size)


if __name__ == "__main__":
    Fire(main)
```
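One further saving the patch above does not take: `process_video` still decodes the whole clip into host RAM at `vid_reader[:].asnumpy()`. A possible refinement is sketched below, under the assumption that chunked decoding via decord's `get_batch` behaves well for your container/codec; `iter_frame_chunks` is a hypothetical helper, not part of the posted code.

```python
import numpy as np
from decord import VideoReader, cpu

def iter_frame_chunks(video_path: str, chunk_size: int = 10, limit: int = -1):
    """Yield float32 RGB frame chunks so only one batch is resident at a time."""
    vr = VideoReader(video_path, ctx=cpu(0))
    total = len(vr) if limit == -1 else min(limit, len(vr))
    for start in range(0, total, chunk_size):
        indices = list(range(start, min(start + chunk_size, total)))
        # get_batch decodes only the requested frames.
        yield vr.get_batch(indices).asnumpy().astype(np.float32) / 255.0
```

The per-batch loop in `process_video` could then consume these chunks directly, which would pair naturally with the existing `batch_size` parameter.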
Hi enoky, I have updated the code according to your suggestions, and it now requires much less GPU memory to run. Your implementation was very helpful for improving the project, thanks a lot.