From 7815e21f8c0945039b254d3508d759153c29436f Mon Sep 17 00:00:00 2001
From: ejolly <eshin.jolly@gmail.com>
Date: Fri, 18 Oct 2024 21:43:04 -0700
Subject: [PATCH] add faq page

---
 docs/_toc.yml                               |  1 +
 docs/basic_tutorials/02_detector_vids.ipynb | 18 +++----------
 docs/pages/faqs.md                          | 28 +++++++++++++++++++++
 docs/pages/intro.md                         |  2 ++
 feat/data.py                                | 14 ++---------
 feat/detector.py                            |  4 ++-
 feat/tests/test_data.py                     |  7 +++++-
 feat/utils/io.py                            | 15 +++++++++++
 8 files changed, 61 insertions(+), 28 deletions(-)
 create mode 100644 docs/pages/faqs.md

diff --git a/docs/_toc.yml b/docs/_toc.yml
index 36b05846..65526d06 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -7,6 +7,7 @@ parts:
       - file: pages/models
       - file: pages/au_reference
       - file: pages/usage_guide
+      - file: pages/faqs
   - caption: Basic Tutorials
     chapters:
       - file: basic_tutorials/01_basics
diff --git a/docs/basic_tutorials/02_detector_vids.ipynb b/docs/basic_tutorials/02_detector_vids.ipynb
index cd63c985..c1313e37 100644
--- a/docs/basic_tutorials/02_detector_vids.ipynb
+++ b/docs/basic_tutorials/02_detector_vids.ipynb
@@ -33,7 +33,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2021-03-24T21:06:23.837549Z",
@@ -41,23 +41,13 @@
     }
    },
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/esh/miniconda3/envs/py-feat/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      " from .autonotebook import tqdm as notebook_tqdm\n",
-      "/Users/esh/miniconda3/envs/py-feat/lib/python3.11/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.\n",
-      " @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)\n"
-     ]
-    },
     {
      "data": {
       "text/plain": [
        "Detector(face_model=img2pose, landmark_model=mobilefacenet, au_model=xgb, emotion_model=resmasknet, facepose_model=img2pose, identity_model=facenet)"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -81,7 +71,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2021-03-24T21:06:31.491788Z",
@@ -100,7 +90,7 @@
       "<IPython.core.display.Video object>"
      ]
     },
-    "execution_count": 2,
+    "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
diff --git a/docs/pages/faqs.md b/docs/pages/faqs.md
new file mode 100644
index 00000000..fe5493cb
--- /dev/null
+++ b/docs/pages/faqs.md
@@ -0,0 +1,28 @@
+# FAQs
+
+### Py-feat is detecting multiple faces in an image with a single face
+
+This can happen occasionally, particularly for videos, because our current models don't use information from one frame to inform predictions for another frame. Try increasing the `face_detection_threshold` argument to `Detector.detect()` from its default of `0.5` to something like `0.8` or `0.9`. This will make the detector more conservative about what it considers a face.
+
+### Py-feat is treating the same person as multiple identities or treating different people as the same identity
+
+Similar to the previous issue, you can control how strictly identity embeddings are treated as distinct by changing the `face_identity_threshold` argument to `Detector.detect()` from its default of `0.8` to something higher or lower. This will make the detector more conservative or more liberal, respectively, about how distinct identity embeddings must be to be considered different people.
+
+### How can I speed things up and control memory usage?
+
+By default, all images or video frames are processed independently in batches of size 1 using your CPU. If you have access to a CUDA-enabled GPU, you can use the `device` argument when initializing a detector instance to make use of it: `Detector(device='cuda')`. Unfortunately, macOS `'mps'` is not supported by our current model versions, but we hope to add it soon. To perform detections in parallel, increase the `batch_size` argument to `Detector.detect()` from the default of 1. The largest batch size you can use without crashing your kernel is limited by the amount of VRAM available to your GPU (or RAM if you're using the CPU).
+
+In order to use batching you must either:
+- use a video, where frames are all assumed to have the same dimensions
+- use a list of images, where each image has the same dimensions
+- use a list of images and set `output_size=(width, height)` in `.detect()` to resize all images to the same dimensions before processing
+
+You can control parallelization of data loading using the `num_workers` argument to `.detect()`, which is passed directly to PyTorch's [DataLoader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html).
+
+### Why does video processing take so long?
+
+This was a deliberate design tradeoff to ensure a seamless experience when processing videos of any length on any computer, regardless of memory limits or GPU availability. By default, Py-feat avoids loading an entire video into memory, only loading frames as needed, similar to PyTorch's [unofficial video API](https://pytorch.org/vision/main/auto_examples/others/plot_video_api.html#building-a-sample-read-video-function).
+
+This means you don't need to worry about your computer crashing if you're trying to process a video that doesn't fit into memory! However, it also means there's a small latency overhead that increases with the length of the video, i.e. later frames take longer to load than earlier frames because the video needs to be "seeked" to the correct time point.
+
+If you already know that you have enough system memory to load the entire video at once, you can instead manually call `video_to_tensor('videofile.mp4')` from `feat.utils.io`. Then you can process the tensor by passing `data_type='tensor'` to `Detector.detect()` and proceed with batching as usual.
diff --git a/docs/pages/intro.md b/docs/pages/intro.md
index ca752d5b..656b15cc 100644
--- a/docs/pages/intro.md
+++ b/docs/pages/intro.md
@@ -36,6 +36,8 @@ pip install py-feat
 
 For other installation methods (e.g. Google Colab, development) see the [how to install page](./installation.md)
 
+Check out our [FAQs](./faqs.md) for common issues and solutions.
+
 ## Available models
 
 Py-feat includes several **pre-trained** models for Action Unit detection, Emotion detection, Face detection, Facial Landmark detection, and Face/Head pose estimation.
diff --git a/feat/data.py b/feat/data.py
index d15f1692..724de2bc 100644
--- a/feat/data.py
+++ b/feat/data.py
@@ -2566,8 +2566,7 @@ class VideoDataset(Dataset):
         Dataset: dataset of [batch, channels, height, width] that can be passed to DataLoader
     """
 
-    def __init__(self, video_file, skip_frames=None, output_size=None, low_memory=True):
-        self.low_memory = low_memory
+    def __init__(self, video_file, skip_frames=None, output_size=None):
         self.file_name = video_file
         self.skip_frames = skip_frames
         self.output_size = output_size
@@ -2576,10 +2575,6 @@ def __init__(self, video_file, skip_frames=None, output_size=None, low_memory=Tr
         self.video_frames = np.arange(
             0, self.metadata["num_frames"], 1 if skip_frames is None else skip_frames
         )
-        if not self.low_memory:
-            self._container = av.open(self.file_name)
-            self._stream = self._container.streams.video[0]
-            self._frame_generator = self._container.decode(self._stream)
 
     def __len__(self):
         # Number of frames respective skip_frames
@@ -2587,12 +2582,7 @@ def __len__(self):
 
     def __getitem__(self, idx):
         # Get the frame data and frame number respective skip_frames
-        if self.low_memory:
-            frame_data, frame_idx = self.load_frame(idx)
-        else:
-            frame = next(self._frame_generator)
-            frame_data = torch.from_numpy(frame.to_ndarray(format="rgb24"))
-            frame_idx = int(self.video_frames[idx])
+        frame_data, frame_idx = self.load_frame(idx)
 
         # Swap frame dims to match output of read_image: [time, channels, height, width]
         # Otherwise detectors face on tensor dimension mismatch
diff --git a/feat/detector.py b/feat/detector.py
index 67073e34..b1587528 100644
--- a/feat/detector.py
+++ b/feat/detector.py
@@ -571,7 +571,9 @@ def detect(
             )
         elif data_type.lower() == "video":
             dataset = VideoDataset(
-                inputs, skip_frames=skip_frames, output_size=output_size
+                inputs,
+                skip_frames=skip_frames,
+                output_size=output_size,
             )
             data_loader = DataLoader(
                 dataset,
diff --git a/feat/tests/test_data.py b/feat/tests/test_data.py
index bfda7920..d8991b22 100644
--- a/feat/tests/test_data.py
+++ b/feat/tests/test_data.py
@@ -3,7 +3,7 @@
 import numpy as np
 import os
 from feat.data import Fex
-from feat.utils.io import read_openface, get_test_data_path, read_feat
+from feat.utils.io import read_openface, get_test_data_path, read_feat, video_to_tensor
 from nltools.data import Adjacency
 
 
@@ -293,3 +293,8 @@ def test_stats():
         [np.array(range(int(len(doubled) / 2))), np.array(range(int(len(doubled) / 2)))]
     )
     assert doubled.assign(frame=frame).isc(col="AU04_r").iloc[0, 0] == 1
+
+
+def test_video_to_tensor(single_face_mov):
+    tensor = video_to_tensor(single_face_mov)
+    assert tensor.shape == (72, 3, 360, 640)
diff --git a/feat/utils/io.py b/feat/utils/io.py
index 0d09515d..f14ed67e 100644
--- a/feat/utils/io.py
+++ b/feat/utils/io.py
@@ -25,6 +25,9 @@
 from torchvision.io import read_image, read_video
 from torchvision.transforms.functional import to_pil_image
 import warnings
+import av
+import torch
+from torch import swapaxes
 
 __all__ = [
     "get_resource_path",
@@ -231,3 +234,15 @@ def load_pil_img(file_name, frame_id):
     video, audio, info = read_video(file_name, output_format="TCHW")
     frame_img = video[frame_id, :, :]
     return to_pil_image(frame_img)
+
+
+def video_to_tensor(file_name):
+    container = av.open(file_name)
+    stream = container.streams.video[0]
+    tensor = []
+    for frame in container.decode(stream):
+        frame_data = torch.from_numpy(frame.to_ndarray(format="rgb24"))
+        frame_data = swapaxes(swapaxes(frame_data, 0, -1), 1, 2)
+        tensor.append(frame_data)
+    container.close()
+    return torch.stack(tensor, dim=0)
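Usage sketch for the first two FAQ entries added above (not part of the patch): a minimal example of tuning the detection and identity thresholds that faqs.md describes. It assumes py-feat is installed and that a local file named `my_video.mp4` exists; the file name and the specific threshold values are illustrative only.

```python
from feat import Detector

# Initialize with the default models
detector = Detector()

# Raise face_detection_threshold to make face detection more conservative
# (fewer spurious extra faces); lower face_identity_threshold to be more
# lenient about grouping similar embeddings into the same identity.
fex = detector.detect(
    "my_video.mp4",
    data_type="video",
    face_detection_threshold=0.8,
    face_identity_threshold=0.6,
)
print(fex.head())
```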
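Similarly, a rough sketch of the speed and memory knobs mentioned in the "How can I speed things up" entry (`device`, `batch_size`, `num_workers`, `output_size`). The image file names and parameter values are hypothetical, and this assumes a CUDA GPU is available.

```python
from feat import Detector

# Use a CUDA GPU if available (macOS 'mps' is not supported per the FAQ)
detector = Detector(device="cuda")

# A list of images can only be batched if they share dimensions;
# output_size resizes everything to a common (width, height) so batching
# still works when the originals differ.
images = ["img_01.jpg", "img_02.jpg", "img_03.jpg"]  # hypothetical paths
fex = detector.detect(
    images,
    batch_size=8,            # frames/images processed per forward pass
    num_workers=4,           # passed through to torch's DataLoader
    output_size=(640, 480),  # resize all inputs to the same dimensions
)
```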
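Finally, a sketch of the in-memory path enabled by the new `video_to_tensor` helper, per the last FAQ entry. It assumes the whole video fits in system memory; the file name and batch size are again just examples.

```python
from feat import Detector
from feat.utils.io import video_to_tensor

# Load every frame up front as a single [frames, channels, height, width] tensor
video = video_to_tensor("my_video.mp4")

# Detect on the pre-loaded tensor, avoiding per-frame seeking, and batch as usual
detector = Detector()
fex = detector.detect(video, data_type="tensor", batch_size=16)
```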