diff --git a/gdl/datasets/FaceVideoDataModule.py b/gdl/datasets/FaceVideoDataModule.py index 1009415..a27c4cb 100644 --- a/gdl/datasets/FaceVideoDataModule.py +++ b/gdl/datasets/FaceVideoDataModule.py @@ -20,12 +20,14 @@ from torch.utils.data.dataloader import DataLoader import os, sys +import subprocess from pathlib import Path import numpy as np import torch # import torchaudio from typing import Optional, Union, List import pickle as pkl +import hickle as hkl # from collections import OrderedDict from tqdm import tqdm, auto # import subprocess @@ -37,10 +39,18 @@ from gdl.datasets.UnsupervisedImageDataset import UnsupervisedImageDataset from facenet_pytorch import InceptionResnetV1 from collections import OrderedDict -from gdl.datasets.IO import save_emotion +from gdl.datasets.IO import save_emotion, save_segmentation_list, save_reconstruction_list, save_emotion_list from PIL import Image, ImageDraw, ImageFont import cv2 from skimage.io import imread +from skvideo.io import vreader, vread +import skvideo.io +import torch.nn.functional as F + +from gdl.datasets.VideoFaceDetectionDataset import VideoFaceDetectionDataset +import types + +from gdl.utils.FaceDetector import save_landmark, save_landmark_v2 # from memory_profiler import profile @@ -61,24 +71,49 @@ def __init__(self, root_dir, output_dir, processed_subfolder=None, face_detector_threshold=0.9, image_size=224, scale=1.25, - device=None): + processed_video_size=256, + device=None, + unpack_videos=True, + save_detection_images=True, + save_landmarks=True, + save_landmarks_one_file=False, + save_segmentation_frame_by_frame=True, + save_segmentation_one_file=False, + bb_center_shift_x=0, # in relative numbers + bb_center_shift_y=0, # in relative numbers (i.e. -0.1 for 10% shift upwards, ...) + include_processed_audio = True, + include_raw_audio = True, + preload_videos = False, + inflate_by_video_size = False, + read_video=True, + ): super().__init__(root_dir, output_dir, processed_subfolder=processed_subfolder, face_detector=face_detector, face_detector_threshold=face_detector_threshold, image_size = image_size, scale = scale, - device=device) - - + device=device, + save_detection_images=save_detection_images, + save_landmarks_frame_by_frame=save_landmarks, + save_landmarks_one_file=save_landmarks_one_file, + save_segmentation_frame_by_frame=save_segmentation_frame_by_frame, # default + save_segmentation_one_file=save_segmentation_one_file, # only use for large scale video datasets (that would produce too many files otherwise) + bb_center_shift_x=bb_center_shift_x, # in relative numbers + bb_center_shift_y=bb_center_shift_y, # in relative numbers (i.e. -0.1 for 10% shift upwards, ...) 
+ ) + self.unpack_videos = unpack_videos + self.detect_landmarks_on_restored_images = None + self.processed_video_size = processed_video_size # self._instantiate_detector() # self.face_recognition = InceptionResnetV1(pretrained='vggface2').eval().to(device) - - self.version = 2 + # self.version = 2 + self.version = 3 self.video_list = None self.video_metas = None + self.audio_metas = None self.annotation_list = None self.frame_lists = None self.loaded = False @@ -88,6 +123,14 @@ def __init__(self, root_dir, output_dir, processed_subfolder=None, self.detection_centers = [] self.detection_sizes = [] + self.include_processed_audio = include_processed_audio + self.include_raw_audio = include_raw_audio + self.preload_videos = preload_videos + self.inflate_by_video_size = inflate_by_video_size + + self._must_include_audio = False + self.read_video=read_video + @property def metadata_path(self): return os.path.join(self.output_dir, "metadata.pkl") @@ -106,6 +149,8 @@ def prepare_data(self, *args, **kwargs): self._unpack_videos() self._saveMeta() + def _is_video_dataset(self): + return True def _unpack_videos(self): self.frame_lists = [] @@ -115,6 +160,12 @@ def _unpack_videos(self): def get_frame_number_format(self): return "%06d" + def count_num_frames(self): + num_frames = 0 + for i in range(len(self.video_metas)): + num_frames += self.video_metas[i]['num_frames'] + return num_frames + # def _get_unpacked_video_subfolder(self, video_idx): # return Path(self._video_category(video_idx)) / video_file.parts[-3] /self._video_set(video_idx) / video_file.stem @@ -152,7 +203,28 @@ def _unpack_video(self, video_idx, overwrite=False): % (expected_frames, n_frames, str(video_file))) - def _detect_faces(self): + def _extract_audio(self): + # extract audio for all videos + print("Extracting audio for all videos") + for vi, video_file in enumerate(auto.tqdm(self.video_list)): + self._extract_audio_for_video(vi) + print("Audio extracted for all videos") + + def _extract_audio_for_video(self, video_idx): + video_file = Path(self.root_dir) / self.video_list[video_idx] + audio_file = self._get_path_to_sequence_audio(video_idx) + + # extract the audio from the video using ffmpeg + if not audio_file.is_file(): + # print("Extracting audio from video '%s'" % str(video_file)) + audio_file.parent.mkdir(exist_ok=True, parents=True) + cmd = "ffmpeg -i " + str(video_file) + " -f wav -vn -y " + str(audio_file) + ' -loglevel quiet' + os.system(cmd) + else: + print("Skipped extracting audio from video '%s' because it already exists" % str(video_file)) + + + def _detect_faces(self): #, videos_unpacked=True): #, save_detection_images=True, save_landmarks=True): for sid in range(self.num_sequences): self._detect_faces_in_sequence(sid) @@ -169,21 +241,65 @@ def _get_path_to_sequence_files(self, sequence_id, file_type, method="", suffix= out_folder = Path(self.output_dir) / suffix return out_folder + def _get_path_to_sequence_audio(self, sequence_id): + return self._get_path_to_sequence_files(sequence_id, "audio").with_suffix(".wav") def _get_path_to_sequence_frames(self, sequence_id): return self._get_path_to_sequence_files(sequence_id, "videos") + def _get_path_to_aligned_videos(self, sequence_id): + return self._get_path_to_sequence_files(sequence_id, "videos_aligned").with_suffix(".mp4") + def _get_path_to_sequence_detections(self, sequence_id): return self._get_path_to_sequence_files(sequence_id, "detections") - def _get_path_to_sequence_landmarks(self, sequence_id): - return self._get_path_to_sequence_files(sequence_id, 
"landmarks") + def _get_landmark_method(self): + return "" # for backwards compatibility (AffectNet, ...), the inheriting classes should specify the method + + def _get_path_to_sequence_landmarks(self, sequence_id, use_aligned_videos=False): + + if self.save_detection_images: + # landmarks will be saved wrt to the detection images + landmark_subfolder = "landmarks" + elif use_aligned_videos: + landmark_subfolder = "landmarks_aligned" + else: + # landmarks will be saved wrt to the original images (not the detection images), + # so better put them in a different folder to make it clear + landmark_subfolder = "landmarks_original" - def _get_path_to_sequence_segmentations(self, sequence_id): - return self._get_path_to_sequence_files(sequence_id, "segmentations") + method = self._get_landmark_method() - def _get_path_to_sequence_emotions(self, sequence_id): - return self._get_path_to_sequence_files(sequence_id, "emotions") + return self._get_path_to_sequence_files(sequence_id, landmark_subfolder, method=method) + + def _get_segmentation_method(self): + return "" + + def _get_path_to_sequence_segmentations(self, sequence_id, use_aligned_videos=False): + if self.save_detection_images: + # landmarks will be saved wrt to the detection images + segmentation_subfolder = "segmentations" + elif use_aligned_videos: + segmentation_subfolder = "segmentations_aligned" + else: + # landmarks will be saved wrt to the original images (not the detection images), + # so better put them in a different folder to make it clear + segmentation_subfolder = "segmentations_original" + + method = self._get_segmentation_method() + + return self._get_path_to_sequence_files(sequence_id, segmentation_subfolder, method=method) + # return self._get_path_to_sequence_files(sequence_id, "segmentations") + + + # def _get_path_to_sequence_landmarks(self, sequence_id): + # return self._get_path_to_sequence_files(sequence_id, "landmarks") + + # def _get_path_to_sequence_segmentations(self, sequence_id): + # return self._get_path_to_sequence_files(sequence_id, "segmentations") + + def _get_path_to_sequence_emotions(self, sequence_id, emo_method="resnet50"): + return self._get_path_to_sequence_files(sequence_id, "emotions", method=emo_method) def _video_category(self, sequence_id): video_file = self.video_list[sequence_id] @@ -202,7 +318,7 @@ def _get_path_to_sequence_reconstructions(self, sequence_id, rec_method='emoca', if rec_method == 'deca': return self._get_path_to_sequence_files(sequence_id, "reconstructions", "", suffix) else: - assert rec_method in ['emoca', 'deep3dface'] + assert rec_method in ['emoca', 'deep3dface', 'spectre'] return self._get_path_to_sequence_files(sequence_id, "reconstructions", rec_method, suffix) # video_file = self.video_list[sequence_id] # if rec_method == 'deca': @@ -227,11 +343,18 @@ def _detect_faces_in_sequence(self, sequence_id): # suffix = Path(self._video_category(sequence_id)) / 'detections' /self._video_set(sequence_id) / video_file.stem out_detection_folder = self._get_path_to_sequence_detections(sequence_id) out_detection_folder.mkdir(exist_ok=True, parents=True) - out_file = out_detection_folder / "bboxes.pkl" + out_file_boxes = out_detection_folder / "bboxes.pkl" out_landmark_folder = self._get_path_to_sequence_landmarks(sequence_id) out_landmark_folder.mkdir(exist_ok=True, parents=True) + if self.save_landmarks_one_file: + overwrite = False + if not overwrite and (out_landmark_folder / "landmarks.pkl").is_file() and (out_landmark_folder / "landmarks_original.pkl").is_file() and 
(out_landmark_folder / "landmark_types.pkl").is_file(): + print("Files with landmarks already found in '%s'. Skipping" % out_landmark_folder) + return + + centers_all = [] sizes_all = [] detection_fnames_all = [] @@ -251,21 +374,222 @@ def _detect_faces_in_sequence(self, sequence_id): # detector_instantion_frequency = 200 start_fid = 0 - frame_list = self.frame_lists[sequence_id] - fid = 0 - if len(frame_list) == 0: - print("Nothing to detect in: '%s'. All frames have been processed" % self.video_list[sequence_id]) - for fid, frame_fname in enumerate(tqdm(range(start_fid, len(frame_list)))): + if self.unpack_videos: + frame_list = self.frame_lists[sequence_id] + fid = 0 + if len(frame_list) == 0: + print("Nothing to detect in: '%s'. All frames have been processed" % self.video_list[sequence_id]) + for fid, frame_fname in enumerate(tqdm(range(start_fid, len(frame_list)))): - # if fid % detector_instantion_frequency == 0: - # self._instantiate_detector(overwrite=True) + # if fid % detector_instantion_frequency == 0: + # self._instantiate_detector(overwrite=True) - self._detect_faces_in_image_wrapper(frame_list, fid, out_detection_folder, out_landmark_folder, out_file, - centers_all, sizes_all, detection_fnames_all, landmark_fnames_all) + self._detect_faces_in_image_wrapper(frame_list, fid, out_detection_folder, out_landmark_folder, out_file_boxes, + centers_all, sizes_all, detection_fnames_all, landmark_fnames_all) - FaceVideoDataModule.save_detections(out_file, + else: + num_frames = self.video_metas[sequence_id]['num_frames'] + if self.detect_landmarks_on_restored_images is None: + video_name = self.root_dir / self.video_list[sequence_id] + else: + video_name = video_file = self._get_path_to_sequence_restored( + sequence_id, method=self.detect_landmarks_on_restored_images) + assert video_name.is_file() + if start_fid == 0: + videogen = vreader(str(video_name)) + # videogen = vread(str(video_name)) + # for i in range(start_fid): + # _discarded_frame = next(videogen + else: + videogen = vread(str(video_name)) + + if self.save_landmarks_one_file: + out_landmarks_all = [] # landmarks wrt to the aligned image + out_landmarks_original_all = [] # landmarks wrt to the original image + out_bbox_type_all = [] + else: + out_landmarks_all = None + out_landmarks_original_all = None + out_bbox_type_all = None + + for fid in tqdm(range(start_fid, num_frames)): + self._detect_faces_in_image_wrapper(videogen, fid, out_detection_folder, out_landmark_folder, out_file_boxes, + centers_all, sizes_all, detection_fnames_all, landmark_fnames_all, + out_landmarks_all, out_landmarks_original_all, out_bbox_type_all) + + if self.save_landmarks_one_file: + # saves all landmarks per video + out_file = out_landmark_folder / "landmarks.pkl" + FaceVideoDataModule.save_landmark_list(out_file, out_landmarks_all) + out_file = out_landmark_folder / "landmarks_original.pkl" + FaceVideoDataModule.save_landmark_list(out_file, out_landmarks_original_all) + print(f"Landmarks for sequence saved into one file: {out_file}") + out_file = out_landmark_folder / "landmark_types.pkl" + FaceVideoDataModule.save_landmark_list(out_file, out_bbox_type_all) + + + FaceVideoDataModule.save_detections(out_file_boxes, detection_fnames_all, landmark_fnames_all, centers_all, sizes_all, fid) print("Done detecting faces in sequence: '%s'" % self.video_list[sequence_id]) + return + + + # @profile + def _detect_landmarkes_in_aligned_sequence(self, sequence_id): + video_file = self._get_path_to_aligned_videos(sequence_id) + print("Detecting landmarks in 
aligned sequence: '%s'" % video_file) + + out_landmark_folder = self._get_path_to_sequence_landmarks(sequence_id, use_aligned_videos=True) + out_landmark_folder.mkdir(exist_ok=True, parents=True) + + if self.save_landmarks_one_file: + overwrite = False + if not overwrite and (out_landmark_folder / "landmarks.pkl").is_file() and (out_landmark_folder / "landmarks_original.pkl").is_file() and (out_landmark_folder / "landmark_types.pkl").is_file(): + print("Files with landmarks already found in '%s'. Skipping" % out_landmark_folder) + return + + # start_fid = 0 + + if self.unpack_videos: + raise NotImplementedError("Not implemented and should not be. Unpacking videos into a sequence of images is pricy.") + # frame_list = self.frame_lists[sequence_id] + # fid = 0 + # if len(frame_list) == 0: + # print("Nothing to detect in: '%s'. All frames have been processed" % self.video_list[sequence_id]) + # for fid, frame_fname in enumerate(tqdm(range(start_fid, len(frame_list)))): + + # # if fid % detector_instantion_frequency == 0: + # # self._instantiate_detector(overwrite=True) + + # self._detect_faces_in_image_wrapper(frame_list, fid, out_detection_folder, out_landmark_folder, out_file_boxes, + # centers_all, sizes_all, detection_fnames_all, landmark_fnames_all) + + else: + # if start_fid == 0: + # videogen = vreader(str(video_name)) + videogen = skvideo.io.FFmpegReader(str(video_file)) + # videogen = vread(str(video_name)) + # for i in range(start_fid): + # _discarded_frame = next(videogen) + # else: + # videogen = vread(str(video_name)) + self._detect_landmarks_no_face_detection(videogen, out_landmark_folder) + + + def _detect_landmarks_no_face_detection(self, detection_fnames_or_ims, out_landmark_folder, path_depth = 0): + """ + Just detects landmarks without face detection. The images should already be cropped to the face. 
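+        Only the landmark regressor is run on each (already cropped) image or video frame; no bounding boxes are
+        computed here. Results are written per frame and/or into a single file per sequence, depending on the
+        save_landmarks_frame_by_frame / save_landmarks_one_file flags.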
+ """ + import time + if self.save_landmarks_one_file: + overwrite = False + single_out_file = out_landmark_folder / "landmarks.pkl" + if single_out_file.is_file() and not overwrite: + print(f"Landmarks already found in {single_out_file}, skipping") + return + + device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + print(device) + # net, landmark_type, batch_size = self._get_segmentation_net(device) + + # if self.save_detection_images: + # ref_im = imread(detection_fnames_or_ims[0]) + # else: + # ref_im = detection_fnames_or_ims[0] + # ref_size = Resize((ref_im.shape[0], ref_im.shape[1]), interpolation=Image.NEAREST) + # ref_size = None + + optimal_landmark_detector_size = self.face_detector.optimal_landmark_detector_im_size() + + transforms = Compose([ + Resize((optimal_landmark_detector_size, optimal_landmark_detector_size)), + # Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) + # transforms=None + batch_size = 64 + + if isinstance(detection_fnames_or_ims, types.GeneratorType): + im_read = "skvreader" + elif isinstance(detection_fnames_or_ims, (skvideo.io.FFmpegReader)): + im_read = "skvffmpeg" + else: + im_read = 'pil' if not isinstance(detection_fnames_or_ims[0], np.ndarray) else None + + dataset = UnsupervisedImageDataset(detection_fnames_or_ims, image_transforms=transforms, + im_read=im_read) + num_workers = 4 if im_read not in ["skvreader", "skvffmpeg"] else 1 # videos can only be read on 1 thread frame by frame + loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) + + # import matplotlib.pyplot as plt + + if self.save_landmarks_one_file: + # out_landmark_names = [] + out_landmarks = [] + out_landmark_types = [] + out_landmarks_scores = [] + + for i, batch in enumerate(tqdm(loader)): + # facenet_pytorch expects this stanadrization for the input to the net + # images = fixed_image_standardization(batch['image'].to(device)) + images = batch['image'].cuda() + # start = time.time() + with torch.no_grad(): + landmarks, landmark_scores = self.face_detector.landmarks_from_batch_no_face_detection(images) + # end = time.time() + + # import matplotlib.pyplot as plt + # plt.imshow(images[0].cpu().numpy().transpose(1,2,0)) + # # plot the landmark points + # plt.scatter(landmarks[0, :, 0] * images.shape[3], landmarks[0, :, 1] * images.shape[2], s=10, marker='.', c='r') + # plt.show() + + if self.save_landmarks_frame_by_frame: + start = time.time() + for j in range(landmarks.shape[0]): + image_path = batch['path'][j] + # if isinstance(out_segmentation_folder, list): + if path_depth > 0: + rel_path = Path(image_path).parent.relative_to(Path(image_path).parents[path_depth]) + landmark_path = out_landmark_folder / rel_path / (Path(image_path).stem + ".pkl") + else: + landmark_path = out_landmark_folder / (Path(image_path).stem + ".pkl") + landmark_path.parent.mkdir(exist_ok=True, parents=True) + save_landmark_v2(landmark_path, landmarks[j], landmark_scores[j], self.face_detector.landmark_type()) + print(f" Saving batch {i} took: {end - start}") + end = time.time() + if self.save_landmarks_one_file: + out_landmarks += [landmarks] + out_landmarks_scores += [landmark_scores] + out_landmark_types += [self.face_detector.landmark_type()] * len(landmarks) + + if self.save_landmarks_one_file: + out_landmarks = np.concatenate(out_landmarks, axis=0) + out_landmarks_scores = np.concatenate(out_landmarks_scores, axis=0) + FaceVideoDataModule.save_landmark_list_v2(single_out_file, out_landmarks, landmark_scores, out_landmark_types) + 
print("Landmarks saved to %s" % single_out_file) + + + def _cut_out_detected_faces_in_sequence(self, sequence_id): + in_landmark_folder = self._get_path_to_sequence_landmarks(sequence_id) + in_file = in_landmark_folder / "landmarks_original.pkl" + FaceVideoDataModule.load_landmark_list(in_file) + + # Extract the number of people (use face recognition) + + # Take the most numerous person. (that is most likely the person in question) + # - very unlikely there will be more equally present ones in most face datasets + + + # Interpolate the bounding boxes of that person in order to have a BB for dropped detections + + # Extract the face from all frames to create a video + + # Resample the video to conform to the desired FPS if need be + + # Save the video + + + def _get_emotion_net(self, device): @@ -276,19 +600,35 @@ def _get_emotion_net(self, device): return net, "emo_net" - def _segment_faces_in_sequence(self, sequence_id): + def _segment_faces_in_sequence(self, sequence_id, use_aligned_videos=False): video_file = self.video_list[sequence_id] print("Segmenting faces in sequence: '%s'" % video_file) # suffix = Path(self._video_category(sequence_id)) / 'detections' /self._video_set(sequence_id) / video_file.stem - out_detection_folder = self._get_path_to_sequence_detections(sequence_id) - out_segmentation_folder = self._get_path_to_sequence_segmentations(sequence_id) - out_segmentation_folder.mkdir(exist_ok=True, parents=True) + if self.save_detection_images: + out_detection_folder = self._get_path_to_sequence_detections(sequence_id) + detections = sorted(list(out_detection_folder.glob("*.png"))) + elif use_aligned_videos: + # video_path = str( Path(self.output_dir) / "videos_aligned" / self.video_list[sequence_id]) + video_path = self._get_path_to_aligned_videos(sequence_id) + # detections = vreader( video_path) + detections = skvideo.io.FFmpegReader(str(video_path)) + # detections = detections.astype(np.float32) / 255. + else: + detections = vread( str(self.root_dir / self.video_list[sequence_id])) + detections = detections.astype(np.float32) / 255. 
+ - detection_fnames = sorted(list(out_detection_folder.glob("*.png"))) + out_segmentation_folder = self._get_path_to_sequence_segmentations(sequence_id, use_aligned_videos=use_aligned_videos) + out_segmentation_folder.mkdir(exist_ok=True, parents=True) - self._segment_images(detection_fnames, out_segmentation_folder) + # if self.save_landmarks_frame_by_frame: + # out_landmark_folder = self._get_path_to_sequence_landmarks(sequence_id) + # landmarks = sorted(list(out_landmark_folder.glob("*.pkl"))) + # else: + # landmarks = None + self._segment_images(detections, out_segmentation_folder) def _extract_emotion_from_faces_in_sequence(self, sequence_id): @@ -477,7 +817,7 @@ def _recognize_faces(self, device = None): for sid in range(self.num_sequences): self._recognize_faces_in_sequence(sid, recognition_net, device) - def _recognize_faces_in_sequence(self, sequence_id, recognition_net=None, device=None, num_workers = 4): + def _recognize_faces_in_sequence(self, sequence_id, recognition_net=None, device=None, num_workers = 4, overwrite = False): def fixed_image_standardization(image_tensor): processed_tensor = (image_tensor - 127.5) / 128.0 @@ -485,13 +825,27 @@ def fixed_image_standardization(image_tensor): print("Running face recognition in sequence '%s'" % self.video_list[sequence_id]) out_folder = self._get_path_to_sequence_detections(sequence_id) + out_file = out_folder / "embeddings.pkl" + if out_file.exists() and not overwrite: + print("Face embeddings already computed for sequence '%s'" % self.video_list[sequence_id]) + return device = device or torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') recognition_net = recognition_net or self._get_recognition_net(device) recognition_net.requires_grad_(False) # detections_fnames = sorted(self.detection_fnames[sequence_id]) detections_fnames = sorted(list(out_folder.glob("*.png"))) - dataset = UnsupervisedImageDataset(detections_fnames) - loader = DataLoader(dataset, batch_size=64, num_workers=num_workers, shuffle=False) + + if len(detections_fnames) == 0: + # there are no images, either there is a video file or something went wrong: + video_path = self.root_dir / self.video_list[sequence_id] + landmark_file = self._get_path_to_sequence_landmarks(sequence_id) + + from gdl.datasets.VideoFaceDetectionDataset import VideoFaceDetectionDataset + dataset = VideoFaceDetectionDataset(video_path, landmark_file, output_im_range=255) + else: + dataset = UnsupervisedImageDataset(detections_fnames) + # loader = DataLoader(dataset, batch_size=64, num_workers=num_workers, shuffle=False) + loader = DataLoader(dataset, batch_size=64, num_workers=1, shuffle=False) # loader = DataLoader(dataset, batch_size=2, num_workers=0, shuffle=False) all_embeddings = [] for i, batch in enumerate(tqdm(loader)): @@ -500,8 +854,12 @@ def fixed_image_standardization(image_tensor): embeddings = recognition_net(images) all_embeddings += [embeddings.detach().cpu().numpy()] - embedding_array = np.concatenate(all_embeddings, axis=0) - FaceVideoDataModule._save_face_embeddings(out_folder / "embeddings.pkl", embedding_array, detections_fnames) + if len(all_embeddings) > 0: + embedding_array = np.concatenate(all_embeddings, axis=0) + else: + embedding_array = np.array([]) + out_folder.mkdir(parents=True, exist_ok=True) + FaceVideoDataModule._save_face_embeddings(out_file, embedding_array, detections_fnames) print("Done running face recognition in sequence '%s'" % self.video_list[sequence_id]) @@ -553,6 +911,76 @@ def _load_face_embeddings(fname): # deca = 
EMOCA(config=deca_cfg, device=device) # return deca + def _get_emotion_recognition_net(self, device, rec_method='resnet50'): + if rec_method == 'resnet50': + if hasattr(self, '_emo_resnet') and self._emo_resnet is not None: + return self._emo_resnet.to(device) + from gdl.models.temporal.Preprocessors import EmotionRecognitionPreprocessor + from munch import Munch + cfg = Munch() + cfg.model_name = "ResNet50" + cfg.model_path = False + cfg.return_features = True + self._emo_resnet = EmotionRecognitionPreprocessor(cfg).to(device) + return self._emo_resnet + + raise ValueError(f"Unknown emotion recognition method: {rec_method}") + + + def _get_reconstruction_net_v2(self, device, rec_method="emoca"): + device = device or torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + if rec_method == "emoca": + if hasattr(self, '_emoca') and self._emoca is not None: + return self._emoca.to(device) + from gdl.models.temporal.Preprocessors import EmocaPreprocessor + from munch import Munch + cfg = Munch() + cfg.with_global_pose = True + cfg.return_global_pose = True + cfg.average_shape_decode = False + cfg.return_appearance = True + cfg.model_name = "EMOCA" + cfg.model_path = False + cfg.stage = "detail" + cfg.max_b = 16 + cfg.render = False + cfg.crash_on_invalid = False + # cfg.render = True + emoca = EmocaPreprocessor(cfg).to(device) + self._emoca = emoca + return emoca + elif rec_method == "spectre": + if hasattr(self, '_spectre') and self._spectre is not None: + return self._spectre.to(device) + from gdl.models.temporal.external.SpectrePreprocessor import SpectrePreprocessor + from munch import Munch + cfg = Munch() + cfg.return_vis = False + # cfg.render = True + cfg.render = False + cfg.with_global_pose = True + cfg.return_global_pose = True + cfg.slice_off_invalid = False + cfg.return_appearance = True + cfg.average_shape_decode = False + cfg.crash_on_invalid = False + + # paths + cfg.flame_model_path = "/ps/scratch/rdanecek/data/FLAME/geometry/generic_model.pkl" + cfg.flame_lmk_embedding_path = "/ps/scratch/rdanecek/data/FLAME/geometry/landmark_embedding.npy" + cfg.face_mask_path = "/ps/scratch/rdanecek/data/FLAME/mask/uv_face_mask.png" + cfg.face_eye_mask_path = "/ps/scratch/rdanecek/data/FLAME/mask/uv_face_eye_mask.png" + cfg.tex_type = "BFM" + cfg.tex_path = "/ps/scratch/rdanecek/data/FLAME/texture/FLAME_albedo_from_BFM.npz" + cfg.fixed_displacement_path = "/ps/scratch/rdanecek/data/FLAME/geometry/fixed_uv_displacements/fixed_displacement_256.npy" + cfg.pretrained_modelpath = "pretrained/spectre_model.tar" + + spectre = SpectrePreprocessor(cfg).to(device) + self._spectre = spectre + return spectre + raise ValueError("Unknown reconstruction method '%s'" % rec_method) + + def _get_reconstruction_net(self, device, rec_method='deca'): if rec_method == 'deca': add_pretrained_deca_to_path() @@ -567,8 +995,8 @@ def _get_reconstruction_net(self, device, rec_method='deca'): mode = "test" from gdl.models.IO import get_checkpoint_with_kwargs from omegaconf import OmegaConf - # model_path = "/is/cluster/work/rdanecek/emoca/finetune_deca/2021_11_13_03-43-40_4753326650554236352_ExpDECA_Affec_clone_NoRing_EmoC_F2_DeSeggt_BlackC_Aug_early" - model_path = Path(gdl.__file__).parents[1] / "assets" / "EMOCA" / "models" / "EMOCA" + model_path = "/is/cluster/work/rdanecek/emoca/finetune_deca/2021_11_13_03-43-40_4753326650554236352_ExpDECA_Affec_clone_NoRing_EmoC_F2_DeSeggt_BlackC_Aug_early" + # model_path = Path(gdl.__file__).parents[1] / "assets" / "EMOCA" / "models" / "EMOCA" cfg = 
OmegaConf.load(Path(model_path) / "cfg.yaml") stage = 'detail' cfg = cfg[stage] @@ -587,9 +1015,20 @@ def _get_reconstruction_net(self, device, rec_method='deca'): "inout_params": checkpoint_kwargs["config"]["inout"], "stage_name": "train", } + + from gdl_apps.EMOCA.utils.load import load_model + deca = instantiate_deca(cfg, mode, "", checkpoint, deca_checkpoint_kwargs ) deca.to(device) deca.deca.config.detail_constrain_type = 'none' + + # path_to_models = Path(gdl.__file__).parents[1] / "assets/EMOCA/models + # model_name = "EMOCA" + # mode = "detail" + # deca, conf = load_model(path_to_models, model_name, mode) + # deca.to(device) + # deca.eval() + return deca # return deca.deca elif rec_method == "deep3dface": @@ -731,6 +1170,8 @@ def _reconstruct_faces(self, device = None): for sid in range(self.num_sequences): self._reconstruct_faces_in_sequence(sid, reconstruction_net, device) + def get_single_video_dataset(self, i): + raise NotImplementedError("This method must be implemented by the deriving classes.") def _reconstruct_faces_in_sequence(self, sequence_id, reconstruction_net=None, device=None, save_obj=False, save_mat=True, save_vis=True, save_images=False, @@ -749,13 +1190,19 @@ def _reconstruct_faces_in_sequence(self, sequence_id, reconstruction_net=None, d codedict_retarget = None suffix = None - - def fixed_image_standardization(image): return image / 255. print("Running face reconstruction in sequence '%s'" % self.video_list[sequence_id]) - in_folder = self._get_path_to_sequence_detections(sequence_id) + if self.unpack_videos: + in_folder = self._get_path_to_sequence_detections(sequence_id) + else: + if self.detect_landmarks_on_restored_images is None: + in_folder = self.root_dir / self.video_list[sequence_id] + else: + in_folder = self._get_path_to_sequence_restored( + sequence_id, method=self.detect_landmarks_on_restored_images) + out_folder = self._get_path_to_sequence_reconstructions(sequence_id, rec_method=rec_method, suffix=suffix) if retarget_from is not None: @@ -776,8 +1223,13 @@ def fixed_image_standardization(image): video_writer = None - detections_fnames = sorted(list(in_folder.glob("*.png"))) - dataset = UnsupervisedImageDataset(detections_fnames) + if self.unpack_videos: + detections_fnames_or_images = sorted(list(in_folder.glob("*.png"))) + else: + from skvideo.io import vread + detections_fnames_or_images = vread(str(in_folder)) + + dataset = UnsupervisedImageDataset(detections_fnames_or_images) batch_size = 32 # batch_size = 64 # loader = DataLoader(dataset, batch_size=batch_size, num_workers=4, shuffle=False) @@ -787,14 +1239,22 @@ def fixed_image_standardization(image): with torch.no_grad(): images = fixed_image_standardization(batch['image'].to(device))#[None, ...] 
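The hunk below adds a guard that resizes each batch of face crops to the reconstruction network's expected input resolution before encoding. A minimal standalone sketch of that pattern (hypothetical helper; target_size stands in for whatever reconstruction_net.get_input_image_size() returns):

import torch
import torch.nn.functional as F

def resize_to_model_input(images: torch.Tensor, target_size) -> torch.Tensor:
    """images: (B, C, H, W) float tensor; target_size: int or (H_net, W_net)."""
    if isinstance(target_size, int):
        target_size = (target_size, target_size)
    if tuple(images.shape[2:4]) != tuple(target_size):
        # bicubic interpolation, matching the choice made in the patch for face crops
        images = F.interpolate(images, size=target_size, mode='bicubic', align_corners=False)
    return images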
batch_ = {} + if images.shape[2:4] != reconstruction_net.get_input_image_size(): + images = F.interpolate(images, size=reconstruction_net.get_input_image_size(), mode='bicubic', align_corners=False) batch_["image"] = images codedict = reconstruction_net.encode(batch_, training=False) + encoded_values = util.dict_tensor2npy(codedict) + if "images" in encoded_values.keys(): + del encoded_values["images"] + if "image" in encoded_values.keys(): + del encoded_values["image"] + # opdict, visdict = reconstruction_net.decode(codedict) if codedict_retarget is not None: codedict["shapecode"] = codedict_retarget["shapecode"].repeat(batch_["image"].shape[0], 1,) codedict["detailcode"] = codedict_retarget["detailcode"].repeat(batch_["image"].shape[0], 1,) codedict = reconstruction_net.decode(codedict, training=False) - + uv_detail_normals = None if 'uv_detail_normals' in codedict.keys(): uv_detail_normals = codedict['uv_detail_normals'] @@ -805,7 +1265,7 @@ def fixed_image_standardization(image): uv_detail_normals, codedict, 0, "train", "") else: visdict = reconstruction_net._visualization_checkpoint(batch_["image"].shape[0], batch_, codedict, i, "", "") - values = util.dict_tensor2npy(codedict) + # values = util.dict_tensor2npy(codedict) #TODO: verify axis # vis_im = np.split(vis_im, axis=0 ,indices_or_sections=batch_size) for j in range(images.shape[0]): @@ -816,12 +1276,12 @@ def fixed_image_standardization(image): # if i*j == 0: mesh_folder = out_folder / 'meshes' mesh_folder.mkdir(exist_ok=True, parents=True) - reconstruction_net.deca.save_obj(str(mesh_folder / (name + '.obj')), values) + reconstruction_net.deca.save_obj(str(mesh_folder / (name + '.obj')), encoded_values) if save_mat: # if i*j == 0: mat_folder = out_folder / 'mat' mat_folder.mkdir(exist_ok=True, parents=True) - savemat(str(mat_folder / (name + '.mat')), values) + savemat(str(mat_folder / (name + '.mat')), encoded_values) if save_vis or save_video: # if i*j == 0: vis_folder = out_folder / 'vis' @@ -846,7 +1306,8 @@ def fixed_image_standardization(image): # if i*j == 0: ims_folder = out_folder / 'ims' ims_folder.mkdir(exist_ok=True, parents=True) - for vis_name in ['inputs', 'rendered_images', 'albedo_images', 'shape_images', 'shape_detail_images']: + for vis_name in ['inputs', 'rendered_images', 'albedo_images', 'shape_images', 'shape_detail_images', + "geometry_detail", "geometry_coarse", "output_images_detail"]: if vis_name not in visdict.keys(): continue image = util.tensor2image(visdict[vis_name][j]) @@ -856,7 +1317,167 @@ def fixed_image_standardization(image): video_writer.release() print("Done running face reconstruction in sequence '%s'" % self.video_list[sequence_id]) - def _gather_data(self, exist_ok=False): + def _reconstruct_faces_in_sequence_v2(self, sequence_id, reconstruction_net=None, device=None, + save_obj=False, save_mat=True, save_vis=True, save_images=False, + save_video=True, rec_methods='emoca', retarget_from=None, retarget_suffix=None): + if retarget_from is not None: + raise NotImplementedError("Retargeting is not implemented yet for _reconstruct_faces_in_sequence_v2") + import datetime + t = datetime.datetime.now() + t_str = t.strftime("%Y_%m_%d_%H-%M-%S") + suffix = f"_retarget_{t_str}_{str(hash(t_str))}" if retarget_suffix is None else retarget_suffix + else: + codedict_retarget = None + suffix = None + + if not isinstance(rec_methods, list): + rec_methods = [rec_methods] + + print("Running face reconstruction in sequence '%s'" % self.video_list[sequence_id]) + + out_folder = {} + out_file_shape = {} + 
out_file_appearance = {} + for rec_method in rec_methods: + out_folder[rec_method] = self._get_path_to_sequence_reconstructions(sequence_id, rec_method=rec_method, suffix=suffix) + out_file_shape[rec_method] = out_folder[rec_method] / f"shape_pose_cam.pkl" + out_file_appearance[rec_method] = out_folder[rec_method] / f"appearance.pkl" + out_folder[rec_method].mkdir(exist_ok=True, parents=True) + + if retarget_from is not None: + raise NotImplementedError("Retargeting is not implemented yet for _reconstruct_faces_in_sequence_v2") + out_folder.mkdir(exist_ok=True, parents=True) + + + exists = True + for rec_method in rec_methods: + if not out_file_shape[rec_method].is_file(): + exists = False + break + if not out_file_appearance[rec_method].is_file(): + exists = False + break + if exists: + return + + device = device or torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + # reconstruction_net = reconstruction_net or self._get_reconstruction_net_v2(device, rec_method=rec_method) + + # if retarget_from is not None: + # image = imread(retarget_from) + # batch_r = {} + # batch_r["image"] = torch.from_numpy(image).float().unsqueeze(0).to(device) + # # to torch channel format + # batch_r["image"] = batch_r["image"].permute(0, 3, 1, 2) + # batch_r["image"] = fixed_image_standardization(batch_r["image"]) + # with torch.no_grad(): + # codedict_retarget = reconstruction_net.encode(batch_r, training=False) + + + # video_writer = None + # if self.unpack_videos: + # detections_fnames_or_images = sorted(list(in_folder.glob("*.png"))) + # else: + # from skvideo.io import vread + # detections_fnames_or_images = vread(str(in_folder)) + + dataset = self.get_single_video_dataset(sequence_id) + batch_size = 32 + # batch_size = 64 + loader = DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=False) + + for i, batch in enumerate(tqdm(loader)): + with torch.no_grad(): + batch = dict_to_device(batch, device) + for rec_method in rec_methods: + if out_file_shape[rec_method].is_file() and out_file_appearance[rec_method].is_file(): + continue + batch_ = batch.copy() + reconstruction_net = self._get_reconstruction_net_v2(device, rec_method=rec_method) + result = reconstruction_net(batch_, input_key='video', output_prefix="") + assert batch['video'].shape[0] == 1 + T = batch['video'].shape[1] + result_keys_to_keep = ['shape', 'exp', 'jaw', 'global_pose', 'cam'] + shape_pose = {k: result[k].cpu().numpy() for k in result_keys_to_keep} + assert shape_pose['shape'].shape[1] == T, f"{shape_pose['shape'].shape[1]} != {T}" + result_keys_to_keep = ['tex', 'light', 'detail'] + appearance = {k: result[k].cpu().numpy() for k in result_keys_to_keep if k in result.keys()} + + # with open(out_file_shape, "wb") as f: + # hkl.dump(shape_pose, f) + # with open(out_file_appearance, "wb") as f: + # hkl.dump(appearance, f) + # hkl.dump(shape_pose, out_file_shape[rec_method]) + # hkl.dump(appearance, out_file_appearance[rec_method]) + + save_reconstruction_list(out_file_shape[rec_method], shape_pose) + save_reconstruction_list(out_file_appearance[rec_method], appearance) + + print("Done running face reconstruction in sequence '%s'" % self.video_list[sequence_id]) + + + + def _extract_emotion_in_sequence(self, sequence_id, emotion_net=None, device=None, + emo_methods='resnet50',): + if not isinstance(emo_methods, list): + emo_methods = [emo_methods] + + print("Running face emotion recognition in sequence '%s'" % self.video_list[sequence_id]) + + out_folder = {} + out_file_emotion = {} + out_file_features = {} + for 
emo_method in emo_methods: + out_folder[emo_method] = self._get_path_to_sequence_emotions(sequence_id, emo_method=emo_method) + out_file_emotion[emo_method] = out_folder[emo_method] / f"emotions.pkl" + out_file_features[emo_method] = out_folder[emo_method] / f"features.pkl" + out_folder[emo_method].mkdir(exist_ok=True, parents=True) + + exists = True + for emo_method in emo_methods: + if not out_file_emotion[emo_method].is_file(): + exists = False + break + if not out_file_features[emo_method].is_file(): + exists = False + break + if exists: + return + + device = device or torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + # emotion_net = emotion_net or self._get_emotion_recognition_net(device, rec_method=emo_method) + + dataset = self.get_single_video_dataset(sequence_id) + batch_size = 32 + # batch_size = 64 + loader = DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=False) + + for i, batch in enumerate(tqdm(loader)): + with torch.no_grad(): + batch = dict_to_device(batch, device) + for emo_method in emo_methods: + if out_file_emotion[emo_method].is_file() and out_file_features[emo_method].is_file(): + continue + emotion_net = self._get_emotion_recognition_net(device, rec_method=emo_method) + result = emotion_net(batch, input_key='video', output_prefix="") + assert batch['video'].shape[0] == 1 + T = batch['video'].shape[1] + result_keys_to_keep = ['expression', 'valence', 'arousal',] + assert result['expression'].shape[1] == T, f"{result['expression'].shape[1]} != {T}" + emotion_labels = {k: result[k].cpu().numpy() for k in result_keys_to_keep} + result_keys_to_keep = ['feature',] + emotion_features = {k: result[k].cpu().numpy() for k in result_keys_to_keep} + + # hkl.dump(emotion_labels, out_file_emotion[emo_method]) + # hkl.dump(emotion_features, out_file_features[emo_method]) + + save_emotion_list(out_file_emotion[emo_method], emotion_labels) + save_emotion_list(out_file_features[emo_method], emotion_features) + + print("Done running face reconstruction in sequence '%s'" % self.video_list[sequence_id]) + + # def _gather_data(self, exist_ok=False): + def _gather_data(self, exist_ok=True): print("Processing dataset") Path(self.output_dir).mkdir(parents=True, exist_ok=exist_ok) @@ -872,24 +1493,88 @@ def _gather_data(self, exist_ok=False): def _gather_video_metadata(self): import ffmpeg self.video_metas = [] + self.audio_metas = [] + + invalid_videos = [] + for vi, vid_file in enumerate(tqdm(self.video_list)): - vid = ffmpeg.probe(str( Path(self.root_dir) / vid_file)) + video_path = str( Path(self.root_dir) / vid_file) + try: + vid = ffmpeg.probe(video_path) + except ffmpeg._run.Error as e: + print(f"The video file '{video_path}' is corrupted. Skipping it." ) + self.video_metas += [None] + self.audio_metas += [None] + invalid_videos += [vi] + continue # codec_idx = [idx for idx in range(len(vid)) if vid['streams'][idx]['codec_type'] == 'video'] - codec_idx = [idx for idx in range(len(vid)) if vid['streams'][0]['codec_type'] == 'video'] + codec_idx = [idx for idx in range(len(vid['streams'])) if vid['streams'][idx]['codec_type'] == 'video'] if len(codec_idx) == 0: raise RuntimeError("Video file has no video streams! '%s'" % str(vid_file)) - # if len(codec_idx) > 1: + if len(codec_idx) > 1: # raise RuntimeError("Video file has two video streams! '%s'" % str(vid_file)) - print("[WARNING] Video file has %d video streams. Only the first one will be processed" % len(codec_idx)) + print("[WARNING] Video file has %d video streams. 
Only the first one will be processed" % len(codec_idx)) codec_idx = codec_idx[0] vid_info = vid['streams'][codec_idx] + assert vid_info['codec_type'] == 'video' vid_meta = {} vid_meta['fps'] = vid_info['avg_frame_rate'] vid_meta['width'] = int(vid_info['width']) vid_meta['height'] = int(vid_info['height']) - vid_meta['num_frames'] = int(vid_info['nb_frames']) + if 'nb_frames' in vid_info.keys(): + vid_meta['num_frames'] = int(vid_info['nb_frames']) + elif 'num_frames' in vid_info.keys(): + vid_meta['num_frames'] = int(vid_info['num_frames']) + else: + vid_meta['num_frames'] = 0 + # make the frame number reading a bit more robest, sometims the above does not work and gives zeros + if vid_meta['num_frames'] == 0: + vid_meta['num_frames'] = int(subprocess.check_output(["ffprobe", "-v", "error", "-select_streams", "v:0", "-count_packets", "-show_entries", "stream=nb_read_packets", "-of", "csv=p=0", + video_path])) + if vid_meta['num_frames'] == 0: + _vr = skvideo.io.FFmpegReader(video_path) + vid_meta['num_frames'] = _vr.getShape()[0] + del _vr + vid_meta['bit_rate'] = vid_info['bit_rate'] + if 'bits_per_raw_sample' in vid_info.keys(): + vid_meta['bits_per_raw_sample'] = vid_info['bits_per_raw_sample'] self.video_metas += [vid_meta] + # audio codec + codec_idx = [idx for idx in range(len(vid['streams'])) if vid['streams'][idx]['codec_type'] == 'audio'] + if len(codec_idx) > 1: + raise RuntimeError("Video file has two audio streams! '%s'" % str(vid_file)) + if len(codec_idx) == 0: + if self._must_include_audio is True or self._must_include_audio == 'strict': + raise RuntimeError("Video file has no audio streams! '%s'" % str(vid_file)) + elif self._must_include_audio == 'warn': + print("[WARNING] Video file has no audio streams! '%s'" % str(vid_file)) + self.audio_metas += [None] + else: + codec_idx = codec_idx[0] + aud_info = vid['streams'][codec_idx] + assert aud_info['codec_type'] == 'audio' + aud_meta = {} + aud_meta['sample_rate'] = aud_info['sample_rate'] + aud_meta['sample_fmt'] = aud_info['sample_fmt'] + # aud_meta['num_samples'] = int(aud_info['nb_samples']) + aud_meta["num_frames"] = int(aud_info['nb_frames']) + assert float(aud_info['start_time']) == 0 + self.audio_metas += [aud_meta] + + for vi in sorted(invalid_videos, reverse=True): + del self.video_list[vi] + del self.video_metas[vi] + if self.annotation_list is not None: + del self.annotation_list[vi] + + if hasattr(self, "audio_metas") and self.audio_metas is not None: + del self.audio_metas[vi] + + if self.frame_lists is not None: + del self.frame_lists[vi] + + def _loadMeta(self): if self.loaded: print("FaceVideoDataset already loaded.") @@ -900,10 +1585,11 @@ def _loadMeta(self): self.video_list = pkl.load(f) self.video_metas = pkl.load(f) self.annotation_list = pkl.load(f) - # try: self.frame_lists = pkl.load(f) - # except Exception: - # pass + try: + self.audio_metas = pkl.load(f) + except Exception: + pass self.loaded = True def _saveMeta(self): @@ -913,6 +1599,8 @@ def _saveMeta(self): pkl.dump(self.video_metas, f) pkl.dump(self.annotation_list,f) pkl.dump(self.frame_lists, f) + if hasattr(self, "audio_metas"): + pkl.dump(self.audio_metas, f) def setup(self, stage: Optional[str] = None): @@ -1011,7 +1699,7 @@ def _get_frames_for_sequence(self, sid): def _get_annotations_for_sequence(self, sid): video_file = self.video_list[sid] - suffix = Path(self._video_category(sid)) / 'annotations' /self._video_set(sequence_id) + suffix = Path(self._video_category(sid)) / 'annotations' /self._video_set(sid) annotation_prefix = 
Path(self.root_dir / suffix) annotation = sorted(annotation_prefix.glob(video_file.stem + "*.txt")) return annotation @@ -1028,14 +1716,18 @@ def _get_recognition_for_sequence(self, sid, distance_threshold=None): return indices, labels, mean, cov, fnames def create_reconstruction_video(self, sequence_id, overwrite=False, distance_threshold=0.5, - rec_method='emoca', image_type=None, retarget_suffix=None, cat_dim=0, include_transparent=True): + rec_method='emoca', image_type=None, retarget_suffix=None, cat_dim=0, include_transparent=True, + include_original=True, include_rec=True, black_background=False, use_mask=True, + out_folder=None): + print("Include original: " + str(include_original)) + print("========================") from PIL import Image, ImageDraw # fid = 0 - image_type = image_type or "detail" + image_type = image_type or "geometry_detail" detection_fnames, centers, sizes, last_frame_id = self._get_detection_for_sequence(sequence_id) vis_fnames = self._get_reconstructions_for_sequence(sequence_id, rec_method=rec_method, - retarget_suffix=retarget_suffix, image_type=image_type) - + retarget_suffix=retarget_suffix, image_type=image_type, out_folder=out_folder) + vid_frames = self._get_frames_for_sequence(sequence_id) vis_fnames.sort() @@ -1061,14 +1753,18 @@ def create_reconstruction_video(self, sequence_id, overwrite=False, distance_thr break frame_name = vid_frames[fid] - c = centers[fid] - s = sizes[fid] - frame = imread(frame_name) + if len(centers) > 0 and len(sizes) > 0: + c = centers[fid] + s = sizes[fid] + else: + c = [[frame.shape[0] / 2, frame.shape[0] / 2]] + s = frame.shape[0] + frame_pill_bb = Image.fromarray(frame) - if retarget_suffix is None: + if retarget_suffix is not None or black_background is False: frame_deca_full = Image.fromarray(frame) frame_deca_trans = Image.fromarray(frame) else: @@ -1086,7 +1782,7 @@ def create_reconstruction_video(self, sequence_id, overwrite=False, distance_thr vis_name = vis_fnames[did] - if detection_name.stem not in str(vis_name): + if detection_name.stem not in str(vis_name) : print("%s != %s" % (detection_name.stem, vis_name.stem)) raise RuntimeError("Detection and visualization filenames should match but they don't.") @@ -1104,16 +1800,22 @@ def create_reconstruction_video(self, sequence_id, overwrite=False, distance_thr im_c = vis_im.shape[1] num_ims = im_c // im_r - if image_type == "coarse": - vis_im = vis_im[:, im_r*3:im_r*4, ...] # coarse - elif image_type == "detail": - vis_im = vis_im[:, im_r*4:im_r*5, ...] # detail + # if image_type == "coarse": + # vis_im = vis_im[:, im_r*3:im_r*4, ...] # coarse + # elif image_type == "detail": + # vis_im = vis_im[:, im_r*4:im_r*5, ...] # detail # vis_im = vis_im[:, :, ...] 
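The masking step just below (both the removed inline variant and the new geometry_coarse.png-based one) treats near-black pixels as background by thresholding the per-pixel channel product. A small self-contained illustration of that trick (the int cast is added here only to avoid uint8 overflow in the product):

import numpy as np

def foreground_mask_from_rendering(rendered: np.ndarray, threshold: int = 30) -> np.ndarray:
    # rendered: (H, W, 3) uint8 rendering on a black background; returns an (H, W) uint8 mask in {0, 255}
    return (np.prod(rendered.astype(np.int64), axis=2) > threshold).astype(np.uint8) * 255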
# vis_mask = np.prod(vis_im, axis=2) == 0 - vis_mask = (np.prod(vis_im, axis=2) > 30).astype(np.uint8) * 255 + mask_name = vis_name.parent / "geometry_coarse.png" + mask_im = imread(mask_name) + + # a hacky way to get the mask + vis_mask = (np.prod(mask_im, axis=2) > 30).astype(np.uint8) * 255 + if not use_mask: + vis_mask = np.ones_like(vis_mask) * 255 # vis_im = np.concatenate([vis_im, vis_mask[..., np.newaxis]], axis=2) @@ -1155,10 +1857,22 @@ def create_reconstruction_video(self, sequence_id, overwrite=False, distance_thr else: cat_dim = 0 - if include_transparent: - im = np.concatenate([final_im, final_im2, final_im3], axis=cat_dim) - else: - im = np.concatenate([final_im, final_im2,], axis=cat_dim) + im_list = [] + if include_original: + im_list += [final_im] + + if include_rec: + im_list += [final_im2] + + if include_transparent: + im_list += [final_im3] + + im = np.concatenate(im_list, axis=cat_dim) + + # if include_transparent: + # im = np.concatenate([final_im, final_im2, final_im3], axis=cat_dim) + # else: + # im = np.concatenate([final_im, final_im2,], axis=cat_dim) if writer is None: fourcc = cv2.VideoWriter_fourcc(*'mp4v') @@ -1173,7 +1887,8 @@ def create_reconstruction_video(self, sequence_id, overwrite=False, distance_thr im_cv = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) writer.write(im_cv) writer.release() - attach_audio_to_reconstruction_video(outfile, self.root_dir / self.video_list[sequence_id]) + outfile_with_sound = attach_audio_to_reconstruction_video(outfile, self.root_dir / self.video_list[sequence_id]) + return outfile, outfile_with_sound # plt.figure() # plt.imshow(im) # plt.show() @@ -1511,8 +2226,10 @@ def _gather_detections_for_sequence(self, sequence_id, with_recognitions=True): else: print("Faces for video %d not detected" % sequence_id) detection_fnames = [] + landmark_fnames = [] centers = [] sizes = [] + if with_recognitions: return detection_fnames, landmark_fnames, centers, sizes, embeddings, recognized_detections_fnames return detection_fnames, landmark_fnames, centers, sizes @@ -1554,17 +2271,22 @@ def _get_recognition_filename(self, sequence_id, distance_threshold=None): def _identify_recognitions_for_sequence(self, sequence_id, distance_threshold = None): if distance_threshold is None: distance_threshold = self.get_default_recognition_threshold() + out_file = self._get_recognition_filename(sequence_id, distance_threshold) + if out_file.is_file(): + print("Recognitions for video %d already processed. 
Skipping" % sequence_id) + return + print("Identifying recognitions for sequence %d: '%s'" % (sequence_id, self.video_list[sequence_id])) detection_fnames, landmark_fnames, centers, sizes, embeddings, recognized_detections_fnames = \ self._gather_detections_for_sequence(sequence_id, with_recognitions=True) out_folder = self._get_path_to_sequence_detections(sequence_id) - # if distance_threshold != 0.5: - # out_file = out_folder / ("recognition_dist_%.03f.pkl" % distance_threshold) - # else: - # out_file = out_folder / "recognition.pkl" - out_file = self._get_recognition_filename(sequence_id, distance_threshold) + + + if embeddings is None or embeddings.size == 0: + print("No embeddings found for sequence %d" % sequence_id) + return from collections import Counter, OrderedDict from sklearn.cluster import DBSCAN @@ -1588,22 +2310,302 @@ def _identify_recognitions_for_sequence(self, sequence_id, distance_threshold = features = embeddings[indices] mean = np.mean(features, axis=0, keepdims=True) cov = np.cov(features, rowvar=False) - try: - recognized_filenames_label = sorted([recognized_detections_fnames[i].relative_to( - self.output_dir) for i in indices.tolist()]) - except ValueError: - recognized_filenames_label = sorted([recognized_detections_fnames[i].relative_to( - recognized_detections_fnames[i].parents[4]) for i in indices.tolist()]) + if len(recognized_detections_fnames): + try: + pass + recognized_filenames_label = sorted([recognized_detections_fnames[i].relative_to( + self.output_dir) for i in indices.tolist()]) + except ValueError: + recognized_filenames_label = sorted([recognized_detections_fnames[i].relative_to( + recognized_detections_fnames[i].parents[4]) for i in indices.tolist()]) + recognition_indices[label] = indices recognition_means[label] = mean recognition_cov[label] = cov - recognition_fnames[label] = recognized_filenames_label + if len(recognized_detections_fnames): + recognition_fnames[label] = recognized_filenames_label + else: + recognition_fnames[label] = None + FaceVideoDataModule._save_recognitions(out_file, labels, recognition_indices, recognition_means, recognition_cov, recognition_fnames) print("Done identifying recognitions for sequence %d: '%s'" % (sequence_id, self.video_list[sequence_id])) + def _extract_personal_recognition_sequences(self, sequence_id, distance_threshold = None): + detection_fnames, landmark_fnames, centers, sizes, embeddings, recognized_detections_fnames = \ + self._gather_detections_for_sequence(sequence_id, with_recognitions=True) + + output_video_file = self._get_path_to_sequence_files(sequence_id, "videos_aligned").with_suffix(".mp4") + + if output_video_file.is_file(): + print("Aligned personal video for sequence %d already extracted" % sequence_id) + return + + desired_processed_video_size = self.processed_video_size + + # 1) first handle the case with no successful detections + if embeddings is None or embeddings.size == 0: + self._save_unsuccessfully_aligned_video(sequence_id, output_video_file) + return + + # 2) handle the case with successful detections + landmark_file = self._get_path_to_sequence_landmarks(sequence_id) / "landmarks_original.pkl" + landmarks = FaceVideoDataModule.load_landmark_list(landmark_file) + + num_frames = len(landmarks) + + distance_threshold = self.get_default_recognition_threshold() if distance_threshold is None else distance_threshold + + indices, labels, mean, cov, fnames = self._get_recognition_for_sequence(sequence_id, distance_threshold) + + # 1) extract the most numerous recognition + 
exclusive_indices = OrderedDict({key: np.unique(value) for key, value in indices.items()}) + exclusive_sizes = OrderedDict({key: value.size for key, value in exclusive_indices.items()}) + + max_size = max(exclusive_sizes.values()) + max_size = -1 + max_index = -1 + same_occurence_count = [] + for k,v in exclusive_sizes.items(): + if exclusive_sizes[k] > max_size: + max_size = exclusive_sizes[k] + max_index = k + same_occurence_count.clear() + same_occurence_count.append(k) + elif exclusive_sizes[k] == max_size: + same_occurence_count.append(k) + #TODO: handle this case - how to break the ambiguity? + + if len(same_occurence_count) > 1: + print(f"Warning: ambiguous recognition for sequence {sequence_id}. There are {len(same_occurence_count)} of faces" + "that have dominant detections across the video. Choosing the first one") + + main_occurence_mean = mean[max_index] + main_occurence_cov = cov[max_index] + + # 2) retrieve its detections/landmarks + total_len = 0 + frame_map = OrderedDict() # detection index to frame map + index_for_frame_map = OrderedDict() # detection index to frame map + inv_frame_map = {} # frame to detection index map + frame_indices = OrderedDict() + for i in range(len(landmarks)): + # for j in range(len(landmarks[i])): + # frame_map[total_len + j] = i + # index_for_frame_map[total_len + j] = j + # inv_frame_map[i] = (i, j) + frame_indices[i] = (total_len, total_len + len(landmarks[i])) + total_len += len(landmarks[i]) + + + # main_occurence_sizes = OrderedDict() + # main_occurence_centers = OrderedDict() + + used_frames = [] + per_frame_landmark_indices = np.zeros((num_frames,), dtype=np.int32) + main_occurence_centers = [] + main_occurence_sizes = [] + used_landmarks = [] + + + for frame_num in frame_indices.keys(): + first_index, last_index = frame_indices[frame_num] + if first_index == last_index: + continue + frame_recognitions = embeddings[first_index:last_index, ...] 
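The loop below applies a simple assignment rule: within each frame, keep the detection whose embedding lies closest (in L2 distance) to the dominant identity's mean embedding, and only accept it if that distance is under distance_threshold. A self-contained sketch of the rule, with hypothetical names:

import numpy as np

def pick_main_person(frame_embeddings: np.ndarray, identity_mean: np.ndarray, distance_threshold: float):
    """frame_embeddings: (N, D) embeddings of the detections in one frame; identity_mean: (D,) or (1, D)."""
    distances = np.linalg.norm(frame_embeddings - identity_mean.reshape(1, -1), axis=1)
    best = int(np.argmin(distances))
    if distances[best] < distance_threshold:
        return best, float(distances[best])
    return None, float(distances[best])  # no detection of the main person in this frame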
+ + # 3) compute the distance between the main recognition and the detections + distances = np.linalg.norm(frame_recognitions - main_occurence_mean, axis=1) + # find the closest detection to the main recognition + closest_detection = np.argmin(distances) + closest_detection_index = first_index + closest_detection + + per_frame_landmark_indices[frame_num] = closest_detection + + if distances.min() < distance_threshold: + main_occurence_sizes += [sizes[frame_num][closest_detection]] + main_occurence_centers += [centers[frame_num][closest_detection]] + used_frames += [frame_num] + used_landmarks += [landmarks[frame_num][closest_detection]] + # main_occurence_sizes[frame_num] = sizes[closest_detection_index] + # main_occurence_centers[frame_num] = centers[closest_detection_index] + + # 3) compute bounding box for frames without detection (via fitting/interpolating the curve) + from scipy.interpolate import griddata, RBFInterpolator + import scipy + + if len(used_frames) < 2: + self._save_unsuccessfully_aligned_video(sequence_id, output_video_file) + return + + used_frames = np.array(used_frames, dtype=np.int32)[:,np.newaxis] + main_occurence_centers = np.stack(main_occurence_centers, axis=0) + main_occurence_sizes = np.stack(main_occurence_sizes, axis=0) + + used_frames_bin = np.zeros((num_frames,), dtype=np.int32) + used_frames_bin[used_frames] = 1 + + # only iterpolates + # interpolated_centers = griddata(used_frames, main_occurence_centers, np.arange(len(landmarks)), method='linear') + # interpolated_sizes = griddata(used_frames, main_occurence_sizes, np.arange(len(landmarks)), method='linear') + + # can extrapolate + if len(used_landmarks) >= 2: + interpolated_centers = RBFInterpolator(used_frames, main_occurence_centers)(np.arange(len(landmarks))[:, np.newaxis]) + interpolated_sizes = RBFInterpolator(used_frames, main_occurence_sizes)(np.arange(len(landmarks))[:, np.newaxis]) + interpolated_landmarks = RBFInterpolator(used_frames, used_landmarks)(np.arange(len(landmarks))[:, np.newaxis]) + else: + self._save_unsuccessfully_aligned_video(sequence_id, output_video_file) + return + + # convolve with a gaussian kernel to smooth the curve + smoothed_centers = np.zeros(interpolated_centers.shape) + + smoothed_sizes = scipy.ndimage.filters.gaussian_filter1d(interpolated_sizes, sigma=3) + for i in range(interpolated_centers.shape[1]): + smoothed_centers[:, i] = scipy.ndimage.filters.gaussian_filter1d(interpolated_centers[:, i], sigma=3) + + # do we need to smooth landmarks? 
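For reference, the densification strategy used above (RBFInterpolator fitted on the accepted frames, so frames before the first and after the last good detection are extrapolated rather than dropped, followed by a temporal Gaussian filter) condenses to a few lines; used_frames is (K, 1) and values is (K, D), as in the code above:

import numpy as np
from scipy.interpolate import RBFInterpolator
from scipy.ndimage import gaussian_filter1d

def densify_and_smooth(used_frames: np.ndarray, values: np.ndarray, num_frames: int, sigma: float = 3.0) -> np.ndarray:
    query = np.arange(num_frames)[:, np.newaxis]
    dense = RBFInterpolator(used_frames, values)(query)   # interpolates and extrapolates to every frame
    return gaussian_filter1d(dense, sigma=sigma, axis=0)  # smooth each coordinate over time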
+
+        # # plot the centers over time
+        # from matplotlib import pyplot as plt
+        # plt.figure()
+        # plt.plot(np.arange(len(landmarks)), interpolated_centers[:, 0], 'r-', label="center x")
+        # # plt.plot(np.arange(len(landmarks)), interpolated_centers[:, 1], 'b-', label="center y")
+        # plt.plot(np.arange(len(landmarks)), smoothed_centers[:, 0], 'r.', label="smoothed center x")
+        # # plt.plot(np.arange(len(landmarks)), smoothed_centers[:, 1], 'b.', label="smoothed center y")
+        # plt.legend()
+        # plt.show()
+
+        # 4) generate a new video
+        # video = skvideo.io.vread(str(self.root_dir / self.video_list[sequence_id]))
+        video = skvideo.io.vreader(str(self.root_dir / self.video_list[sequence_id]))
+
+        from gdl.datasets.FaceAlignmentTools import align_video, align_and_save_video
+
+        # # aligned_video, aligned_landmarks = align_video(video, interpolated_centers, interpolated_sizes, interpolated_landmarks,
+        # #     target_size_height=desired_processed_video_size, target_size_width=desired_processed_video_size)
+        # # smoothed_video, aligned_smoothed_landmarks = align_video(video, smoothed_centers, smoothed_sizes, smoothed_landmarks,
+        # #     target_size_height=desired_processed_video_size, target_size_width=desired_processed_video_size)
+        # smoothed_video, aligned_smoothed_landmarks = align_video(video, smoothed_centers, smoothed_sizes, interpolated_landmarks,
+        #     target_size_height=desired_processed_video_size, target_size_width=desired_processed_video_size)
+
+        output_dict = {
+            '-c:v': 'h264',
+            # '-q:v': '1',
+            '-r': self.video_metas[sequence_id]['fps'],
+            '-b': self.video_metas[sequence_id].get('bit_rate', '300000000'),
+        }
+        aligned_smoothed_landmarks = align_and_save_video(video, output_video_file, smoothed_centers, smoothed_sizes, interpolated_landmarks,
+            target_size_height=desired_processed_video_size, target_size_width=desired_processed_video_size, output_dict=output_dict)
+
+        # 5) save the aligned landmarks and the bookkeeping indices
+        transformed_landmarks_path = self._get_path_to_sequence_landmarks(sequence_id) / "landmarks_aligned_video.pkl"
+        smoothed_transformed_landmarks_path = self._get_path_to_sequence_landmarks(sequence_id) / "landmarks_aligned_video_smoothed.pkl"
+        used_indices_landmarks_path = self._get_path_to_sequence_landmarks(sequence_id) / "landmarks_alignment_used_frame_indices.pkl"
+        used_detection_indices_path = self._get_path_to_sequence_landmarks(sequence_id) / "landmarks_alignment_per_frame_detection_indices.pkl"
+
+        # FaceVideoDataModule.save_landmark_list(transformed_landmarks_path, aligned_landmarks)
+        FaceVideoDataModule.save_landmark_list(smoothed_transformed_landmarks_path, aligned_smoothed_landmarks)
+        FaceVideoDataModule.save_landmark_list(used_indices_landmarks_path, used_frames)
+        FaceVideoDataModule.save_landmark_list(used_detection_indices_path, per_frame_landmark_indices)
+
+        # # aligned_video = (aligned_video * 255).astype(np.uint8)
+        # smoothed_video = (smoothed_video * 255).astype(np.uint8)
+
+        # output_video_file = self._get_path_to_sequence_files(sequence_id, "videos_aligned").with_suffix(".mp4")
+        # video_file_smooth = self._get_path_to_sequence_files(sequence_id, "videos_aligned").parent / (output_video_file.stem + "_smooth.mp4")
+        # output_dict = {
+        #     '-c:v': 'h264',
+        #     # '-q:v': '1',
+        #     '-r': self.video_metas[sequence_id]['fps'],
+        #     '-b': self.video_metas[sequence_id].get('bit_rate', '300000000'),
+        # }
+        # writer = skvideo.io.FFmpegWriter(str(output_video_file), outputdict=output_dict)
+        # for i in range(aligned_video.shape[0]):
+        #     writer.writeFrame(aligned_video[i])
+        # writer.close()
+
+        # writer = skvideo.io.FFmpegWriter(str(video_file_smooth), outputdict=output_dict)
+        # for i in range(smoothed_video.shape[0]):
+        #     writer.writeFrame(smoothed_video[i])
+        # writer.close()
+
+        # writer = skvideo.io.FFmpegWriter(str(output_video_file), outputdict=output_dict)
+        # for i in range(smoothed_video.shape[0]):
+        #     writer.writeFrame(smoothed_video[i])
+        # writer.close()
+
+    def _save_unsuccessfully_aligned_video(self, sequence_id, output_video_file):
+        desired_processed_video_size = self.processed_video_size
+        videogen = skvideo.io.vreader(str(self.root_dir / self.video_list[sequence_id]))
+        first_frame = None
+        for frame in videogen:
+            first_frame = frame
+            break
+        assert first_frame is not None, "No frames found in video"
+        height = first_frame.shape[0]
+        width = first_frame.shape[1]
+
+        from skimage.transform import resize
+
+        output_dict = {
+            '-c:v': 'h264',
+            '-r': self.video_metas[sequence_id]['fps'],
+            '-b': self.video_metas[sequence_id].get('bit_rate', '300000000'),
+        }
+        Path(output_video_file).parent.mkdir(parents=True, exist_ok=True)
+        writer = skvideo.io.FFmpegWriter(str(output_video_file), outputdict=output_dict)
+
+        # write the first, already read-out frame (center-crop to a square, then resize)
+        if height < width:
+            diff = (width - height) // 2
+            first_frame = first_frame[:, diff: diff + height, :]
+        elif height > width:
+            diff = (height - width) // 2
+            first_frame = first_frame[diff: diff + width, :, :]
+        first_frame_resized = resize(first_frame, (desired_processed_video_size, desired_processed_video_size))
+        if first_frame_resized.dtype in [np.float32, np.float64]:
+            if first_frame_resized.max() < 5.:  # likely to be in range [0, 1]
+                first_frame_resized *= 255.0
+            first_frame_resized = first_frame_resized.astype(np.uint8)
+        writer.writeFrame(first_frame_resized)
+
+        # write the rest of the frames
+        for frame in videogen:
+            if height < width:
+                diff = (width - height) // 2
+                frame = frame[:, diff: diff + height, :]
+            elif height > width:
+                diff = (height - width) // 2
+                frame = frame[diff: diff + width, :, :]
+            frame_resized = resize(frame, (desired_processed_video_size, desired_processed_video_size))
+            if frame_resized.dtype in [np.float32, np.float64]:
+                if frame_resized.max() < 5.:  # likely to be in range [0, 1] (yeah, it's hacky, bite me)
+                    frame_resized *= 255.0
+                frame_resized = frame_resized.astype(np.uint8)
+
+            writer.writeFrame(frame_resized)
+        # for i in range(video_resized.shape[0]):
+        #     writer.writeFrame(video_resized[i])
+        writer.close()
+
 
     @staticmethod
     def _save_recognitions(file, labels, indices, mean, cov, fnames):
@@ -1617,8 +2619,8 @@ def _save_recognitions(file, labels, indices, mean, cov, fnames):
     @staticmethod
     def _load_recognitions(file):
         with open(file, "rb") as f:
-            indices = pkl.load(f)
             labels = pkl.load(f)
+            indices = pkl.load(f)
             mean = pkl.load(f)
             cov = pkl.load(f)
             fnames = pkl.load(f)
@@ -1671,9 +2673,6 @@ def _gather_detections(self, with_recognitions=True):
             self.detection_embeddings += [embeddings]
             self.detection_recognized_fnames += [recognized_detections_fnames]
 
-    def _assign_gt_to_detections(self):
-        for sid in range(self.num_sequences):
-            self.assign_gt_to_detections_sequence(sid)
 
     def get_default_recognition_threshold(self):
         #TODO: ensure that 0.6 is good for the most part
@@ -2144,280 +3143,6 @@ def _get_bb_center_from_fname(self, fname, detection_fname_list, center_list):
 
         # dataset = self.get_annotated_emotion_dataset(annotation_list, filter_pattern)
 
-    def assign_gt_to_detections_sequence(self, sequence_id):
-        print(f"Assigning GT to sequence {sequence_id}")
-
-        def second_most_frequent_label():
-            if len(most_frequent_labels) == 2:
-                second_label = most_frequent_labels[1]
-            elif len(most_frequent_labels) > 2:
-                raise RuntimeError(f"Too many labels occurred with the same frequency. Unclear which one to pick.")
-            else:
-                most_frequent_count2 = list(counts2labels.keys())[1]
-                most_frequent_labels2 = counts2labels[most_frequent_count2]
-                if len(most_frequent_labels2) != 1:
-                    raise RuntimeError(
-                        f"Too many labels occurred with the same frequency. Unclear which one to pick.")
-                second_label = most_frequent_labels2[0]
-            return second_label
-
-        def correct_left_right_order(left_center, right_center):
-            left_right_dim = 0  # TODO: verify if this is correct
-            if left_center[left_right_dim] < right_center[left_right_dim]:
-                # left is on the left
-                return 1
-            elif left_center[left_right_dim] == right_center[left_right_dim]:
-                # same place
-                return 0
-            # left is on the right
-            return -1
-
-        # detection_fnames = self._get_path_to_sequence_detections(sequence_id)
-        # full_frames = self._get_frames_for_sequence(sequence_id)
-        annotations = self._get_annotations_for_sequence(sequence_id)
-        if len(annotations) == 0:
-            print(f"No GT available for video '{self.video_list[sequence_id]}'")
-            return
-        annotation_type = annotations[0].parent.parent.parent.stem
-        if annotation_type == 'AU_Set':
-            anno_type = 'au8'  # AU1,AU2,AU4,AU6,AU12,AU15,AU20,AU25
-        elif annotation_type == 'Expression_Set':
-            anno_type = 'expr7'  # Neutral,Anger,Disgust,Fear,Happiness,Sadness,Surprise
-        elif annotation_type == 'VA_Set':
-            anno_type = 'va'  # valence arousal -1 to 1
-        else:
-            raise ValueError(f"Unsupported annotation type: '{annotation_type}'")
-
-        # load the recognitions:
-        # recognition_file = self._get_recognition_filename(
-        #     sequence_id, self.get_default_recognition_threshold())
-        # indices, labels, mean, cov, recognition_fnames = FaceVideoDataModule._load_recognitions(
-        #     recognition_file)
-        indices, labels, mean, cov, recognition_fnames = self._get_recognition_for_sequence(sequence_id)
-        counts2labels = OrderedDict()
-        for key, val in labels.items():
-            if key == -1:  # skip invalid outliers
-                continue
-            count = len(val)
-            if count not in counts2labels.keys():
-                counts2labels[count] = []
-            counts2labels[count] += [key]
-
-        recognition_label_dict = OrderedDict()
-        annotated_detection_fnames = OrderedDict()
-        validated_annotations = OrderedDict()
-        discarded_annotations = OrderedDict()
-        detection_not_found = OrderedDict()
-
-        # suffs = [str(Path(str(anno)[len(str(anno.parent / self.video_list[sequence_id].stem)):]).stem) for anno in
-        #          annotations]
-        suffs = [str(anno.stem)[len(str(self.video_list[sequence_id].stem)):] for anno in
-                 annotations]
-
-        ### WARNING: HORRIBLE THINGS FOLLOW, PUT ON YOUR PROTECTIVE GOGGLES BEFORE YOU PROCEED
-        # this next section is a ugly rule-based approach to assign annotation files to detected and recognized
-        # faces. This assignment is not provided by the authors of aff-wild2 and therefore it's approximated
-        # using these rules that are taken from the readme.
-
-        # THERE IS ONLY ONE DOMINANT DETECTION AND ONE ANNOTATION FILE:
-        if len(annotations) == 1 and suffs[0] == '':
-            most_frequent_count = list(counts2labels.keys())[0]
-            most_frequent_labels = counts2labels[most_frequent_count]
-
-            if len(most_frequent_labels) != 1:
-                raise ValueError("There seem to be two people at the same time in all pictures but we only "
-                                 "have annotation for one")
-
-            main_label = most_frequent_labels[0]
-            main_detection_file_names = recognition_fnames[main_label]
-            main_annotation_file = annotations[0]
-            main_valid_detection_list, main_valid_annotation_list, main_discarded_list, main_detection_not_found_list \
-                = self._map_detections_to_gt(main_detection_file_names, main_annotation_file, anno_type)
-
-            recognition_label_dict[main_annotation_file.stem] = main_label
-            annotated_detection_fnames[main_annotation_file.stem] = main_valid_detection_list
-            validated_annotations[main_annotation_file.stem] = main_valid_annotation_list
-            discarded_annotations[main_annotation_file.stem] = main_discarded_list
-            detection_not_found[main_annotation_file.stem] = main_detection_not_found_list
-
-        # THERE ARE TWO DOMINANT DETECTIONS BUT ONLY ONE IS ANNOTATED
-        elif len(annotations) == 1 and (suffs[0] == '_left' or suffs[0] == '_right'):
-
-            most_frequent_count = list(counts2labels.keys())[0]
-            most_frequent_labels = counts2labels[most_frequent_count]
-
-            detection_fnames, detection_centers, detection_sizes, _ = \
-                self._get_detection_for_sequence(sequence_id)
-
-            if len(most_frequent_labels) != 1:
-                raise ValueError("There seem to be two people at the same time in all pictures but we only "
-                                 "have annotation for one")
-
-            main_label = most_frequent_labels[0]
-            main_detection_file_names = recognition_fnames[main_label]
-            main_annotation_file = annotations[0]
-            main_valid_detection_list, main_valid_annotation_list, main_discarded_list, main_detection_not_found_list \
-                = self._map_detections_to_gt(main_detection_file_names, main_annotation_file, anno_type)
-
-            other_label = second_most_frequent_label()
-            other_detection_file_names = recognition_fnames[other_label]
-            other_annotation_file = annotations[0]  # use the same annotation, which one will be used is figured out next
-            other_valid_detection_list, other_valid_annotation_list, other_discarded_list, other_detection_not_found_list \
-                = self._map_detections_to_gt(other_detection_file_names, other_annotation_file, anno_type)
-
-            other_center = self._get_bb_center_from_fname(other_detection_file_names[0], detection_fnames,
-                                                          detection_centers)
-            main_center = self._get_bb_center_from_fname(main_detection_file_names[0], detection_fnames,
-                                                         detection_centers)
-            if correct_left_right_order(other_center, main_center) == 1:
-                pass  # do nothing, order correct
-            elif correct_left_right_order(other_center, main_center) == -1:
-                # swap main and other
-                print("Swapping left and right")
-                other_label, main_label = main_label, other_label
-                # other_valid_detection_list, main_valid_detection_list = main_valid_detection_list, other_valid_detection_list
-                # other_valid_annotation_list, main_valid_annotation_list = main_valid_annotation_list, other_valid_annotation_list
-            else:
-                raise ValueError("Detections are in the same place. No way to tell left from right")
-
-            # now other is on the left, and main is on the right, decide which one is annotated based on the suffix
-            if suffs[0] == '_left':
-                print("Choosing left")
-                recognition_label_dict[other_annotation_file.stem] = other_label
-                annotated_detection_fnames[other_annotation_file.stem] = other_valid_detection_list
-                validated_annotations[other_annotation_file.stem] = other_valid_annotation_list
-                discarded_annotations[other_annotation_file.stem] = other_discarded_list
-                detection_not_found[other_annotation_file.stem] = other_detection_not_found_list
-            else:  # suffs[0] == '_right':
-                print("Choosing right")
-                recognition_label_dict[main_annotation_file.stem] = main_label
-                annotated_detection_fnames[main_annotation_file.stem] = main_valid_detection_list
-                validated_annotations[main_annotation_file.stem] = main_valid_annotation_list
-                discarded_annotations[main_annotation_file.stem] = main_discarded_list
-                detection_not_found[main_annotation_file.stem] = main_detection_not_found_list
-        else:
-            if len(suffs) > 2:
-                print(f"Unexpected number of suffixes found {len(suffs)}")
-                print(suffs)
-                raise RuntimeError(f"Unexpected number of suffixes found {len(suffs)}")
-
-            most_frequent_count = list(counts2labels.keys())[0]
-            most_frequent_labels = counts2labels[most_frequent_count]
-
-            detection_fnames, detection_centers, detection_sizes, _ = \
-                self._get_detection_for_sequence(sequence_id)
-
-            # THE CASE OF ONE DOMINANT DETECTION AND ONE SMALLER ONE (NO SUFFIX vs LEFT/RIGHT)
-            if suffs[0] == '' and (suffs[1] == '_left' or suffs[1] == '_right'):
-                if len(most_frequent_labels) != 1:
-                    raise ValueError("There seem to be two people at the same time in all pictures but we only "
-                                     "have annotation for one")
-
-                main_label = most_frequent_labels[0]
-                main_detection_file_names = recognition_fnames[main_label]
-                main_annotation_file = annotations[0]
-                main_valid_detection_list, main_valid_annotation_list, main_discarded_list, main_detection_not_found_list \
-                    = self._map_detections_to_gt(main_detection_file_names, main_annotation_file, anno_type)
-
-                recognition_label_dict[main_annotation_file.stem] = main_label
-                annotated_detection_fnames[main_annotation_file.stem] = main_valid_detection_list
-                validated_annotations[main_annotation_file.stem] = main_valid_annotation_list
-                discarded_annotations[main_annotation_file.stem] = main_discarded_list
-                detection_not_found[main_annotation_file.stem] = main_detection_not_found_list
-
-                other_label = most_frequent_labels[1]
-                other_detection_file_names = recognition_fnames[other_label]
-                other_annotation_file = annotations[1]
-                other_valid_detection_list, other_valid_annotation_list, other_discarded_list, other_detection_not_found_list \
-                    = self._map_detections_to_gt(other_detection_file_names, other_annotation_file, anno_type)
-
-                recognition_label_dict[other_annotation_file.stem] = other_label
-                annotated_detection_fnames[other_annotation_file.stem] = other_valid_detection_list
-                validated_annotations[other_annotation_file.stem] = other_valid_annotation_list
-                discarded_annotations[other_annotation_file.stem] = other_discarded_list
-                detection_not_found[other_annotation_file.stem] = other_detection_not_found_list
-
-                other_center = self._get_bb_center_from_fname(other_detection_file_names[0], detection_fnames,
-                                                              detection_centers)
-                main_center = self._get_bb_center_from_fname(main_detection_file_names[0], detection_fnames,
-                                                             detection_centers)
-                if suffs[1] == '_left':
-                    if correct_left_right_order(other_center, main_center) != 1:
-                        raise RuntimeError("The main detection should be on the right and the other on the left but this is not the case")
-                elif suffs[1] == '_right':
-                    if correct_left_right_order(main_center, other_center) != 1:
-                        raise RuntimeError(
-                            "The main detection should be on the left and the other on the right but this is not the case")
-
-            # THE CASE OF TWO ROUGHLY EQUALY DOMINANT DETECTIONS (LEFT and RIGHT)
-            elif suffs[0] == '_left' and suffs[1] == '_right':
-                #TODO: figure out which one is left and which one is right by loading the bboxes and comparing
-                counts2labels.keys()
-                left_label = most_frequent_labels[0]
-                # if len(most_frequent_labels) == 2:
-                #     right_label = most_frequent_labels[1]
-                # elif len(most_frequent_labels) > 2:
-                #     raise RuntimeError(f"Too many labels occurred with the same frequency. Unclear which one to pick.")
-                # else:
-                #     most_frequent_count2 = list(counts2labels.keys())[1]
-                #     most_frequent_labels2 = counts2labels[most_frequent_count2]
-                #     if len(most_frequent_labels2) != 1:
-                #         raise RuntimeError(
-                #             f"Too many labels occurred with the same frequency. Unclear which one to pick.")
-                #     right_label = most_frequent_labels2[0]
-                right_label = second_most_frequent_label()
-
-                left_filename = recognition_fnames[left_label][0]
-                left_center = self._get_bb_center_from_fname(left_filename, detection_fnames, detection_centers)
-
-                right_filename = recognition_fnames[right_label][0]
-                right_center = self._get_bb_center_from_fname(right_filename, detection_fnames, detection_centers)
-
-                order = correct_left_right_order(left_center, right_center)
-                # if left is not left, swap
-                if order == -1:
-                    left_label, right_label = right_label, left_label
-                    left_filename, right_filename = right_filename, left_filename
-                elif order == 0:
-                    raise RuntimeError("Left and right detections have centers in the same place. "
-                                       "No way to tell left from right")
-
-                left_detection_file_names = recognition_fnames[left_label]
-                left_annotation_file = annotations[0]
-                left_valid_detection_list, left_annotation_list, left_discarded_list, left_detection_not_found_list \
-                    = self._map_detections_to_gt(left_detection_file_names, left_annotation_file, anno_type)
-                recognition_label_dict[left_annotation_file.stem] = left_label
-                annotated_detection_fnames[left_annotation_file.stem] = left_valid_detection_list
-                validated_annotations[left_annotation_file.stem] = left_annotation_list
-                discarded_annotations[left_annotation_file.stem] = left_discarded_list
-                detection_not_found[left_annotation_file.stem] = left_detection_not_found_list
-
-                right_detection_file_names = recognition_fnames[right_label]
-                right_annotation_file = annotations[1]
-
-                right_valid_detection_list, right_valid_annotation_list, right_discarded_list, right_detection_not_found_list \
-                    = self._map_detections_to_gt(right_detection_file_names, right_annotation_file, anno_type)
-                recognition_label_dict[right_annotation_file.stem] = right_label
-                annotated_detection_fnames[right_annotation_file.stem] = right_valid_detection_list
-                validated_annotations[right_annotation_file.stem] = right_valid_annotation_list
-                discarded_annotations[right_annotation_file.stem] = right_discarded_list
-                detection_not_found[right_annotation_file.stem] = right_detection_not_found_list
-
-            # THE FOLLOWING CASE SHOULD NEVER HAPPEN
-            else:
-                print(f"Unexpected annotation case found.")
-                print(suffs)
-                raise RuntimeError(f"Unexpected annotation case found: {str(suffs)}")
-
-        out_folder = self._get_path_to_sequence_detections(sequence_id)
-        out_file = out_folder / "valid_annotations.pkl"
-        FaceVideoDataModule._save_annotations(out_file, annotated_detection_fnames, validated_annotations,
-                                              recognition_label_dict, discarded_annotations, detection_not_found)
-
 def attach_audio_to_reconstruction_video(input_video, input_video_with_audio, output_video=None, overwrite=False):
     output_video = output_video or (Path(input_video).parent / (str(Path(input_video).stem) + "_with_sound.mp4"))
     if output_video.exists() and not overwrite:
@@ -2426,6 +3151,7 @@ def attach_audio_to_reconstruction_video(input_video, input_video_with_audio, ou
     cmd = "ffmpeg -y -i %s -i %s -c copy -map 0:0 -map 1:1 -shortest %s" \
           % (input_video, input_video_with_audio, output_video)
     os.system(cmd)
+    return output_video
@@ -2436,12 +3162,14 @@ def __init__(self, video_path, output_dir, processed_subfolder="processed",
                  face_detector_threshold=0.9,
                  image_size=224,
                  scale=1.25,
+                 detect = True,
                  batch_size=8,
                  num_workers=4,
                  device=None):
         self.video_path = Path(video_path)
         self.batch_size = batch_size
         self.num_workers = num_workers
+        self.detect = detect
         super().__init__(self.video_path.parent, output_dir, processed_subfolder,
                          face_detector,
@@ -2449,7 +3177,6 @@ def __init__(self, video_path, output_dir, processed_subfolder="processed",
                          image_size,
                          scale,
                          device)
-
 
     def prepare_data(self, *args, **kwargs):
         outdir = Path(self.output_dir)
@@ -2461,9 +3188,15 @@ def prepare_data(self, *args, **kwargs):
             self._loadMeta()
             return
         # else:
-        self._gather_data()
+        self._gather_data(exist_ok=True)
         self._unpack_videos()
+        # if self.detect:
         self._detect_faces()
+        # else:
+        #     src = self._get_path_to_sequence_frames(0)
+        #     dst = self._get_path_to_sequence_detections(0)
+        #     # create a symlink from src to dst
+        #     os.symlink(src, dst, target_is_directory=True)
         self._saveMeta()
 
     # def _get_unpacked_video_subfolder(self, video_idx):
@@ -2479,22 +3212,44 @@ def _gather_data(self, exist_ok=False):
         self.annotation_list = []
         self._gather_video_metadata()
 
+    def _detect_faces_in_image(self, image_path, detected_faces=None):
+        if self.detect:
+            return super()._detect_faces_in_image(image_path, None)
+        else:
+            # the image is already a detection
+            # get the size of the image from image_path using PIL
+            img = Image.open(image_path, mode="r")  # Image.open is lazy, the pixel data is not decoded here
+            # get the image dimensions
+            width, height = img.size
+            detected_faces = [np.array([0, 0, width, height])]
+            return super()._detect_faces_in_image(image_path, detected_faces)
 
     def _get_path_to_sequence_results(self, sequence_id, rec_method='EMOCA', suffix=''):
         return self._get_path_to_sequence_files(sequence_id, "results", rec_method, suffix)
 
-    def _get_reconstructions_for_sequence(self, sid, rec_method='emoca', retarget_suffix=None, image_type=None):
-        out_folder = self._get_path_to_sequence_results(sid, rec_method=rec_method,
-                                                        suffix=retarget_suffix)
+    def _get_reconstructions_for_sequence(self, sid, rec_method='emoca', retarget_suffix=None, image_type=None, out_folder=None):
+        if out_folder is None:
+            out_folder = self._get_path_to_sequence_results(sid, rec_method=rec_method,
+                                                            suffix=retarget_suffix)
+        else:
+            out_folder = Path(out_folder)
         if image_type is None:
             image_type = "geometry_detail"
+        assert image_type in ["geometry_detail", "geometry_coarse", "out_im_detail", "out_im_coarse"], f"Invalid image type: '{image_type}'"
+        # use subprocess to find all the image_type.png files in the out_folder,
+        # and sort them.
+        # vis_fnames = subprocess.check_output(["find", str(out_folder), "-name", f"{image_type}.png"])
         vis_fnames = sorted(list(out_folder.glob(f"**/{image_type}.png")))
         return vis_fnames
 
+    def _get_path_to_sequence_detections(self, sequence_id):
+        return self._get_path_to_sequence_files(sequence_id, "detections")
+
+
     def _get_path_to_sequence_files(self, sequence_id, file_type, method="", suffix=""):
         assert file_type in ['videos', 'detections', "landmarks", "segmentations",
-                             "emotions", "reconstructions", "results"]
+                             "emotions", "reconstructions", "results", "audio"], f"'{file_type}' is not a valid file type"
         video_file = self.video_list[sequence_id]
         if len(method) > 0:
             file_type += "/" + method
@@ -2515,4 +3270,16 @@ def setup(self, stage: Optional[str] = None):
 
 
     def test_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
-        return DataLoader(self.testdata, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
\ No newline at end of file
+        return DataLoader(self.testdata, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
+
+
+
+def dict_to_device(d, device):
+    for k, v in d.items():
+        if isinstance(v, torch.Tensor):
+            d[k] = v.to(device)
+        elif isinstance(v, dict):
+            d[k] = dict_to_device(v, device)
+        else:
+            pass
+    return d
\ No newline at end of file
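Editor's note: a small usage sketch for the dict_to_device helper added above (assuming it is imported from this module); the toy batch below is illustrative and not part of this change. Nested dictionaries are handled recursively, and non-tensor values are left untouched.

import torch
# from gdl.datasets.FaceVideoDataModule import dict_to_device  # assumed import

batch = {
    "image": torch.zeros(2, 3, 224, 224),
    "meta": {"landmarks": torch.zeros(2, 68, 2), "name": "clip_000"},  # "name" stays a plain string
}
device = "cuda" if torch.cuda.is_available() else "cpu"
batch = dict_to_device(batch, device)
print(batch["image"].device, batch["meta"]["landmarks"].device)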