ok

johndpope · May 13, 2024 · 75ea7d8 · 75ea7d8
1 parent 9c8f4b7
commit 75ea7d8
Show file tree

Hide file tree

Showing 6 changed files with 1,222 additions and 5 deletions.
diff --git a/EmoDataset.py b/EmoDataset.py
@@ -12,6 +12,7 @@
 import decord
 from typing import List, Tuple, Dict, Any
 from decord import VideoReader,AVReader
+import face_alignment
 
 class EMODataset(Dataset):
     def __init__(self, use_gpu:False, sample_rate: int, n_sample_frames: int, width: int, height: int, img_scale: Tuple[float, float], img_ratio: Tuple[float, float] = (0.9, 1.0), video_dir: str = ".", drop_ratio: float = 0.1, json_file: str = "", stage: str = 'stage1', transform: transforms.Compose = None):
@@ -24,8 +25,9 @@ def __init__(self, use_gpu:False, sample_rate: int, n_sample_frames: int, width:
         self.video_dir = video_dir
         self.transform = transform
         self.stage = stage
-        # self.feature_extractor = Wav2VecFeatureExtractor(model_name='facebook/wav2vec2-base-960h', device='cuda')
+        self.face_alignment = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, device='cpu')
 
+        # self.feature_extractor = Wav2VecFeatureExtractor(model_name='facebook/wav2vec2-base-960h', device='cuda')
         # self.face_mask_generator = FaceHelper()
         self.pixel_transform = transforms.Compose(
             [
@@ -62,6 +64,22 @@ def __init__(self, use_gpu:False, sample_rate: int, n_sample_frames: int, width:
         decord.bridge.set_bridge('torch')  # Optional: This line sets decord to directly output PyTorch tensors.
         self.ctx = decord.cpu()
 
+        # DRIVING VIDEO: decoded once here and cached for every sample.
+        video_drv_reader = VideoReader("./junk/-2KGPYEFnsU_8.mp4", ctx=self.ctx)
+        video_length = len(video_drv_reader)
+
+        # Kept on self: __getitem__ returns self.driving_vid_pil_image_list.
+        self.driving_vid_pil_image_list = []
+
+        for frame_idx in range(video_length):
+            # Read frame and convert to PIL Image
+            frame = Image.fromarray(video_drv_reader[frame_idx].numpy())
+
+            # Transform the frame; the current RNG state is handed to
+            # self.augmentation together with the pixel transform.
+            state = torch.get_rng_state()
+            pixel_values_frame = self.augmentation(frame, self.pixel_transform, state)
+            self.driving_vid_pil_image_list.append(pixel_values_frame)
 
     def __len__(self) -> int:
 
@@ -87,12 +105,20 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
 
 
         vid_pil_image_list = []
-
+        keypoints_list = []
 
         for frame_idx in range(video_length):
             # Read frame and convert to PIL Image
             frame = Image.fromarray(video_reader[frame_idx].numpy())
 
+
+            # Detect keypoints using face_alignment
+            keypoints = self.face_alignment.get_landmarks(video_reader[frame_idx].numpy())
+            if keypoints is not None:
+                keypoints_list.append(keypoints[0])
+            else:
+                keypoints_list.append(None)
+
             # Transform the frame
             state = torch.get_rng_state()
             pixel_values_frame = self.augmentation(frame, self.pixel_transform, state)
@@ -101,6 +127,8 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         # Convert list of lists to a tensor
         sample = {
             "video_id": video_id,
-            "images": vid_pil_image_list
+            "source_frames": vid_pil_image_list,
+            "driving_frames": self.driving_vid_pil_image_list,
+            "keypoints": keypoints_list
         }
         return sample
diff --git a/data/driving_video.json b/data/driving_video.json
@@ -0,0 +1,3 @@
+{"meta_info": {"appearance_mapping": ["blurry", "male", "young", "chubby", "pale_skin", "rosy_cheeks", "oval_face", "receding_hairline", "bald", "bangs", "black_hair", "blonde_hair", "gray_hair", "brown_hair", "straight_hair", "wavy_hair", "long_hair", "arched_eyebrows", "bushy_eyebrows", "bags_under_eyes", "eyeglasses", "sunglasses", "narrow_eyes", "big_nose", "pointy_nose", "high_cheekbones", "big_lips", "double_chin", "no_beard", "5_o_clock_shadow", "goatee", "mustache", "sideburns", "heavy_makeup", "wearing_earrings", "wearing_hat", "wearing_lipstick", "wearing_necklace", "wearing_necktie", "wearing_mask"], "action_mapping": ["blow", "chew", "close_eyes", "cough", "cry", "drink", "eat", "frown", "gaze", "glare", "head_wagging", "kiss", "laugh", "listen_to_music", "look_around", "make_a_face", "nod", "play_instrument", "read", "shake_head", "shout", "sigh", "sing", "sleep", "smile", "smoke", "sneer", "sneeze", "sniff", "talk", "turn", "weep", "whisper", "wink", "yawn"]}, "clips": {"-2KGPYEFnsU_8": {"ytb_id": "-2KGPYEFnsU", "duration": {"start_sec": 102.6, "end_sec": 106.52}, "bbox": {"top": 0.0991, "bottom": 0.612, "left": 0.1234, "right": 0.412}, "attributes": {"appearance": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "action": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "emotion": {"sep_flag": false, "labels": "neutral"}}, "version": "v0.1"}}}
+
+
diff --git a/junk/-2KGPYEFnsU_8.mp4 b/junk/-2KGPYEFnsU_8.mp4
diff --git a/junk/M2Ohb0FAaJU_1.mp4 b/junk/M2Ohb0FAaJU_1.mp4
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		{"meta_info": {"appearance_mapping": ["blurry", "male", "young", "chubby", "pale_skin", "rosy_cheeks", "oval_face", "receding_hairline", "bald", "bangs", "black_hair", "blonde_hair", "gray_hair", "brown_hair", "straight_hair", "wavy_hair", "long_hair", "arched_eyebrows", "bushy_eyebrows", "bags_under_eyes", "eyeglasses", "sunglasses", "narrow_eyes", "big_nose", "pointy_nose", "high_cheekbones", "big_lips", "double_chin", "no_beard", "5_o_clock_shadow", "goatee", "mustache", "sideburns", "heavy_makeup", "wearing_earrings", "wearing_hat", "wearing_lipstick", "wearing_necklace", "wearing_necktie", "wearing_mask"], "action_mapping": ["blow", "chew", "close_eyes", "cough", "cry", "drink", "eat", "frown", "gaze", "glare", "head_wagging", "kiss", "laugh", "listen_to_music", "look_around", "make_a_face", "nod", "play_instrument", "read", "shake_head", "shout", "sigh", "sing", "sleep", "smile", "smoke", "sneer", "sneeze", "sniff", "talk", "turn", "weep", "whisper", "wink", "yawn"]}, "clips": {"-2KGPYEFnsU_8": {"ytb_id": "-2KGPYEFnsU", "duration": {"start_sec": 102.6, "end_sec": 106.52}, "bbox": {"top": 0.0991, "bottom": 0.612, "left": 0.1234, "right": 0.412}, "attributes": {"appearance": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "action": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "emotion": {"sep_flag": false, "labels": "neutral"}}, "version": "v0.1"}}}