diff --git a/README.md b/README.md index 24796d3e..266d886d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # Py-FEAT: Python Facial Expression Analysis Toolbox +[![arXiv-badge](https://img.shields.io/badge/arXiv-2104.03509-red.svg)](https://arxiv.org/abs/2104.03509) [![Package versioning](https://img.shields.io/pypi/v/py-feat.svg)](https://pypi.org/project/py-feat/) [![Tests](https://github.com/cosanlab/py-feat/actions/workflows/tests_and_docs.yml/badge.svg)](https://github.com/cosanlab/py-feat/actions/workflows/tests_and_docs.yml) [![Coverage Status](https://coveralls.io/repos/github/cosanlab/py-feat/badge.svg?branch=master)](https://coveralls.io/github/cosanlab/py-feat?branch=master) diff --git a/docs/pages/changelog.md b/docs/pages/changelog.md index 334ad3c5..eebc6436 100644 --- a/docs/pages/changelog.md +++ b/docs/pages/changelog.md @@ -1,5 +1,16 @@ # Change Log +# 0.5.1 + +## Notes + +This is a maintenance release that addresses multiple under-the-hood issues with `py-feat` failing when images or videos contain 0 faces. It addresses the following specific issues amongst others and is recommended for all users: + +- [#153](https://github.com/cosanlab/py-feat/issues/153) +- [#155](https://github.com/cosanlab/py-feat/issues/155) +- [#158](https://github.com/cosanlab/py-feat/issues/158) +- [#160](https://github.com/cosanlab/py-feat/issues/160) + # 0.5.0 ## Notes diff --git a/docs/pages/intro.md b/docs/pages/intro.md index 46805a8b..ca752d5b 100644 --- a/docs/pages/intro.md +++ b/docs/pages/intro.md @@ -1,5 +1,6 @@ Py-Feat: Python Facial Expression Analysis Toolbox ============================ +[![arXiv-badge](https://img.shields.io/badge/arXiv-2104.03509-red.svg)](https://arxiv.org/abs/2104.03509) [![Package versioning](https://img.shields.io/pypi/v/py-feat.svg)](https://pypi.org/project/py-feat/) [![Tests](https://github.com/cosanlab/py-feat/actions/workflows/tests_and_docs.yml/badge.svg)](https://github.com/cosanlab/py-feat/actions/workflows/tests_and_docs.yml) [![Coverage Status](https://coveralls.io/repos/github/cosanlab/py-feat/badge.svg?branch=master)](https://coveralls.io/github/cosanlab/py-feat?branch=master) diff --git a/feat/data.py b/feat/data.py index 3c01cbd2..6040ae56 100644 --- a/feat/data.py +++ b/feat/data.py @@ -1974,19 +1974,21 @@ def _inverse_face_transform(faces, batch_data): out_face = [] for face in frame: out_face.append( - np.append( - ( - np.array( - [ - face[0] - left, - face[1] - top, - face[2] - left, - face[3] - top, - ] - ) - / scale - ), - face[4], + list( + np.append( + ( + np.array( + [ + face[0] - left, + face[1] - top, + face[2] - left, + face[3] - top, + ] + ) + / scale + ), + face[4], + ) ) ) out_frame.append(out_face) diff --git a/feat/detector.py b/feat/detector.py index 86e38cd7..b490eaf8 100644 --- a/feat/detector.py +++ b/feat/detector.py @@ -16,12 +16,14 @@ FEAT_FACEPOSE_COLUMNS_6D, FEAT_TIME_COLUMNS, set_torch_device, + is_list_of_lists_empty, ) from feat.utils.io import get_resource_path from feat.utils.image_operations import ( extract_face_from_landmarks, extract_face_from_bbox, convert_image_to_tensor, + BBox, ) from feat.pretrained import get_pretrained_models, fetch_model, AU_LANDMARK_MAP from feat.data import ( @@ -358,6 +360,7 @@ def detect_faces(self, frame, threshold=0.5, **face_model_kwargs): Args: frame (np.ndarray): 3d (single) or 4d (multiple) image array + threshold (float): threshold for detectiong faces (default=0.5) Returns: list: list of lists with the same length as the number of frames. 
Each list @@ -382,7 +385,7 @@ def detect_faces(self, frame, threshold=0.5, **face_model_kwargs): else: faces = self.face_detector(frame, **face_model_kwargs) - if len(faces) == 0: + if is_list_of_lists_empty(faces): logging.warning("Warning: NO FACE is detected") thresholded_face = [] @@ -417,51 +420,55 @@ def detect_landmarks(self, frame, detected_faces, **landmark_model_kwargs): logging.info("detecting landmarks...") frame = convert_image_to_tensor(frame) - if self.info["landmark_model"]: - if self.info["landmark_model"].lower() == "mobilenet": - out_size = 224 - else: - out_size = 112 + if is_list_of_lists_empty(detected_faces): + list_concat = detected_faces + else: + if self.info["landmark_model"]: + if self.info["landmark_model"].lower() == "mobilenet": - extracted_faces, new_bbox = extract_face_from_bbox( - frame, detected_faces, face_size=out_size - ) + out_size = 224 + else: + out_size = 112 - extracted_faces = extracted_faces / 255.0 + extracted_faces, new_bbox = extract_face_from_bbox( + frame, detected_faces, face_size=out_size + ) - if self.info["landmark_model"].lower() == "mobilenet": - extracted_faces = Compose( - [Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])] - )(extracted_faces) + extracted_faces = extracted_faces / 255.0 - # Run Landmark Model - if self.info["landmark_model"].lower() == "mobilefacenet": - landmark = ( - self.landmark_detector(extracted_faces, **landmark_model_kwargs)[0] - .cpu() - .data.numpy() - ) - else: - landmark = ( - self.landmark_detector(extracted_faces, **landmark_model_kwargs) - .cpu() - .data.numpy() - ) + if self.info["landmark_model"].lower() == "mobilenet": + extracted_faces = Compose( + [Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])] + )(extracted_faces) + + # Run Landmark Model + if self.info["landmark_model"].lower() == "mobilefacenet": + landmark = ( + self.landmark_detector(extracted_faces, **landmark_model_kwargs)[0] + .cpu() + .data.numpy() + ) + else: + landmark = ( + self.landmark_detector(extracted_faces, **landmark_model_kwargs) + .cpu() + .data.numpy() + ) - landmark = landmark.reshape(landmark.shape[0], -1, 2) + landmark = landmark.reshape(landmark.shape[0], -1, 2) - landmark_results = [] - for ik in range(landmark.shape[0]): + landmark_results = [] + for ik in range(landmark.shape[0]): - landmark_results.append( - new_bbox[ik].inverse_transform_landmark(landmark[ik, :, :]) - ) + landmark_results.append( + new_bbox[ik].inverse_transform_landmark(landmark[ik, :, :]) + ) - length_index = [len(x) for x in detected_faces] - new_lens = np.insert(np.cumsum(length_index), 0, 0) - list_concat = [] - for ij in range(len(length_index)): - list_concat.append(landmark_results[new_lens[ij] : new_lens[ij + 1]]) + length_index = [len(x) for x in detected_faces] + new_lens = np.insert(np.cumsum(length_index), 0, 0) + list_concat = [] + for ij in range(len(length_index)): + list_concat.append(landmark_results[new_lens[ij] : new_lens[ij + 1]]) return list_concat @@ -488,12 +495,17 @@ def detect_facepose(self, frame, landmarks=None, **facepose_model_kwargs): # Normalize Data frame = convert_image_to_tensor(frame, img_type="float32") / 255 + output = {} if "img2pose" in self.info["facepose_model"]: faces, poses = self.facepose_detector(frame, **facepose_model_kwargs) + output["faces"] = faces + output["poses"] = poses else: - poses = self.facepose_detector(frame, landmarks, **facepose_model_kwargs) + output["poses"] = self.facepose_detector( + frame, landmarks, **facepose_model_kwargs + ) - return poses + return output def 
detect_aus(self, frame, landmarks, **au_model_kwargs): """Detect Action Units from image or video frame @@ -515,66 +527,73 @@ def detect_aus(self, frame, landmarks, **au_model_kwargs): logging.info("detecting aus...") frame = convert_image_to_tensor(frame, img_type="float32") - # frame = transforms.ToTensor()(frame) - - if self["au_model"].lower() in ["svm", "xgb"]: - # transform = Grayscale(3) - # frame = transform(frame) - hog_arr, new_lands = self._batch_hog(frames=frame, landmarks=landmarks) - au_predictions = self.au_model.detect_au( - frame=hog_arr, landmarks=new_lands, **au_model_kwargs - ) + + if is_list_of_lists_empty(landmarks): + return landmarks else: - au_predictions = self.au_model.detect_au( - frame, landmarks=landmarks, **au_model_kwargs - ) + if self["au_model"].lower() in ["svm", "xgb"]: + # transform = Grayscale(3) + # frame = transform(frame) + hog_features, new_landmarks = self._batch_hog( + frames=frame, landmarks=landmarks + ) + au_predictions = self.au_model.detect_au( + frame=hog_features, landmarks=new_landmarks, **au_model_kwargs + ) + else: + au_predictions = self.au_model.detect_au( + frame, landmarks=landmarks, **au_model_kwargs + ) - return self._convert_detector_output(landmarks, au_predictions) + return self._convert_detector_output(landmarks, au_predictions) def _batch_hog(self, frames, landmarks): """ Helper function used in batch processing hog features - frames is a batch of frames - """ - len_index = [len(aa) for aa in landmarks] - lenth_cumu = np.cumsum(len_index) - lenth_cumu2 = np.insert(lenth_cumu, 0, 0) - new_lands_list = [] - flat_land = [item for sublist in landmarks for item in sublist] - hogs_arr = None + Args: + frames: a batch of frames + landmarks: a list of list of detected landmarks - for i in range(len(flat_land)): + Returns: + hog_features: a numpy array of hog features for each detected landmark + landmarks: updated landmarks + """ - frame_assignment = np.where(i < lenth_cumu)[0][0] + hog_features = [] + new_landmark_frames = [] + for i, frame_landmark in enumerate(landmarks): + if len(frame_landmark) != 0: + new_landmarks_faces = [] + for j in range(len(frame_landmark)): + convex_hull, new_landmark = extract_face_from_landmarks( + frame=frames[i], + landmarks=frame_landmark[j], + face_size=112, + ) - convex_hull, new_lands = extract_face_from_landmarks( - frame=frames[frame_assignment], - landmarks=flat_land[i], - face_size=112, - ) + hog_features.append( + hog( + transforms.ToPILImage()(convex_hull[0]), + orientations=8, + pixels_per_cell=(8, 8), + cells_per_block=(2, 2), + visualize=False, + channel_axis=-1, + ).reshape(1, -1) + ) - hogs = hog( - transforms.ToPILImage()(convex_hull[0]), - orientations=8, - pixels_per_cell=(8, 8), - cells_per_block=(2, 2), - visualize=False, - channel_axis=-1, - ).reshape(1, -1) - - if hogs_arr is None: - hogs_arr = hogs + new_landmarks_faces.append(new_landmark) + new_landmark_frames.append(new_landmarks_faces) else: - hogs_arr = np.concatenate([hogs_arr, hogs], 0) - - new_lands_list.append(new_lands) + hog_features.append( + np.zeros((1, 5408)) + ) # LC: Need to confirm this size is fixed. 
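# Note on the np.zeros((1, 5408)) placeholder above: for a 112 x 112 face crop, HOG with
# orientations=8, pixels_per_cell=(8, 8), and cells_per_block=(2, 2) produces 13 x 13
# blocks * (2 * 2 * 8) features per block = 5408, so the size is fixed as long as
# face_size=112 and these HOG settings stay unchanged. A minimal sanity check, assuming
# scikit-image is installed (the dummy crop below is only an illustration):
import numpy as np
from skimage.feature import hog

dummy_crop = np.zeros((112, 112, 3), dtype=np.uint8)  # stand-in for one extracted face
features = hog(
    dummy_crop,
    orientations=8,
    pixels_per_cell=(8, 8),
    cells_per_block=(2, 2),
    visualize=False,
    channel_axis=-1,
)
assert features.size == 5408  # matches the shape used to pad frames with no faces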
+ new_landmark_frames.append([np.zeros((68, 2))]) - new_lands = [] - for i in range(len(lenth_cumu)): - new_lands.append(new_lands_list[lenth_cumu2[i] : (lenth_cumu2[i + 1])]) + hog_features = np.concatenate(hog_features) - return (hogs_arr, new_lands) + return (hog_features, new_landmark_frames) def detect_emotions(self, frame, facebox, landmarks, **emotion_model_kwargs): """Detect emotions from image or video frame @@ -600,42 +619,89 @@ def detect_emotions(self, frame, facebox, landmarks, **emotion_model_kwargs): logging.info("detecting emotions...") frame = convert_image_to_tensor(frame, img_type="float32") - if self.info["emotion_model"].lower() == "resmasknet": - return self._convert_detector_output( - facebox, - self.emotion_model.detect_emo(frame, facebox, **emotion_model_kwargs), - ) + if is_list_of_lists_empty(facebox): + return facebox + else: + if self.info["emotion_model"].lower() == "resmasknet": + return self._convert_detector_output( + facebox, + self.emotion_model.detect_emo( + frame, facebox, **emotion_model_kwargs + ), + ) - elif self.info["emotion_model"].lower() == "svm": - hog_arr, new_lands = self._batch_hog(frames=frame, landmarks=landmarks) - return self._convert_detector_output( - landmarks, - self.emotion_model.detect_emo( - frame=hog_arr, landmarks=new_lands, **emotion_model_kwargs - ), - ) + elif self.info["emotion_model"].lower() == "svm": + hog_features, new_landmarks = self._batch_hog( + frames=frame, landmarks=landmarks + ) + return self._convert_detector_output( + landmarks, + self.emotion_model.detect_emo( + frame=hog_features, + landmarks=new_landmarks, + **emotion_model_kwargs, + ), + ) - else: - raise ValueError( - "Cannot recognize input emo model! Please try to re-type emotion model" - ) + else: + raise ValueError( + "Cannot recognize input emo model! Please try to re-type emotion model" + ) - def _check_detections(self, faces, landmarks, poses, aus, emotions, batch_data): + def _run_detection_waterfall( + self, + batch_data, + face_detection_threshold, + face_model_kwargs, + landmark_model_kwargs, + facepose_model_kwargs, + emotion_model_kwargs, + au_model_kwargs, + ): """ - Private method to ensure that all detectors return the same number of detections + Main detection "waterfall." Calls each individual detector in the sequence + required to support any interactions between detections. 
Called + behind-the-scenes by .detect_image() and .detect_video() + + Args: + batch_data (dict): singleton item from iterating over the output of a DataLoader + face_detection_threshold (float): value between 0-1 + face_model_kwargs (dict): face model kwargs + landmark_model_kwargs (dict): landmark model kwargs + facepose_model_kwargs (dict): facepose model kwargs + emotion_model_kwargs (dict): emotion model kwargs + au_model_kwargs (dict): au model kwargs + + Returns: + tuple: faces, landmarks, poses, aus, emotions """ + faces = self.detect_faces( + batch_data["Image"], + threshold=face_detection_threshold, + **face_model_kwargs, + ) - # Each input arg is a nested list with length == number of faces in the batch + landmarks = self.detect_landmarks( + batch_data["Image"], + detected_faces=faces, + **landmark_model_kwargs, + ) + poses_dict = self.detect_facepose( + batch_data["Image"], landmarks, **facepose_model_kwargs + ) + aus = self.detect_aus(batch_data["Image"], landmarks, **au_model_kwargs) + emotions = self.detect_emotions( + batch_data["Image"], faces, landmarks, **emotion_model_kwargs + ) - # Check 1) img2pose sometimes gives fewer detections that other models, we can't - # properly assemble Fex when that's the case. Returning more or the same - # detections is ok for now - if len(poses[0]) >= len(faces[0]): - return + faces = _inverse_face_transform(faces, batch_data) + landmarks = _inverse_landmark_transform(landmarks, batch_data) - raise ValueError( - f"Mismatch across detectors when processing batch: {batch_data['FileNames']}\n\nAn error occurred trying to merge detections into a single Fex object, as each type of detector is detecting a different number of faces:\n\nface_detector: {len(faces[0])}\npose_detector: {len(poses[0])}\nlandmark_detector: {len(landmarks[0])}\nau_detector: {len(aus[0])}\nemotion_detector: {len(emotions[0])}\n\nThis can happen for a number of reasons. Here are a few solutions:\n\n1) the face_model is too liberal. You use the 'threshold' keyword argument to make the detector more conservative, e.g. threshold= some val > 0.5\n2) the pose_detector gives different predictions than other detectors. You can use the same model for both pose and face detection by setting face_model='img2pose' and pose_model='img2pose' (or 'img2pose-c')" + # match faces to poses - sometimes face detector finds different faces than pose detector. + faces, poses = self._match_faces_to_poses( + faces, poses_dict["faces"], poses_dict["poses"] ) + return faces, landmarks, poses, aus, emotions def detect_image( self, @@ -645,8 +711,7 @@ def detect_image( num_workers=0, pin_memory=False, frame_counter=0, - skip_failed_detections=False, - threshold=0.5, + face_detection_threshold=0.5, **kwargs, ): """ @@ -661,17 +726,17 @@ def detect_image( Args: input_file_list (list of str): Path to a list of paths to image files. output_size (int): image size to rescale all image preserving aspect ratio. - Will raise an error if not set and batch_size > 1 but images are not the same size + Will raise an error if not set and batch_size > 1 but images are not the same size batch_size (int): how many batches of images you want to run at one shot. - Larger gives faster speed but is more memory-consuming. Images must be the + Larger gives faster speed but is more memory-consuming. Images must be the same size to be run in batches! num_workers (int): how many subprocesses to use for data loading. ``0`` means that the data will be loaded in the main process. 
pin_memory (bool): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them. If your data elements are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type frame_counter (int): starting value to count frames - threshold (float): value between 0-1 to report a detection based on the - confidence of the face detector; Default >= 0.5 + face_detection_threshold (float): value between 0-1 to report a detection based on the + confidence of the face detector; Default >= 0.5 **kwargs: you can pass each detector specific kwargs using a dictionary - like: `face_model_kwargs = {...}, au_model_kwargs={...}, ...` + like: `face_model_kwargs = {...}, au_model_kwargs={...}, ...` Returns: Fex: Prediction results dataframe @@ -701,47 +766,34 @@ def detect_image( warnings.warn( "Currently using mobilenet for landmark detection with batch_size > 1 may lead to erroneous detections. We recommend either setting batch_size=1 or using mobilefacenet as the landmark detection model. You can follow this issue for more: https://github.com/cosanlab/py-feat/issues/151" ) - try: + try: batch_output = [] + for batch_id, batch_data in enumerate(tqdm(data_loader)): - frame_counter += frame_counter + batch_id * batch_size - faces = self.detect_faces( - batch_data["Image"], threshold=threshold, **face_model_kwargs - ) - landmarks = self.detect_landmarks( - batch_data["Image"], detected_faces=faces, **landmark_model_kwargs - ) - poses = self.detect_facepose( - batch_data["Image"], landmarks, **facepose_model_kwargs + + faces, landmarks, poses, aus, emotions = self._run_detection_waterfall( + batch_data, + face_detection_threshold, + face_model_kwargs, + landmark_model_kwargs, + facepose_model_kwargs, + emotion_model_kwargs, + au_model_kwargs, ) - aus = self.detect_aus(batch_data["Image"], landmarks, **au_model_kwargs) - emotions = self.detect_emotions( - batch_data["Image"], faces, landmarks, **emotion_model_kwargs + + output = self._create_fex( + faces, + landmarks, + poses, + aus, + emotions, + batch_data["FileNames"], + frame_counter, ) + batch_output.append(output) + frame_counter += 1 * batch_size - faces = _inverse_face_transform(faces, batch_data) - landmarks = _inverse_landmark_transform(landmarks, batch_data) - try: - self._check_detections( - faces, landmarks, poses, aus, emotions, batch_data - ) - output = self._create_fex( - faces, - landmarks, - poses, - aus, - emotions, - batch_data["FileNames"], - frame_counter, - ) - batch_output.append(output) - except ValueError as e: - if skip_failed_detections: - print(e) - continue - else: - raise e batch_output = pd.concat(batch_output) batch_output.reset_index(drop=True, inplace=True) @@ -759,7 +811,8 @@ def detect_video( batch_size=1, num_workers=0, pin_memory=False, - **detector_kwargs, + face_detection_threshold=0.5, + **kwargs, ): """Detects FEX from a video file. @@ -773,11 +826,20 @@ def detect_video( pin_memory (bool): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them. 
If your data elements are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type + face_detection_threshold (float): value between 0-1 to report a detection based on the + confidence of the face detector; Default >= 0.5 Returns: Fex: Prediction results dataframe """ + # Keyword arguments than can be passed to the underlying models + face_model_kwargs = kwargs.pop("face_model_kwargs", dict()) + landmark_model_kwargs = kwargs.pop("landmark_model_kwargs", dict()) + au_model_kwargs = kwargs.pop("au_model_kwargs", dict()) + emotion_model_kwargs = kwargs.pop("emotion_model_kwargs", dict()) + facepose_model_kwargs = kwargs.pop("facepose_model_kwargs", dict()) + data_loader = DataLoader( VideoDataset(video_path, skip_frames=skip_frames, output_size=output_size), num_workers=num_workers, @@ -787,48 +849,37 @@ def detect_video( ) batch_output = [] + for batch_data in tqdm(data_loader): - faces = self.detect_faces(batch_data["Image"], **detector_kwargs) - landmarks = self.detect_landmarks( - batch_data["Image"], detected_faces=faces, **detector_kwargs - ) - poses = self.detect_facepose(batch_data["Image"], **detector_kwargs) - aus = self.detect_aus(batch_data["Image"], landmarks, **detector_kwargs) - emotions = self.detect_emotions( - batch_data["Image"], faces, landmarks, **detector_kwargs + + faces, landmarks, poses, aus, emotions = self._run_detection_waterfall( + batch_data, + face_detection_threshold, + face_model_kwargs, + landmark_model_kwargs, + facepose_model_kwargs, + emotion_model_kwargs, + au_model_kwargs, ) + frames = list(batch_data["Frame"].numpy()) - landmarks = _inverse_landmark_transform(landmarks, batch_data) + output = self._create_fex( - faces, landmarks, poses, aus, emotions, batch_data["FileName"], frames + faces, + landmarks, + poses, + aus, + emotions, + batch_data["FileName"], + frames, ) + batch_output.append(output) batch_output = pd.concat(batch_output) batch_output.reset_index(drop=True, inplace=True) return batch_output.set_index("frame", drop=False) - def _convert_detector_output(detected_faces, detector_results): - """Helper function to convert AU/Emotion detector output into frame by face list of lists. - - Args: - detected_faces (list): list of lists output from face/landmark detector - au_results (np.array):, results from au/emotion detectors - - Returns: - list_concat: (list of list). The list which contains the number of faces. 
for example - if you process 2 frames and each frame contains 4 faces, it will return: - [[xxx,xxx,xxx,xxx],[xxx,xxx,xxx,xxx]] - """ - - length_index = [len(x) for x in detected_faces] - - list_concat = [] - new_lens = np.insert(np.cumsum(length_index), 0, 0) - for ij in range(len(length_index)): - list_concat.append(detector_results[new_lens[ij] : new_lens[ij + 1], :]) - return list_concat - def _create_fex( self, faces, landmarks, poses, aus, emotions, file_names, frame_counter ): @@ -848,11 +899,53 @@ def _create_fex( """ logging.info("creating fex output...") - files = [[f] * n for f, n in zip(file_names, [len(x) for x in faces])] - # Convert to Pandas Format out = [] for i, frame in enumerate(faces): + if not frame: + facebox_df = pd.DataFrame( + {x: np.nan for x in self.info["face_detection_columns"]}, + columns=self.info["face_detection_columns"], + index=[i], + ) + facepose_df = pd.DataFrame( + {x: np.nan for x in self.info["facepose_model_columns"]}, + columns=self.info["facepose_model_columns"], + index=[i], + ) + landmarks_df = pd.DataFrame( + {x: np.nan for x in self.info["face_landmark_columns"]}, + columns=self.info["face_landmark_columns"], + index=[i], + ) + aus_df = pd.DataFrame( + {x: np.nan for x in self.info["au_presence_columns"]}, + columns=self.info["au_presence_columns"], + index=[i], + ) + emotions_df = pd.DataFrame( + {x: np.nan for x in self.info["emotion_model_columns"]}, + columns=self.info["emotion_model_columns"], + index=[i], + ) + input_df = pd.DataFrame(file_names[i], columns=["input"], index=[i]) + tmp_df = pd.concat( + [ + facebox_df, + landmarks_df, + facepose_df, + aus_df, + emotions_df, + input_df, + ], + axis=1, + ) + if isinstance(frame_counter, (list)): + tmp_df[FEAT_TIME_COLUMNS] = frame_counter[i] + else: + tmp_df[FEAT_TIME_COLUMNS] = frame_counter + i + out.append(tmp_df) + for j, face_in_frame in enumerate(frame): facebox_df = pd.DataFrame( [ @@ -869,7 +962,7 @@ def _create_fex( ) facepose_df = pd.DataFrame( - [poses[i][j].flatten(order="F")], + [poses[i][j]], columns=self.info["facepose_model_columns"], index=[j], ) @@ -887,13 +980,15 @@ def _create_fex( ) emotions_df = pd.DataFrame( - emotions[i][j, :].reshape(1, len(FEAT_EMOTION_COLUMNS)), - columns=FEAT_EMOTION_COLUMNS, + emotions[i][j, :].reshape( + 1, len(self.info["emotion_model_columns"]) + ), + columns=self.info["emotion_model_columns"], index=[j], ) input_df = pd.DataFrame( - files[i][j], + file_names[i], columns=["input"], index=[j], ) @@ -915,6 +1010,7 @@ def _create_fex( else: tmp_df[FEAT_TIME_COLUMNS] = frame_counter + i out.append(tmp_df) + out = pd.concat(out) out.reset_index(drop=True, inplace=True) @@ -922,9 +1018,9 @@ def _create_fex( return Fex( out, au_columns=self.info["au_presence_columns"], - emotion_columns=FEAT_EMOTION_COLUMNS, - facebox_columns=FEAT_FACEBOX_COLUMNS, - landmark_columns=openface_2d_landmark_columns, + emotion_columns=self.info["emotion_model_columns"], + facebox_columns=self.info["face_detection_columns"], + landmark_columns=self.info["face_landmark_columns"], facepose_columns=self.info["facepose_model_columns"], detector="Feat", face_model=self.info["face_model"], @@ -934,7 +1030,8 @@ def _create_fex( facepose_model=self.info["facepose_model"], ) - def _convert_detector_output(self, detected_faces, detector_results): + @staticmethod + def _convert_detector_output(detected_faces, detector_results): """ Helper function to convert AU/Emotion detector output into frame by face list of lists. 
Either face or landmark detector list of list outputs can be used. @@ -956,3 +1053,136 @@ def _convert_detector_output(self, detected_faces, detector_results): for ij in range(len(length_index)): list_concat.append(detector_results[new_lens[ij] : new_lens[ij + 1], :]) return list_concat + + @staticmethod + def _match_faces_to_poses(faces, faces_pose, poses): + """Helper function to match list of lists of faces and poses based on overlap in bounding boxes. + + Sometimes the face detector finds different faces than the pose detector unless the user + is using the same detector (i.e., img2pose). + + This function will match the faces and poses and will return nans if more faces are detected then poses. + Will only return poses that match faces even if more faces are detected by pose detector. + + Args: + faces (list): list of lists of face bounding boxes from face detector + faces_pose (list): list of lists of face bounding boxes from pose detector + poses (list): list of lists of poses from pose detector + + Returns: + faces (list): list of list of faces that have been matched to poses + poses (list): list of list of poses that have been matched to faces + """ + + if len(faces) != len(faces_pose): + raise ValueError( + "Make sure the number of batches in faces and poses is the same." + ) + + if is_list_of_lists_empty(faces): + # Currently assuming no faces if no face is detected. Not running pose + return (faces, poses) + + else: + + overlap_faces = [] + overlap_poses = [] + for frame_face, frame_face_pose, frame_pose in zip( + faces, faces_pose, poses + ): + if not frame_face: + n_faces = 0 + elif isinstance(frame_face[0], list): + n_faces = len(frame_face) + else: + n_faces = 1 + + if not frame_face_pose: + n_poses = 0 + elif isinstance(frame_face_pose[0], list): + n_poses = len(frame_face_pose) + else: + n_poses = 1 + + frame_overlap = np.zeros([n_faces, n_poses]) + + if n_faces == 0: + overlap_faces.append([]) + overlap_poses.append([]) + + elif (n_faces == 1) & (n_poses > 1): + b1 = BBox(frame_face[0][:-1]) + + for pose_idx in range(n_poses): + b2 = BBox(frame_face_pose[pose_idx][:-1]) + frame_overlap[0, pose_idx] = b1.overlap(b2) + matched_pose_index = np.where( + frame_overlap[0, :] == frame_overlap[0, :].max() + )[0][0] + overlap_faces.append(frame_face) + overlap_poses.append([frame_pose[matched_pose_index]]) + + elif (n_faces > 1) & (n_poses == 1): + b2 = BBox(frame_face_pose[0][:-1]) + for face_idx in range(n_faces): + b1 = BBox(frame_face[face_idx][:-1]) + frame_overlap[face_idx, 0] = b1.overlap(b2) + matched_face_index = np.where( + frame_overlap[:, 0] == frame_overlap[:, 0].max() + )[0][0] + new_poses = [] + for f_idx in range(n_faces): + if f_idx == matched_face_index: + new_poses.append(frame_pose[0]) + else: + new_poses.append(np.ones(3) * np.nan) + overlap_faces.append(frame_face) + overlap_poses.append(new_poses) + + else: + for face_idx in range(n_faces): + b1 = BBox(frame_face[face_idx][:-1]) + for pose_idx in range(n_poses): + b2 = BBox(frame_face_pose[pose_idx][:-1]) + frame_overlap[face_idx, pose_idx] = b1.overlap(b2) + + overlap_faces_frame = [] + overlap_poses_frame = [] + if n_faces < n_poses: + for face_idx in range(n_faces): + pose_idx = np.where( + frame_overlap[face_idx, :] + == frame_overlap[face_idx, :].max() + )[0][0] + overlap_faces_frame.append(frame_face[face_idx]) + overlap_poses_frame.append(frame_pose[pose_idx]) + elif n_faces > n_poses: + matched_pose_index = [] + for pose_idx in range(n_poses): + matched_pose_index.append( + np.where( + 
frame_overlap[:, pose_idx] + == frame_overlap[:, pose_idx].max() + )[0][0] + ) + for face_idx in range(n_faces): + overlap_faces_frame.append(frame_face[face_idx]) + if face_idx in matched_pose_index: + overlap_poses_frame.append( + frame_pose[ + np.where( + frame_overlap[face_idx, :] + == frame_overlap[face_idx, :].max() + )[0][0] + ] + ) + else: + overlap_poses_frame.append(np.ones(3) * np.nan) + elif n_faces == n_poses: + overlap_faces_frame = frame_face + overlap_poses_frame = frame_pose + + overlap_faces.append(overlap_faces_frame) + overlap_poses.append(overlap_poses_frame) + + return (overlap_faces, overlap_poses) diff --git a/feat/face_detectors/FaceBoxes/FaceBoxes_test.py b/feat/face_detectors/FaceBoxes/FaceBoxes_test.py index 7a96f982..1bcfede2 100644 --- a/feat/face_detectors/FaceBoxes/FaceBoxes_test.py +++ b/feat/face_detectors/FaceBoxes/FaceBoxes_test.py @@ -52,7 +52,7 @@ def __init__( top_k=5000, keep_top_k=750, nms_threshold=0.3, - vis_threshold=0.5, + detection_threshold=0.5, resize=1, device="auto", ): @@ -82,14 +82,14 @@ def __init__( self.top_k, self.keep_top_k, self.nms_threshold, - self.vis_threshold, + self.detection_threshold, self.resize, ) = ( confidence_threshold, top_k, keep_top_k, nms_threshold, - vis_threshold, + detection_threshold, resize, ) @@ -148,11 +148,11 @@ def _calculate_boxinfo(self, im_height, im_width, loc, conf, scale): # keep top-K faster NMS dets = dets[: self.keep_top_k, :] - # filter using vis_thres - rescale box size to be proportional to image size + # filter using detection_threshold - rescale box size to be proportional to image size scale_x, scale_y = (im_width / im_height, im_height / im_width) det_bboxes = [] for b in dets: - if b[4] > self.vis_threshold: + if b[4] > self.detection_threshold: xmin, ymin, xmax, ymax, score = b det_bboxes.append( [ diff --git a/feat/face_detectors/MTCNN/MTCNN_test.py b/feat/face_detectors/MTCNN/MTCNN_test.py index 113e88fb..f67be176 100644 --- a/feat/face_detectors/MTCNN/MTCNN_test.py +++ b/feat/face_detectors/MTCNN/MTCNN_test.py @@ -32,6 +32,7 @@ class MTCNN(nn.Module): (default: {0}) min_face_size {int} -- Minimum face size to search for. (default: {20}) thresholds {list} -- MTCNN face detection thresholds (default: {[0.6, 0.7, 0.7]}) + detection_threshold (float): threshold for detectiong faces (default=0.5). Will override the last stage of thresholds factor {float} -- Factor used to create a scaling pyramid of face sizes. (default: {0.709}) post_process {bool} -- Whether or not to post process images tensors before returning. (default: {True}) @@ -46,8 +47,7 @@ class MTCNN(nn.Module): "center_weighted_size": box size minus weighted squared offset from image center (default: {None}) keep_all {bool} -- If True, all detected faces are returned, in the order dictated by the - select_largest parameter. If a save_path is specified, the first face is saved to that - path and the remaining faces are saved to 1, 2 etc. + select_largest parameter. (default: {False}) device {torch.device} -- The device on which to run neural net passes. Image tensors and models are copied to this device before running forward passes. 
(default: 'auto') @@ -59,11 +59,12 @@ def __init__( margin=0, min_face_size=20, thresholds=[0.6, 0.7, 0.7], + detection_threshold=0.5, factor=0.709, post_process=True, select_largest=True, selection_method=None, - keep_all=False, + keep_all=True, device="auto", ): super().__init__() @@ -72,6 +73,7 @@ def __init__( self.margin = margin self.min_face_size = min_face_size self.thresholds = thresholds + self.thresholds[-1] = detection_threshold self.factor = factor self.post_process = post_process self.select_largest = select_largest diff --git a/feat/face_detectors/Retinaface/Retinaface_test.py b/feat/face_detectors/Retinaface/Retinaface_test.py index f1a56dc8..ed5e71d2 100644 --- a/feat/face_detectors/Retinaface/Retinaface_test.py +++ b/feat/face_detectors/Retinaface/Retinaface_test.py @@ -21,7 +21,7 @@ def __init__( self, device="auto", resize=1, - vis_threshold=0.5, + detection_threshold=0.5, nms_threshold=0.4, keep_top_k=750, top_k=5000, @@ -34,7 +34,7 @@ def __init__( device: (str) timer_flag: (bool) resize: (int) - vis_threshold: (float) + detection_threshold: (float) nms_threshold: (float) keep_top_k: (float) top_k: (float) @@ -78,14 +78,14 @@ def __init__( # Set cutoff parameters ( self.resize, - self.vis_threshold, + self.detection_threshold, self.nms_threshold, self.keep_top_k, self.top_k, self.confidence_threshold, ) = ( resize, - vis_threshold, + detection_threshold, nms_threshold, keep_top_k, top_k, @@ -175,11 +175,11 @@ def _calculate_boxinfo(self, im_height, im_width, loc, conf, landms, scale, img) # keep top-K faster NMS dets = dets[: self.keep_top_k, :] - # filter using vis_thres - rescale box size to be proportional to image size + # filter using detection_threshold - rescale box size to be proportional to image size scale_x, scale_y = (im_width / im_height, im_height / im_width) det_bboxes = [] for b in dets: - if b[4] > self.vis_threshold: + if b[4] > self.detection_threshold: xmin, ymin, xmax, ymax, score = b det_bboxes.append( [ diff --git a/feat/facepose_detectors/img2pose/img2pose_test.py b/feat/facepose_detectors/img2pose/img2pose_test.py index d915c234..3671b117 100644 --- a/feat/facepose_detectors/img2pose/img2pose_test.py +++ b/feat/facepose_detectors/img2pose/img2pose_test.py @@ -231,7 +231,7 @@ def predict(self, img, border_size=0, scale=1.0, euler=True): dof_pose = pose_pred[:] # pitch, roll, yaw, x, y, z dof_pose = dof_pose.reshape(1, -1) - det_pose.append(dof_pose) + det_pose.append(list(dof_pose.flatten())) return {"boxes": det_bboxes, "poses": det_pose} diff --git a/feat/tests/conftest.py b/feat/tests/conftest.py index bda0050f..69271eba 100644 --- a/feat/tests/conftest.py +++ b/feat/tests/conftest.py @@ -84,6 +84,11 @@ def default_detector(): return Detector() +@fixture(scope="module") +def no_face_img(data_path): + return os.path.join(data_path, "free-mountain-vector-01.jpg") + + @fixture(scope="module") def single_face_img(data_path): return os.path.join(data_path, "single_face.jpg") @@ -100,6 +105,21 @@ def single_face_mov(data_path): return os.path.join(data_path, "single_face.mp4") +@fixture(scope="module") +def no_face_mov(data_path): + return os.path.join(data_path, "no_face.mp4") + + +@fixture(scope="module") +def face_noface_mov(data_path): + return os.path.join(data_path, "face_noface.mov") + + +@fixture(scope="module") +def noface_face_mov(data_path): + return os.path.join(data_path, "noface_face.mov") + + @fixture(scope="module") def multi_face_img(data_path): return os.path.join(data_path, "multi_face.jpg") diff --git 
a/feat/tests/data/face_noface.mov b/feat/tests/data/face_noface.mov new file mode 100644 index 00000000..4122ce5d Binary files /dev/null and b/feat/tests/data/face_noface.mov differ diff --git a/feat/tests/data/no_face.mp4 b/feat/tests/data/no_face.mp4 new file mode 100644 index 00000000..67d85100 Binary files /dev/null and b/feat/tests/data/no_face.mp4 differ diff --git a/feat/tests/data/noface_face.mov b/feat/tests/data/noface_face.mov new file mode 100644 index 00000000..a0f529c8 Binary files /dev/null and b/feat/tests/data/noface_face.mov differ diff --git a/feat/tests/test_detector_core.py b/feat/tests/test_detector_core.py index 4dead6e5..72e763ce 100644 --- a/feat/tests/test_detector_core.py +++ b/feat/tests/test_detector_core.py @@ -31,6 +31,7 @@ def test_landmark_with_batches(multiple_images_for_batch_testing): # TODO: Currently making this test always pass even if batching gives slightly diff # results until @tiankang can debug whether we're in tolerance +# Track progress updates in this issue: https://github.com/cosanlab/py-feat/issues/128 def test_detection_and_batching_with_diff_img_sizes( single_face_img, multi_face_img, multiple_images_for_batch_testing ): @@ -108,6 +109,49 @@ def test_nofile(default_detector): _ = default_detector.detect_image(inputFname) +# No Face images +def test_detect_single_img_no_face(default_detector, no_face_img): + """Test detection of a single image with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image(no_face_img) + assert type(out) == Fex + assert out.shape == (1, 173) + assert np.isnan(out.happiness.values[0]) + + +def test_detect_multi_img_no_face(default_detector, no_face_img): + """Test detection of a multiple images with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image([no_face_img] * 3) + assert out.shape == (3, 173) + + +def test_detect_multi_img_no_face_batching(default_detector, no_face_img): + """Test detection of a multiple images with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image([no_face_img] * 5, batch_size=2) + assert out.shape == (5, 173) + + +def test_detect_multi_img_mixed_no_face( + default_detector, no_face_img, single_face_img, multi_face_img +): + """Test detection of a single image with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image( + [single_face_img, no_face_img, multi_face_img] * 2 + ) + assert out.shape == (14, 173) + + +def test_detect_multi_img_mixed_no_face_batching( + default_detector, no_face_img, single_face_img, multi_face_img +): + """Test detection of a single image with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image( + [single_face_img, no_face_img, multi_face_img] * 2, + batch_size=4, + output_size=300, + ) + assert out.shape == (14, 173) + + # Single images def test_detect_single_img_single_face(default_detector, single_face_img): """Test detection of single face from single image. 
Default detector returns 173 attributes""" @@ -151,13 +195,196 @@ def test_detect_mismatch_image_sizes(default_detector, single_face_img, multi_fa assert out.shape == (6, 173) out = default_detector.detect_image( - [multi_face_img, single_face_img] * 5, batch_size=5, output_size=256 + [multi_face_img, single_face_img] * 5, batch_size=5, output_size=512 ) assert out.shape == (30, 173) -def test_detect_video(default_detector, single_face_mov): +def test_detect_video( + default_detector, single_face_mov, no_face_mov, face_noface_mov, noface_face_mov +): """Test detection on video file""" out = default_detector.detect_video(single_face_mov, skip_frames=24) assert len(out) == 3 assert out.happiness.values.max() > 0 + + # Test no face movie + out = default_detector.detect_video(no_face_mov, skip_frames=24) + assert len(out) == 4 + # Empty detections are filled with NaNs + assert out.aus.isnull().all().all() + + # Test mixed movie, i.e. spliced vids of face -> noface and noface -> face + out = default_detector.detect_video(face_noface_mov, skip_frames=24) + assert len(out) == 3 + 4 + 1 + # first few frames have a face + assert not out.aus.iloc[:3].isnull().all().all() + # But the rest are from a diff video that doesn't + assert out.aus.iloc[3:].isnull().all().all() + + out = default_detector.detect_video(noface_face_mov, skip_frames=24) + assert len(out) == 3 + 4 + 1 + # beginning no face + assert out.aus.iloc[:4].isnull().all().all() + # middle frames have face + assert not out.aus.iloc[4:7].isnull().all().all() + # ending doesn't + assert out.aus.iloc[7:].isnull().all().all() + + +def test_detect_mismatch_face_pose(default_detector): + # Multiple Faces, 1 pose + faces = [ + [ + [ + 45.34465026855469, + 49.546714782714844, + 63.04056167602539, + 70.38599395751953, + 0.95337886, + ], + [ + 146.09866333007812, + 96.34442901611328, + 165.69561767578125, + 120.71611022949219, + 0.9069432, + ], + ] + ] + faces_pose = [[[46.0, 46.0, 66.0, 71.0, 0.99272925]]] + poses = [[[-3.72766398, 10.9359162, -3.19862351]]] + + new_faces, new_poses = default_detector._match_faces_to_poses( + faces, faces_pose, poses + ) + assert len(new_faces[0]) == len(new_poses[0]) + assert len(new_faces[0]) == 2 + + # 1 face, multiple poses + faces = [ + [ + [ + 45.34465026855469, + 49.546714782714844, + 63.04056167602539, + 70.38599395751953, + 0.95337886, + ] + ] + ] + + faces_pose = [ + [ + [65.0, 83.0, 87.0, 110.0, 0.99630725], + [141.0, 94.0, 167.0, 123.0, 0.9952237], + [111.0, 97.0, 136.0, 126.0, 0.99487805], + [91.0, 78.0, 109.0, 100.0, 0.99454665], + [46.0, 46.0, 66.0, 71.0, 0.99272925], + ] + ] + + poses = [ + [ + [-5.90236694, -2.81686444, -5.38250827], + [18.3324545, 7.2330487, 2.70649852], + [12.04520545, 5.91369713, 6.13698383], + [1.10688262, 1.56339815, -0.91693287], + [-3.72766398, 10.9359162, -3.19862351], + ] + ] + + new_faces, new_poses = default_detector._match_faces_to_poses( + faces, faces_pose, poses + ) + assert len(new_faces[0]) == 1 + assert len(new_poses[0]) == 1 + + # 2 Faces, 5 Poses + faces = [ + [ + [ + 45.34465026855469, + 49.546714782714844, + 63.04056167602539, + 70.38599395751953, + 0.95337886, + ], + [ + 146.09866333007812, + 96.34442901611328, + 165.69561767578125, + 120.71611022949219, + 0.9069432, + ], + ] + ] + + faces_pose = [ + [ + [65.0, 83.0, 87.0, 110.0, 0.99630725], + [141.0, 94.0, 167.0, 123.0, 0.9952237], + [111.0, 97.0, 136.0, 126.0, 0.99487805], + [91.0, 78.0, 109.0, 100.0, 0.99454665], + [46.0, 46.0, 66.0, 71.0, 0.99272925], + ] + ] + + poses = [ + [ + [-5.90236694, -2.81686444, 
-5.38250827], + [18.3324545, 7.2330487, 2.70649852], + [12.04520545, 5.91369713, 6.13698383], + [1.10688262, 1.56339815, -0.91693287], + [-3.72766398, 10.9359162, -3.19862351], + ] + ] + + new_faces, new_poses = default_detector._match_faces_to_poses( + faces, faces_pose, poses + ) + assert len(new_faces[0]) == len(new_poses[0]) + assert len(new_faces[0]) == 2 + + # 5 Faces, 2 Poses + faces = [ + [ + [65.0, 83.0, 87.0, 110.0, 0.99630725], + [141.0, 94.0, 167.0, 123.0, 0.9952237], + [111.0, 97.0, 136.0, 126.0, 0.99487805], + [91.0, 78.0, 109.0, 100.0, 0.99454665], + [46.0, 46.0, 66.0, 71.0, 0.99272925], + ] + ] + + faces_pose = [ + [ + [ + 45.34465026855469, + 49.546714782714844, + 63.04056167602539, + 70.38599395751953, + 0.95337886, + ], + [ + 146.09866333007812, + 96.34442901611328, + 165.69561767578125, + 120.71611022949219, + 0.9069432, + ], + ] + ] + + poses = [ + [ + [-5.90236694, -2.81686444, -5.38250827], + [18.3324545, 7.2330487, 2.70649852], + ] + ] + + new_faces, new_poses = default_detector._match_faces_to_poses( + faces, faces_pose, poses + ) + assert len(new_faces[0]) == len(new_poses[0]) + assert len(new_faces[0]) == 5 diff --git a/feat/tests/test_pretrained_models.py b/feat/tests/test_pretrained_models.py index 7777226b..72bf4e6f 100644 --- a/feat/tests/test_pretrained_models.py +++ b/feat/tests/test_pretrained_models.py @@ -202,7 +202,7 @@ def test_img2pose_facepose( default_detector.change_model(facepose_model="img2pose") poses = default_detector.detect_facepose(single_face_img_data) - assert np.allclose(poses, [0.86, -3.80, 6.60], atol=0.1) + assert np.allclose(poses["poses"], [0.86, -3.80, 6.60], atol=0.1) # Test DOF kwarg facepose_model_kwargs = {"RETURN_DIM": 6} @@ -215,10 +215,10 @@ def test_img2pose_facepose( # Also run directly poses = new_detector.detect_facepose(single_face_img_data) - assert len(poses[0][0].squeeze()) == 6 + assert len(poses["poses"][0][0]) == 6 def test_img2pose_c_facepose(self, default_detector, single_face_img_data): default_detector.change_model(facepose_model="img2pose-c") poses = default_detector.detect_facepose(single_face_img_data) - assert np.allclose(poses, [0.86, -3.80, 6.60], atol=0.1) + assert np.allclose(poses["poses"], [0.86, -3.80, 6.60], atol=0.1) diff --git a/feat/utils/__init__.py b/feat/utils/__init__.py index 60ea78ab..a03379af 100644 --- a/feat/utils/__init__.py +++ b/feat/utils/__init__.py @@ -151,3 +151,10 @@ def set_torch_device(device="cpu"): else: return device + + +# TODO: Refactor the output of each detector into a reliable dataclass with the same +# structure to avoid utility functions like this +def is_list_of_lists_empty(list_of_lists): + """Helper function to check if list of lists is empty""" + return not any(list_of_lists) diff --git a/feat/utils/image_operations.py b/feat/utils/image_operations.py index b2ce5c5e..1a278b3c 100644 --- a/feat/utils/image_operations.py +++ b/feat/utils/image_operations.py @@ -22,6 +22,8 @@ from copy import deepcopy from skimage import draw import logging +from matplotlib.patches import Rectangle +import matplotlib.pyplot as plt __all__ = [ "neutral", @@ -188,9 +190,10 @@ def extract_face_from_bbox(frame, detected_faces, face_size=112, expand_bbox=1.2 cropped_faces.append(transform(cropped)) bbox_list.append(bbox) - faces = torch.cat( - tuple([convert_image_to_tensor(x["Image"]) for x in cropped_faces]), 0 - ) + faces = torch.cat( + tuple([convert_image_to_tensor(x["Image"]) for x in cropped_faces]), 0 + ) + return (faces, bbox_list) @@ -469,6 +472,40 @@ def __init__( def 
__repr__(self): return f"'height': {self.height}, 'width': {self.width}" + def __mul__(self, bbox2): + """Create a new BBox based on the intersection between two BBox instances (AND operation)""" + + if isinstance(bbox2, (BBox)): + return BBox( + [ + np.max([self.left, bbox2.left]), + np.max([self.top, bbox2.top]), + np.min([self.right, bbox2.right]), + np.min([self.bottom, bbox2.bottom]), + ] + ) + else: + raise NotImplementedError( + "Multiplication is currently only supported between two BBox instances" + ) + + def __add__(self, bbox2): + """Create a new BBox based on the intersection between two BBox instances (OR Operation)""" + + if isinstance(bbox2, (BBox)): + return BBox( + [ + np.min([self.left, bbox2.left]), + np.min([self.top, bbox2.top]), + np.max([self.right, bbox2.right]), + np.max([self.bottom, bbox2.bottom]), + ] + ) + else: + raise NotImplementedError( + "Addition is currently only supported between two BBox instances" + ) + def expand_by_factor(self, factor, symmetric=True): """Expand box by factor @@ -625,6 +662,42 @@ def inverse_transform_landmark(self, landmark): landmark_[i] = (x, y) return landmark_ + def area(self): + """Compute the area of the bounding box""" + return self.height * self.width + + def overlap(self, bbox2): + """Compute the percent overlap between BBox with another BBox""" + overlap_bbox = self * bbox2 + if (overlap_bbox.height < 0) or (overlap_bbox.width < 0): + return 0 + else: + return (self * bbox2).area() / self.area() + + def plot(self, ax=None, fill=False, linewidth=2, **kwargs): + """Plot bounding box + + Args: + ax: matplotlib axis + fill (bool): fill rectangle + """ + + if ax is None: + fig, ax = plt.subplots() + ax.plot() + + ax.add_patch( + Rectangle( + (self.left, self.top), + self.width, + self.height, + fill=fill, + linewidth=linewidth, + **kwargs, + ) + ) + return ax + def reverse_color_order(img): """Convert BGR OpenCV image to RGB format""" @@ -724,7 +797,7 @@ def convert_to_euler(rotvec, is_rotvec=True): rotvec = Rotation.from_rotvec(rotvec).as_matrix() rot_mat_2 = np.transpose(rotvec) angle = Rotation.from_matrix(rot_mat_2).as_euler("xyz", degrees=True) - return np.array([angle[0], -angle[2], -angle[1]]) # pitch, roll, yaw + return [angle[0], -angle[2], -angle[1]] # pitch, roll, yaw def py_cpu_nms(dets, thresh): diff --git a/feat/version.py b/feat/version.py index 3d187266..dd9b22cc 100644 --- a/feat/version.py +++ b/feat/version.py @@ -1 +1 @@ -__version__ = "0.5.0" +__version__ = "0.5.1" diff --git a/requirements-dev.txt b/requirements-dev.txt index 57dae2b4..cdfab17f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,7 @@ pytest-cov coveralls pycodestyle black==22.3.0 -sphinx +sphinx<6 sphinx-rtd-theme sphinxcontrib-napoleon jupyter-book
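A note on the new `is_list_of_lists_empty` helper added to `feat/utils/__init__.py` and used throughout `detector.py` above: every detector returns one inner list per frame, so a batch in which no frame has any detection is a list of empty lists, which `any()` treats as all-falsy. A minimal sketch of the intended semantics:

from feat.utils import is_list_of_lists_empty

assert is_list_of_lists_empty([[], []])                          # two frames, no detections at all
assert not is_list_of_lists_empty([[], [[0, 0, 10, 10, 0.99]]])  # one frame has a face
assert is_list_of_lists_empty([])                                # degenerate empty batch is also "empty"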
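The face-to-pose matching in `_match_faces_to_poses` leans on the new `BBox.__mul__` (intersection), `area`, and `overlap` methods. Note that `overlap` is normalized by the area of the calling box rather than by the union, so it is not IoU and is asymmetric. A small sketch, assuming `BBox` accepts a `[left, top, right, bottom]` list as implied by the `__mul__` implementation above:

from feat.utils.image_operations import BBox

b1 = BBox([0, 0, 10, 10])   # area 100
b2 = BBox([5, 5, 20, 20])   # area 225

inter = b1 * b2             # intersection: BBox([5, 5, 10, 10]), area 25
enclosing = b1 + b2         # enclosing box ("OR"): BBox([0, 0, 20, 20])

print(b1.overlap(b2))       # 25 / 100 = 0.25
print(b2.overlap(b1))       # 25 / 225 ~= 0.11 (asymmetric by design)
print(BBox([0, 0, 1, 1]).overlap(b2))  # 0, the boxes do not intersect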
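Finally, a usage sketch of the behavior these changes enable, mirroring the new tests in `test_detector_core.py` (the file names below are placeholders): images or frames with zero detected faces now come back as all-NaN rows instead of raising.

import numpy as np
from feat import Detector

detector = Detector()

# A single image with no face yields one all-NaN row (173 columns with the default models)
out = detector.detect_image("no_face.jpg", face_detection_threshold=0.5)
assert out.shape == (1, 173)
assert np.isnan(out.happiness.values[0])

# Videos are handled the same way, frame by frame
vid_out = detector.detect_video("face_noface.mov", skip_frames=24)
n_empty = int(vid_out.aus.isnull().all(axis=1).sum())
print(f"{n_empty} sampled frames contained no detectable face")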