diff --git a/README.md b/README.md index 24796d3e..266d886d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # Py-FEAT: Python Facial Expression Analysis Toolbox +[![arXiv-badge](https://img.shields.io/badge/arXiv-2104.03509-red.svg)](https://arxiv.org/abs/2104.03509) [![Package versioning](https://img.shields.io/pypi/v/py-feat.svg)](https://pypi.org/project/py-feat/) [![Tests](https://github.com/cosanlab/py-feat/actions/workflows/tests_and_docs.yml/badge.svg)](https://github.com/cosanlab/py-feat/actions/workflows/tests_and_docs.yml) [![Coverage Status](https://coveralls.io/repos/github/cosanlab/py-feat/badge.svg?branch=master)](https://coveralls.io/github/cosanlab/py-feat?branch=master) diff --git a/docs/pages/changelog.md b/docs/pages/changelog.md index 334ad3c5..eebc6436 100644 --- a/docs/pages/changelog.md +++ b/docs/pages/changelog.md @@ -1,5 +1,16 @@ # Change Log +# 0.5.1 + +## Notes + +This is a maintenance release that addresses multiple under-the-hood issues with `py-feat` failing when images or videos contain 0 faces. It addresses the following specific issues amongst others and is recommended for all users: + +- [#153](https://github.com/cosanlab/py-feat/issues/153) +- [#155](https://github.com/cosanlab/py-feat/issues/155) +- [#158](https://github.com/cosanlab/py-feat/issues/158) +- [#160](https://github.com/cosanlab/py-feat/issues/160) + # 0.5.0 ## Notes diff --git a/docs/pages/intro.md b/docs/pages/intro.md index 46805a8b..ca752d5b 100644 --- a/docs/pages/intro.md +++ b/docs/pages/intro.md @@ -1,5 +1,6 @@ Py-Feat: Python Facial Expression Analysis Toolbox ============================ +[![arXiv-badge](https://img.shields.io/badge/arXiv-2104.03509-red.svg)](https://arxiv.org/abs/2104.03509) [![Package versioning](https://img.shields.io/pypi/v/py-feat.svg)](https://pypi.org/project/py-feat/) [![Tests](https://github.com/cosanlab/py-feat/actions/workflows/tests_and_docs.yml/badge.svg)](https://github.com/cosanlab/py-feat/actions/workflows/tests_and_docs.yml) [![Coverage Status](https://coveralls.io/repos/github/cosanlab/py-feat/badge.svg?branch=master)](https://coveralls.io/github/cosanlab/py-feat?branch=master) diff --git a/feat/data.py b/feat/data.py index 3c01cbd2..6040ae56 100644 --- a/feat/data.py +++ b/feat/data.py @@ -1974,19 +1974,21 @@ def _inverse_face_transform(faces, batch_data): out_face = [] for face in frame: out_face.append( - np.append( - ( - np.array( - [ - face[0] - left, - face[1] - top, - face[2] - left, - face[3] - top, - ] - ) - / scale - ), - face[4], + list( + np.append( + ( + np.array( + [ + face[0] - left, + face[1] - top, + face[2] - left, + face[3] - top, + ] + ) + / scale + ), + face[4], + ) ) ) out_frame.append(out_face) diff --git a/feat/detector.py b/feat/detector.py index 86e38cd7..b490eaf8 100644 --- a/feat/detector.py +++ b/feat/detector.py @@ -16,12 +16,14 @@ FEAT_FACEPOSE_COLUMNS_6D, FEAT_TIME_COLUMNS, set_torch_device, + is_list_of_lists_empty, ) from feat.utils.io import get_resource_path from feat.utils.image_operations import ( extract_face_from_landmarks, extract_face_from_bbox, convert_image_to_tensor, + BBox, ) from feat.pretrained import get_pretrained_models, fetch_model, AU_LANDMARK_MAP from feat.data import ( @@ -358,6 +360,7 @@ def detect_faces(self, frame, threshold=0.5, **face_model_kwargs): Args: frame (np.ndarray): 3d (single) or 4d (multiple) image array + threshold (float): threshold for detectiong faces (default=0.5) Returns: list: list of lists with the same length as the number of frames. 
Each list @@ -382,7 +385,7 @@ def detect_faces(self, frame, threshold=0.5, **face_model_kwargs): else: faces = self.face_detector(frame, **face_model_kwargs) - if len(faces) == 0: + if is_list_of_lists_empty(faces): logging.warning("Warning: NO FACE is detected") thresholded_face = [] @@ -417,51 +420,55 @@ def detect_landmarks(self, frame, detected_faces, **landmark_model_kwargs): logging.info("detecting landmarks...") frame = convert_image_to_tensor(frame) - if self.info["landmark_model"]: - if self.info["landmark_model"].lower() == "mobilenet": - out_size = 224 - else: - out_size = 112 + if is_list_of_lists_empty(detected_faces): + list_concat = detected_faces + else: + if self.info["landmark_model"]: + if self.info["landmark_model"].lower() == "mobilenet": - extracted_faces, new_bbox = extract_face_from_bbox( - frame, detected_faces, face_size=out_size - ) + out_size = 224 + else: + out_size = 112 - extracted_faces = extracted_faces / 255.0 + extracted_faces, new_bbox = extract_face_from_bbox( + frame, detected_faces, face_size=out_size + ) - if self.info["landmark_model"].lower() == "mobilenet": - extracted_faces = Compose( - [Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])] - )(extracted_faces) + extracted_faces = extracted_faces / 255.0 - # Run Landmark Model - if self.info["landmark_model"].lower() == "mobilefacenet": - landmark = ( - self.landmark_detector(extracted_faces, **landmark_model_kwargs)[0] - .cpu() - .data.numpy() - ) - else: - landmark = ( - self.landmark_detector(extracted_faces, **landmark_model_kwargs) - .cpu() - .data.numpy() - ) + if self.info["landmark_model"].lower() == "mobilenet": + extracted_faces = Compose( + [Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])] + )(extracted_faces) + + # Run Landmark Model + if self.info["landmark_model"].lower() == "mobilefacenet": + landmark = ( + self.landmark_detector(extracted_faces, **landmark_model_kwargs)[0] + .cpu() + .data.numpy() + ) + else: + landmark = ( + self.landmark_detector(extracted_faces, **landmark_model_kwargs) + .cpu() + .data.numpy() + ) - landmark = landmark.reshape(landmark.shape[0], -1, 2) + landmark = landmark.reshape(landmark.shape[0], -1, 2) - landmark_results = [] - for ik in range(landmark.shape[0]): + landmark_results = [] + for ik in range(landmark.shape[0]): - landmark_results.append( - new_bbox[ik].inverse_transform_landmark(landmark[ik, :, :]) - ) + landmark_results.append( + new_bbox[ik].inverse_transform_landmark(landmark[ik, :, :]) + ) - length_index = [len(x) for x in detected_faces] - new_lens = np.insert(np.cumsum(length_index), 0, 0) - list_concat = [] - for ij in range(len(length_index)): - list_concat.append(landmark_results[new_lens[ij] : new_lens[ij + 1]]) + length_index = [len(x) for x in detected_faces] + new_lens = np.insert(np.cumsum(length_index), 0, 0) + list_concat = [] + for ij in range(len(length_index)): + list_concat.append(landmark_results[new_lens[ij] : new_lens[ij + 1]]) return list_concat @@ -488,12 +495,17 @@ def detect_facepose(self, frame, landmarks=None, **facepose_model_kwargs): # Normalize Data frame = convert_image_to_tensor(frame, img_type="float32") / 255 + output = {} if "img2pose" in self.info["facepose_model"]: faces, poses = self.facepose_detector(frame, **facepose_model_kwargs) + output["faces"] = faces + output["poses"] = poses else: - poses = self.facepose_detector(frame, landmarks, **facepose_model_kwargs) + output["poses"] = self.facepose_detector( + frame, landmarks, **facepose_model_kwargs + ) - return poses + return output def 
detect_aus(self, frame, landmarks, **au_model_kwargs): """Detect Action Units from image or video frame @@ -515,66 +527,73 @@ def detect_aus(self, frame, landmarks, **au_model_kwargs): logging.info("detecting aus...") frame = convert_image_to_tensor(frame, img_type="float32") - # frame = transforms.ToTensor()(frame) - - if self["au_model"].lower() in ["svm", "xgb"]: - # transform = Grayscale(3) - # frame = transform(frame) - hog_arr, new_lands = self._batch_hog(frames=frame, landmarks=landmarks) - au_predictions = self.au_model.detect_au( - frame=hog_arr, landmarks=new_lands, **au_model_kwargs - ) + + if is_list_of_lists_empty(landmarks): + return landmarks else: - au_predictions = self.au_model.detect_au( - frame, landmarks=landmarks, **au_model_kwargs - ) + if self["au_model"].lower() in ["svm", "xgb"]: + # transform = Grayscale(3) + # frame = transform(frame) + hog_features, new_landmarks = self._batch_hog( + frames=frame, landmarks=landmarks + ) + au_predictions = self.au_model.detect_au( + frame=hog_features, landmarks=new_landmarks, **au_model_kwargs + ) + else: + au_predictions = self.au_model.detect_au( + frame, landmarks=landmarks, **au_model_kwargs + ) - return self._convert_detector_output(landmarks, au_predictions) + return self._convert_detector_output(landmarks, au_predictions) def _batch_hog(self, frames, landmarks): """ Helper function used in batch processing hog features - frames is a batch of frames - """ - len_index = [len(aa) for aa in landmarks] - lenth_cumu = np.cumsum(len_index) - lenth_cumu2 = np.insert(lenth_cumu, 0, 0) - new_lands_list = [] - flat_land = [item for sublist in landmarks for item in sublist] - hogs_arr = None + Args: + frames: a batch of frames + landmarks: a list of list of detected landmarks - for i in range(len(flat_land)): + Returns: + hog_features: a numpy array of hog features for each detected landmark + landmarks: updated landmarks + """ - frame_assignment = np.where(i < lenth_cumu)[0][0] + hog_features = [] + new_landmark_frames = [] + for i, frame_landmark in enumerate(landmarks): + if len(frame_landmark) != 0: + new_landmarks_faces = [] + for j in range(len(frame_landmark)): + convex_hull, new_landmark = extract_face_from_landmarks( + frame=frames[i], + landmarks=frame_landmark[j], + face_size=112, + ) - convex_hull, new_lands = extract_face_from_landmarks( - frame=frames[frame_assignment], - landmarks=flat_land[i], - face_size=112, - ) + hog_features.append( + hog( + transforms.ToPILImage()(convex_hull[0]), + orientations=8, + pixels_per_cell=(8, 8), + cells_per_block=(2, 2), + visualize=False, + channel_axis=-1, + ).reshape(1, -1) + ) - hogs = hog( - transforms.ToPILImage()(convex_hull[0]), - orientations=8, - pixels_per_cell=(8, 8), - cells_per_block=(2, 2), - visualize=False, - channel_axis=-1, - ).reshape(1, -1) - - if hogs_arr is None: - hogs_arr = hogs + new_landmarks_faces.append(new_landmark) + new_landmark_frames.append(new_landmarks_faces) else: - hogs_arr = np.concatenate([hogs_arr, hogs], 0) - - new_lands_list.append(new_lands) + hog_features.append( + np.zeros((1, 5408)) + ) # LC: Need to confirm this size is fixed. 
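# Note on the np.zeros((1, 5408)) placeholder above: for a 112 x 112 face crop, HOG with
# orientations=8, pixels_per_cell=(8, 8), and cells_per_block=(2, 2) produces 13 x 13
# blocks * (2 * 2 * 8) features per block = 5408, so the size is fixed as long as
# face_size=112 and these HOG settings stay unchanged. A minimal sanity check, assuming
# scikit-image is installed (the dummy crop below is only an illustration):
import numpy as np
from skimage.feature import hog

dummy_crop = np.zeros((112, 112, 3), dtype=np.uint8)  # stand-in for one extracted face
features = hog(
    dummy_crop,
    orientations=8,
    pixels_per_cell=(8, 8),
    cells_per_block=(2, 2),
    visualize=False,
    channel_axis=-1,
)
assert features.size == 5408  # matches the shape used to pad frames with no faces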
+ new_landmark_frames.append([np.zeros((68, 2))]) - new_lands = [] - for i in range(len(lenth_cumu)): - new_lands.append(new_lands_list[lenth_cumu2[i] : (lenth_cumu2[i + 1])]) + hog_features = np.concatenate(hog_features) - return (hogs_arr, new_lands) + return (hog_features, new_landmark_frames) def detect_emotions(self, frame, facebox, landmarks, **emotion_model_kwargs): """Detect emotions from image or video frame @@ -600,42 +619,89 @@ def detect_emotions(self, frame, facebox, landmarks, **emotion_model_kwargs): logging.info("detecting emotions...") frame = convert_image_to_tensor(frame, img_type="float32") - if self.info["emotion_model"].lower() == "resmasknet": - return self._convert_detector_output( - facebox, - self.emotion_model.detect_emo(frame, facebox, **emotion_model_kwargs), - ) + if is_list_of_lists_empty(facebox): + return facebox + else: + if self.info["emotion_model"].lower() == "resmasknet": + return self._convert_detector_output( + facebox, + self.emotion_model.detect_emo( + frame, facebox, **emotion_model_kwargs + ), + ) - elif self.info["emotion_model"].lower() == "svm": - hog_arr, new_lands = self._batch_hog(frames=frame, landmarks=landmarks) - return self._convert_detector_output( - landmarks, - self.emotion_model.detect_emo( - frame=hog_arr, landmarks=new_lands, **emotion_model_kwargs - ), - ) + elif self.info["emotion_model"].lower() == "svm": + hog_features, new_landmarks = self._batch_hog( + frames=frame, landmarks=landmarks + ) + return self._convert_detector_output( + landmarks, + self.emotion_model.detect_emo( + frame=hog_features, + landmarks=new_landmarks, + **emotion_model_kwargs, + ), + ) - else: - raise ValueError( - "Cannot recognize input emo model! Please try to re-type emotion model" - ) + else: + raise ValueError( + "Cannot recognize input emo model! Please try to re-type emotion model" + ) - def _check_detections(self, faces, landmarks, poses, aus, emotions, batch_data): + def _run_detection_waterfall( + self, + batch_data, + face_detection_threshold, + face_model_kwargs, + landmark_model_kwargs, + facepose_model_kwargs, + emotion_model_kwargs, + au_model_kwargs, + ): """ - Private method to ensure that all detectors return the same number of detections + Main detection "waterfall." Calls each individual detector in the sequence + required to support any interactions between detections. 
Called + behind-the-scenes by .detect_image() and .detect_video() + + Args: + batch_data (dict): singleton item from iterating over the output of a DataLoader + face_detection_threshold (float): value between 0-1 + face_model_kwargs (dict): face model kwargs + landmark_model_kwargs (dict): landmark model kwargs + facepose_model_kwargs (dict): facepose model kwargs + emotion_model_kwargs (dict): emotion model kwargs + au_model_kwargs (dict): au model kwargs + + Returns: + tuple: faces, landmarks, poses, aus, emotions """ + faces = self.detect_faces( + batch_data["Image"], + threshold=face_detection_threshold, + **face_model_kwargs, + ) - # Each input arg is a nested list with length == number of faces in the batch + landmarks = self.detect_landmarks( + batch_data["Image"], + detected_faces=faces, + **landmark_model_kwargs, + ) + poses_dict = self.detect_facepose( + batch_data["Image"], landmarks, **facepose_model_kwargs + ) + aus = self.detect_aus(batch_data["Image"], landmarks, **au_model_kwargs) + emotions = self.detect_emotions( + batch_data["Image"], faces, landmarks, **emotion_model_kwargs + ) - # Check 1) img2pose sometimes gives fewer detections that other models, we can't - # properly assemble Fex when that's the case. Returning more or the same - # detections is ok for now - if len(poses[0]) >= len(faces[0]): - return + faces = _inverse_face_transform(faces, batch_data) + landmarks = _inverse_landmark_transform(landmarks, batch_data) - raise ValueError( - f"Mismatch across detectors when processing batch: {batch_data['FileNames']}\n\nAn error occurred trying to merge detections into a single Fex object, as each type of detector is detecting a different number of faces:\n\nface_detector: {len(faces[0])}\npose_detector: {len(poses[0])}\nlandmark_detector: {len(landmarks[0])}\nau_detector: {len(aus[0])}\nemotion_detector: {len(emotions[0])}\n\nThis can happen for a number of reasons. Here are a few solutions:\n\n1) the face_model is too liberal. You use the 'threshold' keyword argument to make the detector more conservative, e.g. threshold= some val > 0.5\n2) the pose_detector gives different predictions than other detectors. You can use the same model for both pose and face detection by setting face_model='img2pose' and pose_model='img2pose' (or 'img2pose-c')" + # match faces to poses - sometimes face detector finds different faces than pose detector. + faces, poses = self._match_faces_to_poses( + faces, poses_dict["faces"], poses_dict["poses"] ) + return faces, landmarks, poses, aus, emotions def detect_image( self, @@ -645,8 +711,7 @@ def detect_image( num_workers=0, pin_memory=False, frame_counter=0, - skip_failed_detections=False, - threshold=0.5, + face_detection_threshold=0.5, **kwargs, ): """ @@ -661,17 +726,17 @@ def detect_image( Args: input_file_list (list of str): Path to a list of paths to image files. output_size (int): image size to rescale all image preserving aspect ratio. - Will raise an error if not set and batch_size > 1 but images are not the same size + Will raise an error if not set and batch_size > 1 but images are not the same size batch_size (int): how many batches of images you want to run at one shot. - Larger gives faster speed but is more memory-consuming. Images must be the + Larger gives faster speed but is more memory-consuming. Images must be the same size to be run in batches! num_workers (int): how many subprocesses to use for data loading. ``0`` means that the data will be loaded in the main process. 
pin_memory (bool): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them. If your data elements are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type frame_counter (int): starting value to count frames - threshold (float): value between 0-1 to report a detection based on the - confidence of the face detector; Default >= 0.5 + face_detection_threshold (float): value between 0-1 to report a detection based on the + confidence of the face detector; Default >= 0.5 **kwargs: you can pass each detector specific kwargs using a dictionary - like: `face_model_kwargs = {...}, au_model_kwargs={...}, ...` + like: `face_model_kwargs = {...}, au_model_kwargs={...}, ...` Returns: Fex: Prediction results dataframe @@ -701,47 +766,34 @@ def detect_image( warnings.warn( "Currently using mobilenet for landmark detection with batch_size > 1 may lead to erroneous detections. We recommend either setting batch_size=1 or using mobilefacenet as the landmark detection model. You can follow this issue for more: https://github.com/cosanlab/py-feat/issues/151" ) - try: + try: batch_output = [] + for batch_id, batch_data in enumerate(tqdm(data_loader)): - frame_counter += frame_counter + batch_id * batch_size - faces = self.detect_faces( - batch_data["Image"], threshold=threshold, **face_model_kwargs - ) - landmarks = self.detect_landmarks( - batch_data["Image"], detected_faces=faces, **landmark_model_kwargs - ) - poses = self.detect_facepose( - batch_data["Image"], landmarks, **facepose_model_kwargs + + faces, landmarks, poses, aus, emotions = self._run_detection_waterfall( + batch_data, + face_detection_threshold, + face_model_kwargs, + landmark_model_kwargs, + facepose_model_kwargs, + emotion_model_kwargs, + au_model_kwargs, ) - aus = self.detect_aus(batch_data["Image"], landmarks, **au_model_kwargs) - emotions = self.detect_emotions( - batch_data["Image"], faces, landmarks, **emotion_model_kwargs + + output = self._create_fex( + faces, + landmarks, + poses, + aus, + emotions, + batch_data["FileNames"], + frame_counter, ) + batch_output.append(output) + frame_counter += 1 * batch_size - faces = _inverse_face_transform(faces, batch_data) - landmarks = _inverse_landmark_transform(landmarks, batch_data) - try: - self._check_detections( - faces, landmarks, poses, aus, emotions, batch_data - ) - output = self._create_fex( - faces, - landmarks, - poses, - aus, - emotions, - batch_data["FileNames"], - frame_counter, - ) - batch_output.append(output) - except ValueError as e: - if skip_failed_detections: - print(e) - continue - else: - raise e batch_output = pd.concat(batch_output) batch_output.reset_index(drop=True, inplace=True) @@ -759,7 +811,8 @@ def detect_video( batch_size=1, num_workers=0, pin_memory=False, - **detector_kwargs, + face_detection_threshold=0.5, + **kwargs, ): """Detects FEX from a video file. @@ -773,11 +826,20 @@ def detect_video( pin_memory (bool): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them. 
If your data elements are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type + face_detection_threshold (float): value between 0-1 to report a detection based on the + confidence of the face detector; Default >= 0.5 Returns: Fex: Prediction results dataframe """ + # Keyword arguments than can be passed to the underlying models + face_model_kwargs = kwargs.pop("face_model_kwargs", dict()) + landmark_model_kwargs = kwargs.pop("landmark_model_kwargs", dict()) + au_model_kwargs = kwargs.pop("au_model_kwargs", dict()) + emotion_model_kwargs = kwargs.pop("emotion_model_kwargs", dict()) + facepose_model_kwargs = kwargs.pop("facepose_model_kwargs", dict()) + data_loader = DataLoader( VideoDataset(video_path, skip_frames=skip_frames, output_size=output_size), num_workers=num_workers, @@ -787,48 +849,37 @@ def detect_video( ) batch_output = [] + for batch_data in tqdm(data_loader): - faces = self.detect_faces(batch_data["Image"], **detector_kwargs) - landmarks = self.detect_landmarks( - batch_data["Image"], detected_faces=faces, **detector_kwargs - ) - poses = self.detect_facepose(batch_data["Image"], **detector_kwargs) - aus = self.detect_aus(batch_data["Image"], landmarks, **detector_kwargs) - emotions = self.detect_emotions( - batch_data["Image"], faces, landmarks, **detector_kwargs + + faces, landmarks, poses, aus, emotions = self._run_detection_waterfall( + batch_data, + face_detection_threshold, + face_model_kwargs, + landmark_model_kwargs, + facepose_model_kwargs, + emotion_model_kwargs, + au_model_kwargs, ) + frames = list(batch_data["Frame"].numpy()) - landmarks = _inverse_landmark_transform(landmarks, batch_data) + output = self._create_fex( - faces, landmarks, poses, aus, emotions, batch_data["FileName"], frames + faces, + landmarks, + poses, + aus, + emotions, + batch_data["FileName"], + frames, ) + batch_output.append(output) batch_output = pd.concat(batch_output) batch_output.reset_index(drop=True, inplace=True) return batch_output.set_index("frame", drop=False) - def _convert_detector_output(detected_faces, detector_results): - """Helper function to convert AU/Emotion detector output into frame by face list of lists. - - Args: - detected_faces (list): list of lists output from face/landmark detector - au_results (np.array):, results from au/emotion detectors - - Returns: - list_concat: (list of list). The list which contains the number of faces. 
for example - if you process 2 frames and each frame contains 4 faces, it will return: - [[xxx,xxx,xxx,xxx],[xxx,xxx,xxx,xxx]] - """ - - length_index = [len(x) for x in detected_faces] - - list_concat = [] - new_lens = np.insert(np.cumsum(length_index), 0, 0) - for ij in range(len(length_index)): - list_concat.append(detector_results[new_lens[ij] : new_lens[ij + 1], :]) - return list_concat - def _create_fex( self, faces, landmarks, poses, aus, emotions, file_names, frame_counter ): @@ -848,11 +899,53 @@ def _create_fex( """ logging.info("creating fex output...") - files = [[f] * n for f, n in zip(file_names, [len(x) for x in faces])] - # Convert to Pandas Format out = [] for i, frame in enumerate(faces): + if not frame: + facebox_df = pd.DataFrame( + {x: np.nan for x in self.info["face_detection_columns"]}, + columns=self.info["face_detection_columns"], + index=[i], + ) + facepose_df = pd.DataFrame( + {x: np.nan for x in self.info["facepose_model_columns"]}, + columns=self.info["facepose_model_columns"], + index=[i], + ) + landmarks_df = pd.DataFrame( + {x: np.nan for x in self.info["face_landmark_columns"]}, + columns=self.info["face_landmark_columns"], + index=[i], + ) + aus_df = pd.DataFrame( + {x: np.nan for x in self.info["au_presence_columns"]}, + columns=self.info["au_presence_columns"], + index=[i], + ) + emotions_df = pd.DataFrame( + {x: np.nan for x in self.info["emotion_model_columns"]}, + columns=self.info["emotion_model_columns"], + index=[i], + ) + input_df = pd.DataFrame(file_names[i], columns=["input"], index=[i]) + tmp_df = pd.concat( + [ + facebox_df, + landmarks_df, + facepose_df, + aus_df, + emotions_df, + input_df, + ], + axis=1, + ) + if isinstance(frame_counter, (list)): + tmp_df[FEAT_TIME_COLUMNS] = frame_counter[i] + else: + tmp_df[FEAT_TIME_COLUMNS] = frame_counter + i + out.append(tmp_df) + for j, face_in_frame in enumerate(frame): facebox_df = pd.DataFrame( [ @@ -869,7 +962,7 @@ def _create_fex( ) facepose_df = pd.DataFrame( - [poses[i][j].flatten(order="F")], + [poses[i][j]], columns=self.info["facepose_model_columns"], index=[j], ) @@ -887,13 +980,15 @@ def _create_fex( ) emotions_df = pd.DataFrame( - emotions[i][j, :].reshape(1, len(FEAT_EMOTION_COLUMNS)), - columns=FEAT_EMOTION_COLUMNS, + emotions[i][j, :].reshape( + 1, len(self.info["emotion_model_columns"]) + ), + columns=self.info["emotion_model_columns"], index=[j], ) input_df = pd.DataFrame( - files[i][j], + file_names[i], columns=["input"], index=[j], ) @@ -915,6 +1010,7 @@ def _create_fex( else: tmp_df[FEAT_TIME_COLUMNS] = frame_counter + i out.append(tmp_df) + out = pd.concat(out) out.reset_index(drop=True, inplace=True) @@ -922,9 +1018,9 @@ def _create_fex( return Fex( out, au_columns=self.info["au_presence_columns"], - emotion_columns=FEAT_EMOTION_COLUMNS, - facebox_columns=FEAT_FACEBOX_COLUMNS, - landmark_columns=openface_2d_landmark_columns, + emotion_columns=self.info["emotion_model_columns"], + facebox_columns=self.info["face_detection_columns"], + landmark_columns=self.info["face_landmark_columns"], facepose_columns=self.info["facepose_model_columns"], detector="Feat", face_model=self.info["face_model"], @@ -934,7 +1030,8 @@ def _create_fex( facepose_model=self.info["facepose_model"], ) - def _convert_detector_output(self, detected_faces, detector_results): + @staticmethod + def _convert_detector_output(detected_faces, detector_results): """ Helper function to convert AU/Emotion detector output into frame by face list of lists. 
Either face or landmark detector list of list outputs can be used. @@ -956,3 +1053,136 @@ def _convert_detector_output(self, detected_faces, detector_results): for ij in range(len(length_index)): list_concat.append(detector_results[new_lens[ij] : new_lens[ij + 1], :]) return list_concat + + @staticmethod + def _match_faces_to_poses(faces, faces_pose, poses): + """Helper function to match list of lists of faces and poses based on overlap in bounding boxes. + + Sometimes the face detector finds different faces than the pose detector unless the user + is using the same detector (i.e., img2pose). + + This function will match the faces and poses and will return nans if more faces are detected then poses. + Will only return poses that match faces even if more faces are detected by pose detector. + + Args: + faces (list): list of lists of face bounding boxes from face detector + faces_pose (list): list of lists of face bounding boxes from pose detector + poses (list): list of lists of poses from pose detector + + Returns: + faces (list): list of list of faces that have been matched to poses + poses (list): list of list of poses that have been matched to faces + """ + + if len(faces) != len(faces_pose): + raise ValueError( + "Make sure the number of batches in faces and poses is the same." + ) + + if is_list_of_lists_empty(faces): + # Currently assuming no faces if no face is detected. Not running pose + return (faces, poses) + + else: + + overlap_faces = [] + overlap_poses = [] + for frame_face, frame_face_pose, frame_pose in zip( + faces, faces_pose, poses + ): + if not frame_face: + n_faces = 0 + elif isinstance(frame_face[0], list): + n_faces = len(frame_face) + else: + n_faces = 1 + + if not frame_face_pose: + n_poses = 0 + elif isinstance(frame_face_pose[0], list): + n_poses = len(frame_face_pose) + else: + n_poses = 1 + + frame_overlap = np.zeros([n_faces, n_poses]) + + if n_faces == 0: + overlap_faces.append([]) + overlap_poses.append([]) + + elif (n_faces == 1) & (n_poses > 1): + b1 = BBox(frame_face[0][:-1]) + + for pose_idx in range(n_poses): + b2 = BBox(frame_face_pose[pose_idx][:-1]) + frame_overlap[0, pose_idx] = b1.overlap(b2) + matched_pose_index = np.where( + frame_overlap[0, :] == frame_overlap[0, :].max() + )[0][0] + overlap_faces.append(frame_face) + overlap_poses.append([frame_pose[matched_pose_index]]) + + elif (n_faces > 1) & (n_poses == 1): + b2 = BBox(frame_face_pose[0][:-1]) + for face_idx in range(n_faces): + b1 = BBox(frame_face[face_idx][:-1]) + frame_overlap[face_idx, 0] = b1.overlap(b2) + matched_face_index = np.where( + frame_overlap[:, 0] == frame_overlap[:, 0].max() + )[0][0] + new_poses = [] + for f_idx in range(n_faces): + if f_idx == matched_face_index: + new_poses.append(frame_pose[0]) + else: + new_poses.append(np.ones(3) * np.nan) + overlap_faces.append(frame_face) + overlap_poses.append(new_poses) + + else: + for face_idx in range(n_faces): + b1 = BBox(frame_face[face_idx][:-1]) + for pose_idx in range(n_poses): + b2 = BBox(frame_face_pose[pose_idx][:-1]) + frame_overlap[face_idx, pose_idx] = b1.overlap(b2) + + overlap_faces_frame = [] + overlap_poses_frame = [] + if n_faces < n_poses: + for face_idx in range(n_faces): + pose_idx = np.where( + frame_overlap[face_idx, :] + == frame_overlap[face_idx, :].max() + )[0][0] + overlap_faces_frame.append(frame_face[face_idx]) + overlap_poses_frame.append(frame_pose[pose_idx]) + elif n_faces > n_poses: + matched_pose_index = [] + for pose_idx in range(n_poses): + matched_pose_index.append( + np.where( + 
frame_overlap[:, pose_idx] + == frame_overlap[:, pose_idx].max() + )[0][0] + ) + for face_idx in range(n_faces): + overlap_faces_frame.append(frame_face[face_idx]) + if face_idx in matched_pose_index: + overlap_poses_frame.append( + frame_pose[ + np.where( + frame_overlap[face_idx, :] + == frame_overlap[face_idx, :].max() + )[0][0] + ] + ) + else: + overlap_poses_frame.append(np.ones(3) * np.nan) + elif n_faces == n_poses: + overlap_faces_frame = frame_face + overlap_poses_frame = frame_pose + + overlap_faces.append(overlap_faces_frame) + overlap_poses.append(overlap_poses_frame) + + return (overlap_faces, overlap_poses) diff --git a/feat/face_detectors/FaceBoxes/FaceBoxes_test.py b/feat/face_detectors/FaceBoxes/FaceBoxes_test.py index 7a96f982..1bcfede2 100644 --- a/feat/face_detectors/FaceBoxes/FaceBoxes_test.py +++ b/feat/face_detectors/FaceBoxes/FaceBoxes_test.py @@ -52,7 +52,7 @@ def __init__( top_k=5000, keep_top_k=750, nms_threshold=0.3, - vis_threshold=0.5, + detection_threshold=0.5, resize=1, device="auto", ): @@ -82,14 +82,14 @@ def __init__( self.top_k, self.keep_top_k, self.nms_threshold, - self.vis_threshold, + self.detection_threshold, self.resize, ) = ( confidence_threshold, top_k, keep_top_k, nms_threshold, - vis_threshold, + detection_threshold, resize, ) @@ -148,11 +148,11 @@ def _calculate_boxinfo(self, im_height, im_width, loc, conf, scale): # keep top-K faster NMS dets = dets[: self.keep_top_k, :] - # filter using vis_thres - rescale box size to be proportional to image size + # filter using detection_threshold - rescale box size to be proportional to image size scale_x, scale_y = (im_width / im_height, im_height / im_width) det_bboxes = [] for b in dets: - if b[4] > self.vis_threshold: + if b[4] > self.detection_threshold: xmin, ymin, xmax, ymax, score = b det_bboxes.append( [ diff --git a/feat/face_detectors/MTCNN/MTCNN_test.py b/feat/face_detectors/MTCNN/MTCNN_test.py index 113e88fb..f67be176 100644 --- a/feat/face_detectors/MTCNN/MTCNN_test.py +++ b/feat/face_detectors/MTCNN/MTCNN_test.py @@ -32,6 +32,7 @@ class MTCNN(nn.Module): (default: {0}) min_face_size {int} -- Minimum face size to search for. (default: {20}) thresholds {list} -- MTCNN face detection thresholds (default: {[0.6, 0.7, 0.7]}) + detection_threshold (float): threshold for detectiong faces (default=0.5). Will override the last stage of thresholds factor {float} -- Factor used to create a scaling pyramid of face sizes. (default: {0.709}) post_process {bool} -- Whether or not to post process images tensors before returning. (default: {True}) @@ -46,8 +47,7 @@ class MTCNN(nn.Module): "center_weighted_size": box size minus weighted squared offset from image center (default: {None}) keep_all {bool} -- If True, all detected faces are returned, in the order dictated by the - select_largest parameter. If a save_path is specified, the first face is saved to that - path and the remaining faces are saved to 1, 2 etc. + select_largest parameter. (default: {False}) device {torch.device} -- The device on which to run neural net passes. Image tensors and models are copied to this device before running forward passes. 
(default: 'auto') @@ -59,11 +59,12 @@ def __init__( margin=0, min_face_size=20, thresholds=[0.6, 0.7, 0.7], + detection_threshold=0.5, factor=0.709, post_process=True, select_largest=True, selection_method=None, - keep_all=False, + keep_all=True, device="auto", ): super().__init__() @@ -72,6 +73,7 @@ def __init__( self.margin = margin self.min_face_size = min_face_size self.thresholds = thresholds + self.thresholds[-1] = detection_threshold self.factor = factor self.post_process = post_process self.select_largest = select_largest diff --git a/feat/face_detectors/Retinaface/Retinaface_test.py b/feat/face_detectors/Retinaface/Retinaface_test.py index f1a56dc8..ed5e71d2 100644 --- a/feat/face_detectors/Retinaface/Retinaface_test.py +++ b/feat/face_detectors/Retinaface/Retinaface_test.py @@ -21,7 +21,7 @@ def __init__( self, device="auto", resize=1, - vis_threshold=0.5, + detection_threshold=0.5, nms_threshold=0.4, keep_top_k=750, top_k=5000, @@ -34,7 +34,7 @@ def __init__( device: (str) timer_flag: (bool) resize: (int) - vis_threshold: (float) + detection_threshold: (float) nms_threshold: (float) keep_top_k: (float) top_k: (float) @@ -78,14 +78,14 @@ def __init__( # Set cutoff parameters ( self.resize, - self.vis_threshold, + self.detection_threshold, self.nms_threshold, self.keep_top_k, self.top_k, self.confidence_threshold, ) = ( resize, - vis_threshold, + detection_threshold, nms_threshold, keep_top_k, top_k, @@ -175,11 +175,11 @@ def _calculate_boxinfo(self, im_height, im_width, loc, conf, landms, scale, img) # keep top-K faster NMS dets = dets[: self.keep_top_k, :] - # filter using vis_thres - rescale box size to be proportional to image size + # filter using detection_threshold - rescale box size to be proportional to image size scale_x, scale_y = (im_width / im_height, im_height / im_width) det_bboxes = [] for b in dets: - if b[4] > self.vis_threshold: + if b[4] > self.detection_threshold: xmin, ymin, xmax, ymax, score = b det_bboxes.append( [ diff --git a/feat/facepose_detectors/img2pose/img2pose_test.py b/feat/facepose_detectors/img2pose/img2pose_test.py index d915c234..3671b117 100644 --- a/feat/facepose_detectors/img2pose/img2pose_test.py +++ b/feat/facepose_detectors/img2pose/img2pose_test.py @@ -231,7 +231,7 @@ def predict(self, img, border_size=0, scale=1.0, euler=True): dof_pose = pose_pred[:] # pitch, roll, yaw, x, y, z dof_pose = dof_pose.reshape(1, -1) - det_pose.append(dof_pose) + det_pose.append(list(dof_pose.flatten())) return {"boxes": det_bboxes, "poses": det_pose} diff --git a/feat/tests/conftest.py b/feat/tests/conftest.py index bda0050f..69271eba 100644 --- a/feat/tests/conftest.py +++ b/feat/tests/conftest.py @@ -84,6 +84,11 @@ def default_detector(): return Detector() +@fixture(scope="module") +def no_face_img(data_path): + return os.path.join(data_path, "free-mountain-vector-01.jpg") + + @fixture(scope="module") def single_face_img(data_path): return os.path.join(data_path, "single_face.jpg") @@ -100,6 +105,21 @@ def single_face_mov(data_path): return os.path.join(data_path, "single_face.mp4") +@fixture(scope="module") +def no_face_mov(data_path): + return os.path.join(data_path, "no_face.mp4") + + +@fixture(scope="module") +def face_noface_mov(data_path): + return os.path.join(data_path, "face_noface.mov") + + +@fixture(scope="module") +def noface_face_mov(data_path): + return os.path.join(data_path, "noface_face.mov") + + @fixture(scope="module") def multi_face_img(data_path): return os.path.join(data_path, "multi_face.jpg") diff --git 
a/feat/tests/data/face_noface.mov b/feat/tests/data/face_noface.mov new file mode 100644 index 00000000..4122ce5d Binary files /dev/null and b/feat/tests/data/face_noface.mov differ diff --git a/feat/tests/data/no_face.mp4 b/feat/tests/data/no_face.mp4 new file mode 100644 index 00000000..67d85100 Binary files /dev/null and b/feat/tests/data/no_face.mp4 differ diff --git a/feat/tests/data/noface_face.mov b/feat/tests/data/noface_face.mov new file mode 100644 index 00000000..a0f529c8 Binary files /dev/null and b/feat/tests/data/noface_face.mov differ diff --git a/feat/tests/test_detector_core.py b/feat/tests/test_detector_core.py index 4dead6e5..72e763ce 100644 --- a/feat/tests/test_detector_core.py +++ b/feat/tests/test_detector_core.py @@ -31,6 +31,7 @@ def test_landmark_with_batches(multiple_images_for_batch_testing): # TODO: Currently making this test always pass even if batching gives slightly diff # results until @tiankang can debug whether we're in tolerance +# Track progress updates in this issue: https://github.com/cosanlab/py-feat/issues/128 def test_detection_and_batching_with_diff_img_sizes( single_face_img, multi_face_img, multiple_images_for_batch_testing ): @@ -108,6 +109,49 @@ def test_nofile(default_detector): _ = default_detector.detect_image(inputFname) +# No Face images +def test_detect_single_img_no_face(default_detector, no_face_img): + """Test detection of a single image with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image(no_face_img) + assert type(out) == Fex + assert out.shape == (1, 173) + assert np.isnan(out.happiness.values[0]) + + +def test_detect_multi_img_no_face(default_detector, no_face_img): + """Test detection of a multiple images with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image([no_face_img] * 3) + assert out.shape == (3, 173) + + +def test_detect_multi_img_no_face_batching(default_detector, no_face_img): + """Test detection of a multiple images with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image([no_face_img] * 5, batch_size=2) + assert out.shape == (5, 173) + + +def test_detect_multi_img_mixed_no_face( + default_detector, no_face_img, single_face_img, multi_face_img +): + """Test detection of a single image with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image( + [single_face_img, no_face_img, multi_face_img] * 2 + ) + assert out.shape == (14, 173) + + +def test_detect_multi_img_mixed_no_face_batching( + default_detector, no_face_img, single_face_img, multi_face_img +): + """Test detection of a single image with no face. Default detector returns 173 attributes""" + out = default_detector.detect_image( + [single_face_img, no_face_img, multi_face_img] * 2, + batch_size=4, + output_size=300, + ) + assert out.shape == (14, 173) + + # Single images def test_detect_single_img_single_face(default_detector, single_face_img): """Test detection of single face from single image. 
Default detector returns 173 attributes""" @@ -151,13 +195,196 @@ def test_detect_mismatch_image_sizes(default_detector, single_face_img, multi_fa assert out.shape == (6, 173) out = default_detector.detect_image( - [multi_face_img, single_face_img] * 5, batch_size=5, output_size=256 + [multi_face_img, single_face_img] * 5, batch_size=5, output_size=512 ) assert out.shape == (30, 173) -def test_detect_video(default_detector, single_face_mov): +def test_detect_video( + default_detector, single_face_mov, no_face_mov, face_noface_mov, noface_face_mov +): """Test detection on video file""" out = default_detector.detect_video(single_face_mov, skip_frames=24) assert len(out) == 3 assert out.happiness.values.max() > 0 + + # Test no face movie + out = default_detector.detect_video(no_face_mov, skip_frames=24) + assert len(out) == 4 + # Empty detections are filled with NaNs + assert out.aus.isnull().all().all() + + # Test mixed movie, i.e. spliced vids of face -> noface and noface -> face + out = default_detector.detect_video(face_noface_mov, skip_frames=24) + assert len(out) == 3 + 4 + 1 + # first few frames have a face + assert not out.aus.iloc[:3].isnull().all().all() + # But the rest are from a diff video that doesn't + assert out.aus.iloc[3:].isnull().all().all() + + out = default_detector.detect_video(noface_face_mov, skip_frames=24) + assert len(out) == 3 + 4 + 1 + # beginning no face + assert out.aus.iloc[:4].isnull().all().all() + # middle frames have face + assert not out.aus.iloc[4:7].isnull().all().all() + # ending doesn't + assert out.aus.iloc[7:].isnull().all().all() + + +def test_detect_mismatch_face_pose(default_detector): + # Multiple Faces, 1 pose + faces = [ + [ + [ + 45.34465026855469, + 49.546714782714844, + 63.04056167602539, + 70.38599395751953, + 0.95337886, + ], + [ + 146.09866333007812, + 96.34442901611328, + 165.69561767578125, + 120.71611022949219, + 0.9069432, + ], + ] + ] + faces_pose = [[[46.0, 46.0, 66.0, 71.0, 0.99272925]]] + poses = [[[-3.72766398, 10.9359162, -3.19862351]]] + + new_faces, new_poses = default_detector._match_faces_to_poses( + faces, faces_pose, poses + ) + assert len(new_faces[0]) == len(new_poses[0]) + assert len(new_faces[0]) == 2 + + # 1 face, multiple poses + faces = [ + [ + [ + 45.34465026855469, + 49.546714782714844, + 63.04056167602539, + 70.38599395751953, + 0.95337886, + ] + ] + ] + + faces_pose = [ + [ + [65.0, 83.0, 87.0, 110.0, 0.99630725], + [141.0, 94.0, 167.0, 123.0, 0.9952237], + [111.0, 97.0, 136.0, 126.0, 0.99487805], + [91.0, 78.0, 109.0, 100.0, 0.99454665], + [46.0, 46.0, 66.0, 71.0, 0.99272925], + ] + ] + + poses = [ + [ + [-5.90236694, -2.81686444, -5.38250827], + [18.3324545, 7.2330487, 2.70649852], + [12.04520545, 5.91369713, 6.13698383], + [1.10688262, 1.56339815, -0.91693287], + [-3.72766398, 10.9359162, -3.19862351], + ] + ] + + new_faces, new_poses = default_detector._match_faces_to_poses( + faces, faces_pose, poses + ) + assert len(new_faces[0]) == 1 + assert len(new_poses[0]) == 1 + + # 2 Faces, 5 Poses + faces = [ + [ + [ + 45.34465026855469, + 49.546714782714844, + 63.04056167602539, + 70.38599395751953, + 0.95337886, + ], + [ + 146.09866333007812, + 96.34442901611328, + 165.69561767578125, + 120.71611022949219, + 0.9069432, + ], + ] + ] + + faces_pose = [ + [ + [65.0, 83.0, 87.0, 110.0, 0.99630725], + [141.0, 94.0, 167.0, 123.0, 0.9952237], + [111.0, 97.0, 136.0, 126.0, 0.99487805], + [91.0, 78.0, 109.0, 100.0, 0.99454665], + [46.0, 46.0, 66.0, 71.0, 0.99272925], + ] + ] + + poses = [ + [ + [-5.90236694, -2.81686444, 
-5.38250827], + [18.3324545, 7.2330487, 2.70649852], + [12.04520545, 5.91369713, 6.13698383], + [1.10688262, 1.56339815, -0.91693287], + [-3.72766398, 10.9359162, -3.19862351], + ] + ] + + new_faces, new_poses = default_detector._match_faces_to_poses( + faces, faces_pose, poses + ) + assert len(new_faces[0]) == len(new_poses[0]) + assert len(new_faces[0]) == 2 + + # 5 Faces, 2 Poses + faces = [ + [ + [65.0, 83.0, 87.0, 110.0, 0.99630725], + [141.0, 94.0, 167.0, 123.0, 0.9952237], + [111.0, 97.0, 136.0, 126.0, 0.99487805], + [91.0, 78.0, 109.0, 100.0, 0.99454665], + [46.0, 46.0, 66.0, 71.0, 0.99272925], + ] + ] + + faces_pose = [ + [ + [ + 45.34465026855469, + 49.546714782714844, + 63.04056167602539, + 70.38599395751953, + 0.95337886, + ], + [ + 146.09866333007812, + 96.34442901611328, + 165.69561767578125, + 120.71611022949219, + 0.9069432, + ], + ] + ] + + poses = [ + [ + [-5.90236694, -2.81686444, -5.38250827], + [18.3324545, 7.2330487, 2.70649852], + ] + ] + + new_faces, new_poses = default_detector._match_faces_to_poses( + faces, faces_pose, poses + ) + assert len(new_faces[0]) == len(new_poses[0]) + assert len(new_faces[0]) == 5 diff --git a/feat/tests/test_pretrained_models.py b/feat/tests/test_pretrained_models.py index 7777226b..72bf4e6f 100644 --- a/feat/tests/test_pretrained_models.py +++ b/feat/tests/test_pretrained_models.py @@ -202,7 +202,7 @@ def test_img2pose_facepose( default_detector.change_model(facepose_model="img2pose") poses = default_detector.detect_facepose(single_face_img_data) - assert np.allclose(poses, [0.86, -3.80, 6.60], atol=0.1) + assert np.allclose(poses["poses"], [0.86, -3.80, 6.60], atol=0.1) # Test DOF kwarg facepose_model_kwargs = {"RETURN_DIM": 6} @@ -215,10 +215,10 @@ def test_img2pose_facepose( # Also run directly poses = new_detector.detect_facepose(single_face_img_data) - assert len(poses[0][0].squeeze()) == 6 + assert len(poses["poses"][0][0]) == 6 def test_img2pose_c_facepose(self, default_detector, single_face_img_data): default_detector.change_model(facepose_model="img2pose-c") poses = default_detector.detect_facepose(single_face_img_data) - assert np.allclose(poses, [0.86, -3.80, 6.60], atol=0.1) + assert np.allclose(poses["poses"], [0.86, -3.80, 6.60], atol=0.1) diff --git a/feat/utils/__init__.py b/feat/utils/__init__.py index 60ea78ab..a03379af 100644 --- a/feat/utils/__init__.py +++ b/feat/utils/__init__.py @@ -151,3 +151,10 @@ def set_torch_device(device="cpu"): else: return device + + +# TODO: Refactor the output of each detector into a reliable dataclass with the same +# structure to avoid utility functions like this +def is_list_of_lists_empty(list_of_lists): + """Helper function to check if list of lists is empty""" + return not any(list_of_lists) diff --git a/feat/utils/image_operations.py b/feat/utils/image_operations.py index b2ce5c5e..1a278b3c 100644 --- a/feat/utils/image_operations.py +++ b/feat/utils/image_operations.py @@ -22,6 +22,8 @@ from copy import deepcopy from skimage import draw import logging +from matplotlib.patches import Rectangle +import matplotlib.pyplot as plt __all__ = [ "neutral", @@ -188,9 +190,10 @@ def extract_face_from_bbox(frame, detected_faces, face_size=112, expand_bbox=1.2 cropped_faces.append(transform(cropped)) bbox_list.append(bbox) - faces = torch.cat( - tuple([convert_image_to_tensor(x["Image"]) for x in cropped_faces]), 0 - ) + faces = torch.cat( + tuple([convert_image_to_tensor(x["Image"]) for x in cropped_faces]), 0 + ) + return (faces, bbox_list) @@ -469,6 +472,40 @@ def __init__( def 
__repr__(self): return f"'height': {self.height}, 'width': {self.width}" + def __mul__(self, bbox2): + """Create a new BBox based on the intersection between two BBox instances (AND operation)""" + + if isinstance(bbox2, (BBox)): + return BBox( + [ + np.max([self.left, bbox2.left]), + np.max([self.top, bbox2.top]), + np.min([self.right, bbox2.right]), + np.min([self.bottom, bbox2.bottom]), + ] + ) + else: + raise NotImplementedError( + "Multiplication is currently only supported between two BBox instances" + ) + + def __add__(self, bbox2): + """Create a new BBox based on the intersection between two BBox instances (OR Operation)""" + + if isinstance(bbox2, (BBox)): + return BBox( + [ + np.min([self.left, bbox2.left]), + np.min([self.top, bbox2.top]), + np.max([self.right, bbox2.right]), + np.max([self.bottom, bbox2.bottom]), + ] + ) + else: + raise NotImplementedError( + "Addition is currently only supported between two BBox instances" + ) + def expand_by_factor(self, factor, symmetric=True): """Expand box by factor @@ -625,6 +662,42 @@ def inverse_transform_landmark(self, landmark): landmark_[i] = (x, y) return landmark_ + def area(self): + """Compute the area of the bounding box""" + return self.height * self.width + + def overlap(self, bbox2): + """Compute the percent overlap between BBox with another BBox""" + overlap_bbox = self * bbox2 + if (overlap_bbox.height < 0) or (overlap_bbox.width < 0): + return 0 + else: + return (self * bbox2).area() / self.area() + + def plot(self, ax=None, fill=False, linewidth=2, **kwargs): + """Plot bounding box + + Args: + ax: matplotlib axis + fill (bool): fill rectangle + """ + + if ax is None: + fig, ax = plt.subplots() + ax.plot() + + ax.add_patch( + Rectangle( + (self.left, self.top), + self.width, + self.height, + fill=fill, + linewidth=linewidth, + **kwargs, + ) + ) + return ax + def reverse_color_order(img): """Convert BGR OpenCV image to RGB format""" @@ -724,7 +797,7 @@ def convert_to_euler(rotvec, is_rotvec=True): rotvec = Rotation.from_rotvec(rotvec).as_matrix() rot_mat_2 = np.transpose(rotvec) angle = Rotation.from_matrix(rot_mat_2).as_euler("xyz", degrees=True) - return np.array([angle[0], -angle[2], -angle[1]]) # pitch, roll, yaw + return [angle[0], -angle[2], -angle[1]] # pitch, roll, yaw def py_cpu_nms(dets, thresh): diff --git a/feat/version.py b/feat/version.py index 3d187266..dd9b22cc 100644 --- a/feat/version.py +++ b/feat/version.py @@ -1 +1 @@ -__version__ = "0.5.0" +__version__ = "0.5.1" diff --git a/requirements-dev.txt b/requirements-dev.txt index 57dae2b4..cdfab17f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,7 @@ pytest-cov coveralls pycodestyle black==22.3.0 -sphinx +sphinx<6 sphinx-rtd-theme sphinxcontrib-napoleon jupyter-book
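A note on the new `is_list_of_lists_empty` helper added to `feat/utils/__init__.py` and used throughout `detector.py` above: every detector returns one inner list per frame, so a batch in which no frame has any detection is a list of empty lists, which `any()` treats as all-falsy. A minimal sketch of the intended semantics:

from feat.utils import is_list_of_lists_empty

assert is_list_of_lists_empty([[], []])                          # two frames, no detections at all
assert not is_list_of_lists_empty([[], [[0, 0, 10, 10, 0.99]]])  # one frame has a face
assert is_list_of_lists_empty([])                                # degenerate empty batch is also "empty"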
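The face-to-pose matching in `_match_faces_to_poses` leans on the new `BBox.__mul__` (intersection), `area`, and `overlap` methods. Note that `overlap` is normalized by the area of the calling box rather than by the union, so it is not IoU and is asymmetric. A small sketch, assuming `BBox` accepts a `[left, top, right, bottom]` list as implied by the `__mul__` implementation above:

from feat.utils.image_operations import BBox

b1 = BBox([0, 0, 10, 10])   # area 100
b2 = BBox([5, 5, 20, 20])   # area 225

inter = b1 * b2             # intersection: BBox([5, 5, 10, 10]), area 25
enclosing = b1 + b2         # enclosing box ("OR"): BBox([0, 0, 20, 20])

print(b1.overlap(b2))       # 25 / 100 = 0.25
print(b2.overlap(b1))       # 25 / 225 ~= 0.11 (asymmetric by design)
print(BBox([0, 0, 1, 1]).overlap(b2))  # 0, the boxes do not intersect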
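Finally, a usage sketch of the behavior these changes enable, mirroring the new tests in `test_detector_core.py` (the file names below are placeholders): images or frames with zero detected faces now come back as all-NaN rows instead of raising.

import numpy as np
from feat import Detector

detector = Detector()

# A single image with no face yields one all-NaN row (173 columns with the default models)
out = detector.detect_image("no_face.jpg", face_detection_threshold=0.5)
assert out.shape == (1, 173)
assert np.isnan(out.happiness.values[0])

# Videos are handled the same way, frame by frame
vid_out = detector.detect_video("face_noface.mov", skip_frames=24)
n_empty = int(vid_out.aus.isnull().all(axis=1).sum())
print(f"{n_empty} sampled frames contained no detectable face")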