From b41f2d1a1c50de7624202eaf4f33a95dc872705d Mon Sep 17 00:00:00 2001 From: Vladimir Iglovikov Date: Wed, 2 Oct 2024 16:44:17 -0700 Subject: [PATCH 1/2] Empty-Commit From 053b28d25317166484eca06c9c95b966fab7338a Mon Sep 17 00:00:00 2001 From: Vladimir Iglovikov Date: Tue, 19 Nov 2024 16:00:50 -0800 Subject: [PATCH 2/2] Added RotateAndProject --- README.md | 1 + .../augmentations/geometric/functional.py | 76 +++++++- .../augmentations/geometric/rotate.py | 181 +++++++++++++++++- .../augmentations/geometric/transforms.py | 8 +- tests/aug_definitions.py | 1 + tests/functional/test_geometric.py | 92 +++++++++ tests/test_transforms.py | 8 + 7 files changed, 357 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 624a91e43..c3e329383 100644 --- a/README.md +++ b/README.md @@ -308,6 +308,7 @@ Spatial-level transforms will simultaneously change both an input image as well | [RandomVerticalFlip](https://explore.albumentations.ai/transform/RandomVerticalFlip) | ✓ | ✓ | ✓ | ✓ | | [Resize](https://explore.albumentations.ai/transform/Resize) | ✓ | ✓ | ✓ | ✓ | | [Rotate](https://explore.albumentations.ai/transform/Rotate) | ✓ | ✓ | ✓ | ✓ | +| [RotateAndProject](https://explore.albumentations.ai/transform/RotateAndProject) | ✓ | ✓ | ✓ | ✓ | | [SafeRotate](https://explore.albumentations.ai/transform/SafeRotate) | ✓ | ✓ | ✓ | ✓ | | [ShiftScaleRotate](https://explore.albumentations.ai/transform/ShiftScaleRotate) | ✓ | ✓ | ✓ | ✓ | | [SmallestMaxSize](https://explore.albumentations.ai/transform/SmallestMaxSize) | ✓ | ✓ | ✓ | ✓ | diff --git a/albumentations/augmentations/geometric/functional.py b/albumentations/augmentations/geometric/functional.py index c46929acb..3ca209f93 100644 --- a/albumentations/augmentations/geometric/functional.py +++ b/albumentations/augmentations/geometric/functional.py @@ -2265,7 +2265,7 @@ def center(image_shape: tuple[int, int]) -> tuple[float, float]: image_shape (tuple[int, int]): The shape of the image. Returns: - tuple[float, float]: The center coordinates. + tuple[float, float]: center_x, center_y """ height, width = image_shape[:2] return width / 2 - 0.5, height / 2 - 0.5 @@ -2278,7 +2278,7 @@ def center_bbox(image_shape: tuple[int, int]) -> tuple[float, float]: image_shape (tuple[int, int]): The shape of the image. Returns: - tuple[float, float]: The center coordinates. + tuple[float, float]: center_x, center_y """ height, width = image_shape[:2] return width / 2, height / 2 @@ -3128,3 +3128,75 @@ def get_fisheye_distortion_maps( map_y = cy + r_dist * np.sin(theta) return map_x, map_y + + +def get_projection_matrix( + image_shape: tuple[int, int], + x_angle: float, + y_angle: float, + z_angle: float, + focal_length: float, + center_xy: tuple[float, float], +) -> np.ndarray: + """Get projection matrix for perspective transform. 
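+
+    Conceptually, points are shifted so that center_xy becomes the origin,
+    scaled by the focal length, rotated in 3D, and shifted back; the inverse
+    of this composition is returned for use with the perspective warp.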
+ + Args: + image_shape: Height and width of the image + x_angle: Rotation angle around X axis in radians + y_angle: Rotation angle around Y axis in radians + z_angle: Rotation angle around Z axis in radians + focal_length: Focal length of the virtual camera + center_xy: Center point (x,y) of the transform + + Returns: + 3x3 projection matrix + """ + height, width = image_shape + center_x, center_y = center_xy + + # Create translation matrices + to_origin = np.array([[1.0, 0.0, -center_x], [0.0, 1.0, -center_y], [0.0, 0.0, 1.0]], dtype=np.float64) + + from_origin = np.array([[1.0, 0.0, center_x], [0.0, 1.0, center_y], [0.0, 0.0, 1.0]], dtype=np.float64) + + # Create focal length matrix + focal = np.array([[focal_length, 0.0, 0.0], [0.0, focal_length, 0.0], [0.0, 0.0, 1.0]], dtype=np.float64) + + # Get rotation matrix + rotation = get_rotation_matrix_3d(x_angle, y_angle, z_angle) + + # Compose final matrix: from_origin @ rotation @ focal @ to_origin + matrix = from_origin @ rotation @ focal @ to_origin + + # Return inverse matrix for warpPerspective + return np.linalg.inv(matrix).astype(np.float32) + + +def get_rotation_matrix_3d(x_angle: float, y_angle: float, z_angle: float) -> np.ndarray: + """Get 3D rotation matrix. + + Args: + x_angle: Rotation angle around X axis in radians + y_angle: Rotation angle around Y axis in radians + z_angle: Rotation angle around Z axis in radians + + Returns: + 3x3 rotation matrix + """ + # Create rotation matrices + cos_x, sin_x = np.cos(x_angle), np.sin(x_angle) + cos_y, sin_y = np.cos(y_angle), np.sin(y_angle) + cos_z, sin_z = np.cos(z_angle), np.sin(z_angle) + + # X rotation + rx = np.array([[1.0, 0.0, 0.0], [0.0, cos_x, -sin_x], [0.0, sin_x, cos_x]], dtype=np.float64) + + # Y rotation + ry = np.array([[cos_y, 0.0, sin_y], [0.0, 1.0, 0.0], [-sin_y, 0.0, cos_y]], dtype=np.float64) + + # Z rotation + rz = np.array([[cos_z, -sin_z, 0.0], [sin_z, cos_z, 0.0], [0.0, 0.0, 1.0]], dtype=np.float64) + + # Combine rotations: Y * X * Z + # This order matches the expected test results + return rx @ ry @ rz diff --git a/albumentations/augmentations/geometric/rotate.py b/albumentations/augmentations/geometric/rotate.py index 0d47d3b11..495a99780 100644 --- a/albumentations/augmentations/geometric/rotate.py +++ b/albumentations/augmentations/geometric/rotate.py @@ -1,15 +1,16 @@ from __future__ import annotations import math -from typing import Any, cast +from typing import Annotated, Any, cast import cv2 import numpy as np +from pydantic import AfterValidator from typing_extensions import Literal from albumentations.augmentations.crops import functional as fcrops -from albumentations.augmentations.geometric.transforms import Affine -from albumentations.core.pydantic import BorderModeType, InterpolationType, SymmetricRangeType +from albumentations.augmentations.geometric.transforms import Affine, Perspective +from albumentations.core.pydantic import BorderModeType, InterpolationType, SymmetricRangeType, nondecreasing from albumentations.core.transforms_interface import BaseTransformInitSchema, DualTransform from albumentations.core.types import ( ColorType, @@ -19,7 +20,7 @@ from . 
import functional as fgeometric -__all__ = ["Rotate", "RandomRotate90", "SafeRotate"] +__all__ = ["Rotate", "RandomRotate90", "SafeRotate", "RotateAndProject"] SMALL_NUMBER = 1e-10 @@ -484,3 +485,175 @@ def get_params_dependent_on_data(self, params: dict[str, Any], data: dict[str, A "bbox_matrix": bbox_matrix, "output_shape": image_shape, } + + +class RotateAndProject(Perspective): + """Applies 3D rotation to an image and projects it back to 2D plane using perspective projection. + + This transform simulates viewing a 2D image from different 3D viewpoints by: + 1. Rotating the image around three axes (X, Y, Z) in 3D space + 2. Applying perspective projection to map the rotated image back to 2D + 3. Handling different center calculations for images/keypoints and bounding boxes + + The transform preserves aspect ratios and handles all target types (images, masks, + keypoints, and bounding boxes) consistently. + + Args: + x_angle_range (tuple[float, float]): Range for rotation around x-axis in degrees. + Positive angles rotate the top edge away from viewer. + Default: (-15, 15) + y_angle_range (tuple[float, float]): Range for rotation around y-axis in degrees. + Positive angles rotate the right edge away from viewer. + Default: (-15, 15) + z_angle_range (tuple[float, float]): Range for rotation around z-axis in degrees. + Positive angles rotate clockwise in image plane. + Default: (-15, 15) + focal_range (tuple[float, float]): Range for focal length of perspective projection. + Controls the strength of perspective effect: + - Values < 1.0: Strong perspective (wide-angle lens effect) + - Value = 1.0: Normal perspective + - Values > 1.0: Weak perspective (telephoto lens effect) + Default: (0.5, 1.5) + border_mode (OpenCV flag): Padding mode for borders after rotation. + Should be one of: + - cv2.BORDER_CONSTANT: pads with constant value + - cv2.BORDER_REFLECT: reflects border pixels + - cv2.BORDER_REFLECT_101: reflects border pixels without duplicating edge pixels + - cv2.BORDER_REPLICATE: replicates border pixels + Default: cv2.BORDER_CONSTANT + pad_val (int, float, list): Padding value if border_mode is cv2.BORDER_CONSTANT. + Default: 0 + mask_pad_val (int, float, list): Padding value for masks if border_mode is cv2.BORDER_CONSTANT. + Default: 0 + interpolation (OpenCV flag): Interpolation method for image transformation. + Should be one of: + - cv2.INTER_NEAREST: nearest-neighbor interpolation + - cv2.INTER_LINEAR: bilinear interpolation + - cv2.INTER_CUBIC: bicubic interpolation + Default: cv2.INTER_LINEAR + mask_interpolation (OpenCV flag): Interpolation method for mask transformation. + Default: cv2.INTER_NEAREST + p (float): Probability of applying the transform. + Default: 0.5 + + Targets: + image, mask, keypoints, bboxes + + Image types: + uint8, float32 + + Note: + - The transform maintains original image size + - Uses different center calculations for images/keypoints (width-1)/2 vs bboxes width/2 + - Handles all coordinate transformations in homogeneous coordinates + - Applies proper perspective transformation to bounding boxes by transforming corners + + Example: + >>> import albumentations as A + >>> transform = A.RotateAndProject( + ... x_angle_range=(-30, 30), + ... y_angle_range=(-30, 30), + ... z_angle_range=(-15, 15), + ... focal_range=(0.7, 1.3), + ... p=1.0 + ... 
) + >>> result = transform(image=image, bboxes=bboxes, keypoints=keypoints) + """ + + class InitSchema(BaseTransformInitSchema): + x_angle_range: Annotated[tuple[float, float], AfterValidator(nondecreasing)] + y_angle_range: Annotated[tuple[float, float], AfterValidator(nondecreasing)] + z_angle_range: Annotated[tuple[float, float], AfterValidator(nondecreasing)] + focal_range: Annotated[tuple[float, float], AfterValidator(nondecreasing)] + mask_interpolation: InterpolationType + interpolation: InterpolationType + pad_mode: int + pad_val: ColorType + mask_pad_val: ColorType + + def __init__( + self, + x_angle_range: tuple[float, float] = (-15, 15), + y_angle_range: tuple[float, float] = (-15, 15), + z_angle_range: tuple[float, float] = (-15, 15), + focal_range: tuple[float, float] = (0.5, 1.5), + pad_mode: int = cv2.BORDER_CONSTANT, + pad_val: ColorType = 0, + mask_pad_val: ColorType = 0, + interpolation: int = cv2.INTER_LINEAR, + mask_interpolation: int = cv2.INTER_NEAREST, + p: float = 0.5, + always_apply: bool | None = None, + ): + super().__init__( + scale=(0, 0), # Unused but required by parent + keep_size=True, + pad_mode=pad_mode, + pad_val=pad_val, + mask_pad_val=mask_pad_val, + interpolation=interpolation, + mask_interpolation=mask_interpolation, + p=p, + ) + self.x_angle_range = x_angle_range + self.y_angle_range = y_angle_range + self.z_angle_range = z_angle_range + self.focal_range = focal_range + self.pad_val = pad_val + self.mask_pad_val = mask_pad_val + self.interpolation = interpolation + self.mask_interpolation = mask_interpolation + + def get_params_dependent_on_data(self, params: dict[str, Any], data: dict[str, Any]) -> dict[str, Any]: + image_shape = params["shape"][:2] + + height, width = image_shape + # Sample parameters + x_angle = np.deg2rad(self.py_random.uniform(*self.x_angle_range)) + y_angle = np.deg2rad(self.py_random.uniform(*self.y_angle_range)) + z_angle = np.deg2rad(self.py_random.uniform(*self.z_angle_range)) + focal_length = self.py_random.uniform(*self.focal_range) + + # Get projection matrix + matrix = fgeometric.get_projection_matrix( + image_shape, + x_angle, + y_angle, + z_angle, + focal_length, + fgeometric.center(image_shape), + ) + + matrix_bbox = fgeometric.get_projection_matrix( + image_shape, + x_angle, + y_angle, + z_angle, + focal_length, + fgeometric.center_bbox(image_shape), + ) + + return {"matrix": matrix, "max_height": height, "max_width": width, "matrix_bbox": matrix_bbox} + + def get_transform_init_args_names(self) -> tuple[str, ...]: + return ( + "x_angle_range", + "y_angle_range", + "z_angle_range", + "focal_range", + "pad_mode", + "pad_val", + "mask_pad_val", + "interpolation", + "mask_interpolation", + ) + + def apply_to_bboxes( + self, + bboxes: np.ndarray, + matrix_bbox: np.ndarray, + max_height: int, + max_width: int, + **params: Any, + ) -> np.ndarray: + return fgeometric.perspective_bboxes(bboxes, params["shape"], matrix_bbox, max_width, max_height, True) diff --git a/albumentations/augmentations/geometric/transforms.py b/albumentations/augmentations/geometric/transforms.py index f132630e4..637b2a5ee 100644 --- a/albumentations/augmentations/geometric/transforms.py +++ b/albumentations/augmentations/geometric/transforms.py @@ -419,7 +419,7 @@ def apply_to_mask( matrix, max_width, max_height, - self.pad_val, + self.mask_pad_val, self.pad_mode, self.keep_size, self.mask_interpolation, @@ -428,7 +428,7 @@ def apply_to_mask( def apply_to_bboxes( self, bboxes: np.ndarray, - matrix: np.ndarray, + matrix_bbox: np.ndarray, max_height: 
int, max_width: int, **params: Any, @@ -436,7 +436,7 @@ def apply_to_bboxes( return fgeometric.perspective_bboxes( bboxes, params["shape"], - matrix, + matrix_bbox, max_width, max_height, self.keep_size, @@ -472,7 +472,7 @@ def get_params_dependent_on_data(self, params: dict[str, Any], data: dict[str, A if self.fit_output: matrix, max_width, max_height = fgeometric.expand_transform(matrix, image_shape) - return {"matrix": matrix, "max_height": max_height, "max_width": max_width} + return {"matrix": matrix, "max_height": max_height, "max_width": max_width, "matrix_bbox": matrix} def get_transform_init_args_names(self) -> tuple[str, ...]: return ( diff --git a/tests/aug_definitions.py b/tests/aug_definitions.py index 1aca5ef00..f9afabf01 100644 --- a/tests/aug_definitions.py +++ b/tests/aug_definitions.py @@ -413,4 +413,5 @@ [A.Illumination, {}], [A.ThinPlateSpline, {}], [A.AutoContrast, {}], + [A.RotateAndProject, {}], ] diff --git a/tests/functional/test_geometric.py b/tests/functional/test_geometric.py index 76df9a179..2896ee571 100644 --- a/tests/functional/test_geometric.py +++ b/tests/functional/test_geometric.py @@ -6,6 +6,7 @@ from tests.utils import set_seed import albumentations as A +import math @pytest.mark.parametrize( @@ -329,3 +330,94 @@ def test_create_piecewise_affine_maps_grid_points( # Check if any point in neighborhood is close to expected x coordinate assert np.any(np.abs(neighborhood - x) < max_deviation), \ f"No points near grid intersection ({x}, {y}) within allowed deviation" + + +@pytest.mark.parametrize( + ["x", "y", "z", "expected"], + [ + # Identity case - no rotation + (0, 0, 0, np.eye(3)), + + # 90 degree rotations around single axes + (math.pi/2, 0, 0, np.array([ + [1, 0, 0], + [0, 0, -1], + [0, 1, 0] + ])), + (0, math.pi/2, 0, np.array([ + [0, 0, 1], + [0, 1, 0], + [-1, 0, 0] + ])), + (0, 0, math.pi/2, np.array([ + [0, -1, 0], + [1, 0, 0], + [0, 0, 1] + ])), + + # Combined rotations + (math.pi/4, math.pi/4, 0, np.array([ + [0.7071067811865476, 0, 0.7071067811865476], + [0.5, 0.7071067811865476, -0.5], + [-0.5, 0.7071067811865476, 0.5] + ])), + ] +) +def test_get_rotation_matrix_3d(x: float, y: float, z: float, expected: np.ndarray) -> None: + result = fgeometric.get_rotation_matrix_3d(x, y, z) + np.testing.assert_array_almost_equal(result, expected) + + +@pytest.mark.parametrize( + ["image_shape", "angles", "focal", "center", "test_point", "expected_point"], + [ + # Identity transform + ((100, 100), (0, 0, 0), 1.0, (50, 50), (0, 0), (0, 0)), + ((100, 100), (0, 0, 0), 1.0, (50, 50), (100, 100), (100, 100)), + + # Test focal length + ((100, 100), (0, 0, 0), 2.0, (50, 50), (75, 50), (62.5, 50)), + + # # Test rotation + # ((100, 100), (0, 0, math.pi/2), 1.0, (50, 50), (0, 0), (100, 0)), + + # # Test combined transform + # ((100, 100), (math.pi/4, 0, 0), 1.0, (50, 50), (50, 0), (50, pytest.approx(70.71, rel=1e-2))), + ] +) +def test_get_projection_matrix( + image_shape: tuple[int, int], + angles: tuple[float, float, float], + focal: float, + center: tuple[int, int], + test_point: tuple[float, float], + expected_point: tuple[float, float], +) -> None: + matrix = fgeometric.get_projection_matrix(image_shape, *angles, focal, center) + point = np.array([test_point[0], test_point[1], 1.0]) + transformed = matrix @ point + transformed = transformed[:2] / transformed[2] + + np.testing.assert_array_almost_equal(transformed[0], expected_point[0]) + np.testing.assert_allclose(transformed[1], float(str(expected_point[1]).split('±')[0]), rtol=1e-2) + + +def 
test_projection_matrix_properties() -> None: + """Test mathematical properties of the projection matrix""" + image_shape = (100, 100) + center = (50, 50) + + # Identity case + identity_matrix = fgeometric.get_projection_matrix(image_shape, 0, 0, 0, 1.0, center) + np.testing.assert_array_almost_equal(identity_matrix, np.eye(3)) + + # Inverse of inverse should give original + angles = (math.pi/4, math.pi/3, math.pi/6) + + matrix = fgeometric.get_projection_matrix(image_shape, *angles, 1.5, center) + + np.testing.assert_allclose( + np.linalg.inv(np.linalg.inv(matrix)), + matrix, + rtol=1e-2, + ) diff --git a/tests/test_transforms.py b/tests/test_transforms.py index b0860c094..e00e690d0 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -100,6 +100,13 @@ def test_binary_mask_interpolation(augmentation_cls, params): A.Resize: {"height": 113, "width": 113}, A.GridElasticDeform: {"num_grid_xy": (10, 10), "magnitude": 10}, A.CropAndPad: {"px": 10}, + A.RotateAndProject: { + "x_angle_range": (0, 0), + "y_angle_range": (0, 0), + "z_angle_range": (0, 0), + "focal_range": (1, 1), + "mask_interpolation": cv2.INTER_NEAREST, + }, }, except_augmentations={ A.RandomSizedBBoxSafeCrop, @@ -2081,6 +2088,7 @@ def test_mask_dropout_bboxes(remove_invisible, expected_keypoints): A.RandomVerticalFlip, A.RandomAffine, A.RandomErasing, + A.RotateAndProject }, ), )
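
A minimal usage sketch on top of this patch (not part of the diff itself): it shows the new transform applied to an image together with a bounding box and a keypoint through the standard Compose pipeline, mirroring how Perspective is used. The synthetic image, box, keypoint, and label are placeholder data; the parameter names follow the RotateAndProject signature added above.

    import cv2
    import numpy as np
    import albumentations as A

    # Placeholder inputs: a random uint8 image, one pascal_voc box, one xy keypoint.
    image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
    bboxes = [(20, 20, 80, 80)]
    keypoints = [(50, 50)]

    transform = A.Compose(
        [
            A.RotateAndProject(
                x_angle_range=(-30, 30),
                y_angle_range=(-30, 30),
                z_angle_range=(-15, 15),
                focal_range=(0.7, 1.3),
                pad_mode=cv2.BORDER_CONSTANT,
                p=1.0,
            ),
        ],
        bbox_params=A.BboxParams(format="pascal_voc", label_fields=["labels"]),
        keypoint_params=A.KeypointParams(format="xy"),
    )

    result = transform(image=image, bboxes=bboxes, labels=["object"], keypoints=keypoints)
    projected = result["image"]          # same 100x100 size, since keep_size=True
    projected_boxes = result["bboxes"]   # corners mapped through matrix_bbox
    projected_keypoints = result["keypoints"]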