Apply affine transform based on depth map

Hi everyone,

I want to apply an affine transform to a 2D image based on its estimated depth map. That means that if I shift the image to the right, objects that are close should be shifted more than objects in the background. The same should hold for zooming via scaling.
Unfortunately, I can’t get it to work correctly at the moment.

So what I have is an image of shape (512, 512, 3) and a depth map of shape (512, 512).
Here is how I think it should be done:

  1. Construct an affine transform X, e.g. with Kornia's kornia.geometry.transform.get_projective_transform
  2. Construct a flow field using torch.nn.functional.affine_grid
  3. Apply the flow field using torch.nn.functional.grid_sample (sketched below)
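
Put together, the three steps look roughly like this without any depth weighting (a minimal sketch, assuming img is a (3, 512, 512) float tensor; a uniform 1.5x scale is used as a stand-in transform):

import torch
import torch.nn.functional as F
import kornia

matrix = kornia.geometry.transform.get_projective_transform(
    center=torch.tensor([[0.5, 0.5, 0.5]]),
    angles=torch.tensor([[0., 0., 0.]]),
    scales=torch.tensor([[1.5, 1.5, 1.5]]),  # uniform scale on all three axes
)  # (1, 3, 4) affine matrix
grid = F.affine_grid(matrix, [1, 1, 1, 512, 512], align_corners=False)  # (1, 1, 512, 512, 3)
warped = F.grid_sample(img.unsqueeze(0), grid[..., :2].squeeze(0),
                       padding_mode="zeros", align_corners=False)  # same warp everywhere, depth ignored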

My initial idea was to multiply the flow field by the depth map. But I realized that this way even an identity transform would be morphed by the depth map. So instead I took the difference between the affine-transform and identity flow fields after scaling both by the depth map. For some reason, this does not work out…
Here’s what I have so far:

# Load the image and depth map
from PIL import Image
import numpy as np
import torch
import torch.nn.functional as F
import kornia

img_path = "test.png"
depth_path = "test_depth.png"
img = torch.tensor(np.array(Image.open(img_path))).permute(2, 0, 1).float() / 255  # (3, H, W)
depth = np.array(Image.open(depth_path)).astype(np.float32) / 255  # (H, W)

# identity matrix and flow field
identity_matrix = kornia.geometry.transform.get_projective_transform(
    center=torch.tensor([[0.5, 0.5, 0.5]]),
    angles=torch.tensor([[0., 0., 0.]]),
    scales=torch.tensor([[1.0, 1.0, 1.0]]),
)  # 3D affine matrix of shape (1, 3, 4)
h, w = 512, 512
coords_3d_identity = F.affine_grid(identity_matrix, [1, 1, 1, h, w], align_corners=False)
coords_2d_identity = coords_3d_identity[..., :2]

# 3D matrix and affine transform flow field - zoom in by 50%
matrix = kornia.geometry.transform.get_projective_transform(
    center=torch.tensor([[0.5, 0.5, 0.5]]),
    angles=torch.tensor([[0., 0., 0.]]),
    scales=torch.tensor([[1.0, 1.0, 1.5]]),
)
coords_3d = F.affine_grid(matrix, [1, 1, 1, h, w], align_corners=False)
coords_2d = coords_3d[..., :2]  # take the same (x, y) components as for the identity grid

# apply depth to flow fields
matched_depth = torch.tensor(depth).unsqueeze(-1).repeat(1, 1, 2).unsqueeze(0).unsqueeze(0)  # (1, 1, H, W, 2)
multiplied_identity_field = coords_2d_identity * matched_depth
multiplied_transform_field = coords_2d * matched_depth
diff_field = multiplied_transform_field - multiplied_identity_field

img_transformed = F.grid_sample(img.unsqueeze(0),
                                diff_field.squeeze(0),
                                padding_mode="zeros",  # or "reflection"
                                align_corners=False)

I solved it:

# Load the image and depth map
from PIL import Image
import torch
import numpy as np
import torch.nn.functional as F


def generate_affine_matrix_3d_x_rotation(angle: float, scale: float = 1.0, tx: float = 0.0, ty: float = 0.0, tz: float = 0.0):
    """
    Generate a 3x4 affine transformation matrix for 3D rotation around the x-axis, scaling, and translation.

    :param angle: The angle of rotation in degrees.
    :param scale: The scaling factor.
    :param tx: The translation along the x-axis.
    :param ty: The translation along the y-axis.
    :param tz: The translation along the z-axis.
    :return: A 3x4 affine transformation matrix.
    """
    angle = angle * np.pi / 180
    cos = np.cos(angle)
    sin = np.sin(angle)
    matrix = torch.tensor([[scale, 0, 0, tx],
                           [0, scale * cos, -scale * sin, ty],
                           [0, scale * sin, scale * cos, tz]], dtype=torch.float)
    return matrix
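
# Note: with angle=0, scale=1 and zero translation this returns the 3x4
# identity [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]], which is exactly how
# the identity grid below is built.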

img_path = "img_path.png"
depth_path = "img_path_depth.png"
img = torch.tensor(np.array(Image.open(img_path))).permute(2, 0, 1).float() / 256  # scale 8-bit values to [0, 1)
depth = np.array(Image.open(depth_path)).astype(np.float32) / 256 / 256  # scale (presumably 16-bit) depth values to [0, 1)

h, w = 512, 512
scale = 0.5
angle = 0

# identity matrix and flow field
identity_matrix = generate_affine_matrix_3d_x_rotation(angle=0.0, scale=1.0, tx=0.0, ty=0.0, tz=0.0).unsqueeze(0)
coords_3d_identity = F.affine_grid(identity_matrix, [1, 1, 1, h, w], align_corners=False)
coords_2d_identity = coords_3d_identity[..., :2]
# 3D matrix and affine transform flow field - zoom in by 50%
matrix = generate_affine_matrix_3d_x_rotation(angle=angle, scale=scale, tx=0.0, ty=0.0, tz=0.0).unsqueeze(0)
print(matrix)
coords_3d = F.affine_grid(matrix, [1, 1, 1, h, w], align_corners=False)
coords_2d = coords_3d[..., :2]

# apply depth to flow fields
matched_depth = torch.tensor(depth).unsqueeze(-1).repeat(1, 1, 2).unsqueeze(0).unsqueeze(0)  # (1, 1, H, W, 2)
def transform(field):
    # field: (1, 1, H, W, 2) sampling grid; returns the sampled image as (3, H, W)
    return F.grid_sample(img.unsqueeze(0), field.squeeze(0),
                         padding_mode="zeros", align_corners=False).squeeze()


flow_diff = coords_2d_identity - coords_2d
field = coords_2d_identity - flow_diff * matched_depth  # make close pixels more affected by difference in flows than pixels that are far away
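# Algebraically this is a per-pixel linear interpolation between the two grids:
#   field = (1 - depth) * identity_grid + depth * transform_grid
# so depth = 0 (far) keeps the identity mapping and depth = 1 (close) applies
# the full transform.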

transformed_img = transform(field)
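
To write the result back out as an image (a minimal sketch; "out.png" is a placeholder filename):

out = (transformed_img.clamp(0, 1) * 255).byte().permute(1, 2, 0).numpy()  # (H, W, 3) uint8
Image.fromarray(out).save("out.png")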