from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from path import Path
from imageio import imread, imwrite
from skimage.transform import rescale, resize
from skimage.measure import block_reduce
from colmap_util import read_model as rm
import numpy as np
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from tqdm import tqdm
from wrappers import FFMpeg
import gzip
from pebble import ProcessPool
import yaml
from itertools import product
import pandas as pd

parser = ArgumentParser(description='Convert dataset to KITTI format, optionally create a visualization video',
                        formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--depth_dir', metavar='DIR', type=Path, required=True,
                    help='folder where depth maps generated by ETH3D are stored. Usually ends with "ground_truth_depth/<video name>"')
parser.add_argument('--images_root_folder', metavar='DIR', type=Path, required=True,
                    help='folder where video frames are stored')
parser.add_argument('--occ_dir', metavar='DIR', type=Path,
                    help='folder where occlusion depth maps generated by ETH3D are stored. Usually ends with "occlusion_depth/<video name>"')
parser.add_argument('--metadata_path', type=Path, required=True,
                    help='path to metadata CSV file generated by video_to_colmap.py')
parser.add_argument('--dataset_output_dir', metavar='DIR', default=None, type=Path, required=True)
parser.add_argument('--video_output_dir', metavar='DIR', default=None, type=Path)
parser.add_argument('--interpolated_frames_path', metavar='TXT', type=Path)
parser.add_argument('--final_model', metavar='DIR', type=Path)
parser.add_argument('--visualize', action='store_true',
                    help='If selected, will generate images with colorized depth for visualization purposes')
parser.add_argument('--video', action='store_true',
                    help='If selected, will generate a video from the visualization images')
parser.add_argument('--downscale', type=int, default=1,
                    help='How much the ground truth depth is downscaled in order to save space')
parser.add_argument('--threads', '-j', type=int, default=8,
                    help='Number of parallel processes used to convert frames')
parser.add_argument('--compressed', action='store_true',
                    help='Indicates that GroundTruthCreator was used with the option `--compress_depth_maps`')
parser.add_argument('--reg_mat', type=Path, default=None,
                    help='registration matrix that was used for lidar point cloud registration')
parser.add_argument('--verbose', '-v', action='count', default=0)
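
# Example invocation (all paths are placeholders, to be adapted to your scan):
# python convert_dataset.py \
#     --depth_dir /path/to/ground_truth_depth/video1 \
#     --images_root_folder /path/to/images \
#     --occ_dir /path/to/occlusion_depth/video1 \
#     --metadata_path /path/to/metadata.csv \
#     --final_model /path/to/final_model \
#     --dataset_output_dir /path/to/converted_dataset \
#     --video_output_dir /path/to/video_viz \
#     --visualize --video --compressed -j 8
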
def rescale_and_save_cameras(cameras, images, output_dir, output_width=None, downscale=None):
    def rescale_camera(cam):
        if downscale is None:
            current_downscale = output_width / cam.width
        else:
            current_downscale = downscale
        if current_downscale == 1:
            return cam
        if 'SIMPLE' in cam.model or 'RADIAL' in cam.model:
            cam.params[:3] /= current_downscale
        else:
            cam.params[:4] /= current_downscale
        return cam._replace(width=int(cam.width // current_downscale),
                            height=int(cam.height // current_downscale))

    def construct_intrinsics(cam):
        # assert('PINHOLE' in cam.model)
        if 'SIMPLE' in cam.model or 'RADIAL' in cam.model:
            fx, cx, cy = cam.params
            fy = fx
        else:
            fx, fy, cx, cy, *_ = cam.params
        return np.array([[fx, 0, cx],
                         [0, fy, cy],
                         [0, 0, 1]])

    def save_cam(cam, intrinsics_path, yaml_path):
        intrinsics = construct_intrinsics(cam)
        np.savetxt(intrinsics_path, intrinsics)
        with open(yaml_path, 'w') as f:
            camera_dict = {"model": cam.model,
                           "params": cam.params.tolist(),
                           "width": cam.width,
                           "height": cam.height}
            yaml.dump(camera_dict, f, default_flow_style=False)
        return cam

    rescaled_cameras = {}
    if len(cameras) == 1:
        key = list(cameras.keys())[0]
        cam = cameras[key]
        rescaled_cameras[key] = rescale_camera(cam)
        # Save the rescaled camera so that the stored width/height stay
        # consistent with the rescaled intrinsics
        save_cam(rescaled_cameras[key], output_dir / "intrinsics.txt", output_dir / "camera.yaml")
    else:
        for _, img in images.items():
            try:
                cam = rescaled_cameras[img.camera_id]
            except KeyError:
                cam = rescale_camera(cameras[img.camera_id])
                rescaled_cameras[img.camera_id] = cam
            finally:
                save_cam(cam, output_dir / Path(img.name).stem + "_intrinsics.txt",
                         output_dir / Path(img.name).stem + "_camera.yaml")
    return rescaled_cameras
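
# Downscaling a camera by a factor k divides the focal lengths and principal
# point by k, along with the pixel dimensions. A worked example, assuming a
# PINHOLE camera (params = [fx, fy, cx, cy]): a 1920x1080 camera with params
# [1000, 1000, 960, 540], downscaled by 2, becomes a 960x540 camera with
# params [500, 500, 480, 270].
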
def to_transform_matrix(q, t, scale=1):
    cam_R = rm.qvec2rotmat(q).T
    cam_t = (- cam_R @ t).reshape(3, 1) * scale
    transform = np.vstack((np.hstack([cam_R, cam_t]), [0, 0, 0, 1]))
    return transform
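
# COLMAP stores world-to-camera extrinsics as a quaternion qvec and a
# translation tvec; to_transform_matrix inverts them into a camera-to-world
# 4x4 matrix with R_cw = R_wc^T and t_cw = -R_cw @ t_wc (optionally rescaled).
# A sketch of the reverse conversion, assuming a rotmat2qvec helper like the
# one in COLMAP's read_write_model.py:
# def to_colmap_pose(transform):
#     R_cw, t_cw = transform[:3, :3], transform[:3, 3]
#     return rm.rotmat2qvec(R_cw.T), -R_cw.T @ t_cw
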
def save_poses(images, images_list, output_dir, scale):
    starting_pos = None
    poses = []
    for i in images_list:
        try:
            img = images[i]
            current_pos = to_transform_matrix(img.qvec, img.tvec, scale)
            if starting_pos is None:
                starting_pos = current_pos
            relative_position = np.linalg.inv(starting_pos) @ current_pos
            poses.append(relative_position[:3])
        except KeyError:
            # Frame is not registered so we put NaN coordinates instead
            poses.append(np.full((3, 4), np.NaN))
    poses = np.stack(poses)
    np.savetxt(output_dir / 'poses.txt', poses.reshape((len(images_list), -1)))
    return poses
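
# poses.txt contains one line per frame of images_list, in the same order:
# the 12 values of the 3x4 [R|t] matrix flattened row-major, expressing each
# camera pose relative to the first registered frame. Unregistered frames get
# a line of NaNs.
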
def high_res_colormap(low_res_cmap, resolution=1000, max_value=1):
    # Construct the listed colormap, with interpolated values for higher resolution
    # For a linear segmented colormap, you can just specify the number of points
    # in cm.get_cmap(name, lutsize) with the parameter lutsize
    x = np.linspace(0, 1, low_res_cmap.N)
    low_res = low_res_cmap(x)
    new_x = np.linspace(0, max_value, resolution)
    high_res = np.stack([np.interp(new_x, x, low_res[:, i]) for i in range(low_res.shape[1])], axis=1)
    return ListedColormap(high_res)
def opencv_rainbow(resolution=1000):
    # Construct the OpenCV equivalent of the Rainbow colormap
    opencv_rainbow_data = (
        (0.000, (1.00, 0.00, 0.00)),
        (0.400, (1.00, 1.00, 0.00)),
        (0.600, (0.00, 1.00, 0.00)),
        (0.800, (0.00, 0.00, 1.00)),
        (1.000, (0.60, 0.00, 1.00))
    )
    return LinearSegmentedColormap.from_list('opencv_rainbow', opencv_rainbow_data, resolution)


COLORMAPS = {'rainbow': opencv_rainbow(),
             'magma': high_res_colormap(cm.get_cmap('magma')),
             'bone': cm.get_cmap('bone', 10000)}
def apply_cmap_and_resize(depth, colormap, downscale):
    # Min-pooling block reduce, so that the closest depth is kept when downscaling
    downscale_depth = block_reduce(depth, (downscale, downscale), np.min)
    finite_depth = depth[depth < np.inf]
    if finite_depth.size != 0:
        max_d = finite_depth.max()
        depth_norm = downscale_depth / max_d
        depth_norm[downscale_depth == np.inf] = 1
    else:
        depth_norm = np.ones_like(downscale_depth)
    depth_viz = COLORMAPS[colormap](depth_norm)[:, :, :3]
    # Invalid (infinite) depth is drawn black
    depth_viz[downscale_depth == np.inf] = 0
    return downscale_depth, depth_viz * 255
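
# A minimal usage sketch with synthetic data (not from the original script):
# depth = np.full((4, 4), np.inf, dtype=np.float32)   # all pixels invalid...
# depth[:2, :2] = 10.0                                # ...except one 2x2 block
# small, viz = apply_cmap_and_resize(depth, 'rainbow', 2)
# small is the min-pooled 2x2 depth map; viz is the matching 2x2x3 color image
# in [0, 255], where invalid (infinite) pixels are black.
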
def process_one_frame(img_path, depth_path, occ_path, depth_shape,
                      dataset_output_dir, video_output_dir, downscale, interpolated,
                      visualization=False, viz_width=1920, compressed=True):
    img = imread(img_path)
    if len(img.shape) == 3:
        h, w, _ = img.shape
    elif len(img.shape) == 2:
        h, w = img.shape
        img = img.reshape(h, w, 1)
    assert viz_width % 2 == 0
    viz_height = int(viz_width * h / (2 * w)) * 2
    output_img = np.zeros((viz_height, viz_width, 3), dtype=np.uint8)
    if depth_shape is not None:
        resized_img = resize(img, depth_shape)
        rescaled_img = rescale(resized_img, 1 / downscale, multichannel=True) * 255
        imwrite(dataset_output_dir / img_path.basename(), rescaled_img.astype(np.uint8))
    if visualization:
        viz_img = resize(img, (viz_height // 2, viz_width // 2)) * 255
        # Image goes to the upper left corner of the visualization
        output_img[:viz_height // 2, :viz_width // 2] = viz_img
    if depth_path is not None:
        with gzip.open(depth_path, "rb") if compressed else open(depth_path, "rb") as f:
            depth = np.frombuffer(f.read(), np.float32).reshape(depth_shape)
        output_depth_name = dataset_output_dir / img_path.stem + '.npy'
        downscaled_depth, viz = apply_cmap_and_resize(depth, 'rainbow', downscale)
        if not interpolated:
            np.save(output_depth_name, downscaled_depth)
        if visualization:
            viz_rescaled = resize(viz, (viz_height // 2, viz_width // 2))
            # Depth colormap goes to the upper right corner
            output_img[:viz_height // 2, viz_width // 2:] = viz_rescaled
            # Mix of depth and image goes to the lower left corner
            output_img[viz_height // 2:, :viz_width // 2] = \
                output_img[:viz_height // 2, :viz_width // 2] // 2 + \
                output_img[:viz_height // 2, viz_width // 2:] // 2
    if occ_path is not None and visualization:
        with gzip.open(occ_path, "rb") if compressed else open(occ_path, "rb") as f:
            occ = np.frombuffer(f.read(), np.float32).reshape(depth_shape)
        _, occ_viz = apply_cmap_and_resize(occ, 'bone', downscale)
        occ_viz_rescaled = resize(occ_viz, (viz_height // 2, viz_width // 2))
        # Occlusion depth map visualization goes to the lower right corner
        output_img[viz_height // 2:, viz_width // 2:] = occ_viz_rescaled
    if interpolated:
        # Draw an orange border around interpolated frames
        output_img[:5] = output_img[-5:] = output_img[:, :5] = output_img[:, -5:] = [255, 128, 0]
    if visualization:
        imwrite(video_output_dir / img_path.stem + '.png', output_img)
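
# Notes on the on-disk formats handled above: ETH3D depth and occlusion maps
# are raw float32 buffers (gzip-compressed when --compressed is used) that are
# reshaped to the camera's (height, width), with np.inf marking invalid pixels.
# To load a converted frame back (a sketch; 'frame_0001' is a placeholder stem):
# depth = np.load(dataset_output_dir / 'frame_0001.npy')  # HxW depth, np.inf = invalid
# img = imread(dataset_output_dir / 'frame_0001.jpg')     # frame (extension follows the source image)
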
def convert_dataset(final_model, depth_dir, images_root_folder, occ_dir,
                    dataset_output_dir, video_output_dir, ffmpeg, pose_scale=1,
                    interpolated_frames=[], metadata=None, images_list=None,
                    threads=8, downscale=None, compressed=True,
                    width=None, visualization=False, video=False, verbose=0, **env):
    dataset_output_dir.makedirs_p()
    video_output_dir.makedirs_p()
    if video:
        visualization = True
    cameras_colmap, images_colmap, _ = rm.read_model(final_model, '.txt')
    # image_df = pd.DataFrame.from_dict(images, orient="index").set_index("id")
    if metadata is not None:
        metadata = metadata.set_index("db_id", drop=False).sort_values("time")
        framerate = metadata["framerate"].values[0]
        # image_df = image_df.reindex(metadata.index)
        images_list = metadata["image_path"].values
    else:
        assert images_list is not None
        framerate = None
        video = False
    # Discard images and cameras that are not represented in the image list
    images_colmap = {i.name: i for k, i in images_colmap.items() if i.name in images_list}
    cameras_ids = set(i.camera_id for i in images_colmap.values())
    cameras_colmap = {k: cameras_colmap[k] for k in cameras_ids}
    if downscale is None:
        assert width is not None
    rescaled_cameras = rescale_and_save_cameras(cameras_colmap,
                                                images_colmap,
                                                dataset_output_dir,
                                                width, downscale)
    poses = save_poses(images_colmap, images_list, dataset_output_dir, pose_scale)
    depth_maps = []
    occ_maps = []
    interpolated = []
    imgs = []
    registered = []
    depth_shapes = []
    for i in images_list:
        img_path = images_root_folder / i
        imgs.append(img_path)
        fname = img_path.basename()
        depth_path = depth_dir / fname
        occ_path = occ_dir / fname
        if compressed:
            depth_path += ".gz"
            occ_path += ".gz"
        if i in images_colmap:
            assert depth_path.isfile()
            registered.append(True)
            if occ_path.isfile():
                occ_maps.append(occ_path)
            else:
                occ_maps.append(None)
            depth_maps.append(depth_path)
            camera = cameras_colmap[images_colmap[i].camera_id]
            depth_shapes.append((camera.height, camera.width))
            if i in interpolated_frames:
                if verbose > 2:
                    print("Image {} was interpolated".format(fname))
                interpolated.append(True)
            else:
                interpolated.append(False)
        else:
            if verbose > 2:
                print("Image {} was not registered".format(fname))
            registered.append(False)
            depth_maps.append(None)
            occ_maps.append(None)
            interpolated.append(False)
            depth_shapes.append(None)
    print('{}/{} Frames not registered ({:.2f}%)'.format(len(images_list) - sum(registered),
                                                         len(images_list),
                                                         100 * (1 - sum(registered) / len(images_list))))
    print('{}/{} Frames interpolated ({:.2f}%)'.format(sum(interpolated),
                                                       len(images_list),
                                                       100 * sum(interpolated) / len(images_list)))
    if threads == 1:
        for i, d, o, ds, n in tqdm(zip(imgs, depth_maps, occ_maps, depth_shapes, interpolated), total=len(imgs)):
            process_one_frame(i, d, o, ds, dataset_output_dir, video_output_dir,
                              downscale, n, visualization, viz_width=1920, compressed=compressed)
    else:
        with ProcessPool(max_workers=threads) as pool:
            # Forward `compressed` to the workers, so that uncompressed depth
            # maps are not mistakenly opened with gzip
            tasks = pool.map(process_one_frame, imgs, depth_maps, occ_maps, depth_shapes,
                             [dataset_output_dir] * len(imgs), [video_output_dir] * len(imgs),
                             [downscale] * len(imgs), interpolated,
                             [visualization] * len(imgs), [1920] * len(imgs),
                             [compressed] * len(imgs))
            try:
                for _ in tqdm(tasks.result(), total=len(imgs)):
                    pass
            except KeyboardInterrupt as e:
                tasks.cancel()
                raise e
    if metadata is not None:
        wanted_keys = ['image_path', 'time', 'height', 'width', 'camera_model', 'camera_id']
        filtered_metadata = metadata[wanted_keys].copy()
        filtered_metadata['interpolated'] = interpolated
        filtered_metadata['registered'] = registered
        for i, j in product(range(3), range(4)):
            filtered_metadata['pose{}{}'.format(i, j)] = poses[:, i, j]
        filtered_metadata["fx"] = np.NaN
        filtered_metadata["fy"] = np.NaN
        filtered_metadata["cx"] = np.NaN
        filtered_metadata["cy"] = np.NaN
        for cam_id in filtered_metadata["camera_id"].unique():
            if cam_id not in rescaled_cameras:
                continue
            cam = rescaled_cameras[cam_id]
            rows = filtered_metadata["camera_id"] == cam_id
            filtered_metadata.loc[rows, "fx"] = cam.params[0]
            if "SIMPLE" in cam.model or "RADIAL" in cam.model:
                filtered_metadata.loc[rows, "fy"] = cam.params[0]
                filtered_metadata.loc[rows, "cx"] = cam.params[1]
                filtered_metadata.loc[rows, "cy"] = cam.params[2]
            else:
                filtered_metadata.loc[rows, "fy"] = cam.params[1]
                filtered_metadata.loc[rows, "cx"] = cam.params[2]
                filtered_metadata.loc[rows, "cy"] = cam.params[3]
        filtered_metadata.to_csv(dataset_output_dir / 'metadata.csv')
    not_registered = [i + '\n' for i, r in zip(images_list, registered) if not r]
    with open(dataset_output_dir / 'not_registered.txt', 'w') as f:
        f.writelines(not_registered)
    if video:
        video_path = str(video_output_dir.parent / '{}_groundtruth_viz.mp4'.format(video_output_dir.stem))
        glob_pattern = str(video_output_dir / '*.png')
        ffmpeg.create_video(video_path, glob_pattern, True, framerate)
        video_output_dir.rmtree_p()
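
# Summary of what convert_dataset writes inside dataset_output_dir: downscaled
# frames and per-frame depth maps ('<stem>.npy'), poses.txt, intrinsics.txt and
# camera.yaml (per-image variants when several cameras are present),
# metadata.csv and not_registered.txt; plus, optionally, visualization images
# and a '<video name>_groundtruth_viz.mp4' video next to video_output_dir.
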
if __name__ == '__main__':
    args = parser.parse_args()
    env = vars(args)
    env["metadata"] = pd.read_csv(env["metadata_path"])
    if args.reg_mat is not None:
        registration_matrix = np.genfromtxt(args.reg_mat)
        # If the registration matrix is not a true rotation, it means the frame
        # positions need to be rescaled by the inverse of its spectral norm
        reg_scale = 1 / np.linalg.norm(registration_matrix[:, :3], 2)
    else:
        reg_scale = 1
    if args.interpolated_frames_path is None:
        env["interpolated_frames"] = []
    else:
        with open(args.interpolated_frames_path, "r") as f:
            env["interpolated_frames"] = [line[:-1] for line in f.readlines()]
    env["ffmpeg"] = FFMpeg()
    convert_dataset(pose_scale=reg_scale, **env)