# app.py

import subprocess, os, sys

# Install the local GroundingDINO directory in editable mode at startup.
# check=True raises CalledProcessError (aborting the script) if pip fails.
result = subprocess.run(["pip", "install", "-e", "GroundingDINO"], check=True)
print(f"pip install GroundingDINO = {result}")

# Put the GroundingDINO directory at the front of the module search path.
sys.path.insert(0, "./GroundingDINO")

# Download the SAM checkpoint only when it is not already present in the
# working directory; a failed download also aborts the script (check=True).
if not os.path.exists("./sam_vit_h_4b8939.pth"):
    result = subprocess.run(
        [
            "wget",
            "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
        ],
        check=True,
    )
    print(f"wget sam_vit_h_4b8939.pth result = {result}")


import argparse
import random
import warnings
import json

import gradio as gr
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from scipy import ndimage
from PIL import Image
from huggingface_hub import hf_hub_download
from segments.utils import bitmap2file

# Grounding DINO
import GroundingDINO.groundingdino.datasets.transforms as T
from GroundingDINO.groundingdino.models import build_model
from GroundingDINO.groundingdino.util import box_ops
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.util.utils import (
    clean_state_dict,
)
from GroundingDINO.groundingdino.util.inference import annotate, predict

# segment anything
from segment_anything import build_sam, SamPredictor

# CLIPSeg
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation


def load_model_hf(model_config_path, repo_id, filename, device):
    """Build a Grounding DINO model and load its weights from the Hugging Face Hub.

    Args:
        model_config_path: Path of the config file read with ``SLConfig.fromfile``.
        repo_id: Hub repository that hosts the checkpoint.
        filename: Name of the checkpoint file inside ``repo_id``.
        device: Device the checkpoint is mapped to and the model is moved to.

    Returns:
        The model, switched to eval mode and placed on ``device``.
    """
    config = SLConfig.fromfile(model_config_path)
    dino = build_model(config)
    # The device is recorded on the config only after the model was built.
    config.device = device

    checkpoint_path = hf_hub_download(repo_id=repo_id, filename=filename)
    state = torch.load(checkpoint_path, map_location=device)
    # strict=False: mismatching keys are reported in the printed result
    # instead of raising.
    load_report = dino.load_state_dict(clean_state_dict(state["model"]), strict=False)
    print("Model loaded from {} \n => {}".format(checkpoint_path, load_report))

    dino.eval()
    return dino.to(device)


def load_image_for_dino(image):
    """Turn ``image`` into the normalized tensor passed to Grounding DINO.

    The pipeline resizes (size 800, ``max_size=1333``), converts to a tensor
    and normalizes with the per-channel mean/std constants below.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    pipeline = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize(channel_mean, channel_std),
        ]
    )
    # The transforms take an (image, target) pair; no target is supplied here.
    tensor, _unused_target = pipeline(image, None)
    return tensor


def dino_detection(
    model,
    image,
    image_array,
    category_names,
    category_name_to_id,
    box_threshold,
    text_threshold,
    device,
    visualize=False,
):
    """Detect boxes for ``category_names`` with Grounding DINO.

    The category names are joined with " . " into a single caption. Every
    returned phrase is looked up in ``category_name_to_id``; a phrase that is
    not a key of that mapping raises ``KeyError``.

    Returns:
        ``(boxes, category_ids, phrases)`` when ``visualize`` is false,
        otherwise ``(boxes, category_ids, visualization)`` where
        ``visualization`` is a PIL image of the annotated frame.
    """
    prompt = " . ".join(category_names)
    model_input = load_image_for_dino(image).to(device)

    with torch.no_grad():
        boxes, logits, phrases = predict(
            model=model,
            image=model_input,
            caption=prompt,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
            device=device,
            remove_combined=True,
        )
    category_ids = [category_name_to_id[detected] for detected in phrases]

    if not visualize:
        return boxes, category_ids, phrases

    annotated_bgr = annotate(
        image_source=image_array, boxes=boxes, logits=logits, phrases=phrases
    )
    # Reverse the channel axis: BGR to RGB.
    visualization = Image.fromarray(annotated_bgr[..., ::-1])
    return boxes, category_ids, visualization


def sam_masks_from_dino_boxes(predictor, image_array, boxes, device):
    """Ask the SAM predictor for one mask per detected box.

    ``boxes`` are normalized (cx, cy, w, h) boxes; they are converted to
    xyxy and scaled by the image width/height before being handed to the
    predictor's own box transform.

    Returns:
        The first element returned by ``predictor.predict_torch``.
    """
    height, width, _channels = image_array.shape
    pixel_scale = torch.Tensor([width, height, width, height])
    pixel_boxes = box_ops.box_cxcywh_to_xyxy(boxes) * pixel_scale

    sam_boxes = predictor.transform.apply_boxes_torch(
        pixel_boxes, image_array.shape[:2]
    ).to(device)

    # multimask_output=False is requested, box prompts only (no points).
    masks, _second, _third = predictor.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=sam_boxes,
        multimask_output=False,
    )
    return masks


def preds_to_semantic_inds(preds, threshold):
    """Pick, per pixel, the index of the highest-scoring mask.

    A constant row filled with ``threshold`` is placed in front of the
    flattened predictions. A pixel whose scores all stay below the threshold
    therefore resolves to index 0 ("unlabeled"); otherwise it receives
    1 + the index of the winning prediction.

    Args:
        preds: Tensor whose first dimension enumerates the masks and whose
            last two dimensions are the spatial ones.
        threshold: Score assigned to the dummy "unlabeled" row.

    Returns:
        Integer tensor of shape ``(preds.shape[-2], preds.shape[-1])``.
    """
    num_preds = preds.shape[0]
    out_shape = (preds.shape[-2], preds.shape[-1])
    flat = preds.reshape((num_preds, -1))

    # Row 0 is the threshold row, rows 1.. hold the real predictions.
    scores = torch.full((num_preds + 1, flat.shape[-1]), threshold)
    scores[1:] = flat

    best = torch.topk(scores, 1, dim=0)
    return best.indices.reshape(out_shape)


def clipseg_segmentation(
    processor, model, image, category_names, background_threshold, device
):
    """Run CLIPSeg once per category name and derive a rough semantic map.

    The image is repeated so that each category name is paired with its own
    copy. Logits are resized bilinearly to the image size and passed through
    a sigmoid.

    Returns:
        ``(preds, semantic_inds)``: the per-category probability maps and the
        per-pixel winning index from ``preds_to_semantic_inds`` (0 means no
        category exceeded ``background_threshold``).
    """
    batch = processor(
        text=category_names,
        images=[image] * len(category_names),
        padding="max_length",
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        logits = model(**batch).logits

    # 2-D logits lack the leading per-category dimension (presumably the
    # single-category case); add it back so the code below is uniform.
    if logits.dim() == 2:
        logits = logits.unsqueeze(0)

    # image.size is (width, height); interpolate expects (height, width).
    target_size = (image.size[1], image.size[0])
    upscaled = nn.functional.interpolate(
        logits.unsqueeze(1),
        size=target_size,
        mode="bilinear",
    )
    preds = torch.sigmoid(upscaled.squeeze(dim=1))
    return preds, preds_to_semantic_inds(preds, background_threshold)


def semantic_inds_to_shrunken_bool_masks(
    semantic_inds, shrink_kernel_size, num_categories
):
    """Expand an index map into one boolean mask per category, optionally eroded.

    Args:
        semantic_inds: Integer tensor holding a category index per pixel.
        shrink_kernel_size: Side length of the all-ones erosion kernel; no
            erosion is applied when it is not greater than 0.
        num_categories: Number of masks to produce (indices 0..num_categories-1).

    Returns:
        Boolean tensor of shape ``(num_categories, *semantic_inds.shape)``.
    """
    kernel = np.ones((shrink_kernel_size, shrink_kernel_size))
    erode = shrink_kernel_size > 0

    masks = torch.zeros((num_categories, *semantic_inds.shape), dtype=bool)
    for category_index in range(num_categories):
        mask_array = (semantic_inds == category_index).numpy()
        if erode:
            mask_array = ndimage.binary_erosion(mask_array, structure=kernel)
        masks[category_index] = torch.from_numpy(mask_array)
    return masks


def clip_and_shrink_preds(semantic_inds, preds, shrink_kernel_size, num_categories):
    """Zero out each prediction outside its (shrunken) winning region.

    Mask ``k`` (for ``k >= 1``) belongs to prediction ``k - 1``; mask 0 is the
    unlabeled region and is skipped.

    Returns:
        ``(clipped_preds, relative_sizes)`` where ``relative_sizes[k - 1]`` is
        the pixel count of mask ``k`` divided by the largest count, or the raw
        (all-zero) counts when every mask is empty.
    """
    masks = semantic_inds_to_shrunken_bool_masks(
        semantic_inds, shrink_kernel_size, num_categories
    ).to(preds.device)
    labeled = range(1, masks.size(0))

    pixel_counts = [torch.sum(masks[k].int()).item() for k in labeled]
    largest = max(pixel_counts)
    if largest > 0:
        relative_sizes = [count / largest for count in pixel_counts]
    else:
        relative_sizes = pixel_counts

    # Multiply each prediction by its mask so values outside become 0.
    clipped = torch.zeros_like(preds)
    for k in labeled:
        clipped[k - 1] = preds[k - 1] * masks[k].float()

    return clipped, relative_sizes


def sample_points_based_on_preds(preds, N):
    """Sample ``N`` pixel coordinates with probability proportional to ``preds``.

    Args:
        preds: 2-D array of non-negative weights, one per pixel.
        N: Number of points to draw (with replacement).

    Returns:
        A list of ``(col, row)`` tuples. The list is empty when ``N`` is not
        positive or when ``preds`` has no positive total weight.
    """
    height, width = preds.shape
    weights = preds.ravel()

    # random.choices raises ValueError when the weights sum to zero (even for
    # k=0). Return the empty list instead; the caller already checks for it.
    if N <= 0 or not weights.sum() > 0:
        return []

    indices = np.arange(height * width)

    # Randomly sample N flat indices based on the weights
    sampled_indices = random.choices(indices, weights=weights, k=N)

    # Convert the flat indices into (col, row) coordinates
    return [(index % width, index // width) for index in sampled_indices]


def upsample_pred(pred, image_source):
    """Upsample a square prediction map to the resolution of ``image_source``.

    The map is resized bilinearly to a square whose side is the larger image
    dimension, then the excess at the end of the shorter axis is cropped away
    so the result matches the image height and width.

    Args:
        pred: Tensor of shape ``(C, h, w)``; a leading batch dimension is added
            for interpolation.
        image_source: Array whose ``shape[0]`` / ``shape[1]`` are the target
            height / width.

    Returns:
        Tensor of shape ``(C, height, width)`` when the added batch dimension
        is 1 and ``C`` is 1 (the way the caller in this file uses it).
    """
    pred = pred.unsqueeze(dim=0)
    original_height = image_source.shape[0]
    original_width = image_source.shape[1]
    larger_dim = max(original_height, original_width)

    # upsample the tensor to the larger dimension
    upsampled_tensor = F.interpolate(
        pred, size=(larger_dim, larger_dim), mode="bilinear", align_corners=False
    )

    # Remove the padding (at the end) to get the original image resolution.
    # Crop with the integer image dimensions directly: deriving the target
    # size from a float aspect ratio produced a width larger than the tensor
    # for portrait images (so nothing was cropped) and could truncate one
    # pixel short through rounding.
    upsampled_tensor = upsampled_tensor[:, :, :original_height, :original_width]
    return upsampled_tensor.squeeze(dim=1)


def sam_mask_from_points(predictor, image_array, points):
    """Query the SAM predictor with positive points and return one merged map.

    All points are labelled 1. The sigmoid of the returned logits is reduced
    with a max over the first dimension and then upsampled to the shape of
    ``image_array`` via ``upsample_pred``.
    """
    point_coords = np.array(points)
    # only positive points are sampled, so every label is 1
    point_labels = np.ones(len(points))

    # predict_torch is not used here because it didn't seem to work
    _, _, mask_logits = predictor.predict(
        point_coords=point_coords,
        point_labels=point_labels,
    )

    # max over the 3 segmentation levels, keeping a leading singleton dim
    probabilities = torch.sigmoid(torch.tensor(mask_logits))
    merged = probabilities.max(dim=0, keepdim=True).values

    # logits are 256x256 -> upsample back to image shape
    return upsample_pred(merged, image_array)


def inds_to_segments_format(
    panoptic_inds, thing_category_ids, stuff_category_names, category_name_to_id
):
    """Convert panoptic indices into a segmentation bitmap plus annotations.

    Segment ids ``1..len(stuff_category_names)`` are the stuff categories (in
    order) and are listed only if they occur in ``panoptic_inds``. Thing
    segments follow with consecutive ids, one per entry of
    ``thing_category_ids``.

    Returns:
        ``(segmentation_bitmap, annotations)``: a PIL image opened from the
        file produced by ``bitmap2file`` and a list of
        ``{"id": ..., "category_id": ...}`` dicts.
    """
    inds_array = panoptic_inds.numpy().astype(np.uint32)
    segmentation_bitmap = Image.open(
        bitmap2file(inds_array, is_segmentation_bitmap=True)
    )

    present_ids = np.unique(inds_array)
    annotations = []

    # Stuff segments: every name is resolved, but only present ids are kept.
    for segment_id, stuff_name in enumerate(stuff_category_names, start=1):
        stuff_category_id = category_name_to_id[stuff_name]
        if segment_id in present_ids:
            annotations.append({"id": segment_id, "category_id": stuff_category_id})

    # Thing segments continue right after the stuff ids.
    first_thing_id = len(stuff_category_names) + 1
    for offset, thing_category_id in enumerate(thing_category_ids):
        annotations.append(
            {"id": first_thing_id + offset, "category_id": thing_category_id}
        )

    return segmentation_bitmap, annotations


def _split_category_names(names_string):
    """Split a comma-separated string into stripped, non-empty names."""
    if not names_string:
        return []
    return [name.strip() for name in names_string.split(",") if name.strip()]


def generate_panoptic_mask(
    image,
    thing_category_names_string,
    stuff_category_names_string,
    dino_box_threshold=0.3,
    dino_text_threshold=0.25,
    segmentation_background_threshold=0.1,
    shrink_kernel_size=20,
    num_samples_factor=1000,
    task_attributes_json="",
):
    """Produce a panoptic segmentation of ``image``.

    "Thing" categories (with instances) are detected with Grounding DINO and
    segmented from their boxes with SAM. "Stuff" categories (without
    instances) are roughly segmented with CLIPSeg, then refined by prompting
    SAM with points sampled from each rough mask. Thing masks are overlaid on
    top of the stuff segmentation.

    Args:
        image: PIL image to segment.
        thing_category_names_string: Comma-separated thing category names.
            Empty entries (blank field, trailing comma) are ignored.
        stuff_category_names_string: Comma-separated stuff category names.
            Empty entries are ignored.
        dino_box_threshold: Box threshold passed to Grounding DINO.
        dino_text_threshold: Text threshold passed to Grounding DINO.
        segmentation_background_threshold: Score under which a pixel is left
            unlabeled.
        shrink_kernel_size: Erosion kernel size applied to the rough stuff
            masks before sampling points (0 disables erosion).
        num_samples_factor: Number of points sampled for the largest stuff
            category; other categories are scaled by their relative size.
        task_attributes_json: Optional JSON with a ``"categories"`` list of
            ``{"name", "id", "has_instances"}`` entries. When non-blank it
            replaces both category name strings.

    Returns:
        ``((image_array, subsection_label_pairs), segmentation_bitmap,
        annotations_json)``.
    """
    if task_attributes_json and task_attributes_json.strip():
        task_attributes = json.loads(task_attributes_json)
        categories = task_attributes["categories"]
        category_name_to_id = {
            category["name"]: category["id"] for category in categories
        }
        # split the categories into "stuff" categories (regions w/o instances)
        # and "thing" categories (objects/instances); a missing
        # "has_instances" key counts as stuff
        stuff_category_names = [
            category["name"]
            for category in categories
            if not category.get("has_instances")
        ]
        thing_category_names = [
            category["name"]
            for category in categories
            if category.get("has_instances")
        ]
    else:
        # parse inputs; "".split(",") would yield [""], so empty names are
        # dropped to avoid prompting the models with an empty category
        thing_category_names = _split_category_names(thing_category_names_string)
        stuff_category_names = _split_category_names(stuff_category_names_string)
        category_names = thing_category_names + stuff_category_names
        category_name_to_id = {
            category_name: i for i, category_name in enumerate(category_names)
        }

    image = image.convert("RGB")
    image_array = np.asarray(image)

    # compute SAM image embedding
    sam_predictor.set_image(image_array)

    # detect boxes for "thing" categories using Grounding DINO
    thing_category_ids = []
    thing_masks = []
    thing_boxes = []
    detected_thing_category_names = []
    if len(thing_category_names) > 0:
        thing_boxes, thing_category_ids, detected_thing_category_names = dino_detection(
            dino_model,
            image,
            image_array,
            thing_category_names,
            category_name_to_id,
            dino_box_threshold,
            dino_text_threshold,
            device,
        )
        if len(thing_boxes) > 0:
            # get segmentation masks for the thing boxes
            thing_masks = sam_masks_from_dino_boxes(
                sam_predictor, image_array, thing_boxes, device
            )
    if len(stuff_category_names) > 0:
        # get rough segmentation masks for "stuff" categories using CLIPSeg
        clipseg_preds, clipseg_semantic_inds = clipseg_segmentation(
            clipseg_processor,
            clipseg_model,
            image,
            stuff_category_names,
            segmentation_background_threshold,
            device,
        )
        # remove things from stuff masks
        clipseg_semantic_inds_without_things = clipseg_semantic_inds.clone()
        if len(thing_boxes) > 0:
            combined_things_mask = torch.any(thing_masks, dim=0)
            clipseg_semantic_inds_without_things[combined_things_mask[0]] = 0
        # clip CLIPSeg preds based on non-overlapping semantic segmentation inds
        # (+ optionally shrink the mask of each category);
        # also returns the relative size of each category
        clipseg_clipped_preds, relative_sizes = clip_and_shrink_preds(
            clipseg_semantic_inds_without_things,
            clipseg_preds,
            shrink_kernel_size,
            len(stuff_category_names) + 1,
        )
        # get finer segmentation masks for the "stuff" categories using SAM
        sam_preds = torch.zeros_like(clipseg_clipped_preds)
        for i in range(clipseg_clipped_preds.shape[0]):
            clipseg_pred = clipseg_clipped_preds[i]
            # for each "stuff" category, sample points in the rough segmentation mask
            num_samples = int(relative_sizes[i] * num_samples_factor)
            if num_samples == 0:
                continue
            points = sample_points_based_on_preds(
                clipseg_pred.cpu().numpy(), num_samples
            )
            if len(points) == 0:
                continue
            # use SAM to get mask for points
            pred = sam_mask_from_points(sam_predictor, image_array, points)
            sam_preds[i] = pred
        sam_semantic_inds = preds_to_semantic_inds(
            sam_preds, segmentation_background_threshold
        )

    # combine the thing inds and the stuff inds into panoptic inds
    panoptic_inds = (
        sam_semantic_inds.clone()
        if len(stuff_category_names) > 0
        else torch.zeros(image_array.shape[0], image_array.shape[1], dtype=torch.long)
    )
    # thing segment ids start right after the stuff ids (0 is unlabeled)
    ind = len(stuff_category_names) + 1
    for thing_mask in thing_masks:
        # overlay thing mask on panoptic inds
        panoptic_inds[thing_mask.squeeze(dim=0)] = ind
        ind += 1

    panoptic_bool_masks = (
        semantic_inds_to_shrunken_bool_masks(panoptic_inds, 0, ind + 1)
        .numpy()
        .astype(int)
    )
    panoptic_names = (
        ["unlabeled"] + stuff_category_names + detected_thing_category_names
    )
    subsection_label_pairs = [
        (panoptic_bool_masks[i], panoptic_name)
        for i, panoptic_name in enumerate(panoptic_names)
    ]

    segmentation_bitmap, annotations = inds_to_segments_format(
        panoptic_inds, thing_category_ids, stuff_category_names, category_name_to_id
    )
    annotations_json = json.dumps(annotations)

    return (image_array, subsection_label_pairs), segmentation_bitmap, annotations_json


# Grounding DINO config file and Hugging Face Hub checkpoint location.
config_file = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filename = "groundingdino_swint_ogc.pth"
# Local SAM checkpoint path (downloaded at startup when missing).
sam_checkpoint = "./sam_vit_h_4b8939.pth"

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

if device != "cpu":
    try:
        # Imported only to probe that the compiled extension can be loaded.
        from GroundingDINO.groundingdino import _C  # noqa: F401
    except Exception:
        # Best effort: keep going with a warning, but do not swallow
        # KeyboardInterrupt / SystemExit the way a bare ``except`` would.
        warnings.warn(
            "Failed to load custom C++ ops. Running on CPU mode Only in groundingdino!"
        )

# initialize groundingdino model
dino_model = load_model_hf(config_file, ckpt_repo_id, ckpt_filename, device)

# initialize SAM
sam = build_sam(checkpoint=sam_checkpoint)
sam.to(device=device)
sam_predictor = SamPredictor(sam)

# initialize CLIPSeg (processor and model come from the same checkpoint)
clipseg_processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
clipseg_model = CLIPSegForImageSegmentation.from_pretrained(
    "CIDAS/clipseg-rd64-refined"
)
clipseg_model.to(device)

# Script entry point: parse CLI flags, build the Gradio UI and launch it.
if __name__ == "__main__":
    parser = argparse.ArgumentParser("Panoptic Segment Anything demo", add_help=True)
    parser.add_argument("--debug", action="store_true", help="using debug mode")
    parser.add_argument("--share", action="store_true", help="share the app")
    args = parser.parse_args()

    print(f"args = {args}")

    block = gr.Blocks(title="Panoptic Segment Anything").queue()
    with block:
        with gr.Column():
            title = gr.Markdown(
                "# [Panoptic Segment Anything](https://github.com/segments-ai/panoptic-segment-anything)"
            )
            description = gr.Markdown(
                "Demo for zero-shot panoptic segmentation using Segment Anything, Grounding DINO, and CLIPSeg."
            )
            with gr.Row():
                # Left column: input image, category text boxes and options.
                with gr.Column():
                    input_image = gr.Image(sources=["upload"], type="pil")
                    thing_category_names_string = gr.Textbox(
                        label="Thing categories (i.e. categories with instances), comma-separated",
                        placeholder="E.g. car, bus, person",
                    )
                    stuff_category_names_string = gr.Textbox(
                        label="Stuff categories (i.e. categories without instances), comma-separated",
                        placeholder="E.g. sky, road, buildings",
                    )
                    run_button = gr.Button(value="Run")
                    # Slider defaults mirror the keyword defaults of
                    # generate_panoptic_mask.
                    with gr.Accordion("Advanced options", open=False):
                        box_threshold = gr.Slider(
                            label="Grounding DINO box threshold",
                            minimum=0.0,
                            maximum=1.0,
                            value=0.3,
                            step=0.001,
                        )
                        text_threshold = gr.Slider(
                            label="Grounding DINO text threshold",
                            minimum=0.0,
                            maximum=1.0,
                            value=0.25,
                            step=0.001,
                        )
                        segmentation_background_threshold = gr.Slider(
                            label="Segmentation background threshold (under this threshold, a pixel is considered background/unlabeled)",
                            minimum=0.0,
                            maximum=1.0,
                            value=0.1,
                            step=0.001,
                        )
                        shrink_kernel_size = gr.Slider(
                            label="Shrink kernel size (how much to shrink the mask before sampling points)",
                            minimum=0,
                            maximum=100,
                            value=20,
                            step=1,
                        )
                        num_samples_factor = gr.Slider(
                            label="Number of samples factor (how many points to sample in the largest category)",
                            minimum=0,
                            maximum=1000,
                            value=1000,
                            step=1,
                        )
                        task_attributes_json = gr.Textbox(
                            label="Task attributes JSON",
                        )

                # Right column: annotated result, bitmap and annotations.
                with gr.Column():
                    annotated_image = gr.AnnotatedImage()
                    with gr.Accordion("Segmentation bitmap", open=False):
                        segmentation_bitmap_text = gr.Markdown(
                            """
The segmentation bitmap is a 32-bit RGBA png image which contains the segmentation masks.
The alpha channel is set to 255, and the remaining 24-bit values in the RGB channels correspond to the object ids in the annotations list.
Unlabeled regions have a value of 0.
Because of the large dynamic range, the segmentation bitmap appears black in the image viewer.
"""
                        )
                        segmentation_bitmap = gr.Image(
                            type="pil", label="Segmentation bitmap"
                        )
                        annotations_json = gr.Textbox(
                            label="Annotations JSON",
                        )

            # The examples supply only the image and the two category
            # strings, so generate_panoptic_mask runs with its default
            # values for every other parameter.
            examples = gr.Examples(
                examples=[
                    [
                        "a2d2.png",
                        "car, bus, person",
                        "road, sky, buildings, sidewalk",
                    ],
                    [
                        "bxl.png",
                        "car, tram, motorcycle, person",
                        "road, buildings, sky",
                    ],
                ],
                fn=generate_panoptic_mask,
                inputs=[
                    input_image,
                    thing_category_names_string,
                    stuff_category_names_string,
                ],
                outputs=[annotated_image, segmentation_bitmap, annotations_json],
                cache_examples=True,
            )

        # The inputs are listed in the positional order of
        # generate_panoptic_mask's parameters.
        run_button.click(
            fn=generate_panoptic_mask,
            inputs=[
                input_image,
                thing_category_names_string,
                stuff_category_names_string,
                box_threshold,
                text_threshold,
                segmentation_background_threshold,
                shrink_kernel_size,
                num_samples_factor,
                task_attributes_json,
            ],
            outputs=[annotated_image, segmentation_bitmap, annotations_json],
            api_name="segment",
        )

    # server_name="0.0.0.0" makes the server listen on all network interfaces.
    block.launch(server_name="0.0.0.0", debug=args.debug, share=args.share)