Few questions #25

Open · ZERO-legion5 opened this issue Dec 23, 2024 · 2 comments

@ZERO-legion5

I tried to clone the repo and run the project. However, I faced two issues:

  1. `target` does not contain the `exemplars` key in transforms.py, or the `captions` key in groundingdino.py
  2. the exemplar `NestedTensor()` does not have `len()`

I overcame issue 1 by manually feeding the exemplar and caption data, roughly as sketched below. However, I can't fix issue 2.
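Concretely, my workaround for issue 1 looked roughly like this. The key names mirror what the app code passes to the transform, but the box values and caption are placeholders, so treat it as a sketch rather than the project's intended API:

```python
import torch

# `transform` and `image` are the data transform and PIL image from the app
# code below; build the target dict by hand so the "exemplars" key exists.
target = {"exemplars": torch.tensor([[110., 75., 270., 215.]])}  # (x1, y1, x2, y2)
input_image, target = transform(image, target)

# Likewise, feed the captions manually, one per image in the batch.
captions = ["strawberry ."]
```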

When I adjusted the batch size and number of exemplars manually, it raised a new problem in the roi_align function: roi_align expects the boxes as a list of tensors, which is not how they were structured by default (see the sketch at the end of this comment). After fixing that, I get a Boolean-ambiguity error in add_exemplar_tokens:

```
CountGD/models/GroundingDINO/groundingdino.py", line 251, in add_exemplar_tokens
    if label_count == label:
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
```
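For context, this error appears whenever a multi-element tensor is used where Python expects a single bool; a minimal reproduction outside the model:

```python
import torch

label_count = torch.tensor([0, 0])
label = torch.tensor([0, 1])

# (label_count == label) is a tensor of bools, so `if` can't interpret it:
# if label_count == label:  # RuntimeError: Boolean value of Tensor ... is ambiguous

# Depending on the semantics add_exemplar_tokens actually intends,
# one of these reductions resolves it:
all_equal = torch.equal(label_count, label)        # single bool: tensors identical
any_match = (label_count == label).any().item()    # single bool: any element matches
```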

Any ideas on how to fix them?
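For reference, here is roughly the box restructuring I applied before roi_align. Torchvision's roi_align accepts the boxes either as one tensor per image or as a single tensor with a leading batch-index column (the feature shapes here are made up for illustration):

```python
import torch
from torchvision.ops import roi_align

features = torch.randn(1, 256, 64, 64)  # (N, C, H, W) feature map
boxes = torch.tensor([[10., 10., 40., 40.],
                      [20.,  5., 50., 30.]])  # (L, 4) as (x1, y1, x2, y2)

# Option 1: List[Tensor[L, 4]], one entry per image in the batch.
pooled = roi_align(features, [boxes], output_size=(7, 7))

# Option 2: Tensor[K, 5] whose first column is the image (batch) index.
boxes_with_idx = torch.cat([torch.zeros(len(boxes), 1), boxes], dim=1)
pooled = roi_align(features, boxes_with_idx, output_size=(7, 7))
```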

@niki-amini-naieni
Copy link
Owner

Can you provide me with the exact command you used? The test commands here should run with no errors: https://github.com/niki-amini-naieni/CountGD/tree/main?tab=readme-ov-file#countgd-inference--pre-trained-weights

@ZERO-legion5 (Author) commented Dec 23, 2024

I modified the Gradio app for simple use. Here is the code:

```python
import copy
import random
import PIL.Image
import torch
import PIL
from PIL import Image, ImageDraw, ImageFont
import torchvision.transforms.functional as F
import numpy as np
import json
import plotly.express as px
import pandas as pd
from util.slconfig import SLConfig, DictAction
from util.misc import nested_tensor_from_tensor_list
import datasets.transforms as T
import scipy.ndimage as ndimage
import matplotlib.pyplot as plt
import io
import os
import shutil
import warnings
import argparse

warnings.filterwarnings("ignore")

cwd = os.getcwd()


# Helper function to find the CUDA path.
def find_cuda():
    cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')

    if cuda_home and os.path.exists(cuda_home):
        return cuda_home

    nvcc_path = shutil.which('nvcc')

    if nvcc_path:
        cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
        return cuda_path

    return None


cuda_path = find_cuda()

CONF_THRESH = 0.23


def get_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')


def build_model_and_transforms():
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    data_transform = T.Compose([
        T.RandomResize([800], max_size=1333),
        normalize,
    ])
    cfg = SLConfig.fromfile("cfg_app.py")
    cfg.merge_from_dict({"text_encoder_type": "checkpoints/bert-base-uncased"})

    # Fix the seeds for reproducibility.
    seed = 42
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    args = argparse.Namespace()

    default_args = {
        "device": "cuda",
        "note": "",
        "resume": "",
        "pretrain_model_path": "checkpoint_best_regular.pth",
        "start_epoch": 0,
        "num_workers": 8,
        "world_size": 1,
        "dist_url": "env://",
        "rank": 0,
        "options": None,
        "remove_difficult": False,
        "fix_size": False,
        "eval": True,
        "test": False,
        "debug": False,
        "find_unused_params": False,
        "save_results": False,
        "save_log": False,
        "local_rank": None,
        "amp": False,
        "finetune_ignore": None
    }

    # Merge config values and defaults into args, refusing duplicate keys.
    cfg_dict = cfg._cfg_dict.to_dict()
    args_vars = vars(args)
    for k, v in cfg_dict.items():
        if k not in args_vars:
            setattr(args, k, v)
        else:
            raise ValueError("Key {} can be used by args only".format(k))
    for k, v in default_args.items():
        if k not in args_vars:
            setattr(args, k, v)
        else:
            raise ValueError("Key {} can be used by args only".format(k))

    from models.registry import MODULE_BUILD_FUNCS

    assert args.modelname in MODULE_BUILD_FUNCS._module_dict
    build_func = MODULE_BUILD_FUNCS.get(args.modelname)
    model, _, _ = build_func(args)

    checkpoint = torch.load("checkpoint_best_regular.pth", map_location="cpu")["model"]
    model.load_state_dict(checkpoint, strict=False)

    model.eval()

    return model, data_transform


def get_ind_to_filter(text, word_ids, keywords):
    if len(keywords) <= 0:
        return list(range(len(word_ids)))

    input_words = text.split()
    keywords = keywords.split(",")
    keywords = [keyword.strip() for keyword in keywords]

    word_inds = []
    for keyword in keywords:
        if keyword in input_words:
            if len(word_inds) <= 0:
                ind = input_words.index(keyword)
                word_inds.append(ind)
            else:
                ind = input_words.index(keyword, word_inds[-1])
                word_inds.append(ind)
        else:
            raise Exception("Only specify keywords in the input text!")

    inds_to_filter = []
    for ind in range(len(word_ids)):
        word_id = word_ids[ind]
        if word_id in word_inds:
            inds_to_filter.append(ind)

    return inds_to_filter


def pdump(data):
    import pickle
    with open("data.pkl", "wb") as f:
        pickle.dump(data, f)


def count_main(image, text, prompts, device):
    keywords = ""

    if prompts is None:
        prompts = {"image": image, "points": []}

    input_image, _ = transform(image, {"exemplars": torch.tensor([])})
    input_image = input_image.unsqueeze(0).to(device)
    exemplars = prompts["points"]

    input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.as_tensor(exemplars)})
    input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
    exemplars = [exemplars["exemplars"].to(device)]

    data = {"caption": text}
    with open("data.json", "w") as f:
        json.dump(data, f)

    with torch.no_grad():
        model_output = model(
            nested_tensor_from_tensor_list(input_image),
            nested_tensor_from_tensor_list(input_image_exemplars),
            exemplars,
            [torch.tensor([0]).to(device) for _ in range(len(input_image))],
            captions=[text + " ."] * len(input_image),
        )

    print(model_output.keys())

    ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
    logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
    boxes = model_output["pred_boxes"][0]

    if len(keywords.strip()) > 0:
        box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
    else:
        box_mask = logits.max(dim=-1).values > CONF_THRESH

    logits = logits[box_mask, :].cpu().numpy()
    boxes = boxes[box_mask, :].cpu().numpy()

    # Render the detections as a density map overlaid on the input image.
    (w, h) = image.size
    det_map = np.zeros((h, w))
    det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
    det_map = ndimage.gaussian_filter(
        det_map, sigma=(w // 200, w // 200), order=0
    )

    plt.imshow(image)
    plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
    plt.axis('off')
    img_buf = io.BytesIO()
    plt.savefig(img_buf, format='png', bbox_inches='tight')
    plt.close()

    output_img = Image.open(img_buf)

    return output_img


def main(image_path):
    global model, transform
    device = get_device()
    model, transform = build_model_and_transforms()
    model.to(device)
    caption = ""
    # Exemplar box given as (x, y, w, h), converted to (x1, y1, x2, y2).
    x, y, w, h = list(map(int, "110 75 160 140".split(" ")))
    xy = [x, y, x + w, y + h]

    image = PIL.Image.open(image_path)

    counted = count_main(image, caption, {"image": image, "points": [xy]}, device)
    counted = counted.convert('RGB')
    counted.save("CountGD.jpg")
    print("Success!")


if __name__ == "__main__":
    main("strawberry.jpg")
```

Can you tell me what went wrong?
