Few questions #25

Open · ZERO-legion5 opened this issue Dec 23, 2024 · 2 comments

@ZERO-legion5

I tried to clone the repo and run the project. However, I faced two issues:

  1. `target` does not contain the `exemplars` key in transforms.py, or the `captions` key in groundingdino.py
  2. the exemplar `NestedTensor()` does not have `len()`

I overcame issue 1 by manually feeding the exemplar and caption data, roughly as sketched below. However, I can't fix issue 2.
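Concretely, my workaround for issue 1 looked roughly like this. The key names mirror what the app code passes to the transform, but the box values and caption are placeholders, so treat it as a sketch rather than the project's intended API:

```python
import torch

# `transform` and `image` are the data transform and PIL image from the app
# code below; build the target dict by hand so the "exemplars" key exists.
target = {"exemplars": torch.tensor([[110., 75., 270., 215.]])}  # (x1, y1, x2, y2)
input_image, target = transform(image, target)

# Likewise, feed the captions manually, one per image in the batch.
captions = ["strawberry ."]
```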

When I adjusted the batch size and number of exemplars manually, it raised a new problem in the roi_align function: roi_align expects the boxes as a list of tensors, which is not how they were structured by default (see the sketch at the end of this comment). After fixing that, I get a Boolean-ambiguity error in add_exemplar_tokens:

```
CountGD/models/GroundingDINO/groundingdino.py", line 251, in add_exemplar_tokens
    if label_count == label:
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
```
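For context, this error appears whenever a multi-element tensor is used where Python expects a single bool; a minimal reproduction outside the model:

```python
import torch

label_count = torch.tensor([0, 0])
label = torch.tensor([0, 1])

# (label_count == label) is a tensor of bools, so `if` can't interpret it:
# if label_count == label:  # RuntimeError: Boolean value of Tensor ... is ambiguous

# Depending on the semantics add_exemplar_tokens actually intends,
# one of these reductions resolves it:
all_equal = torch.equal(label_count, label)        # single bool: tensors identical
any_match = (label_count == label).any().item()    # single bool: any element matches
```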

Any ideas on how to fix them?
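For reference, here is roughly the box restructuring I applied before roi_align. Torchvision's roi_align accepts the boxes either as one tensor per image or as a single tensor with a leading batch-index column (the feature shapes here are made up for illustration):

```python
import torch
from torchvision.ops import roi_align

features = torch.randn(1, 256, 64, 64)  # (N, C, H, W) feature map
boxes = torch.tensor([[10., 10., 40., 40.],
                      [20.,  5., 50., 30.]])  # (L, 4) as (x1, y1, x2, y2)

# Option 1: List[Tensor[L, 4]], one entry per image in the batch.
pooled = roi_align(features, [boxes], output_size=(7, 7))

# Option 2: Tensor[K, 5] whose first column is the image (batch) index.
boxes_with_idx = torch.cat([torch.zeros(len(boxes), 1), boxes], dim=1)
pooled = roi_align(features, boxes_with_idx, output_size=(7, 7))
```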

@niki-amini-naieni
Copy link
Owner

Can you provide me with the exact command you used? The test commands here should run with no errors: https://github.com/niki-amini-naieni/CountGD/tree/main?tab=readme-ov-file#countgd-inference--pre-trained-weights

@ZERO-legion5 (Author) commented Dec 23, 2024

I modified the Gradio app for simple use. Here is the code:

```python
import copy
import random
import PIL.Image
import torch
import PIL
from PIL import Image, ImageDraw, ImageFont
import torchvision.transforms.functional as F
import numpy as np
import json
import plotly.express as px
import pandas as pd
from util.slconfig import SLConfig, DictAction
from util.misc import nested_tensor_from_tensor_list
import datasets.transforms as T
import scipy.ndimage as ndimage
import matplotlib.pyplot as plt
import io
import os
import shutil
import warnings
import argparse

warnings.filterwarnings("ignore")

cwd = os.getcwd()


# Helper function to find the CUDA path.
def find_cuda():
    cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')

    if cuda_home and os.path.exists(cuda_home):
        return cuda_home

    nvcc_path = shutil.which('nvcc')

    if nvcc_path:
        cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
        return cuda_path

    return None


cuda_path = find_cuda()

CONF_THRESH = 0.23


def get_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')


def build_model_and_transforms():
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    data_transform = T.Compose([
        T.RandomResize([800], max_size=1333),
        normalize,
    ])
    cfg = SLConfig.fromfile("cfg_app.py")
    cfg.merge_from_dict({"text_encoder_type": "checkpoints/bert-base-uncased"})

    # Fix the seeds for reproducibility.
    seed = 42
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    args = argparse.Namespace()

    default_args = {
        "device": "cuda",
        "note": "",
        "resume": "",
        "pretrain_model_path": "checkpoint_best_regular.pth",
        "start_epoch": 0,
        "num_workers": 8,
        "world_size": 1,
        "dist_url": "env://",
        "rank": 0,
        "options": None,
        "remove_difficult": False,
        "fix_size": False,
        "eval": True,
        "test": False,
        "debug": False,
        "find_unused_params": False,
        "save_results": False,
        "save_log": False,
        "local_rank": None,
        "amp": False,
        "finetune_ignore": None
    }

    # Merge config values and defaults into args, refusing duplicate keys.
    cfg_dict = cfg._cfg_dict.to_dict()
    args_vars = vars(args)
    for k, v in cfg_dict.items():
        if k not in args_vars:
            setattr(args, k, v)
        else:
            raise ValueError("Key {} can be used by args only".format(k))
    for k, v in default_args.items():
        if k not in args_vars:
            setattr(args, k, v)
        else:
            raise ValueError("Key {} can be used by args only".format(k))

    from models.registry import MODULE_BUILD_FUNCS

    assert args.modelname in MODULE_BUILD_FUNCS._module_dict
    build_func = MODULE_BUILD_FUNCS.get(args.modelname)
    model, _, _ = build_func(args)

    checkpoint = torch.load("checkpoint_best_regular.pth", map_location="cpu")["model"]
    model.load_state_dict(checkpoint, strict=False)

    model.eval()

    return model, data_transform


def get_ind_to_filter(text, word_ids, keywords):
    if len(keywords) <= 0:
        return list(range(len(word_ids)))

    input_words = text.split()
    keywords = keywords.split(",")
    keywords = [keyword.strip() for keyword in keywords]

    word_inds = []
    for keyword in keywords:
        if keyword in input_words:
            if len(word_inds) <= 0:
                ind = input_words.index(keyword)
                word_inds.append(ind)
            else:
                ind = input_words.index(keyword, word_inds[-1])
                word_inds.append(ind)
        else:
            raise Exception("Only specify keywords in the input text!")

    inds_to_filter = []
    for ind in range(len(word_ids)):
        word_id = word_ids[ind]
        if word_id in word_inds:
            inds_to_filter.append(ind)

    return inds_to_filter


def pdump(data):
    import pickle
    with open("data.pkl", "wb") as f:
        pickle.dump(data, f)


def count_main(image, text, prompts, device):
    keywords = ""

    if prompts is None:
        prompts = {"image": image, "points": []}

    input_image, _ = transform(image, {"exemplars": torch.tensor([])})
    input_image = input_image.unsqueeze(0).to(device)
    exemplars = prompts["points"]

    input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.as_tensor(exemplars)})
    input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
    exemplars = [exemplars["exemplars"].to(device)]

    data = {"caption": text}
    with open("data.json", "w") as f:
        json.dump(data, f)

    with torch.no_grad():
        model_output = model(
            nested_tensor_from_tensor_list(input_image),
            nested_tensor_from_tensor_list(input_image_exemplars),
            exemplars,
            [torch.tensor([0]).to(device) for _ in range(len(input_image))],
            captions=[text + " ."] * len(input_image),
        )

    print(model_output.keys())

    ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
    logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
    boxes = model_output["pred_boxes"][0]

    if len(keywords.strip()) > 0:
        box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
    else:
        box_mask = logits.max(dim=-1).values > CONF_THRESH

    logits = logits[box_mask, :].cpu().numpy()
    boxes = boxes[box_mask, :].cpu().numpy()

    # Render the detections as a density map overlaid on the input image.
    (w, h) = image.size
    det_map = np.zeros((h, w))
    det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
    det_map = ndimage.gaussian_filter(
        det_map, sigma=(w // 200, w // 200), order=0
    )

    plt.imshow(image)
    plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
    plt.axis('off')
    img_buf = io.BytesIO()
    plt.savefig(img_buf, format='png', bbox_inches='tight')
    plt.close()

    output_img = Image.open(img_buf)

    return output_img


def main(image_path):
    global model, transform
    device = get_device()
    model, transform = build_model_and_transforms()
    model.to(device)
    caption = ""
    # Exemplar box given as (x, y, w, h), converted to (x1, y1, x2, y2).
    x, y, w, h = list(map(int, "110 75 160 140".split(" ")))
    xy = [x, y, x + w, y + h]

    image = PIL.Image.open(image_path)

    counted = count_main(image, caption, {"image": image, "points": [xy]}, device)
    counted = counted.convert('RGB')
    counted.save("CountGD.jpg")
    print("Success!")


if __name__ == "__main__":
    main("strawberry.jpg")
```

Can you tell me what went wrong?
