Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CLIP4STR] Integrate CLIP4STR #20

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,24 +75,30 @@ int main()
// Please pay attention to the following parameters. You may need to change them according to different models.
nvOCDRParam param;
param.input_data_format = NHWC;
param.ocdnet_trt_engine_path = (char *)"/localhome/local-bizhao/models/ocdnet.fp16.engine";
param.ocdnet_trt_engine_path = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/ocdnet_vit.fp16.engine";
param.ocdnet_infer_input_shape[0] = 3;
param.ocdnet_infer_input_shape[1] = 736;
param.ocdnet_infer_input_shape[2] = 1280;
param.ocdnet_binarize_threshold = 0.1;
param.ocdnet_polygon_threshold = 0.3;
param.ocdnet_max_candidate = 200;
param.ocdnet_unclip_ratio = 1.5;
param.ocrnet_trt_engine_path = (char *)"/localhome/local-bizhao/output/vl4str_base_pcb_10_split_oversample.ckpt.img.fp32.onnx_sim.onnx.fp16.engine";
param.ocrnet_dict_file = (char *)"/localhome/local-bizhao/models/character_list";
param.ocrnet_trt_engine_path = (char *)"/home/binz/CLIP4STR_nvCLIP/trained_with_nvclip/best_ckpt/vl4str_2024-11-19-06-48-47_checkpoints_epoch_9-step_15580-val_accuracy_71.1684-val_NED_79.9133.visual.sim.fp16.engine";
param.ocrnet_text_trt_engine_path = (char *)"/home/binz/CLIP4STR_nvCLIP/trained_with_nvclip/best_ckpt/vl4str_2024-11-19-06-48-47_checkpoints_epoch_9-step_15580-val_accuracy_71.1684-val_NED_79.9133.text.sim.fp16.engine";
param.ocrnet_vocab_file = (char *)"/home/binz/CLIP4STR_nvCLIP/code/CLIP4STR/strhub/clip/bpe_simple_vocab_16e6.txt";
param.ocrnet_vocab_size = 32000;
param.ocrnet_dict_file = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/character_list_clip4str";
param.ocrnet_infer_input_shape[0] = 3;
param.ocrnet_infer_input_shape[1] = 224;
param.ocrnet_infer_input_shape[2] = 224;
param.ocrnet_decode = CLIP;
param.ocrnet_decode = Transformer;
param.ocrnet_only_alnum = false;
param.ocrnet_only_lowercase = false;

nvOCDRp nvocdr_ptr = nvOCDR_init(param);

// Load the input
const char* img_path = "/localhome/local-bizhao/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/c++_samples/test_img/scene_text.jpg";
const char* img_path = "/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/c++_samples/test_img/nvocdr.jpg";
cv::Mat img = cv::imread(img_path);
nvOCDRInput input;
input.device_type = GPU;
Expand All @@ -110,8 +116,8 @@ int main()
nvOCDR_inference(input, &output, nvocdr_ptr);

// filter the output text, and convert to lowercase
std::string keeped_charset = "0123456789abcdefghijklmnopqrstuvwxyz";
textFilter(output, keeped_charset);
std::string keeped_charset = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";;
textFilter(output, keeped_charset, param.ocrnet_only_lowercase);

// Visualize the output
int offset = 0;
Expand Down
8 changes: 4 additions & 4 deletions c++_samples/simple_inference_vit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,24 +45,24 @@ int main()
// Please pay attention to the following parameters. You may need to change them according to different models.
nvOCDRParam param;
param.input_data_format = NHWC;
param.ocdnet_trt_engine_path = (char *)"/hdd_10t/tylerz/CTSE_DL/github/ptmv2_models/ocdnet.fp16.engine";
param.ocdnet_trt_engine_path = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/ocdnet_vit.fp16.engine";
param.ocdnet_infer_input_shape[0] = 3;
param.ocdnet_infer_input_shape[1] = 736;
param.ocdnet_infer_input_shape[2] = 1280;
param.ocdnet_binarize_threshold = 0.1;
param.ocdnet_polygon_threshold = 0.3;
param.ocdnet_max_candidate = 200;
param.ocdnet_unclip_ratio = 1.5;
param.ocrnet_trt_engine_path = (char *)"/hdd_10t/tylerz/CTSE_DL/github/ptmv2_models/ocrnet.fp16.engine";
param.ocrnet_dict_file = (char *)"/hdd_10t/tylerz/CTSE_DL/github/ptmv2_models/character_list";
param.ocrnet_trt_engine_path = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/ocrnet_vit.fp16.engine";
param.ocrnet_dict_file = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/character_list";
param.ocrnet_infer_input_shape[0] = 1;
param.ocrnet_infer_input_shape[1] = 64;
param.ocrnet_infer_input_shape[2] = 200;
param.ocrnet_decode = Attention;
nvOCDRp nvocdr_ptr = nvOCDR_init(param);

// Load the input
const char* img_path = "/hdd_10t/tylerz/CTSE_DL/github/scene_text.jpg";
const char* img_path = "/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/c++_samples/test_img/scene_text.jpg";
cv::Mat img = cv::imread(img_path);
nvOCDRInput input;
input.device_type = GPU;
Expand Down
Binary file added c++_samples/test_img/nvocdr.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
12 changes: 10 additions & 2 deletions include/nvocdr.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ enum OCRNetDecode
{
CTC,
Attention,
CLIP
Transformer
};

typedef struct
Expand All @@ -58,7 +58,15 @@ typedef struct
char* ocrnet_trt_engine_path;
char* ocrnet_dict_file;
int32_t ocrnet_infer_input_shape[3];
OCRNetDecode ocrnet_decode = CTC;
OCRNetDecode ocrnet_decode = Transformer;
// Param for clip4str
char* ocrnet_text_trt_engine_path;
bool ocrnet_only_alnum = false;
bool ocrnet_only_lowercase = false;
char* ocrnet_vocab_file;
int ocrnet_vocab_size = 32000;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious, why can't we use a dict_file or vocabulary file to determine those parameters:

ocrnet_only_alnum
ocrnet_only_lowercase
ocrnet_vocab_file
ocrnet_vocab_size

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The raw output from CLIP4STR includes the characters 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
so the ocrnet_only_alnum and ocrnet_only_lowercase flags are used to control whether we filter out the uppercase and symbol characters.

ocrnet_vocab_file includes 26k words; we only need 32000 words here.

// char* charset_train = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
// char* charset_test = "0123456789abcdefghijklmnopqrstuvwxyz";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove the unused code.

// common param

} nvOCDRParam;
Expand Down
Loading