NVIDIA-AI-IOT · Bin-NV · Dec 29, 2024 · Tyler-D · Jan 6, 2025 · Bin-NV
diff --git a/c++_samples/simple_inference_ClipOCR.cpp → c++_samples/simple_inference_nvCLIP4STR.cpp b/c++_samples/simple_inference_ClipOCR.cpp → c++_samples/simple_inference_nvCLIP4STR.cpp
@@ -75,24 +75,30 @@ int main()
     // Please pay attention to the following parameters. You may need to change them according to different models.
     nvOCDRParam param;
     param.input_data_format = NHWC;
-    param.ocdnet_trt_engine_path = (char *)"/localhome/local-bizhao/models/ocdnet.fp16.engine";
+    param.ocdnet_trt_engine_path = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/ocdnet_vit.fp16.engine";
     param.ocdnet_infer_input_shape[0] = 3;
     param.ocdnet_infer_input_shape[1] = 736;
     param.ocdnet_infer_input_shape[2] = 1280;
     param.ocdnet_binarize_threshold = 0.1;
     param.ocdnet_polygon_threshold = 0.3;
     param.ocdnet_max_candidate = 200;
     param.ocdnet_unclip_ratio = 1.5;
-    param.ocrnet_trt_engine_path = (char *)"/localhome/local-bizhao/output/vl4str_base_pcb_10_split_oversample.ckpt.img.fp32.onnx_sim.onnx.fp16.engine";
-    param.ocrnet_dict_file = (char *)"/localhome/local-bizhao/models/character_list";
+    param.ocrnet_trt_engine_path = (char *)"/home/binz/CLIP4STR_nvCLIP/trained_with_nvclip/best_ckpt/vl4str_2024-11-19-06-48-47_checkpoints_epoch_9-step_15580-val_accuracy_71.1684-val_NED_79.9133.visual.sim.fp16.engine";
+    param.ocrnet_text_trt_engine_path = (char *)"/home/binz/CLIP4STR_nvCLIP/trained_with_nvclip/best_ckpt/vl4str_2024-11-19-06-48-47_checkpoints_epoch_9-step_15580-val_accuracy_71.1684-val_NED_79.9133.text.sim.fp16.engine";
+    param.ocrnet_vocab_file = (char *)"/home/binz/CLIP4STR_nvCLIP/code/CLIP4STR/strhub/clip/bpe_simple_vocab_16e6.txt";
+    param.ocrnet_vocab_size = 32000;
+    param.ocrnet_dict_file = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/character_list_clip4str";
     param.ocrnet_infer_input_shape[0] = 3;
     param.ocrnet_infer_input_shape[1] = 224;
     param.ocrnet_infer_input_shape[2] = 224;
-    param.ocrnet_decode = CLIP;
+    param.ocrnet_decode = Transformer;
+    param.ocrnet_only_alnum = false;
+    param.ocrnet_only_lowercase = false;
+
     nvOCDRp nvocdr_ptr = nvOCDR_init(param);
 
     // Load the input
-    const char* img_path = "/localhome/local-bizhao/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/c++_samples/test_img/scene_text.jpg";
+    const char* img_path = "/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/c++_samples/test_img/nvocdr.jpg";
     cv::Mat img = cv::imread(img_path);
     nvOCDRInput input;
     input.device_type = GPU;
@@ -110,8 +116,8 @@ int main()
     nvOCDR_inference(input, &output, nvocdr_ptr);
 
     // filter the output text, and convert to lowercase
-    std::string keeped_charset = "0123456789abcdefghijklmnopqrstuvwxyz";
-    textFilter(output, keeped_charset);
+    std::string keeped_charset = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";;
+    textFilter(output, keeped_charset, param.ocrnet_only_lowercase);
 
     // Visualize the output
     int offset = 0;

diff --git a/c++_samples/simple_inference_vit.cpp b/c++_samples/simple_inference_vit.cpp
@@ -45,24 +45,24 @@ int main()
     // Please pay attention to the following parameters. You may need to change them according to different models.
     nvOCDRParam param;
     param.input_data_format = NHWC;
-    param.ocdnet_trt_engine_path = (char *)"/hdd_10t/tylerz/CTSE_DL/github/ptmv2_models/ocdnet.fp16.engine";
+    param.ocdnet_trt_engine_path = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/ocdnet_vit.fp16.engine";
     param.ocdnet_infer_input_shape[0] = 3;
     param.ocdnet_infer_input_shape[1] = 736;
     param.ocdnet_infer_input_shape[2] = 1280;
     param.ocdnet_binarize_threshold = 0.1;
     param.ocdnet_polygon_threshold = 0.3;
     param.ocdnet_max_candidate = 200;
     param.ocdnet_unclip_ratio = 1.5;
-    param.ocrnet_trt_engine_path = (char *)"/hdd_10t/tylerz/CTSE_DL/github/ptmv2_models/ocrnet.fp16.engine";
-    param.ocrnet_dict_file = (char *)"/hdd_10t/tylerz/CTSE_DL/github/ptmv2_models/character_list";
+    param.ocrnet_trt_engine_path = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/ocrnet_vit.fp16.engine";
+    param.ocrnet_dict_file = (char *)"/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/onnx_models/character_list";
     param.ocrnet_infer_input_shape[0] = 1;
     param.ocrnet_infer_input_shape[1] = 64;
     param.ocrnet_infer_input_shape[2] = 200;
     param.ocrnet_decode = Attention;
     nvOCDRp nvocdr_ptr = nvOCDR_init(param);
 
     // Load the input
-    const char* img_path = "/hdd_10t/tylerz/CTSE_DL/github/scene_text.jpg";
+    const char* img_path = "/home/binz/ssd_4t/NVIDIA-Optical-Character-Detection-and-Recognition-Solution/c++_samples/test_img/scene_text.jpg";
     cv::Mat img = cv::imread(img_path);
     nvOCDRInput input;
     input.device_type = GPU;

diff --git a/c++_samples/test_img/nvocdr.jpg b/c++_samples/test_img/nvocdr.jpg
diff --git a/include/nvocdr.h b/include/nvocdr.h
@@ -36,7 +36,7 @@ enum OCRNetDecode
 {
   CTC,
   Attention,
-  CLIP
+  Transformer
 };
 
 typedef struct
@@ -58,7 +58,15 @@ typedef struct
   char* ocrnet_trt_engine_path;
   char* ocrnet_dict_file;
   int32_t ocrnet_infer_input_shape[3];
-  OCRNetDecode ocrnet_decode = CTC;
+  OCRNetDecode ocrnet_decode = Transformer;
+  // Param for clip4str
+  char* ocrnet_text_trt_engine_path;
+  bool ocrnet_only_alnum = false;
+  bool ocrnet_only_lowercase = false;
+  char* ocrnet_vocab_file;
+  int ocrnet_vocab_size = 32000;
+  // char* charset_train = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
+  // char* charset_test = "0123456789abcdefghijklmnopqrstuvwxyz";
   // common param
 
 } nvOCDRParam;