Updated EfficientNet models with ONNX and real-data checkpoint for efficientnetv2_m
h2oai · Feb 26, 2023 · 6a3bbf3 · 6a3bbf3
1 parent d7e33b6
commit 6a3bbf3
Show file tree

Hide file tree

Showing 7 changed files with 210 additions and 27 deletions.
diff --git a/doctr/models/detection/differentiable_binarization/pytorch.py b/doctr/models/detection/differentiable_binarization/pytorch.py
@@ -369,6 +369,10 @@ def __init__(
         self.assume_straight_pages = True
         self.postprocessor = DBPostProcessor(assume_straight_pages=self.assume_straight_pages)
         self.device = torch.cuda.is_available()
+        if os.environ.get("CUDA_VISIBLE_DEVICES", []) == "":
+            self.device = "cpu"
+        elif len(os.environ.get("CUDA_VISIBLE_DEVICES", [])) > 0:
+            self.device = "cuda"
         model_path = str(download_from_url(self.cfg["url"], cache_subdir='models'))
         if self.device:
             self.sess = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider'])
@@ -399,6 +403,10 @@ def __init__(
         self.assume_straight_pages = True
         self.postprocessor = DBPostProcessor(assume_straight_pages=self.assume_straight_pages)
         self.device = torch.cuda.is_available()
+        if os.environ.get("CUDA_VISIBLE_DEVICES", []) == "":
+            self.device = "cpu"
+        elif len(os.environ.get("CUDA_VISIBLE_DEVICES", [])) > 0:
+            self.device = "cuda"
         model_path = str(download_from_url(self.cfg["url"], cache_subdir='models'))
         if self.device:
             self.sess = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider'])

diff --git a/doctr/models/detection/predictor/pytorch.py b/doctr/models/detection/predictor/pytorch.py
@@ -8,7 +8,7 @@
 import numpy as np
 import torch
 from torch import nn
-
+import os
 from doctr.models.preprocessor import PreProcessor
 
 __all__ = ["DetectionPredictor"]
@@ -33,6 +33,10 @@ def __init__(
         self.pre_processor = pre_processor
         self.postprocessor = self.model.postprocessor
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if os.environ.get("CUDA_VISIBLE_DEVICES", []) == "":
+            self.device = "cpu"
+        elif len(os.environ.get("CUDA_VISIBLE_DEVICES", [])) > 0:
+            self.device = "cuda"
         if "onnx" not in str((type(self.model))) and (self.device == torch.device("cuda")):
             # self.model = nn.DataParallel(self.model)
             # self.model = self.model.half()

diff --git a/doctr/models/recognition/crnn/pytorch.py b/doctr/models/recognition/crnn/pytorch.py
@@ -23,7 +23,9 @@
 from ..core import RecognitionModel, RecognitionPostProcessor
 
 __all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_vgg16_bn_onnx', 'crnn_mobilenet_v3_small',
-           'crnn_mobilenet_v3_large', 'crnn_efficientnet_b0', 'crnn_efficientnet_b3', 'crnn_efficientnetv2_m']
+           'crnn_mobilenet_v3_large', 'crnn_efficientnet_b0', 'crnn_efficientnet_b0_onnx',
+           'crnn_efficientnet_b3', 'crnn_efficientnet_b3_onnx', 'crnn_efficientnetv2_m',
+           'crnn_efficientnetv2_m_onnx', 'crnn_efficientnetv2_mV2', 'crnn_efficientnetv2_mV2_onnx']
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
     "crnn_vgg16_bn": {
@@ -33,7 +35,7 @@
         "vocab": VOCABS["legacy_french"],
         "url": "https://doctr-static.mindee.com/models?id=v0.3.1/crnn_vgg16_bn-9762b0b0.pt&src=0",
     },
-        'crnn_vgg16_bn_onnx': {
+    'crnn_vgg16_bn_onnx': {
         'mean': (0.694, 0.695, 0.693),
         'std': (0.299, 0.296, 0.301),
         'input_shape': (3, 32, 128),
@@ -61,20 +63,55 @@
         'vocab': VOCABS['french'] + " ",
         'url': 'https://github.com/h2oai/doctr/releases/download/efficientnet_crnnv2/crnn_effnet_b0.pt'
     },
+    'crnn_efficientnet_b0_onnx': {
+        'mean': (0.694, 0.695, 0.693),
+        'std': (0.299, 0.296, 0.301),
+        'input_shape': (3, 32, 128),
+        'vocab': VOCABS['french'] + " ",
+        'url': 'https://github.com/h2oai/doctr/releases/download/efficientnet_onnx_models/crnn_effnet_b0.onnx'
+    },
     'crnn_efficientnet_b3': {
         'mean': (0.694, 0.695, 0.693),
         'std': (0.299, 0.296, 0.301),
         'input_shape': (3, 32, 128),
         'vocab': VOCABS['french'] + " ",
         'url': "https://github.com/h2oai/doctr/releases/download/efficientnet_crnnv2/crnn_effnet_b3.pt",
     },
+    'crnn_efficientnet_b3_onnx': {
+        'mean': (0.694, 0.695, 0.693),
+        'std': (0.299, 0.296, 0.301),
+        'input_shape': (3, 32, 128),
+        'vocab': VOCABS['french'] + " ",
+        'url': 'https://github.com/h2oai/doctr/releases/download/efficientnet_onnx_models/crnn_effnet_b3.onnx'
+    },
     'crnn_efficientnetv2_m': {
         'mean': (0.694, 0.695, 0.693),
         'std': (0.299, 0.296, 0.301),
         'input_shape': (3, 32, 128),
         'vocab': VOCABS['french'] + " ",
         'url': 'https://github.com/h2oai/doctr/releases/download/efficientnet_crnnv2/crnn_effnetv2_m.pt'
     },
+    'crnn_efficientnetv2_m_onnx': {
+        'mean': (0.694, 0.695, 0.693),
+        'std': (0.299, 0.296, 0.301),
+        'input_shape': (3, 32, 128),
+        'vocab': VOCABS['french'] + " ",
+        'url': 'https://github.com/h2oai/doctr/releases/download/efficientnet_onnx_models/crnn_effnetv2_m.onnx'
+    },
+    'crnn_efficientnetv2_mV2': {
+        'mean': (0.694, 0.695, 0.693),
+        'std': (0.299, 0.296, 0.301),
+        'input_shape': (3, 32, 128),
+        'vocab': VOCABS['french'] + " ",
+        'url': 'https://github.com/h2oai/doctr/releases/download/efficientnet_onnx_models/crnn_effnetv2_mV2.pt'
+    },
+    'crnn_efficientnetv2_mV2_onnx': {
+        'mean': (0.694, 0.695, 0.693),
+        'std': (0.299, 0.296, 0.301),
+        'input_shape': (3, 32, 128),
+        'vocab': VOCABS['french'] + " ",
+        'url': 'https://github.com/h2oai/doctr/releases/download/efficientnet_onnx_models/crnn_effnetv2_mV2.onnx'
+    },
 }
 
 
@@ -293,18 +330,23 @@ def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN:
 
     return _crnn("crnn_vgg16_bn", pretrained, vgg16_bn_r, ignore_keys=["linear.weight", "linear.bias"], **kwargs)
 
-class crnn_vgg16_bn_onnx(RecognitionModel, nn.Module):
-    """Onnx converted crnn_vgg16_bn_onnx"""
+class _crnn_onnx(RecognitionModel, nn.Module):
+    """Onnx converted models"""
     def __init__(
         self,
-        pretrained = True
+        pretrained = True,
+        model_name = None
     ) -> None:
         super().__init__()
-        self.vocab = default_cfgs["crnn_vgg16_bn_onnx"]["vocab"]
-        self.cfg = default_cfgs["crnn_vgg16_bn_onnx"]
+        self.vocab = default_cfgs[model_name]["vocab"]
+        self.cfg = default_cfgs[model_name]
 
         self.postprocessor = CTCPostProcessor(vocab=self.vocab)
         self.device = torch.cuda.is_available()
+        if os.environ["CUDA_VISIBLE_DEVICES"] == "":
+            self.device = "cpu"
+        elif len(os.environ["CUDA_VISIBLE_DEVICES"]) > 0:
+            self.device = "cuda"
         model_path = str(download_from_url(self.cfg["url"], cache_subdir='models'))
         if self.device:
             self.sess = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider'])
@@ -323,6 +365,115 @@ def forward(
         else:
             logits = self.compiled_model_onnx([x.detach().cpu().numpy()])[self.output_layer_onnx]
         return logits
+
+def crnn_efficientnet_b0_onnx(pretrained: bool = False, **kwargs: Any) -> CRNN:
+    """ONNX-exported CRNN with an efficientnet_b0 backbone
+
+    >>> import torch
+    >>> from doctr.models import crnn_efficientnet_b0_onnx
+    >>> model = crnn_efficientnet_b0_onnx(pretrained=True)
+    >>> input_tensor = torch.rand(1, 3, 32, 128)
+    >>> out = model(input_tensor)
+
+    Args:
+        pretrained (bool): currently ignored; True is always passed to `_crnn_onnx`
+
+    Returns:
+        `_crnn_onnx` wrapper built from default_cfgs['crnn_efficientnet_b0_onnx']
+    """
+    # rnn_units is not injected here: _crnn_onnx.__init__ accepts only (pretrained, model_name).
+    return _crnn_onnx(
+        True,
+        'crnn_efficientnet_b0_onnx',
+        **kwargs,
+    )
+def crnn_efficientnet_b3_onnx(pretrained: bool = False, **kwargs: Any) -> CRNN:
+    """ONNX-exported CRNN with an efficientnet_b3 backbone
+
+    >>> import torch
+    >>> from doctr.models import crnn_efficientnet_b3_onnx
+    >>> model = crnn_efficientnet_b3_onnx(pretrained=True)
+    >>> input_tensor = torch.rand(1, 3, 32, 128)
+    >>> out = model(input_tensor)
+
+    Args:
+        pretrained (bool): currently ignored; True is always passed to `_crnn_onnx`
+
+    Returns:
+        `_crnn_onnx` wrapper built from default_cfgs['crnn_efficientnet_b3_onnx']
+    """
+    # rnn_units is not injected here: _crnn_onnx.__init__ accepts only (pretrained, model_name).
+    return _crnn_onnx(
+        True,
+        'crnn_efficientnet_b3_onnx',
+        **kwargs,
+    )
+
+def crnn_efficientnetv2_m_onnx(pretrained: bool = False, **kwargs: Any) -> CRNN:
+    """ONNX-exported CRNN with an efficientnetv2_m backbone
+
+    >>> import torch
+    >>> from doctr.models import crnn_efficientnetv2_m_onnx
+    >>> model = crnn_efficientnetv2_m_onnx(pretrained=True)
+    >>> input_tensor = torch.rand(1, 3, 32, 128)
+    >>> out = model(input_tensor)
+
+    Args:
+        pretrained (bool): currently ignored; True is always passed to `_crnn_onnx`
+
+    Returns:
+        `_crnn_onnx` wrapper built from default_cfgs['crnn_efficientnetv2_m_onnx']
+    """
+    # rnn_units is not injected here: _crnn_onnx.__init__ accepts only (pretrained, model_name).
+    return _crnn_onnx(
+        True,
+        'crnn_efficientnetv2_m_onnx',
+        **kwargs,
+    )
+
+def crnn_efficientnetv2_mV2_onnx(pretrained: bool = False, **kwargs: Any) -> CRNN:
+    """ONNX-exported CRNN for the 'crnn_efficientnetv2_mV2_onnx' checkpoint
+
+    >>> import torch
+    >>> from doctr.models import crnn_efficientnetv2_mV2_onnx
+    >>> model = crnn_efficientnetv2_mV2_onnx(pretrained=True)
+    >>> input_tensor = torch.rand(1, 3, 32, 128)
+    >>> out = model(input_tensor)
+
+    Args:
+        pretrained (bool): currently ignored; True is always passed to `_crnn_onnx`
+
+    Returns:
+        `_crnn_onnx` wrapper built from default_cfgs['crnn_efficientnetv2_mV2_onnx']
+    """
+    # rnn_units is not injected here: _crnn_onnx.__init__ accepts only (pretrained, model_name).
+    return _crnn_onnx(
+        True,
+        'crnn_efficientnetv2_mV2_onnx',
+        **kwargs,
+    )
+
+def crnn_vgg16_bn_onnx(pretrained: bool = False, **kwargs: Any) -> CRNN:
+    """ONNX-exported CRNN with a vgg16_bn backbone
+
+    >>> import torch
+    >>> from doctr.models import crnn_vgg16_bn_onnx
+    >>> model = crnn_vgg16_bn_onnx(pretrained=True)
+    >>> input_tensor = torch.rand(1, 3, 32, 128)
+    >>> out = model(input_tensor)
+
+    Args:
+        pretrained (bool): currently ignored; True is always passed to `_crnn_onnx`
+
+    Returns:
+        `_crnn_onnx` wrapper built from default_cfgs['crnn_vgg16_bn_onnx']
+    """
+    # rnn_units is not injected here: _crnn_onnx.__init__ accepts only (pretrained, model_name).
+    return _crnn_onnx(
+        True,
+        "crnn_vgg16_bn_onnx",
+        **kwargs,
+    )
 
 
 def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN:
@@ -424,7 +575,7 @@ def crnn_efficientnet_b3(pretrained: bool = False, **kwargs: Any) -> CRNN:
     )
 
 def crnn_efficientnetv2_m(pretrained: bool = False, **kwargs: Any) -> CRNN:
-    """CRNN with efficientnet_b7
+    """CRNN with efficientnetv2_m
 
     >>> import torch
     >>> from doctr.models import crnn_convnext_tiny
@@ -446,3 +597,27 @@ def crnn_efficientnetv2_m(pretrained: bool = False, **kwargs: Any) -> CRNN:
         ignore_keys=['linear.weight', 'linear.bias'],
         **kwargs,
     )
+
+def crnn_efficientnetv2_mV2(pretrained: bool = False, **kwargs: Any) -> CRNN:
+    """CRNN with an efficientnetv2_m backbone, using the 'crnn_efficientnetv2_mV2' config entry
+
+    >>> import torch
+    >>> from doctr.models import crnn_efficientnetv2_mV2
+    >>> model = crnn_efficientnetv2_mV2(pretrained=True)
+    >>> input_tensor = torch.rand(1, 3, 32, 128)
+    >>> out = model(input_tensor)
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+
+    Returns:
+        text recognition architecture
+    """
+    kwargs["rnn_units"] = 512
+    return _crnn(
+        'crnn_efficientnetv2_mV2',
+        pretrained,
+        efficientnetv2_m,
+        ignore_keys=['linear.weight', 'linear.bias'],
+        **kwargs,
+    )
diff --git a/doctr/models/recognition/export_rec_onnx.py b/doctr/models/recognition/export_rec_onnx.py
@@ -5,17 +5,18 @@
 
 from doctr.models import ocr_predictor
 
-model = ocr_predictor(pretrained=True)
+model = ocr_predictor(reco_arch = "crnn_efficientnetv2_mV2", pretrained=True)
 model.reco_predictor.model = model.reco_predictor.model.eval()
 
 input = torch.randn(1, 3, 32, 128)
 input2 = torch.randn(49, 3, 32, 128)
+model = model.to("cpu")
 start = time.time()
 pred = model.reco_predictor.model(input)
 print("pytorch time", time.time() - start)
 torch.onnx.export(model.reco_predictor.model,
                   input,
-                  "rec.onnx",
+                  "crnn_effnetv2_mV2.onnx",
                   export_params = True,
                   opset_version=11,
                   do_constant_folding=True,
@@ -24,19 +25,9 @@
                   dynamic_axes = {"input":{0:"batch_size"},
                                   "output":{0:"batch_size"}})
 
-import onnx
-import onnxoptimizer
-
-onnx_model = onnx.load("rec.onnx")
-
-passes = onnxoptimizer.get_fuse_and_elimination_passes()
-new_model = onnxoptimizer.optimize(model = onnx_model, passes = passes)
-onnx.checker.check_model(new_model)
-onnx.save(new_model, "optimized_rec.onnx")
-
 import onnxruntime
 
-ort_session = onnxruntime.InferenceSession("rec.onnx")
+ort_session = onnxruntime.InferenceSession("crnn_effnetv2_mV2.onnx", providers = ['CPUExecutionProvider'])
 
 ort_inputs = {"input":input.numpy()}
 start = time.time()
@@ -57,7 +48,7 @@
 from openvino.runtime import Core
 
 ie = Core()
-model_onnx = ie.read_model(model="rec.onnx")
+model_onnx = ie.read_model(model="crnn_effnetv2_mV2.onnx")
 compiled_model_onnx = ie.compile_model(model=model_onnx, device_name="CPU")
 
 output_layer_onnx = compiled_model_onnx.output(0)

diff --git a/doctr/models/recognition/predictor/pytorch.py b/doctr/models/recognition/predictor/pytorch.py
@@ -8,7 +8,7 @@
 import numpy as np
 import torch
 from torch import nn
-
+import os
 from doctr.models.preprocessor import PreProcessor
 
 from ._utils import remap_preds, split_crops
@@ -37,6 +37,10 @@ def __init__(
         self.model = model.eval()
         self.postprocessor = self.model.postprocessor
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if os.environ.get("CUDA_VISIBLE_DEVICES", []) == "":
+            self.device = "cpu"
+        elif len(os.environ.get("CUDA_VISIBLE_DEVICES", [])) > 0:
+            self.device = "cuda"
         if "onnx" not in str((type(self.model))) and (self.device == torch.device("cuda")):
             # self.model = nn.DataParallel(self.model)
             self.model = self.model.to(self.device)
@@ -82,7 +86,7 @@ def forward(
                 batch = batch.to(self.device)
                 # batch = batch.half()
             char_logits = self.model(batch)
-            if type(char_logits) != torch.Tensor():
+            if not torch.is_tensor(char_logits):
                 char_logits = torch.tensor(char_logits)
             raw += [self.postprocessor(char_logits)]
 

diff --git a/doctr/models/recognition/zoo.py b/doctr/models/recognition/zoo.py
@@ -15,7 +15,8 @@
 
 
 ARCHS: List[str] = ['crnn_vgg16_bn', 'crnn_vgg16_bn_onnx', 'crnn_mobilenet_v3_small', 'crnn_mobilenet_v3_large', 'sar_resnet31', 'master', 
-                    'crnn_efficientnet_b0', 'crnn_efficientnet_b3', 'crnn_efficientnetv2_m', "parseq_large"]
+                    'crnn_efficientnet_b0', 'crnn_efficientnet_b0_onnx', 'crnn_efficientnet_b3', 'crnn_efficientnet_b3_onnx',
+                    'crnn_efficientnetv2_m', 'crnn_efficientnetv2_m_onnx', 'crnn_efficientnetv2_mV2', 'crnn_efficientnetv2_mV2_onnx', "parseq_large"]
 
 
 def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:

diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
 
 from setuptools import setup
 PKG_NAME = "python-doctr"
-VERSION = os.getenv("BUILD_VERSION", "0.5.3a0")
+VERSION = os.getenv("BUILD_VERSION", "0.5.4a0")
 
 
 if __name__ == "__main__":