
Commit 0f6e6e0

[SW-204341] explicit scale format for ops (#73)
* [SW-204341] explicit scale format for ops
  Added a wrapper around the fp8 functions. The wrapper decides which flavor of the function to call according to the scale format, and the helper modules call the wrapper. The cast flavor is likewise chosen according to the scale format.
* [SW-204341] Adjust softmax API, remove commented-out code
* [SW-204341] Fixes from CR 1
* [SW-204341] Fixes from CR 2
* [SW-204341] Add missing arg in fsdpa
  Signed-off-by: Uri Livne <[email protected]>
* [SW-204341] Enhance SDPA for measure and quant
* [SW-204341] Remove sdpa quantized ops
* Reland per-op class with more enhancements
* [SW-204341] Reland specific arguments, rename class to wrapper
* Added call with self in patched lm head; rebased on top of master before the next force push
* Fix mistake in conflict resolution; restore MethodType fix
* Another fix
* Modified fp8 matmul test to test the quantized matmul func
* Another fix of a rebase mistake
* Hopefully the last rebase-mistake fix
* Restore backward-compatibility import protection

---------

Signed-off-by: Uri Livne <[email protected]>
1 parent ce86dc1 commit 0f6e6e0

4 files changed: 230 additions, 154 deletions


neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py

Lines changed: 18 additions & 4 deletions
@@ -58,6 +58,14 @@ def qdq_init(self):
         self.quant_max = int(torch.finfo(self.lp_dtype).max)
         self.forward = self.forward_qdq

+    def set_cast_to_op(self):
+        return torch.ops.hpu.cast_to_fp8_v2.scalar if self.scale_format == ScaleFormat.SCALAR else \
+            torch.ops.hpu.cast_to_fp8_v2
+
+    def set_cast_from_op(self):
+        return torch.ops.hpu.cast_from_fp8.scalar if self.scale_format == ScaleFormat.SCALAR else \
+            torch.ops.hpu.cast_from_fp8
+
     @abstractmethod
     def forward(self, *args, **kwargs):
         pass
@@ -95,8 +103,10 @@ def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
             quantize_per_channel_to_fp8 if self.scale.numel() > 1 else quantize_per_tensor_to_fp8
         )

+        self.cast_to_op = self.set_cast_to_op()
+
     def forward(self, x):
-        return cast_to_fp8_fcn(x, self.lp_dtype, self.scale_inv)
+        return self.cast_to_op(x, self.scale_inv, False, False, self.lp_dtype)[0]

     def forward_qdq(self, x):
         return self.quantize_op(
@@ -124,8 +134,10 @@ def __init__(self, scale, lp_dtype, hp_dtype, *args, **kwargs):
             dequantize_per_channel_from_fp8 if self.scale.numel() > 1 else dequantize_per_tensor_from_fp8
         )

+        self.cast_from_op = self.set_cast_from_op()
+
     def forward(self, x):
-        return cast_from_fp8_fcn(x, self.hp_dtype, self.scale)
+        return self.cast_from_op(x, self.scale, self.hp_dtype)

     def forward_qdq(self, x):
         return self.dequantize_op(
@@ -150,14 +162,16 @@ def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
         super(QuantDequant, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
         self.scale_inv = create_scale_tensor(scale_inv, self.scale_format)
         self.scale = create_scale_tensor(1 / scale_inv, self.scale_format)
+        self.cast_to_op = self.set_cast_to_op()
+        self.cast_from_op = self.set_cast_from_op()

     def forward(self, x, *args, **kwargs):
-        y = cast_to_fp8_fcn(x, self.lp_dtype, self.scale_inv)
+        y = self.cast_to_op(x, self.scale_inv, False, False, self.lp_dtype)[0]
         # mark_step is needed so fuser won't remove 2 consecutive casts.
         # will be removed once SW-196431 is implemented
         # Call cur_accelerator.synchronize() which will call mark_step() as well
         cur_accelerator.synchronize()
-        z = cast_from_fp8_fcn(y, self.hp_dtype, self.scale)
+        z = self.cast_from_op(y, self.scale, self.hp_dtype)
         cur_accelerator.synchronize()
         return z
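For reference, the change above selects the HPU cast op variant once, at init time, and then calls it in forward. Below is a minimal sketch of that selection pattern, assuming a Gaudi/HPU environment; select_cast_ops is a hypothetical standalone helper (the real code uses the set_cast_to_op / set_cast_from_op methods shown in the diff), the absolute import path is inferred from the relative imports in this package, and the boolean flags simply mirror the calls above.

import torch
# ScaleFormat path inferred from "from .._quant_common.quant_config import ScaleFormat"
from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleFormat

def select_cast_ops(scale_format):
    # Hypothetical helper mirroring set_cast_to_op / set_cast_from_op:
    # the .scalar overloads are used when scales are kept as plain Python scalars.
    if scale_format == ScaleFormat.SCALAR:
        return torch.ops.hpu.cast_to_fp8_v2.scalar, torch.ops.hpu.cast_from_fp8.scalar
    return torch.ops.hpu.cast_to_fp8_v2, torch.ops.hpu.cast_from_fp8

# Usage, following the calls in the diff (lp_dtype is the fp8 dtype, hp_dtype e.g. torch.bfloat16):
# cast_to, cast_from = select_cast_ops(scale_format)
# y = cast_to(x, scale_inv, False, False, lp_dtype)[0]   # same positional flags as in the diff
# z = cast_from(y, scale, hp_dtype)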

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+
+from .._quant_common.quant_config import ScaleFormat
+from ..utils.logger import logger
+
+try:  # backwards compatibility for 1.16
+    from habana_frameworks.torch.hpex.kernels import fp8_fused_sdpa
+except ImportError:
+    pass
+
+import torch
+
+from abc import ABC, abstractmethod
+from enum import Enum, auto
+
+class OP_TYPE(Enum):
+    # class per hpu custom fp8 ops used in patched modules logic
+    GEMM = auto(),
+    SOFTMAX = auto()
+    CONV = auto()
+    FSDPA = auto()
+
+
+class QuantizedHpuFuncWrapper(ABC):
+    """
+    Base class for wrapping calls to hpu custom fp8 ops.
+    The concrete class object is created in patched module init in call to get_hpu_quantized_func_wrapper.
+    Concrete class should define get_default_quantized_func method.
+    Concrete class may override base class methods in case custom op logic is unique, see examples in concrete
+    classes below.
+    """
+    def __init__(self, scale_format):
+        self.set_quantized_func(scale_format)
+        self.quantized_func_args = None
+
+    @abstractmethod
+    def get_default_quantized_func(self):
+        raise NotImplementedError()
+
+    def get_scalar_quantized_func(self):
+        return self.get_default_quantized_func().scalar
+
+    def set_quantized_func(self, scale_format):
+        if scale_format == ScaleFormat.SCALAR:
+            self._quantized_func_ = self.get_scalar_quantized_func()
+        elif scale_format == ScaleFormat.CONST:
+            self._quantized_func_ = self.get_default_quantized_func()
+        else:
+            raise ValueError("Unexpected scale format - {}".format(scale_format))
+
+    def __call__(self, *args, **kwargs):
+        return self._quantized_func_(*args, **kwargs)
+
+class QuantizedHpuMatmul(QuantizedHpuFuncWrapper):
+
+    def get_default_quantized_func(self):
+        return torch.ops.hpu.fp8_gemm_v2
+
+    # only specific arguments are defined, to avoid having all other arguments defined in each call in patched modules.
+    def __call__(self, input, other, out=None, out_dtype=torch.bfloat16, scale_input_inv=None, scale_other_inv=None):
+        return self._quantized_func_(input,
+                                     False,
+                                     other,
+                                     False,
+                                     out,
+                                     out_dtype,
+                                     scale_input_inv,
+                                     scale_other_inv,
+                                     None,
+                                     False)
+
+class QuantizedHpuConv(QuantizedHpuFuncWrapper):
+
+    def get_default_quantized_func(self):
+        return torch.ops.hpu.conv2d_fp8
+
+    @staticmethod
+    def to_list_if_necessary(param):
+        return param if hasattr(param, "__iter__") else [param] * 2
+
+    # only specific arguments are defined, to avoid having all other arguments defined in each call in patched modules.
+    def __call__(self,
+                 input,
+                 weight,
+                 bias,
+                 stride,
+                 padding,
+                 dilation,
+                 groups,
+                 out_dtype=torch.bfloat16,
+                 scale_input_inv=None,
+                 scale_other_inv=None):
+
+        return self._quantized_func_(input=input,
+                                     weight=weight,
+                                     bias=bias,
+                                     stride=self.to_list_if_necessary(stride),
+                                     padding=self.to_list_if_necessary(padding),
+                                     dilation=self.to_list_if_necessary(dilation),
+                                     groups=groups,
+                                     out_dtype=out_dtype,
+                                     scale_input=scale_input_inv,
+                                     scale_weight=scale_other_inv)
+
+class QuantizedHpuSoftmax(QuantizedHpuFuncWrapper):
+
+    def get_default_quantized_func(self):
+        return torch.ops.hpu.softmax_fp8
+
+    def get_scalar_quantized_func(self):
+        # softmax custom op has different scalar impl name
+        return self.get_default_quantized_func().Scalar_scales
+
+class QuantizedHpuFSDPA(QuantizedHpuFuncWrapper):
+
+    def __init__(self, scale_format):
+        # FSDPA isn't optimized for scalar flavor due to complexity of specific torch op api selection
+        self._quantized_func_ = self.get_default_quantized_func()
+
+    def get_default_quantized_func(self):
+        return fp8_fused_sdpa
+
+    def get_scalar_quantized_func(self):
+        raise NotImplementedError()
+
+_OP_TYPE_HPU_QUANTIZED_WRAPPER_CLASSES = {OP_TYPE.GEMM : QuantizedHpuMatmul,
+                                          OP_TYPE.SOFTMAX : QuantizedHpuSoftmax,
+                                          OP_TYPE.CONV : QuantizedHpuConv,
+                                          OP_TYPE.FSDPA : QuantizedHpuFSDPA
+                                          }
+
+def get_hpu_quantized_func_wrapper(op_type, scale_format):
+    quantized_hpu_wrapper_class = _OP_TYPE_HPU_QUANTIZED_WRAPPER_CLASSES[op_type]
+    return quantized_hpu_wrapper_class(scale_format)
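Per the base-class docstring, a patched module is expected to build the wrapper once in its __init__ via get_hpu_quantized_func_wrapper and then invoke it like a function. Below is a minimal, hypothetical usage sketch; PatchedLinearLike, its scale attributes, and the import location of the new module (its file path is not shown in this excerpt) are illustrative assumptions, not part of the patch.

import torch
# ScaleFormat path inferred from the relative import in the new module above.
from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleFormat
# OP_TYPE and get_hpu_quantized_func_wrapper come from the new module above;
# its path is not shown here, so the import is left symbolic:
# from <new wrapper module> import OP_TYPE, get_hpu_quantized_func_wrapper

class PatchedLinearLike(torch.nn.Module):
    # Illustrative only: shows how a patched module would hold and call the wrapper.
    def __init__(self, scale_input_inv, scale_weight_inv, scale_format=ScaleFormat.CONST):
        super().__init__()
        # The SCALAR vs CONST decision happens once here, not on every forward call.
        self.matmul_fp8 = get_hpu_quantized_func_wrapper(OP_TYPE.GEMM, scale_format)
        self.scale_input_inv = scale_input_inv
        self.scale_weight_inv = scale_weight_inv

    def forward(self, x_fp8, weight_fp8):
        # The wrapper exposes only the arguments patched modules vary; the transpose flags and
        # the remaining fp8_gemm_v2 arguments are fixed inside QuantizedHpuMatmul.__call__.
        return self.matmul_fp8(x_fp8, weight_fp8,
                               out_dtype=torch.bfloat16,
                               scale_input_inv=self.scale_input_inv,
                               scale_other_inv=self.scale_weight_inv)

Scalar flavors (e.g. softmax_fp8.Scalar_scales) are selected the same way by passing ScaleFormat.SCALAR, except for FSDPA, which always falls back to the default fp8_fused_sdpa as noted in its __init__.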
