diff --git a/tests/unit_tests/ops/data/awq/input.pt b/tests/unit_tests/ops/data/awq/input.pt new file mode 100644 index 00000000..23933bfd Binary files /dev/null and b/tests/unit_tests/ops/data/awq/input.pt differ diff --git a/tests/unit_tests/ops/data/awq/output.pt b/tests/unit_tests/ops/data/awq/output.pt new file mode 100644 index 00000000..aeda8918 Binary files /dev/null and b/tests/unit_tests/ops/data/awq/output.pt differ diff --git a/tests/unit_tests/ops/data/awq/qweight.pt b/tests/unit_tests/ops/data/awq/qweight.pt new file mode 100644 index 00000000..9286af88 Binary files /dev/null and b/tests/unit_tests/ops/data/awq/qweight.pt differ diff --git a/tests/unit_tests/ops/data/awq/qzeros.pt b/tests/unit_tests/ops/data/awq/qzeros.pt new file mode 100644 index 00000000..9a29a304 Binary files /dev/null and b/tests/unit_tests/ops/data/awq/qzeros.pt differ diff --git a/tests/unit_tests/ops/data/awq/scales.pt b/tests/unit_tests/ops/data/awq/scales.pt new file mode 100644 index 00000000..1a51dcdf Binary files /dev/null and b/tests/unit_tests/ops/data/awq/scales.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_input.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_input.pt new file mode 100644 index 00000000..ab392b5f Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_input.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_output.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_output.pt new file mode 100644 index 00000000..56e1062d Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_output.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight.pt new file mode 100644 index 00000000..5e652925 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight_scale.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight_scale.pt new file mode 100644 index 00000000..d3f24ff8 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_w8a8fp8_weight_scale.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_input.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_input.pt new file mode 100644 index 00000000..8d08d55a Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_input.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_output.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_output.pt new file mode 100644 index 00000000..0df01685 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_output.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_packed.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_packed.pt new file mode 100644 index 00000000..09641c69 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_packed.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_scale.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_scale.pt new file mode 100644 index 00000000..1beecf25 Binary files /dev/null and 
b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_scale.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_zero_point.pt b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_zero_point.pt new file mode 100644 index 00000000..69384ea1 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/linear_wna16_weight_zero_point.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_hidden_states.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_hidden_states.pt new file mode 100644 index 00000000..cac9e0a8 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_hidden_states.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_router_logits.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_router_logits.pt new file mode 100644 index 00000000..58ab4c17 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_input_router_logits.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_output.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_output.pt new file mode 100644 index 00000000..8ef218bd Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_output.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_packed.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_packed.pt new file mode 100644 index 00000000..36abfd14 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_packed.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_scale.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_scale.pt new file mode 100644 index 00000000..42633606 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w13_weight_scale.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_packed.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_packed.pt new file mode 100644 index 00000000..e7fdd634 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_packed.pt differ diff --git a/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_scale.pt b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_scale.pt new file mode 100644 index 00000000..050acf67 Binary files /dev/null and b/tests/unit_tests/ops/data/compressed_tensors/moe_wna16_w2_weight_scale.pt differ diff --git a/tests/unit_tests/ops/data/fp8/linear_input.pt b/tests/unit_tests/ops/data/fp8/linear_input.pt new file mode 100644 index 00000000..03fed11c Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/linear_input.pt differ diff --git a/tests/unit_tests/ops/data/fp8/linear_output.pt b/tests/unit_tests/ops/data/fp8/linear_output.pt new file mode 100644 index 00000000..104d98c3 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/linear_output.pt differ diff --git a/tests/unit_tests/ops/data/fp8/linear_weight.pt b/tests/unit_tests/ops/data/fp8/linear_weight.pt new file mode 100644 index 00000000..48c6f935 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/linear_weight.pt differ diff --git a/tests/unit_tests/ops/data/fp8/linear_weight_scale_inv.pt b/tests/unit_tests/ops/data/fp8/linear_weight_scale_inv.pt new file mode 100644 index 00000000..3644d0cc Binary 
files /dev/null and b/tests/unit_tests/ops/data/fp8/linear_weight_scale_inv.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_input_hidden_states.pt b/tests/unit_tests/ops/data/fp8/moe_input_hidden_states.pt new file mode 100644 index 00000000..2adb8031 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_input_hidden_states.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_input_router_logits.pt b/tests/unit_tests/ops/data/fp8/moe_input_router_logits.pt new file mode 100644 index 00000000..01ded36f Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_input_router_logits.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_output.pt b/tests/unit_tests/ops/data/fp8/moe_output.pt new file mode 100644 index 00000000..4652f3c0 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_output.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_w13_weight.pt b/tests/unit_tests/ops/data/fp8/moe_w13_weight.pt new file mode 100644 index 00000000..4345f6f6 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_w13_weight.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_w13_weight_scale_inv.pt b/tests/unit_tests/ops/data/fp8/moe_w13_weight_scale_inv.pt new file mode 100644 index 00000000..409c1cf8 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_w13_weight_scale_inv.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_w2_weight.pt b/tests/unit_tests/ops/data/fp8/moe_w2_weight.pt new file mode 100644 index 00000000..55a919b4 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_w2_weight.pt differ diff --git a/tests/unit_tests/ops/data/fp8/moe_w2_weight_scale_inv.pt b/tests/unit_tests/ops/data/fp8/moe_w2_weight_scale_inv.pt new file mode 100644 index 00000000..ef394af6 Binary files /dev/null and b/tests/unit_tests/ops/data/fp8/moe_w2_weight_scale_inv.pt differ diff --git a/tests/unit_tests/ops/data/gptq/input.pt b/tests/unit_tests/ops/data/gptq/input.pt new file mode 100644 index 00000000..4bbfd6a8 Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/input.pt differ diff --git a/tests/unit_tests/ops/data/gptq/output.pt b/tests/unit_tests/ops/data/gptq/output.pt new file mode 100644 index 00000000..88c44aa1 Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/output.pt differ diff --git a/tests/unit_tests/ops/data/gptq/qweight.pt b/tests/unit_tests/ops/data/gptq/qweight.pt new file mode 100644 index 00000000..eab3b77f Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/qweight.pt differ diff --git a/tests/unit_tests/ops/data/gptq/qzeros.pt b/tests/unit_tests/ops/data/gptq/qzeros.pt new file mode 100644 index 00000000..cd4bc995 Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/qzeros.pt differ diff --git a/tests/unit_tests/ops/data/gptq/scales.pt b/tests/unit_tests/ops/data/gptq/scales.pt new file mode 100644 index 00000000..aedc48de Binary files /dev/null and b/tests/unit_tests/ops/data/gptq/scales.pt differ diff --git a/tests/unit_tests/ops/test_hpu_awq.py b/tests/unit_tests/ops/test_hpu_awq.py new file mode 100644 index 00000000..5ddd7970 --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_awq.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import habana_frameworks.torch as htorch +from utils import get_data_path +from vllm_gaudi.ops.hpu_awq import AWQHPULinearMethod, AWQHPUConfig +from vllm_gaudi.utils import HPUCompileConfig +from vllm.model_executor.layers.linear import 
RowParallelLinear + + +def test_awq_linear_method(dist_init): + config = {"bits": 4, "group_size": 128, "zero_point": True} + oot_quant_config = AWQHPUConfig.from_config(config) + + # Prepare linear layer with oot AWQHPULinearMethod + oot_op = RowParallelLinear(input_size=256, + output_size=128, + bias=False, + input_is_parallel=True, + skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, AWQHPULinearMethod) + + # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-AWQ + # (with adjusted shape, to make tensors smaller) + qweight = torch.load(get_data_path("data/awq/qweight.pt"), weights_only=False, map_location="hpu") + oot_op.qweight.copy_(qweight) + qzeros = torch.load(get_data_path("data/awq/qzeros.pt"), weights_only=False, map_location="hpu") + oot_op.qzeros.copy_(qzeros) + scales = torch.load(get_data_path("data/awq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16) + oot_op.scales.copy_(scales) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds the data that was returned by cuda implementation of AWQLinearMethod for given input + # (AWQLinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/awq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16) + ref_output = torch.load(get_data_path("data/awq/output.pt"), weights_only=False, + map_location="hpu").to(torch.bfloat16) + + # Execute layer + out = oot_op(input) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) diff --git a/tests/unit_tests/ops/test_hpu_compressed_tensors.py b/tests/unit_tests/ops/test_hpu_compressed_tensors.py new file mode 100644 index 00000000..301640a6 --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_compressed_tensors.py @@ -0,0 +1,304 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import habana_frameworks.torch as htorch +from utils import get_data_path +from unittest.mock import MagicMock +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import CompressedTensorsConfig +from vllm_gaudi.ops.hpu_compressed_tensors import (HPUCompressedTensorsLinearMethod, HPUCompressedTensorsW8A8Fp8, + HPUCompressedTensorsWNA16, HPUCompressedTensorsWNA16MoEMethod) +from vllm_gaudi.utils import HPUCompileConfig +from vllm.forward_context import override_forward_context +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +def test_compressed_tensors_linear_method_w8a8fp8(dist_init): + config = { + 'config_groups': { + 'group_0': { + 'input_activations': { + 'block_structure': None, + 'dynamic': True, + 'group_size': None, + 'num_bits': 8, + 'observer': 'memoryless', + 'observer_kwargs': {}, + 'strategy': 'token', + 'symmetric': True, + 'type': 'float' + }, + 'output_activations': None, + 'targets': ['Linear'], + 'weights': { + 'block_structure': None, + 'dynamic': False, + 'group_size': None, + 'num_bits': 8, + 'observer': 'minmax', + 'observer_kwargs': {}, + 'strategy': 'channel', + 
'symmetric': True, + 'type': 'float' + } + } + }, + 'format': 'naive-quantized', + 'global_compression_ratio': 1.239290831149584, + 'ignore': [], + 'kv_cache_scheme': None, + 'quant_method': 'compressed-tensors', + 'quantization_status': 'frozen' + } + oot_quant_config = CompressedTensorsConfig.from_config(config) + + # Prepare linear layer with oot CompressedTensorsLinearMethod + # with HPUCompressedTensorsW8A8Fp8 scheme + oot_op = RowParallelLinear(input_size=256, + output_size=256, + bias=False, + input_is_parallel=True, + skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, HPUCompressedTensorsLinearMethod) + assert isinstance(oot_op.scheme, HPUCompressedTensorsW8A8Fp8) + + # Weight and weight_scale_inv were extracted from first RowParallelLinear + # layer of RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic + # (with adjusted shapes, to make tensors smaller) + weight = torch.load(get_data_path("data/compressed_tensors/linear_w8a8fp8_weight.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight.copy_(weight) + weight_scale = torch.load(get_data_path("data/compressed_tensors/linear_w8a8fp8_weight_scale.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_scale.copy_(weight_scale) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds data that was returned by cuda impl of CompressedTensorsLinearMethod for given input + # (CompressedTensorsLinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/compressed_tensors/linear_w8a8fp8_input.pt"), + weights_only=False, + map_location="hpu") + ref_output = torch.load(get_data_path("data/compressed_tensors/linear_w8a8fp8_output.pt"), + weights_only=False, + map_location="hpu") + + # Execute layer + out = oot_op(input) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) + + +def test_compressed_tensors_linear_method_wna16(dist_init): + config = { + 'config_groups': { + 'group_0': { + 'input_activations': None, + 'output_activations': None, + 'targets': ['Linear'], + 'weights': { + 'actorder': 'weight', + 'block_structure': None, + 'dynamic': False, + 'group_size': 128, + 'num_bits': 4, + 'observer': 'minmax', + 'observer_kwargs': {}, + 'strategy': 'group', + 'symmetric': False, + 'type': 'int' + } + } + }, + 'format': 'pack-quantized', + 'global_compression_ratio': None, + 'ignore': [], + 'kv_cache_scheme': None, + 'quant_method': 'compressed-tensors', + 'quantization_status': 'compressed' + } + oot_quant_config = CompressedTensorsConfig.from_config(config) + + # Prepare linear layer with oot CompressedTensorsLinearMethod + # with HPUCompressedTensorsWNA16 scheme + oot_op = RowParallelLinear(input_size=256, + output_size=256, + bias=False, + input_is_parallel=True, + skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, HPUCompressedTensorsLinearMethod) + assert isinstance(oot_op.scheme, HPUCompressedTensorsWNA16) + + # Weights were extracted from first RowParallelLinear layer of RedHatAI/Qwen3-8B-quantized.w4a16 + # 
(with adjusted shapes, to make tensors smaller) + weight_packed = torch.load(get_data_path("data/compressed_tensors/linear_wna16_weight_packed.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_packed.copy_(weight_packed) + weight_scale = torch.load(get_data_path("data/compressed_tensors/linear_wna16_weight_scale.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_scale.copy_(weight_scale) + weight_zero_point = torch.load(get_data_path("data/compressed_tensors/linear_wna16_weight_zero_point.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_zero_point.copy_(weight_zero_point) + oot_op.weight_shape.data = torch.tensor([256, 256], device='hpu:0') + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds data that was returned by cuda impl of CompressedTensorsLinearMethod for given input + # (CompressedTensorsLinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/compressed_tensors/linear_wna16_input.pt"), + weights_only=False, + map_location="hpu") + ref_output = torch.load(get_data_path("data/compressed_tensors/linear_wna16_output.pt"), + weights_only=False, + map_location="hpu") + + # Execute layer + out = oot_op(input) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) + + +def test_compressed_tensors_wna16_moe_method(dist_init): + config = { + 'config_groups': { + 'group_0': { + 'input_activations': None, + 'output_activations': None, + 'targets': ['Linear'], + 'weights': { + 'actorder': 'weight', + 'block_structure': None, + 'dynamic': False, + 'group_size': 128, + 'num_bits': 4, + 'observer': 'minmax', + 'observer_kwargs': {}, + 'strategy': 'group', + 'symmetric': True, + 'type': 'int' + } + } + }, + 'format': 'pack-quantized', + 'global_compression_ratio': None, + 'ignore': [], + 'kv_cache_scheme': None, + 'quant_method': 'compressed-tensors', + 'quantization_status': 'compressed' + } + oot_quant_config = CompressedTensorsConfig.from_config(config) + + # Prepare FusedMoE layer with oot HPUCompressedTensorsWNA16MoEMethod + oot_op = FusedMoE(num_experts=128, + top_k=8, + hidden_size=512, + intermediate_size=256, + params_dtype=torch.bfloat16, + reduce_results=True, + renormalize=True, + use_grouped_topk=False, + num_expert_group=None, + topk_group=None, + quant_config=oot_quant_config, + tp_size=None, + ep_size=None, + dp_size=None, + custom_routing_function=None, + scoring_func="softmax", + routed_scaling_factor=1.0, + e_score_correction_bias=None, + apply_router_weight_on_input=False, + activation="silu", + enable_eplb=False, + num_redundant_experts=0, + has_bias=False, + is_sequence_parallel=False, + zero_expert_num=0, + zero_expert_type=None).to("hpu") + assert isinstance(oot_op.quant_method, HPUCompressedTensorsWNA16MoEMethod) + + # Weights were extracted from first FusedMoE layer of RedHatAI/Qwen3-30B-A3B-quantized.w4a16 + # (with adjusted shapes, to make tensors smaller) + w2_weight_packed = torch.load(get_data_path("data/compressed_tensors/moe_wna16_w2_weight_packed.pt"), + weights_only=False, + map_location="hpu") + w2_weight_packed = torch.swapaxes(w2_weight_packed, 0, 1).repeat(128, 1, 1) + oot_op.w2_weight_packed.copy_(w2_weight_packed) + w13_weight_packed = 
torch.load(get_data_path("data/compressed_tensors/moe_wna16_w13_weight_packed.pt"), + weights_only=False, + map_location="hpu") + w13_weight_packed = torch.swapaxes(w13_weight_packed, 0, 1).repeat(128, 1, 1) + oot_op.w13_weight_packed.copy_(w13_weight_packed) + + w2_weight_scale = torch.load(get_data_path("data/compressed_tensors/moe_wna16_w2_weight_scale.pt"), + weights_only=False, + map_location="hpu") + w2_weight_scale = torch.swapaxes(w2_weight_scale, 0, 1).repeat(128, 1, 1) + oot_op.w2_weight_scale.copy_(w2_weight_scale) + w13_weight_scale = torch.load(get_data_path("data/compressed_tensors/moe_wna16_w13_weight_scale.pt"), + weights_only=False, + map_location="hpu") + w13_weight_scale = torch.swapaxes(w13_weight_scale, 0, 1).repeat(128, 1, 1) + oot_op.w13_weight_scale.copy_(w13_weight_scale) + + w2_weight_shape = torch.tensor([512, 256], dtype=torch.bfloat16, device="hpu") + oot_op.w2_weight_shape.copy_(w2_weight_shape.repeat(128, 1)) + w13_weight_shape = torch.tensor([256, 512], dtype=torch.bfloat16, device="hpu") + oot_op.w13_weight_shape.copy_(w13_weight_shape.repeat(128, 1)) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds data that was returned by cuda impl of CompressedTensorsWNA16MarlinMoEMethod for given input + # (CompressedTensorsWNA16MarlinMoEMethod was triggered offline with the same input as below to get the ref_output) + hidden_states = torch.load(get_data_path("data/compressed_tensors/moe_wna16_input_hidden_states.pt"), + weights_only=False, + map_location="hpu") + router_logits = torch.load(get_data_path("data/compressed_tensors/moe_wna16_input_router_logits.pt"), + weights_only=False, + map_location="hpu") + ref_output = torch.load(get_data_path("data/compressed_tensors/moe_wna16_output.pt"), + weights_only=False, + map_location="hpu") + + # Execute layer + mock_ctx = MagicMock(spec=["dp_metadata"]) + mock_ctx.dp_metadata = None + with override_forward_context(mock_ctx): + out = oot_op.forward_impl(hidden_states, router_logits) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-4, rtol=1e-4) diff --git a/tests/unit_tests/ops/test_hpu_fp8.py b/tests/unit_tests/ops/test_hpu_fp8.py new file mode 100644 index 00000000..f643b918 --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_fp8.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import habana_frameworks.torch as htorch +from utils import get_data_path +from unittest.mock import MagicMock +from vllm_gaudi.ops.hpu_fp8 import Fp8LinearMethod, HPUFp8MoEMethod +from vllm_gaudi.utils import HPUCompileConfig +from vllm.forward_context import override_forward_context +from vllm.model_executor.layers.quantization.fp8 import Fp8Config +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +def test_fp8_linear_method(dist_init, monkeypatch): + monkeypatch.setenv("VLLM_HPU_FORCE_CHANNEL_FP8", "0") + config = {'activation_scheme': 'dynamic', 'fmt': 'e4m3', 'quant_method': 'fp8', 'weight_block_size': [128, 128]} + oot_quant_config = Fp8Config.from_config(config) + + # Prepare linear layer with oot Fp8LinearMethod + oot_op = RowParallelLinear(input_size=256, + output_size=256, + bias=False, + input_is_parallel=True, + 
skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, Fp8LinearMethod) + + # Weight and weight_scale_inv were extracted from first RowParallelLinear layer of Qwen/Qwen3-8B-FP8 + # (with adjusted shapes, to make tensors smaller) + weight = torch.load(get_data_path("data/fp8/linear_weight.pt"), weights_only=False, map_location="hpu") + oot_op.weight.copy_(weight) + weight_scale_inv = torch.load(get_data_path("data/fp8/linear_weight_scale_inv.pt"), + weights_only=False, + map_location="hpu") + oot_op.weight_scale_inv.copy_(weight_scale_inv) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + # Setting fullgraph to False, because currently there is a graph break + compile_config = HPUCompileConfig(fullgraph=False) + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds the data that was returned by cuda implementation of Fp8LinearMethod for given input + # (Fp8LinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/fp8/linear_input.pt"), weights_only=False, map_location="hpu") + ref_output = torch.load(get_data_path("data/fp8/linear_output.pt"), weights_only=False, map_location="hpu") + + # Execute layer + out = oot_op(input) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) + + +def test_fp8_moe_method(dist_init, monkeypatch): + monkeypatch.setenv("VLLM_HPU_FORCE_CHANNEL_FP8", "0") + config = { + 'activation_scheme': 'dynamic', + 'modules_to_not_convert': [], + 'fmt': 'e4m3', + 'quant_method': 'fp8', + 'weight_block_size': [128, 128] + } + oot_quant_config = Fp8Config.from_config(config) + + # Prepare FusedMoE layer with oot HPUFp8MoEMethod + oot_op = FusedMoE(num_experts=128, + top_k=8, + hidden_size=512, + intermediate_size=256, + params_dtype=torch.bfloat16, + reduce_results=True, + renormalize=True, + use_grouped_topk=False, + num_expert_group=None, + topk_group=None, + quant_config=oot_quant_config, + tp_size=None, + ep_size=None, + dp_size=None, + custom_routing_function=None, + scoring_func="softmax", + routed_scaling_factor=1.0, + e_score_correction_bias=None, + apply_router_weight_on_input=False, + activation="silu", + enable_eplb=False, + num_redundant_experts=0, + has_bias=False, + is_sequence_parallel=False, + zero_expert_num=0, + zero_expert_type=None).to("hpu") + assert isinstance(oot_op.quant_method, HPUFp8MoEMethod) + + # Weights were extracted from first FusedMoE layer of Qwen/Qwen3-30B-A3B-FP8 + # (with adjusted shapes, to make tensors smaller) + w13_weight = torch.load(get_data_path("data/fp8/moe_w13_weight.pt"), weights_only=False, map_location="hpu") + oot_op.w13_weight.copy_(w13_weight.repeat(128, 1, 1)) + w13_weight_scale_inv = torch.load(get_data_path("data/fp8/moe_w13_weight_scale_inv.pt"), + weights_only=False, + map_location="hpu") + oot_op.w13_weight_scale_inv.copy_(w13_weight_scale_inv.repeat(128, 1, 1)) + w2_weight = torch.load(get_data_path("data/fp8/moe_w2_weight.pt"), weights_only=False, map_location="hpu") + oot_op.w2_weight.copy_(w2_weight.repeat(128, 1, 1)) + w2_weight_scale_inv = torch.load(get_data_path("data/fp8/moe_w2_weight_scale_inv.pt"), + weights_only=False, + map_location="hpu") + oot_op.w2_weight_scale_inv.copy_(w2_weight_scale_inv.repeat(128, 1, 1)) + + 
oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds the data that was returned by cuda implementation of Fp8MoEMethod for given input + # (Fp8MoEMethod was triggered offline with the same input as below to get the ref_output) + hidden_states = torch.load(get_data_path("data/fp8/moe_input_hidden_states.pt"), + weights_only=False, + map_location="hpu") + router_logits = torch.load(get_data_path("data/fp8/moe_input_router_logits.pt"), + weights_only=False, + map_location="hpu") + ref_output = torch.load(get_data_path("data/fp8/moe_output.pt"), weights_only=False, map_location="hpu") + + # Execute layer + mock_ctx = MagicMock(spec=["dp_metadata"]) + mock_ctx.dp_metadata = None + with override_forward_context(mock_ctx): + out = oot_op.forward_impl(hidden_states, router_logits) + + # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) diff --git a/tests/unit_tests/ops/test_hpu_gptq.py b/tests/unit_tests/ops/test_hpu_gptq.py new file mode 100644 index 00000000..88818e6b --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_gptq.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import habana_frameworks.torch as htorch +from utils import get_data_path +from vllm_gaudi.ops.hpu_gptq import GPTQHPULinearMethod, GPTQHPUConfig +from vllm_gaudi.utils import HPUCompileConfig +from vllm.model_executor.layers.linear import RowParallelLinear + + +def test_gptq_linear_method(dist_init): + config = {"bits": 4, "group_size": 128, "desc_act": False, "lm_head": False} + oot_quant_config = GPTQHPUConfig.from_config(config) + + # Prepare linear layer with oot GPTQHPULinearMethod + oot_op = RowParallelLinear(input_size=256, + output_size=8, + bias=False, + input_is_parallel=True, + skip_bias_add=False, + params_dtype=torch.bfloat16, + reduce_results=True, + quant_config=oot_quant_config, + return_bias=False, + disable_tp=False).to("hpu") + assert isinstance(oot_op.quant_method, GPTQHPULinearMethod) + + # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-GPTQ + # (with adjusted shape, to make tensors smaller) + qweight = torch.load(get_data_path("data/gptq/qweight.pt"), weights_only=False, map_location="hpu") + oot_op.qweight.copy_(qweight) + qzeros = torch.load(get_data_path("data/gptq/qzeros.pt"), weights_only=False, map_location="hpu") + oot_op.qzeros.copy_(qzeros) + scales = torch.load(get_data_path("data/gptq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16) + oot_op.scales.copy_(scales) + + oot_op.quant_method.process_weights_after_loading(oot_op) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_op = torch.compile(oot_op, **compile_config.get_compile_args()) + + # Input and expected output + # Output tensor holds the data that was returned by cuda implementation of GPTQLinearMethod for given input + # (GPTQLinearMethod was triggered offline with the same input as below to get the ref_output) + input = torch.load(get_data_path("data/gptq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16) + ref_output = torch.load(get_data_path("data/gptq/output.pt"), weights_only=False, + map_location="hpu").to(torch.bfloat16) + + # Execute layer + out = oot_op(input) + 
+ # Check correctness + torch.testing.assert_close(ref_output, out, atol=1e-3, rtol=1e-3) diff --git a/tests/unit_tests/ops/test_hpu_layernorm.py b/tests/unit_tests/ops/test_hpu_layernorm.py new file mode 100644 index 00000000..c0920a23 --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_layernorm.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch +import habana_frameworks.torch as htorch +from utils import temporary_op_registry_oot, register_op +from vllm_gaudi.ops.hpu_layernorm import HPURMSNorm +from vllm_gaudi.utils import HPUCompileConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.platforms import current_platform + +DTYPES = [torch.bfloat16, torch.float] +NUM_TOKENS = [7, 83, 4096] +HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199] +ADD_RESIDUAL = [False, True] +DEVICE = [current_platform.device_type] +IS_STRIDED = [False, True] + + +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("add_residual", ADD_RESIDUAL) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICE) +@pytest.mark.parametrize("strided_input", IS_STRIDED) +def test_rms_norm( + num_tokens: int, + hidden_size: int, + add_residual: bool, + dtype: torch.dtype, + device: str, + strided_input: bool, +) -> None: + with temporary_op_registry_oot(): + # prepare native RMSNorm module + native_rms_norm = RMSNorm(hidden_size=hidden_size, eps=1e-05) + native_rms_norm = native_rms_norm.to(dtype=dtype).to(device) + native_rms_norm.weight.data.normal_(mean=1.0, std=0.1) + assert isinstance(native_rms_norm, RMSNorm) and not isinstance(native_rms_norm, HPURMSNorm) + + # Prepare oot HPURMSNorm module + register_op(RMSNorm, HPURMSNorm) + oot_rms_norm = RMSNorm(hidden_size=hidden_size, eps=1e-05) + oot_rms_norm = oot_rms_norm.to(dtype=dtype).to(device) + oot_rms_norm.weight.data = native_rms_norm.weight.data.clone() + assert isinstance(oot_rms_norm, RMSNorm) and isinstance(oot_rms_norm, HPURMSNorm) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_rms_norm = torch.compile(oot_rms_norm, **compile_config.get_compile_args()) + + # Prepare input data + scale = 1 / (2 * hidden_size) + last_dim = 2 * hidden_size if strided_input else hidden_size + x = torch.randn(num_tokens, last_dim, dtype=dtype, device=device) + x = x[..., :hidden_size] + assert x.is_contiguous() != strided_input + x *= scale + residual = torch.randn_like(x) * scale if add_residual else None + + # Execute layers + ref_out = native_rms_norm(x, residual) + out = oot_rms_norm(x, residual) + + # Check correctness + if add_residual: + torch.testing.assert_close(out[0], ref_out[0], atol=1e-2, rtol=1e-2) + torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2) + else: + torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) diff --git a/tests/unit_tests/ops/test_hpu_rotary_embedding.py b/tests/unit_tests/ops/test_hpu_rotary_embedding.py new file mode 100644 index 00000000..b26e6ace --- /dev/null +++ b/tests/unit_tests/ops/test_hpu_rotary_embedding.py @@ -0,0 +1,389 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch +import habana_frameworks.torch as htorch +from typing import NamedTuple +from utils import temporary_op_registry_oot, register_op +from 
transformers.models.auto.configuration_auto import AutoConfig +from vllm_gaudi.utils import HPUCompileConfig +from vllm_gaudi.ops.hpu_rotary_embedding import (HPURotaryEmbedding, HPULinearScalingRotaryEmbedding, + HPUDynamicNTKScalingRotaryEmbedding, HPUYaRNScalingRotaryEmbedding, + HPUDeepseekScalingRotaryEmbedding, HPULlama3RotaryEmbedding, + HPUPhi3LongRoPEScaledRotaryEmbedding, HPULlama4VisionRotaryEmbedding, + HPUMRotaryEmbedding) +from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, LinearScalingRotaryEmbedding, + DynamicNTKScalingRotaryEmbedding, YaRNScalingRotaryEmbedding, + DeepseekScalingRotaryEmbedding, Llama3RotaryEmbedding, + Phi3LongRoPEScaledRotaryEmbedding, Llama4VisionRotaryEmbedding, + MRotaryEmbedding) + +# General settings +HIDDEN_SIZES = [4096] +SEQ_LENGTHS = [4096] +HEAD_SIZES = [32, 128, 512, 1024] +ROTARY_DIMS = [8, 32] +MAX_POSITION_EMBEDDINGS = [131072] +BASES = [500000.0] +IS_NEOX_STYLE = [False, True] +SCALING_FACTORS = [1.0, 2.0, 4.0, 8.0] +SCALING_FACTORS_WITH_LIST = [1.0, 2.0, 4.0, 8.0, [2.0, 4.0]] + +# Vision model settings +IMAGE_SIZE = 336 +PATCH_SIZE = 14 +VISION_MAX_POSITION_EMBEDDINGS = [(IMAGE_SIZE // PATCH_SIZE)**2] +VISION_SEQ_LENGTHS = [x + 1 for x in VISION_MAX_POSITION_EMBEDDINGS] + + +class RotaryData(NamedTuple): + """ + Data structure for rotary embedding test parameters. + """ + cls: type + dtype: torch.dtype + device: str + + +def run_rotary_embedding_test(native_rotary_data: RotaryData, oot_rotary_data: RotaryData, seq_length: int, + hidden_size: int, **kwargs) -> None: + """ + Common code for running rotary embedding tests. It compares the output of + the native operator with that of the out-of-tree custom operator. It allows + specifying a separate device for the native operator and the custom operator, + because e.g. the native Llama4VisionRotaryEmbedding cannot be + used on HPU as it relies on a complex dtype. The same applies to the dtype.
+ """ + with temporary_op_registry_oot(): + # prepare native RotaryEmbedding module + with torch.device(native_rotary_data.device): + kwargs["dtype"] = native_rotary_data.dtype + native_rotary_embedding = native_rotary_data.cls(**kwargs) + assert isinstance(native_rotary_embedding, + native_rotary_data.cls) and not isinstance(native_rotary_embedding, oot_rotary_data.cls) + + # Prepare oot RotaryEmbedding module + with torch.device(oot_rotary_data.device): + register_op(native_rotary_data.cls, oot_rotary_data.cls) + kwargs["dtype"] = oot_rotary_data.dtype + oot_rotary_embedding = native_rotary_data.cls(**kwargs) # Use native as it was registered above + assert isinstance(oot_rotary_embedding, native_rotary_data.cls) and isinstance( + oot_rotary_embedding, oot_rotary_data.cls) + + if not htorch.utils.internal.is_lazy(): + compile_config = HPUCompileConfig() + oot_rotary_embedding = torch.compile(oot_rotary_embedding, **compile_config.get_compile_args()) + + # Prepare input data + positions = torch.randint(high=seq_length, + size=(1, seq_length), + dtype=torch.int32, + device=native_rotary_data.device) + query = torch.randn(1, seq_length, hidden_size, dtype=torch.bfloat16, device=native_rotary_data.device) + key = torch.randn(1, seq_length, hidden_size, dtype=torch.bfloat16, device=native_rotary_data.device) + if native_rotary_data.cls in (Llama4VisionRotaryEmbedding, DeepseekScalingRotaryEmbedding): + query = query.view(query.shape[0], query.shape[1], -1, kwargs["head_size"]) + key = key.view(key.shape[0], key.shape[1], -1, kwargs["head_size"]) + + # Execute layers + with torch.device(native_rotary_data.device): + if native_rotary_data.cls in (Llama4VisionRotaryEmbedding, ): + ref_query_out, ref_key_out = native_rotary_embedding(query, key) + elif native_rotary_data.cls in (MRotaryEmbedding, ): + ref_query_out, ref_key_out = native_rotary_embedding(positions.flatten(), query, key) + else: + ref_query_out, ref_key_out = native_rotary_embedding(positions, query, key) + + if native_rotary_data.device != oot_rotary_data.device: + positions = positions.to(oot_rotary_data.device) + query = query.to(oot_rotary_data.device) + key = key.to(oot_rotary_data.device) + + with torch.device(oot_rotary_data.device): + if native_rotary_data.cls in (Llama4VisionRotaryEmbedding, ): + query_out, key_out = oot_rotary_embedding(query, key) + else: + query_out, key_out = oot_rotary_embedding(positions, query, key) + + # Check correctness + if native_rotary_data.device != oot_rotary_data.device: + ref_query_out = ref_query_out.to("cpu") + query_out = query_out.to("cpu") + ref_key_out = ref_key_out.to("cpu") + key_out = key_out.to("cpu") + torch.testing.assert_close(query_out, ref_query_out, atol=1e-2, rtol=1e-2) + torch.testing.assert_close(key_out, ref_key_out, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +def test_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": 
is_neox_style, + } + native_rotary_data = RotaryData(cls=RotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPURotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factors", SCALING_FACTORS_WITH_LIST) +def test_linear_scaling_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factors: float | list[float], +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "scaling_factors": scaling_factors, + } + native_rotary_data = RotaryData(cls=LinearScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPULinearScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factor", SCALING_FACTORS) +def test_dynamic_ntk_scaling_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "scaling_factor": scaling_factor, + } + native_rotary_data = RotaryData(cls=DynamicNTKScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUDynamicNTKScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factor", SCALING_FACTORS) +def test_yarn_scaling_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base,
+ "is_neox_style": is_neox_style, + "scaling_factor": scaling_factor, + } + native_rotary_data = RotaryData(cls=YaRNScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUYaRNScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factor", SCALING_FACTORS) +def test_deepseek_scaling_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": head_size, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "scaling_factor": scaling_factor, + } + native_rotary_data = RotaryData(cls=DeepseekScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUDeepseekScalingRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("scaling_factor", SCALING_FACTORS) +def test_llama3_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "scaling_factor": scaling_factor, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + "orig_max_position": 8192 + } + native_rotary_data = RotaryData(cls=Llama3RotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPULlama3RotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.skip(reason="Phi3LongRoPEScaledRotaryEmbedding currently does not inherit CustomOp") +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", [True]) +def test_phi3_long_rope_scaled_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, +) -> None: + config = AutoConfig.from_pretrained("microsoft/Phi-4-mini-instruct") + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "original_max_position_embeddings": config.original_max_position_embeddings, + "short_factor": config.rope_scaling["short_factor"], + "long_factor": config.rope_scaling["long_factor"], + } + native_rotary_data = RotaryData(cls=Phi3LongRoPEScaledRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUPhi3LongRoPEScaledRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", VISION_SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("max_position_embeddings", VISION_MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +def test_llama4_vision_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, +) -> None: + rotary_dim = int(hidden_size // (hidden_size / head_size) // 2) + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + } + native_rotary_data = RotaryData(cls=Llama4VisionRotaryEmbedding, dtype=torch.complex64, device="cpu") + oot_rotary_data = RotaryData(cls=HPULlama4VisionRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) + + +@pytest.mark.parametrize("seq_length", SEQ_LENGTHS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("max_position_embeddings", MAX_POSITION_EMBEDDINGS) +@pytest.mark.parametrize("base", BASES) +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +def test_m_rotary_embedding( + seq_length: int, + hidden_size: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, +) -> None: + kwargs = { + "head_size": head_size, + "rotary_dim": rotary_dim, + "max_position_embeddings": max_position_embeddings, + "base": base, + "is_neox_style": is_neox_style, + "mrope_section": [rotary_dim // 2] + } + native_rotary_data = RotaryData(cls=MRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + oot_rotary_data = RotaryData(cls=HPUMRotaryEmbedding, dtype=torch.bfloat16, device="hpu") + run_rotary_embedding_test(native_rotary_data, oot_rotary_data, seq_length, hidden_size, **kwargs) diff --git a/tests/unit_tests/ops/utils.py b/tests/unit_tests/ops/utils.py new file mode 100644 index 00000000..fd4aeb21 --- /dev/null +++ b/tests/unit_tests/ops/utils.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import contextlib +from vllm.model_executor.custom_op import CustomOp + + +@contextlib.contextmanager +def temporary_op_registry_oot(): + """ + Context manager that temporarily modifies the op registry content. + It clears the current op_registry_oot and restores its content on exit. + It is useful for testing purposes, e.g. to deregister the hpu version + of an op
(because a registration made in one test would otherwise + remain in effect in every other test). + """ + old_registry = CustomOp.op_registry_oot + CustomOp.op_registry_oot = {} + try: + yield + finally: + CustomOp.op_registry_oot = old_registry + + +def register_op(base_cls, oot_cls): + """ + Manually registers an oot op. It should be used + within the temporary_op_registry_oot context manager. + """ + CustomOp.op_registry_oot[base_cls.__name__] = oot_cls + + +def get_data_path(filename): + return os.path.join(os.path.dirname(__file__), filename) diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py index f2ae8aa5..648877d9 100644 --- a/vllm_gaudi/ops/hpu_fp8.py +++ b/vllm_gaudi/ops/hpu_fp8.py @@ -2,7 +2,6 @@ import torch from vllm_gaudi import envs -from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.quantization import fp8 @@ -59,7 +58,6 @@ def dequant_fp8_weight(self, layer) -> torch.Tensor: return dequant_weight -@CustomOp.register_oot(name='Fp8MoEMethod') class HPUFp8MoEMethod(Fp8MoEMethod): def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):