diff --git a/ads/aqua/cli.py b/ads/aqua/cli.py
index 9fbc83a68..34b028db7 100644
--- a/ads/aqua/cli.py
+++ b/ads/aqua/cli.py
@@ -15,6 +15,7 @@
 from ads.aqua.model import AquaModelApp
 from ads.aqua.modeldeployment import AquaDeploymentApp
 from ads.aqua.verify_policies import AquaVerifyPoliciesApp
+from ads.aqua.shaperecommend.recommend import AquaRecommendApp
 from ads.common.utils import LOG_LEVELS
 
@@ -31,6 +32,7 @@ class AquaCommand:
     deployment = AquaDeploymentApp
     evaluation = AquaEvaluationApp
     verify_policies = AquaVerifyPoliciesApp
+    recommend = AquaRecommendApp
 
     def __init__(
         self,
@@ -96,18 +98,20 @@ def _validate_value(flag, value):
                 "If you intend to chain a function call to the result, please separate the "
                 "flag and the subsequent function call with separator `-`."
             )
-    
+
     @staticmethod
     def install():
         """Install the ADS Aqua Extension from a wheel file. Set the environment
         variable `AQUA_EXTENSTION_PATH` to change the wheel file path.
 
-        Return 
+        Returns
        -------
        int:
            Installation status.
        """
        import subprocess
 
-        wheel_file_path = os.environ.get("AQUA_EXTENSTION_PATH", "/ads/extension/adsjupyterlab_aqua_extension*.whl")
-        status = subprocess.run(f"pip install {wheel_file_path}",shell=True)
-        return status.check_returncode
\ No newline at end of file
+        wheel_file_path = os.environ.get(
+            "AQUA_EXTENSTION_PATH", "/ads/extension/adsjupyterlab_aqua_extension*.whl"
+        )
+        status = subprocess.run(f"pip install {wheel_file_path}", shell=True)
+        return status.check_returncode()
diff --git a/ads/aqua/common/entities.py b/ads/aqua/common/entities.py
index 13203e34b..27262df26 100644
--- a/ads/aqua/common/entities.py
+++ b/ads/aqua/common/entities.py
@@ -46,6 +46,17 @@ class Config:
         arbitrary_types_allowed = True
         protected_namespaces = ()
 
+class ComputeRank(Serializable):
+    """
+    Represents the relative cost and performance ranking of a compute shape.
+    """
+    cost: int = Field(
+        None, description="The relative cost rank of the shape, ranging from 10 (most cost-effective) to 100 (most expensive)."
+    )
+
+    performance: int = Field(
+        None, description="The relative performance rank of the shape, ranging from 10 (lowest performance) to 110 (highest performance)."
+    )
 
 class GPUSpecs(Serializable):
     """
     Describes the GPU specifications for a compute shape.
@@ -61,6 +72,12 @@ class GPUSpecs(Serializable):
     gpu_type: Optional[str] = Field(
         default=None, description="The type of GPU (e.g., 'V100, A100, H100')."
     )
+    quantization: Optional[List[str]] = Field(
+        default_factory=list, description="The quantization formats supported by the shape (e.g., bitsandbytes, fp8)."
+    )
+    ranking: Optional[ComputeRank] = Field(
+        None, description="The relative cost and performance ranking of the shape."
+ ) class GPUShapesIndex(Serializable): diff --git a/ads/aqua/common/errors.py b/ads/aqua/common/errors.py index 15b1018f5..20b578a8f 100644 --- a/ads/aqua/common/errors.py +++ b/ads/aqua/common/errors.py @@ -55,6 +55,11 @@ class AquaValueError(AquaError, ValueError): def __init__(self, reason, status=403, service_payload=None): super().__init__(reason, status, service_payload) +class AquaRecommendationError(AquaError): + """Exception raised for models incompatible with shape recommendation tool.""" + + def __init__(self, reason, status=400, service_payload=None): + super().__init__(reason, status, service_payload) class AquaFileNotFoundError(AquaError, FileNotFoundError): """Exception raised for missing target file.""" diff --git a/ads/aqua/common/utils.py b/ads/aqua/common/utils.py index 2d64fd42f..120ea67a4 100644 --- a/ads/aqua/common/utils.py +++ b/ads/aqua/common/utils.py @@ -1253,24 +1253,24 @@ def load_gpu_shapes_index( file_name = "gpu_shapes_index.json" # Try remote load - remote_data: Dict[str, Any] = {} - if CONDA_BUCKET_NS: - try: - auth = auth or authutil.default_signer() - storage_path = ( - f"oci://{CONDA_BUCKET_NAME}@{CONDA_BUCKET_NS}/service_pack/{file_name}" - ) - logger.debug( - "Loading GPU shapes index from Object Storage: %s", storage_path - ) - with fsspec.open(storage_path, mode="r", **auth) as f: - remote_data = json.load(f) - logger.debug( - "Loaded %d shapes from Object Storage", - len(remote_data.get("shapes", {})), - ) - except Exception as ex: - logger.debug("Remote load failed (%s); falling back to local", ex) + # remote_data: Dict[str, Any] = {} + # if CONDA_BUCKET_NS: + # try: + # auth = auth or authutil.default_signer() + # storage_path = ( + # f"oci://{CONDA_BUCKET_NAME}@{CONDA_BUCKET_NS}/service_pack/{file_name}" + # ) + # logger.debug( + # "Loading GPU shapes index from Object Storage: %s", storage_path + # ) + # with fsspec.open(storage_path, mode="r", **auth) as f: + # remote_data = json.load(f) + # logger.debug( + # "Loaded %d shapes from Object Storage", + # len(remote_data.get("shapes", {})), + # ) + # except Exception as ex: + # logger.debug("Remote load failed (%s); falling back to local", ex) # Load local copy local_data: Dict[str, Any] = {} @@ -1287,6 +1287,7 @@ def load_gpu_shapes_index( # Merge: remote shapes override local local_shapes = local_data.get("shapes", {}) + remote_data = {} remote_shapes = remote_data.get("shapes", {}) merged_shapes = {**local_shapes, **remote_shapes} diff --git a/ads/aqua/extension/__init__.py b/ads/aqua/extension/__init__.py index 4c8d9f3f3..ffd2241c6 100644 --- a/ads/aqua/extension/__init__.py +++ b/ads/aqua/extension/__init__.py @@ -13,6 +13,7 @@ from ads.aqua.extension.evaluation_handler import __handlers__ as __eval_handlers__ from ads.aqua.extension.finetune_handler import __handlers__ as __finetune_handlers__ from ads.aqua.extension.model_handler import __handlers__ as __model_handlers__ +from ads.aqua.extension.recommend_handler import __handlers__ as __gpu_handlers__ from ads.aqua.extension.ui_handler import __handlers__ as __ui_handlers__ from ads.aqua.extension.ui_websocket_handler import __handlers__ as __ws_handlers__ @@ -24,6 +25,7 @@ + __ui_handlers__ + __eval_handlers__ + __ws_handlers__ + + __gpu_handlers__ ) diff --git a/ads/aqua/extension/recommend_handler.py b/ads/aqua/extension/recommend_handler.py new file mode 100644 index 000000000..4105f9320 --- /dev/null +++ b/ads/aqua/extension/recommend_handler.py @@ -0,0 +1,46 @@ +from tornado.web import HTTPError + +from ads.aqua.common.decorator 
import handle_exceptions +from ads.aqua.extension.base_handler import AquaAPIhandler +from ads.aqua.extension.errors import Errors +from ads.aqua.shaperecommend.recommend import AquaRecommendApp + + +class AquaRecommendHandler(AquaAPIhandler): + """ + Handler for Aqua GPU Recommendation REST APIs. + + Methods + ------- + post(self, *args, **kwargs) + Obtains the eligible compute shapes that would fit the specifed model, context length, model weights, and quantization level. + + Raises + ------ + HTTPError: For various failure scenarios such as invalid input format, missing data, etc. + """ + + @handle_exceptions + def post(self, *args, **kwargs): # noqa: ARG002 + """ + Obtains the eligible compute shapes that would fit the specifed model, context length, model weights, and quantization level. + + Returns + ------- + ShapeRecommendationReport + Report containing shape recommendations and troubleshooting advice, if any. + """ + try: + input_data = self.get_json_body() + except Exception as ex: + raise HTTPError(400, Errors.INVALID_INPUT_DATA_FORMAT) from ex + + if not input_data: + raise HTTPError(400, Errors.NO_INPUT_DATA) + + self.finish(AquaRecommendApp().which_gpu(**input_data)) + + +__handlers__ = [ + ("recommendation/?([^/]*)", AquaRecommendHandler), +] diff --git a/ads/aqua/resources/gpu_shapes_index.json b/ads/aqua/resources/gpu_shapes_index.json index c88155e45..8dd701be6 100644 --- a/ads/aqua/resources/gpu_shapes_index.json +++ b/ads/aqua/resources/gpu_shapes_index.json @@ -1,94 +1,152 @@ { "shapes": { - "BM.GPU.A10.4": { - "gpu_count": 4, - "gpu_memory_in_gbs": 96, - "gpu_type": "A10" + "BM.GPU.H200.8": { + "gpu_count": 8, + "gpu_memory_in_gbs": 1128, + "gpu_type": "H200", + "quantization": ["awq", "gptq", "marlin", "fp8", "int8", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking": { + "cost": 100, + "performance": 110 + } }, - "BM.GPU.A100-V2.8": { + "BM.GPU.H100.8": { "gpu_count": 8, "gpu_memory_in_gbs": 640, - "gpu_type": "A100" + "gpu_type": "H100", + "quantization": ["awq", "gptq", "marlin", "fp8", "int8", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking": { + "cost": 100, + "performance": 100 + } }, - "BM.GPU.B4.8": { + "BM.GPU.MI300X.8": { "gpu_count": 8, - "gpu_memory_in_gbs": 320, - "gpu_type": "A100" + "gpu_memory_in_gbs": 1536, + "gpu_type": "MI300X", + "quantization": ["fp8", "gguf"], + "ranking": { + "cost": 90, + "performance": 90 + } }, - "BM.GPU.H100.8": { + "BM.GPU.A100-V2.8": { "gpu_count": 8, "gpu_memory_in_gbs": 640, - "gpu_type": "H100" + "gpu_type": "A100", + "quantization": ["awq", "gptq", "marlin", "int8", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking": { + "cost": 80, + "performance": 70 + } }, - "BM.GPU.H200.8": { + "BM.GPU.B4.8": { "gpu_count": 8, - "gpu_memory_in_gbs": 1128, - "gpu_type": "H200" + "gpu_memory_in_gbs": 320, + "gpu_type": "A100", + "quantization": ["awq", "gptq", "marlin", "int8", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking": { + "cost": 70, + "performance": 60 + } }, "BM.GPU.L40S-NC.4": { "gpu_count": 4, "gpu_memory_in_gbs": 192, - "gpu_type": "L40S" + "gpu_type": "L40S", + "quantization": ["awq", "gptq", "marlin", "fp8", "int8", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking": { + "cost": 60, + "performance": 80 + } }, "BM.GPU.L40S.4": { "gpu_count": 4, "gpu_memory_in_gbs": 192, - "gpu_type": "L40S" - }, - "BM.GPU.MI300X.8": { - "gpu_count": 8, - "gpu_memory_in_gbs": 1536, - "gpu_type": "MI300X" - }, - "BM.GPU2.2": { - 
"gpu_count": 2, - "gpu_memory_in_gbs": 32, - "gpu_type": "P100" - }, - "BM.GPU3.8": { - "gpu_count": 8, - "gpu_memory_in_gbs": 128, - "gpu_type": "V100" - }, - "BM.GPU4.8": { - "gpu_count": 8, - "gpu_memory_in_gbs": 320, - "gpu_type": "A100" + "gpu_type": "L40S", + "quantization": ["awq", "gptq", "marlin", "fp8", "int8", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking": { + "cost": 60, + "performance": 80 + } }, "VM.GPU.A10.1": { "gpu_count": 1, "gpu_memory_in_gbs": 24, - "gpu_type": "A10" + "gpu_type": "A10", + "quantization": ["awq", "gptq", "marlin", "int8", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking" : { + "cost": 20, + "performance": 30 + } }, "VM.GPU.A10.2": { "gpu_count": 2, "gpu_memory_in_gbs": 48, - "gpu_type": "A10" + "gpu_type": "A10", + "quantization": ["awq", "gptq", "marlin", "int8", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking" : { + "cost": 40, + "performance": 40 + } }, - "VM.GPU.A10.4": { + "BM.GPU.A10.4": { "gpu_count": 4, "gpu_memory_in_gbs": 96, - "gpu_type": "A10" + "gpu_type": "A10", + "quantization": ["awq", "gptq", "marlin", "int8", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking" : { + "cost": 50, + "performance": 50 + } + }, + "BM.GPU2.2": { + "gpu_count": 2, + "gpu_memory_in_gbs": 32, + "gpu_type": "P100", + "quantization": ["fp16"], + "ranking": { + "cost": 30, + "performance": 20 + } }, "VM.GPU2.1": { "gpu_count": 1, "gpu_memory_in_gbs": 16, - "gpu_type": "P100" + "gpu_type": "P100", + "quantization": ["fp16"], + "ranking": { + "cost": 10, + "performance": 10 + } }, "VM.GPU3.1": { "gpu_count": 1, "gpu_memory_in_gbs": 16, - "gpu_type": "V100" + "gpu_type": "V100", + "quantization" : ["gptq", "bitblas", "aqlm", "bitsandbytes", "deepspeedfp", "gguf"], + "ranking" : { + "cost": 35, + "performance": 10 + } }, "VM.GPU3.2": { "gpu_count": 2, "gpu_memory_in_gbs": 32, - "gpu_type": "V100" + "gpu_type": "V100", + "ranking" : { + "cost": 45, + "performance": 20 + } }, "VM.GPU3.4": { "gpu_count": 4, "gpu_memory_in_gbs": 64, - "gpu_type": "V100" + "gpu_type": "V100", + "ranking" : { + "cost": 55, + "performance": 45 + } } } -} +} \ No newline at end of file diff --git a/ads/aqua/shaperecommend/__init__.py b/ads/aqua/shaperecommend/__init__.py new file mode 100644 index 000000000..dd30edb85 --- /dev/null +++ b/ads/aqua/shaperecommend/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# Copyright (c) 2025 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from ads.aqua.shaperecommend.recommend import AquaRecommendApp + +__all__ = ["AquaRecommendApp"] diff --git a/ads/aqua/shaperecommend/constants.py b/ads/aqua/shaperecommend/constants.py new file mode 100644 index 000000000..3a7bef233 --- /dev/null +++ b/ads/aqua/shaperecommend/constants.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# Copyright (c) 2024, 2025 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +""" +aqua.shaperecommend.constants +~~~~~~~~~~~~~~ + +This module contains constants used in Aqua GPU Recommendation for Models. 
+ +LLAMA_REQUIRED_FIELDS refer to fields necessary for calculating model memory for GQA Architecture Models + +MOE_REQUIRED_FIELDS refer to fields necessary for Mixture of Experts (MoE) Architecture Models + +NEXT_QUANT suggests the next quantization level based on the current quantization (if applied) or the model weights (if no quantization yet) +""" +LLAMA_REQUIRED_FIELDS = [ + "num_hidden_layers", "hidden_size", "num_attention_heads", + "num_key_value_heads", "head_dim", "intermediate_size", "vocab_size" +] + +MOE_REQUIRED_FIELDS = LLAMA_REQUIRED_FIELDS + [ + "num_local_experts", "intermediate_size" +] + +NEXT_QUANT = { + "float32": ["8bit", "4bit"], # bits and bytes does not support bfloat16, pytorch responsibility + "bfloat16": ["8bit", "4bit"], + "float16": ["8bit", "4bit"], + "int8": ["4bit"], + "fp8": ["4bit"], + "8bit": ["4bit"], + "int4": ["No smaller quantization available"], + "4bit": ["No smaller quantization available"] +} + +TEXT_GENERATION = "text_generation" +SAFETENSORS = "safetensors" + +TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. " + + +QUANT_MAPPING = { + "float32": 4, + "bfloat16": 2, + "float16": 2, + "fp16": 2, + "half": 2, + "int8": 1, + "fp8": 1, + "8bit": 1, + "4bit": 0.5, + "int4": 0.5, + } + + diff --git a/ads/aqua/shaperecommend/estimator.py b/ads/aqua/shaperecommend/estimator.py new file mode 100644 index 000000000..d1fca5d5c --- /dev/null +++ b/ads/aqua/shaperecommend/estimator.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python +# Copyright (c) 2025 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from typing import Optional + +from pydantic import BaseModel, Field + +from ads.aqua.app import logger +from ads.aqua.shaperecommend.constants import ( + LLAMA_REQUIRED_FIELDS, + MOE_REQUIRED_FIELDS, + NEXT_QUANT, + QUANT_MAPPING, +) +from ads.aqua.shaperecommend.llm_config import LLMConfig + + +class MemoryEstimator(BaseModel): + """ + The generic estimator for Transformer Architecture models (OPT/ Bloom) + Used as a fallback estimator if model identified is not a MoE or GQA Architecture Model. + Has properties to estimate the KV Cache size, Model size, and total footprint (KV Cache + Model size) + + KV cache: Use num_attention_heads (all heads, no GQA) + Parameter estimation: Standard decoder-only, untied embeddings possible + """ + + llm_config: LLMConfig = Field( + ..., + description="The model's config.json file with the necessary parameters for model size and KV cache estimation.", + ) + batch_size: Optional[int] = ( + 1 # we assume that estimation for batch sizes are not supported yet + ) + seq_len: int = Field( + ..., description="The max-seq-len to estimate the size of the KV cache." + ) + + @property + def kv_cache_memory(self) -> float: + """ + Estimates the KV cache size (in GB) using the LLM config.json parameters. + + Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation. 
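+
+        Illustrative example (assumed values, for orientation only): with
+        batch_size=1, num_hidden_layers=32, num_attention_heads=32, head_dim=128,
+        seq_len=4096 and 2-byte (fp16/bf16) weights, the estimate is
+        1 * 32 * 2 * 32 * 4096 * 128 * 2 bytes, roughly 2.15 GB of KV cache.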
+ """ + seq_len = self.seq_len or self.llm_config.max_seq_len + c = self.llm_config + kv_cache_dtype_bytes = QUANT_MAPPING.get( + c.weight_dtype, 2 + ) # vLLM uses model's weight/quantization applied to KV cache + + total_bytes = ( + self.batch_size + * c.num_hidden_layers + * 2 + * c.num_attention_heads + * seq_len + * c.head_dim + * kv_cache_dtype_bytes + ) + return total_bytes / 1e9 + + @property + def model_memory(self) -> float: + """ + Estimates the model size (in GB) based on estimating the model parameter size and model weights. + + Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible. + """ + c = self.llm_config + embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2 + embedding_params = ( + embedding_count * c.vocab_size * c.hidden_size + ) # input and output untied + layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2) # GPT-style + num_params = layer_params + embedding_params + + return num_params * c.bytes_per_parameter / 1e9 + + @property + def total_memory(self) -> float: + """ + Computes the total memory footprint of the model (KV cache & model size from estimated parameters). + """ + return self.model_memory + self.kv_cache_memory + + def validate_shape(self, allowed_gpu_memory: float) -> bool: + """ + Validates if a given model estimator fits within the allowed GPU memory budget, using a fixed utilization margin. + + Parameters + ---------- + estimator : MemoryEstimator + The estimator with current shape/memory needs. + allowed_gpu_memory : float + The maximum allowed GPU memory. + + Returns + ------- + bool + True if estimator uses less than adjusted GPU memory, else False. + """ + gpu_utilization = 0.9 + return (allowed_gpu_memory * gpu_utilization) > self.total_memory + + def suggest_param_advice(self, allowed: float) -> str: + """ + Suggests parameter modifications to help a model fit within GPU memory limits. + + Parameters + ---------- + estimator : MemoryEstimator + The memory estimator object. + allowed : float + Allowed GPU memory in GB. + + Returns + ------- + str + Advice message with suggestions. + """ + kv_gb = self.kv_cache_memory + wt_gb = self.model_memory + batch_size = self.batch_size + seq_len = self.seq_len + weight_size = getattr(self.llm_config, "weight_dtype", "unknown") + config = self.llm_config + + suggested_quant_msg = None + quant_advice = ", ".join(getattr(config, "suggested_quantizations", [])) + quantization = getattr(config, "quantization", None) + + if getattr(config, "suggested_quantizations", []): + to_do = f", which is smaller than the current quantization/weight size: {quantization if quantization in NEXT_QUANT else weight_size}." + if "No" in quant_advice: + suggested_quant_msg = "No smaller quantized version exists. Use a model with fewer parameters." + elif not quant_advice: + suggested_quant_msg = ( + "Use a quantized version of the same model (e.g., INT8 or other)" + + to_do + ) + else: + suggested_quant_msg = ( + f"Use the same model with {quant_advice} quantization" + to_do + ) + + kv_advice = ( + f"To reduce KV cache memory usage:\n" + f"1. Reduce maximum context length (set --max-model-len < {seq_len})\n" + f"2. Reduce batch size to less than {batch_size}." + if batch_size != 1 + else "" + ) + + wt_advice = ( + "To reduce model size:\n" + "1. Use a model with fewer parameters.\n" + f"2. {suggested_quant_msg}" + if suggested_quant_msg + else "" + ) + + if kv_gb > wt_gb and kv_gb > allowed * 0.5: + main = "KV cache memory usage is the main limiting factor." 
+ advice = kv_advice + elif wt_gb > kv_gb and wt_gb > allowed * 0.5: + main = "Model weights are the main limiting factor." + advice = wt_advice + else: + main = "Both model weights and KV cache are significant contributors to memory use." + advice = f"{kv_advice}\n{wt_advice}" + return f"{main} (KV cache: {kv_gb:.1f}GB, Weights: {wt_gb:.1f}GB).\n{advice}" + + def limiting_factor( + self, allowed_gpu_memory: float, warn_delta: float = 0.85 + ) -> str: + """ + Determines the memory limiting factor for a model deployment and returns advice. + + Parameters + ---------- + estimator : MemoryEstimator + The memory estimator object with current model configuration. + allowed_gpu_memory : float + The maximum allowed GPU memory (in GBs). + warn_delta : float, optional + The threshold (fraction) of allowed GPU memory to trigger a warning (default=0.85). + + Returns + ------- + str + Advice message about model fit and limiting factors. + """ + required = self.total_memory + batch_size = self.batch_size + seq_len = self.seq_len + weight_size = getattr(self.llm_config, "weight_dtype", "unknown") + quantization = getattr(self.llm_config, "quantization", "None") + + # Warn if required is close to but under allowed + if allowed_gpu_memory > required > allowed_gpu_memory * warn_delta: + model_params = self.suggest_param_advice(allowed_gpu_memory) + advice = ( + f"While the selected compute shape is estimated to work " + f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed), " + f"the model configuration is close to the GPU memory limit. " + "This estimation is theoretical; actual memory usage may vary at runtime.\n\n" + "If you encounter issues with this shape, consider the following options to reduce memory usage:\n\n" + f"{model_params.lstrip()}" + ) + elif required > allowed_gpu_memory: + model_params = self.suggest_param_advice(allowed_gpu_memory) + advice = ( + f"Model does not fit within GPU memory budget. " + "Consider the following options to reduce memory usage:\n\n" + f"{model_params.lstrip()}" + ) + else: + advice = ( + f"Model fits well within the allowed compute shape " + f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed).\n" + f"(Batch size: {batch_size}, seq len: {seq_len}, " + f"quantization/weight size: {quantization or weight_size})." + ) + return advice + + +# Specialized estimators: +class LlamaMemoryEstimator(MemoryEstimator): + """ + Estimator for GQA-type architectures. Handles tied (memory savings) and untied embeddings, + and uses grouped attention (GQA) for more efficient KV cache memory estimation. + + KV cache: Use num_attention_heads (assumes GQA) + Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible + """ + + @property + def model_memory(self) -> float: + """ + Returns estimated model parameter memory (in GB), accurately accounting + for Llama-style attention and MLP, and tied or untied embeddings. + """ + c = self.llm_config + + embedding_params, attn_params = self._calc_attn_embed_params() + + # MLP params + gate_proj = c.hidden_size * c.intermediate_size + up_proj = c.hidden_size * c.intermediate_size + down_proj = c.intermediate_size * c.hidden_size + mlp_params = gate_proj + up_proj + down_proj + + # Total per-layer + layer_params = attn_params + mlp_params + # Total params + num_params = c.num_hidden_layers * layer_params + embedding_params + return num_params * c.bytes_per_parameter / 1e9 + + @property + def kv_cache_memory(self) -> float: + """ + Returns estimated KV cache memory in GB for GQA models. 
+ + Grouped Query Attention uses num_key_value_heads, which groups of Q heads share a K and V projection. + num_key_value_heads < num_attention_heads, which reduces the KV Cache size. + """ + c = self.llm_config + seq_len = self.seq_len or getattr(c, "max_seq_len", 2048) + kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2) + kv_heads = c.num_key_value_heads + + total_bytes = ( + self.batch_size + * c.num_hidden_layers + * 2 + * kv_heads + * seq_len + * c.head_dim + * kv_cache_dtype_bytes + ) + return total_bytes / 1e9 + + def _calc_attn_embed_params(self) -> tuple: + """ + Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models. + """ + c = self.llm_config + + # Embedding parameters + # assume tied embeddings unless tie_word_embeddings = False + embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2 + embedding_params = embedding_count * c.vocab_size * c.hidden_size + + q_proj = c.hidden_size * c.hidden_size + k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim) + v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim) + o_proj = c.hidden_size * c.hidden_size + attn_params = q_proj + k_proj + v_proj + o_proj + + return embedding_params, attn_params + + +class MixtureMemoryEstimator(LlamaMemoryEstimator): + """ + Estimator for Mixture-of-Experts (MoE) architectures (e.g., Mixtral, MoE Llama). + Adds extra expert parallelism block parameter count to LlamaMemoryEstimator logic. + """ + + @property + def model_memory(self) -> float: + """ + Accounts for the increase in model parameters due to additional expert MLP blocks in MoE Models. + + Returns the estimated memory size of the MoE Model (in GB). + """ + c = self.llm_config + # Attention parameter count (Llama-style) + embedding_params, attn_params = self._calc_attn_embed_params() + + # MoE MLP params per layer + moe_params_per_layer = ( + c.num_local_experts * 3 * c.hidden_size * c.intermediate_size + ) + total_params = ( + c.num_hidden_layers * (attn_params + moe_params_per_layer) + + embedding_params + ) + + # Convert to GB + return total_params * c.bytes_per_parameter / 1e9 + + +def get_estimator(llm_config, **kwargs) -> MemoryEstimator: + """ + Extracts the correct estimator based on the defined parameters in the config.json + See constants.py for LLMConfig parameters necessary for specific estimators. + Uses MemoryEstimator as a fallback if parameters needed for GQA and MoE Architectures are missing. + + Returns the appropriate MemoryEstimator based on the fields defined by the model's config.json (as represented by LLMConfig). + """ + if all( + hasattr(llm_config, f) and getattr(llm_config, f) is not None + for f in MOE_REQUIRED_FIELDS + ): + return MixtureMemoryEstimator(llm_config=llm_config, **kwargs) + elif all( + hasattr(llm_config, f) and getattr(llm_config, f) is not None + for f in LLAMA_REQUIRED_FIELDS + ): + return LlamaMemoryEstimator(llm_config=llm_config, **kwargs) + else: + logger.warning( + "Falling back to generic GPT estimator: required fields missing from config.json file in model." + ) + return MemoryEstimator(llm_config=llm_config, **kwargs) diff --git a/ads/aqua/shaperecommend/llm_config.py b/ads/aqua/shaperecommend/llm_config.py new file mode 100644 index 000000000..aef2f4f50 --- /dev/null +++ b/ads/aqua/shaperecommend/llm_config.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python +# Copyright (c) 2025 Oracle and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + +import re +from typing import Optional + +from pydantic import BaseModel, Field + +from ads.aqua.common.errors import AquaRecommendationError, AquaValueError +from ads.aqua.shaperecommend.constants import NEXT_QUANT, QUANT_MAPPING + + +class LLMConfig(BaseModel): + """ + Standardized configuration object for evaluating the size of Large Language Models (LLMs) + based on their architecture and quantization. + """ + + num_hidden_layers: int = Field( + ..., + description="Number of transformer blocks (layers) in the model’s neural network stack.", + ) + hidden_size: int = Field( + ..., description="Embedding dimension or hidden size of each layer." + ) + vocab_size: int = Field(..., description="Vocabulary size for input/output tokens.") + num_attention_heads: int = Field( + ..., + description="Number of attention heads (used for queries and to determine head_dim).", + ) + + head_dim: int = Field( + ..., + description="Dimension of each attention head. Typically hidden_size // num_attention_heads.", + ) + max_seq_len: Optional[int] = Field( + 8192, description="Maximum input sequence length (context window)." + ) + weight_dtype: Optional[str] = Field( + "float32", description="Parameter data type: 'float32', 'float16', etc." + ) + quantization: Optional[str] = Field( + None, + description="Quantization weight (e.g., '8bit', '4bit') or None if unquantized.", + ) + quantization_type: Optional[str] = Field( + None, + description="Quantization method (e.g., '8bit', '4bit', 'gptq', 'awq') or None if unquantized.", + ) + + num_key_value_heads: Optional[int] = Field( + None, + description="Number of key/value heads (for GQA architectures: Llama, Mistral, Falcon, Qwen, etc.). Used to determine KV cache size", + ) + + num_local_experts: Optional[int] = Field( + None, description="For MoE architectures, the number of experts per MoE layer" + ) + intermediate_size: Optional[int] = Field( + None, description="For MoE architectures, size of the MLP activation layer." + ) + + tie_word_embeddings: Optional[bool] = Field(None) + + @property + def bytes_per_parameter(self) -> float: + """ + Returns the number of bytes used to store a model parameter, + accounting for quantization or weight storage type. + """ + # Quantization takes precedence + q = (self.quantization or "").lower() + + # Direct match in mapping + if q in QUANT_MAPPING: + return QUANT_MAPPING[q] + + # Dynamic bit-width detection + m = re.match(r"(\d+)\s*bit", q) + if m: + bits = int(m[1]) + return bits / 8 # bytes per parameter + + # Fallback to dtype mapping + dtype = (self.weight_dtype or "float32").lower() + return QUANT_MAPPING.get(dtype, QUANT_MAPPING["float32"]) + + @classmethod + def detect_quantization_type(cls, raw: dict) -> Optional[str]: + """ + Detects quantization type (e.g., 'gptq', 'bitsandbytes', 'awq', etc.) from Hugging Face config dict. + """ + qcfg = raw.get("quantization_config", {}) + if raw.get("load_in_8bit") or raw.get("load_in_4bit"): + return "bitsandbytes" + for key in [ + "gptq", + "awq", + "marlin", + "bitblas", + "aqlm", + "deepspeedfp", + "gguf", + "fp8", + ]: + if key in str(qcfg).lower() or key in str(raw).lower(): + return key + return None + + @classmethod + def detect_quantization_bits(cls, raw: dict) -> Optional[str]: + """ + Detects quantization bit-width as a string (e.g., '4bit', '8bit') from Hugging Face config dict. 
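+
+        Illustrative example (hypothetical config snippets): a config containing
+        {"quantization_config": {"bits": 4}} yields "4bit", while one containing
+        {"load_in_8bit": true} yields "8bit".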
+ """ + if raw.get("load_in_8bit"): + return "8bit" + if raw.get("load_in_4bit"): + return "4bit" + if "quantization_config" in raw: + qcfg = raw["quantization_config"] + bits = qcfg.get("bits") or qcfg.get("wbits") + if bits: + return f"{bits}bit" + return None + + @property + def suggested_quantizations(self): + """ + Suggests the next lower quantization options based on the current quantization level/ weight size. + + If model is un-quantized, uses the weight size. + If model is pre-quantized, uses the quantization level. + """ + key = (self.quantization or self.weight_dtype or "float32").lower() + return NEXT_QUANT.get(key, []) + + def calculate_possible_seq_len(self, min_len=2048): + """ + Calculates a list of possible sequence lengths (in tokens). + [2048, ... max-length] (max-length found in model's config.json file) + """ + vals = [] + curr = min_len + max_seq_len = 16384 if not self.max_seq_len else self.max_seq_len + while curr <= max_seq_len: + vals.append(curr) + curr *= 2 + if vals and vals[-1] != max_seq_len: + vals.append(max_seq_len) + return vals + + def optimal_config(self): + """ + Builds a list of optimal configuration parameters (sorted descending). Combination of: + - Quantization / weight sizes: bfloat16 weight size -> 8bit -> 4bit + - max-model-len: power-of-two model lengths from max length (config.json of model) to 2048 tokens. + + Example: + [('bfloat16', max_model_len supported by model) ('bfloat16', 1/2 of max_model_len) ... ('int8', 2048), ('int4', 4096), ('int4', 2048)] + + """ + # Create a copy of the suggested_quantizations list + quantizations = self.suggested_quantizations[:] + quantizations.append("bfloat16") + + lengths = self.calculate_possible_seq_len() + + configs = [] + for quantization in quantizations: + for length in lengths: + configs.append((quantization, length)) + + configs.sort( + key=lambda x: (-QUANT_MAPPING.get(x[0], 0), -x[1]) + ) # (-quant_priority, -max_seq_len) + return configs + + @classmethod + def validate_model_support(cls, raw: dict) -> ValueError: + """ + Validates if model is decoder-only. Check for text-generation model occurs at DataScienceModel level. + """ + excluded_models = {"t5", "gemma", "bart", "bert", "roberta", "albert"} + if ( + raw.get("is_encoder_decoder", False) # exclude encoder-decoder models + or (raw.get("is_decoder") is False) # exclude explicit encoder-only models (altho no text-generation task ones, just dbl check) + or raw.get("model_type", "").lower() # exclude by known model types + in excluded_models + ): + raise AquaRecommendationError( + "Please provide a decoder-only text-generation model (ex. Llama, Falcon, etc). " + "Encoder-decoder models (ex. T5, Gemma) and encoder-only (BERT) are not supported in this tool at this time." + ) + + @classmethod + def from_raw_config(cls, raw: dict) -> "LLMConfig": + """ + Instantiates an LLMConfig from a raw Hugging Face config.json file, + using robust key detection and fallback for architecture. 
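+
+        Illustrative example (assumed, minimal config values):
+
+            raw = {"num_hidden_layers": 32, "hidden_size": 4096, "vocab_size": 32000,
+                   "num_attention_heads": 32, "head_dim": 128, "torch_dtype": "bfloat16",
+                   "max_position_embeddings": 4096}
+            llm_config = LLMConfig.from_raw_config(raw)
+            # -> weight_dtype="bfloat16", max_seq_len=4096, quantization=None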
+ """ + cls.validate_model_support(raw) + + # Field mappings with fallback + num_hidden_layers = ( + raw.get("num_hidden_layers") or raw.get("n_layer") or raw.get("num_layers") + ) + hidden_size = raw.get("hidden_size") or raw.get("n_embd") or raw.get("d_model") + vocab_size = raw.get("vocab_size") + weight_dtype = str(raw.get("torch_dtype", "float32")) + quantization = cls.detect_quantization_bits(raw) + quantization_type = cls.detect_quantization_type(raw) + + if not quantization and quantization_type in QUANT_MAPPING: + quantization = quantization_type + + num_key_value_heads = ( + raw.get("num_key_value_heads") # GQA models (ex. Llama-type) + ) + + num_attention_heads = ( + raw.get("num_attention_heads") or raw.get("n_head") or raw.get("num_heads") + ) + + head_dim = raw.get("head_dim") or ( + int(hidden_size) // int(num_attention_heads) + if hidden_size and num_attention_heads + else None + ) + max_seq_len = ( + raw.get("max_position_embeddings") + or raw.get("n_positions") + or raw.get("max_seq_len") + or 2048 + ) + + num_local_experts = ( + raw.get("num_local_experts") + or raw.get("n_routed_experts") + or raw.get("num_experts") + ) + intermediate_size = raw.get("moe_intermediate_size") or raw.get( + "intermediate_size" + ) + + # Type safety: minimal assertion + if None in [ + num_hidden_layers, + hidden_size, + vocab_size, + num_attention_heads, + head_dim, + ]: + raise ValueError("Missing required value in model config.") + + return cls( + num_hidden_layers=int(num_hidden_layers), + hidden_size=int(hidden_size), + num_attention_heads=int(num_attention_heads), + num_key_value_heads=num_key_value_heads, + head_dim=int(head_dim), + vocab_size=int(vocab_size), + weight_dtype=weight_dtype, + quantization=quantization, + quantization_type=quantization_type, + max_seq_len=int(max_seq_len), + num_local_experts=num_local_experts, + intermediate_size=intermediate_size, + ) diff --git a/ads/aqua/shaperecommend/recommend.py b/ads/aqua/shaperecommend/recommend.py new file mode 100644 index 000000000..1f1b2134e --- /dev/null +++ b/ads/aqua/shaperecommend/recommend.py @@ -0,0 +1,337 @@ +import json +from typing import List + +from pydantic import ValidationError + +from ads.aqua.app import AquaApp, logger +from ads.aqua.common.entities import ComputeShapeSummary +from ads.aqua.common.errors import ( + AquaFileNotFoundError, + AquaRecommendationError, + AquaValueError, +) +from ads.aqua.common.utils import ( + build_pydantic_error_message, + get_resource_type, + load_config, + load_gpu_shapes_index, +) +from ads.aqua.shaperecommend.constants import ( + SAFETENSORS, + TEXT_GENERATION, + TROUBLESHOOT_MSG, +) +from ads.aqua.shaperecommend.estimator import get_estimator +from ads.aqua.shaperecommend.llm_config import LLMConfig +from ads.aqua.shaperecommend.shape_report import ( + ModelConfig, + RequestRecommend, + ShapeRecommendationReport, + ShapeReport, +) +from ads.model.datascience_model import DataScienceModel + + +class AquaRecommendApp(AquaApp): + """ + Interface for recommending GPU shapes for machine learning model deployments + on Oracle Cloud Infrastructure Data Science service. + + This class provides methods to recommend deployment shapes based on a model's requirements, + handle recommendation details and troubleshooting, and retrieve specific OCI Machine Learning shapes. + Must be used within a properly configured and authenticated OCI environment. 
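+
+    Example (illustrative, with a placeholder OCID):
+        AquaRecommendApp().which_gpu(model_ocid="ocid1.datasciencemodel.oc1..<unique_id>")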
+ + Methods + ------- + which_gpu(self, **kwargs) -> List[Dict]: + Lists the valid GPU deployment shapes that fit the given model and user-provided settings. + + Note: + Use `ads aqua recommend which_gpu --help` to get more details on available parameters. + """ + + def which_gpu(self, **kwargs) -> ShapeRecommendationReport: + """ + Lists valid GPU deployment shapes for the provided model and configuration. + + Validates input, retrieves the model configuration, checks the requested sequence length, + identifies available and valid compute shapes, and summarizes which shapes are compatible + with the current model settings. + + Parameters + ---------- + model_ocid : str + OCID of the model to recommend feasible compute shapes. + + Returns + ------- + ShapeRecommendationReport + A recommendation report with compatible deployment shapes, or troubleshooting info + citing the largest shapes if no shape is suitable. + + Raises + ------ + AquaValueError + If parameters are missing or invalid, or if no valid sequence length is requested. + """ + try: + request = RequestRecommend(**kwargs) + data = self.get_model_config(request.model_ocid) + llm_config = LLMConfig.from_raw_config(data) + + available_shapes = self.valid_compute_shapes() + recommendations = self.summarize_shapes_for_seq_lens( + llm_config, available_shapes + ) + + # custom error to catch model incompatibility issues + except AquaRecommendationError as error: + return ShapeRecommendationReport( + recommendations=[], troubleshoot=str(error) + ) + + except ValidationError as ex: + custom_errors = build_pydantic_error_message(ex) + raise AquaValueError( + f"Invalid parameters to read config.json of LLM Artifact. Error details: {custom_errors}." + ) from ex + except AquaValueError as ex: + logger.error(f"Error with LLM config: {ex}") + raise + + return recommendations + + @staticmethod + def get_model_config(ocid: str): + """ + Loads the configuration for a given Oracle Cloud Data Science model. + + Validates the resource type associated with the provided OCID, ensures the model + is for text-generation with a supported decoder-only architecture, and loads the model's + configuration JSON from the artifact path. + + Parameters + ---------- + ocid : str + The OCID of the Data Science model. + + Returns + ------- + dict + The parsed configuration dictionary from config.json. + + Raises + ------ + AquaValueError + If the OCID is not for a Data Science model, or if the model type is not supported, + or if required files/tags are not present. + + AquaRecommendationError + If the model OCID provided is not supported (only text-generation decoder models in safetensor format supported). + """ + resource_type = get_resource_type(ocid) + + if resource_type != "datasciencemodel": + raise AquaValueError( + f"The provided OCID '{ocid}' is not a valid Oracle Cloud Data Science Model OCID. " + "Please provide an OCID corresponding to a Data Science model resource. " + "Tip: Data Science model OCIDs typically start with 'ocid1.datasciencemodel...'." + ) + + model = DataScienceModel.from_id(ocid) + + model_task = model.freeform_tags.get("task", "").lower() + model_format = model.freeform_tags.get("model_format", "").lower() + + logger.info(f"Current model task type: {model_task}") + logger.info(f"Current model format: {model_format}") + + if TEXT_GENERATION not in model_task: + raise AquaRecommendationError( + "Please provide a decoder-only text-generation model (ex. Llama, Falcon, etc.). 
" + f"Only text-generation models are supported in this tool at this time. Current model task type: {model_task}" + ) + if SAFETENSORS not in model_format: + msg = "Please provide a model in Safetensor format." + if model_format: + msg += f"The current model format ({model_format}) is not supported by this tool at this time." + + raise AquaRecommendationError(msg) + + if not model.artifact: + raise AquaValueError( + "Unable to retrieve model artifact. Ensure model is registered and active." + ) + + try: + data = load_config(model.artifact, "config.json") + + except AquaFileNotFoundError as e: + logger.error( + f"config.json not found in model artifact at {model.artifact}: {e}" + ) + raise AquaRecommendationError( + "The configuration file 'config.json' was not found in the specified model directory. " + "Please ensure your model follows the Hugging Face format and includes a 'config.json' with the necessary architecture parameters." + ) from e + + return data + + @staticmethod + def valid_compute_shapes() -> List["ComputeShapeSummary"]: + """ + Returns a filtered list of GPU-only ComputeShapeSummary objects by reading and parsing a JSON file. + + Parameters + ---------- + file : str + Path to the JSON file containing shape data. + + Returns + ------- + List[ComputeShapeSummary] + List of ComputeShapeSummary objects passing the checks. + + Raises + ------ + ValueError + If the file cannot be opened, parsed, or the 'shapes' key is missing. + """ + gpu_shapes_metadata = load_gpu_shapes_index().shapes + + valid_shapes = [] + for name, spec in gpu_shapes_metadata.items(): + valid_shapes.append( + ComputeShapeSummary(name=name, shape_series="GPU", gpu_specs=spec) + ) + valid_shapes.sort( + key=lambda shape: shape.gpu_specs.gpu_memory_in_gbs, reverse=True + ) + return valid_shapes + + @staticmethod + def summarize_shapes_for_seq_lens( + config: LLMConfig, + shapes: List[ComputeShapeSummary], + batch_size: int = 1, + ) -> ShapeRecommendationReport: + """ + Generate a recommendation report for eligible deployment shapes by evaluating + model memory consumption and maximum model length for given configurations. + + Parameters + ---------- + config : LLMConfig + The loaded model configuration. + shapes : List[ComputeShapeSummary] + All candidate deployment shapes. + batch_size : int, optional + Batch size to evaluate (default is 1). + + Returns + ------- + ShapeRecommendationReport + Report containing shape recommendations and troubleshooting advice, if any. + + Raises + ------ + ValueError + If no GPU shapes are available. + + Notes + ----- + - Considers quantization if defined in config, otherwise cycles through optimal configs. + - Applies pareto optimality if too many recommendations. + - Provides troubleshooting options if nothing fits. + """ + recommendations = [] + + if not shapes: + raise ValueError( + "No GPU shapes were passed for recommendation. Ensure shape parsing succeeded." 
+        )
+
+        # Pre-quantized: only consider different max-seq-len
+        if config.quantization_type:
+            deployment_config = config.calculate_possible_seq_len()
+            for shape in shapes:
+                if config.quantization_type in shape.gpu_specs.quantization:
+                    allowed_gpu_memory = shape.gpu_specs.gpu_memory_in_gbs
+                    for max_seq_len in deployment_config:
+                        estimator = get_estimator(
+                            llm_config=config,
+                            seq_len=max_seq_len,
+                            batch_size=batch_size,
+                        )
+                        if estimator.validate_shape(allowed_gpu_memory):
+                            best_config = [
+                                ModelConfig.constuct_model_config(
+                                    estimator, allowed_gpu_memory
+                                )
+                            ]
+                            recommendations.append(
+                                ShapeReport(
+                                    shape_details=shape, configurations=best_config
+                                )
+                            )
+                            break
+
+        # Unquantized: consider in-flight quantization (4bit and 8bit)
+        else:
+            deployment_config = config.optimal_config()
+            prev_quant = None
+            for shape in shapes:
+                allowed_gpu_memory = shape.gpu_specs.gpu_memory_in_gbs
+                for quantization, max_seq_len in deployment_config:
+                    if quantization != prev_quant:
+                        updated_config = config.model_copy(
+                            update={"quantization": quantization}
+                        )
+                        prev_quant = quantization
+                    estimator = get_estimator(
+                        llm_config=updated_config,
+                        seq_len=max_seq_len,
+                        batch_size=batch_size,
+                    )
+                    if estimator.validate_shape(allowed_gpu_memory):
+                        best_config = [
+                            ModelConfig.constuct_model_config(
+                                estimator, allowed_gpu_memory
+                            )
+                        ]
+                        recommendations.append(
+                            ShapeReport(shape_details=shape, configurations=best_config)
+                        )
+                        break
+
+        troubleshoot_msg = ""
+
+        if len(recommendations) > 2:
+            recommendations = ShapeReport.pareto_front(recommendations)
+
+        if not recommendations:
+            # Troubleshooting advice if nothing fits
+            # Assumes shapes is sorted largest to smallest and quantizations 'fp8'/'4bit' exist
+            troubleshoot_msg += TROUBLESHOOT_MSG
+
+            largest_shapes = (
+                [(shapes[0], "fp8"), (shapes[1], "4bit")] if len(shapes) > 1 else []
+            )
+            for shape, quantization in largest_shapes:
+                updated_config = config.model_copy(
+                    update={"quantization": quantization}
+                )
+                estimator = get_estimator(
+                    llm_config=updated_config, seq_len=2048, batch_size=batch_size
+                )
+                allowed_gpu_memory = shape.gpu_specs.gpu_memory_in_gbs * 0.9
+                best_config = [
+                    ModelConfig.constuct_model_config(estimator, allowed_gpu_memory)
+                ]
+                recommendations.append(
+                    ShapeReport(shape_details=shape, configurations=best_config)
+                )
+
+        return ShapeRecommendationReport(
+            recommendations=recommendations, troubleshoot=troubleshoot_msg
+        )
diff --git a/ads/aqua/shaperecommend/shape_report.py b/ads/aqua/shaperecommend/shape_report.py
new file mode 100644
index 000000000..4968fe289
--- /dev/null
+++ b/ads/aqua/shaperecommend/shape_report.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python
+# Copyright (c) 2025 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+from typing import List, Optional
+
+from pydantic import BaseModel, Field
+
+from ads.aqua.common.entities import ComputeShapeSummary
+from ads.aqua.shaperecommend.constants import QUANT_MAPPING
+from ads.aqua.shaperecommend.estimator import MemoryEstimator
+
+
+class RequestRecommend(BaseModel):
+    """
+    A request to recommend compute shapes and parameters for a given model.
+    """
+
+    model_ocid: str = Field(..., description="The OCID of the model to recommend feasible compute shapes.")
+
+
+class DeploymentParams(BaseModel):  # noqa: N801
+    """
+    Recommended parameters for deployment and model inference (specific to compute shape & model).
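+
+    Illustrative example (assumed values): quantization="4bit", max_model_len=4096,
+    batch_size=1 roughly corresponds to serving the model with in-flight 4-bit
+    quantization and a --max-model-len of 4096 in a vLLM-style container.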
+ """ + + quantization: Optional[str] = Field( + None, description="Type of quantization (e.g. int8)." + ) + max_model_len: int = Field(..., description="Maximum length of input sequence.") + batch_size: Optional[int] = Field(1, description="Batch size for training.") + + +class ModelDetail(BaseModel): + """ + The estimated memory footprint of a model, KV cache, and its total (model + KV cache). + """ + + model_size_gb: float = Field(..., description="Size of the model in GB.") + kv_cache_size_gb: float = Field(..., description="Size of KV cache in GB.") + total_model_gb: float = Field( + ..., description="Total size of model and cache in GB." + ) + + +class ModelConfig(BaseModel): + """ + The configuration for a model based on specific set of deployment parameters and memory capacity of shape. + """ + + model_details: ModelDetail = Field(..., description="Details about the model.") + deployment_params: DeploymentParams = Field( + ..., description="Parameters for deployment." + ) + recommendation: str = Field(..., description="GPU recommendation for the model.") + + @classmethod + def constuct_model_config(cls, estimator: MemoryEstimator, allowed_gpu_memory: float) -> "ModelConfig": + """ + Assembles a complete ModelConfig, including model details, deployment parameters (vLLM), and recommendations. + + Parameters + ---------- + estimator : MemoryEstimator + Estimator with model details and processed config. + allowed_gpu_memory : float + Maximum allowed GPU memory (in GBs) for this configuration. + + Returns + ------- + ModelConfig + Contains round-tripped model size, kv cache, total, vLLM parameters, and recommendations. + + Notes + ----- + - Rounds all sizes to 3 decimal digits. + - Computes a recommendation string using `limiting_factor`. + """ + deployment_params = DeploymentParams( + quantization=getattr(estimator.llm_config, "quantization", None), + max_model_len=getattr(estimator, "seq_len", None) + ) + model_detail = ModelDetail( + model_size_gb=round(getattr(estimator, "model_memory", 0.0), 3), + kv_cache_size_gb=round(getattr(estimator, "kv_cache_memory", 0.0), 3), + total_model_gb=round(getattr(estimator, "total_memory", 0.0), 3) + ) + return ModelConfig( + model_details=model_detail, + deployment_params=deployment_params, + recommendation= estimator.limiting_factor(allowed_gpu_memory) + ) + + +class ShapeReport(BaseModel): + """ + The feasible deployment configurations for the model per shape. + """ + shape_details: 'ComputeShapeSummary' = Field( + ..., description="Details about the compute shape (ex. VM.GPU.A10.2)." + ) + configurations: List['ModelConfig'] = Field( + default_factory=list, description="List of model configurations." + ) + + def is_dominated(self, others: List['ShapeReport']) -> bool: + """ + Determines whether this shape is dominated by any other shape in a Pareto sense. + + Parameters + ---------- + others : list of ShapeReport + List of other shape/deployment configurations to compare against. + + Returns + ------- + bool + True if this shape is dominated by at least one other, False otherwise. + + Notes + ----- + A shape is dominated if there exists another configuration that is + at least as good in all criteria and strictly better in at least one. 
+        Criteria:
+        - Cost (to be minimized)
+        - Performance, quantization level, max sequence length (to be maximized)
+        """
+
+        cand_cost = self.shape_details.gpu_specs.ranking.cost
+        cand_perf = self.shape_details.gpu_specs.ranking.performance
+        cand_quant = QUANT_MAPPING.get(self.configurations[0].deployment_params.quantization, 0)
+        cand_maxlen = self.configurations[0].deployment_params.max_model_len
+
+        for other in others:
+            other_cost = other.shape_details.gpu_specs.ranking.cost
+            other_perf = other.shape_details.gpu_specs.ranking.performance
+            other_quant = QUANT_MAPPING.get(other.configurations[0].deployment_params.quantization, 0)
+            other_maxlen = other.configurations[0].deployment_params.max_model_len
+            if (
+                other_cost <= cand_cost and
+                other_perf >= cand_perf and
+                other_quant >= cand_quant and
+                other_maxlen >= cand_maxlen and
+                (
+                    other_cost < cand_cost or
+                    other_perf > cand_perf or
+                    other_quant > cand_quant or
+                    other_maxlen > cand_maxlen
+                )
+            ):
+                return True
+        return False
+
+    @classmethod
+    def pareto_front(cls, shapes: List['ShapeReport']) -> List['ShapeReport']:
+        """
+        Filters a list of shapes/configurations to those on the Pareto frontier.
+
+        Parameters
+        ----------
+        shapes : list of ShapeReport
+            List of candidate shape/configuration reports to evaluate.
+
+        Returns
+        -------
+        list of ShapeReport
+            Subset of input shapes that are not dominated by any other (the Pareto front).
+
+        Notes
+        -----
+        The returned set contains non-dominated deployments for maximizing
+        performance, quantization, and model length, while minimizing cost.
+        """
+        return [shape for shape in shapes if not shape.is_dominated([s for s in shapes if s != shape])]
+
+
+class ShapeRecommendationReport(BaseModel):
+    """
+    Full report of shape fit recommendations and troubleshooting advice, if applicable.
+
+    Attributes:
+        recommendations (List[ShapeReport]): Recommended deployment shapes and the
+            model configurations that fit each shape.
+        troubleshoot (Optional[str]): Troubleshooting advice if no valid deployment
+            shapes are available.
+    """
+
+    recommendations: List[ShapeReport] = Field(
+        default_factory=list, description="List of shape fit recommendations."
+    )
+    troubleshoot: Optional[str] = Field(
+        None,
+        description="Details for troubleshooting if no shapes fit the current model.",
+    )