diff --git a/README.md b/README.md
index c453de30c2..4d6fb87545 100644
--- a/README.md
+++ b/README.md
@@ -260,6 +260,7 @@ Note that for externally hosted models, configs such as `--device` which relate
 | Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
 | [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` | ... |
 | [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
+| Watsonx.ai | :heavy_check_mark: | `watsonx_llm` | [Supported Watsonx.ai Engines](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx) | `generate_until`, `loglikelihood` |
 | Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | ... |

 Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
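For context, the new backend plugs into the harness's existing entry points. Below is a minimal sketch of programmatic use, assuming the registered `watsonx_llm` backend accepts a `model_id` via `model_args` and picks up watsonx.ai credentials (API key, project ID, endpoint URL) from the environment; the model name is a placeholder:

```python
import lm_eval

# Minimal sketch: evaluate a watsonx.ai-hosted model on a single task.
# `model_id` is a placeholder; credentials are assumed to come from the
# environment or your watsonx.ai configuration.
results = lm_eval.simple_evaluate(
    model="watsonx_llm",
    model_args="model_id=ibm/granite-13b-instruct-v2",
    tasks=["hellaswag"],
)
print(results["results"])
```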
@@ -476,6 +477,7 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
 | gptq | For loading models with GPTQ |
 | hf_transfer | For speeding up HF Hub file downloads |
 | ifeval | For running the IFEval task |
+| ibm_watsonx_ai | For using IBM watsonx.ai model APIs |
 | neuronx | For running on AWS inf2 instances |
 | mamba | For loading Mamba SSM models |
 | math | For running math task answer checking |
diff --git a/lm_eval/models/ibm_watsonx_ai.py b/lm_eval/models/ibm_watsonx_ai.py
index a82c5c4567..96fdb6e93e 100644
--- a/lm_eval/models/ibm_watsonx_ai.py
+++ b/lm_eval/models/ibm_watsonx_ai.py
@@ -1,4 +1,5 @@
 import copy
+import json
 import os
 from functools import lru_cache
 from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
@@ -8,6 +9,7 @@
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
+from lm_eval.models.api_models import JsonChatStr
 from lm_eval.utils import eval_logger, simple_parse_args_string
@@ -248,7 +250,12 @@ def generate_until(self, requests: List[Instance]) -> List[str]:
         ):
             context, continuation = request
             try:
-                response = self.model.generate_text(context, self.generate_params)
+                if isinstance(context, JsonChatStr):
+                    context = json.loads(context.prompt)
+                    response = self.model.chat(context, self.generate_params)
+                    response = response["choices"][0]["message"]["content"]
+                else:
+                    response = self.model.generate_text(context, self.generate_params)
             except Exception as exp:
                 eval_logger.error("Error while generating text.")
                 raise exp
@@ -372,3 +379,13 @@ def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
         )
         return cast(List[Tuple[float, bool]], results)
+
+    @property
+    def tokenizer_name(self) -> str:
+        return ""
+
+    def apply_chat_template(
+        self, chat_history: List[Dict[str, str]]
+    ) -> JsonChatStr:
+        # A hack similar to the one in api_models, to allow encoding for the cache.
+        return JsonChatStr(json.dumps(chat_history))
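The `generate_until` change above hinges on a small round-trip: `apply_chat_template` wraps the chat history in a `JsonChatStr` so it stays hashable for the request cache, and `generate_until` unwraps it before dispatching to the chat endpoint. A self-contained sketch of that contract, mirroring the code in the diff:

```python
import json

from lm_eval.models.api_models import JsonChatStr

# Encode: what apply_chat_template does, so the chat history can be cached.
history = [{"role": "user", "content": "What is the capital of France?"}]
encoded = JsonChatStr(json.dumps(history))

# Decode: what generate_until does before calling the chat endpoint.
if isinstance(encoded, JsonChatStr):
    messages = json.loads(encoded.prompt)

assert messages == history  # the round-trip is lossless
```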
diff --git a/lm_eval/tasks/unitxt/README.md b/lm_eval/tasks/unitxt/README.md
index 06bee13380..63649473ca 100644
--- a/lm_eval/tasks/unitxt/README.md
+++ b/lm_eval/tasks/unitxt/README.md
@@ -6,6 +6,8 @@ The full Unitxt catalog can be viewed in an [online explorer](https://unitxt.rea
 Read more about Unitxt at [www.unitxt.ai](https://www.unitxt.ai/).

+To use Unitxt datasets with lm-eval, first install Unitxt via `pip install unitxt`.
+
 ### Paper

 Title: `Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative AI`
diff --git a/lm_eval/tasks/unitxt/task.py b/lm_eval/tasks/unitxt/task.py
index 12b2760864..78e5c106af 100644
--- a/lm_eval/tasks/unitxt/task.py
+++ b/lm_eval/tasks/unitxt/task.py
@@ -6,11 +6,11 @@
 import importlib.util
 import re
+from collections.abc import Callable
 from functools import partial
 from typing import Any, Dict, Optional

 import datasets
-import evaluate

 from lm_eval.api.instance import Instance
 from lm_eval.api.task import ConfigurableTask
@@ -28,16 +28,21 @@
 """


-def is_unitxt_installed() -> bool:
-    return importlib.util.find_spec("unitxt") is not None
+def assert_unitxt_installed():
+    if importlib.util.find_spec("unitxt") is None:
+        raise Exception(
+            "Please install unitxt via 'pip install unitxt'. For more information see: https://www.unitxt.ai/"
+        )


 def score(items, metric):
     predictions, references = zip(*items)
-    evaluator = evaluate.load("unitxt/metric")
+    assert_unitxt_installed()
+    from unitxt import evaluate
+
     for reference in references:
         reference["metrics"] = [metric]
-    results = evaluator.compute(predictions=predictions, references=references)
+    results = evaluate(predictions, references)
     return results[0]["score"]["global"]["score"]
@@ -61,16 +66,10 @@ def __init__(
         self.metrics = self.dataset["test"][0]["metrics"]

     def download(self, dataset_kwargs: Optional[Dict[str, Any]] = None) -> None:
-        if is_unitxt_installed():
-            from unitxt import load_dataset
-
-            self.dataset = load_dataset(self.DATASET_NAME)
-        else:
-            self.dataset = datasets.load_dataset(
-                name=self.DATASET_NAME,
-                path="unitxt/data",
-                trust_remote_code=True,
-            )
+        assert_unitxt_installed()
+        from unitxt import load_dataset
+
+        self.dataset = load_dataset(self.DATASET_NAME, disable_cache=False)

     def has_training_docs(self):
         return "train" in self.dataset
@@ -102,6 +101,27 @@ def doc_to_target(self, doc):
     def get_arguments(self, doc, ctx):
         return (ctx, {"until": ["\n"]})

+    def fewshot_context(
+        self,
+        doc: str,
+        num_fewshot: int,
+        system_instruction: Optional[str] = None,
+        apply_chat_template: bool = False,
+        fewshot_as_multiturn: bool = False,
+        chat_template: Optional[Callable] = None,
+    ) -> str:
+        source = self.doc_to_text(doc)
+        if isinstance(source, list):
+            if apply_chat_template:
+                formatted_source = chat_template(source)
+                return formatted_source
+            else:
+                raise Exception(
+                    "Got chat template format from Unitxt, but apply_chat_template is false. Add '--apply_chat_template' to the command line."
+                )
+        else:
+            return source
+
     def construct_requests(self, doc, ctx, **kwargs):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
@@ -113,6 +133,7 @@
             language description, as well as the few shot examples, and the
             question part of the document for `doc`.
         """
+        kwargs.pop("apply_chat_template", False)  # Not used by unitxt
         return [
             Instance(
                 request_type="generate_until",
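The new `fewshot_context` override encodes a simple contract: when Unitxt emits a chat-format source (a list of messages), a chat template must be supplied; plain-string sources pass through untouched. A standalone sketch of that logic, with a toy `chat_template` standing in for the tokenizer's real one:

```python
from typing import Callable, List, Optional, Union


def render(
    source: Union[str, List[dict]],
    chat_template: Optional[Callable] = None,
) -> str:
    # Chat-format sources (lists of messages) require a chat template;
    # plain strings are returned unchanged, mirroring fewshot_context above.
    if isinstance(source, list):
        if chat_template is None:
            raise Exception(
                "Got chat template format from Unitxt, but no chat template "
                "was supplied. Add '--apply_chat_template' to the command line."
            )
        return chat_template(source)
    return source


# Toy template: flatten messages into a role-tagged prompt string.
def toy_template(msgs: List[dict]) -> str:
    return "\n".join(f"{m['role']}: {m['content']}" for m in msgs)


print(render("Translate: Bonjour"))  # plain prompt, passes through
print(render([{"role": "user", "content": "Translate: Bonjour"}], toy_template))
```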