diff --git a/docs/reference/models/openai.md b/docs/reference/models/openai.md
index bd1123a59..5ddd4a457 100644
--- a/docs/reference/models/openai.md
+++ b/docs/reference/models/openai.md
@@ -1,4 +1,4 @@
-# Generate text with the OpenAI API
+# Generate text with the OpenAI and compatible APIs
 
 !!! Installation
 
@@ -16,16 +16,33 @@
 print(type(model))
 # OpenAI
 ```
+Outlines also supports Azure OpenAI models. Pass the model name first; it is used to pick the matching `tiktoken` tokenizer:
 
-It is possible to pass a system message to the model when initializing it:
 ```python
 from outlines import models
 
-model = models.openai("gpt-4", system_prompt="You are a useful assistant")
+model = models.azure_openai(
+    "gpt-4",
+    api_version="2023-07-01-preview",
+    azure_endpoint="https://example-endpoint.openai.azure.com",
+)
+```
+
+More generally, you can use any API client compatible with the OpenAI interface by passing an instance of the client, a configuration, and optionally the corresponding tokenizer (required if you want to be able to use `outlines.generate.choice`):
+
+```python
+from openai import AsyncOpenAI
+import tiktoken
+
+from outlines.models.openai import OpenAI, OpenAIConfig
+
+config = OpenAIConfig(model="gpt-4")
+client = AsyncOpenAI()
+tokenizer = tiktoken.encoding_for_model("gpt-4")
+
+model = OpenAI(client, config, tokenizer)
 ```
 
-This message will be used for every subsequent use of the model:
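+The resulting model behaves like any other Outlines model. As a minimal sketch (the prompt and the choices below are purely illustrative):
+
+```python
+from outlines import generate
+
+result = model("Say hello in French.")
+
+# `generate.choice` relies on the tokenizer passed above
+generator = generate.choice(model, ["Positive", "Negative"])
+sentiment = generator("Review: the service was excellent!")
+```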
 
 ## Monitoring API use
diff --git a/outlines/models/__init__.py b/outlines/models/__init__.py
index ca3335d08..15b370a85 100644
--- a/outlines/models/__init__.py
+++ b/outlines/models/__init__.py
@@ -7,12 +7,10 @@
 """
 from typing import Union
 
-from .azure import AzureOpenAI, azure_openai
 from .exllamav2 import ExLlamaV2Model, exl2
 from .llamacpp import LlamaCpp, llamacpp
 from .mamba import Mamba, mamba
-from .openai import OpenAI, openai
-from .openai_compatible import OpenAICompatibleAPI, openai_compatible_api
+from .openai import OpenAI, azure_openai, openai
 from .transformers import Transformers, transformers
 
 LogitsGenerator = Union[Transformers, LlamaCpp, ExLlamaV2Model, Mamba]
diff --git a/outlines/models/azure.py b/outlines/models/azure.py
deleted file mode 100644
index 9d93539a5..000000000
--- a/outlines/models/azure.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""Integration with Azure OpenAI's API."""
-import functools
-import os
-from dataclasses import replace
-from typing import Optional
-
-from outlines.models.openai import OpenAI, OpenAIConfig
-
-__all__ = ["AzureOpenAI", "azure_openai"]
-
-
-AZURE_API_VERSION = "2023-05-15"
-
-
-class AzureOpenAI(OpenAI):
-    def __init__(
-        self,
-        model_name: str,
-        deployment_name: str,
-        azure_endpoint: Optional[str] = None,
-        api_key: Optional[str] = None,
-        max_retries: int = 6,
-        timeout: Optional[float] = None,
-        system_prompt: Optional[str] = None,
-        config: Optional[OpenAIConfig] = None,
-    ):
-        """Create an `AzureOpenAI` instance.
-
-        Parameters
-        ----------
-        model_name
-            The name of the OpenAI model being used
-        deployment_name
-            The name of your Azure OpenAI deployment
-        api_key
-            Secret key to use with the OpenAI API. One can also set the
-            `OPENAI_API_KEY` environment variable, or the value of
-            `openai.api_key`.
-        max_retries
-            The maximum number of retries when calls to the API fail.
-        timeout
-            Duration after which the request times out.
-        system_prompt
-            The content of the system message that precedes the user's prompt.
-        config
-            An instance of `OpenAIConfig`. Can be useful to specify some
-            parameters that cannot be set by calling this class' methods.
-
-        """
-        try:
-            import openai
-        except ImportError:
-            raise ImportError(
-                "The `openai` library needs to be installed in order to use Outlines' Azure OpenAI integration."
-            )
-        try:
-            client = openai.OpenAI()
-            client.models.retrieve(model_name)
-        except openai.NotFoundError:
-            raise ValueError(
-                "Invalid model_name. Check openai models list at https://platform.openai.com/docs/models"
-            )
-
-        self.model_name = model_name
-
-        if api_key is None:
-            if os.getenv("AZURE_OPENAI_KEY") is not None:
-                api_key = os.getenv("AZURE_OPENAI_KEY")
-            elif openai.api_key is not None:
-                api_key = openai.api_key
-            else:
-                raise ValueError(
-                    "You must specify an API key to use the Azure OpenAI API integration."
-                )
-        if azure_endpoint is None:
-            if os.getenv("AZURE_OPENAI_ENDPOINT") is not None:
-                azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
-            else:
-                raise ValueError(
-                    "You must specify an API base to use the Azure OpenAI API integration."
-                )
-
-        if config is not None:
-            self.config = replace(config, model=deployment_name)  # type: ignore
-        else:
-            self.config = OpenAIConfig(model=deployment_name)
-
-        # This is necesssary because of an issue with the OpenAI API.
-        # Status updates: https://github.com/openai/openai-python/issues/769
-        self.create_client = functools.partial(
-            openai.AsyncAzureOpenAI,
-            azure_endpoint=azure_endpoint,
-            api_key=api_key,
-            api_version=AZURE_API_VERSION,
-            max_retries=max_retries,
-            timeout=timeout,
-        )
-
-        self.system_prompt = system_prompt
-
-        # We count the total number of prompt and generated tokens as returned
-        # by the OpenAI API, summed over all the requests performed with this
-        # model instance.
-        self.prompt_tokens = 0
-        self.completion_tokens = 0
-
-    @property
-    def tokenizer(self):
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "The `tiktoken` library needs to be installed in order to choose `outlines.models.openai` with `is_in`"
-            )
-
-        return tiktoken.encoding_for_model(self.model_name)
-
-
-azure_openai = AzureOpenAI
diff --git a/outlines/models/openai.py b/outlines/models/openai.py
index 4f9b5a869..8de138fa1 100644
--- a/outlines/models/openai.py
+++ b/outlines/models/openai.py
@@ -1,20 +1,15 @@
 """Integration with OpenAI's API."""
 import functools
-import os
-import textwrap
 from dataclasses import asdict, dataclass, field, replace
 from itertools import zip_longest
-from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Tuple, Union
+from typing import Callable, Dict, List, Optional, Set, Tuple, Union
 
 import numpy as np
 
 from outlines.base import vectorize
 from outlines.caching import cache
 
-__all__ = ["OpenAI", "openai"]
-
-if TYPE_CHECKING:
-    from openai import AsyncOpenAI
+__all__ = ["OpenAI", "openai", "azure_openai"]
 
 
 @dataclass(frozen=True)
@@ -27,7 +22,7 @@ class OpenAIConfig:
 
     Properties
     ----------
-    model_name
+    model
         The name of the model. Available models can be found on OpenAI's website.
     frequence_penalty
        Number between 2.0 and -2.0.
        Positive values penalize new tokens based on
@@ -66,7 +61,7 @@ class OpenAIConfig:
     response_format: Optional[Dict[str, str]] = None
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
-    temperature: Optional[float] = None
+    temperature: float = 1.0
     top_p: int = 1
     user: str = field(default_factory=str)
 
@@ -76,74 +71,31 @@ class OpenAI:
 
     def __init__(
         self,
-        model_name: str,
-        api_key: Optional[str] = None,
-        max_retries: int = 6,
-        timeout: Optional[float] = None,
-        system_prompt: Optional[str] = None,
-        config: Optional[OpenAIConfig] = None,
+        client,
+        config,
+        tokenizer=None,
     ):
         """Create an `OpenAI` instance.
 
+        This class supports the standard OpenAI API, the Azure OpenAI API, as
+        well as compatible APIs that rely on the OpenAI client.
+
         Parameters
         ----------
-        model_name
-            Model to use, as defined in OpenAI's documentation
-        api_key
-            Secret key to use with the OpenAI API. One can also set the
-            `OPENAI_API_KEY` environment variable, or the value of
-            `openai.api_key`.
-        max_retries
-            The maximum number of retries when calls to the API fail.
-        timeout
-            Duration after which the request times out.
-        system_prompt
-            The content of the system message that precedes the user's prompt.
+        client
+            An instance of the API's async client.
         config
             An instance of `OpenAIConfig`. Can be useful to specify some
             parameters that cannot be set by calling this class' methods.
+        tokenizer
+            The tokenizer associated with the model the client connects to.
 
         """
-        try:
-            import openai
-        except ImportError:
-            raise ImportError(
-                "The `openai` library needs to be installed in order to use Outlines' OpenAI integration."
-            )
-
-        if api_key is None:
-            if os.getenv("OPENAI_API_KEY") is not None:
-                api_key = os.getenv("OPENAI_API_KEY")
-            elif openai.api_key is not None:
-                api_key = openai.api_key
-            else:
-                raise ValueError(
-                    "You must specify an API key to use the OpenAI API integration."
-                )
-        try:
-            client = openai.OpenAI(api_key=api_key)
-            client.models.retrieve(model_name)
-        except openai.NotFoundError:
-            raise ValueError(
-                "Invalid model_name. Check openai models list at https://platform.openai.com/docs/models"
-            )
-
-        if config is not None:
-            self.config = replace(config, model=model_name)  # type: ignore
-        else:
-            self.config = OpenAIConfig(model=model_name)
-
-        # This is necesssary because of an issue with the OpenAI API.
-        # Status updates: https://github.com/openai/openai-python/issues/769
-        self.create_client = functools.partial(
-            openai.AsyncOpenAI,
-            api_key=api_key,
-            max_retries=max_retries,
-            timeout=timeout,
-        )
-
-        self.system_prompt = system_prompt
+        self.client = client
+        self.tokenizer = tokenizer
+        self.config = config
 
         # We count the total number of prompt and generated tokens as returned
         # by the OpenAI API, summed over all the requests performed with this
@@ -157,7 +109,8 @@ def __call__(
         self,
         prompt: Union[str, List[str]],
         max_tokens: Optional[int] = None,
         stop_at: Optional[Union[List[str], str]] = None,
         *,
-        temperature: float = 1.0,
+        system_prompt: Optional[str] = None,
+        temperature: Optional[float] = None,
         samples: Optional[int] = None,
     ) -> np.ndarray:
         """Call the OpenAI API to generate text.
 
@@ -168,6 +121,11 @@ def __call__(
         prompt
             A string or list of strings that will be used to prompt the model
         max_tokens
             The maximum number of tokens to generate
+        stop_at
+            A string or list of strings at which the generation stops.
+        system_prompt
+            The content of the system message that precedes the user's prompt.
         temperature
             The value of the temperature used to sample tokens
         samples
             The number of completions to generate for each prompt
-        stop_at
-            Up to 4 words where the API will stop the completion.
 
         """
@@ -176,52 +134,36 @@ def __call__(
+        if max_tokens is None:
+            max_tokens = self.config.max_tokens
+        if stop_at is None:
+            stop_at = self.config.stop
+        if temperature is None:
+            temperature = self.config.temperature
         if samples is None:
             samples = self.config.n
 
-        config = replace(self.config, max_tokens=max_tokens, n=samples, stop=stop_at)  # type: ignore
+        config = replace(self.config, max_tokens=max_tokens, temperature=temperature, n=samples, stop=stop_at)  # type: ignore
 
-        if isinstance(stop_at, list) and len(stop_at) > 4:
-            raise NotImplementedError(
-                "The OpenAI API supports at most 4 stop sequences."
-            )
-
-        if "text-" in self.config.model:
-            raise NotImplementedError(
-                textwrap.dedent(
-                    "Most models that support the legacy completion endpoints will be "
-                    "deprecated on January 2024. Use Chat models instead.\n"
-                    "The list of chat models is available at https://platform.openai.com/docs/guides/text-generation."
-                )
-            )
-        if "gpt-" in self.config.model:
-            client = self.create_client()
-            response, prompt_tokens, completion_tokens = generate_chat(
-                prompt, self.system_prompt, client, config
-            )
-            self.prompt_tokens += prompt_tokens
-            self.completion_tokens += completion_tokens
+        response, prompt_tokens, completion_tokens = generate_chat(
+            prompt, system_prompt, self.client, config
+        )
+        self.prompt_tokens += prompt_tokens
+        self.completion_tokens += completion_tokens
 
-            return response
+        return response
 
     def stream(self, *args, **kwargs):
         raise NotImplementedError(
             "Streaming is currently not supported for the OpenAI API"
         )
 
-    @property
-    def tokenizer(self):
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "The `tiktoken` library needs to be installed in order to choose `outlines.models.openai` with `is_in`"
-            )
-
-        return tiktoken.encoding_for_model(self.config.model)
-
     def generate_choice(
-        self, prompt: str, choices: List[str], max_tokens: Optional[int] = None
+        self,
+        prompt: str,
+        choices: List[str],
+        max_tokens: Optional[int] = None,
+        system_prompt: Optional[str] = None,
     ) -> str:
         """Call the OpenAI API to generate one of several choices.
 
@@ -233,8 +175,15 @@ def generate_choice(
             The list of strings between which we ask the model to choose
         max_tokens
             The maximum number of tokens to generate
+        system_prompt
+            The content of the system message that precedes the user's prompt.
 
         """
+        if self.tokenizer is None:
+            raise ValueError(
+                "You must initialize the `OpenAI` class with a tokenizer to use `outlines.generate.choice`"
+            )
+
         config = replace(self.config, max_tokens=max_tokens)
 
         greedy = False
@@ -262,9 +211,8 @@ def generate_choice(
 
             config = replace(config, logit_bias=mask, max_tokens=max_tokens_left)
 
-            client = self.create_client()
             response, prompt_tokens, completion_tokens = generate_chat(
-                prompt, self.system_prompt, client, config
+                prompt, system_prompt, self.client, config
             )
             self.prompt_tokens += prompt_tokens
             self.completion_tokens += completion_tokens
@@ -316,7 +264,7 @@ def __repr__(self):
 async def generate_chat(
     prompt: str,
     system_prompt: Union[str, None],
-    client: "AsyncOpenAI",
+    client,
     config: OpenAIConfig,
 ) -> Tuple[np.ndarray, int, int]:
     """Call OpenAI's Chat Completion API.
@@ -340,14 +288,13 @@ async def generate_chat(
 
     """
 
+    @error_handler
     @cache()
     async def call_api(prompt, system_prompt, config):
         responses = await client.chat.completions.create(
             messages=system_message + user_message,
             **asdict(config),  # type: ignore
         )
-        await client.close()
-
         return responses.model_dump()
 
     system_message = (
@@ -365,9 +312,6 @@ async def call_api(prompt, system_prompt, config):
     return results, usage["prompt_tokens"], usage["completion_tokens"]
 
 
-openai = OpenAI
-
-
 def find_longest_intersection(response: List[int], choice: List[int]) -> List[int]:
     """Find the longest intersection between the response and the choice."""
     for i, (token_r, token_c) in enumerate(zip_longest(response, choice)):
@@ -468,3 +412,57 @@ def call(*args, **kwargs):
             raise e
 
     return call
+
+
+def openai(
+    model_name: str,
+    api_key: Optional[str] = None,
+    config: Optional[OpenAIConfig] = None,
+):
+    """Create an `OpenAI` instance backed by the official OpenAI API.
+
+    `model_name` is also used to select the matching `tiktoken` tokenizer.
+    """
+    try:
+        import tiktoken
+        from openai import AsyncOpenAI
+    except ImportError:
+        raise ImportError(
+            "The `openai` and `tiktoken` libraries need to be installed in order to use Outlines' OpenAI integration."
+        )
+
+    if config is not None:
+        config = replace(config, model=model_name)  # type: ignore
+    else:
+        config = OpenAIConfig(model=model_name)
+
+    client = AsyncOpenAI(api_key=api_key)
+    tokenizer = tiktoken.encoding_for_model(model_name)
+
+    return OpenAI(client, config, tokenizer)
+
+
+def azure_openai(
+    model_name: str,
+    azure_endpoint: Optional[str] = None,
+    api_version: Optional[str] = None,
+    api_key: Optional[str] = None,
+    config: Optional[OpenAIConfig] = None,
+):
+    """Create an `OpenAI` instance backed by the Azure OpenAI API.
+
+    `model_name` fills the `model` field of the requests and selects the
+    `tiktoken` tokenizer.
+    """
+    try:
+        import tiktoken
+        from openai import AsyncAzureOpenAI
+    except ImportError:
+        raise ImportError(
+            "The `openai` and `tiktoken` libraries need to be installed in order to use Outlines' Azure OpenAI integration."
+        )
+
+    if config is not None:
+        config = replace(config, model=model_name)  # type: ignore
+    else:
+        config = OpenAIConfig(model=model_name)
+
+    client = AsyncAzureOpenAI(
+        azure_endpoint=azure_endpoint, api_version=api_version, api_key=api_key
+    )
+    tokenizer = tiktoken.encoding_for_model(model_name)
+
+    return OpenAI(client, config, tokenizer)
diff --git a/outlines/models/openai_compatible.py b/outlines/models/openai_compatible.py
deleted file mode 100644
index 10547bc24..000000000
--- a/outlines/models/openai_compatible.py
+++ /dev/null
@@ -1,155 +0,0 @@
-"""Integration with custom OpenAI compatible APIs."""
-import functools
-import os
-from dataclasses import replace
-from typing import List, Optional, Union
-
-import numpy as np
-
-from outlines.models.openai import OpenAI, OpenAIConfig, generate_chat
-
-__all__ = ["OpenAICompatibleAPI", "openai_compatible_api"]
-
-
-class OpenAICompatibleAPI(OpenAI):
-    """An object that represents an OpenAI-compatible API."""
-
-    def __init__(
-        self,
-        model_name: str,
-        api_key: Optional[str] = None,
-        base_url: Optional[str] = None,
-        max_retries: int = 6,
-        timeout: Optional[float] = None,
-        system_prompt: Optional[str] = None,
-        config: Optional[OpenAIConfig] = None,
-        encoding="gpt-4",  # Default for tiktoken, should USUALLY work
-    ):
-        """Create an `OpenAI` instance.
-
-        Parameters
-        ----------
-        model_name
-            Model to use, as defined in OpenAI's documentation
-        api_key
-            Secret key to use with the OpenAI compatible API. One can also set the
-            `INFERENCE_API_KEY` environment variable, or the value of
-            `openai.api_key`.
-        base_url
-            Base URL to use for the API calls.
Required if a Custom OpenAI endpoint is used. - Can also be set with the `INFERENCE_BASE_URL` environment variable. - max_retries - The maximum number of retries when calls to the API fail. - timeout - Duration after which the request times out. - system_prompt - The content of the system message that precedes the user's prompt. - config - An instance of `OpenAIConfig`. Can be useful to specify some - parameters that cannot be set by calling this class' methods. - - """ - - try: - import openai - except ImportError: - raise ImportError( - "The `openai` library needs to be installed in order to use Outlines' OpenAI integration." - ) - - if api_key is None: - if os.getenv("INFERENCE_API_KEY") is not None: - api_key = os.getenv("INFERENCE_API_KEY") - elif openai.api_key is not None: - api_key = openai.api_key - else: - raise ValueError( - "You must specify an API key to use the Custom OpenAI API integration." - ) - - if base_url is None: - if os.getenv("INFERENCE_BASE_URL") is not None: - base_url = os.getenv("INFERENCE_BASE_URL") - else: - raise ValueError( - "You must specify a base URL to use the Custom OpenAI API integration." - ) - - if config is not None: - self.config = replace(config, model=model_name) # type: ignore - else: - self.config = OpenAIConfig(model=model_name) - - # This is necesssary because of an issue with the OpenAI API. - # Status updates: https://github.com/openai/openai-python/issues/769 - self.create_client = functools.partial( - openai.AsyncOpenAI, - api_key=api_key, - base_url=base_url, - max_retries=max_retries, - timeout=timeout, - ) - - self.system_prompt = system_prompt - - # We count the total number of prompt and generated tokens as returned - # by the OpenAI API, summed over all the requests performed with this - # model instance. - self.prompt_tokens = 0 - self.completion_tokens = 0 - self.encoding = encoding - - def __call__( - self, - prompt: Union[str, List[str]], - max_tokens: Optional[int] = None, - stop_at: Optional[Union[List[str], str]] = None, - *, - temperature: float = 1.0, - samples: Optional[int] = None, - ) -> np.ndarray: - """Call the OpenAI compatible API to generate text. - - Parameters - ---------- - prompt - A string or list of strings that will be used to prompt the model - max_tokens - The maximum number of tokens to generate - temperature - The value of the temperature used to sample tokens - samples - The number of completions to generate for each prompt - stop_at - Up to 4 words where the API will stop the completion. - - """ - if samples is None: - samples = self.config.n - - config = replace(self.config, max_tokens=max_tokens, n=samples, stop=stop_at, temperature=temperature) # type: ignore - - # We assume it's using the chat completion API style as that's the most commonly supported - client = self.create_client() - response, prompt_tokens, completion_tokens = generate_chat( - prompt, self.system_prompt, client, config - ) - self.prompt_tokens += prompt_tokens - self.completion_tokens += completion_tokens - - return response - - @property - def tokenizer(self): - """Defaults to gpt4, as that seems to work with most custom endpoints. 
Can be overridden if required in the constructor"""
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "The `tiktoken` library needs to be installed in order to choose `outlines.models.openai` with `is_in`"
-            )
-
-        return tiktoken.encoding_for_model(self.encoding)
-
-
-openai_compatible_api = OpenAICompatibleAPI
diff --git a/pyproject.toml b/pyproject.toml
index d131e8453..62e37e752 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,8 +56,9 @@ test = [
     "beartype<0.16.0",
     "datasets",
     "responses",
-    "llama-cpp-python>=0.2.42",
-    "huggingface_hub"
+    "llama-cpp-python",
+    "huggingface_hub",
+    "openai>=1.0.0"
 ]
 serve = [
     "vllm>=0.3.0",
diff --git a/tests/models/test_openai.py b/tests/models/test_openai.py
index c2e885eb1..5ba1b1411 100644
--- a/tests/models/test_openai.py
+++ b/tests/models/test_openai.py
@@ -1,12 +1,68 @@
+import importlib
+from unittest import mock
+from unittest.mock import MagicMock
+
 import pytest
+from openai import AsyncOpenAI
 
 from outlines.models.openai import (
+    OpenAI,
+    OpenAIConfig,
     build_optimistic_mask,
     find_longest_intersection,
     find_response_choices_intersection,
 )
 
 
+def module_patch(path):
+    """Patch functions that have the same name as the module in which they're implemented."""
+    target = path
+    components = target.split(".")
+    for i in range(len(components), 0, -1):
+        try:
+            # attempt to import the module
+            imported = importlib.import_module(".".join(components[:i]))
+
+            # module was imported, let's use it in the patch
+            patch = mock.patch(path)
+            patch.getter = lambda: imported
+            patch.attribute = ".".join(components[i:])
+            return patch
+        except Exception:
+            continue
+
+    # did not find a module, just return the default mock
+    return mock.patch(path)
+
+
+def test_openai_call():
+    with module_patch("outlines.models.openai.generate_chat") as mocked_generate_chat:
+        mocked_generate_chat.return_value = ["foo"], 1, 2
+        async_client = MagicMock(spec=AsyncOpenAI, api_key="key")
+
+        model = OpenAI(
+            async_client,
+            OpenAIConfig(model="gpt-4", max_tokens=10, temperature=0.5, n=2, stop=["."]),
+        )
+
+        assert model("bar")[0] == "foo"
+        assert model.prompt_tokens == 1
+        assert model.completion_tokens == 2
+        mocked_generate_chat_args = mocked_generate_chat.call_args
+        mocked_generate_chat_arg_config = mocked_generate_chat_args[0][3]
+        assert isinstance(mocked_generate_chat_arg_config, OpenAIConfig)
+        assert mocked_generate_chat_arg_config.max_tokens == 10
+        assert mocked_generate_chat_arg_config.temperature == 0.5
+        assert mocked_generate_chat_arg_config.n == 2
+        assert mocked_generate_chat_arg_config.stop == ["."]
+
+        model("bar", samples=3)
+        mocked_generate_chat_args = mocked_generate_chat.call_args
+        mocked_generate_chat_arg_config = mocked_generate_chat_args[0][3]
+        assert mocked_generate_chat_arg_config.n == 3
+
+
 @pytest.mark.parametrize(
     "response,choice,expected_intersection,expected_choices_left",
     (