From 7f1bfe5b3121eea5db4c04223360edf644fe8efb Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 3 Mar 2024 22:08:19 -0800 Subject: [PATCH] Autogen python client docs (#295) --- docs/reference/python_client.md | 235 ------------ docs/reference/python_client/client.md | 478 +++++++++++++++++++++++++ docs/reference/python_client/index.md | 89 +++++ mkdocs.yml | 5 +- 4 files changed, 571 insertions(+), 236 deletions(-) delete mode 100644 docs/reference/python_client.md create mode 100644 docs/reference/python_client/client.md create mode 100644 docs/reference/python_client/index.md diff --git a/docs/reference/python_client.md b/docs/reference/python_client.md deleted file mode 100644 index 9e238d5e9..000000000 --- a/docs/reference/python_client.md +++ /dev/null @@ -1,235 +0,0 @@ -# Python Client - -LoRAX Python client provides a convenient way of interfacing with a -`lorax` instance running in your environment. - -## Install - -```shell -pip install lorax-client -``` - -## Usage - -```python -from lorax import Client - -endpoint_url = "http://127.0.0.1:8080" - -client = Client(endpoint_url) -text = client.generate("Why is the sky blue?", adapter_id="some/adapter").generated_text -print(text) -# ' Rayleigh scattering' - -# Token Streaming -text = "" -for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"): - if not response.token.special: - text += response.token.text - -print(text) -# ' Rayleigh scattering' -``` - -or with the asynchronous client: - -```python -from lorax import AsyncClient - -endpoint_url = "http://127.0.0.1:8080" - -client = AsyncClient(endpoint_url) -response = await client.generate("Why is the sky blue?", adapter_id="some/adapter") -print(response.generated_text) -# ' Rayleigh scattering' - -# Token Streaming -text = "" -async for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"): - if not response.token.special: - text += response.token.text - -print(text) -# ' Rayleigh scattering' -``` - -### Predibase Inference Endpoints - -The LoRAX client can also be used to connect to [Predibase](https://predibase.com/) managed LoRAX endpoints (including Predibase's [serverless endpoints](https://docs.predibase.com/user-guide/inference/serverless_deployments)). - -You need only make the following changes to the above examples: - -1. Change the `endpoint_url` to match the endpoint of your Predibase LLM of choice. -2. Provide your Predibase API token in the `headers` provided to the client. - -Example: - -```python -from lorax import Client - -# You can get your Predibase API token by going to Settings > My Profile > Generate API Token -# You can get your Predibase Tenant short code by going to Settings > My Profile > Overview > Tenant ID -endpoint_url = f"https://serving.app.predibase.com/{predibase_tenant_short_code}/deployments/v2/llms/{llm_deployment_name}" -headers = { - "Authorization": f"Bearer {api_token}" -} - -client = Client(endpoint_url, headers=headers) - -# same as above from here ... -response = client.generate("Why is the sky blue?", adapter_id=f"{model_repo}/{model_version}") -``` - -Note that by default Predibase will use its internal model repos as the default `adapter_source`. 
To use an adapter from Huggingface: - -```python -response = client.generate("Why is the sky blue?", adapter_id="some/adapter", adapter_source="hub") -``` - -## Types - -```python -# Request Parameters -class Parameters: - # The ID of the adapter to use - adapter_id: Optional[str] - # The source of the adapter to use - adapter_source: Optional[str] - # API token for accessing private adapters - api_token: Optional[str] - # Activate logits sampling - do_sample: bool - # Maximum number of generated tokens - max_new_tokens: int - # The parameter for repetition penalty. 1.0 means no penalty. - # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. - repetition_penalty: Optional[float] - # Whether to prepend the prompt to the generated text - return_full_text: bool - # Stop generating tokens if a member of `stop_sequences` is generated - stop: List[str] - # Random sampling seed - seed: Optional[int] - # The value used to module the logits distribution. - temperature: Optional[float] - # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_k: Optional[int] - # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or - # higher are kept for generation. - top_p: Optional[float] - # truncate inputs tokens to the given size - truncate: Optional[int] - # Typical Decoding mass - # See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information - typical_p: Optional[float] - # Generate best_of sequences and return the one if the highest token logprobs - best_of: Optional[int] - # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) - watermark: bool - # Get decoder input token logprobs and ids - decoder_input_details: bool - -# Decoder input tokens -class InputToken: - # Token ID from the model tokenizer - id: int - # Token text - text: str - # Logprob - # Optional since the logprob of the first token cannot be computed - logprob: Optional[float] - - -# Generated tokens -class Token: - # Token ID from the model tokenizer - id: int - # Token text - text: str - # Logprob - logprob: float - # Is the token a special token - # Can be used to ignore tokens when concatenating - special: bool - - -# Generation finish reason -class FinishReason(Enum): - # number of generated tokens == `max_new_tokens` - Length = "length" - # the model generated its end of sequence token - EndOfSequenceToken = "eos_token" - # the model generated a text included in `stop_sequences` - StopSequence = "stop_sequence" - - -# Additional sequences when using the `best_of` parameter -class BestOfSequence: - # Generated text - generated_text: str - # Generation finish reason - finish_reason: FinishReason - # Number of generated tokens - generated_tokens: int - # Sampling seed if sampling was activated - seed: Optional[int] - # Decoder input tokens, empty if decoder_input_details is False - prefill: List[InputToken] - # Generated tokens - tokens: List[Token] - - -# `generate` details -class Details: - # Generation finish reason - finish_reason: FinishReason - # Number of prompt tokens - prompt_tokens: int - # Number of generated tokens - generated_tokens: int - # Sampling seed if sampling was activated - seed: Optional[int] - # Decoder input tokens, empty if decoder_input_details is False - prefill: List[InputToken] - # Generated tokens - tokens: List[Token] - # Additional sequences when using the `best_of` parameter - best_of_sequences: 
Optional[List[BestOfSequence]] - - -# `generate` return value -class Response: - # Generated text - generated_text: str - # Generation details - details: Details - - -# `generate_stream` details -class StreamDetails: - # Generation finish reason - finish_reason: FinishReason - # Number of prompt tokens - prompt_tokens: int - # Number of generated tokens - generated_tokens: int - # Sampling seed if sampling was activated - seed: Optional[int] - - -# `generate_stream` return value -class StreamResponse: - # Generated token - token: Token - # Complete generated text - # Only available when the generation is finished - generated_text: Optional[str] - # Generation details - # Only available when the generation is finished - details: Optional[StreamDetails] - -# Inference API currently deployed model -class DeployedModel: - model_id: str - sha: str -``` diff --git a/docs/reference/python_client/client.md b/docs/reference/python_client/client.md new file mode 100644 index 000000000..2ac52e2b5 --- /dev/null +++ b/docs/reference/python_client/client.md @@ -0,0 +1,478 @@ +# Table of Contents + +* [lorax.client](#lorax.client) + * [Client](#lorax.client.Client) + * [\_\_init\_\_](#lorax.client.Client.__init__) + * [generate](#lorax.client.Client.generate) + * [generate\_stream](#lorax.client.Client.generate_stream) + * [AsyncClient](#lorax.client.AsyncClient) + * [\_\_init\_\_](#lorax.client.AsyncClient.__init__) + * [generate](#lorax.client.AsyncClient.generate) + * [generate\_stream](#lorax.client.AsyncClient.generate_stream) + + + +# lorax.client + + + +## Client Objects + +```python +class Client() +``` + +Client to make calls to a LoRAX instance + +**Example**: + + +```python +from lorax import Client + +client = Client("http://127.0.0.1:8080") +client.generate("Why is the sky blue?", adapter_id="some/adapter").generated_text + ' Rayleigh scattering' + +result = "" +for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"): + if not response.token.special: + result += response.token.text +result +' Rayleigh scattering' +``` + + + +#### \_\_init\_\_ + +```python +def __init__(base_url: str, + headers: Optional[Dict[str, str]] = None, + cookies: Optional[Dict[str, str]] = None, + timeout: int = 60) +``` + +**Arguments**: + + - base_url (`str`): + LoRAX instance base url + - headers (`Optional[Dict[str, str]]`): + Additional headers + - cookies (`Optional[Dict[str, str]]`): + Cookies to include in the requests + - timeout (`int`): + Timeout in seconds + + + +#### generate + +```python +def generate(prompt: str, + adapter_id: Optional[str] = None, + adapter_source: Optional[str] = None, + merged_adapters: Optional[MergedAdapters] = None, + api_token: Optional[str] = None, + do_sample: bool = False, + max_new_tokens: int = 20, + best_of: Optional[int] = None, + repetition_penalty: Optional[float] = None, + return_full_text: bool = False, + seed: Optional[int] = None, + stop_sequences: Optional[List[str]] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, + watermark: bool = False, + response_format: Optional[Union[Dict[str, Any], + ResponseFormat]] = None, + decoder_input_details: bool = False, + details: bool = True) -> Response +``` + +Given a prompt, generate the following text + +**Arguments**: + + - prompt (`str`): + Input text + - adapter_id (`Optional[str]`): + Adapter ID to apply to the base model for the request + - adapter_source 
(`Optional[str]`):
+  Source of the adapter (hub, local, s3)
+- merged_adapters (`Optional[MergedAdapters]`):
+  Merged adapters to apply to the base model for the request
+- api_token (`Optional[str]`):
+  API token for accessing private adapters
+- do_sample (`bool`):
+  Activate logits sampling
+- max_new_tokens (`int`):
+  Maximum number of generated tokens
+- best_of (`int`):
+  Generate best_of sequences and return the one with the highest token logprobs
+- repetition_penalty (`float`):
+  The parameter for repetition penalty. 1.0 means no penalty. See [this
+  paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+- return_full_text (`bool`):
+  Whether to prepend the prompt to the generated text
+- seed (`int`):
+  Random sampling seed
+- stop_sequences (`List[str]`):
+  Stop generating tokens if a member of `stop_sequences` is generated
+- temperature (`float`):
+  The value used to modulate the logits distribution.
+- top_k (`int`):
+  The number of highest probability vocabulary tokens to keep for top-k filtering.
+- top_p (`float`):
+  If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+  higher are kept for generation.
+- truncate (`int`):
+  Truncate input tokens to the given size
+- typical_p (`float`):
+  Typical Decoding mass
+  See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
+- watermark (`bool`):
+  Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+- response_format (`Optional[Union[Dict[str, Any], ResponseFormat]]`):
+  Optional specification of a format to impose upon the generated text, e.g.,:
+  ```
+  {
+      "type": "json_object",
+      "schema": {
+          "type": "string",
+          "title": "response"
+      }
+  }
+  ```
+- decoder_input_details (`bool`):
+  Return the decoder input token logprobs and ids
+- details (`bool`):
+  Return the token logprobs and ids for generated tokens
+
+**Returns**:
+
+- `Response` - generated response
+
+
+#### generate\_stream
+
+```python
+def generate_stream(prompt: str,
+                    adapter_id: Optional[str] = None,
+                    adapter_source: Optional[str] = None,
+                    merged_adapters: Optional[MergedAdapters] = None,
+                    api_token: Optional[str] = None,
+                    do_sample: bool = False,
+                    max_new_tokens: int = 20,
+                    repetition_penalty: Optional[float] = None,
+                    return_full_text: bool = False,
+                    seed: Optional[int] = None,
+                    stop_sequences: Optional[List[str]] = None,
+                    temperature: Optional[float] = None,
+                    top_k: Optional[int] = None,
+                    top_p: Optional[float] = None,
+                    truncate: Optional[int] = None,
+                    typical_p: Optional[float] = None,
+                    watermark: bool = False,
+                    response_format: Optional[Union[Dict[str, Any],
+                                                    ResponseFormat]] = None,
+                    details: bool = True) -> Iterator[StreamResponse]
+```
+
+Given a prompt, generate the following stream of tokens
+
+**Arguments**:
+
+- prompt (`str`):
+  Input text
+- adapter_id (`Optional[str]`):
+  Adapter ID to apply to the base model for the request
+- adapter_source (`Optional[str]`):
+  Source of the adapter (hub, local, s3)
+- merged_adapters (`Optional[MergedAdapters]`):
+  Merged adapters to apply to the base model for the request
+- api_token (`Optional[str]`):
+  API token for accessing private adapters
+- do_sample (`bool`):
+  Activate logits sampling
+- max_new_tokens (`int`):
+  Maximum number of generated tokens
+- repetition_penalty (`float`):
+  The parameter for repetition penalty. 1.0 means no penalty. See [this
+  paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+- return_full_text (`bool`):
+  Whether to prepend the prompt to the generated text
+- seed (`int`):
+  Random sampling seed
+- stop_sequences (`List[str]`):
+  Stop generating tokens if a member of `stop_sequences` is generated
+- temperature (`float`):
+  The value used to modulate the logits distribution.
+- top_k (`int`):
+  The number of highest probability vocabulary tokens to keep for top-k filtering.
+- top_p (`float`):
+  If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+  higher are kept for generation.
+- truncate (`int`):
+  Truncate input tokens to the given size
+- typical_p (`float`):
+  Typical Decoding mass
+  See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
+- watermark (`bool`):
+  Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+- response_format (`Optional[Union[Dict[str, Any], ResponseFormat]]`):
+  Optional specification of a format to impose upon the generated text, e.g.,:
+  ```
+  {
+      "type": "json_object",
+      "schema": {
+          "type": "string",
+          "title": "response"
+      }
+  }
+  ```
+- details (`bool`):
+  Return the token logprobs and ids for generated tokens
+
+**Returns**:
+
+- `Iterator[StreamResponse]` - stream of generated tokens
+
+
+## AsyncClient Objects
+
+```python
+class AsyncClient()
+```
+
+Asynchronous Client to make calls to a LoRAX instance
+
+**Example**:
+
+```python
+from lorax import AsyncClient
+
+client = AsyncClient("http://127.0.0.1:8080")
+response = await client.generate("Why is the sky blue?", adapter_id="some/adapter")
+response.generated_text
+' Rayleigh scattering'
+
+result = ""
+async for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"):
+    if not response.token.special:
+        result += response.token.text
+result
+' Rayleigh scattering'
+```
+
+
+#### \_\_init\_\_
+
+```python
+def __init__(base_url: str,
+             headers: Optional[Dict[str, str]] = None,
+             cookies: Optional[Dict[str, str]] = None,
+             timeout: int = 60)
+```
+
+**Arguments**:
+
+- base_url (`str`):
+  LoRAX instance base url
+- headers (`Optional[Dict[str, str]]`):
+  Additional headers
+- cookies (`Optional[Dict[str, str]]`):
+  Cookies to include in the requests
+- timeout (`int`):
+  Timeout in seconds
+
+
+#### generate
+
+```python
+async def generate(prompt: str,
+                   adapter_id: Optional[str] = None,
+                   adapter_source: Optional[str] = None,
+                   merged_adapters: Optional[MergedAdapters] = None,
+                   api_token: Optional[str] = None,
+                   do_sample: bool = False,
+                   max_new_tokens: int = 20,
+                   best_of: Optional[int] = None,
+                   repetition_penalty: Optional[float] = None,
+                   return_full_text: bool = False,
+                   seed: Optional[int] = None,
+                   stop_sequences: Optional[List[str]] = None,
+                   temperature: Optional[float] = None,
+                   top_k: Optional[int] = None,
+                   top_p: Optional[float] = None,
+                   truncate: Optional[int] = None,
+                   typical_p: Optional[float] = None,
+                   watermark: bool = False,
+                   response_format: Optional[Union[Dict[str, Any],
+                                                   ResponseFormat]] = None,
+                   decoder_input_details: bool = False,
+                   details: bool = True) -> Response
+```
+
+Given a prompt, generate the following text asynchronously
+
+**Arguments**:
+
+- prompt (`str`):
+  Input text
+- adapter_id (`Optional[str]`):
+  Adapter ID to apply to the base model for the request
+- adapter_source (`Optional[str]`):
+  Source of the adapter (hub, local, s3)
+- merged_adapters (`Optional[MergedAdapters]`):
+  Merged adapters to apply to the base model for the request
+- api_token (`Optional[str]`):
+  API token for accessing private adapters
+- do_sample (`bool`):
+  Activate logits sampling
+- max_new_tokens (`int`):
+  Maximum number of generated tokens
+- best_of (`int`):
+  Generate best_of sequences and return the one with the highest token logprobs
+- repetition_penalty (`float`):
+  The parameter for repetition penalty. 1.0 means no penalty. See [this
+  paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+- return_full_text (`bool`):
+  Whether to prepend the prompt to the generated text
+- seed (`int`):
+  Random sampling seed
+- stop_sequences (`List[str]`):
+  Stop generating tokens if a member of `stop_sequences` is generated
+- temperature (`float`):
+  The value used to modulate the logits distribution.
+- top_k (`int`):
+  The number of highest probability vocabulary tokens to keep for top-k filtering.
+- top_p (`float`):
+  If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+  higher are kept for generation.
+- truncate (`int`):
+  Truncate input tokens to the given size
+- typical_p (`float`):
+  Typical Decoding mass
+  See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
+- watermark (`bool`):
+  Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+- response_format (`Optional[Union[Dict[str, Any], ResponseFormat]]`):
+  Optional specification of a format to impose upon the generated text, e.g.,:
+  ```
+  {
+      "type": "json_object",
+      "schema": {
+          "type": "string",
+          "title": "response"
+      }
+  }
+  ```
+- decoder_input_details (`bool`):
+  Return the decoder input token logprobs and ids
+- details (`bool`):
+  Return the token logprobs and ids for generated tokens
+
+**Returns**:
+
+- `Response` - generated response
+
+
+#### generate\_stream
+
+```python
+async def generate_stream(
+        prompt: str,
+        adapter_id: Optional[str] = None,
+        adapter_source: Optional[str] = None,
+        merged_adapters: Optional[MergedAdapters] = None,
+        api_token: Optional[str] = None,
+        do_sample: bool = False,
+        max_new_tokens: int = 20,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: bool = False,
+        seed: Optional[int] = None,
+        stop_sequences: Optional[List[str]] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: bool = False,
+        response_format: Optional[Union[Dict[str, Any],
+                                        ResponseFormat]] = None,
+        details: bool = True) -> AsyncIterator[StreamResponse]
+```
+
+Given a prompt, generate the following stream of tokens asynchronously
+
+**Arguments**:
+
+- prompt (`str`):
+  Input text
+- adapter_id (`Optional[str]`):
+  Adapter ID to apply to the base model for the request
+- adapter_source (`Optional[str]`):
+  Source of the adapter (hub, local, s3)
+- merged_adapters (`Optional[MergedAdapters]`):
+  Merged adapters to apply to the base model for the request
+- api_token (`Optional[str]`):
+  API token for accessing private adapters
+- do_sample (`bool`):
+  Activate logits sampling
+- max_new_tokens (`int`):
+  Maximum number of generated tokens
+- repetition_penalty (`float`):
+  The parameter for repetition penalty. 1.0 means no penalty. See [this
+  paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+- return_full_text (`bool`):
+  Whether to prepend the prompt to the generated text
+- seed (`int`):
+  Random sampling seed
+- stop_sequences (`List[str]`):
+  Stop generating tokens if a member of `stop_sequences` is generated
+- temperature (`float`):
+  The value used to modulate the logits distribution.
+- top_k (`int`):
+  The number of highest probability vocabulary tokens to keep for top-k filtering.
+- top_p (`float`):
+  If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+  higher are kept for generation.
+- truncate (`int`):
+  Truncate input tokens to the given size
+- typical_p (`float`):
+  Typical Decoding mass
+  See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
+- watermark (`bool`):
+  Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+- response_format (`Optional[Union[Dict[str, Any], ResponseFormat]]`):
+  Optional specification of a format to impose upon the generated text, e.g.,:
+  ```
+  {
+      "type": "json_object",
+      "schema": {
+          "type": "string",
+          "title": "response"
+      }
+  }
+  ```
+- details (`bool`):
+  Return the token logprobs and ids for generated tokens
+
+**Returns**:
+
+- `AsyncIterator[StreamResponse]` - stream of generated tokens
+
diff --git a/docs/reference/python_client/index.md b/docs/reference/python_client/index.md
new file mode 100644
index 000000000..f9cfe4af0
--- /dev/null
+++ b/docs/reference/python_client/index.md
@@ -0,0 +1,89 @@
+# Python Client
+
+The LoRAX Python client provides a convenient way of interfacing with a
+`lorax` instance running in your environment.
+
+## Install
+
+```shell
+pip install lorax-client
+```
+
+## Usage
+
+```python
+from lorax import Client
+
+endpoint_url = "http://127.0.0.1:8080"
+
+client = Client(endpoint_url)
+text = client.generate("Why is the sky blue?", adapter_id="some/adapter").generated_text
+print(text)
+# ' Rayleigh scattering'
+
+# Token Streaming
+text = ""
+for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"):
+    if not response.token.special:
+        text += response.token.text
+
+print(text)
+# ' Rayleigh scattering'
+```
+
+or with the asynchronous client:
+
+```python
+from lorax import AsyncClient
+
+endpoint_url = "http://127.0.0.1:8080"
+
+client = AsyncClient(endpoint_url)
+response = await client.generate("Why is the sky blue?", adapter_id="some/adapter")
+print(response.generated_text)
+# ' Rayleigh scattering'
+
+# Token Streaming
+text = ""
+async for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"):
+    if not response.token.special:
+        text += response.token.text
+
+print(text)
+# ' Rayleigh scattering'
+```
+
+See the [API reference](./client.md) for full details.
+
+### Predibase Inference Endpoints
+
+The LoRAX client can also be used to connect to [Predibase](https://predibase.com/) managed LoRAX endpoints (including Predibase's [serverless endpoints](https://docs.predibase.com/user-guide/inference/serverless_deployments)).
+
+You need only make the following changes to the above examples:
+
+1. Change the `endpoint_url` to match the endpoint of your Predibase LLM of choice.
+2. Provide your Predibase API token in the `headers` provided to the client.
+
+Example:
+
+```python
+from lorax import Client
+
+# You can get your Predibase API token by going to Settings > My Profile > Generate API Token
+# You can get your Predibase Tenant short code by going to Settings > My Profile > Overview > Tenant ID
+endpoint_url = f"https://serving.app.predibase.com/{predibase_tenant_short_code}/deployments/v2/llms/{llm_deployment_name}"
+headers = {
+    "Authorization": f"Bearer {api_token}"
+}
+
+client = Client(endpoint_url, headers=headers)
+
+# same as above from here ...
+response = client.generate("Why is the sky blue?", adapter_id=f"{model_repo}/{model_version}")
+```
+
+Note that by default Predibase will use its internal model repos as the default `adapter_source`. To use an adapter from the Hugging Face Hub:
+
+```python
+response = client.generate("Why is the sky blue?", adapter_id="some/adapter", adapter_source="hub")
+```
diff --git a/mkdocs.yml b/mkdocs.yml
index 3f5213314..65821cf25 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -42,7 +42,10 @@ nav:
   - 📚 Reference:
     - Launcher: reference/launcher.md
    - REST API: reference/rest_api.md
-    - Python Client: reference/python_client.md
+    - Python Client:
+      - Python Client: reference/python_client/index.md
+      - lorax.client: reference/python_client/client.md
+      # - lorax.types: reference/python_client/types.md
     - OpenAI Compatible API: reference/openai_api.md
   - 🔬 Guides:
     - Quantization: guides/quantization.md
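---

As a quick illustration of the `response_format` parameter documented in the generated reference above, here is a minimal sketch of requesting schema-constrained output. It assumes a LoRAX instance running locally at `http://127.0.0.1:8080` and a placeholder adapter ID `some/adapter`; the schema dict simply mirrors the example shape shown in the docstrings.

```python
from lorax import Client

# Assumes a LoRAX instance is running locally; adjust the URL for your deployment.
client = Client("http://127.0.0.1:8080")

# Constrain generation to a JSON object; this schema mirrors the example
# shown in the generate() docstring and is only illustrative.
response_format = {
    "type": "json_object",
    "schema": {
        "type": "string",
        "title": "response",
    },
}

response = client.generate(
    "Why is the sky blue?",
    adapter_id="some/adapter",       # placeholder adapter ID
    response_format=response_format,
    max_new_tokens=64,
)
print(response.generated_text)
```

Per the documented signatures, the same `response_format` keyword is accepted by `generate_stream` and by the `AsyncClient` variants.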