From 7f1bfe5b3121eea5db4c04223360edf644fe8efb Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 3 Mar 2024 22:08:19 -0800 Subject: [PATCH] Autogen python client docs (#295) --- docs/reference/python_client.md | 235 ------------ docs/reference/python_client/client.md | 478 +++++++++++++++++++++++++ docs/reference/python_client/index.md | 89 +++++ mkdocs.yml | 5 +- 4 files changed, 571 insertions(+), 236 deletions(-) delete mode 100644 docs/reference/python_client.md create mode 100644 docs/reference/python_client/client.md create mode 100644 docs/reference/python_client/index.md diff --git a/docs/reference/python_client.md b/docs/reference/python_client.md deleted file mode 100644 index 9e238d5e9..000000000 --- a/docs/reference/python_client.md +++ /dev/null @@ -1,235 +0,0 @@ -# Python Client - -LoRAX Python client provides a convenient way of interfacing with a -`lorax` instance running in your environment. - -## Install - -```shell -pip install lorax-client -``` - -## Usage - -```python -from lorax import Client - -endpoint_url = "http://127.0.0.1:8080" - -client = Client(endpoint_url) -text = client.generate("Why is the sky blue?", adapter_id="some/adapter").generated_text -print(text) -# ' Rayleigh scattering' - -# Token Streaming -text = "" -for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"): - if not response.token.special: - text += response.token.text - -print(text) -# ' Rayleigh scattering' -``` - -or with the asynchronous client: - -```python -from lorax import AsyncClient - -endpoint_url = "http://127.0.0.1:8080" - -client = AsyncClient(endpoint_url) -response = await client.generate("Why is the sky blue?", adapter_id="some/adapter") -print(response.generated_text) -# ' Rayleigh scattering' - -# Token Streaming -text = "" -async for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"): - if not response.token.special: - text += response.token.text - -print(text) -# ' Rayleigh scattering' -``` - -### Predibase Inference Endpoints - -The LoRAX client can also be used to connect to [Predibase](https://predibase.com/) managed LoRAX endpoints (including Predibase's [serverless endpoints](https://docs.predibase.com/user-guide/inference/serverless_deployments)). - -You need only make the following changes to the above examples: - -1. Change the `endpoint_url` to match the endpoint of your Predibase LLM of choice. -2. Provide your Predibase API token in the `headers` provided to the client. - -Example: - -```python -from lorax import Client - -# You can get your Predibase API token by going to Settings > My Profile > Generate API Token -# You can get your Predibase Tenant short code by going to Settings > My Profile > Overview > Tenant ID -endpoint_url = f"https://serving.app.predibase.com/{predibase_tenant_short_code}/deployments/v2/llms/{llm_deployment_name}" -headers = { - "Authorization": f"Bearer {api_token}" -} - -client = Client(endpoint_url, headers=headers) - -# same as above from here ... -response = client.generate("Why is the sky blue?", adapter_id=f"{model_repo}/{model_version}") -``` - -Note that by default Predibase will use its internal model repos as the default `adapter_source`. 
To use an adapter from Huggingface: - -```python -response = client.generate("Why is the sky blue?", adapter_id="some/adapter", adapter_source="hub") -``` - -## Types - -```python -# Request Parameters -class Parameters: - # The ID of the adapter to use - adapter_id: Optional[str] - # The source of the adapter to use - adapter_source: Optional[str] - # API token for accessing private adapters - api_token: Optional[str] - # Activate logits sampling - do_sample: bool - # Maximum number of generated tokens - max_new_tokens: int - # The parameter for repetition penalty. 1.0 means no penalty. - # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. - repetition_penalty: Optional[float] - # Whether to prepend the prompt to the generated text - return_full_text: bool - # Stop generating tokens if a member of `stop_sequences` is generated - stop: List[str] - # Random sampling seed - seed: Optional[int] - # The value used to module the logits distribution. - temperature: Optional[float] - # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_k: Optional[int] - # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or - # higher are kept for generation. - top_p: Optional[float] - # truncate inputs tokens to the given size - truncate: Optional[int] - # Typical Decoding mass - # See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information - typical_p: Optional[float] - # Generate best_of sequences and return the one if the highest token logprobs - best_of: Optional[int] - # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) - watermark: bool - # Get decoder input token logprobs and ids - decoder_input_details: bool - -# Decoder input tokens -class InputToken: - # Token ID from the model tokenizer - id: int - # Token text - text: str - # Logprob - # Optional since the logprob of the first token cannot be computed - logprob: Optional[float] - - -# Generated tokens -class Token: - # Token ID from the model tokenizer - id: int - # Token text - text: str - # Logprob - logprob: float - # Is the token a special token - # Can be used to ignore tokens when concatenating - special: bool - - -# Generation finish reason -class FinishReason(Enum): - # number of generated tokens == `max_new_tokens` - Length = "length" - # the model generated its end of sequence token - EndOfSequenceToken = "eos_token" - # the model generated a text included in `stop_sequences` - StopSequence = "stop_sequence" - - -# Additional sequences when using the `best_of` parameter -class BestOfSequence: - # Generated text - generated_text: str - # Generation finish reason - finish_reason: FinishReason - # Number of generated tokens - generated_tokens: int - # Sampling seed if sampling was activated - seed: Optional[int] - # Decoder input tokens, empty if decoder_input_details is False - prefill: List[InputToken] - # Generated tokens - tokens: List[Token] - - -# `generate` details -class Details: - # Generation finish reason - finish_reason: FinishReason - # Number of prompt tokens - prompt_tokens: int - # Number of generated tokens - generated_tokens: int - # Sampling seed if sampling was activated - seed: Optional[int] - # Decoder input tokens, empty if decoder_input_details is False - prefill: List[InputToken] - # Generated tokens - tokens: List[Token] - # Additional sequences when using the `best_of` parameter - best_of_sequences: 
Optional[List[BestOfSequence]] - - -# `generate` return value -class Response: - # Generated text - generated_text: str - # Generation details - details: Details - - -# `generate_stream` details -class StreamDetails: - # Generation finish reason - finish_reason: FinishReason - # Number of prompt tokens - prompt_tokens: int - # Number of generated tokens - generated_tokens: int - # Sampling seed if sampling was activated - seed: Optional[int] - - -# `generate_stream` return value -class StreamResponse: - # Generated token - token: Token - # Complete generated text - # Only available when the generation is finished - generated_text: Optional[str] - # Generation details - # Only available when the generation is finished - details: Optional[StreamDetails] - -# Inference API currently deployed model -class DeployedModel: - model_id: str - sha: str -``` diff --git a/docs/reference/python_client/client.md b/docs/reference/python_client/client.md new file mode 100644 index 000000000..2ac52e2b5 --- /dev/null +++ b/docs/reference/python_client/client.md @@ -0,0 +1,478 @@ +# Table of Contents + +* [lorax.client](#lorax.client) + * [Client](#lorax.client.Client) + * [\_\_init\_\_](#lorax.client.Client.__init__) + * [generate](#lorax.client.Client.generate) + * [generate\_stream](#lorax.client.Client.generate_stream) + * [AsyncClient](#lorax.client.AsyncClient) + * [\_\_init\_\_](#lorax.client.AsyncClient.__init__) + * [generate](#lorax.client.AsyncClient.generate) + * [generate\_stream](#lorax.client.AsyncClient.generate_stream) + + + +# lorax.client + + + +## Client Objects + +```python +class Client() +``` + +Client to make calls to a LoRAX instance + +**Example**: + + +```python +from lorax import Client + +client = Client("http://127.0.0.1:8080") +client.generate("Why is the sky blue?", adapter_id="some/adapter").generated_text + ' Rayleigh scattering' + +result = "" +for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"): + if not response.token.special: + result += response.token.text +result +' Rayleigh scattering' +``` + + + +#### \_\_init\_\_ + +```python +def __init__(base_url: str, + headers: Optional[Dict[str, str]] = None, + cookies: Optional[Dict[str, str]] = None, + timeout: int = 60) +``` + +**Arguments**: + + - base_url (`str`): + LoRAX instance base url + - headers (`Optional[Dict[str, str]]`): + Additional headers + - cookies (`Optional[Dict[str, str]]`): + Cookies to include in the requests + - timeout (`int`): + Timeout in seconds + + + +#### generate + +```python +def generate(prompt: str, + adapter_id: Optional[str] = None, + adapter_source: Optional[str] = None, + merged_adapters: Optional[MergedAdapters] = None, + api_token: Optional[str] = None, + do_sample: bool = False, + max_new_tokens: int = 20, + best_of: Optional[int] = None, + repetition_penalty: Optional[float] = None, + return_full_text: bool = False, + seed: Optional[int] = None, + stop_sequences: Optional[List[str]] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, + watermark: bool = False, + response_format: Optional[Union[Dict[str, Any], + ResponseFormat]] = None, + decoder_input_details: bool = False, + details: bool = True) -> Response +``` + +Given a prompt, generate the following text + +**Arguments**: + + - prompt (`str`): + Input text + - adapter_id (`Optional[str]`): + Adapter ID to apply to the base model for the request + - adapter_source 
(`Optional[str]`):
+  Source of the adapter (hub, local, s3)
+- merged_adapters (`Optional[MergedAdapters]`):
+  Merged adapters to apply to the base model for the request
+- api_token (`Optional[str]`):
+  API token for accessing private adapters
+- do_sample (`bool`):
+  Activate logits sampling
+- max_new_tokens (`int`):
+  Maximum number of generated tokens
+- best_of (`int`):
+  Generate best_of sequences and return the one with the highest token logprobs
+- repetition_penalty (`float`):
+  The parameter for repetition penalty. 1.0 means no penalty. See [this
+  paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+- return_full_text (`bool`):
+  Whether to prepend the prompt to the generated text
+- seed (`int`):
+  Random sampling seed
+- stop_sequences (`List[str]`):
+  Stop generating tokens if a member of `stop_sequences` is generated
+- temperature (`float`):
+  The value used to modulate the logits distribution.
+- top_k (`int`):
+  The number of highest probability vocabulary tokens to keep for top-k filtering.
+- top_p (`float`):
+  If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+  higher are kept for generation.
+- truncate (`int`):
+  Truncate input tokens to the given size
+- typical_p (`float`):
+  Typical Decoding mass
+  See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
+- watermark (`bool`):
+  Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+- response_format (`Optional[Union[Dict[str, Any], ResponseFormat]]`):
+  Optional specification of a format to impose upon the generated text, e.g.,:
+  ```
+  {
+      "type": "json_object",
+      "schema": {
+          "type": "string",
+          "title": "response"
+      }
+  }
+  ```
+- decoder_input_details (`bool`):
+  Return the decoder input token logprobs and ids
+- details (`bool`):
+  Return the token logprobs and ids for generated tokens
+
+**Returns**:
+
+- `Response` - generated response
+
+
+#### generate\_stream
+
+```python
+def generate_stream(prompt: str,
+                    adapter_id: Optional[str] = None,
+                    adapter_source: Optional[str] = None,
+                    merged_adapters: Optional[MergedAdapters] = None,
+                    api_token: Optional[str] = None,
+                    do_sample: bool = False,
+                    max_new_tokens: int = 20,
+                    repetition_penalty: Optional[float] = None,
+                    return_full_text: bool = False,
+                    seed: Optional[int] = None,
+                    stop_sequences: Optional[List[str]] = None,
+                    temperature: Optional[float] = None,
+                    top_k: Optional[int] = None,
+                    top_p: Optional[float] = None,
+                    truncate: Optional[int] = None,
+                    typical_p: Optional[float] = None,
+                    watermark: bool = False,
+                    response_format: Optional[Union[Dict[str, Any],
+                                                    ResponseFormat]] = None,
+                    details: bool = True) -> Iterator[StreamResponse]
+```
+
+Given a prompt, generate the following stream of tokens
+
+**Arguments**:
+
+- prompt (`str`):
+  Input text
+- adapter_id (`Optional[str]`):
+  Adapter ID to apply to the base model for the request
+- adapter_source (`Optional[str]`):
+  Source of the adapter (hub, local, s3)
+- merged_adapters (`Optional[MergedAdapters]`):
+  Merged adapters to apply to the base model for the request
+- api_token (`Optional[str]`):
+  API token for accessing private adapters
+- do_sample (`bool`):
+  Activate logits sampling
+- max_new_tokens (`int`):
+  Maximum number of generated tokens
+- repetition_penalty (`float`):
+  The parameter for repetition penalty. 1.0 means no penalty. See [this
+  paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+- return_full_text (`bool`):
+  Whether to prepend the prompt to the generated text
+- seed (`int`):
+  Random sampling seed
+- stop_sequences (`List[str]`):
+  Stop generating tokens if a member of `stop_sequences` is generated
+- temperature (`float`):
+  The value used to modulate the logits distribution.
+- top_k (`int`):
+  The number of highest probability vocabulary tokens to keep for top-k filtering.
+- top_p (`float`):
+  If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+  higher are kept for generation.
+- truncate (`int`):
+  Truncate input tokens to the given size
+- typical_p (`float`):
+  Typical Decoding mass
+  See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
+- watermark (`bool`):
+  Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+- response_format (`Optional[Union[Dict[str, Any], ResponseFormat]]`):
+  Optional specification of a format to impose upon the generated text, e.g.,:
+  ```
+  {
+      "type": "json_object",
+      "schema": {
+          "type": "string",
+          "title": "response"
+      }
+  }
+  ```
+- details (`bool`):
+  Return the token logprobs and ids for generated tokens
+
+**Returns**:
+
+- `Iterator[StreamResponse]` - stream of generated tokens
+
+
+## AsyncClient Objects
+
+```python
+class AsyncClient()
+```
+
+Asynchronous Client to make calls to a LoRAX instance
+
+**Example**:
+
+```python
+from lorax import AsyncClient
+
+client = AsyncClient("http://127.0.0.1:8080")
+response = await client.generate("Why is the sky blue?", adapter_id="some/adapter")
+response.generated_text
+' Rayleigh scattering'
+
+result = ""
+async for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"):
+    if not response.token.special:
+        result += response.token.text
+result
+' Rayleigh scattering'
+```
+
+
+#### \_\_init\_\_
+
+```python
+def __init__(base_url: str,
+             headers: Optional[Dict[str, str]] = None,
+             cookies: Optional[Dict[str, str]] = None,
+             timeout: int = 60)
+```
+
+**Arguments**:
+
+- base_url (`str`):
+  LoRAX instance base url
+- headers (`Optional[Dict[str, str]]`):
+  Additional headers
+- cookies (`Optional[Dict[str, str]]`):
+  Cookies to include in the requests
+- timeout (`int`):
+  Timeout in seconds
+
+
+#### generate
+
+```python
+async def generate(prompt: str,
+                   adapter_id: Optional[str] = None,
+                   adapter_source: Optional[str] = None,
+                   merged_adapters: Optional[MergedAdapters] = None,
+                   api_token: Optional[str] = None,
+                   do_sample: bool = False,
+                   max_new_tokens: int = 20,
+                   best_of: Optional[int] = None,
+                   repetition_penalty: Optional[float] = None,
+                   return_full_text: bool = False,
+                   seed: Optional[int] = None,
+                   stop_sequences: Optional[List[str]] = None,
+                   temperature: Optional[float] = None,
+                   top_k: Optional[int] = None,
+                   top_p: Optional[float] = None,
+                   truncate: Optional[int] = None,
+                   typical_p: Optional[float] = None,
+                   watermark: bool = False,
+                   response_format: Optional[Union[Dict[str, Any],
+                                                   ResponseFormat]] = None,
+                   decoder_input_details: bool = False,
+                   details: bool = True) -> Response
+```
+
+Given a prompt, generate the following text asynchronously
+
+**Arguments**:
+
+- prompt (`str`):
+  Input text
+- adapter_id (`Optional[str]`):
+  Adapter ID to apply to the base model for the request
+- adapter_source (`Optional[str]`):
+  Source of the adapter (hub, local, s3)
+- merged_adapters (`Optional[MergedAdapters]`):
+  Merged adapters to apply to the base model for the request
+- api_token (`Optional[str]`):
+  API token for accessing private adapters
+- do_sample (`bool`):
+  Activate logits sampling
+- max_new_tokens (`int`):
+  Maximum number of generated tokens
+- best_of (`int`):
+  Generate best_of sequences and return the one with the highest token logprobs
+- repetition_penalty (`float`):
+  The parameter for repetition penalty. 1.0 means no penalty. See [this
+  paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+- return_full_text (`bool`):
+  Whether to prepend the prompt to the generated text
+- seed (`int`):
+  Random sampling seed
+- stop_sequences (`List[str]`):
+  Stop generating tokens if a member of `stop_sequences` is generated
+- temperature (`float`):
+  The value used to modulate the logits distribution.
+- top_k (`int`):
+  The number of highest probability vocabulary tokens to keep for top-k filtering.
+- top_p (`float`):
+  If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+  higher are kept for generation.
+- truncate (`int`):
+  Truncate input tokens to the given size
+- typical_p (`float`):
+  Typical Decoding mass
+  See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
+- watermark (`bool`):
+  Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+- response_format (`Optional[Union[Dict[str, Any], ResponseFormat]]`):
+  Optional specification of a format to impose upon the generated text, e.g.,:
+  ```
+  {
+      "type": "json_object",
+      "schema": {
+          "type": "string",
+          "title": "response"
+      }
+  }
+  ```
+- decoder_input_details (`bool`):
+  Return the decoder input token logprobs and ids
+- details (`bool`):
+  Return the token logprobs and ids for generated tokens
+
+**Returns**:
+
+- `Response` - generated response
+
+
+#### generate\_stream
+
+```python
+async def generate_stream(
+        prompt: str,
+        adapter_id: Optional[str] = None,
+        adapter_source: Optional[str] = None,
+        merged_adapters: Optional[MergedAdapters] = None,
+        api_token: Optional[str] = None,
+        do_sample: bool = False,
+        max_new_tokens: int = 20,
+        repetition_penalty: Optional[float] = None,
+        return_full_text: bool = False,
+        seed: Optional[int] = None,
+        stop_sequences: Optional[List[str]] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        truncate: Optional[int] = None,
+        typical_p: Optional[float] = None,
+        watermark: bool = False,
+        response_format: Optional[Union[Dict[str, Any],
+                                        ResponseFormat]] = None,
+        details: bool = True) -> AsyncIterator[StreamResponse]
+```
+
+Given a prompt, generate the following stream of tokens asynchronously
+
+**Arguments**:
+
+- prompt (`str`):
+  Input text
+- adapter_id (`Optional[str]`):
+  Adapter ID to apply to the base model for the request
+- adapter_source (`Optional[str]`):
+  Source of the adapter (hub, local, s3)
+- merged_adapters (`Optional[MergedAdapters]`):
+  Merged adapters to apply to the base model for the request
+- api_token (`Optional[str]`):
+  API token for accessing private adapters
+- do_sample (`bool`):
+  Activate logits sampling
+- max_new_tokens (`int`):
+  Maximum number of generated tokens
+- repetition_penalty (`float`):
+  The parameter for repetition penalty. 1.0 means no penalty. See [this
+  paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+- return_full_text (`bool`):
+  Whether to prepend the prompt to the generated text
+- seed (`int`):
+  Random sampling seed
+- stop_sequences (`List[str]`):
+  Stop generating tokens if a member of `stop_sequences` is generated
+- temperature (`float`):
+  The value used to modulate the logits distribution.
+- top_k (`int`):
+  The number of highest probability vocabulary tokens to keep for top-k filtering.
+- top_p (`float`):
+  If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+  higher are kept for generation.
+- truncate (`int`):
+  Truncate input tokens to the given size
+- typical_p (`float`):
+  Typical Decoding mass
+  See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
+- watermark (`bool`):
+  Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+- response_format (`Optional[Union[Dict[str, Any], ResponseFormat]]`):
+  Optional specification of a format to impose upon the generated text, e.g.,:
+  ```
+  {
+      "type": "json_object",
+      "schema": {
+          "type": "string",
+          "title": "response"
+      }
+  }
+  ```
+- details (`bool`):
+  Return the token logprobs and ids for generated tokens
+
+**Returns**:
+
+- `AsyncIterator[StreamResponse]` - stream of generated tokens
+
diff --git a/docs/reference/python_client/index.md b/docs/reference/python_client/index.md
new file mode 100644
index 000000000..f9cfe4af0
--- /dev/null
+++ b/docs/reference/python_client/index.md
@@ -0,0 +1,89 @@
+# Python Client
+
+The LoRAX Python client provides a convenient way of interfacing with a
+`lorax` instance running in your environment.
+
+## Install
+
+```shell
+pip install lorax-client
+```
+
+## Usage
+
+```python
+from lorax import Client
+
+endpoint_url = "http://127.0.0.1:8080"
+
+client = Client(endpoint_url)
+text = client.generate("Why is the sky blue?", adapter_id="some/adapter").generated_text
+print(text)
+# ' Rayleigh scattering'
+
+# Token Streaming
+text = ""
+for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"):
+    if not response.token.special:
+        text += response.token.text
+
+print(text)
+# ' Rayleigh scattering'
+```
+
+or with the asynchronous client:
+
+```python
+from lorax import AsyncClient
+
+endpoint_url = "http://127.0.0.1:8080"
+
+client = AsyncClient(endpoint_url)
+response = await client.generate("Why is the sky blue?", adapter_id="some/adapter")
+print(response.generated_text)
+# ' Rayleigh scattering'
+
+# Token Streaming
+text = ""
+async for response in client.generate_stream("Why is the sky blue?", adapter_id="some/adapter"):
+    if not response.token.special:
+        text += response.token.text
+
+print(text)
+# ' Rayleigh scattering'
+```
+
+See the [API reference](./client.md) for full details.
+
+### Predibase Inference Endpoints
+
+The LoRAX client can also be used to connect to [Predibase](https://predibase.com/) managed LoRAX endpoints (including Predibase's [serverless endpoints](https://docs.predibase.com/user-guide/inference/serverless_deployments)).
+
+You need only make the following changes to the above examples:
+
+1. Change the `endpoint_url` to match the endpoint of your Predibase LLM of choice.
+2. Provide your Predibase API token in the `headers` provided to the client.
+
+Example:
+
+```python
+from lorax import Client
+
+# You can get your Predibase API token by going to Settings > My Profile > Generate API Token
+# You can get your Predibase Tenant short code by going to Settings > My Profile > Overview > Tenant ID
+endpoint_url = f"https://serving.app.predibase.com/{predibase_tenant_short_code}/deployments/v2/llms/{llm_deployment_name}"
+headers = {
+    "Authorization": f"Bearer {api_token}"
+}
+
+client = Client(endpoint_url, headers=headers)
+
+# same as above from here ...
+response = client.generate("Why is the sky blue?", adapter_id=f"{model_repo}/{model_version}")
+```
+
+Note that by default Predibase will use its internal model repos as the default `adapter_source`. To use an adapter from the Hugging Face Hub:
+
+```python
+response = client.generate("Why is the sky blue?", adapter_id="some/adapter", adapter_source="hub")
+```
diff --git a/mkdocs.yml b/mkdocs.yml
index 3f5213314..65821cf25 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -42,7 +42,10 @@ nav:
   - 📚 Reference:
     - Launcher: reference/launcher.md
    - REST API: reference/rest_api.md
-    - Python Client: reference/python_client.md
+    - Python Client:
+      - Python Client: reference/python_client/index.md
+      - lorax.client: reference/python_client/client.md
+      # - lorax.types: reference/python_client/types.md
     - OpenAI Compatible API: reference/openai_api.md
   - 🔬 Guides:
     - Quantization: guides/quantization.md
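---

As a quick illustration of the `response_format` parameter documented in the generated reference above, here is a minimal sketch of requesting schema-constrained output. It assumes a LoRAX instance running locally at `http://127.0.0.1:8080` and a placeholder adapter ID `some/adapter`; the schema dict simply mirrors the example shape shown in the docstrings.

```python
from lorax import Client

# Assumes a LoRAX instance is running locally; adjust the URL for your deployment.
client = Client("http://127.0.0.1:8080")

# Constrain generation to a JSON object; this schema mirrors the example
# shown in the generate() docstring and is only illustrative.
response_format = {
    "type": "json_object",
    "schema": {
        "type": "string",
        "title": "response",
    },
}

response = client.generate(
    "Why is the sky blue?",
    adapter_id="some/adapter",       # placeholder adapter ID
    response_format=response_format,
    max_new_tokens=64,
)
print(response.generated_text)
```

Per the documented signatures, the same `response_format` keyword is accepted by `generate_stream` and by the `AsyncClient` variants.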