From d5bcc171a42360700ad6135b6b8af7b1cfdeb15f Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Wed, 15 Nov 2023 21:26:32 -0800
Subject: [PATCH] Renamed lorax-inference to lorax (#30)

---
 .github/ISSUE_TEMPLATE/bug-report.yml      | 4 ++--
 .github/ISSUE_TEMPLATE/feature-request.yml | 4 ++--
 .github/workflows/load_test.yaml           | 2 +-
 README.md                                  | 8 ++++----
 benchmark/README.md                        | 2 +-
 clients/python/lorax/client.py             | 8 ++++----
 launcher/src/main.rs                       | 6 +++---
 router/README.md                           | 4 ++--
 router/src/main.rs                         | 2 +-
 server/lorax_server/tracing.py             | 2 +-
 server/lorax_server/utils/flash_attn.py    | 4 ++--
 server/lorax_server/utils/gptq/quantize.py | 2 +-
 12 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index f8aa9d19d..8b5415483 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -1,5 +1,5 @@
 name: "\U0001F41B Bug Report"
-description: Submit a bug report to help us improve lorax-inference
+description: Submit a bug report to help us improve LoRAX
 body:
   - type: textarea
     id: system-info
@@ -16,7 +16,7 @@ body:
       Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
       The current version being used:
 
-    placeholder: lorax-inference version, platform, python version, ...
+    placeholder: lorax version, platform, python version, ...
     validations:
       required: true
 
diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml
index 1614dc0bd..18ebdd61c 100644
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@@ -1,5 +1,5 @@
 name: "\U0001F680 Feature request"
-description: Submit a proposal/request for a new lorax-inference feature
+description: Submit a proposal/request for a new LoRAX feature
 labels: [ "feature" ]
 body:
   - type: textarea
@@ -28,4 +28,4 @@
     attributes:
       label: Your contribution
       description: |
-        Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/huggingface/lorax-inference/blob/main/CONTRIBUTING.md)
+        Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/predibase/lorax/blob/main/CONTRIBUTING.md)
diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml
index 05bdbcbce..3b99cec6d 100644
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@@ -70,7 +70,7 @@ jobs:
 
       - name: Start starcoder
         run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/lorax-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/predibase/lorax:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
          sleep 10
          wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
 
diff --git a/README.md b/README.md
index dd233d466..a88562bd2 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 
 # LoRA Exchange (LoRAX)
 
-  GitHub Repo stars
+  GitHub Repo stars
 
   License
 
@@ -32,7 +32,7 @@ Lorax is a framework that allows users to serve over a hundred fine-tuned models
 - 🚅 **Dynamic Adapter Loading:** allowing each set of fine-tuned LoRA weights to be loaded from storage just-in-time as requests come in at runtime, without blocking concurrent requests.
 - 🏋️‍♀️ **Tiered Weight Caching:** to support fast exchanging of LoRA adapters between requests, and offloading of adapter weights to CPU and disk to avoid out-of-memory errors.
 - 🧁 **Continuous Multi-Adapter Batching:** a fair scheduling policy for optimizing aggregate throughput of the system that extends the popular continuous batching strategy to work across multiple sets of LoRA adapters in parallel.
-- 👬 **Optimized Inference:** [flash-attention](https://github.com/HazyResearch/flash-attention), [paged attention](https://github.com/vllm-project/vllm), quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323), tensor parallelism, token streaming, and [continuous batching](https://github.com/huggingface/lorax-inference/tree/main/router) work together to optimize our inference speeds.
+- 👬 **Optimized Inference:** [flash-attention](https://github.com/HazyResearch/flash-attention), [paged attention](https://github.com/vllm-project/vllm), quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323), tensor parallelism, token streaming, and [continuous batching](https://github.com/predibase/lorax/tree/main/router) work together to optimize our inference speeds.
 - ✅ **Production Readiness** reliably stable, Lorax supports Prometheus metrics and distributed tracing with Open Telemetry
 - 🤯 **Free Commercial Use:** Apache 2.0 License. Enough said 😎.
 
@@ -65,11 +65,11 @@ The easiest way of getting started is using the official Docker container:
 model=mistralai/Mistral-7B-Instruct-v0.1
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/lorax-inference:0.9.4 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/predibase/lorax:latest --model-id $model
 ```
 
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
 
-To see all options to serve your models (in the [code](https://github.com/huggingface/lorax-inference/blob/main/launcher/src/main.rs) or in the cli:
+To see all options to serve your models (in the [code](https://github.com/predibase/lorax/blob/main/launcher/src/main.rs) or in the cli:
 ```
 lorax-launcher --help
 ```
diff --git a/benchmark/README.md b/benchmark/README.md
index 78d0a3cc0..cefc32078 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -17,7 +17,7 @@ make install-benchmark
 
 ## Run
 
-First, start `lorax-inference`:
+First, start `lorax`:
 
 ```shell
 lorax-launcher --model-id bigscience/bloom-560m
diff --git a/clients/python/lorax/client.py b/clients/python/lorax/client.py
index cd352fd89..e3e23d3ec 100644
--- a/clients/python/lorax/client.py
+++ b/clients/python/lorax/client.py
@@ -15,7 +15,7 @@
 
 
 class Client:
-    """Client to make calls to a lorax-inference instance
+    """Client to make calls to a LoRAX instance
 
     Example:
 
@@ -45,7 +45,7 @@ def __init__(
         """
         Args:
             base_url (`str`):
-                lorax-inference instance base url
+                LoRAX instance base url
             headers (`Optional[Dict[str, str]]`):
                 Additional headers
             cookies (`Optional[Dict[str, str]]`):
@@ -272,7 +272,7 @@ def generate_stream(
 
 
 class AsyncClient:
-    """Asynchronous Client to make calls to a lorax-inference instance
+    """Asynchronous Client to make calls to a LoRAX instance
 
     Example:
 
@@ -303,7 +303,7 @@ def __init__(
         """
         Args:
             base_url (`str`):
-                lorax-inference instance base url
+                LoRAX instance base url
             headers (`Optional[Dict[str, str]]`):
                 Additional headers
             cookies (`Optional[Dict[str, str]]`):
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 0bbdeb52b..480e5d730 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -105,7 +105,7 @@ struct Args {
     validation_workers: usize,
 
     /// Whether to shard the model across multiple GPUs
-    /// By default lorax-inference will use all available GPUs to run
+    /// By default LoRAX will use all available GPUs to run
     /// the model. Setting it to `false` deactivates `num_shard`.
     #[clap(long, env)]
     sharded: Option<bool>,
@@ -204,7 +204,7 @@ struct Args {
     /// Overall this number should be the largest possible amount that fits the
     /// remaining memory (after the model is loaded). Since the actual memory overhead
     /// depends on other parameters like if you're using quantization, flash attention
-    /// or the model implementation, lorax-inference cannot infer this number
+    /// or the model implementation, LoRAX cannot infer this number
     /// automatically.
     #[clap(long, env)]
     max_batch_total_tokens: Option<u32>,
@@ -260,7 +260,7 @@ struct Args {
     #[clap(long, env)]
     weights_cache_override: Option<String>,
 
-    /// For some models (like bloom), lorax-inference implemented custom
+    /// For some models (like llama), LoRAX implemented custom
     /// cuda kernels to speed up inference. Those kernels were only tested on A100.
     /// Use this flag to disable them if you're running on different hardware and
     /// encounter issues.
diff --git a/router/README.md b/router/README.md
index 992fe31ea..8c52da255 100644
--- a/router/README.md
+++ b/router/README.md
@@ -10,7 +10,7 @@ much simpler and focus on having the most efficient forward passes as possible.
 
 ## Continuous batching
 
-One important feature of `lorax-inference` is enabled
+One important feature of `lorax` is enabled
 by this `router`.
 
 Continuous batching is the act of regularly running queries in the same
@@ -69,7 +69,7 @@ but a currently running query is probably doing `decode`. If we want to do the c
 batching as explained previously we need to run `prefill` at some point in order to create
 the attention matrix required to be able to join the `decode` group.
 
-`lorax-inference` uses a bunch of different strategies and parameters in
+`lorax` uses a bunch of different strategies and parameters in
 order to enable you to find the sweet spot between exploiting the hardware and perceived latency.
 
 With no continuous batching at all, latency is going to be super good, but throughput (meaning
diff --git a/router/src/main.rs b/router/src/main.rs
index 38afd8b5a..2cebd1e50 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -316,7 +316,7 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
                 trace::config()
                     .with_resource(Resource::new(vec![KeyValue::new(
                         "service.name",
-                        "lorax-inference.router",
+                        "lorax.router",
                     )]))
                     .with_sampler(Sampler::AlwaysOn),
             )
diff --git a/server/lorax_server/tracing.py b/server/lorax_server/tracing.py
index 445b7da5c..0d9dcccea 100644
--- a/server/lorax_server/tracing.py
+++ b/server/lorax_server/tracing.py
@@ -56,7 +56,7 @@ def _start_span(self, handler_call_details, context, set_status_on_exception=Fal
 
 def setup_tracing(shard: int, otlp_endpoint: str):
     resource = Resource.create(
-        attributes={"service.name": f"lorax-inference.server-{shard}"}
+        attributes={"service.name": f"lorax.server-{shard}"}
     )
     span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
     span_processor = BatchSpanProcessor(span_exporter)
diff --git a/server/lorax_server/utils/flash_attn.py b/server/lorax_server/utils/flash_attn.py
index 1e2ddc2bb..d84bf4313 100644
--- a/server/lorax_server/utils/flash_attn.py
+++ b/server/lorax_server/utils/flash_attn.py
@@ -22,7 +22,7 @@
     except ImportError:
         raise ImportError(
             "Flash Attention V2 is not installed.\n"
-            "Use the official Docker image (ghcr.io/huggingface/lorax-inference:latest) "
+            "Use the official Docker image (ghcr.io/predibase/lorax:latest) "
             "or install flash attention v2 with `cd server && make install install-flash-attention-v2`"
         )
     if not (is_sm8x or is_sm90):
@@ -37,7 +37,7 @@
     except ImportError:
         raise ImportError(
             "Flash Attention is not installed.\n"
-            "Use the official Docker image (ghcr.io/huggingface/lorax-inference:latest) "
+            "Use the official Docker image (ghcr.io/predibase/lorax:latest) "
             "or install flash attention with `cd server && make install install-flash-attention`"
         ) from e
 
diff --git a/server/lorax_server/utils/gptq/quantize.py b/server/lorax_server/utils/gptq/quantize.py
index ee21134cd..c300dcfbe 100644
--- a/server/lorax_server/utils/gptq/quantize.py
+++ b/server/lorax_server/utils/gptq/quantize.py
@@ -944,7 +944,7 @@ def _unload():
         metadata={
             "format": "pt",
             "quantized": "gptq",
-            "origin": "lorax-inference",
+            "origin": "lorax",
         },
     )
     if index is None:
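
For reference, a minimal end-to-end sketch of the renamed pieces; it is not part of the patch. It assumes the renamed Python client is published as `lorax-client` and keeps the `Client(base_url)` constructor and a blocking `generate` method alongside the `generate_stream` shown in `clients/python/lorax/client.py` above, with a server started via the README's `docker run ... ghcr.io/predibase/lorax:latest` example (host port 8080 mapped to 80).

```python
# Illustrative sketch only: exercise the renamed image and client together.
# Assumes `pip install lorax-client` and a server started with the README's
# `docker run --gpus all --shm-size 1g -p 8080:80 ... ghcr.io/predibase/lorax:latest` command.
from lorax import Client

client = Client("http://127.0.0.1:8080")  # LoRAX instance base url
response = client.generate("What is LoRAX?", max_new_tokens=64)
print(response.generated_text)
```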