Renamed lorax-inference to lorax (#30)
tgaddair authored Nov 16, 2023
1 parent a9426bb commit d5bcc17
Showing 12 changed files with 24 additions and 24 deletions.
4 changes: 2 additions & 2 deletions .github/ISSUE_TEMPLATE/bug-report.yml
@@ -1,5 +1,5 @@
name: "\U0001F41B Bug Report"
-description: Submit a bug report to help us improve lorax-inference
+description: Submit a bug report to help us improve LoRAX
body:
- type: textarea
id: system-info
@@ -16,7 +16,7 @@ body:
Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
The current version being used:
-placeholder: lorax-inference version, platform, python version, ...
+placeholder: lorax version, platform, python version, ...
validations:
required: true

4 changes: 2 additions & 2 deletions .github/ISSUE_TEMPLATE/feature-request.yml
@@ -1,5 +1,5 @@
name: "\U0001F680 Feature request"
-description: Submit a proposal/request for a new lorax-inference feature
+description: Submit a proposal/request for a new LoRAX feature
labels: [ "feature" ]
body:
- type: textarea
@@ -28,4 +28,4 @@ body:
attributes:
label: Your contribution
description: |
-Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/huggingface/lorax-inference/blob/main/CONTRIBUTING.md)
+Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/predibase/lorax/blob/main/CONTRIBUTING.md)
2 changes: 1 addition & 1 deletion .github/workflows/load_test.yaml
@@ -70,7 +70,7 @@ jobs:
- name: Start starcoder
run: |
-docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/lorax-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/predibase/lorax:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
sleep 10
wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
8 changes: 4 additions & 4 deletions README.md
@@ -5,7 +5,7 @@
# LoRA Exchange (LoRAX)

<a href="https://github.com/predibase/lorax">
-<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/huggingface/lorax-inference?style=social">
+<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/predibase/lorax?style=social">
</a>
<a href="https://github.com/predibase/lorax/blob/main/LICENSE">
<img alt="License" src="https://img.shields.io/github/license/predibase/lorax">
@@ -32,7 +32,7 @@ Lorax is a framework that allows users to serve over a hundred fine-tuned models
- 🚅 **Dynamic Adapter Loading:** allowing each set of fine-tuned LoRA weights to be loaded from storage just-in-time as requests come in at runtime, without blocking concurrent requests.
- 🏋️‍♀️ **Tiered Weight Caching:** to support fast exchanging of LoRA adapters between requests, and offloading of adapter weights to CPU and disk to avoid out-of-memory errors.
- 🧁 **Continuous Multi-Adapter Batching:** a fair scheduling policy for optimizing aggregate throughput of the system that extends the popular continuous batching strategy to work across multiple sets of LoRA adapters in parallel.
-- 👬 **Optimized Inference:** [flash-attention](https://github.com/HazyResearch/flash-attention), [paged attention](https://github.com/vllm-project/vllm), quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323), tensor parallelism, token streaming, and [continuous batching](https://github.com/huggingface/lorax-inference/tree/main/router) work together to optimize our inference speeds.
+- 👬 **Optimized Inference:** [flash-attention](https://github.com/HazyResearch/flash-attention), [paged attention](https://github.com/vllm-project/vllm), quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323), tensor parallelism, token streaming, and [continuous batching](https://github.com/predibase/lorax/tree/main/router) work together to optimize our inference speeds.
- **Production Readiness:** reliably stable, Lorax supports Prometheus metrics and distributed tracing with Open Telemetry
- 🤯 **Free Commercial Use:** Apache 2.0 License. Enough said 😎.

@@ -65,11 +65,11 @@ The easiest way of getting started is using the official Docker container:
model=mistralai/Mistral-7B-Instruct-v0.1
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/lorax-inference:0.9.4 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/predibase/lorax:latest --model-id $model
```
**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.

-To see all options to serve your models (in the [code](https://github.com/huggingface/lorax-inference/blob/main/launcher/src/main.rs) or in the cli:
+To see all options to serve your models (in the [code](https://github.com/predibase/lorax/blob/main/launcher/src/main.rs) or in the cli:
```
lorax-launcher --help
```
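
Once the container from the quickstart above is up, a generation request can be sent to its HTTP API. The sketch below is a hedged example, not taken from this commit: the `/generate` route, the `inputs`/`parameters` schema, the `generated_text` field, and the `adapter_id` parameter are assumptions based on the TGI-style API that LoRAX derives from, so check the LoRAX API reference for the exact names.

```python
# Minimal sketch of prompting a locally running LoRAX container started with
# the docker run command above. Route and field names are assumptions based
# on the TGI-style API; verify against the current LoRAX API reference.
import requests

payload = {
    "inputs": "[INST] What is LoRA fine-tuning? [/INST]",
    "parameters": {
        "max_new_tokens": 64,
        # Hypothetical adapter id; dynamic adapter loading would fetch it
        # just-in-time for this request.
        "adapter_id": "some-org/some-lora-adapter",
    },
}
resp = requests.post("http://127.0.0.1:8080/generate", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["generated_text"])
```
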
2 changes: 1 addition & 1 deletion benchmark/README.md
@@ -17,7 +17,7 @@ make install-benchmark

## Run

-First, start `lorax-inference`:
+First, start `lorax`:

```shell
lorax-launcher --model-id bigscience/bloom-560m
8 changes: 4 additions & 4 deletions clients/python/lorax/client.py
@@ -15,7 +15,7 @@


class Client:
-"""Client to make calls to a lorax-inference instance
+"""Client to make calls to a LoRAX instance
Example:
@@ -45,7 +45,7 @@ def __init__(
"""
Args:
base_url (`str`):
-lorax-inference instance base url
+LoRAX instance base url
headers (`Optional[Dict[str, str]]`):
Additional headers
cookies (`Optional[Dict[str, str]]`):
@@ -272,7 +272,7 @@ def generate_stream(


class AsyncClient:
-"""Asynchronous Client to make calls to a lorax-inference instance
+"""Asynchronous Client to make calls to a LoRAX instance
Example:
@@ -303,7 +303,7 @@ def __init__(
"""
Args:
base_url (`str`):
-lorax-inference instance base url
+LoRAX instance base url
headers (`Optional[Dict[str, str]]`):
Additional headers
cookies (`Optional[Dict[str, str]]`):
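
The `Client` and `AsyncClient` docstrings updated above belong to the entry points of the Python package. As a rough usage sketch, only `Client`, `AsyncClient`, `base_url`, and `generate_stream` appear in this diff; `generate`, `generated_text`, `token.text`, `max_new_tokens`, and the import path are assumptions modeled on TGI-style clients.

```python
# Hedged sketch of calling a LoRAX instance through the Python client in
# clients/python/lorax/client.py. Names not visible in the diff above are
# assumptions, not confirmed by this commit.
from lorax import Client  # import path assumed from the package layout

client = Client(base_url="http://127.0.0.1:8080")

# One-shot generation (assumed method and response field names).
response = client.generate("What is deep learning?", max_new_tokens=32)
print(response.generated_text)

# Token streaming via the generate_stream method referenced in the hunks above.
for chunk in client.generate_stream("What is deep learning?", max_new_tokens=32):
    print(chunk.token.text, end="", flush=True)
```
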
6 changes: 3 additions & 3 deletions launcher/src/main.rs
@@ -105,7 +105,7 @@ struct Args {
validation_workers: usize,

/// Whether to shard the model across multiple GPUs
-/// By default lorax-inference will use all available GPUs to run
+/// By default LoRAX will use all available GPUs to run
/// the model. Setting it to `false` deactivates `num_shard`.
#[clap(long, env)]
sharded: Option<bool>,
@@ -204,7 +204,7 @@ struct Args {
/// Overall this number should be the largest possible amount that fits the
/// remaining memory (after the model is loaded). Since the actual memory overhead
/// depends on other parameters like if you're using quantization, flash attention
-/// or the model implementation, lorax-inference cannot infer this number
+/// or the model implementation, LoRAX cannot infer this number
/// automatically.
#[clap(long, env)]
max_batch_total_tokens: Option<u32>,
@@ -260,7 +260,7 @@ struct Args {
#[clap(long, env)]
weights_cache_override: Option<String>,

-/// For some models (like bloom), lorax-inference implemented custom
+/// For some models (like llama), LoRAX implemented custom
/// cuda kernels to speed up inference. Those kernels were only tested on A100.
/// Use this flag to disable them if you're running on different hardware and
/// encounter issues.
4 changes: 2 additions & 2 deletions router/README.md
@@ -10,7 +10,7 @@ much simpler and focus on having the most efficient forward passes as possible.

## Continuous batching

-One important feature of `lorax-inference` is enabled
+One important feature of `lorax` is enabled
by this `router`.

Continuous batching is the act of regularly running queries in the same
@@ -69,7 +69,7 @@ but a currently running query is probably doing `decode`. If we want to do the c
batching as explained previously we need to run `prefill` at some point in order to create
the attention matrix required to be able to join the `decode` group.
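
To make that prefill/decode interplay concrete, here is a deliberately toy Python sketch of the idea (illustrative only, not the router's actual code): requests that arrive while others are mid-generation get a prefill pass and then join the shared decode batch, instead of waiting for the current batch to drain.

```python
# Toy illustration of continuous batching; not the LoRAX router's code.
# New requests are prefilled and merged into the running decode batch, so
# every forward step serves all active requests.

def prefill(req_id):
    # Stand-in for building the attention (KV) state for a new request.
    return {"id": req_id, "tokens_left": 3}

def decode_step(batch):
    # Stand-in for one forward pass emitting one token per active request.
    for state in batch:
        state["tokens_left"] -= 1

arrivals = {0: ["a", "b"], 2: ["c"]}  # step -> request ids arriving then
batch, step = [], 0
while batch or arrivals:
    # Newly arrived requests are prefilled and join the in-flight batch.
    batch += [prefill(r) for r in arrivals.pop(step, [])]
    decode_step(batch)
    done = [s["id"] for s in batch if s["tokens_left"] == 0]
    batch = [s for s in batch if s["tokens_left"] > 0]
    if done:
        print(f"step {step}: finished {done}")
    step += 1
```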

-`lorax-inference` uses a bunch of different strategies and parameters in
+`lorax` uses a bunch of different strategies and parameters in
order to enable you to find the sweet spot between exploiting the hardware and perceived latency.

With no continuous batching at all, latency is going to be super good, but throughput (meaning
2 changes: 1 addition & 1 deletion router/src/main.rs
@@ -316,7 +316,7 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
trace::config()
.with_resource(Resource::new(vec![KeyValue::new(
"service.name",
-"lorax-inference.router",
+"lorax.router",
)]))
.with_sampler(Sampler::AlwaysOn),
)
2 changes: 1 addition & 1 deletion server/lorax_server/tracing.py
@@ -56,7 +56,7 @@ def _start_span(self, handler_call_details, context, set_status_on_exception=Fal

def setup_tracing(shard: int, otlp_endpoint: str):
resource = Resource.create(
-attributes={"service.name": f"lorax-inference.server-{shard}"}
+attributes={"service.name": f"lorax.server-{shard}"}
)
span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
span_processor = BatchSpanProcessor(span_exporter)
4 changes: 2 additions & 2 deletions server/lorax_server/utils/flash_attn.py
@@ -22,7 +22,7 @@
except ImportError:
raise ImportError(
"Flash Attention V2 is not installed.\n"
-"Use the official Docker image (ghcr.io/huggingface/lorax-inference:latest) "
+"Use the official Docker image (ghcr.io/predibase/lorax:latest) "
"or install flash attention v2 with `cd server && make install install-flash-attention-v2`"
)
if not (is_sm8x or is_sm90):
@@ -37,7 +37,7 @@
except ImportError:
raise ImportError(
"Flash Attention is not installed.\n"
-"Use the official Docker image (ghcr.io/huggingface/lorax-inference:latest) "
+"Use the official Docker image (ghcr.io/predibase/lorax:latest) "
"or install flash attention with `cd server && make install install-flash-attention`"
) from e

2 changes: 1 addition & 1 deletion server/lorax_server/utils/gptq/quantize.py
@@ -944,7 +944,7 @@ def _unload():
metadata={
"format": "pt",
"quantized": "gptq",
-"origin": "lorax-inference",
+"origin": "lorax",
},
)
if index is None:
