diff --git a/README.md b/README.md
index e90a643..8e0d6db 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ The core features include:
 * LLaMa, LLaMa2, GLM, Bloom, OPT, GPT2, GPT Neo, GPT Big Code and so on.
 
 ## Tested Models
-* LLaMa, LLaMa2-chat, Vicuna, ChatGLM, ChatGLM2, Falcon, Starcoder, WizardLM, InternLM, OpenBuddy, FireFly, CodeGen, Phoenix, RWKV, StableLM, NingYu and so on.
+* NingYu, LLaMa, LLaMa2-chat, Vicuna, ChatGLM, ChatGLM2, Falcon, Starcoder, WizardLM, InternLM, OpenBuddy, FireFly, CodeGen, Phoenix, RWKV, StableLM and so on.
 
 ## Benchmark
 
diff --git a/langport/model/executor/base.py b/langport/model/executor/base.py
index d2861ba..0620099 100644
--- a/langport/model/executor/base.py
+++ b/langport/model/executor/base.py
@@ -26,7 +26,7 @@ def __init__(
         device: str,
         num_gpus: int,
         max_gpu_memory: Optional[str],
-        quantization: Optional[str] = False,
+        quantization: Optional[str] = None,
         cpu_offloading: bool = False,
     ) -> None:
         super(LocalModelExecutor, self).__init__(
diff --git a/langport/model/executor/ggml.py b/langport/model/executor/ggml.py
index 0ea1f10..bbae012 100644
--- a/langport/model/executor/ggml.py
+++ b/langport/model/executor/ggml.py
@@ -30,7 +30,7 @@ def __init__(
         model_type: str = 'llama',
         chunk_size: int = 1024,
         threads: int = -1,
-        load_8bit: bool = False,
+        quantization: Optional[str] = None,
         cpu_offloading: bool = False,
     ) -> None:
         super(GgmlExecutor, self).__init__(
@@ -39,7 +39,7 @@ def __init__(
             device = device,
             num_gpus = num_gpus,
             max_gpu_memory = max_gpu_memory,
-            load_8bit = load_8bit,
+            quantization = quantization,
             cpu_offloading = cpu_offloading,
         )
         self.gpu_layers = gpu_layers
diff --git a/langport/service/server/ggml_generation_worker.py b/langport/service/server/ggml_generation_worker.py
index ad6724d..6ba6720 100644
--- a/langport/service/server/ggml_generation_worker.py
+++ b/langport/service/server/ggml_generation_worker.py
@@ -2,6 +2,7 @@
 import os
 import random
 import uuid
+import warnings
 import uvicorn
 
 from langport.workers.generation_worker import GenerationModelWorker
@@ -41,6 +42,9 @@
             f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!"
         )
     os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
+
+    if args.load_8bit or args.load_4bit:
+        warnings.warn("The ggml backend does not yet support quantization parameters.")
 
     if args.port is None:
         args.port = random.randint(21001, 29001)
diff --git a/langport/service/server/optimum_generation_worker.py b/langport/service/server/optimum_generation_worker.py
index 545a316..a20a547 100644
--- a/langport/service/server/optimum_generation_worker.py
+++ b/langport/service/server/optimum_generation_worker.py
@@ -2,6 +2,7 @@
 import os
 import random
 import uuid
+import warnings
 import uvicorn
 
 from langport.workers.generation_worker import GenerationModelWorker
@@ -37,6 +38,9 @@
         )
     os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
 
+    if args.load_8bit or args.load_4bit:
+        warnings.warn("The optimum backend does not yet support quantization parameters.")
+
     if args.port is None:
         args.port = random.randint(21001, 29001)
 
diff --git a/langport/version.py b/langport/version.py
index 0225648..faa8328 100644
--- a/langport/version.py
+++ b/langport/version.py
@@ -1 +1 @@
-LANGPORT_VERSION = "0.3.1"
\ No newline at end of file
+LANGPORT_VERSION = "0.3.2"
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 839b3d5..f82d0d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "langport"
-version = "0.3.1"
+version = "0.3.2"
 description = "A large language model serving platform."
 readme = "README.md"
 requires-python = ">=3.8"
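
For context, the executor changes above replace the old boolean `load_8bit` flag with a single optional `quantization` string defaulting to `None`, and the ggml/optimum workers only warn when quantization flags are passed. The sketch below is a minimal, self-contained illustration of that pattern; the function and parameter names are hypothetical and are not part of langport's API.

```python
from typing import Optional
import warnings

# Illustrative sketch only (names here are hypothetical, not langport's API):
# the pattern this patch moves to, replacing a boolean `load_8bit` flag with a
# single `quantization: Optional[str]` parameter that defaults to None.
def load_model(model_path: str, quantization: Optional[str] = None) -> str:
    if quantization is not None:
        # Backends that cannot quantize yet (ggml, optimum in this patch) warn
        # and fall back to full precision instead of raising an error.
        warnings.warn(f"This backend does not yet support quantization parameters ({quantization}).")
        quantization = None
    return f"loaded {model_path} (quantization={quantization})"

if __name__ == "__main__":
    print(load_model("my-model"))          # full precision, no warning
    print(load_model("my-model", "8bit"))  # warns, then loads unquantized
```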