Merge pull request #19 from vtuber-plan/AFM
Bump to 0.3.2 version
FrostMiKu authored Jul 19, 2023
2 parents a60ca8a + 8af1e1b commit ccc8d67
Showing 7 changed files with 14 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -34,7 +34,7 @@ The core features include:
* LLaMa, LLaMa2, GLM, Bloom, OPT, GPT2, GPT Neo, GPT Big Code and so on.

## Tested Models
- * LLaMa, LLaMa2-chat, Vicuna, ChatGLM, ChatGLM2, Falcon, Starcoder, WizardLM, InternLM, OpenBuddy, FireFly, CodeGen, Phoenix, RWKV, StableLM, NingYu and so on.
+ * NingYu, LLaMa, LLaMa2-chat, Vicuna, ChatGLM, ChatGLM2, Falcon, Starcoder, WizardLM, InternLM, OpenBuddy, FireFly, CodeGen, Phoenix, RWKV, StableLM and so on.


## Benchmark
2 changes: 1 addition & 1 deletion langport/model/executor/base.py
@@ -26,7 +26,7 @@ def __init__(
device: str,
num_gpus: int,
max_gpu_memory: Optional[str],
- quantization: Optional[str] = False,
+ quantization: Optional[str] = None,
cpu_offloading: bool = False,
) -> None:
super(LocalModelExecutor, self).__init__(
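The change above corrects a type mismatch: the parameter is annotated `Optional[str]`, so its "not set" default should be `None`, not the boolean `False`. A minimal sketch of the distinction, using a hypothetical `load_model` helper for illustration only:

```python
from typing import Optional

# Hypothetical helper, not from the repository: the annotation promises either a
# string such as "8bit" or the absence of a value, so None is the right sentinel.
def load_model(quantization: Optional[str] = None) -> None:
    if quantization is None:
        print("loading full-precision weights")
    else:
        print(f"loading weights with quantization mode: {quantization}")

load_model()        # loading full-precision weights
load_model("8bit")  # loading weights with quantization mode: 8bit
```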
4 changes: 2 additions & 2 deletions langport/model/executor/ggml.py
@@ -30,7 +30,7 @@ def __init__(
model_type: str = 'llama',
chunk_size: int = 1024,
threads: int = -1,
- load_8bit: bool = False,
+ quantization: Optional[str] = None,
cpu_offloading: bool = False,
) -> None:
super(GgmlExecutor, self).__init__(
@@ -39,7 +39,7 @@ def __init__(
device = device,
num_gpus = num_gpus,
max_gpu_memory = max_gpu_memory,
- load_8bit = load_8bit,
+ quantization = quantization,
cpu_offloading = cpu_offloading,
)
self.gpu_layers = gpu_layers
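These two hunks swap the ggml executor's old `load_8bit` boolean for the shared `quantization` string and forward it to the base class. A minimal sketch of that forwarding pattern, assuming a simplified `LocalModelExecutor` base; constructor parameters not shown in the diff are omitted:

```python
from typing import Optional

class LocalModelExecutor:
    def __init__(
        self,
        device: str,
        num_gpus: int,
        max_gpu_memory: Optional[str],
        quantization: Optional[str] = None,
        cpu_offloading: bool = False,
    ) -> None:
        # Shared settings consumed by concrete executors when loading weights.
        self.device = device
        self.num_gpus = num_gpus
        self.max_gpu_memory = max_gpu_memory
        self.quantization = quantization
        self.cpu_offloading = cpu_offloading

class GgmlExecutor(LocalModelExecutor):
    def __init__(
        self,
        device: str,
        num_gpus: int,
        max_gpu_memory: Optional[str],
        gpu_layers: int = 0,
        quantization: Optional[str] = None,
        cpu_offloading: bool = False,
    ) -> None:
        # Forward the unified quantization setting instead of the old load_8bit flag.
        super().__init__(
            device=device,
            num_gpus=num_gpus,
            max_gpu_memory=max_gpu_memory,
            quantization=quantization,
            cpu_offloading=cpu_offloading,
        )
        self.gpu_layers = gpu_layers

# Example construction under these simplified signatures.
executor = GgmlExecutor(device="cpu", num_gpus=0, max_gpu_memory=None, quantization=None)
```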
4 changes: 4 additions & 0 deletions langport/service/server/ggml_generation_worker.py
@@ -2,6 +2,7 @@
import os
import random
import uuid
+ import warnings
import uvicorn

from langport.workers.generation_worker import GenerationModelWorker
@@ -41,6 +42,9 @@
f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!"
)
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

+ if args.load_8bit or args.load_4bit:
+     warnings.warn("The ggml backend does not yet support quantization parameters.")

if args.port is None:
args.port = random.randint(21001, 29001)
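The new guard warns users who pass quantization flags to a backend that ignores them, instead of failing at startup. A minimal sketch of how such a check sits in a worker's argument handling, assuming `--load-8bit` / `--load-4bit` are the CLI spellings behind `args.load_8bit` and `args.load_4bit`:

```python
import argparse
import warnings

parser = argparse.ArgumentParser()
# Assumed flag spellings; argparse maps them to args.load_8bit / args.load_4bit.
parser.add_argument("--load-8bit", action="store_true")
parser.add_argument("--load-4bit", action="store_true")
args = parser.parse_args()

# The flags are still accepted for CLI compatibility, but the ggml backend
# does not use them, so emit a warning instead of raising an error.
if args.load_8bit or args.load_4bit:
    warnings.warn("The ggml backend does not yet support quantization parameters.")
```

The optimum worker in the next file applies the same guard, only with its own backend name in the message.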
4 changes: 4 additions & 0 deletions langport/service/server/optimum_generation_worker.py
@@ -2,6 +2,7 @@
import os
import random
import uuid
+ import warnings
import uvicorn

from langport.workers.generation_worker import GenerationModelWorker
@@ -37,6 +38,9 @@
)
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

+ if args.load_8bit or args.load_4bit:
+     warnings.warn("The optimum backend does not yet support quantization parameters.")

if args.port is None:
args.port = random.randint(21001, 29001)

2 changes: 1 addition & 1 deletion langport/version.py
@@ -1 +1 @@
LANGPORT_VERSION = "0.3.1"
LANGPORT_VERSION = "0.3.2"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "langport"
version = "0.3.1"
version = "0.3.2"
description = "A large language model serving platform."
readme = "README.md"
requires-python = ">=3.8"
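The version bump is applied in both `langport/version.py` and `pyproject.toml`, keeping the runtime constant and the package metadata in sync. A minimal check of the constant, assuming the package from this commit is importable:

```python
# Assumes langport 0.3.2 from this commit is installed and importable.
from langport.version import LANGPORT_VERSION

assert LANGPORT_VERSION == "0.3.2"
print(f"langport version: {LANGPORT_VERSION}")
```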
