Merge pull request #19 from vtuber-plan/AFM
Bump to 0.3.2 version
FrostMiKu authored Jul 19, 2023
2 parents a60ca8a + 8af1e1b commit ccc8d67
Showing 7 changed files with 14 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -34,7 +34,7 @@ The core features include:
* LLaMa, LLaMa2, GLM, Bloom, OPT, GPT2, GPT Neo, GPT Big Code and so on.

## Tested Models
- * LLaMa, LLaMa2-chat, Vicuna, ChatGLM, ChatGLM2, Falcon, Starcoder, WizardLM, InternLM, OpenBuddy, FireFly, CodeGen, Phoenix, RWKV, StableLM, NingYu and so on.
+ * NingYu, LLaMa, LLaMa2-chat, Vicuna, ChatGLM, ChatGLM2, Falcon, Starcoder, WizardLM, InternLM, OpenBuddy, FireFly, CodeGen, Phoenix, RWKV, StableLM and so on.


## Benchmark
2 changes: 1 addition & 1 deletion langport/model/executor/base.py
@@ -26,7 +26,7 @@ def __init__(
device: str,
num_gpus: int,
max_gpu_memory: Optional[str],
- quantization: Optional[str] = False,
+ quantization: Optional[str] = None,
cpu_offloading: bool = False,
) -> None:
super(LocalModelExecutor, self).__init__(
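The change above corrects a type mismatch: the parameter is annotated `Optional[str]`, so its "not set" default should be `None`, not the boolean `False`. A minimal sketch of the distinction, using a hypothetical `load_model` helper for illustration only:

```python
from typing import Optional

# Hypothetical helper, not from the repository: the annotation promises either a
# string such as "8bit" or the absence of a value, so None is the right sentinel.
def load_model(quantization: Optional[str] = None) -> None:
    if quantization is None:
        print("loading full-precision weights")
    else:
        print(f"loading weights with quantization mode: {quantization}")

load_model()        # loading full-precision weights
load_model("8bit")  # loading weights with quantization mode: 8bit
```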
4 changes: 2 additions & 2 deletions langport/model/executor/ggml.py
@@ -30,7 +30,7 @@ def __init__(
model_type: str = 'llama',
chunk_size: int = 1024,
threads: int = -1,
- load_8bit: bool = False,
+ quantization: Optional[str] = None,
cpu_offloading: bool = False,
) -> None:
super(GgmlExecutor, self).__init__(
@@ -39,7 +39,7 @@ def __init__(
device = device,
num_gpus = num_gpus,
max_gpu_memory = max_gpu_memory,
- load_8bit = load_8bit,
+ quantization = quantization,
cpu_offloading = cpu_offloading,
)
self.gpu_layers = gpu_layers
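These two hunks swap the ggml executor's old `load_8bit` boolean for the shared `quantization` string and forward it to the base class. A minimal sketch of that forwarding pattern, assuming a simplified `LocalModelExecutor` base; constructor parameters not shown in the diff are omitted:

```python
from typing import Optional

class LocalModelExecutor:
    def __init__(
        self,
        device: str,
        num_gpus: int,
        max_gpu_memory: Optional[str],
        quantization: Optional[str] = None,
        cpu_offloading: bool = False,
    ) -> None:
        # Shared settings consumed by concrete executors when loading weights.
        self.device = device
        self.num_gpus = num_gpus
        self.max_gpu_memory = max_gpu_memory
        self.quantization = quantization
        self.cpu_offloading = cpu_offloading

class GgmlExecutor(LocalModelExecutor):
    def __init__(
        self,
        device: str,
        num_gpus: int,
        max_gpu_memory: Optional[str],
        gpu_layers: int = 0,
        quantization: Optional[str] = None,
        cpu_offloading: bool = False,
    ) -> None:
        # Forward the unified quantization setting instead of the old load_8bit flag.
        super().__init__(
            device=device,
            num_gpus=num_gpus,
            max_gpu_memory=max_gpu_memory,
            quantization=quantization,
            cpu_offloading=cpu_offloading,
        )
        self.gpu_layers = gpu_layers

# Example construction under these simplified signatures.
executor = GgmlExecutor(device="cpu", num_gpus=0, max_gpu_memory=None, quantization=None)
```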
4 changes: 4 additions & 0 deletions langport/service/server/ggml_generation_worker.py
@@ -2,6 +2,7 @@
import os
import random
import uuid
+ import warnings
import uvicorn

from langport.workers.generation_worker import GenerationModelWorker
@@ -41,6 +42,9 @@
f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!"
)
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

+ if args.load_8bit or args.load_4bit:
+     warnings.warn("The ggml backend does not yet support quantization parameters.")

if args.port is None:
args.port = random.randint(21001, 29001)
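The new guard warns users who pass quantization flags to a backend that ignores them, instead of failing at startup. A minimal sketch of how such a check sits in a worker's argument handling, assuming `--load-8bit` / `--load-4bit` are the CLI spellings behind `args.load_8bit` and `args.load_4bit`:

```python
import argparse
import warnings

parser = argparse.ArgumentParser()
# Assumed flag spellings; argparse maps them to args.load_8bit / args.load_4bit.
parser.add_argument("--load-8bit", action="store_true")
parser.add_argument("--load-4bit", action="store_true")
args = parser.parse_args()

# The flags are still accepted for CLI compatibility, but the ggml backend
# does not use them, so emit a warning instead of raising an error.
if args.load_8bit or args.load_4bit:
    warnings.warn("The ggml backend does not yet support quantization parameters.")
```

The optimum worker in the next file applies the same guard, only with its own backend name in the message.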
4 changes: 4 additions & 0 deletions langport/service/server/optimum_generation_worker.py
@@ -2,6 +2,7 @@
import os
import random
import uuid
+ import warnings
import uvicorn

from langport.workers.generation_worker import GenerationModelWorker
@@ -37,6 +38,9 @@
)
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

+ if args.load_8bit or args.load_4bit:
+     warnings.warn("The optimum backend does not yet support quantization parameters.")

if args.port is None:
args.port = random.randint(21001, 29001)

2 changes: 1 addition & 1 deletion langport/version.py
@@ -1 +1 @@
LANGPORT_VERSION = "0.3.1"
LANGPORT_VERSION = "0.3.2"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "langport"
version = "0.3.1"
version = "0.3.2"
description = "A large language model serving platform."
readme = "README.md"
requires-python = ">=3.8"
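The version bump is applied in both `langport/version.py` and `pyproject.toml`, keeping the runtime constant and the package metadata in sync. A minimal check of the constant, assuming the package from this commit is importable:

```python
# Assumes langport 0.3.2 from this commit is installed and importable.
from langport.version import LANGPORT_VERSION

assert LANGPORT_VERSION == "0.3.2"
print(f"langport version: {LANGPORT_VERSION}")
```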
