Merge pull request #9 from vtuber-plan/ggml
Change llama.cpp support to ggml
FrostMiKu authored Jun 18, 2023
2 parents 83473a7 + 5165de1 commit 7428edf
Showing 9 changed files with 116 additions and 124 deletions.
20 changes: 14 additions & 6 deletions README.md
@@ -19,6 +19,8 @@ Our goal is to build a super fast LLM inference service.
This project is inspired by [lmsys/fastchat](https://github.com/lm-sys/FastChat). We want the serving platform to be lightweight and fast, but fastchat includes other features, such as training and evaluation, that make it complicated.

The core features include:
- Huggingface transformers support.
- ggml (llama.cpp) support.
- A distributed serving system for state-of-the-art models.
- Streaming generation support with various decoding strategies.
- Batch inference for higher throughput.
@@ -36,6 +38,7 @@ We create 32 threads to submit chat tasks to the server, and the following figur


## News
- [2023/06/18] Add ggml (llama.cpp, gpt.cpp, starcoder.cpp, etc.) worker support.
- [2023/06/09] Add LLama.cpp worker support.
- [2023/06/01] Add HuggingFace Bert embedding worker support.
- [2023/06/01] Add HuggingFace text generation API support.
@@ -51,18 +54,23 @@ We create 32 threads to submit chat tasks to the server, and the following figur
### Method 1: With pip

```bash
pip3 install langport
pip install langport
```

or:

```bash
pip3 install git+https://github.com/vtuber-plan/langport.git
pip install git+https://github.com/vtuber-plan/langport.git
```

If you need llamacpp generation worker, use this command:
If you need the ggml generation worker, use this command:
```bash
pip3 install langport[llamacpp]
pip install langport[ggml]
```

If you want to use the GPU:
```bash
CT_CUBLAS=1 pip install langport[ggml]
```

### Method 2: From source
@@ -114,10 +122,10 @@ python -m langport.service.server.generation_worker --port 21001 --model-path <y
python -m langport.service.gateway.openai_api
```

Run text generation with LLama.cpp worker:
Run text generation with the ggml worker:

```bash
python -m langport.service.server.generation_worker --port 21001 --model-path <your model path> --n-gpu-layers <num layer to gpu (resize this for your VRAM)>
python -m langport.service.server.ggml_generation_worker --port 21001 --model-path <your model path> --gpu-layers <number of layers to offload to the GPU (adjust for your VRAM)>
```

## License
12 changes: 0 additions & 12 deletions langport/data/conversation/settings/llama_cpp.py

This file was deleted.

29 changes: 0 additions & 29 deletions langport/model/adapters/llama_cpp.py

This file was deleted.

59 changes: 28 additions & 31 deletions langport/model/executor/generation/ggml.py
@@ -1,17 +1,13 @@
from typing import List, Optional

from llama_cpp import Llama, LlamaTokenizer

from langport.model.executor.llamacpp import LlamaCppExecutor
from langport.model.model_adapter import get_model_adapter
from langport.model.executor.base import BaseModelExecutor, LocalModelExecutor
from langport.model.executor.ggml import GgmlExecutor, GgmlTokenizer
from ctransformers import LLM
from langport.protocol.worker_protocol import BaseWorkerResult, GenerationTask, GenerationWorkerResult, UsageInfo
from langport.workers.generation_worker import GenerationModelWorker


def batch_generation(
model: Llama,
tokenizer: LlamaTokenizer,
def stream_generation(
model: LLM,
tokenizer: GgmlTokenizer,
stream_interval: int,
tasks: List[GenerationTask],
):
@@ -21,22 +17,25 @@ def batch_generation(

# todo: add stop words support
for i, task in enumerate(tasks):

output = ""

if task.echo:
output = task.prompt
else:
tokens = tokenizer.encode(" " + task.prompt + " ")
tokens = tokenizer.encode(task.prompt)
prompt_length = len(tokens)
output_ids = []

for j, token in enumerate(model.generate(tokens, top_k=40, top_p=task.top_p,
temp=task.temperature, repeat_penalty=1.17647)):
# Compatible with some models
top_k = 40 if task.top_k <= 1 else task.top_k
repetition_penalty = 1.17647 if task.repetition_penalty == 0.0 else task.repetition_penalty

for j, token in enumerate(model.generate(tokens, top_k=top_k, top_p=task.top_p,
temperature=task.temperature, repetition_penalty=repetition_penalty)):
output_ids.append(token)
if token == model.token_eos() or len(tokens) + j == task.max_tokens - 1:
if tokenizer.is_eos_token(token) or prompt_length + j == task.max_tokens - 1:
output = tokenizer.decode(output_ids)
if token == model.token_eos():
if tokenizer.is_eos_token(token):
finish_reason = "stop"
else:
finish_reason = "length"
@@ -71,31 +70,29 @@ def batch_generation(
)


class LlamaCppGenerationExecutor(LlamaCppExecutor):
class GgmlGenerationExecutor(GgmlExecutor):
def __init__(
self,
model_name: str,
model_path: str,
n_ctx: int,
n_gpu_layers: int,
seed: int,
n_batch: int,
last_n_tokens_size: int
context_length: int,
gpu_layers: int,
model_type: str = "llama",
lib: Optional[str] = None,
) -> None:
super(LlamaCppGenerationExecutor, self).__init__(
n_gpu = 1 if gpu_layers > 0 else 0
super(GgmlGenerationExecutor, self).__init__(
model_name=model_name,
model_path=model_path,
device="cpu",
num_gpus=1,
num_gpus=n_gpu,
max_gpu_memory=None,
gpu_layers=gpu_layers,
lib=lib,
model_type=model_type,
)
self.n_ctx = n_ctx
self.adapter = get_model_adapter(model_path)
self.model, self.tokenizer = self.load_model(model_path, {"n_ctx":n_ctx,
"n_gpu_layers":n_gpu_layers,
"seed":seed,
"n_batch":n_batch,
"last_n_tokens_size":last_n_tokens_size,})
self.n_ctx = context_length
self.adapter, self.model, self.tokenizer = self.load_model(model_path, from_pretrained_kwargs={})

@property
def context_length(self) -> int:
@@ -114,7 +111,7 @@ def inference(self, worker: "GenerationModelWorker"):
return

# batch inference
for chunk in batch_generation(
for chunk in stream_generation(
self.model,
self.tokenizer,
worker.stream_interval,
61 changes: 61 additions & 0 deletions langport/model/executor/ggml.py
@@ -0,0 +1,61 @@
from typing import List, Optional
from ctransformers import AutoModelForCausalLM, LLM, AutoConfig, Config
from langport.model.executor.base import LocalModelExecutor
from langport.model.model_adapter import get_model_adapter

class GgmlTokenizer:
def __init__(self, model:LLM) -> None:
self.model = model

def encode(self, text: str) -> List[int]:
return self.model.tokenize(text)

def decode(self, tokens: List[int]) -> str:
return self.model.detokenize(tokens)

def is_eos_token(self, token: int) -> bool:
return self.model.is_eos_token(token)


class GgmlExecutor(LocalModelExecutor):
def __init__(
self,
model_name: str,
model_path: str,
device: str,
num_gpus: int,
max_gpu_memory: Optional[str],
lib: Optional[str] = None,
gpu_layers: int = 0,
model_type: str = 'llama',
load_8bit: bool = False,
cpu_offloading: bool = False,
) -> None:
super(GgmlExecutor, self).__init__(
model_name = model_name,
model_path = model_path,
device = device,
num_gpus = num_gpus,
max_gpu_memory = max_gpu_memory,
load_8bit = load_8bit,
cpu_offloading = cpu_offloading,
)
self.gpu_layers = gpu_layers
# ctransformers has a bug
self.lib = lib
self.model_type = model_type


def load_model(self, model_path: str, from_pretrained_kwargs: dict):
adapter = get_model_adapter(model_path)
config = Config()
setattr(config, 'stream', True)
setattr(config, 'gpu_layers', self.gpu_layers)
auto_config = AutoConfig(config=config, model_type=self.model_type)
model = AutoModelForCausalLM.from_pretrained(model_path,
config=auto_config,
lib=self.lib,
)
tokenizer = GgmlTokenizer(model)

return adapter, model, tokenizer
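
For orientation, here is a minimal sketch of driving the new `GgmlExecutor` and `GgmlTokenizer` directly, outside the `GenerationModelWorker` loop. The model path and sampling values are illustrative only and are not part of this commit.

```python
# Minimal sketch (illustrative values): load a ggml checkpoint via the new
# executor and stream tokens until EOS, mirroring what stream_generation does.
from langport.model.executor.ggml import GgmlExecutor

model_path = "/path/to/llama-7b.ggmlv3.q4_0.bin"  # hypothetical ggml model file

executor = GgmlExecutor(
    model_name="llama-7b-ggml",
    model_path=model_path,
    device="cpu",
    num_gpus=0,
    max_gpu_memory=None,
    gpu_layers=0,          # > 0 offloads layers to the GPU (needs the CT_CUBLAS build)
    model_type="llama",
)
adapter, model, tokenizer = executor.load_model(model_path, from_pretrained_kwargs={})

tokens = tokenizer.encode("Hello, my name is")
output_ids = []
for token in model.generate(tokens, top_k=40, top_p=0.95,
                            temperature=0.7, repetition_penalty=1.1):
    output_ids.append(token)
    if tokenizer.is_eos_token(token):
        break
print(tokenizer.decode(output_ids))
```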
31 changes: 0 additions & 31 deletions langport/model/executor/llamacpp.py

This file was deleted.

22 changes: 10 additions & 12 deletions langport/service/server/ggml_generation_worker.py
@@ -24,11 +24,10 @@
parser.add_argument("--batch", type=int, default=1)
parser.add_argument("--stream-interval", type=int, default=2)

parser.add_argument("--n-ctx", type=int, default=2048)
parser.add_argument("--n-gpu-layers", type=int, default=24)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--n-batch", type=int, default=1024)
parser.add_argument("--last-n-tokens-size", type=int, default=1024)
parser.add_argument("--context-length", type=int, default=2048)
parser.add_argument("--gpu-layers", type=int, default=0)
parser.add_argument("--lib", type=str, default=None, choices=["avx2", "avx", "basic"], help="The path to a shared library or one of avx2, avx, basic.")
parser.add_argument("--model-type", type=str, default="llama", choices=["llama", "gpt2", "dolly-v2", "starcoder"], help="The type of model to use.")
args = parser.parse_args()

node_id = str(uuid.uuid4())
@@ -51,15 +50,14 @@
if args.model_name is None:
args.model_name = os.path.basename(os.path.normpath(args.model_path))

from langport.model.executor.generation.llamacpp import LlamaCppGenerationExecutor
executor = LlamaCppGenerationExecutor(
from langport.model.executor.generation.ggml import GgmlGenerationExecutor
executor = GgmlGenerationExecutor(
model_name=args.model_name,
model_path=args.model_path,
n_ctx=args.n_ctx,
n_gpu_layers=args.n_gpu_layers,
seed=args.seed,
n_batch=args.n_batch,
last_n_tokens_size=args.last_n_tokens_size
context_length=args.context_length,
gpu_layers=args.gpu_layers,
lib=args.lib,
model_type=args.model_type,
)

app.node = GenerationModelWorker(
2 changes: 1 addition & 1 deletion langport/version.py
@@ -1 +1 @@
LANGPORT_VERSION = "0.1.0"
LANGPORT_VERSION = "0.2.0"
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "langport"
version = "0.1.0"
version = "0.2.0"
description = "A large language model serving platform."
readme = "README.md"
requires-python = ">=3.8"
@@ -21,7 +21,7 @@ dependencies = [

[project.optional-dependencies]
dev = ["black==23.3.0", "pylint==2.8.2"]
llamacpp = ["llama-cpp-python"]
ggml = ["ctransformers"]

[project.urls]
"Homepage" = "https://github.com/vtuber-plan/langport"