Merge pull request #16 from vtuber-plan/development
Implement the logprobs parameter
FrostMiKu authored Jul 14, 2023
2 parents 1715fa2 + a9be5c7 commit 3f87a74
Showing 16 changed files with 211 additions and 28 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -56,6 +56,7 @@ We create 32 threads to submit chat tasks to the server, and the following figur
![benchmark_chat](assets/benchmark_chat.jpg)

## News
- [2023/07/13] Support the `logprobs` generation parameter.
- [2023/06/18] Add ggml (llama.cpp gpt.cpp starcoder.cpp etc.) worker support.
- [2023/06/09] Add LLama.cpp worker support.
- [2023/06/01] Add HuggingFace Bert embedding worker support.
2 changes: 1 addition & 1 deletion docs/openai_api.md
@@ -73,7 +73,7 @@ Here we list the parameter compatibility of completions API.
| `top_p` ||| `1.0` | - |
| `n` ||| `1` | `COMPLETION_MAX_N` |
| `stream` ||| `false` | - |
| `logprobs` | || `0` | `COMPLETION_MAX_LOGPROBS` |
| `logprobs` | || `0` | `COMPLETION_MAX_LOGPROBS` |
| `echo` ||| `false` | - |
| `stop` ||| - | - |
| `presence_penalty` ||| - | - |
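To illustrate what the updated `logprobs` row means in practice, here is a minimal sketch of a completions request against an OpenAI-compatible langport deployment. The base URL, port, and model name are placeholders, and the requested number of logprobs is subject to the `COMPLETION_MAX_LOGPROBS` limit noted in the table.

```python
import requests

# Hypothetical endpoint and model name; adjust to your deployment.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "ningyu",
        "prompt": "Hello, my name is",
        "max_tokens": 16,
        "logprobs": 5,  # top-5 alternatives per generated token
    },
)
choice = resp.json()["choices"][0]
print(choice["text"])
# In the OpenAI-style response, choice["logprobs"] carries tokens,
# token_logprobs, top_logprobs, and text_offset.
print(choice.get("logprobs"))
```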
13 changes: 13 additions & 0 deletions langport/data/conversation/settings/ningyu.py
@@ -0,0 +1,13 @@
from langport.data.conversation import (
    ConversationSettings,
    SeparatorStyle,
)

# Ningyu default template
ningyu = ConversationSettings(
    name="ningyu",
    roles=("user", "assistant"),
    sep_style=SeparatorStyle.ADD_COLON_SINGLE,
    sep="\n### ",
    stop_str="###",
)
22 changes: 22 additions & 0 deletions langport/model/adapters/ningyu.py
@@ -0,0 +1,22 @@
from langport.data.conversation import ConversationHistory, SeparatorStyle
from langport.data.conversation.conversation_settings import get_conv_settings
from langport.model.model_adapter import BaseAdapter


class NingYuAdapter(BaseAdapter):
    """The model adapter for ningyu"""

    def match(self, model_path: str):
        return "ningyu" in model_path

    def get_default_conv_template(self, model_path: str) -> ConversationHistory:
        settings = get_conv_settings("ningyu")
        return ConversationHistory(
            system="""A chat between a curious user and an artificial intelligence assistant.
The name of the assistant is NingYu (凝语).
The assistant gives helpful, detailed, and polite answers to the user's questions.""",
            messages=[],
            offset=0,
            settings=settings,
        )
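For orientation, the sketch below shows the kind of prompt these settings are commonly used to render: the system text, then `role: message` turns joined by the separator, with generation stopped at `stop_str`. This is an assumption about how `ADD_COLON_SINGLE` is typically rendered, not a verbatim copy of langport's `ConversationHistory.get_prompt()`.

```python
# Illustrative rendering only; the exact format is defined by langport's
# SeparatorStyle.ADD_COLON_SINGLE implementation.
system = (
    "A chat between a curious user and an artificial intelligence assistant.\n"
    "The name of the assistant is NingYu (凝语).\n"
    "The assistant gives helpful, detailed, and polite answers to the user's questions."
)
sep = "\n### "
turns = [("user", "What's your name?"), ("assistant", None)]

prompt = system
for role, content in turns:
    if content is None:
        prompt += sep + role + ":"  # leave the assistant turn open for generation
    else:
        prompt += sep + role + ": " + content
print(prompt)
# Decoding would stop once the model emits the stop_str "###".
```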

8 changes: 5 additions & 3 deletions langport/model/compression.py
@@ -44,7 +44,11 @@ def __init__(self, weight=None, bias=None, device=None):

    def forward(self, input: Tensor) -> Tensor:
        weight = decompress(self.weight, default_compression_config)
        return F.linear(input.to(weight.dtype), weight, self.bias)
        if self.bias is not None:
            bias = self.bias.to(weight.dtype)
        else:
            bias = self.bias
        return F.linear(input.to(weight.dtype), weight, bias)


def compress_module(module, target_device):
@@ -138,7 +142,6 @@ def load_compress_model(model_path, device, torch_dtype):

    return model, tokenizer


def compress(tensor, config):
    """Simulate group-wise quantization."""
    if not config.enabled:
@@ -191,7 +194,6 @@ def compress(tensor, config):
    data = data.clamp_(0, B).round_().to(torch.uint8)
    return data, mn, scale, original_shape


def decompress(packed_data, config):
    """Simulate group-wise dequantization."""
    if not config.enabled:
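The `forward()` change above avoids a dtype mismatch between the decompressed weight and a bias that is still stored in its original dtype, casting the bias only when it exists. A minimal, self-contained sketch of the same cast; the shapes and dtypes are illustrative only:

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 4)
weight = torch.randn(3, 4)                  # stands in for decompress() output
bias = torch.randn(3, dtype=torch.float16)  # bias kept in a different dtype

# F.linear(x.to(weight.dtype), weight, bias)  # dtype mismatch raises at runtime
out = F.linear(x.to(weight.dtype), weight, bias.to(weight.dtype))
print(out.dtype)  # matches the weight dtype
```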
14 changes: 10 additions & 4 deletions langport/model/executor/generation/ggml.py
@@ -28,14 +28,16 @@ def stream_generation(
    output_ids = []

    # Compatible with some models
    top_k = 40 if task.top_k <= 1 else task.top_k
    repetition_penalty = 1.17647 if task.repetition_penalty == 0.0 else task.repetition_penalty
    top_k = 10 if task.top_k <= 1 else task.top_k
    repetition_penalty = 1.01 if task.repetition_penalty == 0.0 else task.repetition_penalty
    model.config.max_new_tokens = task.max_tokens

    finish_reason = "stop"
    n_tokens = 0
    for token in model.generate(
        tokens, top_k=top_k, top_p=task.top_p, batch_size=512,
        temperature=task.temperature, repetition_penalty=repetition_penalty):
        tokens, top_k=top_k, top_p=task.top_p, batch_size=model.config.batch_size,
        threads=model.config.threads, temperature=task.temperature,
        last_n_tokens=256, repetition_penalty=repetition_penalty, reset=True):
        n_tokens += 1
        output_ids.append(token)
        if n_tokens == task.max_tokens:
@@ -94,6 +96,8 @@ def __init__(
        model_path: str,
        context_length: int,
        gpu_layers: int,
        chunk_size: int,
        threads: int,
        model_type: str = "llama",
        lib: Optional[str] = None,
    ) -> None:
@@ -105,6 +109,8 @@ def __init__(
            num_gpus=n_gpu,
            max_gpu_memory=None,
            gpu_layers=gpu_layers,
            chunk_size=chunk_size,
            threads=threads,
            lib=lib,
            model_type=model_type,
        )
124 changes: 114 additions & 10 deletions langport/model/executor/generation/huggingface.py
@@ -1,18 +1,19 @@
from typing import Any, Iterable, List, Optional, Union
from typing import Any, Dict, Iterable, List, Optional, Union
from langport.model.executor.generation import BaseStreamer

from langport.model.executor.huggingface import HuggingfaceExecutor

from langport.protocol.worker_protocol import (
    BaseWorkerResult,
    GenerationTask,
    GenerationWorkerLogprobs,
    GenerationWorkerResult,
    UsageInfo,
)

import torch

from transformers import PreTrainedModel, PreTrainedTokenizerBase
from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers.generation.logits_process import (
    LogitsProcessor,
    LogitsProcessorList,
@@ -30,6 +31,12 @@

import torch

def token_to_unicode(token: str) -> str:
    utf8_bytes = token.encode("utf-8")
    # Convert the bytes to a string with \\x escape sequences
    escaped_bytes = "".join([f"\\x{b:02x}" for b in utf8_bytes])
    return escaped_bytes

@cached(LRUCache(maxsize=64))
def prepare_logits_processor(
    temperature: float, repetition_penalty: float, top_p: float, top_k: int
@@ -48,7 +55,7 @@ def prepare_logits_processor(


class BatchingTask:
    def __init__(self, tasks: List[GenerationTask], tokenizer: PreTrainedTokenizerBase, device: str, is_encoder_decoder: bool) -> None:
    def __init__(self, tasks: List[GenerationTask], tokenizer: PreTrainedTokenizer, device: str, is_encoder_decoder: bool) -> None:
        self.batch_size = len(tasks)
        if self.batch_size == 0:
            return
@@ -79,6 +86,8 @@ def __init__(self, tasks: List[GenerationTask], tokenizer: PreTrainedTokenizerBa

        # variables used in the streaming process
        self.batch_tokens_cache: List[List[int]] = [[] for i in range(self.batch_size)]
        self.batch_tokens_probs_cache: List[List[float]] = [[] for i in range(self.batch_size)]
        self.batch_top_logprobs_cache: List[List[Dict[str, float]]] = [[] for i in range(self.batch_size)]
        self.stop = [False for i in range(self.batch_size)]

    def __len__(self):
@@ -132,15 +141,31 @@ def get_logits_processor_list(self, idx:int) -> LogitsProcessorList:
        self._check_idx(idx)
        return self.logits_processor_list[idx]

    def get_generated_ids(self, idx:int) -> List[int]:
    def get_generated_ids(self, idx: int) -> List[int]:
        self._check_idx(idx)
        return self.batch_tokens_cache[idx]

    def get_generated_length(self, idx:int) -> int:
    def get_generated_length(self, idx: int) -> int:
        return len(self.get_generated_ids(idx))

    def update_new_token(self, batch_token: List[int]):
    def get_generated_token_probs(self, idx: int) -> List[float]:
        self._check_idx(idx)
        return self.batch_tokens_probs_cache[idx]

    def get_generated_top_logprobs(self, idx: int) -> List[Dict[int, float]]:
        self._check_idx(idx)
        return self.batch_top_logprobs_cache[idx]

    def update_new_token(self, batch_token: List[int],
        token_probs: Optional[List[Optional[float]]]=None,
        top_logprobs: Optional[List[Optional[Dict[int, float]]]]=None
    ):
        self._check_batch_size(batch_token)
        if token_probs is not None:
            self._check_batch_size(token_probs)
        if top_logprobs is not None:
            self._check_batch_size(top_logprobs)

        for i, token in enumerate(batch_token):
            if self.is_stop(i):
                continue
@@ -153,6 +178,11 @@ def update_new_token(self, batch_token: List[int]):
                self.set_stop(i)
            if self.get_generated_length(i) == self.max_tokens[i]:
                self.set_stop(i)

            if token_probs is not None and token_probs[i] is not None:
                self.batch_tokens_probs_cache[i].append(token_probs[i])
            if top_logprobs is not None and top_logprobs[i] is not None:
                self.batch_top_logprobs_cache[i].append(top_logprobs[i])

    def set_stop(self, idx:int):
        self._check_idx(idx)
@@ -227,6 +257,9 @@ def generate(self, inputs: BatchingTask,
        decoder_input_ids_list = []

        new_ids = []
        # logprobs
        token_probs = [None] * inputs.batch_size
        top_logprobs = [None] * inputs.batch_size

        for i, task in enumerate(inputs.tasks):
            if inputs.is_stop(i):
@@ -253,10 +286,18 @@
            else:
                probs = torch.softmax(last_token_logits, dim=-1)
                token = int(torch.multinomial(probs, num_samples=1))

            if task.logprobs is not None:
                token_probs[i] = each_logits[0, token].item()
                top_values, top_indices = torch.topk(each_logits[0, :], task.logprobs, dim=-1, largest=True, sorted=True)
                item = {}
                for top_i in range(len(top_values)):
                    item[top_indices[top_i].item()] = top_values[top_i].item()
                top_logprobs[i] = item
            new_ids.append(token)

        # update state
        inputs.update_new_token(new_ids)
        inputs.update_new_token(new_ids, token_probs=token_probs, top_logprobs=top_logprobs)
        if streamer:
            streamer.put(new_ids)

@@ -284,7 +325,7 @@ def generate(self, inputs: BatchingTask,
class GenerationWorkerStreamer(BaseStreamer):
    def __init__(self,
        task_batch: BatchingTask,
        tokenizer: PreTrainedTokenizerBase,
        tokenizer: PreTrainedTokenizer,
        worker: "GenerationModelWorker") -> None:
        self.task_batch = task_batch
        self.tokenizer = tokenizer
@@ -299,13 +340,74 @@ def put(self, value):
            if (self.done[i] or generated_len % self.stream_interval != 0) and self.done[i]==self.task_batch.is_stop(i):
                continue
            task = self.task_batch.tasks[i]
            text = self.tokenizer.decode(self.task_batch.get_generated_ids(i), skip_special_tokens=True)

            token_ids = self.task_batch.get_generated_ids(i)

            # text = self.tokenizer.decode(token_ids, skip_special_tokens=True)
            tokens = self.tokenizer.convert_ids_to_tokens(token_ids)
            text = self.tokenizer.convert_tokens_to_string(tokens)

            # get offset mapping from token to text
            text_offset = []
            for token_i in range(0, len(tokens)):
                if token_i == 0:
                    text_offset.append(-1)
                    continue
                prefix_text = self.tokenizer.convert_tokens_to_string(tokens[:token_i])
                if text.startswith(prefix_text):
                    text_offset.append(len(prefix_text))
                else:
                    text_offset.append(-1)

            last_id = len(text)
            for token_i in reversed(range(0, len(tokens))):
                if text_offset[token_i] == -1:
                    text_offset[token_i] = last_id
                else:
                    last_id = text_offset[token_i]

            token_logprobs = self.task_batch.get_generated_token_probs(i)
            top_logprobs = self.task_batch.get_generated_top_logprobs(i)
            if top_logprobs is not None:
                top_logprobs_new = []
                for prob in top_logprobs:
                    top_logprobs_new.append({self.tokenizer.convert_ids_to_tokens(k): v for k, v in prob.items()})
                top_logprobs = top_logprobs_new

            # remove stop words
            stop_pos = stop_by_stopwords(text, 0, task.stop)
            if stop_pos != -1:
                token_stop_pos = len(tokens)
                for token_i in reversed(range(0, len(text_offset))):
                    if text_offset[token_i] < stop_pos:
                        token_stop_pos = token_i + 1
                        break

                self.task_batch.set_stop(i)

                # remove tokens after stop pos
                text = text[:stop_pos]
                tokens = tokens[:token_stop_pos]
                if token_logprobs is not None:
                    token_logprobs = token_logprobs[:token_stop_pos]
                if top_logprobs is not None:
                    top_logprobs = top_logprobs[:token_stop_pos]
                text_offset = text_offset[:token_stop_pos]

            prompt_len = self.task_batch.get_prompt_length(i)


            # logprobs
            if self.task_batch.tasks[i].logprobs is not None:
                logprobs = GenerationWorkerLogprobs(
                    tokens=tokens,
                    token_logprobs=token_logprobs,
                    top_logprobs=top_logprobs,
                    text_offset=text_offset,
                )
            else:
                logprobs = None

            # push task to queue
            if self.task_batch.is_stop(i):
                if generated_len == self.task_batch.max_tokens[i]:
                    finish_reason = "length"
@@ -321,6 +423,7 @@
                        total_tokens=prompt_len + generated_len,
                        completion_tokens=generated_len,
                    ),
                    logprobs=logprobs,
                    finish_reason=finish_reason,
                )
            )
@@ -339,6 +442,7 @@ def put(self, value):
                        total_tokens=prompt_len + generated_len,
                        completion_tokens=generated_len,
                    ),
                    logprobs=logprobs,
                    finish_reason=None,
                )
            )
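The logprobs collection above keeps the top-k values of the processed logits per sampled token and maps token ids to token strings in the streamer. The following is a standalone sketch of the same general technique with a plain Hugging Face causal LM, normalizing with `log_softmax` before `torch.topk`; the model name, prompt, and `k` are placeholders, and this is not the worker code itself.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder model
k = 5

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
tokens, top_logprobs = [], []

with torch.no_grad():
    for _ in range(5):  # greedy-decode a few tokens
        logits = model(input_ids).logits[0, -1, :]
        logprobs = torch.log_softmax(logits, dim=-1)
        values, indices = torch.topk(logprobs, k)
        top_logprobs.append(
            {tokenizer.convert_ids_to_tokens(int(i)): float(v) for i, v in zip(indices, values)}
        )
        next_id = int(torch.argmax(logprobs))
        tokens.append(tokenizer.convert_ids_to_tokens(next_id))
        input_ids = torch.cat([input_ids, torch.tensor([[next_id]])], dim=-1)

for tok, top in zip(tokens, top_logprobs):
    print(tok, top)
```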
6 changes: 6 additions & 0 deletions langport/model/executor/ggml.py
@@ -28,6 +28,8 @@ def __init__(
        lib: Optional[str] = None,
        gpu_layers: int = 0,
        model_type: str = 'llama',
        chunk_size: int = 1024,
        threads: int = -1,
        load_8bit: bool = False,
        cpu_offloading: bool = False,
    ) -> None:
@@ -44,13 +46,17 @@ def __init__(
        # ctransformers has a bug
        self.lib = lib
        self.model_type = model_type
        self.chunk_size = chunk_size
        self.threads = threads


    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        adapter = get_model_adapter(model_path)
        config = Config()
        setattr(config, 'stream', True)
        setattr(config, 'gpu_layers', self.gpu_layers)
        setattr(config, 'batch_size', self.chunk_size)
        setattr(config, 'threads', self.threads)
        auto_config = AutoConfig(config=config, model_type=self.model_type)
        model = AutoModelForCausalLM.from_pretrained(model_path,
            config=auto_config,
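The new `chunk_size` and `threads` options reach ctransformers through the `Config`/`AutoConfig` objects built in `load_model()` above. A minimal sketch of the same wiring outside langport; the model path and parameter values are placeholders:

```python
from ctransformers import AutoConfig, AutoModelForCausalLM, Config

config = Config()
config.stream = True
config.gpu_layers = 0      # CPU-only in this sketch
config.batch_size = 1024   # corresponds to the executor's chunk_size
config.threads = -1        # mirrors the executor's default

auto_config = AutoConfig(config=config, model_type="llama")
model = AutoModelForCausalLM.from_pretrained(
    "path/to/model.ggml.bin",  # placeholder checkpoint path
    config=auto_config,
)

tokens = model.tokenize("Hello")
for token in model.generate(tokens, top_k=10, top_p=0.95, temperature=0.7,
                            batch_size=config.batch_size, threads=config.threads):
    print(model.detokenize([token]), end="", flush=True)
```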