fix benchmark serving cannot use Qwen tokenizer #443

Merged · 2 commits · Sep 26, 2023
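Background, as a hedged note: Qwen checkpoints ship a tiktoken-based Hugging Face tokenizer rather than a sentencepiece tokenizer.model file, so the SentencePiece-only wrapper class both benchmark scripts defined fails its file check before encoding anything. A minimal sketch of the failure mode (the checkpoint paths are hypothetical):

import os

# The removed wrapper required a sentencepiece model file:
#     assert os.path.isfile(model_path), model_path
# LLaMA-style checkpoints ship one; Qwen checkpoints do not, so the assert
# fires for Qwen before any encoding happens.
for model_path in ('./llama-7b/tokenizer.model',       # present for LLaMA
                   './Qwen-7B-Chat/tokenizer.model'):  # absent for Qwen
    print(model_path, 'found' if os.path.isfile(model_path) else 'missing')

Switching both scripts to lmdeploy.turbomind.tokenizer.Tokenizer, which (at the time of this PR) dispatches to a sentencepiece or Hugging Face backend depending on what the checkpoint provides, makes the benchmarks tokenizer-agnostic.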
23 changes: 5 additions & 18 deletions benchmark/profile_restful_api.py
@@ -1,15 +1,14 @@
 import json
 import multiprocessing as mp
-import os
 import random
 import time
 from typing import Iterable, List

 import fire
 import numpy as np
 import requests
-from sentencepiece import SentencePieceProcessor

+from lmdeploy.turbomind.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger

@@ -45,20 +44,6 @@ def get_streaming_response(prompt: str,
             yield output, tokens


-class Tokenizer:
-
-    def __init__(self, model_path: str):
-        # reload tokenizer
-        assert os.path.isfile(model_path), model_path
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-
-    def encode(self, prompts: List):
-        prompts_token_ids = self.sp_model.Encode(prompts,
-                                                 add_bos=False,
-                                                 add_eos=False)
-        return [len(token_ids) for token_ids in prompts_token_ids]
-
-
 def infer(server_addr: str, session_id: int, req_queue: mp.Queue,
           res_que: mp.Queue):
     stats = []
@@ -132,8 +117,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,

     start = time.perf_counter()
     tokenizer = Tokenizer(tokenizer_path)
-    prompts_token_lens = tokenizer.encode(prompts)
-    completions_token_lens = tokenizer.encode(completions)
+    prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts]
+    completions_token_lens = [
+        len(tokenizer.encode(prompt)) for prompt in completions
+    ]
@AllentDan (Collaborator, Author) commented on Sep 26, 2023:

    This is slower than the batched `self.sp_model.Encode`.
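To make the comment concrete, a rough sketch of the two encoding styles (the paths are placeholders; the batched path needs a sentencepiece tokenizer.model, which is exactly what Qwen lacks):

import time

from sentencepiece import SentencePieceProcessor

from lmdeploy.turbomind.tokenizer import Tokenizer

prompts = ['Hello world'] * 10000  # synthetic workload

# Batched: one call into the SentencePiece C++ runtime for all prompts.
sp = SentencePieceProcessor(model_file='./llama-7b/tokenizer.model')
t0 = time.perf_counter()
sp_lens = [len(ids) for ids in sp.Encode(prompts, add_bos=False, add_eos=False)]
print(f'batched Encode: {time.perf_counter() - t0:.2f} s')

# Per-prompt: one Python-level call per prompt; slower, but tokenizer-agnostic.
tokenizer = Tokenizer('./llama-7b/tokenizer.model')
t0 = time.perf_counter()
lens = [len(tokenizer.encode(prompt)) for prompt in prompts]
print(f'per-prompt encode: {time.perf_counter() - t0:.2f} s')

The PR keeps the per-prompt form, since only the generic path can load Qwen's tokenizer at all.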
     print(f'elapsed time for tokenization: '
           f'{round(time.perf_counter() - start, 2)} s')

24 changes: 5 additions & 19 deletions benchmark/profile_serving.py
@@ -1,30 +1,14 @@
 import json
 import logging
 import multiprocessing as mp
-import os
 import random
 import time
-from typing import List

 import fire
 import numpy as np
-from sentencepiece import SentencePieceProcessor

 from lmdeploy.serve.turbomind.chatbot import Chatbot
-
-
-class Tokenizer:
-
-    def __init__(self, model_path: str):
-        # reload tokenizer
-        assert os.path.isfile(model_path), model_path
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-
-    def encode(self, prompts: List):
-        prompts_token_ids = self.sp_model.Encode(prompts,
-                                                 add_bos=False,
-                                                 add_eos=False)
-        return [len(token_ids) for token_ids in prompts_token_ids]
+from lmdeploy.turbomind.tokenizer import Tokenizer


 def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
@@ -103,8 +87,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,

     start = time.perf_counter()
     tokenizer = Tokenizer(tokenizer_path)
-    prompts_token_lens = tokenizer.encode(prompts)
-    completions_token_lens = tokenizer.encode(completions)
+    prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts]
+    completions_token_lens = [
+        len(tokenizer.encode(prompt)) for prompt in completions
+    ]
     print(f'elapsed time for tokenization: '
           f'{round(time.perf_counter() - start, 2)} s')

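If the per-prompt overhead ever matters at benchmark scale, one hypothetical compromise (not part of this PR; the helper name and dispatch rule are illustrative) is to keep the batched sentencepiece path when a model file exists and fall back to the generic Tokenizer otherwise:

import os
from typing import List

from lmdeploy.turbomind.tokenizer import Tokenizer


def token_lengths(tokenizer_path: str, texts: List[str]) -> List[int]:
    # Batched path: a sentencepiece model file is present, so one call
    # into the SentencePiece runtime encodes every text.
    if os.path.isfile(tokenizer_path) and tokenizer_path.endswith('.model'):
        from sentencepiece import SentencePieceProcessor
        sp = SentencePieceProcessor(model_file=tokenizer_path)
        ids = sp.Encode(texts, add_bos=False, add_eos=False)
        return [len(token_ids) for token_ids in ids]
    # Generic path: per-text encode via lmdeploy's Tokenizer (covers Qwen).
    tokenizer = Tokenizer(tokenizer_path)
    return [len(tokenizer.encode(text)) for text in texts]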