diff --git a/docs/reference/vllm.md b/docs/reference/vllm.md index 3b18dfbaf..fb71f8ef6 100644 --- a/docs/reference/vllm.md +++ b/docs/reference/vllm.md @@ -50,7 +50,7 @@ curl http://127.0.0.1:8000/generate \ }' ``` -To generate a string that matches the grammar ``: +To generate a string that matches a given grammar ``: ```bash curl http://127.0.0.1:8000/generate \ diff --git a/outlines/serve/vllm.py b/outlines/serve/vllm.py index bfb3c81a1..ded66223d 100644 --- a/outlines/serve/vllm.py +++ b/outlines/serve/vllm.py @@ -44,10 +44,16 @@ def _adapt_tokenizer(tokenizer): """Adapt vLLM's tokenizer to use to compile the FSM. The API of Outlines tokenizers is slightly different to that of - `transformers`. In addition we need to handle the missing spaces to - Llama's tokenizer to be able to compile FSMs for this model. + `transformers`. The decoder of outlines returns a list whereas + the decode of vLLM returns a str. To sync the vLLM decoder with + outlines' internal API, the decoder should be adapted. In addition + we need to handle the missing spaces to Llama's tokenizer to be + able to compile FSMs for this model. """ + if getattr(tokenizer, "_outlines_adapted", False): + return tokenizer + tokenizer.vocabulary = tokenizer.get_vocab() tokenizer.special_tokens = set(tokenizer.all_special_tokens) @@ -65,6 +71,8 @@ def convert_token_to_string(token: str) -> str: def change_decoder( decoder: Callable[[List[int]], str] ) -> Callable[[List[int]], List[str]]: + """Sync vLLM's decoder with the outlines expectations by returning a list""" + def new_decoder(inp_tokens: List[int]) -> List[str]: return [decoder(inp_tokens)] @@ -72,6 +80,7 @@ def new_decoder(inp_tokens: List[int]) -> List[str]: tokenizer.convert_token_to_string = convert_token_to_string tokenizer.decode = change_decoder(tokenizer.decode) + setattr(tokenizer, "_outlines_adapted", True) return tokenizer