39 changes: 35 additions & 4 deletions CMakeLists.txt
@@ -44,6 +44,8 @@ if (BUILD_TEST)
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
GIT_TAG v3.8.0
GIT_SHALLOW ON
GIT_PROGRESS TRUE
USES_TERMINAL_DOWNLOAD TRUE
EXCLUDE_FROM_ALL
)
FetchContent_MakeAvailable(Catch2)
@@ -53,8 +55,10 @@ endif()
FetchContent_Declare(
repo-cutlass
GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
GIT_TAG v3.9.2
GIT_SHALLOW ON
GIT_PROGRESS TRUE
USES_TERMINAL_DOWNLOAD TRUE
EXCLUDE_FROM_ALL
)

@@ -66,13 +70,38 @@ FetchContent_MakeAvailable(repo-cutlass)
FetchContent_Declare(
yaml-cpp
GIT_REPOSITORY https://github.com/jbeder/yaml-cpp.git
GIT_TAG 0.8.0
GIT_PROGRESS TRUE
USES_TERMINAL_DOWNLOAD TRUE
PATCH_COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/yaml-cpp_cmake_policy.patch
UPDATE_DISCONNECTED 1
)
set(YAML_BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library of yaml-cpp")
FetchContent_MakeAvailable(yaml-cpp)

FetchContent_Declare(
xgrammar
GIT_REPOSITORY https://github.com/mlc-ai/xgrammar.git
GIT_TAG v0.1.25
GIT_SUBMODULES "3rdparty/dlpack"
GIT_PROGRESS TRUE
USES_TERMINAL_DOWNLOAD TRUE
UPDATE_DISCONNECTED 1
)

FetchContent_GetProperties(xgrammar)
if(NOT xgrammar_POPULATED)
# Fetch the content using previously declared details
FetchContent_Populate(xgrammar)

file(WRITE ${xgrammar_SOURCE_DIR}/config.cmake "set(XGRAMMAR_BUILD_PYTHON_BINDINGS OFF)\n")
if(NOT MSVC)
file(APPEND ${xgrammar_SOURCE_DIR}/config.cmake "set(CMAKE_CXX_FLAGS \"-Wno-error\")\n")
endif()

# Bring the populated content into the build
add_subdirectory(${xgrammar_SOURCE_DIR} ${xgrammar_BINARY_DIR})
endif()

# the environment variable
# ASAN_OPTIONS=protect_shadow_gap=0,intercept_tls_get_addr=0
@@ -266,7 +295,9 @@ add_subdirectory(src)
if (BUILD_PY_FFI)
if (CALL_FROM_SETUP_PY)
install(TARGETS _turbomind DESTINATION ${CMAKE_INSTALL_PREFIX})
install(TARGETS _xgrammar DESTINATION ${CMAKE_INSTALL_PREFIX})
else()
install(TARGETS _turbomind DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
install(TARGETS _xgrammar DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
endif()
endif ()
3 changes: 2 additions & 1 deletion debug.sh
@@ -1,4 +1,4 @@
#!/bin/sh
#!/bin/bash -e

builder="-G Ninja"

@@ -15,4 +15,5 @@ cmake ${builder} .. \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON \
-DPYTHON_EXECUTABLE=$(which python3) \
-DFETCHCONTENT_QUIET=OFF \
-DBUILD_TEST=ON
1 change: 0 additions & 1 deletion docker/prepare_wheel.sh
@@ -17,7 +17,6 @@ if [[ ${PYTHON_VERSION} = "3.13" ]]; then

pip install setuptools_rust
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/google/[email protected]#subdirectory=python"
pip wheel -v --no-build-isolation --no-deps -w /wheels --use-deprecated=legacy-resolver outlines_core==0.1.26
fi

if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
3 changes: 2 additions & 1 deletion generate.sh
@@ -14,4 +14,5 @@ cmake ${builder} .. \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON
-DUSE_NVTX=ON \
-DFETCHCONTENT_QUIET=OFF
2 changes: 1 addition & 1 deletion lmdeploy/messages.py
@@ -63,7 +63,7 @@ class GenerationConfig:
around special tokens. The behavior of Fast tokenizers is to have
this to False. This is setup to True in slow tokenizers.
logprobs (int): Number of log probabilities to return per output token.
response_format (Dict): Only pytorch backend support formatting
response_format (Dict): Generate responses according to the given format.
response. Examples:
{
"type": "json_schema",
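For reference, a minimal sketch of how a caller might pass `response_format` for JSON-schema guided decoding. This is not code from this PR: the schema contents, the `json_schema`/`name` payload keys, and the model settings are illustrative assumptions following the OpenAI-style shape shown in the docstring above.

```python
from lmdeploy import GenerationConfig

# Illustrative JSON schema (assumption): constrain output to a small object.
person_schema = {
    'type': 'object',
    'properties': {'name': {'type': 'string'}, 'age': {'type': 'integer'}},
    'required': ['name', 'age'],
}

# Assumption: response_format follows the OpenAI-style layout sketched in the
# GenerationConfig docstring ("type" plus a matching payload key).
gen_config = GenerationConfig(
    max_new_tokens=256,
    response_format={
        'type': 'json_schema',
        'json_schema': {'name': 'person', 'schema': person_schema},
    },
)
```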
178 changes: 52 additions & 126 deletions lmdeploy/pytorch/engine/guided_process.py
@@ -1,161 +1,87 @@
# Copyright 2024- the Outlines developers
# This file is adapted from
# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from collections import defaultdict
from typing import DefaultDict, Dict, List, Union

from outlines.fsm.guide import CFGGuide, Generate, RegexGuide, Write
from outlines.fsm.json_schema import build_regex_from_schema
from pydantic import BaseModel

# Copyright (c) OpenMMLab. All rights reserved.
import copy
import json
import logging
from functools import lru_cache
from typing import Optional

import torch
import xgrammar as xgr
from transformers import PreTrainedTokenizerBase

logger = logging.getLogger('guided_process')

class BaseLogitsProcessor:

    def init_state(self):
        """Initialize the FSM states."""
        self.fsm_state: DefaultDict[int, int] = defaultdict(int)

    def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor:
        """Use the FSM to bias the logits before sampling the next token."""

        seq_id = hash(tuple(input_ids))

        if len(input_ids) == 0:
            self.init_state()
        else:
            last_token = input_ids[-1]
            last_seq_id = hash(tuple(input_ids[:-1]))
            self.fsm_state[seq_id] = self.fsm.get_next_state(state=self.fsm_state[last_seq_id], token_id=last_token)

        instruction = self.fsm.get_next_instruction(self.fsm_state[seq_id])

        if type(instruction) == Generate:
            allowed_tokens = instruction.tokens
        elif type(instruction) == Write:
            # TODO: support fast forward tokens
            allowed_tokens = [instruction.tokens[0]]
        else:
            raise TypeError(f'Unsupported instruction type {type(instruction)}')

        mask = torch.full((scores.shape[-1], ), -math.inf, device=scores.device)
        mask[allowed_tokens] = 0
        scores.add_(mask)

    def adapt_tokenizer(self, tokenizer):
        """Adapt tokenizer to use to compile the FSM.

        The API of Outlines tokenizers is slightly different to that of `transformers`. In addition we need to handle
        the missing spaces to Llama's tokenizer to be able to compile FSMs for this model.
        """
        from outlines.integrations.utils import adapt_tokenizer
        tokenizer = adapt_tokenizer(tokenizer)
        # vocab size greater than logits shape because of '[UNUSED_TOKEN_...]'
        if hasattr(tokenizer, '_tokenizer'):
            tokenizer.vocabulary = tokenizer._tokenizer.get_vocab(with_added_tokens=False)
        return tokenizer


class BaseLogitsProcessor:
    """Base logits processor that uses xgrammar matcher for guided decoding."""

    def __init__(self, compiled_grammar: xgr.CompiledGrammar, tokenizer_info: xgr.TokenizerInfo):
        self.matcher = xgr.GrammarMatcher(compiled_grammar, terminate_without_stop_token=True)
        self.token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)

    def process(self, scores: torch.Tensor) -> torch.Tensor:
        """Apply grammar constraints to logits before sampling the next
        token."""
        self.matcher.fill_next_token_bitmask(self.token_bitmask)
        xgr.apply_token_bitmask_inplace(scores, self.token_bitmask.to(scores.device))
        return scores

    def accept(self, token_id: int) -> bool:
        """Update matcher state after a token is generated."""
        return self.matcher.accept_token(token_id)

    def reset(self):
        """Reset matcher state for next generation."""
        self.matcher.reset()


class RegexLogitsProcessor(BaseLogitsProcessor):
    """Regex-guided logits processor using xgrammar."""

    def __init__(self, regex_string: str, tokenizer):
        """Compile the FSM that drives the regex-structured generation.

        Args:
            regex_string: A string that represents a regular expression
            tokenizer: The model's tokenizer
        """
        tokenizer = self.adapt_tokenizer(copy.deepcopy(tokenizer))
        fsm = RegexGuide(regex_string, tokenizer)
        self.fsm = fsm

    def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase, vocab_size_padded: Optional[int] = None):
        tokenizer = copy.deepcopy(tokenizer)
        if vocab_size_padded is None:
            vocab_size_padded = tokenizer.vocab_size

        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size_padded)

        compiler = xgr.GrammarCompiler(tokenizer_info)
        compiled = compiler.compile_regex_grammar(regex_string)

        super().__init__(compiled, tokenizer_info)


class JSONLogitsProcessor(RegexLogitsProcessor):

    def __init__(self, schema: Union[str, Dict, BaseModel], tokenizer):
        """Compile the FSM that drives the JSON-guided generation.

        Args:
            schema: A str schema that encodes the structure we want the model
                to generate
            tokenizer: The model's tokenizer
        """
        regex_string = build_regex_from_schema(schema)
        super().__init__(regex_string, tokenizer)


class CFGLogitsProcessor(BaseLogitsProcessor):

    def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
        """Compile the FSM that drives the context free grammar generation.

        Parameters
        ----------
        cfg
            A string that represents a context-free grammar
        tokenizer
            The model's tokenizer
        """
        tokenizer = self.adapt_tokenizer(tokenizer)
        fsm = CFGGuide(cfg, tokenizer)
        self.fsm = fsm


# copied from https://github.com/vllm-project/vllm/blob/a7f65c2be93f491771aca31106f790bf381c0bad/vllm/model_executor/guided_decoding/outlines_decoding.py#L31 # noqa
JSON_GRAMMAR = r"""
?start: object | array

?value: object
      | array
      | UNESCAPED_STRING
      | SIGNED_NUMBER -> number
      | "true" -> true
      | "false" -> false
      | "null" -> null

array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : UNESCAPED_STRING ":" value

%import common.UNESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
"""


class JSONLogitsProcessor(BaseLogitsProcessor):
    """JSON-schema guided logits processor using xgrammar."""

    def __init__(self, schema: str, tokenizer: PreTrainedTokenizerBase, vocab_size_padded: Optional[int] = None):
        tokenizer = copy.deepcopy(tokenizer)
        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size_padded)
        if vocab_size_padded is None:
            vocab_size_padded = tokenizer.vocab_size

        compiler = xgr.GrammarCompiler(tokenizer_info)
        if isinstance(schema, str):
            schema = json.loads(schema)

        assert isinstance(schema, dict)
        compiled = compiler.compile_json_schema(schema)

        super().__init__(compiled, tokenizer_info)


@lru_cache(maxsize=32)
def _get_guided_logits_processor(guide: str, tokenizer: PreTrainedTokenizerBase, type: str):
def _get_guided_logits_processor(guide: str,
                                 tokenizer: PreTrainedTokenizerBase,
                                 type: str,
                                 vocab_size_padded: Optional[int] = None):
    try:
        if type == 'json_object':
            return CFGLogitsProcessor(guide, tokenizer)
        elif type == 'json_schema':
            return JSONLogitsProcessor(guide, tokenizer)
        if type == 'json_schema':
            return JSONLogitsProcessor(guide, tokenizer, vocab_size_padded)
        elif type == 'regex_schema':
            return RegexLogitsProcessor(guide, tokenizer)
            return RegexLogitsProcessor(guide, tokenizer, vocab_size_padded)
        else:
            return None
    except Exception as e:
        from lmdeploy.utils import get_logger
        logger = get_logger('lmdeploy')
        logger.error(e)
        return None
        raise
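To make the new interface concrete, here is a minimal decoding-loop sketch showing how a caller could drive the xgrammar-backed processors: `process()` masks the logits, the sampled token is fed back through `accept()`, and `reset()` clears the matcher for the next request. This is an assumption about usage, not code from this PR; the tokenizer name, the toy schema, the random-logits stand-in for the model's forward pass, and the greedy sampling are all illustrative.

```python
import json

import torch
from transformers import AutoTokenizer

from lmdeploy.pytorch.engine.guided_process import _get_guided_logits_processor

# Illustrative inputs: the tokenizer, schema, and vocab padding are assumptions.
tokenizer = AutoTokenizer.from_pretrained('internlm/internlm2_5-7b-chat')
schema_str = json.dumps({'type': 'object', 'properties': {'answer': {'type': 'string'}}})
vocab_size_padded = len(tokenizer)  # assume logits are as wide as the full tokenizer vocab

processor = _get_guided_logits_processor(schema_str, tokenizer, 'json_schema', vocab_size_padded)

# Toy greedy loop with random logits standing in for the engine's forward pass.
generated = []
for _ in range(64):
    logits = torch.randn(1, vocab_size_padded)
    logits = processor.process(logits)            # mask tokens the grammar disallows
    next_token = int(torch.argmax(logits, dim=-1))
    assert processor.accept(next_token)           # advance the xgrammar matcher state
    generated.append(next_token)
    if processor.matcher.is_terminated():         # grammar fully matched (a complete JSON object)
        break

print(tokenizer.decode(generated))
processor.reset()                                 # matcher is ready for the next request
```

Because the bitmask zeroes out every token the grammar disallows, even random logits can only ever produce a string that the compiled JSON schema accepts.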