
Commit

Simplify the transformers and llamacpp interfaces
rlouf committed Mar 5, 2024
1 parent 0488ad2 commit a62ff00
Showing 11 changed files with 134 additions and 60 deletions.
37 changes: 33 additions & 4 deletions docs/reference/models/llamacpp.md
@@ -4,12 +4,41 @@

You need to install the `llama-cpp-python` library to be able to use these models in Outlines.

Outlines provides an integration with [Llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python library](https://github.com/abetlen/llama-cpp-python). Llamacpp allows to run quantized models on machines with limited compute.
Outlines provides an integration with [Llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python library][llamacpp]. Llama.cpp makes it possible to run quantized models on machines with limited compute.

Assuming [Phi2's weights](https://huggingface.co/TheBloke/phi-2-GGUF) are in the current directory:
You can initialize the model by passing the path to the weights on your machine. Assuming [Phi2's weights](https://huggingface.co/TheBloke/phi-2-GGUF) are in the current directory:

```python
from outlines import models, generate
from outlines import models

model = models.llamacpp("./phi-2.Q4_K_M.gguf")
model = models.llamacpp("./phi-2.Q4_K_M.gguf", device="cuda")
```

If you need more control, you can pass the same keyword arguments to the model as you would in the [llama-cpp-python library][llamacpp]:

```python
from outlines import models

model = models.llamacpp(
"./phi-2.Q4_K_M.gguf",
n_gpu_layers=-1, # to use GPU acceleration
seed=1337, # to set a specific seed
)
```

Please see the [llama-cpp-python documentation](https://llama-cpp-python.readthedocs.io/) for a list of available keyword arguments. Finally, if for some reason you would like to initialize `llama_cpp.Llama` separately, you can convert it to an Outlines model using:

```python
from llama_cpp import Llama
from outlines import models

llm = Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q8_0.gguf",
verbose=False
)
model = models.LlamaCpp(llm)
```
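
Once wrapped, the model works with Outlines' generation functions like any other backend. A minimal sketch, assuming the Phi-2 weights from above (the prompt is only illustrative):

```python
from llama_cpp import Llama
from outlines import models, generate

# Assumes ./phi-2.Q4_K_M.gguf is in the current directory
llm = Llama("./phi-2.Q4_K_M.gguf")
model = models.LlamaCpp(llm)

# Unconstrained text generation with the wrapped model
generator = generate.text(model)
answer = generator("Question: What is the capital of France?\nAnswer:")
print(answer)
```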


[llamacpp]: https://github.com/abetlen/llama-cpp-python
29 changes: 29 additions & 0 deletions docs/reference/models/transformers.md
@@ -0,0 +1,29 @@
# transformers


!!! Installation

You need to install the `transformers` and `datasets` libraries to be able to use these models in Outlines.


Outlines provides an integration with the `torch` implementation of causal models in the [transformers][transformers] library. You can initialize the model by passing its name:

```python
from outlines import models

model = models.transformers("mistralai/Mistral-7B-v0.1", device="cuda")
```

If you need more fine-grained control, you can also initialize the model and tokenizer separately:


```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from outlines import models

llm = AutoModelForCausalLM.from_pretrained("gpt2", output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = models.Transformers(llm, tokenizer)
```
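
Either way of initializing the model yields an object that plugs into Outlines' generation functions. A minimal sketch using `generate.regex` to constrain the output to a simple pattern (the prompt and pattern are only illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from outlines import models, generate

llm = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = models.Transformers(llm, tokenizer)

# Constrain generation to a four-digit year
generator = generate.regex(model, r"[0-9]{4}")
year = generator("The French Revolution started in the year ")
print(year)
```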

[transformers]: https://github.com/huggingface/transformers
4 changes: 2 additions & 2 deletions outlines/models/__init__.py
@@ -13,6 +13,6 @@
from .mamba import Mamba, mamba
from .openai import OpenAI, openai
from .openai_compatible import OpenAICompatibleAPI, openai_compatible_api
from .transformers import Transformer, transformers
from .transformers import Transformers, transformers

LogitsGenerator = Union[Transformer, LlamaCpp, ExLlamaV2Model, Mamba]
LogitsGenerator = Union[Transformers, LlamaCpp, ExLlamaV2Model, Mamba]
12 changes: 8 additions & 4 deletions outlines/models/exllamav2.py
@@ -21,7 +21,7 @@ def __init__(
):
self.device = device
self.model = model
self.tokenizer = tokenizer
self.tokenizer = TransformerTokenizer(tokenizer)
self.cache = cache
self.past_seq = None

@@ -75,20 +75,21 @@ def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor:


def exl2(
model_name: str,
model_path: str,
device: Optional[str] = None,
model_kwargs: dict = {},
tokenizer_kwargs: dict = {},
):
try:
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config
from transformers import AutoTokenizer
except ImportError:
raise ImportError(
"The `exllamav2` library needs to be installed in order to use `exllamav2` models."
)

config = ExLlamaV2Config()
config.model_dir = model_name
config.model_dir = model_path
config.prepare()

config.max_seq_len = model_kwargs.pop("max_seq_len", config.max_seq_len)
@@ -108,7 +109,10 @@ def exl2(
split = [float(alloc) for alloc in model_kwargs["gpu_split"].split(",")]

model.load(split)
tokenizer = TransformerTokenizer(model_name, **tokenizer_kwargs)

tokenizer_kwargs.setdefault("padding_side", "left")
tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs)

cache = ExLlamaV2Cache(model)

return ExLlamaV2Model(model, tokenizer, device, cache)
32 changes: 20 additions & 12 deletions outlines/models/llamacpp.py
@@ -1,12 +1,15 @@
import math
from typing import List, Optional, Union
from typing import TYPE_CHECKING, List, Optional, Union

import numpy as np
import torch
from numpy.typing import NDArray

from outlines.fsm.fsm import CFGFSM, FSM, FSMState, RegexFSM

if TYPE_CHECKING:
from llama_cpp import Llama


class LlamaSequenceGenerator:
def __init__(
@@ -87,34 +90,39 @@ def stream(
class LlamaCpp:
"""Represents a `llama_cpp` model."""

def __init__(self, model_path, **kwargs):
from llama_cpp import Llama

self.model = Llama(model_path, **kwargs)
self.tokenizer = LlamaCppTokenizer(self)
def __init__(self, model: "Llama", **kwargs):
self.model = model
self.tokenizer = LlamaCppTokenizer(model)


class LlamaCppTokenizer:
def __init__(self, model, **kwargs):
self.eos_token_id = model.model.token_eos()
self.eos_token_id = model.token_eos()
self.pad_token_id = self.eos_token_id
self.special_tokens = {}

self.vocabulary = {}
for t in range(model.model.n_vocab()):
token_piece = model.model.tokenizer().decode([t])
for t in range(model.n_vocab()):
token_piece = model.tokenizer().decode([t])
self.vocabulary[token_piece] = t

def convert_token_to_string(self, token: str) -> str:
return token


def llamacpp(
model_name: str,
model_path: str,
device: Optional[str] = None,
model_kwargs: dict = {},
**model_kwargs,
):
return LlamaCpp(model_name, **model_kwargs)
from llama_cpp import Llama

if device == "cuda":
model_kwargs.setdefault("n_gpu_layers", -1)

model = Llama(model_path, **model_kwargs)

return LlamaCpp(model)


class LogitsProcessor:
7 changes: 5 additions & 2 deletions outlines/models/mamba.py
@@ -20,7 +20,7 @@ def __init__(
):
self.device = device
self.model = model
self.tokenizer = tokenizer
self.tokenizer = TransformerTokenizer(tokenizer)

def forward(self, input_ids: torch.LongTensor, *_):
"""Compute a forward pass through the mamba model."""
@@ -41,6 +41,7 @@ def mamba(
):
try:
from mamba_ssm import MambaLMHeadModel
from transformers import AutoTokenizer
except ImportError:
raise ImportError(
"The `mamba_ssm` library needs to be installed in order to use Mamba people."
@@ -53,6 +54,8 @@ def mamba(
device = "cuda"

model = MambaLMHeadModel.from_pretrained(model_name, device=device)
tokenizer = TransformerTokenizer(TOKENIZER_MODEL, **tokenizer_kwargs)

tokenizer_kwargs.setdefault("padding_side", "left")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL, **tokenizer_kwargs)

return Mamba(model, tokenizer, device)
34 changes: 11 additions & 23 deletions outlines/models/transformers.py
@@ -5,7 +5,7 @@
from outlines.models.tokenizer import Tokenizer

if TYPE_CHECKING:
from transformers import PreTrainedModel, PreTrainedTokenizerBase
from transformers import PreTrainedModel, PreTrainedTokenizer

__all__ = ["transformers"]

@@ -58,22 +58,8 @@ class CodeLlamaTokenizerFast: # type: ignore
class TransformerTokenizer(Tokenizer):
"""Represents a tokenizer for models in the `transformers` library."""

def __init__(
self, tokenizer_or_model_name: Union["PreTrainedTokenizerBase", str], **kwargs
):
if isinstance(tokenizer_or_model_name, str):
from transformers import AutoTokenizer

kwargs.setdefault("padding_side", "left")
self.model_name = tokenizer_or_model_name
# TODO: Do something to make this hashable?
self.kwargs = kwargs
self.tokenizer = AutoTokenizer.from_pretrained(
tokenizer_or_model_name, **kwargs
)
else:
self.tokenizer = tokenizer_or_model_name

def __init__(self, tokenizer: "PreTrainedTokenizer", **kwargs):
self.tokenizer = tokenizer
self.eos_token_id = self.tokenizer.eos_token_id
self.eos_token = self.tokenizer.eos_token

@@ -129,17 +115,17 @@ def __hash__(self):
return hash(Hasher.hash(self.tokenizer))


class Transformer:
class Transformers:
"""Represents a `transformers` model."""

def __init__(
self,
model: "PreTrainedModel",
tokenizer: TransformerTokenizer,
tokenizer: "PreTrainedTokenizer",
):
self.device = model.device
self.model = model
self.tokenizer = tokenizer
self.tokenizer = TransformerTokenizer(tokenizer)

@torch.inference_mode
def forward(
@@ -221,7 +207,7 @@ def transformers(
"""
try:
from transformers import AutoModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer
except ImportError:
raise ImportError(
"The `transformers` library needs to be installed in order to use `transformers` models."
@@ -231,6 +217,8 @@
model_kwargs["device_map"] = device

model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
tokenizer = TransformerTokenizer(model_name, **tokenizer_kwargs)

return Transformer(model, tokenizer)
tokenizer_kwargs.setdefault("padding_side", "left")
tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)

return Transformers(model, tokenizer)
4 changes: 3 additions & 1 deletion tests/benchmark/conftest.py
@@ -1,12 +1,14 @@
import pytest
from transformers import AutoTokenizer

from outlines.fsm.fsm import RegexFSM
from outlines.models.transformers import TransformerTokenizer


@pytest.fixture
def tokenizer():
return TransformerTokenizer("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
return TransformerTokenizer(tokenizer)


@pytest.fixture
7 changes: 5 additions & 2 deletions tests/fsm/test_regex.py
@@ -1,6 +1,7 @@
import interegular
import numba
import pytest
from transformers import AutoTokenizer

from outlines.fsm.regex import (
_walk_fsm,
@@ -272,7 +273,8 @@ def test_create_fsm_index_tokenizer():
num_fsm_states = len(regex_fsm.states)
assert num_fsm_states == 220

tokenizer = TransformerTokenizer("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = TransformerTokenizer(tokenizer)

states_to_token_subsets, empty_token_ids = create_fsm_index_tokenizer(
regex_fsm, tokenizer
@@ -295,7 +297,8 @@ def test_regex_index_performance():
num_fsm_states = len(regex_fsm.states)
assert num_fsm_states == 220

tokenizer = TransformerTokenizer("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = TransformerTokenizer(tokenizer)

# Pre-compile Numba functions
res, _ = create_fsm_index_tokenizer(regex_fsm, tokenizer)
12 changes: 7 additions & 5 deletions tests/generate/test_integration_transformers.py
@@ -10,7 +10,7 @@
import outlines.generate as generate
import outlines.models as models
from outlines.fsm.regex import reduced_vocabulary
from outlines.models.transformers import Transformer, TransformerTokenizer
from outlines.models.transformers import Transformers, TransformerTokenizer
from outlines.samplers import beam_search, multinomial


@@ -567,8 +567,11 @@ def test_transformers_json_custom_ws():


def test_transformers_reduced_vocabulary_caching():
tokenizer = TransformerTokenizer("gpt2")
tokenizer2 = TransformerTokenizer("gpt2")
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = TransformerTokenizer(hf_tokenizer)
tokenizer2 = TransformerTokenizer(hf_tokenizer)

# TODO: We might actually want only one copy of a given tokenizer.
assert tokenizer is not tokenizer2
@@ -626,7 +629,6 @@ def test_transformers_use_existing_model_and_tokenizer():
model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = TransformerTokenizer(hf_tokenizer)
model = Transformer(hf_model, tokenizer)
model = Transformers(hf_model, hf_tokenizer)
sequence = generate.text(model)("Write a short sentence ", rng=rng)
assert isinstance(sequence, str)