
Exllamav2 Integration #1010

Open
isamu-isozaki wants to merge 3 commits into main

Conversation

isamu-isozaki
Contributor

This fixes #1009

The tests I did were:

For loading:

from outlines.integrations.exllamav2 import RegexFilter, TextFilter, JSONFilter, ChoiceFilter
import json
import sys  # needed for sys.stdout.flush() in the streaming loops below
import torch
from exllamav2.generator.filters import ExLlamaV2PrefixFilter
from pydantic import BaseModel
from typing import Literal
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Tokenizer,
)

from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler, ExLlamaV2DynamicJob
from transformers import AutoTokenizer
import uuid

repo_id = "../Phi-3-mini-128k-instruct-exl2"
paged = False
model_dir = repo_id
total_context = 8192
max_context = 1024 
max_batch_size = 4 if paged else 1
max_chunk_size = 1024
max_new_tokens = 1024
healing = True
draft_model = None
draft_cache = None
use_ngram_draft = None
use_ngram = None

config = ExLlamaV2Config(model_dir)
config.max_input_len = max_chunk_size
config.max_attention_size = max_chunk_size ** 2

config.max_seq_len = max_context
model = ExLlamaV2(config)

cache = ExLlamaV2Cache_Q4(
    model,
    max_seq_len = total_context,
    lazy = True
)
tokenizer = ExLlamaV2Tokenizer(config)
hf_tokenizer_kwargs = {}
hf_tokenizer_kwargs.setdefault("padding_side", "left")
hf_tokenizer = AutoTokenizer.from_pretrained(model_dir, **hf_tokenizer_kwargs)
model.load_autosplit(cache, progress = True)
generator = ExLlamaV2DynamicGenerator(
    model = model,
    cache = cache,
    draft_model = draft_model,
    draft_cache = draft_cache,
    tokenizer = tokenizer,
    max_batch_size = max_batch_size,
    use_ngram_draft = use_ngram,
    max_chunk_size = max_chunk_size,
    paged = paged,
)

Choices test:

filters = [
    ChoiceFilter(["bob", "fred"], hf_tokenizer)
]
context_ids = torch.empty((1, 0), dtype = torch.long)


instruction = "Who is better bob or fred?"
print()
print("Assistant:", end = "")

instruction_ids = tokenizer.encode(f"[INST] {instruction} [/INST]", add_bos = True)
context_ids = torch.cat([context_ids, instruction_ids], dim = -1)

generator.enqueue(
    ExLlamaV2DynamicJob(
        input_ids = context_ids,
        max_new_tokens = 1024,
        stop_conditions = [],
        filters=filters
    )
)

eos = False
while not eos:
    results = generator.iterate()
    for result in results:
        if result["stage"] == "streaming":
            eos = result["eos"]
            if "text" in result:
                print(result["text"], end="")
                sys.stdout.flush()
            if "token_ids" in result:
                context_ids = torch.cat([context_ids, result["token_ids"]], dim = -1)

print()

Returns

Assistant:bob

JSON test:

class JSONResponse(BaseModel):
    response: str
    confidence: Literal["low", "medium", "high"]
    is_subjective: Literal["no", "yes", "possibly"]
filters = [
    JSONFilter(JSONResponse, hf_tokenizer)
]
context_ids = torch.empty((1, 0), dtype = torch.long)


instruction = f"Give a sample response in the format of {JSONResponse.schema()} on a movie review of love actually"
print()
print("Assistant: ", end = "")

instruction_ids = tokenizer.encode(f"[INST] {instruction} [/INST]", add_bos = True)
context_ids = torch.cat([context_ids, instruction_ids], dim = -1)

generator.enqueue(
    ExLlamaV2DynamicJob(
        input_ids = context_ids,
        max_new_tokens = 1024,
        stop_conditions = [tokenizer.eos_token_id],
        filters=filters
    )
)

eos = False
while not eos:
    results = generator.iterate()
    for result in results:
        if result["stage"] == "streaming":
            eos = result["eos"]
            if "text" in result:
                print(result["text"], end="")
                sys.stdout.flush()
            if "token_ids" in result:
                context_ids = torch.cat([context_ids, result["token_ids"]], dim = -1)

print()

Returns

Assistant: {"response": "Love Actually is a charming and heartwarming romantic comedy that delivers a delightful experience. The performances by the lead actors, especially Drew Barrymore and Gael García Bernal, are genuinely commendable. The film beautifully blends humor with heart-tugging moments, making it an ideal watch for those in search of a feel-good cinematic experience. Despite some predictable plot trends, the overall impact of the film remains largely positive. Rating: 7/10", "confidence": "medium", "is_subjective": "no"}

isamu-isozaki changed the title from "Exllamav2 filter" to "Exllamav2 Integration" on Jun 29, 2024
@isamu-isozaki
Contributor Author

Some questions I had for the maintainers:

  1. Should we handle the prefix logic here? I noticed that some of the exllamav2 filters in their repo ignore the prefix, but one of them does use it.
  2. Do we want to return the stop tokens? Doing so requires checking every allowed token to see which ones lead to a final state, which may be a bit slower (see the sketch below).
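
For context on question 2, a minimal sketch of the check being described, assuming an outlines RegexGuide-style API (get_next_instruction, get_next_state, is_final_state); the helper name is hypothetical and the exact method names in the integration may differ:

# Hypothetical helper illustrating the cost mentioned in question 2: to know
# which allowed tokens end the pattern, each of them has to be advanced
# through the guide and tested for a final state. Assumes an outlines
# RegexGuide-style object; method names may differ in the actual integration.
def tokens_reaching_final_state(guide, state):
    allowed = guide.get_next_instruction(state).tokens
    if allowed is None:  # some instructions place no restriction on the vocabulary
        return []
    return [
        token_id
        for token_id in allowed
        if guide.is_final_state(guide.get_next_state(state, token_id))
    ]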
