LCEL with LLamaCPP #125

Open
ahuang11 opened this issue Jan 25, 2024 · 1 comment
ahuang11 commented Jan 25, 2024

"""
Demonstrates how to use the `ChatInterface` to create a chatbot using
[LangChain Expression Language](https://python.langchain.com/docs/expression_language/) (LCEL)
with streaming and memory.
"""

from operator import itemgetter

import panel as pn
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
from langchain.memory import ConversationTokenBufferMemory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.llms.llamacpp import LlamaCpp

pn.extension()

TOKENIZER_REPO_ID = "HuggingFaceH4/zephyr-7b-beta"
REPO_ID = "TheBloke/zephyr-7B-beta-GGUF"
FILENAME = "zephyr-7b-beta.Q5_K_M.gguf"
SYSTEM_PROMPT = "Be a helpful chatbot."
PROMPT_TEMPLATE = """
<|system|>
{system_prompt}</s>
{chat_history}
<|user|>
{user_input}</s>
<|assistant|>
""".strip()
ROLE_MAPPING = {
    "system": "system",
    "human": "user",
    "ai": "assistant",
}


def load_llm(repo_id: str = REPO_ID, filename: str = FILENAME, **kwargs):
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    llm = LlamaCpp(model_path=model_path, **kwargs)
    return llm


def callback(contents: str, user: str, instance: pn.chat.ChatInterface):
    """Stream the chain's tokens into the chat, then save the turn to memory."""
    message = ""
    inputs = {"user_input": contents}
    for token in chain.stream(inputs):
        message += token
        yield message
    memory.save_context(inputs, {"output": message})


def apply_chat_template_to_history(history):
    """Render prior messages with the tokenizer's chat template for the prompt."""
    conversation = [
        {"role": ROLE_MAPPING[message.type], "content": message.content}
        for message in history["chat_history"]
    ]
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO_ID)
    chat_history = tokenizer.apply_chat_template(conversation, tokenize=False)
    return chat_history


llm = load_llm(
    streaming=True,
    n_gpu_layers=1,
    temperature=0.75,
    max_tokens=1024,
    n_ctx=8192,
    top_p=1,
)
memory = ConversationTokenBufferMemory(
    return_messages=True,
    llm=llm,
    memory_key="chat_history",
    max_token_limit=8192 - 1024,
)
prompt = PromptTemplate.from_template(
    PROMPT_TEMPLATE, partial_variables={"system_prompt": SYSTEM_PROMPT}
)

output_parser = StrOutputParser()
chain = (
    RunnablePassthrough.assign(
        chat_history=RunnableLambda(memory.load_memory_variables)
        | itemgetter("chat_history")
    )
    | RunnablePassthrough.assign(chat_history=apply_chat_template_to_history)
    | prompt
    | llm
    | output_parser
)

chat_interface = pn.chat.ChatInterface(
    pn.chat.ChatMessage(
        "Ask me anything and Zephyr will do its best to help!", user="System"
    ),
    callback=callback,
    callback_user="Zephyr",
    callback_exception="verbose",
)
chat_interface.servable()
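
To try this locally: save the script as, say, `app.py` (the filename is arbitrary) and launch it with `panel serve app.py`, since `chat_interface.servable()` marks the interface for Panel's server. The imports assume `panel`, `transformers`, `huggingface_hub`, `llama-cpp-python`, `langchain`, and `langchain-community` are installed.
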
ahuang11 commented

Agent:

"""
Demonstrates how to use the `ChatInterface` to create a chatbot using
[LangChain Expression Language](https://python.langchain.com/docs/expression_language/) (LCEL)
with streaming and memory.
"""

import re
from operator import itemgetter

import panel as pn
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
from langchain.memory import ConversationTokenBufferMemory
from langchain.agents.output_parsers import XMLAgentOutputParser
from langchain.agents import AgentExecutor
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.llms.llamacpp import LlamaCpp
from langchain_community.tools.ddg_search.tool import DuckDuckGoSearchRun
from langchain_community.tools.sleep.tool import SleepTool
from langchain_core.agents import AgentFinish

pn.extension()

TOKENIZER_REPO_ID = "HuggingFaceH4/zephyr-7b-beta"
REPO_ID = "TheBloke/zephyr-7B-beta-GGUF"
FILENAME = "zephyr-7b-beta.Q5_K_M.gguf"
SYSTEM_PROMPT = (
    "You are a friendly chat bot that tries its best to answer questions from the user."
)
PROMPT_TEMPLATE = """
<|system|>
{system_prompt}

If needed, you have access to these tools; use them!
'{tools}'

In order to use a tool, you can use <tool></tool> and <tool_input></tool_input> XML tags; no need to wrap in code fence. You will then get back a response in the form <observation></observation>

For example, if you have a tool called 'search' that could run a google search, in order to search for the weather in SF you would respond:
'<tool>search</tool><tool_input>weather in SF</tool_input>'
'<observation>64 degrees</observation>'

Finally, always respond with a final answer between <final_answer></final_answer>, especially if you don't know how to respond. For example:
'<final_answer>The weather in SF is 64 degrees</final_answer>'

Here's the previous conversation history:</s>
{chat_history}

<|user|>
{user_input}</s>

<|assistant|>
{agent_scratchpad}
""".strip()

ROLE_MAPPING = {
    "system": "system",
    "human": "user",
    "ai": "assistant",
}


class CustomXMLAgentOutputParser(XMLAgentOutputParser):
    """Tolerant parser: if the model's XML can't be parsed, treat the raw text as the final answer."""

    def parse(self, text):
        if "</s>" not in text:
            text += "</s>"
        try:
            return super().parse(text)
        except ValueError:
            return AgentFinish(return_values={"output": text}, log=text)


def load_llm(repo_id: str = REPO_ID, filename: str = FILENAME, **kwargs):
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    llm = LlamaCpp(model_path=model_path, **kwargs)
    return llm


def convert_intermediate_steps(intermediate_steps):
    log = ""
    for action, observation in intermediate_steps:
        log += (
            f"<tool>{action.tool}</tool><tool_input>{action.tool_input}"
            f"</tool_input><observation>{observation}</observation>"
        )
    return log


def convert_tools(tools):
    return "\n".join([f"{tool.name}: {tool.description}" for tool in tools])


def fix_output(output):
    """Repair the model's XML so the agent output parser can handle it."""
    # close any tag the model terminated with </s> instead of a matching closing tag
    pattern = r"<([^/][^>]*)>(.*?)</s>"
    fixed_output = re.sub(pattern, r"<\1>\2</\1>", output)

    # strip language hints from code fences
    pattern = r"```[^\n]*"
    final_output = re.sub(pattern, "```", fixed_output)

    if "</s>" not in final_output:
        final_output += "</s>"
    return final_output


def apply_chat_template_to_history(history):
    """Render prior messages with the tokenizer's chat template for the prompt."""
    conversation = [
        {"role": ROLE_MAPPING[message.type], "content": message.content}
        for message in history["chat_history"]
    ]
    if not conversation:
        # nothing to render on the first turn
        return "N/A"
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO_ID)
    chat_history = tokenizer.apply_chat_template(conversation, tokenize=False)
    return chat_history


def stream_response(contents: str, user: str, instance: pn.chat.ChatInterface):
    """Run the agent on the user's input and persist the turn in memory."""
    # AgentExecutor.stream yields intermediate steps rather than raw tokens,
    # so the final answer is returned in one piece instead of streamed.
    inputs = {"user_input": contents}
    message = agent_executor.invoke(inputs)["output"]
    memory.save_context(inputs, {"output": message})
    return message


# initialize the LLM
llm = load_llm(
    streaming=True,
    use_mlock=True,
    n_gpu_layers=1,
    temperature=0.75,
    max_tokens=2048,
    n_ctx=8192,
)
llm.client.verbose = False

# define the chain's links
memory = ConversationTokenBufferMemory(
    return_messages=True,
    llm=llm,
    memory_key="chat_history",
    max_token_limit=8192 - 1024,
)
memory_link = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables)
    | itemgetter("chat_history")
) | RunnablePassthrough.assign(chat_history=apply_chat_template_to_history)

tools = [DuckDuckGoSearchRun(), SleepTool()]
prompt_link = PromptTemplate.from_template(
    PROMPT_TEMPLATE,
    partial_variables={"system_prompt": SYSTEM_PROMPT, "tools": convert_tools(tools)},
)

agent = (
    {
        "user_input": lambda x: x["user_input"],
        "agent_scratchpad": lambda x: convert_intermediate_steps(
            x["intermediate_steps"]
        ),
    }
    | memory_link
    | prompt_link
    | llm.bind(stop=["</tool_input>", "</final_answer>"])
    | RunnableLambda(fix_output)
    | CustomXMLAgentOutputParser()
)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

# initialize the chat interface
chat_interface = pn.chat.ChatInterface(
    pn.chat.ChatMessage(
        "Ask me anything and Zephyr will do its best to help!", user="System"
    ),
    callback=stream_response,
    callback_user="Zephyr",
    callback_exception="verbose",
)
chat_interface.servable()
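
This agent variant is served the same way (e.g. `panel serve app.py`). As a rough check, a prompt that needs fresh information, such as "What's the weather in San Francisco today?", should produce a `<tool>...</tool><tool_input>...</tool_input>` step that routes through the DuckDuckGo search tool before a `<final_answer>` is returned. This assumes the search tool has network access and that the 7B model follows the XML format reliably, which is not guaranteed.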
