LCEL with LLamaCPP #125

Open
ahuang11 opened this issue Jan 25, 2024 · 1 comment
ahuang11 commented Jan 25, 2024

"""
Demonstrates how to use the `ChatInterface` to create a chatbot using
[LangChain Expression Language](https://python.langchain.com/docs/expression_language/) (LCEL)
with streaming and memory.
"""

from operator import itemgetter

import panel as pn
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
from langchain.memory import ConversationTokenBufferMemory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.llms.llamacpp import LlamaCpp

pn.extension()

TOKENIZER_REPO_ID = "HuggingFaceH4/zephyr-7b-beta"
REPO_ID = "TheBloke/zephyr-7B-beta-GGUF"
FILENAME = "zephyr-7b-beta.Q5_K_M.gguf"
SYSTEM_PROMPT = "Be a helpful chatbot."
PROMPT_TEMPLATE = """
<|system|>
{system_prompt}</s>
{chat_history}
<|user|>
{user_input}</s>
<|assistant|>
""".strip()
ROLE_MAPPING = {
    "system": "system",
    "human": "user",
    "ai": "assistant",
}


def load_llm(repo_id: str = REPO_ID, filename: str = FILENAME, **kwargs):
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    llm = LlamaCpp(model_path=model_path, **kwargs)
    return llm


def callback(contents: str, user: str, instance: pn.chat.ChatInterface):
    """Stream the chain's tokens into the chat, then save the turn to memory."""
    message = ""
    inputs = {"user_input": contents}
    for token in chain.stream(inputs):
        message += token
        yield message
    memory.save_context(inputs, {"output": message})


def apply_chat_template_to_history(history):
    """Render prior messages with the tokenizer's chat template for the prompt."""
    conversation = [
        {"role": ROLE_MAPPING[message.type], "content": message.content}
        for message in history["chat_history"]
    ]
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO_ID)
    chat_history = tokenizer.apply_chat_template(conversation, tokenize=False)
    return chat_history


llm = load_llm(
    streaming=True,
    n_gpu_layers=1,
    temperature=0.75,
    max_tokens=1024,
    n_ctx=8192,
    top_p=1,
)
memory = ConversationTokenBufferMemory(
    return_messages=True,
    llm=llm,
    memory_key="chat_history",
    max_token_limit=8192 - 1024,
)
prompt = PromptTemplate.from_template(
    PROMPT_TEMPLATE, partial_variables={"system_prompt": SYSTEM_PROMPT}
)

output_parser = StrOutputParser()
chain = (
    RunnablePassthrough.assign(
        chat_history=RunnableLambda(memory.load_memory_variables)
        | itemgetter("chat_history")
    )
    | RunnablePassthrough.assign(chat_history=apply_chat_template_to_history)
    | prompt
    | llm
    | output_parser
)

chat_interface = pn.chat.ChatInterface(
    pn.chat.ChatMessage(
        "Ask me anything and Zephyr will do its best to help!", user="System"
    ),
    callback=callback,
    callback_user="Zephyr",
    callback_exception="verbose",
)
chat_interface.servable()
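
To try this locally: save the script as, say, `app.py` (the filename is arbitrary) and launch it with `panel serve app.py`, since `chat_interface.servable()` marks the interface for Panel's server. The imports assume `panel`, `transformers`, `huggingface_hub`, `llama-cpp-python`, `langchain`, and `langchain-community` are installed.
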
ahuang11 commented

Agent:

"""
Demonstrates how to use the `ChatInterface` to create a chatbot using
[LangChain Expression Language](https://python.langchain.com/docs/expression_language/) (LCEL)
with streaming and memory.
"""

import re
from operator import itemgetter

import panel as pn
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
from langchain.memory import ConversationTokenBufferMemory
from langchain.agents.output_parsers import XMLAgentOutputParser
from langchain.agents import AgentExecutor
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.llms.llamacpp import LlamaCpp
from langchain_community.tools.ddg_search.tool import DuckDuckGoSearchRun
from langchain_community.tools.sleep.tool import SleepTool
from langchain_core.agents import AgentFinish

pn.extension()

TOKENIZER_REPO_ID = "HuggingFaceH4/zephyr-7b-beta"
REPO_ID = "TheBloke/zephyr-7B-beta-GGUF"
FILENAME = "zephyr-7b-beta.Q5_K_M.gguf"
SYSTEM_PROMPT = (
    "You are a friendly chat bot that tries its best to answer questions from the user."
)
PROMPT_TEMPLATE = """
<|system|>
{system_prompt}

If needed, you have access to these tools; use them!
'{tools}'

In order to use a tool, you can use <tool></tool> and <tool_input></tool_input> XML tags; no need to wrap in code fence. You will then get back a response in the form <observation></observation>

For example, if you have a tool called 'search' that could run a google search, in order to search for the weather in SF you would respond:
'<tool>search</tool><tool_input>weather in SF</tool_input>'
'<observation>64 degrees</observation>'

Finally, always respond with a final answer between <final_answer></final_answer>, especially if you don't know how to respond. For example:
'<final_answer>The weather in SF is 64 degrees</final_answer>'

Here's the previous conversation history:</s>
{chat_history}

<|user|>
{user_input}</s>

<|assistant|>
{agent_scratchpad}
""".strip()

ROLE_MAPPING = {
    "system": "system",
    "human": "user",
    "ai": "assistant",
}


class CustomXMLAgentOutputParser(XMLAgentOutputParser):
    """Tolerant parser: if the model's XML can't be parsed, treat the raw text as the final answer."""

    def parse(self, text):
        if "</s>" not in text:
            text += "</s>"
        try:
            return super().parse(text)
        except ValueError:
            return AgentFinish(return_values={"output": text}, log=text)


def load_llm(repo_id: str = REPO_ID, filename: str = FILENAME, **kwargs):
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    llm = LlamaCpp(model_path=model_path, **kwargs)
    return llm


def convert_intermediate_steps(intermediate_steps):
    log = ""
    for action, observation in intermediate_steps:
        log += (
            f"<tool>{action.tool}</tool><tool_input>{action.tool_input}"
            f"</tool_input><observation>{observation}</observation>"
        )
    return log


def convert_tools(tools):
    return "\n".join([f"{tool.name}: {tool.description}" for tool in tools])


def fix_output(output):
    """Repair the model's XML so the agent output parser can handle it."""
    # close any tag the model terminated with </s> instead of a matching closing tag
    pattern = r"<([^/][^>]*)>(.*?)</s>"
    fixed_output = re.sub(pattern, r"<\1>\2</\1>", output)

    # strip language hints from code fences
    pattern = r"```[^\n]*"
    final_output = re.sub(pattern, "```", fixed_output)

    if "</s>" not in final_output:
        final_output += "</s>"
    return final_output


def apply_chat_template_to_history(history):
    """Render prior messages with the tokenizer's chat template for the prompt."""
    conversation = [
        {"role": ROLE_MAPPING[message.type], "content": message.content}
        for message in history["chat_history"]
    ]
    if not conversation:
        # nothing to render on the first turn
        return "N/A"
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO_ID)
    chat_history = tokenizer.apply_chat_template(conversation, tokenize=False)
    return chat_history


def stream_response(contents: str, user: str, instance: pn.chat.ChatInterface):
    """Run the agent on the user's input and persist the turn in memory."""
    # AgentExecutor.stream yields intermediate steps rather than raw tokens,
    # so the final answer is returned in one piece instead of streamed.
    inputs = {"user_input": contents}
    message = agent_executor.invoke(inputs)["output"]
    memory.save_context(inputs, {"output": message})
    return message


# initialize the LLM
llm = load_llm(
    streaming=True,
    use_mlock=True,
    n_gpu_layers=1,
    temperature=0.75,
    max_tokens=2048,
    n_ctx=8192,
)
llm.client.verbose = False

# define the chain's links
memory = ConversationTokenBufferMemory(
    return_messages=True,
    llm=llm,
    memory_key="chat_history",
    max_token_limit=8192 - 1024,
)
memory_link = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables)
    | itemgetter("chat_history")
) | RunnablePassthrough.assign(chat_history=apply_chat_template_to_history)

tools = [DuckDuckGoSearchRun(), SleepTool()]
prompt_link = PromptTemplate.from_template(
    PROMPT_TEMPLATE,
    partial_variables={"system_prompt": SYSTEM_PROMPT, "tools": convert_tools(tools)},
)

agent = (
    {
        "user_input": lambda x: x["user_input"],
        "agent_scratchpad": lambda x: convert_intermediate_steps(
            x["intermediate_steps"]
        ),
    }
    | memory_link
    | prompt_link
    | llm.bind(stop=["</tool_input>", "</final_answer>"])
    | RunnableLambda(fix_output)
    | CustomXMLAgentOutputParser()
)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

# initialize the chat interface
chat_interface = pn.chat.ChatInterface(
    pn.chat.ChatMessage(
        "Ask me anything and Zephyr will do its best to help!", user="System"
    ),
    callback=stream_response,
    callback_user="Zephyr",
    callback_exception="verbose",
)
chat_interface.servable()
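
This agent variant is served the same way (e.g. `panel serve app.py`). As a rough check, a prompt that needs fresh information, such as "What's the weather in San Francisco today?", should produce a `<tool>...</tool><tool_input>...</tool_input>` step that routes through the DuckDuckGo search tool before a `<final_answer>` is returned. This assumes the search tool has network access and that the 7B model follows the XML format reliably, which is not guaranteed.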
