LCEL with LLamaCPP #125
Agent:

```python
"""
Demonstrates how to use the `ChatInterface` to create a chatbot using
[LangChain Expression Language](https://python.langchain.com/docs/expression_language/) (LCEL)
with streaming and memory.
"""
import re
from operator import itemgetter
import panel as pn
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
from langchain.memory import ConversationTokenBufferMemory
from langchain.agents.output_parsers import XMLAgentOutputParser
from langchain.agents import AgentExecutor
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.llms.llamacpp import LlamaCpp
from langchain_community.tools.ddg_search.tool import DuckDuckGoSearchRun
from langchain_community.tools.sleep.tool import SleepTool
from langchain_core.agents import AgentFinish
pn.extension()
TOKENIZER_REPO_ID = "HuggingFaceH4/zephyr-7b-beta"
REPO_ID = "TheBloke/zephyr-7B-beta-GGUF"
FILENAME = "zephyr-7b-beta.Q5_K_M.gguf"
SYSTEM_PROMPT = (
    "You are a friendly chat bot that tries its best to answer questions from the user."
)
PROMPT_TEMPLATE = """
<|system|>
{system_prompt}
If needed, you have access to these tools; use them!
'{tools}'
In order to use a tool, you can use <tool></tool> and <tool_input></tool_input> XML tags; no need to wrap in code fence. You will then get back a response in the form <observation></observation>
For example, if you have a tool called 'search' that could run a google search, in order to search for the weather in SF you would respond:
'<tool>search</tool><tool_input>weather in SF</tool_input>'
'<observation>64 degrees</observation>'
Finally, always respond with a final answer between <final_answer></final_answer>, especially if you don't know how to respond. For example:
'<final_answer>The weather in SF is 64 degrees</final_answer>'
Here's the previous conversation history:</s>
{chat_history}
<|user|>
{user_input}</s>
<|assistant|>
{agent_scratchpad}
""".strip()
ROLE_MAPPING = {
    "system": "system",
    "human": "user",
    "ai": "assistant",
}
class CustomXMLAgentOutputParser(XMLAgentOutputParser):
    def parse(self, text):
        # Make sure the text carries the EOS marker the chain's cleanup expects.
        if "</s>" not in text:
            text += "</s>"
        try:
            return super().parse(text)
        except ValueError:
            # If the output is not well-formed tool/final_answer XML, fall back
            # to treating the raw text as the final answer instead of crashing.
            return AgentFinish(return_values={"output": text}, log=text)
def load_llm(repo_id: str = REPO_ID, filename: str = FILENAME, **kwargs):
    # Download the GGUF weights from the Hugging Face Hub and wrap them in LlamaCpp.
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    llm = LlamaCpp(model_path=model_path, **kwargs)
    return llm
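
# The agent scratchpad records earlier tool calls; the prompt expects them
# inline as XML, e.g. (illustrative values):
# <tool>search</tool><tool_input>weather in SF</tool_input><observation>64 degrees</observation>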
def convert_intermediate_steps(intermediate_steps):
    log = ""
    for action, observation in intermediate_steps:
        log += (
            f"<tool>{action.tool}</tool><tool_input>{action.tool_input}"
            f"</tool_input><observation>{observation}</observation>"
        )
    return log
def convert_tools(tools):
    return "\n".join(f"{tool.name}: {tool.description}" for tool in tools)
def fix_output(output):
    # The model sometimes emits the </s> EOS token instead of a closing XML
    # tag; rewrite "<tag>...</s>" to "<tag>...</tag>" so the parser can cope.
    pattern = r"<([^/][^>]*)>(.*?)</s>"
    fixed_output = re.sub(pattern, r"<\1>\2</\1>", output)
    # Normalize language-annotated code fences (e.g. ```python) to plain ones.
    pattern = r"```[^\n]*"
    final_output = re.sub(pattern, "```", fixed_output)
    if "</s>" not in final_output:
        final_output += "</s>"
    return final_output
def apply_chat_template_to_history(history):
    conversation = [
        {"role": ROLE_MAPPING[message.type], "content": message.content}
        for message in history["chat_history"]
    ]
    if not conversation:
        return "N/A"
    # Loading the tokenizer on every call is simple but slow; it could be
    # loaded once at module level instead.
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO_ID)
    chat_history = tokenizer.apply_chat_template(conversation, tokenize=False)
    return chat_history
def stream_response(contents: str, user: str, instance: pn.chat.ChatInterface):
    inputs = {"user_input": contents}
    # Streaming variant, kept for reference:
    # message = ""
    # for token in agent_executor.stream(inputs):
    #     message += token
    #     yield message
    message = agent_executor.invoke(inputs)["output"]
    memory.save_context(inputs, {"output": message})
    return message
# initialize the LLM via the helper defined above
llm = load_llm(
    streaming=True,
    use_mlock=True,
    n_gpu_layers=1,
    temperature=0.75,
    max_tokens=2048,
    n_ctx=8192,
)
llm.client.verbose = False
# define the chain's links
memory = ConversationTokenBufferMemory(
    return_messages=True,
    llm=llm,
    memory_key="chat_history",
    # keep the buffered history comfortably under the 8192-token context
    max_token_limit=8192 - 1024,
)
memory_link = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables)
    | itemgetter("chat_history")
) | RunnablePassthrough.assign(chat_history=apply_chat_template_to_history)
tools = [DuckDuckGoSearchRun(), SleepTool()]
prompt_link = PromptTemplate.from_template(
    PROMPT_TEMPLATE,
    partial_variables={"system_prompt": SYSTEM_PROMPT, "tools": convert_tools(tools)},
)
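
# assemble the agent as an LCEL pipeline: map the inputs, inject the chat
# history, fill the prompt, call the model (stopping at the closing XML tags),
# repair the XML, then parse it into an AgentAction or AgentFinish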
agent = (
    {
        "user_input": lambda x: x["user_input"],
        "agent_scratchpad": lambda x: convert_intermediate_steps(
            x["intermediate_steps"]
        ),
    }
    | memory_link
    | prompt_link
    | llm.bind(stop=["</tool_input>", "</final_answer>"])
    | RunnableLambda(fix_output)
    | CustomXMLAgentOutputParser()
)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
# initialize the chat interface
chat_interface = pn.chat.ChatInterface(
    pn.chat.ChatMessage(
        "Ask a question and Zephyr will try its best to answer!", user="System"
    ),
    callback=stream_response,
    callback_user="Zephyr",
    callback_exception="verbose",
)
chat_interface.servable()
```
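
The app is meant to be launched with Panel, e.g. `panel serve app.py` (assuming the script is saved as `app.py`). For a quick check without the UI, the executor can also be invoked directly; a minimal sketch, assuming the definitions above are in scope:

```python
# Hypothetical smoke test: call the agent executor directly, bypassing Panel.
response = agent_executor.invoke({"user_input": "What is the weather in SF?"})
print(response["output"])
```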