1.5.0 - Mistral Tokenizer v7 (new System Prompt + Fn calling)
Mistral's newest tokenizer has two major improvements:
System prompt
Similar to other tokenization schemes, the system prompt is now treated as a "normal" message, encapsulated by [SYSTEM_PROMPT] ... [/SYSTEM_PROMPT]
E.g.
from mistral_common.protocol.instruct.messages import (
    UserMessage,
    SystemMessage,
    AssistantMessage,
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

# Load Mistral tokenizer
tokenizer = MistralTokenizer.v7()

# Tokenize a list of messages
tokenized = tokenizer.encode_chat_completion(
    ChatCompletionRequest(
        messages=[
            SystemMessage(content="You are a funny AI assistant. Always make jokes."),
            UserMessage(content="What's the weather like today in Paris"),
        ],
        model="joker",
    )
)
tokens, text = tokenized.tokens, tokenized.text

print(text)
# <s>[SYSTEM_PROMPT]▁You▁are▁a▁funny▁AI▁assistant.▁Always▁make▁jokes.[/SYSTEM_PROMPT][INST]▁What's▁the▁weather▁like▁today▁in▁Paris[/INST]
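For comparison, earlier tokenizer versions have no dedicated system-prompt tokens. Below is a minimal sketch, assuming the v3 tokenizer and reusing the imports above, that encodes the same request so the two prompt formats can be inspected side by side:

# Sketch: encode the same messages with the previous tokenizer version.
# Earlier versions merge the system prompt into the [INST] block instead of
# wrapping it in dedicated [SYSTEM_PROMPT] ... [/SYSTEM_PROMPT] tokens.
tokenizer_v3 = MistralTokenizer.v3()
tokenized_v3 = tokenizer_v3.encode_chat_completion(
    ChatCompletionRequest(
        messages=[
            SystemMessage(content="You are a funny AI assistant. Always make jokes."),
            UserMessage(content="What's the weather like today in Paris"),
        ],
        model="joker",
    )
)
print(tokenized_v3.text)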
Improved function calling
A new [TOOL_CONTENT] token is added to mark the tool result's content inside [TOOL_RESULTS]; if trained with correctly, it should improve the accuracy of function calling.
from mistral_common.protocol.instruct.messages import (
    UserMessage,
    SystemMessage,
    AssistantMessage,
    ToolMessage,
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.protocol.instruct.tool_calls import (
    Function,
    Tool,
)
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

# Load Mistral tokenizer
tokenizer = MistralTokenizer.v7()

tokenized = tokenizer.encode_chat_completion(
    ChatCompletionRequest(
        tools=[
            Tool(
                function=Function(
                    name="get_current_weather",
                    description="Get the current weather",
                    parameters={
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g. San Francisco, CA",
                            },
                            "format": {
                                "type": "string",
                                "enum": ["celsius", "fahrenheit"],
                                "description": "The temperature unit to use. Infer this from the users location.",
                            },
                        },
                        "required": ["location", "format"],
                    },
                )
            )
        ],
        messages=[
            UserMessage(content="What's the weather like today in Paris"),
            AssistantMessage(content="", tool_calls=[
                {
                    "id": "bbc5b7ede",
                    "type": "function",
                    "function": {
                        "name": "weather",
                        "arguments": '{"location": "Paris", "format": "celsius"}',
                    },
                }
            ]),
            ToolMessage(content="24 degrees celsius", tool_call_id="bbc5b7ede"),
        ],
        model="joker",
    )
)
tokens, text = tokenized.tokens, tokenized.text
# Print the rendered prompt
print(text)
# <s>[AVAILABLE_TOOLS]▁[{"type":▁"function",▁"function":▁{"name":▁"get_current_weather",▁"description":▁"Get▁the▁current▁weather",▁"parameters":▁{"type":▁"object",▁"properties":▁{"location":▁{"type":▁"string",▁"description":▁"The▁city▁and▁state,▁e.g.▁San▁Francisco,▁CA"},▁"format":▁{"type":▁"string",▁"enum":▁["celsius",▁"fahrenheit"],▁"description":▁"The▁temperature▁unit▁to▁use.▁Infer▁this▁from▁the▁users▁location."}},▁"required":▁["location",▁"format"]}}}][/AVAILABLE_TOOLS][INST]▁What's▁the▁weather▁like▁today▁in▁Paris[/INST][TOOL_CALLS]▁[{"name":▁"weather",▁"arguments":▁{"location":▁"Paris",▁"format":▁"celsius"},▁"id":▁"bbc5b7ede"}]</s>[TOOL_RESULTS]▁bbc5b7ede[TOOL_CONTENT]▁24▁degrees▁celsius[/TOOL_RESULTS]
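The returned token ids are what is actually fed to the model. A quick sanity-check sketch, assuming MistralTokenizer's decode method (present in earlier mistral_common releases) also applies to the v7 tokenizer:

# Count the prompt tokens and round-trip the ids back to a string.
# Note: decode is assumed to behave as in earlier mistral_common versions;
# special tokens may not be reproduced verbatim in the decoded string.
print(len(tokens))
print(tokenizer.decode(tokens))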