Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for multiple column shareGPT #1161

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion unsloth/chat_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"standardize_sharegpt",
"apply_chat_template",
"train_on_responses_only",
"parse_multicolumn_output",

"test_construct_chat_template",
]
Expand All @@ -33,6 +34,7 @@
import os
import shutil
from .tokenizer_utils import *
from functools import partial
from .models._utils import patch_tokenizer
import re
from unsloth_zoo.dataset_utils import (
Expand Down Expand Up @@ -1087,6 +1089,21 @@ def remove_special_tokens(tokenizer, prompt):
return prompt
pass

def extract_add_generation_prompt(tokenizer):
    """Return the generation-prompt suffix appended by the chat template.

    Applies the tokenizer's chat template to a dummy one-turn conversation
    both with and without `add_generation_prompt`, and takes the trailing
    difference as the generation prompt. The result is cached on the
    tokenizer as `_generation_prompt` so the template is only rendered once.
    """
    if not hasattr(tokenizer, "_generation_prompt"):
        dummy_conversation = [{"role" : "user", "content" : "Dummy"}]
        with_prompt = tokenizer.apply_chat_template(
            conversation = dummy_conversation,
            tokenize = False,
            add_generation_prompt = True,
        )
        without_prompt = tokenizer.apply_chat_template(
            conversation = dummy_conversation,
            tokenize = False,
            add_generation_prompt = False,
        )
        # Whatever the template appended at the end is the generation prompt.
        tokenizer._generation_prompt = with_prompt[len(without_prompt):]
    pass
    return tokenizer._generation_prompt
pass

def parse_multicolumn_output(tokenizer, output):
    """Parse a merged multi-column model output back into a Python dict.

    `to_sharegpt` merges multiple output columns into one column holding
    `str(merged_dict)`; this reverses that by removing the chat template's
    generation-prompt prefix and the EOS suffix, then parsing the literal.

    Fixes over the previous implementation:
    * `str.lstrip`/`str.rstrip` treat their argument as a character SET,
      not an exact prefix/suffix, so they could over-strip the payload;
      `removeprefix`/`removesuffix` remove exactly the intended strings.
    * `eval` would execute arbitrary code contained in model output;
      `ast.literal_eval` safely parses only Python literals.
    """
    import ast
    generation_prompt = extract_add_generation_prompt(tokenizer)
    if generation_prompt:
        output = output.removeprefix(generation_prompt)
    eos_token = getattr(tokenizer, "eos_token", None)
    if eos_token:
        output = output.removesuffix(eos_token)
    return ast.literal_eval(output)
pass


def _parse_combined_prompt(combined_prompt, dataset):
# Find {...}
Expand Down Expand Up @@ -1189,7 +1206,7 @@ def to_sharegpt(
"""
Converts a dataset to ShareGPT style.
ShareGPT requires only 1 input and 1 output field.
This means one has to merge multiple columns into 1 for 1 input field.
You can pass multiple columns in the output field and they will be merged into 1 column in JSON format.
Use `conversation_extension` to increase the length of each conversation by randomly
selecting a few and packing them into 1.

Expand All @@ -1207,6 +1224,17 @@ def to_sharegpt(
pass
pass

if isinstance(output_column_name, list):
def format_to_merge_output_columns(example, columns: list):
merged_dict = {}
for column in columns:
merged_dict[column] = example[column]
example["🔥merged😅"] = str(merged_dict)
return example

dataset = dataset.map(partial(format_to_merge_output_columns, columns=output_column_name), desc="Merging output columns")
output_column_name = "🔥merged😅"

possible_columns, final_optional_prompts = _parse_combined_prompt(merged_prompt, dataset)
function = _create_formatter(possible_columns, final_optional_prompts, merged_column_name)
exec(function, globals())
Expand Down