def extract_add_generation_prompt(tokenizer):
    """Return the text the chat template appends for `add_generation_prompt = True`.

    The value is derived by rendering the same single-message dummy
    conversation twice through `tokenizer.apply_chat_template` (with and
    without the generation prompt) and keeping whatever the first rendering
    has beyond the length of the second. The result is cached on the
    tokenizer as `_generation_prompt`, so the template is rendered at most
    once per tokenizer object.
    """
    if not hasattr(tokenizer, "_generation_prompt"):
        conversation = [{"role" : "user", "content" : "Dummy"}]
        w_generation_prompt = tokenizer.apply_chat_template(
            conversation = conversation,
            tokenize = False,
            add_generation_prompt = True,
        )
        wo_generation_prompt = tokenizer.apply_chat_template(
            conversation = conversation,
            tokenize = False,
            add_generation_prompt = False,
        )
        # Slice by length: the generation prompt is the trailing part that
        # only the `add_generation_prompt = True` rendering contains.
        tokenizer._generation_prompt = w_generation_prompt[len(wo_generation_prompt):]
    pass

    return tokenizer._generation_prompt
pass


def parse_multicolumn_output(tokenizer, output):
    """Parse a generated multi-column answer back into a Python object.

    Args:
        tokenizer: Object providing `apply_chat_template` and, optionally,
            `eos_token`.
        output: Generated text, optionally prefixed by the generation prompt
            and suffixed by one or more EOS tokens.

    Returns:
        The Python literal contained in `output` (a dict when the text is the
        `str()` of a dict of column values).

    Raises:
        ValueError, SyntaxError: if the remaining text is not a valid Python
            literal.
    """
    # Local import keeps this block self-contained.
    import ast

    generation_prompt = extract_add_generation_prompt(tokenizer)
    # `str.lstrip(chars)` / `str.rstrip(chars)` strip any run of the given
    # *characters*, not an exact prefix / suffix, and could eat payload
    # characters. Remove the exact strings instead.
    if generation_prompt and output.startswith(generation_prompt):
        output = output[len(generation_prompt):]

    eos_token = getattr(tokenizer, "eos_token", None)
    if eos_token:
        # Generation may emit the EOS token more than once.
        while output.endswith(eos_token):
            output = output[:-len(eos_token)]

    # SECURITY: `output` is model-generated text. `eval` would execute any
    # expression it contains; `ast.literal_eval` only accepts literals.
    return ast.literal_eval(output.strip())
pass
@@ -1207,6 +1224,17 @@ def to_sharegpt( pass pass + if isinstance(output_column_name, list): + def format_to_merge_output_columns(example, columns: list): + merged_dict = {} + for column in columns: + merged_dict[column] = example[column] + example["🔥merged😅"] = str(merged_dict) + return example + + dataset = dataset.map(partial(format_to_merge_output_columns, columns=output_column_name), desc="Merging output columns") + output_column_name = "🔥merged😅" + possible_columns, final_optional_prompts = _parse_combined_prompt(merged_prompt, dataset) function = _create_formatter(possible_columns, final_optional_prompts, merged_column_name) exec(function, globals())