From 76724e0c83fded14bbdb2332819359c848dc6e4d Mon Sep 17 00:00:00 2001 From: Insop Song Date: Fri, 29 Nov 2024 21:25:46 -0800 Subject: [PATCH] Add code alpaca dataset --- torchtune/datasets/__init__.py | 5 ++++- torchtune/datasets/_alpaca.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/torchtune/datasets/__init__.py b/torchtune/datasets/__init__.py index b0c7c11738..0e47082e03 100644 --- a/torchtune/datasets/__init__.py +++ b/torchtune/datasets/__init__.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. from torchtune.datasets import multimodal -from torchtune.datasets._alpaca import alpaca_cleaned_dataset, alpaca_dataset +from torchtune.datasets._alpaca import alpaca_evol_codealpaca_v1_dataset, alpaca_code_instruction_120k_dataset, alpaca_code_20k_dataset, alpaca_cleaned_dataset, alpaca_dataset from torchtune.datasets._chat import chat_dataset from torchtune.datasets._cnn_dailymail import cnn_dailymail_articles_dataset from torchtune.datasets._concat import ConcatDataset @@ -27,6 +27,9 @@ __all__ = [ "alpaca_dataset", "alpaca_cleaned_dataset", + "alpaca_code_20k_dataset", + "alpaca_code_instruction_120k_dataset", + "alpaca_evol_codealpaca_v1_dataset", "grammar_dataset", "samsum_dataset", "stack_exchange_paired_dataset", diff --git a/torchtune/datasets/_alpaca.py b/torchtune/datasets/_alpaca.py index 4254cb7cb6..93caa9f695 100644 --- a/torchtune/datasets/_alpaca.py +++ b/torchtune/datasets/_alpaca.py @@ -101,3 +101,21 @@ def alpaca_dataset( original Alpaca dataset, `yahma/alpaca-cleaned <https://huggingface.co/datasets/yahma/alpaca-cleaned>`_. See the dataset page and :func:`~torchtune.datasets.alpaca_dataset` for more details. """ + +alpaca_code_20k_dataset = partial(alpaca_dataset, source="sahil2801/CodeAlpaca-20k") +alpaca_code_20k_dataset.__doc__ = """ +Builder for a variant of Alpaca-style datasets for code, `sahil2801/CodeAlpaca-20k <https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k>`_. +See the dataset page and :func:`~torchtune.datasets.alpaca_dataset` for more details. 
+""" + +alpaca_code_instruction_120k_dataset = partial(alpaca_dataset, source="iamtarun/code_instructions_120k_alpaca") +alpaca_code_instruction_120k_dataset.__doc__ = """ +Builder for a variant of Alpaca-style datasets for code, `iamtarun/code_instructions_120k_alpaca <https://huggingface.co/datasets/iamtarun/code_instructions_120k_alpaca>`_. +See the dataset page and :func:`~torchtune.datasets.alpaca_dataset` for more details. +""" + +alpaca_evol_codealpaca_v1_dataset = partial(alpaca_dataset, source="theblackcat102/evol-codealpaca-v1") +alpaca_evol_codealpaca_v1_dataset.__doc__ = """ +Builder for a variant of Alpaca-style datasets for code in evol style, `theblackcat102/evol-codealpaca-v1 <https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1>`_. +See the dataset page and :func:`~torchtune.datasets.alpaca_dataset` for more details. +""" \ No newline at end of file