Skip to content

Commit

Permalink
Add code alpaca dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
insop committed Nov 30, 2024
1 parent 32e265d commit 76724e0
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
5 changes: 4 additions & 1 deletion torchtune/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# LICENSE file in the root directory of this source tree.

from torchtune.datasets import multimodal
from torchtune.datasets._alpaca import alpaca_cleaned_dataset, alpaca_dataset
from torchtune.datasets._alpaca import alpaca_evol_codealpaca_v1_dataset, alpaca_code_instruction_120k_dataset, alpaca_code_20k_dataset, alpaca_cleaned_dataset, alpaca_dataset
from torchtune.datasets._chat import chat_dataset
from torchtune.datasets._cnn_dailymail import cnn_dailymail_articles_dataset
from torchtune.datasets._concat import ConcatDataset
Expand All @@ -27,6 +27,9 @@
__all__ = [
"alpaca_dataset",
"alpaca_cleaned_dataset",
"alpaca_code_20k_dataset",
"alpaca_code_instruction_120k_dataset",
"alpaca_evol_codealpaca_v1_dataset",
"grammar_dataset",
"samsum_dataset",
"stack_exchange_paired_dataset",
Expand Down
18 changes: 18 additions & 0 deletions torchtune/datasets/_alpaca.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,21 @@ def alpaca_dataset(
original Alpaca dataset, `yahma/alpaca-cleaned <https://huggingface.co/datasets/yahma/alpaca-cleaned>`_.
See the dataset page and :func:`~torchtune.datasets.alpaca_dataset` for more details.
"""

# Code-focused Alpaca builder: the generic ``alpaca_dataset`` builder with its
# ``source`` keyword pre-bound; all other arguments are forwarded unchanged.
alpaca_code_20k_dataset = partial(
    alpaca_dataset,
    source="sahil2801/CodeAlpaca-20k",
)
# A ``partial`` object does not carry the wrapped function's docstring, so a
# dedicated one is attached explicitly.
alpaca_code_20k_dataset.__doc__ = (
    "\n"
    "Builder for a variant of Alpaca-style datasets for code, "
    "`sahil2801/CodeAlpaca-20k <https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k>`_.\n"
    "See the dataset page and :func:`~torchtune.datasets.alpaca_dataset` for more details.\n"
)

# Code-instruction Alpaca builder: the generic ``alpaca_dataset`` builder with
# its ``source`` keyword pre-bound; all other arguments are forwarded unchanged.
alpaca_code_instruction_120k_dataset = partial(
    alpaca_dataset,
    source="iamtarun/code_instructions_120k_alpaca",
)
# A ``partial`` object does not carry the wrapped function's docstring, so a
# dedicated one is attached explicitly.
alpaca_code_instruction_120k_dataset.__doc__ = (
    "\n"
    "Builder for a variant of Alpaca-style datasets for code, "
    "`iamtarun/code_instructions_120k_alpaca <https://huggingface.co/datasets/iamtarun/code_instructions_120k_alpaca>`_.\n"
    "See the dataset page and :func:`~torchtune.datasets.alpaca_dataset` for more details.\n"
)

# Evol-style code Alpaca builder: the generic ``alpaca_dataset`` builder with
# its ``source`` keyword pre-bound; all other arguments are forwarded unchanged.
alpaca_evol_codealpaca_v1_dataset = partial(
    alpaca_dataset,
    source="theblackcat102/evol-codealpaca-v1",
)
# A ``partial`` object does not carry the wrapped function's docstring, so a
# dedicated one is attached explicitly.
alpaca_evol_codealpaca_v1_dataset.__doc__ = (
    "\n"
    "Builder for a variant of Alpaca-style datasets for code in evol style, "
    "`theblackcat102/evol-codealpaca-v1 <https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1>`_.\n"
    "See the dataset page and :func:`~torchtune.datasets.alpaca_dataset` for more details.\n"
)

0 comments on commit 76724e0

Please sign in to comment.