Skip to content

Commit

Permalink
simplify train test split
Browse files · Browse the repository at this point in the history
  • Loading branch information
djliden committed Mar 7, 2024
1 parent f49f013 commit b0ff299
Showing 1 changed file with 7 additions and 16 deletions.
23 changes: 7 additions & 16 deletions notebooks/4_olmo_1b_instruction_tune/4_olmo_instruction_tune.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -744,22 +744,13 @@
"outputs": [],
"source": [
"from datasets import DatasetDict\n",
"from transformers import set_seed\n",
"\n",
"# Assume slimorca_tokenized['train'] is your initial dataset\n",
"# First, perform the initial train-test split as before\n",
"slimorca_tokenized_split = slimorca_tokenized['train'].train_test_split(test_size=0.1)\n",
"set_seed(123)\n",
"\n",
"# Subset the training set to 5000 examples\n",
"train_subset = slimorca_tokenized_split[\"train\"].select(range(5000))\n",
"\n",
"# Subset the test set to 500 examples\n",
"test_subset = slimorca_tokenized_split[\"test\"].select(range(500))\n",
"\n",
"# Create a new DatasetDict with these subsets\n",
"slimorca_tokenized_split_subset = DatasetDict({\n",
" \"train\": train_subset,\n",
" \"valid\": test_subset,\n",
"})"
"slimorca_tokenized_split = slimorca_tokenized[\"train\"].train_test_split(\n",
" train_size=10000, test_size=1000\n",
")"
]
},
{
Expand Down Expand Up @@ -878,8 +869,8 @@
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=slimorca_tokenized_split_subset[\"train\"],\n",
" eval_dataset=slimorca_tokenized_split_subset[\"valid\"],\n",
" train_dataset=slimorca_tokenized_split[\"train\"],\n",
" eval_dataset=slimorca_tokenized_split[\"test\"],\n",
" data_collator=data_collator,\n",
")\n",
"\n",
Expand Down

0 comments on commit b0ff299

Please sign in to comment.