Fix training codes
sungho.park committed Mar 15, 2023
1 parent cdbc5c0 commit eedad2a
Showing 5 changed files with 773 additions and 4 deletions.
6 changes: 4 additions & 2 deletions finetune.py
@@ -20,11 +20,13 @@
 LORA_ALPHA = 16
 LORA_DROPOUT = 0.05
 
+model_path = os.environ["model_path"]
+
 model = LLaMAForCausalLM.from_pretrained(
-    "decapoda-research/llama-7b-hf",
+    model_path,
 )
 tokenizer = LLaMATokenizer.from_pretrained(
-    "decapoda-research/llama-7b-hf", add_eos_token=True
+    model_path, add_eos_token=True
 )
 
 # model = prepare_model_for_int8_training(model)
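The hunk above replaces the hard-coded hub id with a model_path environment variable, so finetune.py now raises a KeyError when the variable is unset. A minimal sketch of a more forgiving lookup (an assumption, not part of this commit; the fallback is the id the commit removed):

import os

# Read the checkpoint location from the environment; fall back to the
# hub id that was previously hard-coded if the variable is unset.
model_path = os.environ.get("model_path", "decapoda-research/llama-7b-hf")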
118 changes: 117 additions & 1 deletion lengths.ipynb
@@ -1,5 +1,121 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/alfh/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Found cached dataset json (/home/irteam/.cache/huggingface/datasets/json/default-801198b68acc55bc/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
"100%|██████████| 1/1 [00:00<00:00, 173.13it/s]\n"
]
}
],
"source": [
"from datasets import load_dataset, Dataset, DatasetDict\n",
"data = load_dataset(\"json\", data_files=\"alpaca_data.json\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['instruction', 'input', 'output'],\n",
" num_rows: 52002\n",
" })\n",
"})"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['instruction', 'input', 'output'],\n",
" num_rows: 52002\n",
"})"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['train']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'valid'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data[\u001b[39m'\u001b[39;49m\u001b[39mvalid\u001b[39;49m\u001b[39m'\u001b[39;49m]\n",
"File \u001b[0;32m/opt/conda/envs/alfh/lib/python3.10/site-packages/datasets/dataset_dict.py:58\u001b[0m, in \u001b[0;36mDatasetDict.__getitem__\u001b[0;34m(self, k)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__getitem__\u001b[39m(\u001b[39mself\u001b[39m, k) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Dataset:\n\u001b[1;32m 57\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(k, (\u001b[39mstr\u001b[39m, NamedSplit)) \u001b[39mor\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m---> 58\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49m\u001b[39m__getitem__\u001b[39;49m(k)\n\u001b[1;32m 59\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m available_suggested_splits \u001b[39m=\u001b[39m [\n\u001b[1;32m 61\u001b[0m split \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m (Split\u001b[39m.\u001b[39mTRAIN, Split\u001b[39m.\u001b[39mTEST, Split\u001b[39m.\u001b[39mVALIDATION) \u001b[39mif\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\n\u001b[1;32m 62\u001b[0m ]\n",
"\u001b[0;31mKeyError\u001b[0m: 'valid'"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(data['train']['caption'], data['tag'], test_size=0.2, random_state=42)\n",
"X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = DatasetDict({\n",
" 'train': Dataset.from_dict({\n",
" 'caption': X_train,\n",
" 'tag': y_train\n",
" }),\n",
" 'validation': Dataset.from_dict({\n",
" 'caption': X_val,\n",
" 'tag': y_val\n",
" }),\n",
" 'test': Dataset.from_dict({\n",
" 'caption': X_test,\n",
" 'tag': y_test\n",
" })\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 1,
@@ -166,7 +282,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.10.9"
},
"orig_nbformat": 4,
"vscode": {
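The new notebook cells split with scikit-learn and then rebuild a DatasetDict by hand. The same 80/10/10 split can be done with the train_test_split method that datasets provides on Dataset objects; a sketch under the same alpaca_data.json input (not part of this commit):

from datasets import load_dataset, DatasetDict

data = load_dataset("json", data_files="alpaca_data.json")

# 80% train, 20% held out, then split the hold-out evenly
# into validation and test.
split = data["train"].train_test_split(test_size=0.2, seed=42)
holdout = split["test"].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    "train": split["train"],
    "validation": holdout["train"],
    "test": holdout["test"],
})

This keeps the columns as Arrow-backed Dataset objects instead of converting them to Python lists and back.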
4 changes: 3 additions & 1 deletion requirements.txt
@@ -5,4 +5,6 @@ loralib
 lora
 datasets
 peft
-accelerate
+accelerate
+evaluate
+scikit-learn
