Fix training codes
sungho.park committed Mar 15, 2023
1 parent cdbc5c0 commit eedad2a
Showing 5 changed files with 773 additions and 4 deletions.
6 changes: 4 additions & 2 deletions finetune.py
@@ -20,11 +20,13 @@
 LORA_ALPHA = 16
 LORA_DROPOUT = 0.05
 
+model_path = os.environ["model_path"]
+
 model = LLaMAForCausalLM.from_pretrained(
-    "decapoda-research/llama-7b-hf",
+    model_path,
 )
 tokenizer = LLaMATokenizer.from_pretrained(
-    "decapoda-research/llama-7b-hf", add_eos_token=True
+    model_path, add_eos_token=True
 )
 
 # model = prepare_model_for_int8_training(model)
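The hunk above replaces the hard-coded hub id with a model_path environment variable, so finetune.py now raises a KeyError when the variable is unset. A minimal sketch of a more forgiving lookup (an assumption, not part of this commit; the fallback is the id the commit removed):

import os

# Read the checkpoint location from the environment; fall back to the
# hub id that was previously hard-coded if the variable is unset.
model_path = os.environ.get("model_path", "decapoda-research/llama-7b-hf")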
118 changes: 117 additions & 1 deletion lengths.ipynb
@@ -1,5 +1,121 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/alfh/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Found cached dataset json (/home/irteam/.cache/huggingface/datasets/json/default-801198b68acc55bc/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
"100%|██████████| 1/1 [00:00<00:00, 173.13it/s]\n"
]
}
],
"source": [
"from datasets import load_dataset, Dataset, DatasetDict\n",
"data = load_dataset(\"json\", data_files=\"alpaca_data.json\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['instruction', 'input', 'output'],\n",
" num_rows: 52002\n",
" })\n",
"})"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['instruction', 'input', 'output'],\n",
" num_rows: 52002\n",
"})"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['train']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'valid'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data[\u001b[39m'\u001b[39;49m\u001b[39mvalid\u001b[39;49m\u001b[39m'\u001b[39;49m]\n",
"File \u001b[0;32m/opt/conda/envs/alfh/lib/python3.10/site-packages/datasets/dataset_dict.py:58\u001b[0m, in \u001b[0;36mDatasetDict.__getitem__\u001b[0;34m(self, k)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__getitem__\u001b[39m(\u001b[39mself\u001b[39m, k) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Dataset:\n\u001b[1;32m 57\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(k, (\u001b[39mstr\u001b[39m, NamedSplit)) \u001b[39mor\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m---> 58\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49m\u001b[39m__getitem__\u001b[39;49m(k)\n\u001b[1;32m 59\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m available_suggested_splits \u001b[39m=\u001b[39m [\n\u001b[1;32m 61\u001b[0m split \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m (Split\u001b[39m.\u001b[39mTRAIN, Split\u001b[39m.\u001b[39mTEST, Split\u001b[39m.\u001b[39mVALIDATION) \u001b[39mif\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\n\u001b[1;32m 62\u001b[0m ]\n",
"\u001b[0;31mKeyError\u001b[0m: 'valid'"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(data['train']['caption'], data['tag'], test_size=0.2, random_state=42)\n",
"X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = DatasetDict({\n",
" 'train': Dataset.from_dict({\n",
" 'caption': X_train,\n",
" 'tag': y_train\n",
" }),\n",
" 'validation': Dataset.from_dict({\n",
" 'caption': X_val,\n",
" 'tag': y_val\n",
" }),\n",
" 'test': Dataset.from_dict({\n",
" 'caption': X_test,\n",
" 'tag': y_test\n",
" })\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 1,
@@ -166,7 +282,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.10.9"
},
"orig_nbformat": 4,
"vscode": {
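The new notebook cells split with scikit-learn and then rebuild a DatasetDict by hand. The same 80/10/10 split can be done with the train_test_split method that datasets provides on Dataset objects; a sketch under the same alpaca_data.json input (not part of this commit):

from datasets import load_dataset, DatasetDict

data = load_dataset("json", data_files="alpaca_data.json")

# 80% train, 20% held out, then split the hold-out evenly
# into validation and test.
split = data["train"].train_test_split(test_size=0.2, seed=42)
holdout = split["test"].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    "train": split["train"],
    "validation": holdout["train"],
    "test": holdout["test"],
})

This keeps the columns as Arrow-backed Dataset objects instead of converting them to Python lists and back.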
4 changes: 3 additions & 1 deletion requirements.txt
@@ -5,4 +5,6 @@ loralib
 lora
 datasets
 peft
-accelerate
+accelerate
+evaluate
+scikit-learn
