Skip to content

Commit

Permalink
Examples for downloading data and for testing the model
Browse files Browse the repository at this point in the history
  • Loading branch information
haesleinhuepf committed Jul 29, 2024
1 parent ce8b00b commit 5614507
Show file tree
Hide file tree
Showing 3 changed files with 328 additions and 1 deletion.
169 changes: 168 additions & 1 deletion docs/71_fine_tuning_hf/hf_data_upload.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,176 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "68406b44-19d5-47bf-980e-a7d2909e4e37",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fa8356ea-804f-4d9e-9730-e10ffc255ac9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['question', 'answer'],\n",
" num_rows: 130\n",
"})"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset2_name = \"haesleinhuepf/bio-image-analysis-qa\"\n",
"dataset2 = load_dataset(dataset2_name, split=\"all\")\n",
"dataset2"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6bc10573-2d12-4842-97ea-b497e3784374",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>question</th>\n",
" <th>answer</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>How can we calculate the average values along ...</td>\n",
" <td>\\nThis code imports the numpy library and crea...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>How can I write Python code to apply statistic...</td>\n",
" <td>\\nThe code uses the numpy library in Python, w...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>How can we obtain the precise shape (dimension...</td>\n",
" <td>\\nThis code reads an image file called \"blobs....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>How can we use indices in Python to crop image...</td>\n",
" <td>\\nThis code imports the necessary functions fr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>How can we write Python code to crop an image ...</td>\n",
" <td>\\nThe code imports functions `imshow` and `imr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>125</th>\n",
" <td>How can we use Python code to visualize our `l...</td>\n",
" <td>\\nThe code uses the `curtain` function from th...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>126</th>\n",
" <td>How can we open an image and label objects in ...</td>\n",
" <td>\\nThis code imports the necessary libraries an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127</th>\n",
" <td>How can we use Python to analyze the labeled e...</td>\n",
" <td>\\nThe code uses the skimage library's measure ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>128</th>\n",
" <td>What Python code can be used to create a label...</td>\n",
" <td>\\nThis code imports necessary libraries and fu...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>129</th>\n",
" <td>Can you provide a Python code for creating nea...</td>\n",
" <td>\\nThis code uses the pyclesperanto_prototype l...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>130 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" question \\\n",
"0 How can we calculate the average values along ... \n",
"1 How can I write Python code to apply statistic... \n",
"2 How can we obtain the precise shape (dimension... \n",
"3 How can we use indices in Python to crop image... \n",
"4 How can we write Python code to crop an image ... \n",
".. ... \n",
"125 How can we use Python code to visualize our `l... \n",
"126 How can we open an image and label objects in ... \n",
"127 How can we use Python to analyze the labeled e... \n",
"128 What Python code can be used to create a label... \n",
"129 Can you provide a Python code for creating nea... \n",
"\n",
" answer \n",
"0 \\nThis code imports the numpy library and crea... \n",
"1 \\nThe code uses the numpy library in Python, w... \n",
"2 \\nThis code reads an image file called \"blobs.... \n",
"3 \\nThis code imports the necessary functions fr... \n",
"4 \\nThe code imports functions `imshow` and `imr... \n",
".. ... \n",
"125 \\nThe code uses the `curtain` function from th... \n",
"126 \\nThis code imports the necessary libraries an... \n",
"127 \\nThe code uses the skimage library's measure ... \n",
"128 \\nThis code imports necessary libraries and fu... \n",
"129 \\nThis code uses the pyclesperanto_prototype l... \n",
"\n",
"[130 rows x 2 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset2.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24ff7579-c67d-4242-8b72-4f41a71c9b4b",
"metadata": {},
"outputs": [],
"source": []
}
Expand Down
159 changes: 159 additions & 0 deletions docs/71_fine_tuning_hf/test_model.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f1cd554b-2318-41d1-969f-46d7cafd1b40",
"metadata": {},
"source": [
"# Testing the model\n",
    "Here we test our fine-tuned model by sending it a prompt and displaying its generated response."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "be1df990-5cbf-4f3e-9059-9fae25f8d3ae",
"metadata": {},
"outputs": [],
"source": [
"def prompt_hf(request, model=\"haesleinhuepf/gemma-2b-it-bia-proof-of-concept2\"):\n",
" global prompt_hf\n",
" import transformers\n",
" import torch\n",
" \n",
" if prompt_hf._pipeline is None: \n",
" prompt_hf._pipeline = transformers.pipeline(\n",
" \"text-generation\", model=model, model_kwargs={\"torch_dtype\": torch.bfloat16}, device_map=\"auto\",\n",
" max_new_tokens=200\n",
" )\n",
" \n",
" return prompt_hf._pipeline(request)[0]['generated_text']\n",
"prompt_hf._pipeline = None"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "958d16ef-f3fa-412f-be38-76101d63a2e5",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ade3065e46914ab5b27eac86d175a10a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.\n",
"Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use\n",
"`config.hidden_activation` if you want to override this behaviour.\n",
"See https://github.com/huggingface/transformers/pull/29402 for more details.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "adc6bf6e815348649fd315678923a671",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some parameters are on the meta device device because they were offloaded to the cpu.\n",
"C:\\Users\\rober\\miniconda3\\envs\\genai-gpu\\Lib\\site-packages\\transformers\\models\\gemma\\modeling_gemma.py:482: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
" attn_output = torch.nn.functional.scaled_dot_product_attention(\n"
]
},
{
"data": {
"text/markdown": [
"Write Python code for cropping an image in X and Y to coordinates 10-20 and 30-50 respectively.\n",
"\n",
"```python\n",
"import cv2\n",
"\n",
"# Load the image\n",
"image = cv2.imread(\"image.jpg\")\n",
"\n",
"# Crop the image\n",
"cropped_image = image[10:20, 30:50]\n",
"\n",
"# Save the cropped image\n",
"cv2.imwrite(\"cropped_image.jpg\", cropped_image)\n",
"```\n",
"\n",
"**Explanation:**\n",
"\n",
"* `cv2.imread(\"image.jpg\")` loads the image from the file \"image.jpg\".\n",
"* `image[10:20, 30:50]` crops the image by specifying the coordinates of the top-left and bottom-right corners of the crop.\n",
"* `cv2.imwrite(\"cropped_image.jpg\", cropped_image)` saves the cropped image to the file \"cropped_image.jpg\".\n",
"\n",
"**Note:**\n",
"\n",
"* The `[10:20, 30:50]` coordinates represent the height"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import Markdown, display\n",
"result = prompt_hf(\"Write Python code for cropping an image in X and Y to coordinates 10-20 and 30-50\")\n",
"display(Markdown(result))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0a04961-d213-47d2-a218-8eacd242a35d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ parts:
chapters:
- file: 71_fine_tuning_hf/fine-tune-gemma.ipynb
- file: 71_fine_tuning_hf/merging_model.ipynb
- file: 71_fine_tuning_hf/test_model.ipynb
- file: 71_fine_tuning_hf/hf_data_upload.ipynb

- caption: Benchmarking
Expand Down

0 comments on commit 5614507

Please sign in to comment.