only download pytorch model #23

Merged 4 commits on Aug 4, 2023
Changes from all commits
3 changes: 2 additions & 1 deletion model-loader-huggingface/Dockerfile
@@ -1,11 +1,12 @@
ARG BASE_IMAGE=substratusai/base:latest
FROM ${BASE_IMAGE}

RUN mkdir -p /content/model

COPY requirements.txt requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY scripts/* scripts
COPY scripts/* scripts/
COPY src/* src/

ENTRYPOINT ["/tini", "--", "/content/scripts/entrypoint.sh"]
17 changes: 17 additions & 0 deletions model-loader-huggingface/Makefile
@@ -0,0 +1,17 @@
VENV_NAME=.venv
PYTHON=${VENV_NAME}/bin/python3
PIP=${VENV_NAME}/bin/pip

.PHONY: venv
venv:
	if [ ! -d "${VENV_NAME}" ]; then python3 -m venv ${VENV_NAME}; fi

.PHONY: install
install: venv
	${PIP} install -r requirements.txt && \
	${PIP} install pytest

.PHONY: test
test: install
	${VENV_NAME}/bin/pytest
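
In other words, running make test from the model-loader-huggingface directory should create the .venv virtualenv if it is missing, install requirements.txt plus pytest into it, and then run pytest.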

9 changes: 9 additions & 0 deletions model-loader-huggingface/notebook.yaml
@@ -0,0 +1,9 @@
apiVersion: substratus.ai/v1
kind: Notebook
metadata:
  name: model-loader-dev
spec:
  image: substratusai/model-loader-huggingface
  command: ["notebook.sh"]
  params:
    name: tiiuae/falcon-7b-instruct
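
Assuming the Substratus operator is installed in the target cluster, this Notebook resource can presumably be created with kubectl apply -f notebook.yaml; it runs the substratusai/model-loader-huggingface image for interactive development with tiiuae/falcon-7b-instruct as the model name parameter.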
117 changes: 67 additions & 50 deletions model-loader-huggingface/src/load.ipynb
@@ -21,28 +21,40 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "be15516c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"data": {
"text/plain": [
"['.gitattributes',\n",
" 'LICENSE.md',\n",
" 'README.md',\n",
" 'config.json',\n",
" 'flax_model.msgpack',\n",
" 'configuration_RW.py',\n",
" 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Data/com.apple.CoreML/model.mlmodel',\n",
" 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Data/com.apple.CoreML/weights/weight.bin',\n",
" 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Manifest.json',\n",
" 'generation_config.json',\n",
" 'merges.txt',\n",
" 'pytorch_model.bin',\n",
" 'handler.py',\n",
" 'modelling_RW.py',\n",
" 'pytorch_model-00001-of-00002.bin',\n",
" 'pytorch_model-00002-of-00002.bin',\n",
" 'pytorch_model.bin.index.json',\n",
" 'special_tokens_map.json',\n",
" 'tf_model.h5',\n",
" 'tokenizer_config.json',\n",
" 'vocab.json']"
" 'tokenizer.json',\n",
" 'tokenizer_config.json']"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -66,17 +78,18 @@
"data": {
"text/plain": [
"['.gitattributes',\n",
" 'LICENSE.md',\n",
" 'README.md',\n",
" 'config.json',\n",
" 'flax_model.msgpack',\n",
" 'configuration_RW.py',\n",
" 'generation_config.json',\n",
" 'merges.txt',\n",
" 'pytorch_model.bin',\n",
" 'handler.py',\n",
" 'modelling_RW.py',\n",
" 'pytorch_model-00001-of-00002.bin',\n",
" 'pytorch_model-00002-of-00002.bin',\n",
" 'pytorch_model.bin.index.json',\n",
" 'special_tokens_map.json',\n",
" 'tf_model.h5',\n",
" 'tokenizer_config.json',\n",
" 'vocab.json']"
" 'tokenizer.json',\n",
" 'tokenizer_config.json']"
]
},
"execution_count": 4,
@@ -85,13 +98,14 @@
}
],
"source": [
"filenames = list(filter(lambda f: not f.startswith(\"coreml/\"), filenames))\n",
"from utils import filter_files\n",
"filenames = filter_files(filenames)\n",
"filenames"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "6cc3beac",
"metadata": {},
"outputs": [
@@ -100,29 +114,31 @@
"output_type": "stream",
"text": [
"Downloading .gitattributes to /content/model/.gitattributes\n",
"Downloading LICENSE.md to /content/model/LICENSE.md\n",
"Downloading README.md to /content/model/README.md\n",
"Downloading config.json to /content/model/config.json\n",
"Downloading flax_model.msgpack to /content/model/flax_model.msgpack\n",
"Downloading configuration_RW.py to /content/model/configuration_RW.py\n",
"Downloading generation_config.json to /content/model/generation_config.json\n",
"Downloading merges.txt to /content/model/merges.txt\n",
"Downloading pytorch_model.bin to /content/model/pytorch_model.bin\n",
"Downloading handler.py to /content/model/handler.py\n",
"Downloading modelling_RW.py to /content/model/modelling_RW.py\n",
"Downloading pytorch_model-00001-of-00002.bin to /content/model/pytorch_model-00001-of-00002.bin\n",
"Downloading pytorch_model-00002-of-00002.bin to /content/model/pytorch_model-00002-of-00002.bin\n",
"Downloading pytorch_model.bin.index.json to /content/model/pytorch_model.bin.index.json\n",
"Downloading special_tokens_map.json to /content/model/special_tokens_map.json\n",
"Downloading tf_model.h5 to /content/model/tf_model.h5\n",
"Downloading tokenizer.json to /content/model/tokenizer.json\n",
"Downloading tokenizer_config.json to /content/model/tokenizer_config.json\n",
"Downloading vocab.json to /content/model/vocab.json\n",
"Finished downloading /content/model/generation_config.json\n",
"Finished downloading /content/model/README.md\n",
"Finished downloading /content/model/flax_model.msgpack\n",
"Finished downloading /content/model/vocab.json\n",
"Finished downloading /content/model/LICENSE.md\n",
"Finished downloading /content/model/pytorch_model-00002-of-00002.bin\n",
"Finished downloading /content/model/.gitattributes\n",
"Finished downloading /content/model/special_tokens_map.json\n",
"Finished downloading /content/model/pytorch_model.bin.index.json\n",
"Finished downloading /content/model/handler.py\n",
"Finished downloading /content/model/README.md\n",
"Finished downloading /content/model/generation_config.json\n",
"Finished downloading /content/model/tokenizer.json\n",
"Finished downloading /content/model/configuration_RW.py\n",
"Finished downloading /content/model/config.json\n",
"Finished downloading /content/model/pytorch_model.bin\n",
"Finished downloading /content/model/tokenizer_config.json\n",
"Finished downloading /content/model/tf_model.h5\n",
"Finished downloading /content/model/merges.txt\n",
"Finished downloading /content/model/.gitattributes\n"
"Finished downloading /content/model/pytorch_model-00001-of-00002.bin\n",
"Finished downloading /content/model/modelling_RW.py\n"
]
}
],
@@ -155,29 +171,30 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "89545c70-8e54-4265-aab7-e42e5fb606d6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 724M\n",
" 0 drwxr-xr-x 14 root root 448 Jul 16 17:12 .\n",
"4.0K drwxr-xr-x 1 root root 4.0K Jul 16 17:11 ..\n",
"4.0K -rw-r--r-- 1 root root 1.2K Jul 16 17:12 .gitattributes\n",
" 12K -rw-r--r-- 1 root root 11K Jul 16 17:12 LICENSE.md\n",
"8.0K -rw-r--r-- 1 root root 7.0K Jul 16 17:12 README.md\n",
"4.0K -rw-r--r-- 1 root root 651 Jul 16 17:12 config.json\n",
"241M -rw-r--r-- 1 root root 239M Jul 16 17:13 flax_model.msgpack\n",
"4.0K -rw-r--r-- 1 root root 137 Jul 16 17:12 generation_config.json\n",
"448K -rw-r--r-- 1 root root 446K Jul 16 17:12 merges.txt\n",
"241M -rw-r--r-- 1 root root 239M Jul 16 17:13 pytorch_model.bin\n",
"4.0K -rw-r--r-- 1 root root 441 Jul 16 17:12 special_tokens_map.json\n",
"241M -rw-r--r-- 1 root root 240M Jul 16 17:13 tf_model.h5\n",
"4.0K -rw-r--r-- 1 root root 685 Jul 16 17:12 tokenizer_config.json\n",
"880K -rw-r--r-- 1 root root 878K Jul 16 17:12 vocab.json\n"
"total 14G\n",
"4.0K drwxr-xr-x 2 root root 4.0K Aug 4 04:41 .\n",
"8.0K drwxr-xr-x 1 root root 4.0K Aug 4 04:41 ..\n",
"4.0K -rw-r--r-- 1 root root 1.5K Aug 4 04:41 .gitattributes\n",
" 12K -rw-r--r-- 1 root root 9.6K Aug 4 04:41 README.md\n",
"4.0K -rw-r--r-- 1 root root 667 Aug 4 04:41 config.json\n",
"4.0K -rw-r--r-- 1 root root 2.6K Aug 4 04:41 configuration_RW.py\n",
"4.0K -rw-r--r-- 1 root root 111 Aug 4 04:41 generation_config.json\n",
"4.0K -rw-r--r-- 1 root root 1.2K Aug 4 04:41 handler.py\n",
" 48K -rw-r--r-- 1 root root 47K Aug 4 04:41 modelling_RW.py\n",
"9.3G -rw-r--r-- 1 root root 9.3G Aug 4 04:43 pytorch_model-00001-of-00002.bin\n",
"4.2G -rw-r--r-- 1 root root 4.2G Aug 4 04:42 pytorch_model-00002-of-00002.bin\n",
" 20K -rw-r--r-- 1 root root 17K Aug 4 04:41 pytorch_model.bin.index.json\n",
"4.0K -rw-r--r-- 1 root root 281 Aug 4 04:41 special_tokens_map.json\n",
"2.7M -rw-r--r-- 1 root root 2.7M Aug 4 04:41 tokenizer.json\n",
"4.0K -rw-r--r-- 1 root root 220 Aug 4 04:41 tokenizer_config.json\n"
]
}
],
@@ -202,7 +219,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.6"
}
},
"nbformat": 4,
39 changes: 39 additions & 0 deletions model-loader-huggingface/src/test_utils.py
@@ -0,0 +1,39 @@
import json
import pytest
from .utils import filter_files


@pytest.mark.parametrize(
    "files, expected",
    [
        (
            [
                "pytorch_model.bin",
                "tf_model.h5",
            ],
            ["pytorch_model.bin"],
        ),
        (
            [
                "config.json",
                "model-00001-of-00002.safetensors",
                "model-00002-of-00002.safetensors",
                "model.safetensors.index.json",
                "pytorch_model-00001-of-00002.bin",
                "pytorch_model-00002-of-00002.bin",
                "pytorch_model.bin.index.json",
            ],
            [
                "config.json",
                "model.safetensors.index.json",
                "pytorch_model-00001-of-00002.bin",
                "pytorch_model-00002-of-00002.bin",
                "pytorch_model.bin.index.json",
            ],
        ),
    ],
)
def test_filter_files(files, expected):
    assert filter_files(files) == expected

13 changes: 13 additions & 0 deletions model-loader-huggingface/src/utils.py
@@ -0,0 +1,13 @@
from typing import List


def filter_files(files: List[str]) -> List[str]:
    files = list(filter(lambda f: not f.startswith("coreml/"), files))
    has_pytorch_model = any([f.startswith("pytorch_model") for f in files])
    if has_pytorch_model:
        # filter out safetensors
        files = list(filter(lambda f: not f.endswith(".safetensors"), files))
        # filter out tensorflow model
        files = list(filter(lambda f: not f.startswith("tf_model"), files))
    return files
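
For reference, a quick sketch of how the helper behaves on a hypothetical repo listing (the file names below are illustrative, and the import assumes the code runs from the src directory, as the notebook does):

from utils import filter_files

# Hypothetical listing mixing PyTorch, TensorFlow, safetensors and CoreML artifacts.
files = [
    "config.json",
    "pytorch_model.bin",
    "tf_model.h5",
    "model.safetensors",
    "coreml/text-generation/model.mlpackage/Manifest.json",
]

print(filter_files(files))
# ['config.json', 'pytorch_model.bin']
# coreml/ paths are always dropped; *.safetensors and tf_model* files are dropped
# only because a pytorch_model* file is present in the listing.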