From 685f0f8637bd02ff2cf34ce2a10dcad7f444a5d1 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga <sammiestoel@gmail.com>
Date: Thu, 3 Aug 2023 21:15:55 -0700
Subject: [PATCH 1/4] only download pytorch model

Fixes #20
---
 model-loader-huggingface/Makefile          | 17 +++++++++++++++++
 model-loader-huggingface/src/__init__.py   |  0
 model-loader-huggingface/src/load.ipynb    |  3 ++-
 model-loader-huggingface/src/test_utils.py | 19 +++++++++++++++++++
 model-loader-huggingface/src/utils.py      | 13 +++++++++++++
 5 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 model-loader-huggingface/Makefile
 create mode 100644 model-loader-huggingface/src/__init__.py
 create mode 100644 model-loader-huggingface/src/test_utils.py
 create mode 100644 model-loader-huggingface/src/utils.py

diff --git a/model-loader-huggingface/Makefile b/model-loader-huggingface/Makefile
new file mode 100644
index 0000000..68d821f
--- /dev/null
+++ b/model-loader-huggingface/Makefile
@@ -0,0 +1,17 @@
+VENV_NAME=.venv
+PYTHON=${VENV_NAME}/bin/python3
+PIP=${VENV_NAME}/bin/pip
+
+.PHONY: venv
+venv:
+	if [ ! -d "${VENV_NAME}" ]; then python3 -m venv ${VENV_NAME}; fi
+
+.PHONY: install
+install: venv
+	${PIP} install -r requirements.txt && \
+	${PIP} install pytest
+
+.PHONY: test
+test: install
+	${VENV_NAME}/bin/pytest
+
diff --git a/model-loader-huggingface/src/__init__.py b/model-loader-huggingface/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/model-loader-huggingface/src/load.ipynb b/model-loader-huggingface/src/load.ipynb
index c52b3bd..f28959d 100644
--- a/model-loader-huggingface/src/load.ipynb
+++ b/model-loader-huggingface/src/load.ipynb
@@ -85,7 +85,8 @@
     }
    ],
    "source": [
-    "filenames = list(filter(lambda f: not f.startswith(\"coreml/\"), filenames))\n",
+    "from .utils import filter_files\n",
+    "filenames = filter_files(filenames)\n",
     "filenames"
    ]
   },
diff --git a/model-loader-huggingface/src/test_utils.py b/model-loader-huggingface/src/test_utils.py
new file mode 100644
index 0000000..2573129
--- /dev/null
+++ b/model-loader-huggingface/src/test_utils.py
@@ -0,0 +1,19 @@
+import json
+import pytest
+from .utils import filter_files
+
+
+def test_filter_files():
+    files = ["pytorch_model.bin", "tf_model.h5"]
+    assert filter_files(files) == files[:1]
+
+    files = ["config.json",
+             "model-00001-of-00002.safetensors",
+             "model-00002-of-00002.safetensors",
+             "model.safetensors.index.json",
+             "pytorch_model-00001-of-00002.bin",
+             "pytorch_model-00002-of-00002.bin",
+             "pytorch_model.bin.index.json"]
+    assert filter_files(files) == [files[0], files[3], files[4], files[5], files[6]]
+
+
diff --git a/model-loader-huggingface/src/utils.py b/model-loader-huggingface/src/utils.py
new file mode 100644
index 0000000..065fdd7
--- /dev/null
+++ b/model-loader-huggingface/src/utils.py
@@ -0,0 +1,13 @@
+from typing import List
+
+
+def filter_files(files: List[str]) -> List[str]:
+    """Drop coreml/ files and, if PyTorch weights exist, safetensors/TF ones."""
+    kept = [name for name in files if not name.startswith("coreml/")]
+    if not any(name.startswith("pytorch_model") for name in kept):
+        return kept
+    # PyTorch weights are present: skip the *.safetensors shards and tf_model*
+    # files. Names like "model.safetensors.index.json" do not match and stay.
+    return [name for name in kept
+            if not (name.endswith(".safetensors") or name.startswith("tf_model"))]
+

From 55f7ed120e36b7b091f9a7721233a9cd81c38c2e Mon Sep 17 00:00:00 2001
From: Sam Stoelinga <sammiestoel@gmail.com>
Date: Thu, 3 Aug 2023 21:59:22 -0700
Subject: [PATCH 2/4] fix notebook

---
 model-loader-huggingface/Dockerfile     |   3 +-
 model-loader-huggingface/src/load.ipynb | 116 ++++++++++++++----------
 2 files changed, 68 insertions(+), 51 deletions(-)

diff --git a/model-loader-huggingface/Dockerfile b/model-loader-huggingface/Dockerfile
index 1914250..a16aeb2 100644
--- a/model-loader-huggingface/Dockerfile
+++ b/model-loader-huggingface/Dockerfile
@@ -1,11 +1,12 @@
 ARG BASE_IMAGE=substratusai/base:latest
 FROM ${BASE_IMAGE}
 
+RUN mkdir -p /content/model
 
 COPY requirements.txt requirements.txt
 RUN pip3 install --no-cache-dir -r requirements.txt
 
-COPY scripts/* scripts
+COPY scripts/* scripts/
 COPY src/* src/
 
 ENTRYPOINT ["/tini", "--", "/content/scripts/entrypoint.sh"]
diff --git a/model-loader-huggingface/src/load.ipynb b/model-loader-huggingface/src/load.ipynb
index f28959d..3646b9a 100644
--- a/model-loader-huggingface/src/load.ipynb
+++ b/model-loader-huggingface/src/load.ipynb
@@ -21,28 +21,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "be15516c",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
        "['.gitattributes',\n",
-       " 'LICENSE.md',\n",
        " 'README.md',\n",
        " 'config.json',\n",
-       " 'flax_model.msgpack',\n",
+       " 'configuration_RW.py',\n",
+       " 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Data/com.apple.CoreML/model.mlmodel',\n",
+       " 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Data/com.apple.CoreML/weights/weight.bin',\n",
+       " 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Manifest.json',\n",
        " 'generation_config.json',\n",
-       " 'merges.txt',\n",
-       " 'pytorch_model.bin',\n",
+       " 'handler.py',\n",
+       " 'modelling_RW.py',\n",
+       " 'pytorch_model-00001-of-00002.bin',\n",
+       " 'pytorch_model-00002-of-00002.bin',\n",
+       " 'pytorch_model.bin.index.json',\n",
        " 'special_tokens_map.json',\n",
-       " 'tf_model.h5',\n",
-       " 'tokenizer_config.json',\n",
-       " 'vocab.json']"
+       " 'tokenizer.json',\n",
+       " 'tokenizer_config.json']"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -66,17 +78,18 @@
      "data": {
       "text/plain": [
        "['.gitattributes',\n",
-       " 'LICENSE.md',\n",
        " 'README.md',\n",
        " 'config.json',\n",
-       " 'flax_model.msgpack',\n",
+       " 'configuration_RW.py',\n",
        " 'generation_config.json',\n",
-       " 'merges.txt',\n",
-       " 'pytorch_model.bin',\n",
+       " 'handler.py',\n",
+       " 'modelling_RW.py',\n",
+       " 'pytorch_model-00001-of-00002.bin',\n",
+       " 'pytorch_model-00002-of-00002.bin',\n",
+       " 'pytorch_model.bin.index.json',\n",
        " 'special_tokens_map.json',\n",
-       " 'tf_model.h5',\n",
-       " 'tokenizer_config.json',\n",
-       " 'vocab.json']"
+       " 'tokenizer.json',\n",
+       " 'tokenizer_config.json']"
       ]
      },
      "execution_count": 4,
@@ -85,14 +98,14 @@
     }
    ],
    "source": [
-    "from .utils import filter_files\n",
+    "from utils import filter_files\n",
     "filenames = filter_files(filenames)\n",
     "filenames"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "6cc3beac",
    "metadata": {},
    "outputs": [
@@ -101,29 +114,31 @@
      "output_type": "stream",
      "text": [
       "Downloading .gitattributes to /content/model/.gitattributes\n",
-      "Downloading LICENSE.md to /content/model/LICENSE.md\n",
       "Downloading README.md to /content/model/README.md\n",
       "Downloading config.json to /content/model/config.json\n",
-      "Downloading flax_model.msgpack to /content/model/flax_model.msgpack\n",
+      "Downloading configuration_RW.py to /content/model/configuration_RW.py\n",
       "Downloading generation_config.json to /content/model/generation_config.json\n",
-      "Downloading merges.txt to /content/model/merges.txt\n",
-      "Downloading pytorch_model.bin to /content/model/pytorch_model.bin\n",
+      "Downloading handler.py to /content/model/handler.py\n",
+      "Downloading modelling_RW.py to /content/model/modelling_RW.py\n",
+      "Downloading pytorch_model-00001-of-00002.bin to /content/model/pytorch_model-00001-of-00002.bin\n",
+      "Downloading pytorch_model-00002-of-00002.bin to /content/model/pytorch_model-00002-of-00002.bin\n",
+      "Downloading pytorch_model.bin.index.json to /content/model/pytorch_model.bin.index.json\n",
       "Downloading special_tokens_map.json to /content/model/special_tokens_map.json\n",
-      "Downloading tf_model.h5 to /content/model/tf_model.h5\n",
+      "Downloading tokenizer.json to /content/model/tokenizer.json\n",
       "Downloading tokenizer_config.json to /content/model/tokenizer_config.json\n",
-      "Downloading vocab.json to /content/model/vocab.json\n",
-      "Finished downloading /content/model/generation_config.json\n",
-      "Finished downloading /content/model/README.md\n",
-      "Finished downloading /content/model/flax_model.msgpack\n",
-      "Finished downloading /content/model/vocab.json\n",
-      "Finished downloading /content/model/LICENSE.md\n",
+      "Finished downloading /content/model/pytorch_model-00002-of-00002.bin\n",
+      "Finished downloading /content/model/.gitattributes\n",
       "Finished downloading /content/model/special_tokens_map.json\n",
+      "Finished downloading /content/model/pytorch_model.bin.index.json\n",
+      "Finished downloading /content/model/handler.py\n",
+      "Finished downloading /content/model/README.md\n",
+      "Finished downloading /content/model/generation_config.json\n",
+      "Finished downloading /content/model/tokenizer.json\n",
+      "Finished downloading /content/model/configuration_RW.py\n",
       "Finished downloading /content/model/config.json\n",
-      "Finished downloading /content/model/pytorch_model.bin\n",
       "Finished downloading /content/model/tokenizer_config.json\n",
-      "Finished downloading /content/model/tf_model.h5\n",
-      "Finished downloading /content/model/merges.txt\n",
-      "Finished downloading /content/model/.gitattributes\n"
+      "Finished downloading /content/model/pytorch_model-00001-of-00002.bin\n",
+      "Finished downloading /content/model/modelling_RW.py\n"
      ]
     }
    ],
@@ -156,7 +171,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "89545c70-8e54-4265-aab7-e42e5fb606d6",
    "metadata": {},
    "outputs": [
@@ -164,21 +179,22 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "total 724M\n",
-      "   0 drwxr-xr-x 14 root root  448 Jul 16 17:12 .\n",
-      "4.0K drwxr-xr-x  1 root root 4.0K Jul 16 17:11 ..\n",
-      "4.0K -rw-r--r--  1 root root 1.2K Jul 16 17:12 .gitattributes\n",
-      " 12K -rw-r--r--  1 root root  11K Jul 16 17:12 LICENSE.md\n",
-      "8.0K -rw-r--r--  1 root root 7.0K Jul 16 17:12 README.md\n",
-      "4.0K -rw-r--r--  1 root root  651 Jul 16 17:12 config.json\n",
-      "241M -rw-r--r--  1 root root 239M Jul 16 17:13 flax_model.msgpack\n",
-      "4.0K -rw-r--r--  1 root root  137 Jul 16 17:12 generation_config.json\n",
-      "448K -rw-r--r--  1 root root 446K Jul 16 17:12 merges.txt\n",
-      "241M -rw-r--r--  1 root root 239M Jul 16 17:13 pytorch_model.bin\n",
-      "4.0K -rw-r--r--  1 root root  441 Jul 16 17:12 special_tokens_map.json\n",
-      "241M -rw-r--r--  1 root root 240M Jul 16 17:13 tf_model.h5\n",
-      "4.0K -rw-r--r--  1 root root  685 Jul 16 17:12 tokenizer_config.json\n",
-      "880K -rw-r--r--  1 root root 878K Jul 16 17:12 vocab.json\n"
+      "total 14G\n",
+      "4.0K drwxr-xr-x 2 root root 4.0K Aug  4 04:41 .\n",
+      "8.0K drwxr-xr-x 1 root root 4.0K Aug  4 04:41 ..\n",
+      "4.0K -rw-r--r-- 1 root root 1.5K Aug  4 04:41 .gitattributes\n",
+      " 12K -rw-r--r-- 1 root root 9.6K Aug  4 04:41 README.md\n",
+      "4.0K -rw-r--r-- 1 root root  667 Aug  4 04:41 config.json\n",
+      "4.0K -rw-r--r-- 1 root root 2.6K Aug  4 04:41 configuration_RW.py\n",
+      "4.0K -rw-r--r-- 1 root root  111 Aug  4 04:41 generation_config.json\n",
+      "4.0K -rw-r--r-- 1 root root 1.2K Aug  4 04:41 handler.py\n",
+      " 48K -rw-r--r-- 1 root root  47K Aug  4 04:41 modelling_RW.py\n",
+      "9.3G -rw-r--r-- 1 root root 9.3G Aug  4 04:43 pytorch_model-00001-of-00002.bin\n",
+      "4.2G -rw-r--r-- 1 root root 4.2G Aug  4 04:42 pytorch_model-00002-of-00002.bin\n",
+      " 20K -rw-r--r-- 1 root root  17K Aug  4 04:41 pytorch_model.bin.index.json\n",
+      "4.0K -rw-r--r-- 1 root root  281 Aug  4 04:41 special_tokens_map.json\n",
+      "2.7M -rw-r--r-- 1 root root 2.7M Aug  4 04:41 tokenizer.json\n",
+      "4.0K -rw-r--r-- 1 root root  220 Aug  4 04:41 tokenizer_config.json\n"
      ]
     }
    ],
@@ -203,7 +219,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.6"
   }
  },
  "nbformat": 4,

From 89356476a0dfc2bb3a4cc995f8414064812fc243 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga <sammiestoel@gmail.com>
Date: Thu, 3 Aug 2023 21:59:34 -0700
Subject: [PATCH 3/4] add notebook that can be used for development

---
 model-loader-huggingface/notebook.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 model-loader-huggingface/notebook.yaml

diff --git a/model-loader-huggingface/notebook.yaml b/model-loader-huggingface/notebook.yaml
new file mode 100644
index 0000000..460a17f
--- /dev/null
+++ b/model-loader-huggingface/notebook.yaml
@@ -0,0 +1,9 @@
+apiVersion: substratus.ai/v1
+kind: Notebook
+metadata:
+  name: model-loader-dev
+spec:
+  image: substratusai/model-loader-huggingface
+  command: ["notebook.sh"]
+  params:
+    name: tiiuae/falcon-7b-instruct

From 95bba650a857c7eda8ae4bced1c70f88f4156012 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga <sammiestoel@gmail.com>
Date: Thu, 3 Aug 2023 23:08:44 -0700
Subject: [PATCH 4/4] Update model-loader-huggingface/src/test_utils.py

Co-authored-by: Brandon J. Bjelland <brandoconnor@gmail.com>
---
 model-loader-huggingface/src/test_utils.py | 44 ++++++++++++++++------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/model-loader-huggingface/src/test_utils.py b/model-loader-huggingface/src/test_utils.py
index 2573129..91f47b7 100644
--- a/model-loader-huggingface/src/test_utils.py
+++ b/model-loader-huggingface/src/test_utils.py
@@ -3,17 +3,37 @@
 from .utils import filter_files
 
 
-def test_filter_files():
-    files = ["pytorch_model.bin", "tf_model.h5"]
-    assert filter_files(files) == files[:1]
-
-    files = ["config.json",
-             "model-00001-of-00002.safetensors",
-             "model-00002-of-00002.safetensors",
-             "model.safetensors.index.json",
-             "pytorch_model-00001-of-00002.bin",
-             "pytorch_model-00002-of-00002.bin",
-             "pytorch_model.bin.index.json"]
-    assert filter_files(files) == [files[0], files[3], files[4], files[5], files[6]]
+@pytest.mark.parametrize(
+    "files, expected",
+    [
+        (  # single-file PyTorch checkpoint listed next to a TensorFlow one
+            [
+                "pytorch_model.bin",
+                "tf_model.h5",
+            ],
+            ["pytorch_model.bin"],  # tf_model.h5 is filtered out
+        ),
+        (  # sharded PyTorch checkpoint listed next to safetensors shards
+            [
+                "config.json",
+                "model-00001-of-00002.safetensors",
+                "model-00002-of-00002.safetensors",
+                "model.safetensors.index.json",
+                "pytorch_model-00001-of-00002.bin",
+                "pytorch_model-00002-of-00002.bin",
+                "pytorch_model.bin.index.json",
+            ],
+            [  # *.safetensors shards are dropped; both index JSON files remain
+                "config.json",
+                "model.safetensors.index.json",
+                "pytorch_model-00001-of-00002.bin",
+                "pytorch_model-00002-of-00002.bin",
+                "pytorch_model.bin.index.json",
+            ],
+        ),
+    ],
+)
+def test_filter_files(files, expected):  # one case per (input listing, expected result) pair
+    assert filter_files(files) == expected