diff --git a/model-loader-huggingface/Dockerfile b/model-loader-huggingface/Dockerfile index 1914250..a16aeb2 100644 --- a/model-loader-huggingface/Dockerfile +++ b/model-loader-huggingface/Dockerfile @@ -1,11 +1,12 @@ ARG BASE_IMAGE=substratusai/base:latest FROM ${BASE_IMAGE} +RUN mkdir -p /content/model COPY requirements.txt requirements.txt RUN pip3 install --no-cache-dir -r requirements.txt -COPY scripts/* scripts +COPY scripts/* scripts/ COPY src/* src/ ENTRYPOINT ["/tini", "--", "/content/scripts/entrypoint.sh"] diff --git a/model-loader-huggingface/Makefile b/model-loader-huggingface/Makefile new file mode 100644 index 0000000..68d821f --- /dev/null +++ b/model-loader-huggingface/Makefile @@ -0,0 +1,17 @@ +VENV_NAME=.venv +PYTHON=${VENV_NAME}/bin/python3 +PIP=${VENV_NAME}/bin/pip + +.PHONY: venv +venv: + if [ ! -d "${VENV_NAME}" ]; then python3 -m venv ${VENV_NAME}; fi + +.PHONY: install +install: venv + ${PIP} install -r requirements.txt && \ + ${PIP} install pytest + +.PHONY: test +test: install + ${VENV_NAME}/bin/pytest + diff --git a/model-loader-huggingface/notebook.yaml b/model-loader-huggingface/notebook.yaml new file mode 100644 index 0000000..460a17f --- /dev/null +++ b/model-loader-huggingface/notebook.yaml @@ -0,0 +1,9 @@ +apiVersion: substratus.ai/v1 +kind: Notebook +metadata: + name: model-loader-dev +spec: + image: substratusai/model-loader-huggingface + command: ["notebook.sh"] + params: + name: tiiuae/falcon-7b-instruct diff --git a/model-loader-huggingface/src/__init__.py b/model-loader-huggingface/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/model-loader-huggingface/src/load.ipynb b/model-loader-huggingface/src/load.ipynb index c52b3bd..3646b9a 100644 --- a/model-loader-huggingface/src/load.ipynb +++ b/model-loader-huggingface/src/load.ipynb @@ -21,28 +21,40 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "be15516c", "metadata": {}, "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, { "data": { "text/plain": [ "['.gitattributes',\n", - " 'LICENSE.md',\n", " 'README.md',\n", " 'config.json',\n", - " 'flax_model.msgpack',\n", + " 'configuration_RW.py',\n", + " 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Data/com.apple.CoreML/model.mlmodel',\n", + " 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Data/com.apple.CoreML/weights/weight.bin',\n", + " 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Manifest.json',\n", " 'generation_config.json',\n", - " 'merges.txt',\n", - " 'pytorch_model.bin',\n", + " 'handler.py',\n", + " 'modelling_RW.py',\n", + " 'pytorch_model-00001-of-00002.bin',\n", + " 'pytorch_model-00002-of-00002.bin',\n", + " 'pytorch_model.bin.index.json',\n", " 'special_tokens_map.json',\n", - " 'tf_model.h5',\n", - " 'tokenizer_config.json',\n", - " 'vocab.json']" + " 'tokenizer.json',\n", + " 'tokenizer_config.json']" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -66,17 +78,18 @@ "data": { "text/plain": [ "['.gitattributes',\n", - " 'LICENSE.md',\n", " 'README.md',\n", " 'config.json',\n", - " 'flax_model.msgpack',\n", + " 'configuration_RW.py',\n", " 'generation_config.json',\n", - " 'merges.txt',\n", - " 'pytorch_model.bin',\n", + " 'handler.py',\n", + " 'modelling_RW.py',\n", + " 'pytorch_model-00001-of-00002.bin',\n", + " 'pytorch_model-00002-of-00002.bin',\n", + " 'pytorch_model.bin.index.json',\n", " 'special_tokens_map.json',\n", - " 'tf_model.h5',\n", - " 'tokenizer_config.json',\n", - " 'vocab.json']" + " 'tokenizer.json',\n", + " 'tokenizer_config.json']" ] }, "execution_count": 4, @@ -85,13 +98,14 @@ } ], "source": [ - "filenames = 
list(filter(lambda f: not f.startswith(\"coreml/\"), filenames))\n", + "from utils import filter_files\n", + "filenames = filter_files(filenames)\n", "filenames" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "6cc3beac", "metadata": {}, "outputs": [ @@ -100,29 +114,31 @@ "output_type": "stream", "text": [ "Downloading .gitattributes to /content/model/.gitattributes\n", - "Downloading LICENSE.md to /content/model/LICENSE.md\n", "Downloading README.md to /content/model/README.md\n", "Downloading config.json to /content/model/config.json\n", - "Downloading flax_model.msgpack to /content/model/flax_model.msgpack\n", + "Downloading configuration_RW.py to /content/model/configuration_RW.py\n", "Downloading generation_config.json to /content/model/generation_config.json\n", - "Downloading merges.txt to /content/model/merges.txt\n", - "Downloading pytorch_model.bin to /content/model/pytorch_model.bin\n", + "Downloading handler.py to /content/model/handler.py\n", + "Downloading modelling_RW.py to /content/model/modelling_RW.py\n", + "Downloading pytorch_model-00001-of-00002.bin to /content/model/pytorch_model-00001-of-00002.bin\n", + "Downloading pytorch_model-00002-of-00002.bin to /content/model/pytorch_model-00002-of-00002.bin\n", + "Downloading pytorch_model.bin.index.json to /content/model/pytorch_model.bin.index.json\n", "Downloading special_tokens_map.json to /content/model/special_tokens_map.json\n", - "Downloading tf_model.h5 to /content/model/tf_model.h5\n", + "Downloading tokenizer.json to /content/model/tokenizer.json\n", "Downloading tokenizer_config.json to /content/model/tokenizer_config.json\n", - "Downloading vocab.json to /content/model/vocab.json\n", - "Finished downloading /content/model/generation_config.json\n", - "Finished downloading /content/model/README.md\n", - "Finished downloading /content/model/flax_model.msgpack\n", - "Finished downloading /content/model/vocab.json\n", - "Finished downloading 
/content/model/LICENSE.md\n", + "Finished downloading /content/model/pytorch_model-00002-of-00002.bin\n", + "Finished downloading /content/model/.gitattributes\n", "Finished downloading /content/model/special_tokens_map.json\n", + "Finished downloading /content/model/pytorch_model.bin.index.json\n", + "Finished downloading /content/model/handler.py\n", + "Finished downloading /content/model/README.md\n", + "Finished downloading /content/model/generation_config.json\n", + "Finished downloading /content/model/tokenizer.json\n", + "Finished downloading /content/model/configuration_RW.py\n", "Finished downloading /content/model/config.json\n", - "Finished downloading /content/model/pytorch_model.bin\n", "Finished downloading /content/model/tokenizer_config.json\n", - "Finished downloading /content/model/tf_model.h5\n", - "Finished downloading /content/model/merges.txt\n", - "Finished downloading /content/model/.gitattributes\n" + "Finished downloading /content/model/pytorch_model-00001-of-00002.bin\n", + "Finished downloading /content/model/modelling_RW.py\n" ] } ], @@ -155,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "89545c70-8e54-4265-aab7-e42e5fb606d6", "metadata": {}, "outputs": [ @@ -163,21 +179,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "total 724M\n", - " 0 drwxr-xr-x 14 root root 448 Jul 16 17:12 .\n", - "4.0K drwxr-xr-x 1 root root 4.0K Jul 16 17:11 ..\n", - "4.0K -rw-r--r-- 1 root root 1.2K Jul 16 17:12 .gitattributes\n", - " 12K -rw-r--r-- 1 root root 11K Jul 16 17:12 LICENSE.md\n", - "8.0K -rw-r--r-- 1 root root 7.0K Jul 16 17:12 README.md\n", - "4.0K -rw-r--r-- 1 root root 651 Jul 16 17:12 config.json\n", - "241M -rw-r--r-- 1 root root 239M Jul 16 17:13 flax_model.msgpack\n", - "4.0K -rw-r--r-- 1 root root 137 Jul 16 17:12 generation_config.json\n", - "448K -rw-r--r-- 1 root root 446K Jul 16 17:12 merges.txt\n", - "241M -rw-r--r-- 1 root root 239M Jul 16 17:13 pytorch_model.bin\n", - "4.0K 
import pytest

from .utils import filter_files


@pytest.mark.parametrize(
    "files, expected",
    [
        # A TensorFlow checkpoint is dropped when a PyTorch one exists.
        (
            [
                "pytorch_model.bin",
                "tf_model.h5",
            ],
            ["pytorch_model.bin"],
        ),
        # Sharded safetensors weights are dropped in favour of the sharded
        # PyTorch weights; plain *.json files (configs, indexes) are kept.
        (
            [
                "config.json",
                "model-00001-of-00002.safetensors",
                "model-00002-of-00002.safetensors",
                "model.safetensors.index.json",
                "pytorch_model-00001-of-00002.bin",
                "pytorch_model-00002-of-00002.bin",
                "pytorch_model.bin.index.json",
            ],
            [
                "config.json",
                "model.safetensors.index.json",
                "pytorch_model-00001-of-00002.bin",
                "pytorch_model-00002-of-00002.bin",
                "pytorch_model.bin.index.json",
            ],
        ),
    ],
)
def test_filter_files(files, expected):
    """filter_files keeps exactly the expected subset of repo files."""
    assert filter_files(files) == expected
from typing import List


def filter_files(files: List[str]) -> List[str]:
    """Select which files of a HuggingFace model repo should be downloaded.

    CoreML exports (everything under ``coreml/``) are always skipped.
    When the repo ships a PyTorch checkpoint (any file starting with
    ``pytorch_model``), the redundant weight formats are skipped too:
    safetensors shards (``*.safetensors``) and TensorFlow checkpoints
    (``tf_model*``). Index/config ``*.json`` files are always kept.

    Args:
        files: Repo-relative file names, e.g. from a repo listing.

    Returns:
        The filtered list, preserving the original order.
    """
    # CoreML packages are never needed here.
    files = [f for f in files if not f.startswith("coreml/")]
    if any(f.startswith("pytorch_model") for f in files):
        # Prefer the PyTorch weights: drop the safetensors shards ...
        files = [f for f in files if not f.endswith(".safetensors")]
        # ... and the TensorFlow checkpoint.
        files = [f for f in files if not f.startswith("tf_model")]
    return files