only download pytorch model #23

Merged 4 commits on Aug 4, 2023
Changes from all commits
3 changes: 2 additions & 1 deletion model-loader-huggingface/Dockerfile
@@ -1,11 +1,12 @@
ARG BASE_IMAGE=substratusai/base:latest
FROM ${BASE_IMAGE}

RUN mkdir -p /content/model

COPY requirements.txt requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY scripts/* scripts
COPY scripts/* scripts/
COPY src/* src/

ENTRYPOINT ["/tini", "--", "/content/scripts/entrypoint.sh"]
17 changes: 17 additions & 0 deletions model-loader-huggingface/Makefile
@@ -0,0 +1,17 @@
VENV_NAME=.venv
PYTHON=${VENV_NAME}/bin/python3
PIP=${VENV_NAME}/bin/pip

.PHONY: venv
venv:
	if [ ! -d "${VENV_NAME}" ]; then python3 -m venv ${VENV_NAME}; fi

.PHONY: install
install: venv
	${PIP} install -r requirements.txt && \
	${PIP} install pytest

.PHONY: test
test: install
	${VENV_NAME}/bin/pytest
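
In other words, running make test from the model-loader-huggingface directory should create the .venv virtualenv if it is missing, install requirements.txt plus pytest into it, and then run pytest.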

9 changes: 9 additions & 0 deletions model-loader-huggingface/notebook.yaml
@@ -0,0 +1,9 @@
apiVersion: substratus.ai/v1
kind: Notebook
metadata:
  name: model-loader-dev
spec:
  image: substratusai/model-loader-huggingface
  command: ["notebook.sh"]
  params:
    name: tiiuae/falcon-7b-instruct
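
Assuming the Substratus operator is installed in the target cluster, this Notebook resource can presumably be created with kubectl apply -f notebook.yaml; it runs the substratusai/model-loader-huggingface image for interactive development with tiiuae/falcon-7b-instruct as the model name parameter.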
117 changes: 67 additions & 50 deletions model-loader-huggingface/src/load.ipynb
@@ -21,28 +21,40 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "be15516c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"data": {
"text/plain": [
"['.gitattributes',\n",
" 'LICENSE.md',\n",
" 'README.md',\n",
" 'config.json',\n",
" 'flax_model.msgpack',\n",
" 'configuration_RW.py',\n",
" 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Data/com.apple.CoreML/model.mlmodel',\n",
" 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Data/com.apple.CoreML/weights/weight.bin',\n",
" 'coreml/text-generation/falcon-7b-64-float32.mlpackage/Manifest.json',\n",
" 'generation_config.json',\n",
" 'merges.txt',\n",
" 'pytorch_model.bin',\n",
" 'handler.py',\n",
" 'modelling_RW.py',\n",
" 'pytorch_model-00001-of-00002.bin',\n",
" 'pytorch_model-00002-of-00002.bin',\n",
" 'pytorch_model.bin.index.json',\n",
" 'special_tokens_map.json',\n",
" 'tf_model.h5',\n",
" 'tokenizer_config.json',\n",
" 'vocab.json']"
" 'tokenizer.json',\n",
" 'tokenizer_config.json']"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -66,17 +78,18 @@
"data": {
"text/plain": [
"['.gitattributes',\n",
" 'LICENSE.md',\n",
" 'README.md',\n",
" 'config.json',\n",
" 'flax_model.msgpack',\n",
" 'configuration_RW.py',\n",
" 'generation_config.json',\n",
" 'merges.txt',\n",
" 'pytorch_model.bin',\n",
" 'handler.py',\n",
" 'modelling_RW.py',\n",
" 'pytorch_model-00001-of-00002.bin',\n",
" 'pytorch_model-00002-of-00002.bin',\n",
" 'pytorch_model.bin.index.json',\n",
" 'special_tokens_map.json',\n",
" 'tf_model.h5',\n",
" 'tokenizer_config.json',\n",
" 'vocab.json']"
" 'tokenizer.json',\n",
" 'tokenizer_config.json']"
]
},
"execution_count": 4,
@@ -85,13 +98,14 @@
}
],
"source": [
"filenames = list(filter(lambda f: not f.startswith(\"coreml/\"), filenames))\n",
"from utils import filter_files\n",
"filenames = filter_files(filenames)\n",
"filenames"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "6cc3beac",
"metadata": {},
"outputs": [
@@ -100,29 +114,31 @@
"output_type": "stream",
"text": [
"Downloading .gitattributes to /content/model/.gitattributes\n",
"Downloading LICENSE.md to /content/model/LICENSE.md\n",
"Downloading README.md to /content/model/README.md\n",
"Downloading config.json to /content/model/config.json\n",
"Downloading flax_model.msgpack to /content/model/flax_model.msgpack\n",
"Downloading configuration_RW.py to /content/model/configuration_RW.py\n",
"Downloading generation_config.json to /content/model/generation_config.json\n",
"Downloading merges.txt to /content/model/merges.txt\n",
"Downloading pytorch_model.bin to /content/model/pytorch_model.bin\n",
"Downloading handler.py to /content/model/handler.py\n",
"Downloading modelling_RW.py to /content/model/modelling_RW.py\n",
"Downloading pytorch_model-00001-of-00002.bin to /content/model/pytorch_model-00001-of-00002.bin\n",
"Downloading pytorch_model-00002-of-00002.bin to /content/model/pytorch_model-00002-of-00002.bin\n",
"Downloading pytorch_model.bin.index.json to /content/model/pytorch_model.bin.index.json\n",
"Downloading special_tokens_map.json to /content/model/special_tokens_map.json\n",
"Downloading tf_model.h5 to /content/model/tf_model.h5\n",
"Downloading tokenizer.json to /content/model/tokenizer.json\n",
"Downloading tokenizer_config.json to /content/model/tokenizer_config.json\n",
"Downloading vocab.json to /content/model/vocab.json\n",
"Finished downloading /content/model/generation_config.json\n",
"Finished downloading /content/model/README.md\n",
"Finished downloading /content/model/flax_model.msgpack\n",
"Finished downloading /content/model/vocab.json\n",
"Finished downloading /content/model/LICENSE.md\n",
"Finished downloading /content/model/pytorch_model-00002-of-00002.bin\n",
"Finished downloading /content/model/.gitattributes\n",
"Finished downloading /content/model/special_tokens_map.json\n",
"Finished downloading /content/model/pytorch_model.bin.index.json\n",
"Finished downloading /content/model/handler.py\n",
"Finished downloading /content/model/README.md\n",
"Finished downloading /content/model/generation_config.json\n",
"Finished downloading /content/model/tokenizer.json\n",
"Finished downloading /content/model/configuration_RW.py\n",
"Finished downloading /content/model/config.json\n",
"Finished downloading /content/model/pytorch_model.bin\n",
"Finished downloading /content/model/tokenizer_config.json\n",
"Finished downloading /content/model/tf_model.h5\n",
"Finished downloading /content/model/merges.txt\n",
"Finished downloading /content/model/.gitattributes\n"
"Finished downloading /content/model/pytorch_model-00001-of-00002.bin\n",
"Finished downloading /content/model/modelling_RW.py\n"
]
}
],
@@ -155,29 +171,30 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "89545c70-8e54-4265-aab7-e42e5fb606d6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 724M\n",
" 0 drwxr-xr-x 14 root root 448 Jul 16 17:12 .\n",
"4.0K drwxr-xr-x 1 root root 4.0K Jul 16 17:11 ..\n",
"4.0K -rw-r--r-- 1 root root 1.2K Jul 16 17:12 .gitattributes\n",
" 12K -rw-r--r-- 1 root root 11K Jul 16 17:12 LICENSE.md\n",
"8.0K -rw-r--r-- 1 root root 7.0K Jul 16 17:12 README.md\n",
"4.0K -rw-r--r-- 1 root root 651 Jul 16 17:12 config.json\n",
"241M -rw-r--r-- 1 root root 239M Jul 16 17:13 flax_model.msgpack\n",
"4.0K -rw-r--r-- 1 root root 137 Jul 16 17:12 generation_config.json\n",
"448K -rw-r--r-- 1 root root 446K Jul 16 17:12 merges.txt\n",
"241M -rw-r--r-- 1 root root 239M Jul 16 17:13 pytorch_model.bin\n",
"4.0K -rw-r--r-- 1 root root 441 Jul 16 17:12 special_tokens_map.json\n",
"241M -rw-r--r-- 1 root root 240M Jul 16 17:13 tf_model.h5\n",
"4.0K -rw-r--r-- 1 root root 685 Jul 16 17:12 tokenizer_config.json\n",
"880K -rw-r--r-- 1 root root 878K Jul 16 17:12 vocab.json\n"
"total 14G\n",
"4.0K drwxr-xr-x 2 root root 4.0K Aug 4 04:41 .\n",
"8.0K drwxr-xr-x 1 root root 4.0K Aug 4 04:41 ..\n",
"4.0K -rw-r--r-- 1 root root 1.5K Aug 4 04:41 .gitattributes\n",
" 12K -rw-r--r-- 1 root root 9.6K Aug 4 04:41 README.md\n",
"4.0K -rw-r--r-- 1 root root 667 Aug 4 04:41 config.json\n",
"4.0K -rw-r--r-- 1 root root 2.6K Aug 4 04:41 configuration_RW.py\n",
"4.0K -rw-r--r-- 1 root root 111 Aug 4 04:41 generation_config.json\n",
"4.0K -rw-r--r-- 1 root root 1.2K Aug 4 04:41 handler.py\n",
" 48K -rw-r--r-- 1 root root 47K Aug 4 04:41 modelling_RW.py\n",
"9.3G -rw-r--r-- 1 root root 9.3G Aug 4 04:43 pytorch_model-00001-of-00002.bin\n",
"4.2G -rw-r--r-- 1 root root 4.2G Aug 4 04:42 pytorch_model-00002-of-00002.bin\n",
" 20K -rw-r--r-- 1 root root 17K Aug 4 04:41 pytorch_model.bin.index.json\n",
"4.0K -rw-r--r-- 1 root root 281 Aug 4 04:41 special_tokens_map.json\n",
"2.7M -rw-r--r-- 1 root root 2.7M Aug 4 04:41 tokenizer.json\n",
"4.0K -rw-r--r-- 1 root root 220 Aug 4 04:41 tokenizer_config.json\n"
]
}
],
@@ -202,7 +219,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.6"
}
},
"nbformat": 4,
39 changes: 39 additions & 0 deletions model-loader-huggingface/src/test_utils.py
@@ -0,0 +1,39 @@
import json
import pytest
from .utils import filter_files


@pytest.mark.parametrize(
    "files, expected",
    [
        (
            [
                "pytorch_model.bin",
                "tf_model.h5",
            ],
            ["pytorch_model.bin"],
        ),
        (
            [
                "config.json",
                "model-00001-of-00002.safetensors",
                "model-00002-of-00002.safetensors",
                "model.safetensors.index.json",
                "pytorch_model-00001-of-00002.bin",
                "pytorch_model-00002-of-00002.bin",
                "pytorch_model.bin.index.json",
            ],
            [
                "config.json",
                "model.safetensors.index.json",
                "pytorch_model-00001-of-00002.bin",
                "pytorch_model-00002-of-00002.bin",
                "pytorch_model.bin.index.json",
            ],
        ),
    ],
)
def test_filter_files(files, expected):
    assert filter_files(files) == expected

13 changes: 13 additions & 0 deletions model-loader-huggingface/src/utils.py
@@ -0,0 +1,13 @@
from typing import List


def filter_files(files: List[str]) -> List[str]:
    files = list(filter(lambda f: not f.startswith("coreml/"), files))
    has_pytorch_model = any([f.startswith("pytorch_model") for f in files])
    if has_pytorch_model:
        # filter out safetensors
        files = list(filter(lambda f: not f.endswith(".safetensors"), files))
        # filter out tensorflow model
        files = list(filter(lambda f: not f.startswith("tf_model"), files))
    return files
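
For reference, a quick sketch of how the helper behaves on a hypothetical repo listing (the file names below are illustrative, and the import assumes the code runs from the src directory, as the notebook does):

from utils import filter_files

# Hypothetical listing mixing PyTorch, TensorFlow, safetensors and CoreML artifacts.
files = [
    "config.json",
    "pytorch_model.bin",
    "tf_model.h5",
    "model.safetensors",
    "coreml/text-generation/model.mlpackage/Manifest.json",
]

print(filter_files(files))
# ['config.json', 'pytorch_model.bin']
# coreml/ paths are always dropped; *.safetensors and tf_model* files are dropped
# only because a pytorch_model* file is present in the listing.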