Skip to content

Commit

Permalink
refactor for standalone cli (#46)
Browse files Browse the repository at this point in the history
  • Loading branch information
samos123 authored Oct 1, 2023
1 parent 6a3090e commit 6279eb8
Show file tree
Hide file tree
Showing 26 changed files with 228 additions and 79 deletions.
2 changes: 2 additions & 0 deletions base/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
artifacts
.venv
9 changes: 5 additions & 4 deletions base/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ RUN apt-get update && \
# Using /content matches colab.
# Precreate some paths for convenience (when mounts are not present).
WORKDIR /content
ENV PATH="$PATH:/content/scripts"
RUN mkdir -p /content/artifacts

RUN pip3 install --no-cache-dir jupyterlab
COPY requirements.txt /tmp/requirements.txt
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt

COPY scripts/* scripts/
COPY scripts/* /usr/local/bin/

# Tini for signal handling, etc.
# https://github.com/krallin/tini
Expand All @@ -30,4 +31,4 @@ ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

CMD ["notebook.sh"]
CMD ["run-notebook.sh"]
17 changes: 17 additions & 0 deletions base/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,20 @@ Includes:
- CUDA
- Jupyter Lab

By default, the base image runs every notebook with the `.ipynb` extension
that is stored in the `/content` or `/content/src` directory. The executed
notebooks are written to `/content/artifacts` (notebooks from `/content/src`
go to `/content/artifacts/src`).

## Build the base image

```
docker build -t substratusai/base .
```

Test it out by placing a notebook in the `/content` directory:
```
docker run -ti -v $(pwd)/test.ipynb:/content/test.ipynb \
-v $(pwd)/artifacts:/content/artifacts \
--security-opt seccomp=unconfined \
substratusai/base bash
```
2 changes: 2 additions & 0 deletions base/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
jupyterlab
papermill
75 changes: 75 additions & 0 deletions base/scripts/run-notebook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python3

import sys
from pathlib import Path
import logging
import papermill
from papermill.log import logger
from papermill.engines import NBClientEngine, papermill_engines, NotebookExecutionManager, PapermillNotebookClient
from papermill.utils import merge_kwargs, remove_args
from nbconvert import HTMLExporter

# Emit log records at INFO and above, printing only the message text
# (no timestamp, level, or logger-name prefix).
logging.basicConfig(level=logging.INFO, format="%(message)s")
# Make sure papermill's own logger (imported from papermill.log) is at INFO too.
logger.setLevel(logging.INFO)

class HTMLExecutionManager(NotebookExecutionManager):
    """Execution manager that also renders the notebook to HTML on every save.

    The HTML file is written alongside the notebook output: it uses the same
    path as ``output_path`` with the suffix replaced by ``.html``. When no
    ``output_path`` is configured, no HTML file is written.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent manager and prepare the exporter."""
        super().__init__(*args, **kwargs)
        self.html_exporter = HTMLExporter()
        # output_path may be None (it is the default in
        # HTMLEngine.execute_notebook); Path(None) raises TypeError, so the
        # HTML path is only derived when an output path was actually given.
        self.html_output_path = (
            Path(self.output_path).with_suffix(".html") if self.output_path else None
        )

    def save(self, **kwargs):
        """Save the notebook via the parent, then write its HTML rendering."""
        super().save(**kwargs)
        if self.html_output_path is None:
            # Nothing to write to; mirror the "no output path" configuration.
            return
        html_output, _ = self.html_exporter.from_notebook_node(self.nb)
        with self.html_output_path.open("w", encoding="utf-8") as f:
            f.write(html_output)

class HTMLEngine(NBClientEngine):
    """Papermill engine that tracks execution with an ``HTMLExecutionManager``."""

    @classmethod
    def execute_notebook(
        cls,
        nb,
        kernel_name,
        output_path=None,
        progress_bar=True,
        log_output=False,
        autosave_cell_every=5,
        **kwargs,
    ):
        """
        Run ``nb`` under an ``HTMLExecutionManager`` and return the notebook.

        The manager is started, the cells are executed through
        ``cls.execute_managed_notebook``, and the progress bar cleanup and
        completion hooks run even if execution raises. Extra keyword
        arguments are passed through to ``execute_managed_notebook``.
        """
        manager_options = {
            "output_path": output_path,
            "progress_bar": progress_bar,
            "log_output": log_output,
            "autosave_cell_every": autosave_cell_every,
        }
        manager = HTMLExecutionManager(nb, **manager_options)

        manager.notebook_start()
        try:
            cls.execute_managed_notebook(
                manager,
                kernel_name,
                log_output=log_output,
                **kwargs,
            )
        finally:
            # Always tidy up, whether or not a cell failed.
            manager.cleanup_pbar()
            manager.notebook_complete()

        return manager.nb


# Make the HTML-producing engine selectable by name, then load any engines
# registered through entry points.
papermill_engines.register('htmlengine', HTMLEngine)
papermill_engines.register_entry_points()

# Both positional arguments are required; exit with a usage message instead of
# an IndexError traceback when one is missing. Extra arguments are ignored.
if len(sys.argv) < 3:
    sys.exit(f"usage: {Path(sys.argv[0]).name} INPUT_NOTEBOOK OUTPUT_NOTEBOOK")

# Execute the notebook with papermill as usual
papermill.execute_notebook(
    sys.argv[1],
    sys.argv[2],
    log_output=True,
    engine_name="htmlengine",
)
25 changes: 25 additions & 0 deletions base/scripts/run-notebook.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
#
# Execute notebooks with run-notebook.py and store the executed copies under
# /content/artifacts.
#
# If NOTEBOOK is unset or empty, every *.ipynb in /content and /content/src is
# executed; results for /content/src go to /content/artifacts/src.
# Otherwise NOTEBOOK is treated as a whitespace-separated list of notebook
# paths (glob patterns are expanded) and only those are executed.
# Paths that do not exist are skipped.

set -e

# execute_notebook NOTEBOOK OUT_DIR
# Run one notebook and write the executed copy to OUT_DIR under the same file
# name. All expansions are quoted so file names containing spaces are passed
# through intact.
execute_notebook() {
  local nb="$1" out_dir="$2"
  mkdir -p "$out_dir"
  echo "Executing notebook $nb"
  run-notebook.py "$nb" "$out_dir/$(basename "$nb")"
}

if [[ -z "$NOTEBOOK" ]]; then
  for nb in /content/*.ipynb; do
    # An unmatched glob stays literal; skip it.
    [ -e "$nb" ] || continue
    execute_notebook "$nb" /content/artifacts
  done
  for nb in /content/src/*.ipynb; do
    [ -e "$nb" ] || continue
    execute_notebook "$nb" /content/artifacts/src
  done
else
  # Intentionally unquoted: NOTEBOOK may list several notebooks.
  # shellcheck disable=SC2086
  for nb in $NOTEBOOK; do
    [ -e "$nb" ] || continue
    execute_notebook "$nb" /content/artifacts
  done
fi
File renamed without changes.
70 changes: 70 additions & 0 deletions base/test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"test\")"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"testing a notebook 0\n",
"testing a notebook 1\n",
"testing a notebook 2\n",
"testing a notebook 3\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 5\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m1000\u001b[39m):\n\u001b[1;32m 4\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mtesting a notebook \u001b[39m\u001b[39m{\u001b[39;00mi\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m time\u001b[39m.\u001b[39;49msleep(\u001b[39m1.0\u001b[39;49m)\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"import time\n",
"\n",
"for i in range(10):\n",
" print(f\"testing a notebook {i}\")\n",
" time.sleep(1.0)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
12 changes: 1 addition & 11 deletions dataset-loader-http/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,17 +1,7 @@
ARG BASE_IMAGE=substratusai/base:latest
FROM ${BASE_IMAGE}


RUN mkdir -p /content/src /content/data
WORKDIR /content

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

ENV PATH="$PATH:/content/bin"

# Copy in build dependencies only since the build will take a while.
COPY ./scripts/ ./scripts
COPY ./src/ ./src

CMD load.sh
COPY load.ipynb .
4 changes: 2 additions & 2 deletions dataset-loader-http/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ cat > params.json << EOF
{"urls": "https://huggingface.co/datasets/substratusai/k8s-instructions/raw/main/k8s-instructions.jsonl"}
EOF

docker run -it -v $(pwd)/src:/content/src -p 8888:8888 \
docker run -it -v $(pwd)/load.ipynb:/content/load.ipynb -p 8888:8888 \
-v $(pwd)/params.json:/content/params.json \
dataset-loader-http notebook.sh
dataset-loader-http
```
Now open your browser at http://localhost:8888
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
"\n",
"def download_file(url: str) -> str:\n",
" filename = get_filename(url)\n",
" destination = f\"/content/data/{filename}\"\n",
" destination = f\"/content/artifacts/{filename}\"\n",
" print(f\"Downloading {url} to {destination}\")\n",
" urllib.request.urlretrieve(url, destination)\n",
" return destination\n",
Expand Down Expand Up @@ -117,7 +117,7 @@
}
],
"source": [
"! ls -lash /content/data/"
"! ls -lash /content/artifacts/"
]
},
{
Expand All @@ -144,7 +144,7 @@
}
],
"source": [
"! head -n 10 /content/data/*"
"! head -n 10 /content/artifacts/*"
]
}
],
Expand Down
5 changes: 0 additions & 5 deletions dataset-loader-http/scripts/load.sh

This file was deleted.

3 changes: 1 addition & 2 deletions dataset-squad/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,4 @@ FROM ${BASE_IMAGE}
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

COPY ./scripts/ ./scripts
COPY ./src/ ./src
COPY ./load.ipynb ./
6 changes: 4 additions & 2 deletions dataset-squad/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ Explore and develop with a Jupyter Lab.

```sh
# Run a Jupyter Notebook.
docker run -it -v $(pwd)/data:/data -v $(pwd)/src:/dataset/src -p 8888:8888 dataset-squad notebook.sh
docker run -it -v $(pwd)/artifacts:/content/artifacts \
-v $(pwd)/load.ipynb:/content/load.ipynb -p 8888:8888 \
dataset-squad notebook.sh

# In another terminal: Open browser.
open http://localhost:8888
Expand All @@ -26,7 +28,7 @@ open http://localhost:8888
Fetch data.

```sh
docker run -e -v $(pwd)/data:/data -v $(pwd)/logs:/dataset/logs squad-dataset load.sh
docker run -v $(pwd)/artifacts:/content/artifacts squad-dataset

head artifacts/*
```
2 changes: 1 addition & 1 deletion dataset-squad/src/load.ipynb → dataset-squad/load.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"# \"https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json\"\n",
"data_url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json\"\n",
"raw_file_path = \"/tmp/squad.json\"\n",
"transformed_file_path = \"/content/data/all.jsonl\""
"transformed_file_path = \"/content/artifacts/all.jsonl\""
]
},
{
Expand Down
5 changes: 0 additions & 5 deletions dataset-squad/scripts/load.sh

This file was deleted.

6 changes: 2 additions & 4 deletions model-loader-huggingface/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ EOF
docker run -it \
-v $(pwd)/src:/content/src \
-v $(pwd)/params.json:/content/params.json \
-v $(pwd)/logs:/content/logs \
-v $(pwd)/model:/content/model \
-v $(pwd)/artifacts:/content/artifacts \
-p 8888:8888 \
substratusai/model-loader-huggingface notebook.sh
```
Expand All @@ -35,8 +34,7 @@ cat > params.json <<EOF
EOF
docker run -it \
-v $(pwd)/params.json:/content/params.json \
-v $(pwd)/logs:/content/logs \
-v $(pwd)/model:/content/model \
-v $(pwd)/artifacts:/content/artifacts \
substratusai/model-loader-huggingface
```

4 changes: 2 additions & 2 deletions model-loader-huggingface/src/load.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"if not model_id:\n",
" raise ValueError(\"Missing required parameter name. Set `params: {name: hf_org/model_id} in the model spec` \")\n",
"\n",
"output_dir = params.get(\"output_dir\", \"/content/model\")\n",
"output_dir = params.get(\"output_dir\", \"/content/artifacts\")\n",
"\n",
"# snapshot_download(repo_id=model_id, local_dir=output_dir, local_dir_use_symlinks=False, revision=\"main\")"
]
Expand Down Expand Up @@ -180,7 +180,7 @@
}
],
"source": [
"! ls -lash /content/model"
"! ls -lash {output_dir}"
]
}
],
Expand Down
4 changes: 2 additions & 2 deletions model-server-basaran/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ARG BASE_IMAGE=substratusai/base:latest
FROM ${BASE_IMAGE}

ENV MODEL="/content/saved-model"
ENV MODEL="/content/model"
ENV MODEL_LOCAL_FILES_ONLY="true"
ENV MODEL_TRUST_REMOTE_CODE="true"
ENV PORT=8080
Expand All @@ -17,7 +17,7 @@ RUN ln -s /usr/bin/python3 /usr/bin/python
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

COPY scripts/ scripts/
COPY serve.sh /usr/local/bin/

CMD serve.sh
EXPOSE $PORT
Loading

0 comments on commit 6279eb8

Please sign in to comment.