Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Programmatic execution of notebooks #2031

Merged
merged 22 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,630 changes: 815 additions & 815 deletions examples/00_quick_start/als_movielens.ipynb

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions examples/00_quick_start/dkn_MIND.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -86,14 +86,14 @@
"import os\n",
"import sys\n",
"from tempfile import TemporaryDirectory\n",
"import scrapbook as sb\n",
"import tensorflow as tf\n",
"tf.get_logger().setLevel(\"ERROR\") # only show error messages\n",
"tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n",
"\n",
"from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources, prepare_hparams\n",
"from recommenders.models.deeprec.models.dkn import DKN\n",
"from recommenders.models.deeprec.io.dkn_iterator import DKNTextIterator\n",
"from recommenders.utils.notebook_utils import store_metadata\n",
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a weird error in the DKN notebook. It's a timeout. I have never seen this error.

It might be related to a bad configuration of CUDA (see below). Let me rerun that test.

@pytest.mark.notebooks
    @pytest.mark.gpu
    def test_dkn_quickstart(notebooks, output_notebook, kernel_name):
        notebook_path = notebooks["dkn_quickstart"]
>       execute_notebook(
            notebook_path,
            output_notebook,
            kernel_name=kernel_name,
            parameters=dict(EPOCHS=1, BATCH_SIZE=500),
        )

tests/unit/examples/test_notebooks_gpu.py:118: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
recommenders/utils/notebook_utils.py:107: in execute_notebook
    executed_notebook, _ = execute_preprocessor.preprocess(
/azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib/python3.8/site-packages/nbconvert/preprocessors/execute.py:102: in preprocess
    self.preprocess_cell(cell, resources, index)
/azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib/python3.8/site-packages/nbconvert/preprocessors/execute.py:123: in preprocess_cell
    cell = self.execute_cell(cell, index, store_history=True)
/azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib/python3.8/site-packages/jupyter_core/utils/__init__.py:173: in wrapped
    return loop.run_until_complete(inner)
/azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib/python3.8/asyncio/base_events.py:616: in run_until_complete
    return future.result()
/azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib/python3.8/site-packages/nbclient/client.py:1005: in async_execute_cell
    exec_reply = await self.task_poll_for_reply
/azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib/python3.8/site-packages/nbclient/client.py:806: in _async_poll_for_reply
    error_on_timeout_execute_reply = await self._async_handle_timeout(timeout, cell)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <nbconvert.preprocessors.execute.ExecutePreprocessor object at 0x152370dcb400>
timeout = 600
cell = {'cell_type': 'code', 'execution_count': 7, 'metadata': {'pycharm': {'is_executing': False}, 'scrolled': True, 'execut...\x1b[49m\x1b[43m)\x1b[49m\n', '\x1b[0;31mKeyboardInterrupt\x1b[0m: ']}], 'source': 'model.fit(train_file, valid_file)'}

    async def _async_handle_timeout(
        self, timeout: int, cell: NotebookNode | None = None
    ) -> None | dict[str, t.Any]:
        self.log.error("Timeout waiting for execute reply (%is)." % timeout)
        if self.interrupt_on_timeout:
            self.log.error("Interrupting kernel")
            assert self.km is not None
            await ensure_async(self.km.interrupt_kernel())
            if self.error_on_timeout:
                execute_reply = {"content": {**self.error_on_timeout, "status": "error"}}
                return execute_reply
            return None
        else:
            assert cell is not None
>           raise CellTimeoutError.error_from_timeout_and_cell(
                "Cell execution timed out", timeout, cell
            )
E           nbclient.exceptions.CellTimeoutError: A cell timed out while it was being executed, after 600 seconds.
E           The message was: Cell execution timed out.
E           Here is a preview of the cell contents:
E           -------------------
E           model.fit(train_file, valid_file)
E           -------------------

/azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib/python3.8/site-packages/nbclient/client.py:856: CellTimeoutError
----------------------------- Captured stdout call -----------------------------
ERROR:traitlets:Timeout waiting for execute reply (600s).
----------------------------- Captured stderr call -----------------------------
2023-11-18 07:19:58.273399: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib:
2023-11-18 07:19:58.273444: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-11-18 07:20:01.260672: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib:
2023-11-18 07:20:01.260819: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib:
2023-11-18 07:20:01.260909: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib:
2023-11-18 07:20:01.260994: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib:
2023-11-18 07:20:01.261077: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcurand.so.10'; dlerror: libcurand.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib:
2023-11-18 07:20:01.261163: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib:
2023-11-18 07:20:01.261246: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib:
2023-11-18 07:20:01.261330: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /azureml-envs/azureml_34c56b1c46d7f5ae137d78e9a4192235/lib:
2023-11-18 07:20:01.261341: W tensorflow/core/common_runtime/gpu/gpu_device.cc:[1850](https://github.com/recommenders-team/recommenders/actions/runs/6912445137/job/18808189826?pr=2031#step:3:1857)] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-11-18 07:20:02.067266: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-18 07:20:02.068875: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
------------------------------ Captured log call -------------------------------
ERROR    traitlets:client.py:845 Timeout waiting for execute reply (600s).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@SimonYansenZhao this is the current error. I believe it is not related to the multiline problem

"\n",
"print(f\"System version: {sys.version}\")\n",
"print(f\"Tensorflow version: {tf.__version__}\")"
Expand Down Expand Up @@ -345,7 +345,12 @@
"metadata": {},
"outputs": [],
"source": [
"sb.glue(\"res\", res)"
"# Record results for tests - ignore this cell\n",
"store_metadata(\"auc\", res[\"auc\"])\n",
"store_metadata(\"group_auc\", res[\"group_auc\"])\n",
"store_metadata(\"ndcg@5\", res[\"ndcg@5\"])\n",
"store_metadata(\"ndcg@10\", res[\"ndcg@10\"])\n",
"store_metadata(\"mean_mrr\", res[\"mean_mrr\"])\n"
]
},
{
Expand Down Expand Up @@ -395,4 +400,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
37 changes: 19 additions & 18 deletions examples/00_quick_start/fastai_movielens.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,14 @@
}
],
"source": [
"# set the environment path to find Recommenders\n",
"from tempfile import TemporaryDirectory\n",
"import sys\n",
"import os\n",
"import pandas as pd\n",
"import sys\n",
"import numpy as np\n",
"import scrapbook as sb\n",
"import torch, fastai\n",
"import pandas as pd\n",
"import torch\n",
"import fastai\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"from fastai.collab import collab_learner, CollabDataBunch, load_learner\n",
"\n",
"from recommenders.utils.constants import (\n",
Expand All @@ -61,6 +61,7 @@
"from recommenders.models.fastai.fastai_utils import cartesian_product, score\n",
"from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
"from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var\n",
"from recommenders.utils.notebook_utils import store_metadata\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"Pandas version: {}\".format(pd.__version__))\n",
Expand Down Expand Up @@ -914,17 +915,17 @@
}
],
"source": [
"# Record results with papermill for tests\n",
"sb.glue(\"map\", eval_map)\n",
"sb.glue(\"ndcg\", eval_ndcg)\n",
"sb.glue(\"precision\", eval_precision)\n",
"sb.glue(\"recall\", eval_recall)\n",
"sb.glue(\"rmse\", eval_rmse)\n",
"sb.glue(\"mae\", eval_mae)\n",
"sb.glue(\"exp_var\", eval_exp_var)\n",
"sb.glue(\"rsquared\", eval_r2)\n",
"sb.glue(\"train_time\", train_time.interval)\n",
"sb.glue(\"test_time\", test_time.interval)"
"# Record results for tests - ignore this cell\n",
"store_metadata(\"map\", eval_map)\n",
"store_metadata(\"ndcg\", eval_ndcg)\n",
"store_metadata(\"precision\", eval_precision)\n",
"store_metadata(\"recall\", eval_recall)\n",
"store_metadata(\"rmse\", eval_rmse)\n",
"store_metadata(\"mae\", eval_mae)\n",
"store_metadata(\"exp_var\", eval_exp_var)\n",
"store_metadata(\"rsquared\", eval_r2)\n",
"store_metadata(\"train_time\", train_time.interval)\n",
"store_metadata(\"test_time\", test_time.interval)"
]
},
{
Expand Down Expand Up @@ -964,4 +965,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
Loading