Added quick start example to the README

florencejt · Jan 9, 2024 · 1411337 · 1411337
1 parent 2c325ab
commit 1411337
Show file tree

Hide file tree

Showing 6 changed files with 495 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -45,11 +45,52 @@ To savour the flavours of `fusilli`, you can install it using pip:
 pip install fusilli
 ```
 
-## How to Cite
+## Quick Start
+
+Here is a quick example of how to use `fusilli` to train a regression model and plot the real values vs. predicted
+values.
+
+```python
+    from fusilli.data import prepare_fusion_data
+    from fusilli.train import train_and_save_models
+    from fusilli.eval import RealsVsPreds
+    import matplotlib.pyplot as plt
+
+    # Import the example fusion model
+    from fusilli.fusionmodels.tabularfusion.example_model import ExampleModel
+
+    data_paths = {
+        "tabular1": "path/to/tabular_1.csv",  
+        "tabular2": "path/to/tabular_2.csv",  
+        "image": "path/to/image_file.pt",  
+    }
+
+    output_paths = {
+        "checkpoints": "path/to/checkpoints/dir",  
+        "losses": "path/to/losses/dir",  
+        "figures": "path/to/figures/dir",  
+    }
+
+    # Get the data ready
+    data_module = prepare_fusion_data(prediction_task="regression",
+                                      fusion_model=ExampleModel,
+                                      data_paths=data_paths,
+                                      output_paths=output_paths)
+
+    # Train the model
+    trained_model = train_and_save_models(data_module=data_module,
+                                          fusion_model=ExampleModel)
+
+    # Evaluate the model by plotting the real values vs. predicted values
+    RealsVsPreds_figure = RealsVsPreds.from_final_val_data(trained_model)
+    plt.show()
 
+```
 
-Florence Townend, Patrick J. Roddy, & Philipp Goebl. (2024). florencejt/fusilli: Fusilli v1.1.0 (v1.1.0). Zenodo. https://doi.org/10.5281/zenodo.10463697
+## How to Cite
 
+Florence Townend, Patrick J. Roddy, & Philipp Goebl. (2024). florencejt/fusilli: Fusilli v1.1.0 (v1.1.0).
+Zenodo. https://doi.org/10.5281/zenodo.10463697
 
 ## Contribute!
 

diff --git a/...s/training_and_testing/images/thumb/sphx_glr_plot_using_external_data_thumb.png b/...s/training_and_testing/images/thumb/sphx_glr_plot_using_external_data_thumb.png
diff --git a/docs/auto_examples/training_and_testing/plot_using_external_data.ipynb b/docs/auto_examples/training_and_testing/plot_using_external_data.ipynb
@@ -0,0 +1,97 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Using External Test Data\n\nLet's learn how to use external test data with Fusilli!\nSome guidance can also be found in the `Data Loading <data-loading>` section of the documentation.\n\nThe extra step that we need to take is to provide the paths to the test data files to the functions that create evaluation figures: :class:`~fusilli.eval.RealsVsPreds.from_new_data`, :class:`~fusilli.eval.ConfusionMatrix.from_new_data`, :class:`~fusilli.eval.ModelComparison.from_new_data`.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>It is not possible to use external test data with graph-based fusion models.</p></div>\n\n\nWe'll rush through the first few steps of the training and testing process, as they are covered in more detail in the other example notebooks.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\nfrom tqdm.auto import tqdm\nimport os\n\nfrom docs.examples import generate_sklearn_simulated_data\nfrom fusilli.data import prepare_fusion_data\nfrom fusilli.eval import RealsVsPreds, ModelComparison\nfrom fusilli.train import train_and_save_models\nfrom fusilli.utils.model_chooser import import_chosen_fusion_models\n\n# sphinx_gallery_thumbnail_number = -1\n\n\nmodel_conditions = {\n    \"class_name\": [\"ConcatTabularData\"],\n}\n\nfusion_models = import_chosen_fusion_models(model_conditions)\n\n# Regression task\nprediction_task = \"regression\"\n\n# Set the batch size\nbatch_size = 48\n\n# Setting output directories\noutput_paths = {\n    \"losses\": \"loss_logs/external_data\",\n    \"checkpoints\": \"checkpoints/external_data\",\n    \"figures\": \"figures/external_data\",\n}\n\nfor dir in output_paths.values():\n    os.makedirs(dir, exist_ok=True)\n\n# Clearing the loss logs directory (only for the example notebooks)\nfor dir in os.listdir(output_paths[\"losses\"]):\n    # remove files\n    for file in os.listdir(os.path.join(output_paths[\"losses\"], dir)):\n        os.remove(os.path.join(output_paths[\"losses\"], dir, file))\n    # remove dir\n    os.rmdir(os.path.join(output_paths[\"losses\"], dir))\n\ntabular1_path, tabular2_path = generate_sklearn_simulated_data(prediction_task,\n                                                               num_samples=500,\n                                                               num_tab1_features=10,\n                                                               num_tab2_features=20)\n\nexternal_tabular1_path, external_tabular2_path = generate_sklearn_simulated_data(prediction_task,\n                                                                                 num_samples=100,\n                                                                                 num_tab1_features=10,\n                                                                                 
num_tab2_features=20,\n                                                                                 external=True)\ndata_paths = {\n    \"tabular1\": tabular1_path,\n    \"tabular2\": tabular2_path,\n    \"image\": \"\",\n}\n\nexternal_data_paths = {\n    \"tabular1\": external_tabular1_path,\n    \"tabular2\": external_tabular2_path,\n    \"image\": \"\",\n}\n\nfusion_model = fusion_models[0]\n\nprint(\"Method name:\", fusion_model.method_name)\nprint(\"Modality type:\", fusion_model.modality_type)\nprint(\"Fusion type:\", fusion_model.fusion_type)\n\n# Create the data module\ndm = prepare_fusion_data(prediction_task=prediction_task,\n                         fusion_model=fusion_model,\n                         data_paths=data_paths,\n                         output_paths=output_paths,\n                         batch_size=batch_size, )\n\n# train and test\ntrained_model = train_and_save_models(\n    data_module=dm,\n    fusion_model=fusion_model,\n    enable_checkpointing=True,\n    show_loss_plot=True,\n)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Evaluating with validation data\nWe'll start by evaluating the model with the validation data.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "reals_preds_validation = RealsVsPreds.from_final_val_data(trained_model)\nplt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Evaluating with external data\nNow we'll evaluate the model with the external data.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "reals_preds_external = RealsVsPreds.from_new_data(trained_model,\n                                                  output_paths=output_paths,\n                                                  test_data_paths=external_data_paths)\nplt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Removing checkpoint files\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "for dir in os.listdir(output_paths[\"checkpoints\"]):\n    # remove files\n    os.remove(os.path.join(output_paths[\"checkpoints\"], dir))"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.16"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/docs/auto_examples/training_and_testing/plot_using_external_data.py b/docs/auto_examples/training_and_testing/plot_using_external_data.py
@@ -0,0 +1,128 @@
+"""
+Using External Test Data
+========================================================================
+
+Let's learn how to use external test data with Fusilli!
+Some guidance can also be found in the :ref:`Data Loading <data-loading>` section of the documentation.
+
+The extra step that we need to take is to provide the paths to the test data files to the methods that create evaluation figures: :meth:`~fusilli.eval.RealsVsPreds.from_new_data`, :meth:`~fusilli.eval.ConfusionMatrix.from_new_data`, :meth:`~fusilli.eval.ModelComparison.from_new_data`.
+
+.. note::
+
+    It is not possible to use external test data with graph-based fusion models.
+
+
+We'll rush through the first few steps of the training and testing process, as they are covered in more detail in the other example notebooks.
+
+"""
+
+import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
+import os
+
+from docs.examples import generate_sklearn_simulated_data
+from fusilli.data import prepare_fusion_data
+from fusilli.eval import RealsVsPreds, ModelComparison
+from fusilli.train import train_and_save_models
+from fusilli.utils.model_chooser import import_chosen_fusion_models
+
+# sphinx_gallery_thumbnail_number = -1
+
+
+model_conditions = {
+    "class_name": ["ConcatTabularData"],
+}
+
+fusion_models = import_chosen_fusion_models(model_conditions)
+
+# Regression task
+prediction_task = "regression"
+
+# Set the batch size
+batch_size = 48
+
+# Setting output directories
+output_paths = {
+    "losses": "loss_logs/external_data",
+    "checkpoints": "checkpoints/external_data",
+    "figures": "figures/external_data",
+}
+
+for dir in output_paths.values():
+    os.makedirs(dir, exist_ok=True)
+
+# Clearing the loss logs directory (only for the example notebooks)
+for dir in os.listdir(output_paths["losses"]):
+    # remove files
+    for file in os.listdir(os.path.join(output_paths["losses"], dir)):
+        os.remove(os.path.join(output_paths["losses"], dir, file))
+    # remove dir
+    os.rmdir(os.path.join(output_paths["losses"], dir))
+
+tabular1_path, tabular2_path = generate_sklearn_simulated_data(prediction_task,
+                                                               num_samples=500,
+                                                               num_tab1_features=10,
+                                                               num_tab2_features=20)
+
+external_tabular1_path, external_tabular2_path = generate_sklearn_simulated_data(prediction_task,
+                                                                                 num_samples=100,
+                                                                                 num_tab1_features=10,
+                                                                                 num_tab2_features=20,
+                                                                                 external=True)
+data_paths = {
+    "tabular1": tabular1_path,
+    "tabular2": tabular2_path,
+    "image": "",
+}
+
+external_data_paths = {
+    "tabular1": external_tabular1_path,
+    "tabular2": external_tabular2_path,
+    "image": "",
+}
+
+fusion_model = fusion_models[0]
+
+print("Method name:", fusion_model.method_name)
+print("Modality type:", fusion_model.modality_type)
+print("Fusion type:", fusion_model.fusion_type)
+
+# Create the data module
+dm = prepare_fusion_data(prediction_task=prediction_task,
+                         fusion_model=fusion_model,
+                         data_paths=data_paths,
+                         output_paths=output_paths,
+                         batch_size=batch_size, )
+
+# train and test
+trained_model = train_and_save_models(
+    data_module=dm,
+    fusion_model=fusion_model,
+    enable_checkpointing=True,
+    show_loss_plot=True,
+)
+
+# %%
+# Evaluating with validation data
+# -----------------------------------------------
+# We'll start by evaluating the model with the validation data.
+
+reals_preds_validation = RealsVsPreds.from_final_val_data(trained_model)
+plt.show()
+
+# %%
+# Evaluating with external data
+# ----------------------------------------------
+# Now we'll evaluate the model with the external data.
+
+reals_preds_external = RealsVsPreds.from_new_data(trained_model,
+                                                  output_paths=output_paths,
+                                                  test_data_paths=external_data_paths)
+plt.show()
+
+# %%
+# Removing checkpoint files
+
+for dir in os.listdir(output_paths["checkpoints"]):
+    # remove files
+    os.remove(os.path.join(output_paths["checkpoints"], dir))