Merge pull request #8 from ramanathanlab/develop
Develop
braceal authored Mar 17, 2023
2 parents 3359256 + c64b106 commit a1a2f6a
Showing 4 changed files with 146 additions and 36 deletions.
42 changes: 31 additions & 11 deletions README.md
@@ -16,39 +16,59 @@ The computational motif implemented by DeepDriveMD to support ML/AI-coupled simu
## Installation

Create a conda environment
-```bash
+```console
conda create -n deepdrivemd python=3.9 -y
conda activate deepdrivemd
```

To install OpenMM for simulations:
-```bash
+```console
conda install -c conda-forge gcc=12.1.0 -y
conda install -c conda-forge openmm -y
```

To install `deepdrivemd`:
-```bash
+```console
git clone https://github.com/ramanathanlab/deepdrivemd.git
cd deepdrivemd
make install
```

## Usage

-The workflow can be tested locally using mock API's for the tasks by running:
-```bash
-python -m deepdrivemd.workflows.openmm_cvae --test -c tests/basic-local/test.yaml
+The workflow can be tested on a workstation (a system with a few GPUs) via:
+```console
+python -m deepdrivemd.workflows.openmm_cvae -c tests/apps-enabled-workstation/test.yaml
```
This will generate an output directory for the run with logs, results, and task-specific output folders.

-Each test will write a timestamped run directory to the `runs/` directory specified in `tests/basic-local/test.yaml`.
+Each test will write a timestamped experiment output directory to the `runs/` directory.

-To clean up the runs (by default these are ignored by git):
-```bash
-rm -r runs/
+Inside the output directory, you will find:
+```console
+$ ls runs/experiment-170323-091525/
+inference params.yaml result run-info runtime.log simulation train
```
- `params.yaml`: the full configuration file (default parameters included)
- `runtime.log`: the workflow log
- `result`: a directory containing JSON files `simulation.json`, `train.json`, and `inference.json`, which log task results including success or failure, error messages, and runtime statistics. These are helpful for debugging application-level failures.
- `simulation`, `train`, `inference`: output directories, each containing a `run-<uuid>` subdirectory for each submitted task. This is where your applications write their output files (simulation trajectories, preprocessed data, model weights, etc.); it corresponds to the application working directory.
- `run-info`: Parsl logs

As an example, a simulation run directory may look like:
```console
$ ls runs/experiment-170323-091525/simulation/run-08843adb-65e1-47f0-b0f8-34821aa45923/
1FME-unfolded.pdb contact_map.npy input.yaml output.yaml rmsd.npy sim.dcd sim.log
```
- `1FME-unfolded.pdb`: the PDB file used to start the simulation
- `contact_map.npy`, `rmsd.npy`: the preprocessed data files that are input to the train and inference tasks
- `input.yaml`, `output.yaml`: these log the task function inputs and return values; they are helpful for debugging but are not strictly necessary
- `sim.dcd`: the simulation trajectory file containing all the coordinate frames
- `sim.log`: a simulation log detailing the energy, steps taken, ns/day, etc.
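
As a quick sanity check, the preprocessed arrays can be inspected directly with NumPy. The sketch below is illustrative only (the run path is hypothetical, and it assumes each array holds one entry per reported frame):
```python
from pathlib import Path

import numpy as np

# Hypothetical run directory; substitute one of your own run-<uuid> folders.
run_dir = Path("runs/experiment-170323-091525/simulation/run-08843adb-65e1-47f0-b0f8-34821aa45923")

# allow_pickle=True in case the contact maps are stored as a ragged object array.
contact_maps = np.load(run_dir / "contact_map.npy", allow_pickle=True)
rmsds = np.load(run_dir / "rmsd.npy")

print(len(contact_maps), "frames of contact map data")
print(rmsds.min(), "minimum RMSD to the reference structure")
```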

By default the `runs/` directory is ignored by git.

-**Note**: Mock testing is specified in each of the application scripts `deepdrivemd/applications/*/app.py`.
+Production runs can be configured and run analogously. See `examples/bba-folding-workstation/` for a detailed example of folding the [1FME](https://www.rcsb.org/structure/1FME) protein. **The YAML files document the configuration settings and explain the use case**.


## Contributing
12 changes: 9 additions & 3 deletions deepdrivemd/apps/cvae_inference/__init__.py
@@ -1,5 +1,5 @@
from pathlib import Path
-from typing import List, Optional
+from typing import List

from deepdrivemd.api import (
    ApplicationSettings,
@@ -11,8 +11,12 @@

class CVAEInferenceInput(BatchSettings):
    contact_map_paths: List[Path]
    """A list of contact map .npy files to process."""
    rmsd_paths: List[Path]
    """A list of rmsd .npy files to process. The i-th rmsd_path and contact_map_path
    should correspond to the same simulation."""
    model_weight_path: Path
    """The trained model weights .pt file to use for inference."""


class CVAEInferenceOutput(BaseSettings):
@@ -23,12 +27,14 @@ class CVAEInferenceOutput(BaseSettings):


class CVAEInferenceSettings(ApplicationSettings):
-    # Optionally resume training from a checkpoint file
-    checkpoint_path: Optional[Path] = None
    cvae_settings_yaml: Path
    """Path to the CVAE hyperparameters."""
    inference_batch_size: int = 128
    """The batch size to use during inference (a larger batch size will be faster)."""
    sklearn_num_jobs: int = 8
    """The number of cores to use for the sklearn LOF method."""
    num_outliers: int = 120
    """The number of latent space outliers to consider when picking the minimal RMSD structures."""

    # validators
    _checkpoint_path = path_validator("checkpoint_path")
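
For context, these are pydantic-style settings objects. A rough sketch of constructing the inference input by hand follows (the paths are made up, and the field validators may require them to exist on disk):
```python
from pathlib import Path

from deepdrivemd.apps.cvae_inference import CVAEInferenceInput

# Illustrative paths only -- in the workflow these are collected from the
# simulation run directories and the most recent training run.
inference_input = CVAEInferenceInput(
    contact_map_paths=[Path("sim-a/contact_map.npy"), Path("sim-b/contact_map.npy")],
    rmsd_paths=[Path("sim-a/rmsd.npy"), Path("sim-b/rmsd.npy")],  # i-th entry pairs with the i-th contact map
    model_weight_path=Path("train/run-c/model-weights.pt"),
)
```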
46 changes: 24 additions & 22 deletions examples/bba-folding-workstation/cvae-prod-settings.yaml
@@ -1,22 +1,24 @@
-input_shape: [1, 28, 28]
-filters: [16, 16, 16, 16]
-kernels: [3, 3, 3, 3]
-strides: [1, 1, 1, 2]
-affine_widths: [128]
-affine_dropouts: [0.5]
-latent_dim: 3
-lambda_rec: 1.0
-num_data_workers: 4
-prefetch_factor: 2
-batch_size: 64
-device: cuda
-optimizer_name: RMSprop
-optimizer_hparams:
-  lr: 0.001
-  weight_decay: 0.00001
-epochs: 20
-checkpoint_log_every: 20
-plot_log_every: 20
-plot_n_samples: 5000
-plot_method: raw

+# Parameters for a convolutional autoencoder as implemented in the mdlearn package.
+# For additional documentation on the input parameters, see here:
+# https://mdlearn.readthedocs.io/en/latest/pages/_autosummary/mdlearn.nn.models.vae.symmetric_conv2d_vae.html#mdlearn.nn.models.vae.symmetric_conv2d_vae.SymmetricConv2dVAETrainer
+input_shape: [1, 28, 28] # Contact matrix shape, in this case the number of residues in BBA.
+filters: [16, 16, 16, 16] # The convolution filters to use (should be the same length as kernels and strides)
+kernels: [3, 3, 3, 3] # The convolution kernels to use
+strides: [1, 1, 1, 2] # The convolution strides to use
+affine_widths: [128] # The number of neurons in the linear layers (should be the same length as affine_dropouts)
+affine_dropouts: [0.5] # The dropout to use in the linear layers
+latent_dim: 3 # The latent dimension of the autoencoder
+lambda_rec: 1.0 # How much to weight the reconstruction loss vs the KL divergence
+num_data_workers: 4 # The number of parallel data workers for loading data (performance tuning)
+prefetch_factor: 2 # How many batches each data worker should prefetch (performance tuning)
+batch_size: 64 # The batch size to use during training
+device: cuda # The device to train/infer with (cuda or cpu)
+optimizer_name: RMSprop # The optimizer used to train the model
+optimizer_hparams: # See the torch documentation for the above optimizer for details: https://pytorch.org/docs/stable/optim.html
+  lr: 0.001 # Learning rate for the optimizer
+  weight_decay: 0.00001 # Weight decay for the optimizer
+epochs: 20 # The number of epochs to train for; smaller systems generally need fewer epochs
+checkpoint_log_every: 20 # How often to log a model weight checkpoint file (we only use the last one logged, so set to the number of epochs)
+plot_log_every: 20 # How often to log a plot of the autoencoder latent space (helpful for debugging the model -- clustering should be visually apparent)
+plot_n_samples: 5000 # The number of samples to plot
+plot_method: raw # Plot the "raw" latent coordinates in 3D, "PCA" of the embeddings, "TSNE" of the embeddings, etc. See https://mdlearn.readthedocs.io/en/latest/pages/_autosummary/mdlearn.visualize.html
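
Since the header comment states these are the parameters of mdlearn's `SymmetricConv2dVAETrainer`, a minimal sketch of consuming this file might look like the following (assuming the YAML keys map one-to-one onto the trainer's constructor arguments):
```python
import yaml
from mdlearn.nn.models.vae.symmetric_conv2d_vae import SymmetricConv2dVAETrainer

# Load the hyperparameters documented above.
with open("examples/bba-folding-workstation/cvae-prod-settings.yaml") as fp:
    hparams = yaml.safe_load(fp)

# Assumes every YAML key is a valid keyword argument of the trainer.
trainer = SymmetricConv2dVAETrainer(**hparams)
```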
82 changes: 82 additions & 0 deletions examples/bba-folding-workstation/prod.yaml
@@ -1,22 +1,104 @@
# This is an example YAML configuration file for running DeepDriveMD on
# a small workstation to fold the 1FME fast-folding protein. All the data
# for this workflow is self-contained within the repository, including
# folded and unfolded structures. This is the best example to debug with,
# as you can scale the number of GPUs, simulation length, and other settings
# using this small biomolecular system (28 residues) instead of a larger
# compute-intensive system. This workflow configuration takes approximately 8
# hours to run to convergence on 4 V100 GPUs.

# NOTE: There are more parameters that can be configured than are listed
# here. Please refer to deepdrivemd/api.py:DeepDriveMDSettings for details.

# NOTE: simulation_settings, train_settings, and inference_settings encapsulate
# application-specific parameters suited to your biomolecular system and your
# machine learning training and inference algorithms. This is meant to be
# an illustrative example of best practices for configuring your experiments
# and exposing a convenient YAML interface to the input parameters you would like
# to tune. You may find that this (or a different) deep learning model or simulation
# script is suited to multiple problems, and DeepDriveMD is flexible enough to let you
# add your own custom solutions. This workflow is geared towards simulating a system
# from a starting state to some target, given as a PDB file via simulation_settings:rmsd_reference_pdb.
# In this case, we are using it to fold the 1FME protein by minimizing the RMSD to
# the native state. To start your modeling, we recommend using the convolutional
# variational autoencoder configured below as a first step. You may need to adjust
# the inference application if your task cannot be cast as an RMSD minimization problem.


# The simulation input directory. Should contain subfolders with PDB
# files (and optional topology files)
simulation_input_dir: data/1fme
# The number of workers to use for all tasks (3 will be used for simulation,
# 1 will be shared between train/infer tasks)
num_workers: 4
# The number of simulations to run between training jobs (all the data produced
# throughout the duration of the workflow is used for training)
simulations_per_train: 6
# The number of simulations to run between inference jobs (inference is fast,
# we want to select outliers as quickly as possible)
simulations_per_inference: 1
# The total number of simulations to run before the workflow stops (1000 is
# effectively unlimited and requires manually stopping the workflow once
# convergence is confirmed)
num_total_simulations: 1000

# Compute settings can be configured by referring to deepdrivemd/parsl.py
# The `name` field specifies what type of system to run on, and the subsequent
# arguments are conditional on the name field (e.g., a cluster may have a different
# configuration than a workstation).
compute_settings:
  # Specify that we want the workstation Parsl configuration
  name: workstation
  # Identify which GPUs to assign tasks to. It's generally recommended to first check
  # nvidia-smi to see which GPUs are available. The numbers below are analogous to
  # setting CUDA_VISIBLE_DEVICES=0,1,2,3
  available_accelerators: ["0", "1", "2", "3"]

# The simulation settings as exposed in deepdrivemd/apps/openmm_simulation
# This application uses OpenMM as a simulation backend and can be changed
# to suit your modelling needs. To see the full list of tunable parameters,
# see deepdrivemd/apps/openmm_simulation/__init__.py:MDSimulationSettings
simulation_settings:
  # The number of nanoseconds to run each simulation for
  simulation_length_ns: 10
  # How often (in picoseconds) to write a coordinate frame to the DCD file
  report_interval_ps: 10
  # The temperature to simulate at
  temperature_kelvin: 300
  # The reference PDB used to compute the RMSD of each reported frame
  rmsd_reference_pdb: data/1fme/1FME-folded.pdb

# The training settings for the convolutional variational autoencoder (CVAE).
# Full documentation and the paper citation which describes the application of
# the CVAE to contact maps can be found here: https://mdlearn.readthedocs.io/en/latest/pages/_autosummary/mdlearn.nn.models.vae.symmetric_conv2d_vae.html#module-mdlearn.nn.models.vae.symmetric_conv2d_vae
train_settings:
  # Here we pass a YAML file containing all the CVAE parameters (documentation included).
  # This is just to avoid needing to copy and paste parameters into both train_settings and inference_settings.
  cvae_settings_yaml: examples/bba-folding-workstation/cvae-prod-settings.yaml

# The inference settings. For this workflow, the CVAE is periodically retrained
# on all the reported frames of the simulations. The most recent CVAE model weights
# are always used during inference. The inference application is responsible for analyzing
# the reported simulation frames and selecting a small subset of frames that are
# deemed biologically "interesting", which are then used to restart the subsequent simulations.
# The algorithm employed in this application is as follows:
# 1. Encode all the contact maps into the latent space learned by the CVAE.
# 2. Run the Local Outlier Factor (LOF) on the latent embeddings: https://scikit-learn.org/stable/auto_examples/neighbors/plot_lof_outlier_detection.html
# 3. Take the top `num_outliers` outliers, which correspond to the most negative LOF scores.
# 4. From the top outliers, re-sort them according to their RMSD to simulation_settings:rmsd_reference_pdb.
# 5. Repeat this on each call to the inference function, analyzing more and more data from the simulations.
#
# Following this procedure, each time a simulation finishes, the workflow submits a new simulation
# job using the frame corresponding to the next best outlier with minimal RMSD to the target state.
# As the workflow progresses, the simulations begin to sample conformers that are closer to the target reference state.
# To read the inference application logic, please see: deepdrivemd/apps/cvae_inference
inference_settings:
  # The same CVAE parameter file as in train_settings
  cvae_settings_yaml: examples/bba-folding-workstation/cvae-prod-settings.yaml
  # The number of latent space outliers to consider when picking the minimal RMSD structures
  num_outliers: 100

# After reading this example and trying out a few configuration changes, you should
# be able to assess whether your system of interest can be cast as an RMSD
# minimization problem or whether you need to make a small adjustment to the inference
# script to change which frames should be preferred during simulation restarts.
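
To make the selection procedure concrete, here is a small standalone sketch of steps 2-4 above. It is not the repository's implementation; the function name, array names, and defaults are illustrative:
```python
import numpy as np
from sklearn.neighbors import LocalOutlierFactor


def select_restart_frames(
    z: np.ndarray, rmsd: np.ndarray, num_outliers: int = 100, sklearn_num_jobs: int = 8
) -> np.ndarray:
    """Return frame indices: strongest latent-space outliers, ordered by RMSD to the target."""
    # Step 2: fit LOF on the CVAE latent embeddings z with shape (n_frames, latent_dim).
    lof = LocalOutlierFactor(n_jobs=sklearn_num_jobs)
    lof.fit(z)
    # Step 3: the most negative scores correspond to the strongest outliers.
    outlier_idx = np.argsort(lof.negative_outlier_factor_)[:num_outliers]
    # Step 4: re-sort the outliers by RMSD so the lowest-RMSD frames are used first for restarts.
    return outlier_idx[np.argsort(rmsd[outlier_idx])]
```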
