From 13fc5439f499a5b0276ea6af9f3f5edb3d999561 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Thu, 28 Mar 2024 14:19:06 -0400 Subject: [PATCH 1/6] end2end diffusion model training --- .../data/diffusion/data_loader.py | 14 +- .../data/diffusion/data_preprocess.py | 19 +- crystal_diffusion/models/model_loader.py | 47 +++- crystal_diffusion/train_diffusion.py | 248 ++++++++++++++++++ examples/local/config_diffusion.yaml | 34 +++ examples/local/run_diffusion.sh | 16 ++ 6 files changed, 346 insertions(+), 32 deletions(-) create mode 100644 crystal_diffusion/train_diffusion.py create mode 100644 examples/local/config_diffusion.yaml create mode 100755 examples/local/run_diffusion.sh diff --git a/crystal_diffusion/data/diffusion/data_loader.py b/crystal_diffusion/data/diffusion/data_loader.py index c58cf7b2..a2a0ea58 100644 --- a/crystal_diffusion/data/diffusion/data_loader.py +++ b/crystal_diffusion/data/diffusion/data_loader.py @@ -9,11 +9,9 @@ import pytorch_lightning as pl import torch import torch.nn.functional as F -from torch.utils.data import DataLoader - from crystal_diffusion.data.diffusion.data_preprocess import \ LammpsProcessorForDiffusion -from crystal_diffusion.utils.hp_utils import check_and_log_hp +from torch.utils.data import DataLoader logger = logging.getLogger(__name__) @@ -49,7 +47,7 @@ def __init__( lot of disk space. Defaults to None. """ super().__init__() - check_and_log_hp(["batch_size", "num_workers"], hyper_params) # validate the hyperparameters + # check_and_log_hp(["batch_size", "num_workers"], hyper_params) # validate the hyperparameters # TODO add the padding parameters for number of atoms self.lammps_run_dir = lammps_run_dir self.processed_dataset_dir = processed_dataset_dir @@ -68,7 +66,7 @@ def dataset_transform(x: Dict[typing.AnyStr, typing.Any], spatial_dim: int = 3) Args: x: raw columns from the processed data files. Should contain natom, box, type, position and - reduced_position. + relative_positions. spatial_dim (optional): number of spatial dimensions. Defaults to 3. Returns: @@ -78,7 +76,7 @@ def dataset_transform(x: Dict[typing.AnyStr, typing.Any], spatial_dim: int = 3) transformed_x['natom'] = torch.as_tensor(x['natom']).long() # resulting tensor size: (batchsize, ) bsize = transformed_x['natom'].size(0) transformed_x['box'] = torch.as_tensor(x['box']) # size: (batchsize, spatial dimension) - for pos in ['position', 'reduced_position']: + for pos in ['position', 'relative_positions']: transformed_x[pos] = torch.as_tensor(x[pos]).view(bsize, -1, spatial_dim) transformed_x['type'] = torch.as_tensor(x['type']).long() # size: (batchsize, max atom) @@ -89,7 +87,7 @@ def pad_samples(x: Dict[typing.AnyStr, typing.Any], max_atom: int, spatial_dim: """Pad a sample for batching. Args: - x: initial sample from the dataset. Should contain natom, position, reduced_position and type. + x: initial sample from the dataset. Should contain natom, position, relative_positions and type. max_atom: maximum number of atoms to pad to spatial_dim (optional): number of spatial dimensions. Defaults to 3. 
@@ -100,7 +98,7 @@ def pad_samples(x: Dict[typing.AnyStr, typing.Any], max_atom: int, spatial_dim: if natom > max_atom: raise ValueError(f"Hyper-parameter max_atom is smaller than an example in the dataset with {natom} atoms.") x['type'] = F.pad(torch.as_tensor(x['type']).long(), (0, max_atom - natom), 'constant', -1) - for pos in ['position', 'reduced_position']: + for pos in ['position', 'relative_positions']: x[pos] = F.pad(torch.as_tensor(x[pos]).float(), (0, spatial_dim * (max_atom - natom)), 'constant', torch.nan) return x diff --git a/crystal_diffusion/data/diffusion/data_preprocess.py b/crystal_diffusion/data/diffusion/data_preprocess.py index 34aa4661..f0105976 100644 --- a/crystal_diffusion/data/diffusion/data_preprocess.py +++ b/crystal_diffusion/data/diffusion/data_preprocess.py @@ -5,7 +5,6 @@ from typing import List, Optional import pandas as pd - from crystal_diffusion.data.parse_lammps_outputs import parse_lammps_output logger = logging.getLogger(__name__) @@ -55,30 +54,30 @@ def prepare_data(self, raw_data_dir: str, mode: str = 'train') -> List[str]: return list_files @staticmethod - def _convert_coords_to_reduced(row: pd.Series) -> List[float]: - """Convert a dataframe row to reduced coordinates. + def _convert_coords_to_relative(row: pd.Series) -> List[float]: + """Convert a dataframe row to relative coordinates. Args: row: entry in the dataframe. Should contain box, x, y and z Returns: - x, y and z in reduced coordinates + x, y and z in relative (reduced) coordinates """ x_lim, y_lim, z_lim = row['box'] coord_red = [coord for triple in zip(row['x'], row['y'], row['z']) for coord in (triple[0] / x_lim, triple[1] / y_lim, triple[2] / z_lim)] return coord_red - def get_x_reduced(self, df: pd.DataFrame) -> pd.DataFrame: - """Add a column with reduced x,y, z coordinates. + def get_x_relative(self, df: pd.DataFrame) -> pd.DataFrame: + """Add a column with relative x,y, z coordinates. Args: df: dataframe with atomic positions. Should contain box, x, y and z. Returns: - dataframe with added column of reduced positions [x1, y1, z1, x2, y2, ...] + dataframe with added column of relative positions [x1, y1, z1, x2, y2, ...] 
""" - df['reduced_position'] = df.apply(lambda x: self._convert_coords_to_reduced(x), axis=1) + df['relative_positions'] = df.apply(lambda x: self._convert_coords_to_relative(x), axis=1) return df def parse_lammps_run(self, run_dir: str) -> Optional[pd.DataFrame]: @@ -114,11 +113,11 @@ def parse_lammps_run(self, run_dir: str) -> Optional[pd.DataFrame]: # TODO consider filtering out samples with large forces and MD steps that are too similar # TODO large force and similar are to be defined df = df[['type', 'x', 'y', 'z', 'box']] - df = self.get_x_reduced(df) # add reduced coordinates + df = self.get_x_relative(df) # add relative coordinates df['natom'] = df['type'].apply(lambda x: len(x)) # count number of atoms in a structure # naive implementation: a list of list which is converted into a 2d array by torch later # but a list of list is not ok with the writing on files with parquet df['position'] = df.apply(lambda x: [j for i in ['x', 'y', 'z'] for j in x[i]], axis=1) # position as 3d array # position is natom * 3 array # TODO unit test to check the order after reshape - return df[['natom', 'box', 'type', 'position', 'reduced_position']] + return df[['natom', 'box', 'type', 'position', 'relative_positions']] diff --git a/crystal_diffusion/models/model_loader.py b/crystal_diffusion/models/model_loader.py index 197189be..4a13bcb0 100644 --- a/crystal_diffusion/models/model_loader.py +++ b/crystal_diffusion/models/model_loader.py @@ -1,28 +1,47 @@ +"""Functions to instantiate a model based on the provided hyperparameters.""" import logging +from typing import Any, AnyStr, Dict -from crystal_diffusion.models.score_network import MLPScoreNetwork +from crystal_diffusion.models.optimizer import (OptimizerParameters, + ValidOptimizerNames) +from crystal_diffusion.models.position_diffusion_lightning_model import ( + PositionDiffusionLightningModel, PositionDiffusionParameters) +from crystal_diffusion.models.score_network import MLPScoreNetworkParameters +from crystal_diffusion.samplers.variance_sampler import NoiseParameters logger = logging.getLogger(__name__) -def load_model(hyper_params): # pragma: no cover - """Instantiate a model. +def load_diffusion_model(hyper_params: Dict[AnyStr, Any]) -> PositionDiffusionLightningModel: + """Load a position diffusion model from the hyperparameters. Args: - hyper_params (dict): hyper parameters from the config file + hyper_params: dictionary of hyperparameters loaded from a config file Returns: - model (obj): A neural network model object. 
+ Diffusion model randomly initialized """ - architecture = hyper_params['architecture'] - # __TODO__ fix architecture list - if architecture == 'simple_mlp': - model_class = MLPScoreNetwork - else: - raise ValueError('architecture {} not supported'.format(architecture)) - logger.info('selected architecture: {}'.format(architecture)) - - model = model_class(hyper_params) + score_network_parameters = MLPScoreNetworkParameters( + number_of_atoms=hyper_params['data']['max_atom'], + **hyper_params['model']['score_network'] + ) + score_network_parameters.spatial_dimension = hyper_params.get('spatial_dimension', 3) + + hyper_params['optimizer']['name'] = ValidOptimizerNames(hyper_params['optimizer']['name']) + + optimizer_parameters = OptimizerParameters( + **hyper_params['optimizer'] + ) + + noise_parameters = NoiseParameters(**hyper_params['model']['noise']) + + diffusion_params = PositionDiffusionParameters( + score_network_parameters=score_network_parameters, + optimizer_parameters=optimizer_parameters, + noise_parameters=noise_parameters, + ) + + model = PositionDiffusionLightningModel(diffusion_params) logger.info('model info:\n' + str(model) + '\n') return model diff --git a/crystal_diffusion/train_diffusion.py b/crystal_diffusion/train_diffusion.py new file mode 100644 index 00000000..ff8fc7a6 --- /dev/null +++ b/crystal_diffusion/train_diffusion.py @@ -0,0 +1,248 @@ +"""Entry point to train a diffusion model.""" +import argparse +import glob +import logging +import os +import shutil +import sys + +import orion +import pytorch_lightning as pl +import yaml +from crystal_diffusion.data.diffusion.data_loader import ( + LammpsForDiffusionDataModule, LammpsLoaderParameters) +from crystal_diffusion.models.model_loader import load_diffusion_model +from crystal_diffusion.utils.file_utils import rsync_folder +from crystal_diffusion.utils.hp_utils import check_and_log_hp +from crystal_diffusion.utils.logging_utils import LoggerWriter, log_exp_details +from crystal_diffusion.utils.reproducibility_utils import set_seed +from orion.client import report_results +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from yaml import load + +logger = logging.getLogger(__name__) + +BEST_MODEL_NAME = 'best_model' +LAST_MODEL_NAME = 'last_model' + + +def main(): + """Create and train a diffusion model: main entry point of the program. + + Note: + This main.py file is meant to be called using the cli, + see the `examples/local/run_diffusion.sh` file to see how to use it. + + """ + parser = argparse.ArgumentParser() + # __TODO__ check you need all the following CLI parameters + parser.add_argument('--log', help='log to this file (in addition to stdout/err)') + parser.add_argument('--config', + help='config file with generic hyper-parameters, such as optimizer, ' + 'batch_size, ... - in yaml format') + parser.add_argument('--data', help='path to a LAMMPS data set', required=True) + parser.add_argument('--processed_datadir', help='path to the processed data directory', required=True) + parser.add_argument('--dataset_working_dir', help='path to the Datasets working directory. 
Defaults to None', + default=None) + parser.add_argument('--tmp-folder', + help='will use this folder as working folder - it will copy the input data ' + 'here, generate results here, and then copy them back to the output ' + 'folder') # TODO possibly remove this + parser.add_argument('--output', help='path to outputs - will store files here', required=True) + parser.add_argument('--disable-progressbar', action='store_true', + help='will disable the progressbar while going over the mini-batch') + parser.add_argument('--start-from-scratch', action='store_true', + help='will not load any existing saved model - even if present') + parser.add_argument('--accelerator', help='PL trainer accelerator. Defaults to auto.', default='auto') + parser.add_argument('--devices', default=1, help='pytorch-lightning devices kwarg. Defaults to 1.') + parser.add_argument('--debug', action='store_true') # TODO not used yet + args = parser.parse_args() + + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + if os.path.exists(args.output) and args.start_from_scratch: + logger.info('Starting from scratch, removing any previous experiments.') + shutil.rmtree(args.output) + + if os.path.exists(args.output): + logger.info("Previous experiment found, resuming from checkpoint") + else: + os.makedirs(args.output) + + if args.tmp_folder is not None: + # TODO data rsync to tmp_folder + output_dir = os.path.join(args.tmp_folder, 'output') + if not os.path.exists(output_dir): + os.makedirs(output_dir) + else: + output_dir = args.output + + # will log to a file if provided (useful for orion on cluster) + if args.log is not None: + handler = logging.handlers.WatchedFileHandler(args.log) + formatter = logging.Formatter(logging.BASIC_FORMAT) + handler.setFormatter(formatter) + root = logging.getLogger() + root.setLevel(logging.INFO) + root.addHandler(handler) + + # to intercept any print statement: + sys.stdout = LoggerWriter(logger.info) + sys.stderr = LoggerWriter(logger.warning) + + if args.config is not None: + with open(args.config, 'r') as stream: + hyper_params = load(stream, Loader=yaml.FullLoader) + else: + hyper_params = {} + + run(args, output_dir, hyper_params) + + if args.tmp_folder is not None: + rsync_folder(output_dir + os.path.sep, args.output) + + +def run(args, output_dir, hyper_params): + """Create and run the dataloaders, training loops, etc. 
+ + Args: + args (object): arguments passed from the cli + output_dir (str): path to output folder + hyper_params (dict): hyper parameters from the config file + """ + # __TODO__ change the hparam that are used from the training algorithm + # (and NOT the model - these will be specified in the model itself) + logger.info('List of hyper-parameters:') + check_and_log_hp( + ['model', 'data', 'exp_name', 'max_epoch', 'optimizer', 'seed', + 'early_stopping'], + hyper_params) + + if hyper_params["seed"] is not None: + set_seed(hyper_params["seed"]) + + log_exp_details(os.path.realpath(__file__), args) + + data_params = LammpsLoaderParameters(**hyper_params['data']) + + datamodule = LammpsForDiffusionDataModule( + lammps_run_dir=args.data, + processed_dataset_dir=args.processed_datadir, + hyper_params=data_params, + working_cache_dir=args.dataset_working_dir, + ) + + model = load_diffusion_model(hyper_params) + + train(model=model, datamodule=datamodule, output=output_dir, hyper_params=hyper_params, + use_progress_bar=not args.disable_progressbar, accelerator=args.accelerator, devices=args.devices) + + # clean up the data cache to save disk space + datamodule.clean_up() + + +def train(**kwargs): # pragma: no cover + """Training loop wrapper. Used to catch exception if Orion is being used.""" + try: + best_dev_metric = train_impl(**kwargs) + except RuntimeError as err: + if orion.client.cli.IS_ORION_ON and 'CUDA out of memory' in str(err): + logger.error(err) + logger.error('model was out of memory - assigning a bad score to tell Orion to avoid' + 'too big model') + best_dev_metric = -999 + else: + raise err + + report_results([dict( + name='dev_metric', + type='objective', + # note the minus - cause orion is always trying to minimize (cit. from the guide) + value=-float(best_dev_metric))]) + + +def train_impl(model, datamodule, output, hyper_params, use_progress_bar, accelerator=None, devices=None + ): # pragma: no cover + """Train a model: main training loop implementation. + + Args: + model (obj): The neural network model object. + datamodule (obj): lightning data module that will instantiate data loaders. + output (str): Output directory. + hyper_params (dict): Dict containing hyper-parameters. + use_progress_bar (bool): Use tqdm progress bar (can be disabled when logging). 
+ accelerator: PL trainer accelerator + devices: PL devices to use + """ + check_and_log_hp(['max_epoch'], hyper_params) + + best_model_path = os.path.join(output, BEST_MODEL_NAME) + best_checkpoint_callback = ModelCheckpoint( + dirpath=best_model_path, + filename='model', + save_top_k=1, + verbose=use_progress_bar, + monitor="val_loss", + mode="max", + every_n_epochs=1, + ) + + last_model_path = os.path.join(output, LAST_MODEL_NAME) + last_checkpoint_callback = ModelCheckpoint( + dirpath=last_model_path, + filename='model', + verbose=use_progress_bar, + every_n_epochs=1, + ) + + # TODO pl Trainer does not use the kwarg resume_from_checkpoint now - check about resume training works now + # resume_from_checkpoint = handle_previous_models(output, last_model_path, best_model_path) + + + early_stopping_params = hyper_params['early_stopping'] + check_and_log_hp(['metric', 'mode', 'patience'], hyper_params['early_stopping']) + early_stopping = EarlyStopping( + early_stopping_params['metric'], + mode=early_stopping_params['mode'], + patience=early_stopping_params['patience'], + verbose=use_progress_bar) + + logger = pl.loggers.TensorBoardLogger( + save_dir=output, + default_hp_metric=False, + version=0, # Necessary to resume tensorboard logging + ) + + trainer = pl.Trainer( + callbacks=[early_stopping, best_checkpoint_callback, last_checkpoint_callback], + max_epochs=hyper_params['max_epoch'], + # resume_from_checkpoint=resume_from_checkpoint, + accelerator=accelerator, + devices=devices, + logger=logger, + ) + + trainer.fit(model, datamodule=datamodule) + + # Log the best result and associated hyper parameters + best_dev_result = float(early_stopping.best_score.cpu().numpy()) + logger.log_hyperparams(hyper_params, metrics={'best_dev_metric': best_dev_result}) + + return best_dev_result + + +def handle_previous_models(output, last_model_path, best_model_path): + """Move the previous models in a new timestamp folder.""" + last_models = glob.glob(last_model_path + os.sep + '*') + + if len(last_models) >= 1: + resume_from_checkpoint = sorted(last_models)[-1] + logger.info(f'models found - resuming from {resume_from_checkpoint}') + else: + logger.info('no model found - starting training from scratch') + resume_from_checkpoint = None + return resume_from_checkpoint + + +if __name__ == '__main__': + main() diff --git a/examples/local/config_diffusion.yaml b/examples/local/config_diffusion.yaml new file mode 100644 index 00000000..042f5fbe --- /dev/null +++ b/examples/local/config_diffusion.yaml @@ -0,0 +1,34 @@ +# general +loss: cross_entropy +max_epoch: 5 +exp_name: exp_example +# set to null to avoid setting a seed (can speed up GPU computation, but +# results will not be reproducible) +seed: 1234 + +# data +data: + batch_size: 32 + num_workers: 0 + max_atom: 512 + +# architecture +spatial_dimension: 3 +model: + score_network: + hidden_dimensions: [16, 16] # dimensions of the hidden layers. 
Length of array determines number of la + noise: + total_time_steps: 10 + sigma_min: 0.005 # default value + sigma_max: 0.5 # default value + +# optimizer +optimizer: + name: adam + learning_rate: 0.001 + +# early stopping +early_stopping: + metric: val_loss + mode: min + patience: 3 \ No newline at end of file diff --git a/examples/local/run_diffusion.sh b/examples/local/run_diffusion.sh new file mode 100755 index 00000000..8e1d3a7e --- /dev/null +++ b/examples/local/run_diffusion.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +LOG=debug.log +CONFIG=config_diffusion.yaml +DATA_DIR=../../data/si_diffusion_v1 +PROCESSED_DATA=${DATA_DIR}/processed +DATA_WORK_DIR=./tmp_work_dir/ +OUTPUT=debug + +python ../../crystal_diffusion/train_diffusion.py \ + --log $LOG \ + --config $CONFIG \ + --data $DATA_DIR \ + --processed_datadir $PROCESSED_DATA \ + --dataset_working_dir $DATA_WORK_DIR \ + --output $OUTPUT From 497b9eee789bd4ced0dfd64cff2073f558a6a370 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Thu, 28 Mar 2024 14:27:42 -0400 Subject: [PATCH 2/6] fixing unit tests --- tests/data/diffusion/test_data_loader.py | 6 +++--- tests/data/diffusion/test_data_preprocess.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/data/diffusion/test_data_loader.py b/tests/data/diffusion/test_data_loader.py index a3ef4cd4..49d88482 100644 --- a/tests/data/diffusion/test_data_loader.py +++ b/tests/data/diffusion/test_data_loader.py @@ -12,14 +12,14 @@ def input_data_to_transform(self): 'natom': [2], # batch size of 1 'box': [[1.0, 1.0, 1.0]], 'position': [[1., 2., 3, 4., 5, 6]], # for one batch, two atoms, 3D positions - 'reduced_position': [[1., 2., 3, 4., 5, 6]], + 'relative_positions': [[1., 2., 3, 4., 5, 6]], 'type': [[1, 2]] } def test_dataset_transform(self, input_data_to_transform): result = LammpsForDiffusionDataModule.dataset_transform(input_data_to_transform) # Check keys in result - assert set(result.keys()) == {'natom', 'position', 'reduced_position', 'box', 'type'} + assert set(result.keys()) == {'natom', 'position', 'relative_positions', 'box', 'type'} # Check tensor types and shapes assert torch.equal(result['natom'], torch.tensor([2]).long()) @@ -39,7 +39,7 @@ def input_data_to_pad(self): 'natom': 2, # batch size of 1 'box': [1.0, 1.0, 1.0], 'position': [1., 2., 3, 4., 5, 6], # for one batch, two atoms, 3D positions - 'reduced_position': [1., 2., 3, 4., 5, 6], + 'relative_positions': [1., 2., 3, 4., 5, 6], 'type': [1, 2] } diff --git a/tests/data/diffusion/test_data_preprocess.py b/tests/data/diffusion/test_data_preprocess.py index 3354e1a6..bc587bd6 100644 --- a/tests/data/diffusion/test_data_preprocess.py +++ b/tests/data/diffusion/test_data_preprocess.py @@ -86,7 +86,7 @@ def test_parse_lammps_run(mock_processor, mock_parse_lammps_output, tmp_path): assert 'box' in df.columns assert 'type' in df.columns assert 'position' in df.columns - assert 'reduced_position' in df.columns + assert 'relative_positions' in df.columns @pytest.fixture @@ -105,14 +105,14 @@ def sample_coordinates(box_coordinates): }) -def test_convert_coords_to_reduced(sample_coordinates, box_coordinates): +def test_convert_coords_to_relative(sample_coordinates, box_coordinates): # Expected output: Each coordinate divided by 1, 2, 3 (the box limits) for index, row in sample_coordinates.iterrows(): - reduced_coords = LammpsProcessorForDiffusion._convert_coords_to_reduced(row) + relative_coords = LammpsProcessorForDiffusion._convert_coords_to_relative(row) expected_coords = [] for x, y, z in 
zip(row['x'], row['y'], row['z']): expected_coords.extend([x / box_coordinates[0], y / box_coordinates[1], z / box_coordinates[2]]) - assert reduced_coords == expected_coords + assert relative_coords == expected_coords @pytest.fixture @@ -123,9 +123,9 @@ def mock_prepare_data(): yield mock_prepare -def test_get_x_reduced(mock_prepare_data, sample_coordinates, tmpdir): - # Call get_x_reduced on the test data +def test_get_x_relative(mock_prepare_data, sample_coordinates, tmpdir): + # Call get_x_relative on the test data lp = LammpsProcessorForDiffusion(tmpdir, tmpdir) - result_df = lp.get_x_reduced(sample_coordinates) - # Check if 'reduced_position' column is added - assert 'reduced_position' in result_df.columns + result_df = lp.get_x_relative(sample_coordinates) + # Check if 'relative_positions' column is added + assert 'relative_positions' in result_df.columns From 79b426649edf3b687f9c6ef85bc39dcdd2d3141c Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Thu, 28 Mar 2024 14:29:05 -0400 Subject: [PATCH 3/6] extra white line --- crystal_diffusion/train_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/crystal_diffusion/train_diffusion.py b/crystal_diffusion/train_diffusion.py index ff8fc7a6..6a688722 100644 --- a/crystal_diffusion/train_diffusion.py +++ b/crystal_diffusion/train_diffusion.py @@ -198,7 +198,6 @@ def train_impl(model, datamodule, output, hyper_params, use_progress_bar, accele # TODO pl Trainer does not use the kwarg resume_from_checkpoint now - check about resume training works now # resume_from_checkpoint = handle_previous_models(output, last_model_path, best_model_path) - early_stopping_params = hyper_params['early_stopping'] check_and_log_hp(['metric', 'mode', 'patience'], hyper_params['early_stopping']) early_stopping = EarlyStopping( From 5dfffac3cc26432600d4aeb5bc0ab7343633e342 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Thu, 28 Mar 2024 14:50:54 -0400 Subject: [PATCH 4/6] isort error? 
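The lint failure here is most likely isort's import grouping rather than a real error: isort classifies crystal_diffusion as a first-party package, so it expects third-party imports (pytorch_lightning, torch, orion, yaml, ...) in one block and the crystal_diffusion imports in their own block after a blank line, whereas the first commit had merged everything into a single alphabetical block. The hunks below restore that grouping; for data_loader.py the expected layout is:

    import pytorch_lightning as pl
    import torch
    import torch.nn.functional as F
    from torch.utils.data import DataLoader

    from crystal_diffusion.data.diffusion.data_preprocess import \
        LammpsProcessorForDiffusion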
--- crystal_diffusion/data/diffusion/data_loader.py | 3 ++- crystal_diffusion/data/diffusion/data_preprocess.py | 1 + crystal_diffusion/train_diffusion.py | 7 ++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/crystal_diffusion/data/diffusion/data_loader.py b/crystal_diffusion/data/diffusion/data_loader.py index a2a0ea58..12ac202e 100644 --- a/crystal_diffusion/data/diffusion/data_loader.py +++ b/crystal_diffusion/data/diffusion/data_loader.py @@ -9,9 +9,10 @@ import pytorch_lightning as pl import torch import torch.nn.functional as F +from torch.utils.data import DataLoader + from crystal_diffusion.data.diffusion.data_preprocess import \ LammpsProcessorForDiffusion -from torch.utils.data import DataLoader logger = logging.getLogger(__name__) diff --git a/crystal_diffusion/data/diffusion/data_preprocess.py b/crystal_diffusion/data/diffusion/data_preprocess.py index f0105976..a00208f5 100644 --- a/crystal_diffusion/data/diffusion/data_preprocess.py +++ b/crystal_diffusion/data/diffusion/data_preprocess.py @@ -5,6 +5,7 @@ from typing import List, Optional import pandas as pd + from crystal_diffusion.data.parse_lammps_outputs import parse_lammps_output logger = logging.getLogger(__name__) diff --git a/crystal_diffusion/train_diffusion.py b/crystal_diffusion/train_diffusion.py index 6a688722..1a87e274 100644 --- a/crystal_diffusion/train_diffusion.py +++ b/crystal_diffusion/train_diffusion.py @@ -9,6 +9,10 @@ import orion import pytorch_lightning as pl import yaml +from orion.client import report_results +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from yaml import load + from crystal_diffusion.data.diffusion.data_loader import ( LammpsForDiffusionDataModule, LammpsLoaderParameters) from crystal_diffusion.models.model_loader import load_diffusion_model @@ -16,9 +20,6 @@ from crystal_diffusion.utils.hp_utils import check_and_log_hp from crystal_diffusion.utils.logging_utils import LoggerWriter, log_exp_details from crystal_diffusion.utils.reproducibility_utils import set_seed -from orion.client import report_results -from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint -from yaml import load logger = logging.getLogger(__name__) From e8b91eaba957b82af4330d9a822b791ce19f9bd3 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Thu, 28 Mar 2024 15:10:19 -0400 Subject: [PATCH 5/6] missing fnct in model_loader that should be removed in the future --- crystal_diffusion/models/model_loader.py | 26 +++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/crystal_diffusion/models/model_loader.py b/crystal_diffusion/models/model_loader.py index 4a13bcb0..e4d1c1c3 100644 --- a/crystal_diffusion/models/model_loader.py +++ b/crystal_diffusion/models/model_loader.py @@ -6,7 +6,8 @@ ValidOptimizerNames) from crystal_diffusion.models.position_diffusion_lightning_model import ( PositionDiffusionLightningModel, PositionDiffusionParameters) -from crystal_diffusion.models.score_network import MLPScoreNetworkParameters +from crystal_diffusion.models.score_network import (MLPScoreNetwork, + MLPScoreNetworkParameters) from crystal_diffusion.samplers.variance_sampler import NoiseParameters logger = logging.getLogger(__name__) @@ -45,3 +46,26 @@ def load_diffusion_model(hyper_params: Dict[AnyStr, Any]) -> PositionDiffusionLi logger.info('model info:\n' + str(model) + '\n') return model + + +def load_model(hyper_params): # pragma: no cover + """Instantiate a model. 
+ + Args: + hyper_params (dict): hyper parameters from the config file + + Returns: + model (obj): A neural network model object. + """ + architecture = hyper_params['architecture'] + # __TODO__ fix architecture list + if architecture == 'simple_mlp': + model_class = MLPScoreNetwork + else: + raise ValueError('architecture {} not supported'.format(architecture)) + logger.info('selected architecture: {}'.format(architecture)) + + model = model_class(hyper_params) + logger.info('model info:\n' + str(model) + '\n') + + return model From c9080693f5eb35122c5056b77a9df0abd4454d7e Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Thu, 4 Apr 2024 09:53:11 -0400 Subject: [PATCH 6/6] fix truncated comment --- examples/local/config_diffusion.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/local/config_diffusion.yaml b/examples/local/config_diffusion.yaml index 042f5fbe..8ed5fa27 100644 --- a/examples/local/config_diffusion.yaml +++ b/examples/local/config_diffusion.yaml @@ -16,7 +16,7 @@ data: spatial_dimension: 3 model: score_network: - hidden_dimensions: [16, 16] # dimensions of the hidden layers. Length of array determines number of la + hidden_dimensions: [16, 16] # dimensions of the hidden layers. Length of array determines number of layers noise: total_time_steps: 10 sigma_min: 0.005 # default value
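
For reference, a minimal sketch (not part of the patch series) of how the new pieces fit together outside the train_diffusion.py entry point. It only uses names defined in the diffs above (load_diffusion_model, LammpsLoaderParameters, LammpsForDiffusionDataModule) plus the example config; the LAMMPS data paths are hypothetical placeholders modelled on run_diffusion.sh:

    import yaml

    from crystal_diffusion.data.diffusion.data_loader import (
        LammpsForDiffusionDataModule, LammpsLoaderParameters)
    from crystal_diffusion.models.model_loader import load_diffusion_model

    with open("examples/local/config_diffusion.yaml", "r") as stream:
        hyper_params = yaml.load(stream, Loader=yaml.FullLoader)

    # Builds MLPScoreNetworkParameters from data.max_atom and model.score_network,
    # wraps the optimizer and noise settings, and returns a randomly initialized
    # PositionDiffusionLightningModel (see model_loader.py above).
    model = load_diffusion_model(hyper_params)

    # Mirrors the datamodule construction in run(): raw LAMMPS runs are
    # preprocessed into parquet files with natom, box, type, position and
    # relative_positions columns, then padded up to data.max_atom for batching.
    data_params = LammpsLoaderParameters(**hyper_params["data"])
    datamodule = LammpsForDiffusionDataModule(
        lammps_run_dir="data/si_diffusion_v1",                # hypothetical path
        processed_dataset_dir="data/si_diffusion_v1/processed",
        hyper_params=data_params,
        working_cache_dir=None,
    )

The resulting model and datamodule can then be handed to a pytorch_lightning Trainer in the same way train_impl() does, with the checkpoint and early-stopping callbacks driven by the early_stopping section of the config.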