From 6e298f74fa634c77489b8f285b58ed4d2d345435 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Wed, 17 Jan 2024 15:26:48 +0800 Subject: [PATCH 01/12] add MoFlow std case --- training/benchmarks/moflow/pytorch/README.md | 55 ++++ .../benchmarks/moflow/pytorch/__init__.py | 0 .../moflow/pytorch/config/__init__.py | 2 + .../benchmarks/moflow/pytorch/config/_base.py | 105 +++++++ .../moflow/pytorch/config/mutable_params.py | 5 + .../moflow/pytorch/data/__init__.py | 0 .../moflow/pytorch/data/data_frame_parser.py | 109 +++++++ .../moflow/pytorch/data/data_loader.py | 110 +++++++ .../moflow/pytorch/data/encoding.py | 139 +++++++++ .../moflow/pytorch/data/transform.py | 85 ++++++ .../moflow/pytorch/dataloaders/__init__.py | 0 .../moflow/pytorch/dataloaders/dataloader.py | 68 +++++ .../benchmarks/moflow/pytorch/misc/config.py | 142 +++++++++ .../benchmarks/moflow/pytorch/misc/utils.py | 211 ++++++++++++++ .../moflow/pytorch/model/__init__.py | 6 + .../benchmarks/moflow/pytorch/model/basic.py | 194 +++++++++++++ .../moflow/pytorch/model/coupling.py | 196 +++++++++++++ .../benchmarks/moflow/pytorch/model/glow.py | 270 ++++++++++++++++++ .../benchmarks/moflow/pytorch/model/model.py | 251 ++++++++++++++++ .../benchmarks/moflow/pytorch/model/utils.py | 42 +++ .../moflow/pytorch/optimizers/__init__.py | 0 .../moflow/pytorch/run_pretraining.py | 192 +++++++++++++ .../moflow/pytorch/runtime/__init__.py | 0 .../moflow/pytorch/runtime/arguments.py | 69 +++++ .../moflow/pytorch/runtime/common.py | 93 ++++++ .../pytorch/runtime/distributed_utils.py | 71 +++++ .../moflow/pytorch/runtime/generate.py | 97 +++++++ .../moflow/pytorch/runtime/logger.py | 124 ++++++++ .../moflow/pytorch/train/__init__.py | 0 .../moflow/pytorch/train/evaluator.py | 110 +++++++ .../moflow/pytorch/train/trainer.py | 237 +++++++++++++++ .../moflow/pytorch/train/trainer_adapter.py | 44 +++ .../moflow/pytorch/train/training_state.py | 37 +++ training/nvidia/moflow-pytorch/README.md | 51 ++++ .../moflow-pytorch/config/config_A100x1x1.py | 4 + .../moflow-pytorch/config/config_A100x1x8.py | 4 + .../moflow-pytorch/config/config_A100x2x8.py | 3 + .../config/environment_variables.sh | 8 + .../moflow-pytorch/config/requirements.txt | 2 + .../moflow-pytorch/extern/trainer_adapter.py | 17 ++ 40 files changed, 3153 insertions(+) create mode 100644 training/benchmarks/moflow/pytorch/README.md create mode 100644 training/benchmarks/moflow/pytorch/__init__.py create mode 100755 training/benchmarks/moflow/pytorch/config/__init__.py create mode 100755 training/benchmarks/moflow/pytorch/config/_base.py create mode 100755 training/benchmarks/moflow/pytorch/config/mutable_params.py create mode 100644 training/benchmarks/moflow/pytorch/data/__init__.py create mode 100644 training/benchmarks/moflow/pytorch/data/data_frame_parser.py create mode 100644 training/benchmarks/moflow/pytorch/data/data_loader.py create mode 100644 training/benchmarks/moflow/pytorch/data/encoding.py create mode 100644 training/benchmarks/moflow/pytorch/data/transform.py create mode 100644 training/benchmarks/moflow/pytorch/dataloaders/__init__.py create mode 100644 training/benchmarks/moflow/pytorch/dataloaders/dataloader.py create mode 100644 training/benchmarks/moflow/pytorch/misc/config.py create mode 100644 training/benchmarks/moflow/pytorch/misc/utils.py create mode 100644 training/benchmarks/moflow/pytorch/model/__init__.py create mode 100644 training/benchmarks/moflow/pytorch/model/basic.py create mode 100644 training/benchmarks/moflow/pytorch/model/coupling.py create mode 100644 
training/benchmarks/moflow/pytorch/model/glow.py create mode 100644 training/benchmarks/moflow/pytorch/model/model.py create mode 100644 training/benchmarks/moflow/pytorch/model/utils.py create mode 100644 training/benchmarks/moflow/pytorch/optimizers/__init__.py create mode 100755 training/benchmarks/moflow/pytorch/run_pretraining.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/__init__.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/arguments.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/common.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/distributed_utils.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/generate.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/logger.py create mode 100755 training/benchmarks/moflow/pytorch/train/__init__.py create mode 100755 training/benchmarks/moflow/pytorch/train/evaluator.py create mode 100755 training/benchmarks/moflow/pytorch/train/trainer.py create mode 100755 training/benchmarks/moflow/pytorch/train/trainer_adapter.py create mode 100755 training/benchmarks/moflow/pytorch/train/training_state.py create mode 100644 training/nvidia/moflow-pytorch/README.md create mode 100644 training/nvidia/moflow-pytorch/config/config_A100x1x1.py create mode 100644 training/nvidia/moflow-pytorch/config/config_A100x1x8.py create mode 100644 training/nvidia/moflow-pytorch/config/config_A100x2x8.py create mode 100755 training/nvidia/moflow-pytorch/config/environment_variables.sh create mode 100644 training/nvidia/moflow-pytorch/config/requirements.txt create mode 100644 training/nvidia/moflow-pytorch/extern/trainer_adapter.py diff --git a/training/benchmarks/moflow/pytorch/README.md b/training/benchmarks/moflow/pytorch/README.md new file mode 100644 index 000000000..ad7ee15bc --- /dev/null +++ b/training/benchmarks/moflow/pytorch/README.md @@ -0,0 +1,55 @@ + +## Model Introduction +MoFlow is a model for molecule generation that leverages Normalizing Flows. Normalizing Flows are a class of generative neural networks that directly model the probability density of the data. They consist of a sequence of invertible transformations that convert input data following some hard-to-model distribution into a latent code that follows a normal distribution, which can then be easily used for sampling. + +MoFlow was first introduced by Chengxi Zang et al. in the paper ["MoFlow: An Invertible Flow Model for Generating Molecular Graphs"](https://arxiv.org/pdf/2006.10137.pdf). + + + +## Model source code +This repository includes software from [MoFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow), +licensed under the Apache License, Version 2.0. + +Some of the files in this directory were modified by BAAI in 2024 to support FlagPerf. + +## Dataset +### getting the data +The [original source code repository](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow#getting-the-data) contains a prepare_datasets.sh script that automatically downloads and processes the dataset. By default, the data is downloaded to the /data/ directory in the container. +```bash +bash prepare_datasets.sh +``` +### preprocess the dataset +Start a container built from the [Dockerfile](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/DrugDiscovery/MoFlow/Dockerfile), enter it, and execute the following script to preprocess the dataset.
+```bash +python3 scripts/data_preprocess.py +``` + +### dataset structures +Preview the directory structure of the data directory: +```bash +tree . +``` + +``` +. +├── valid_idx_zinc250k.json +├── zinc250k.csv +└── zinc250k_relgcn_kekulized_ggnp.npz +``` + +| FileName | Size(Bytes) | MD5 | +| ---------------------------------- | ----------- | -------------------------------- | +| valid_idx_zinc250k.json | 187832 | f8045b49a413c31136a0645d30c0b846 | +| zinc250k.csv | 23736231 | cd330eafb7a2cc413b3c9cafaf3efece | +| zinc250k_relgcn_kekulized_ggnp.npz | 375680462 | c91985e309a9f76457169859dbe1e662 | + + +## Checkpoint +- None + +## AI Frameworks && Accelerators supports + +| | Pytorch | Paddle | TensorFlow2 | +| ---------- | ------------------------------------------ | ------ | ----------- | +| Nvidia GPU | [✅](../../nvidia/moflow-pytorch/README.md) | N/A | N/A | diff --git a/training/benchmarks/moflow/pytorch/__init__.py b/training/benchmarks/moflow/pytorch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/moflow/pytorch/config/__init__.py b/training/benchmarks/moflow/pytorch/config/__init__.py new file mode 100755 index 000000000..96e0aae70 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/config/__init__.py @@ -0,0 +1,2 @@ +from ._base import * +from .mutable_params import mutable_params diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py new file mode 100755 index 000000000..b1ff28e92 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/config/_base.py @@ -0,0 +1,105 @@ +# DO NOT MODIFY THESE REQUIRED PARAMETERS + +# Required parameters +vendor: str = None +data_dir: str = None +name: str = "moflow" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True + +# Optional parameters + +# ========================================================= +# data +# ========================================================= +# The config to choose. This parameter allows one to switch between different datasets +# and their dedicated configurations of the neural network. By default, a pre-defined "zinc250k" config is used. +config_name: str = "zinc250k" +# Number of workers in the data loader. +num_workers: int = 4 + +# ========================================================= +# optimizer +# ========================================================= +# Base learning rate. +lr: float = 0.0005 +# beta1 parameter for the optimizer. +beta1: float = 0.9 +# beta2 parameter for the optimizer. +beta2: float = 0.99 +# Gradient clipping norm. +clip: float = 1.0 +# ========================================================= +# train && evaluate +# ========================================================= +# Batch size per GPU for training. +train_batch_size: int = 512 +eval_batch_size: int = 100 + +target_nuv: float = 85.0 + +# Frequency for saving checkpoints, expressed in epochs. If -1 is provided, checkpoints will not be saved. +save_epochs: int = 5 +# Evaluation frequency, expressed in epochs. If -1 is provided, an evaluation will not be performed. +eval_epochs: int = 5 + +# Number of warmup steps. This value is used for benchmarking and for CUDA graph capture. +warmup_steps: int = 20 +# Number of steps used for training/inference. This parameter allows finishing +# training earlier than the specified number of epochs. +# If used with inference, it allows generating more molecules (by default only a single batch of molecules is generated). +steps: int = -1 +# Temperature used for sampling.
+temperature: float = 0.3 +first_epoch: int = 0 +epochs: int = 300 + +allow_untrained = False + +do_train = True +fp16 = False +amp: bool = True +distributed: bool = True + +# Directory where checkpoints are stored +results_dir: str = "moflow_results" +# Path to store generated molecules. If an empty string is provided, predictions will not be saved (useful for benchmarking and debugging). +# predictions_path: str = "moflow_results/predictions.smi" +# ========================================================= +# experiment +# ========================================================= +# Compile the model with `torch.jit.script`. Can be used to speed up training or inference. +jit: bool = False +# Capture GPU kernels with CUDA graphs. This option can speed up training. +cuda_graph: bool = True +# Verbosity level. Specify the following values: 0, 1, 2, 3, where 0 means minimal verbosity (errors only) and 3 means maximal (debugging). +verbosity: int = 1 +# Path for DLLogger log. This file will contain information about the speed and accuracy of the model during training and inference. +# Note that if the file already exists, new logs will be added at the end. +log_path: str = "moflow_results/moflow.json" +# Frequency for writing logs, expressed in steps. +log_interval: int = 20 + + +# Apply validity correction after the generation of the molecules. +correct_validity: bool = False +# ========================================================= +# utils +# ========================================================= +# Random seed used to initialize the distributed loaders +seed: int = 1 +dist_backend: str = 'nccl' + +device: str = None + +# ========================================================= +# for driver +# ========================================================= +# rank of the GPU, used to launch distributed training. +local_rank: int = -1 +use_env: bool = True +log_freq: int = 500 +print_freq: int = 500 +n_device: int = 1 +sync_bn: bool = False +gradient_accumulation_steps: int = 1 diff --git a/training/benchmarks/moflow/pytorch/config/mutable_params.py b/training/benchmarks/moflow/pytorch/config/mutable_params.py new file mode 100755 index 000000000..7dcf8652a --- /dev/null +++ b/training/benchmarks/moflow/pytorch/config/mutable_params.py @@ -0,0 +1,5 @@ +mutable_params = [ + 'vendor', 'data_dir', 'lr', 'train_batch_size', 'eval_batch_size', + 'do_train', 'amp', 'fp16', 'distributed', 'dist_backend', 'num_workers', + 'device', 'cudnn_benchmark', 'cudnn_deterministic' +] diff --git a/training/benchmarks/moflow/pytorch/data/__init__.py b/training/benchmarks/moflow/pytorch/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/moflow/pytorch/data/data_frame_parser.py b/training/benchmarks/moflow/pytorch/data/data_frame_parser.py new file mode 100644 index 000000000..ba76fc439 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/data/data_frame_parser.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +from logging import getLogger +import traceback +from typing import List + +import numpy as np +import pandas as pd +from rdkit import Chem +from tqdm import tqdm + +from moflow.data.encoding import MolEncoder, EncodingError +from moflow.data.data_loader import NumpyTupleDataset + + +class DataFrameParser: + """ + DataFrameParser parses a pandas DataFrame containing SMILES strings and, optionally, some additional features. + + Args: + encoder (MolEncoder): encoder instance + labels (list): names of the label columns to load + smiles_col (str): name of the SMILES column + """ + + def __init__(self, encoder: MolEncoder, + labels: List[str], + smiles_col: str = 'smiles'): + super(DataFrameParser, self).__init__() + self.labels = labels + self.smiles_col = smiles_col + self.logger = getLogger(__name__) + self.encoder = encoder + + def parse(self, df: pd.DataFrame) -> NumpyTupleDataset: + """Parse a DataFrame using `encoder` and prepare a dataset instance + + Labels are extracted from the `labels` columns and input features are + extracted from the SMILES strings in the `smiles_col` column. + """ + all_nodes = [] + all_edges = [] + + total_count = df.shape[0] + fail_count = 0 + success_count = 0 + for smiles in tqdm(df[self.smiles_col], total=df.shape[0]): + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + fail_count += 1 + continue + # Note that a SMILES expression is not unique; + # we obtain the canonical SMILES here. + nodes, edges = self.encoder.encode_mol(mol) + + except EncodingError: + fail_count += 1 + continue + except Exception as e: + self.logger.warning('parse(), type: {}, {}' + .format(type(e).__name__, e.args)) + self.logger.info(traceback.format_exc()) + fail_count += 1 + continue + all_nodes.append(nodes) + all_edges.append(edges) + success_count += 1 + + result = [np.array(all_nodes), np.array(all_edges), *(df[label_col].values for label_col in self.labels)] + self.logger.info('Preprocess finished.
FAIL {}, SUCCESS {}, TOTAL {}' + .format(fail_count, success_count, total_count)) + + dataset = NumpyTupleDataset(result) + return dataset diff --git a/training/benchmarks/moflow/pytorch/data/data_loader.py b/training/benchmarks/moflow/pytorch/data/data_loader.py new file mode 100644 index 000000000..28f9378ca --- /dev/null +++ b/training/benchmarks/moflow/pytorch/data/data_loader.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +import os +import logging +from typing import Any, Callable, Iterable, Optional, Tuple + +import numpy as np +from torch.utils.data import Dataset + + +class NumpyTupleDataset(Dataset): + """Dataset of a tuple of datasets. + + It combines multiple datasets into one dataset. Each example is represented + by a tuple whose ``i``-th item corresponds to the ``i``-th dataset, + and each ``i``-th dataset is expected to be an instance of numpy.ndarray. + + Args: + datasets: Underlying datasets. The ``i``-th one is used for the + ``i``-th item of each example. All datasets must have the same + length.
+ transform: An optional function applied to an item before it is returned. + """ + + def __init__(self, datasets: Iterable[np.ndarray], transform: Optional[Callable] = None) -> None: + if not datasets: + raise ValueError('no datasets are given') + length = len(datasets[0]) + for i, dataset in enumerate(datasets): + if len(dataset) != length: + raise ValueError( + 'dataset of the index {} has a wrong length'.format(i)) + self._datasets = datasets + self._length = length + self.transform = transform + + def __len__(self) -> int: + return self._length + + def __getitem__(self, index: int) -> Tuple[Any]: + item = [dataset[index] for dataset in self._datasets] + + if self.transform: + item = self.transform(item) + return item + + def get_datasets(self) -> Tuple[np.ndarray]: + return self._datasets + + + def save(self, filepath: str) -> None: + """Save the dataset to `filepath` in npz format + + Args: + filepath (str): filepath to save dataset. It is recommended to end + with '.npz' extension. + """ + np.savez(filepath, *self._datasets) + logging.info('Save {} done.'.format(filepath)) + + @classmethod + def load(cls, filepath: str, transform: Optional[Callable] = None): + logging.info('Loading file {}'.format(filepath)) + if not os.path.exists(filepath): + raise ValueError('Invalid filepath {} for dataset'.format(filepath)) + load_data = np.load(filepath) + result = [] + i = 0 + while True: + key = 'arr_{}'.format(i) + if key in load_data.keys(): + result.append(load_data[key]) + i += 1 + else: + break + return cls(result, transform) diff --git a/training/benchmarks/moflow/pytorch/data/encoding.py b/training/benchmarks/moflow/pytorch/data/encoding.py new file mode 100644 index 000000000..d3d71fde9 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/data/encoding.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE.
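As context for how the preprocessed `.npz` file is consumed, here is a minimal usage sketch for the `NumpyTupleDataset` container defined above in `data_loader.py`. The toy arrays and file name are hypothetical; the real benchmark loads `zinc250k_relgcn_kekulized_ggnp.npz`.

```python
import numpy as np

from data.data_loader import NumpyTupleDataset

# Two parallel arrays of equal length become one dataset whose items
# are (nodes, adj) pairs; save()/load() round-trip through np.savez.
nodes = np.zeros((100, 40), dtype=np.uint8)        # toy atom codes
adjs = np.zeros((100, 4, 40, 40), dtype=np.uint8)  # toy bond codes

dataset = NumpyTupleDataset([nodes, adjs])
assert len(dataset) == 100
nodes0, adj0 = dataset[0]

dataset.save('toy.npz')                       # stored as arr_0, arr_1, ...
reloaded = NumpyTupleDataset.load('toy.npz')  # rebuilt in the same order
```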
+ + +from typing import Tuple +import numpy as np +from rdkit import Chem + +from moflow.config import BOND_TO_CODE, DUMMY_CODE + + +class MolEncoder: + """Encodes atoms and the adjacency matrix. + + Args: + out_size (int): It specifies the size of the arrays returned by + `encode_mol`. + If the number of atoms in the molecule is less than this value, + the returned arrays are padded to this fixed size. + """ + + def __init__(self, out_size: int): + super(MolEncoder, self).__init__() + self.out_size = out_size + + def encode_mol(self, mol: Chem.Mol) -> Tuple[np.ndarray, np.ndarray]: + """Encode a molecule into input features + + Args: + mol (Mol): molecule to encode + + Returns: + Tuple of the atomic-number array and the edge-type adjacency matrix. + """ + mol = self._standardize_mol(mol) + self._check_num_atoms(mol) + atom_array = self.construct_atomic_number_array(mol) + adj_array = self.construct_discrete_edge_matrix(mol) + return atom_array, adj_array + + def _standardize_mol(self, mol: Chem.Mol) -> Chem.Mol: + canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=False, + canonical=True) + mol = Chem.MolFromSmiles(canonical_smiles) + Chem.Kekulize(mol) + return mol + + def _check_num_atoms(self, mol: Chem.Mol) -> None: + """Check number of atoms in `mol` does not exceed `out_size`""" + num_atoms = mol.GetNumAtoms() + if num_atoms > self.out_size: + raise EncodingError(f'Number of atoms in mol {num_atoms} exceeds num_max_atoms {self.out_size}') + + + def construct_atomic_number_array(self, mol: Chem.Mol) -> np.ndarray: + """Returns atomic numbers of the atoms constituting a molecule. + + Args: + mol (rdkit.Chem.Mol): Input molecule. + + Returns: + numpy.ndarray: an array consisting of atomic numbers + of atoms in the molecule. + """ + + atom_list = [a.GetAtomicNum() for a in mol.GetAtoms()] + n_atom = len(atom_list) + if self.out_size < n_atom: + raise EncodingError(f'out_size {self.out_size} is smaller than number of atoms in mol {n_atom}') + atom_array = np.full(self.out_size, DUMMY_CODE, dtype=np.uint8) + atom_array[:n_atom] = atom_list + return atom_array + + + def construct_discrete_edge_matrix(self, mol: Chem.Mol) -> np.ndarray: + """Returns the edge-type dependent adjacency matrix of the given molecule. + + Args: + mol (rdkit.Chem.Mol): Input molecule. + + Returns: + adj_array (numpy.ndarray): The adjacency matrix of the input molecule. + It is a symmetric 2-dimensional array with shape (out_size, out_size), + filled with integers representing bond types. If two atoms are not + connected, DUMMY_CODE is used instead. + """ + if mol is None: + raise EncodingError('mol is None') + n_atom = mol.GetNumAtoms() + + if self.out_size < n_atom: + raise EncodingError(f'out_size {self.out_size} is smaller than number of atoms in mol {n_atom}') + + adjs = np.full((self.out_size, self.out_size), DUMMY_CODE, dtype=np.uint8) + + for bond in mol.GetBonds(): + bond_type = bond.GetBondType() + # we need to use code here - bond types are rdkit objects + code = BOND_TO_CODE[bond_type] + i = bond.GetBeginAtomIdx() + j = bond.GetEndAtomIdx() + adjs[[i, j], [j, i]] = code + return adjs + + +class EncodingError(Exception): + pass diff --git a/training/benchmarks/moflow/pytorch/data/transform.py b/training/benchmarks/moflow/pytorch/data/transform.py new file mode 100644 index 000000000..da068e9b0 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/data/transform.py @@ -0,0 +1,85 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +import json +import logging +import numpy as np +import os +from typing import Dict, Tuple + +from misc.config import CODE_TO_BOND, DUMMY_CODE, Config + + +def _onehot(data: np.ndarray, codes_dict: Dict[int, int], dtype=np.float32) -> np.ndarray: + shape = [len(codes_dict), *data.shape] + encoded = np.zeros(shape, dtype=dtype) + for obj_key, code in codes_dict.items(): + encoded[code, data == obj_key] = 1 + return encoded + + +def encode_nodes(atomic_nums: np.ndarray, config: Config) -> np.ndarray: + padded_data = np.full(config.max_num_nodes, DUMMY_CODE, dtype=np.uint8) + padded_data[:len(atomic_nums)] = atomic_nums + encoded = _onehot(padded_data, config.dataset_config.atomic_to_code).T + return encoded + + +def encode_edges(adj: np.ndarray, config: Config) -> np.ndarray: + padded_data = np.full((config.max_num_nodes, config.max_num_nodes), DUMMY_CODE, dtype=np.uint8) + n, m = adj.shape + assert n == m, 'adjacency matrix should be square' + padded_data[:n, :n] = adj + # we already store codes in the file - bond types are rdkit objects + encoded = _onehot(padded_data, {k:k for k in CODE_TO_BOND}) + return encoded + + +def transform_fn(data: Tuple[np.ndarray], config: Config) -> Tuple[np.ndarray]: + node, adj, *labels = data + node = encode_nodes(node, config) + adj = encode_edges(adj, config) + return (node, adj, *labels) + + +def get_val_ids(config: Config, data_dir: str): + file_path = os.path.join(data_dir, config.dataset_config.valid_idx_file) + logging.info('loading train/valid split information from: {}'.format(file_path)) + with open(file_path) as json_data: + data = json.load(json_data) + + val_ids = [int(idx)-1 for idx in data] + return val_ids diff --git a/training/benchmarks/moflow/pytorch/dataloaders/__init__.py b/training/benchmarks/moflow/pytorch/dataloaders/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py b/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py new file mode 100644
index 000000000..a9591b20f --- /dev/null +++ b/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py @@ -0,0 +1,68 @@ +import os +import functools + +import torch +from torch.utils.data.distributed import DistributedSampler + +from data.data_loader import NumpyTupleDataset +from data import transform +from misc.config import CONFIGS + + +def build_datasets(args): + + # Model configuration + assert args.config_name in CONFIGS + config = CONFIGS[args.config_name] + data_file = config.dataset_config.dataset_file + transform_fn = functools.partial(transform.transform_fn, config=config) + valid_idx = transform.get_val_ids(config, args.data_dir) + + # Datasets: + data_file_path = os.path.join(args.data_dir, data_file) + print(f"data_file_path: {data_file_path}") + + dataset = NumpyTupleDataset.load( + os.path.join(args.data_dir, data_file), + transform=transform_fn, + ) + if len(valid_idx) == 0: + raise ValueError('Empty validation set!') + else: + print(f"valid_idx size: {len(valid_idx)}") + + train_idx = [t for t in range(len(dataset)) if t not in valid_idx] + train_dataset = torch.utils.data.Subset(dataset, train_idx) + test_dataset = torch.utils.data.Subset(dataset, valid_idx) + return train_dataset, test_dataset + + + +def _get_sampler(args, train_dataset): + if args.distributed: + sampler = DistributedSampler(train_dataset, + seed=args.seed, + drop_last=False) + else: + sampler = None + return sampler + + +def build_train_dataloader(args, train_dataset): + sampler = _get_sampler(args, train_dataset) + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=sampler is None, + sampler=sampler, + num_workers=args.num_workers, + drop_last=True, + ) + + if args.distributed: + train_dataloader.sampler.set_epoch(-1) + return train_dataloader + + +def build_eval_dataloader(args, eval_dataset): + return None diff --git a/training/benchmarks/moflow/pytorch/misc/config.py b/training/benchmarks/moflow/pytorch/misc/config.py new file mode 100644 index 000000000..8bf4d07c4 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/misc/config.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
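Before the dataset configuration below, a sketch of how the loader helpers above fit together. The `SimpleNamespace` stands in for the parsed benchmark config; only the fields actually read by `build_datasets` and `build_train_dataloader` are filled in, and the values shown are assumptions.

```python
from types import SimpleNamespace

from dataloaders.dataloader import build_datasets, build_train_dataloader

# Hypothetical stand-in for the benchmark's parsed config.
args = SimpleNamespace(
    config_name='zinc250k',   # must be a key of misc.config.CONFIGS
    data_dir='/data',         # directory holding the preprocessed .npz
    train_batch_size=512,
    num_workers=4,
    distributed=False,
    seed=1,
)

train_dataset, test_dataset = build_datasets(args)
train_loader = build_train_dataloader(args, train_dataset)
nodes, adj, *labels = next(iter(train_loader))  # one one-hot encoded batch
```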
+ + +from dataclasses import asdict, dataclass, field +import json +from typing import Dict, List, Optional + +from rdkit import Chem + + +_VALID_IDX_FILE = 'valid_idx_{}.json' +_CSV_FILE = '{}.csv' +_DATASET_FILE = '{}_relgcn_kekulized_ggnp.npz' + +DUMMY_CODE = 0 +CODE_TO_BOND = dict(enumerate([ + 'DUMMY', + Chem.rdchem.BondType.SINGLE, + Chem.rdchem.BondType.DOUBLE, + Chem.rdchem.BondType.TRIPLE, +])) +BOND_TO_CODE = {v: k for k, v in CODE_TO_BOND.items()} +ATOM_VALENCY = {6: 4, 7: 3, 8: 2, 9: 1, 15: 3, 16: 2, 17: 1, 35: 1, 53: 1} + + +@dataclass +class DatasetConfig: + dataset_name: str + atomic_num_list: List[int] + max_num_atoms: int + labels: List[str] + smiles_col: str + code_to_atomic: Dict[int, int] = field(init=False) + atomic_to_code: Dict[int, int] = field(init=False) + valid_idx_file: str = field(init=False) + csv_file: str = field(init=False) + dataset_file: str = field(init=False) + + def __post_init__(self): + self.valid_idx_file = _VALID_IDX_FILE.format(self.dataset_name) + self.csv_file = _CSV_FILE.format(self.dataset_name) + self.dataset_file = _DATASET_FILE.format(self.dataset_name) + + self.code_to_atomic = dict(enumerate(sorted([DUMMY_CODE] + self.atomic_num_list))) + self.atomic_to_code = {v: k for k, v in self.code_to_atomic.items()} + + +@dataclass +class AtomFlowConfig: + n_flow: int + hidden_gnn: List[int] + hidden_lin: List[int] + n_block: int = 1 + mask_row_size_list: List[int] = field(default_factory=lambda: [1]) + mask_row_stride_list: List[int] = field(default_factory=lambda: [1]) + +@dataclass +class BondFlowConfig: + hidden_ch: List[int] + conv_lu: int + n_squeeze: int + n_block: int = 1 + n_flow: int = 10 + + +@dataclass +class ModelConfig: + atom_config: AtomFlowConfig + bond_config: BondFlowConfig + noise_scale: float = 0.6 + learn_dist: bool = True + +@dataclass +class Config: + dataset_config: DatasetConfig + model_config: ModelConfig + max_num_nodes: Optional[int] = None + num_node_features: Optional[int] = None + num_edge_features: int = len(CODE_TO_BOND) + z_dim: int = field(init=False) + + def __post_init__(self): + if self.max_num_nodes is None: + self.max_num_nodes = self.dataset_config.max_num_atoms + if self.num_node_features is None: + self.num_node_features = len(self.dataset_config.code_to_atomic) + bonds_dim = self.max_num_nodes * self.max_num_nodes * self.num_edge_features + atoms_dim = self.max_num_nodes * self.num_node_features + self.z_dim = bonds_dim + atoms_dim + + + def save(self, path): + self.path = path + with open(path, 'w') as f: + json.dump(asdict(self), f, indent=4, sort_keys=True) + + @classmethod + def load(cls, path): + with open(path, 'r') as f: + data = json.load(f) + return cls(**data) + + def __repr__(self) -> str: + return json.dumps(asdict(self), indent=4, separators=(',', ': ')) + + +ZINC250K_CONFIG = Config( + max_num_nodes=40, + dataset_config=DatasetConfig( + dataset_name='zinc250k', + atomic_num_list=[6, 7, 8, 9, 15, 16, 17, 35, 53], + max_num_atoms=38, + labels=['logP', 'qed', 'SAS'], + smiles_col='smiles', + ), + model_config=ModelConfig( + AtomFlowConfig( + n_flow=38, + hidden_gnn=[256], + hidden_lin=[512, 64], + ), + BondFlowConfig( + n_squeeze=20, + hidden_ch=[512, 512], + conv_lu=2 + ), + ) +) + +CONFIGS = {'zinc250k': ZINC250K_CONFIG} diff --git a/training/benchmarks/moflow/pytorch/misc/utils.py b/training/benchmarks/moflow/pytorch/misc/utils.py new file mode 100644 index 000000000..465b2f6ab --- /dev/null +++ b/training/benchmarks/moflow/pytorch/misc/utils.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022, 
NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +import re +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +from rdkit import Chem +import torch + +from misc.config import Config, ATOM_VALENCY, CODE_TO_BOND, DUMMY_CODE + + +def postprocess_predictions(x: Union[torch.Tensor, np.ndarray], adj: Union[torch.Tensor, np.ndarray], config: Config) -> Tuple[np.ndarray, np.ndarray]: + assert x.ndim == 3 and adj.ndim == 4, 'expected batched predictions' + n = config.dataset_config.max_num_atoms + adj = adj[:, :, :n, :n] + x = x[:, :n] + + atoms = torch.argmax(x, dim=2) + atoms = _to_numpy_array(atoms) + + adj = torch.argmax(adj, dim=1) + adj = _to_numpy_array(adj) + + decoded = np.zeros_like(atoms) + for code, atomic_num in config.dataset_config.code_to_atomic.items(): + decoded[atoms == code] = atomic_num + + return decoded, adj + + +def convert_predictions_to_mols(adj: np.ndarray, x: np.ndarray, correct_validity: bool = False) -> List[Chem.Mol]: + molecules = [construct_mol(x_elem, adj_elem) for x_elem, adj_elem in zip(x, adj)] + + if correct_validity: + molecules = [correct_mol(mol) for mol in molecules] + return molecules + + +def construct_mol(atoms: np.ndarray, adj: np.ndarray) -> Chem.Mol: + from rdkit import RDLogger + RDLogger.DisableLog('rdApp.*') + atoms_exist = (atoms != 0) + atoms = atoms[atoms_exist] + adj = adj[atoms_exist][:, atoms_exist] + + mol = Chem.RWMol() + + for atom in atoms: + mol.AddAtom(Chem.Atom(int(atom))) + + for start, end in zip(*np.where(adj != DUMMY_CODE)): + if start > end: + mol.AddBond(int(start), int(end), CODE_TO_BOND[int(adj[start, end])]) + # add a formal charge to the atom, e.g. [O+], [N+], [S+]; + # [O-], [N-], [S-], [NH+] etc. are not supported.
+ flag, atomid_valence = check_valency(mol) + if flag: + continue + else: + assert len(atomid_valence) == 2 + idx = atomid_valence[0] + v = atomid_valence[1] + an = mol.GetAtomWithIdx(idx).GetAtomicNum() + if an in (7, 8, 16) and (v - ATOM_VALENCY[an]) == 1: + mol.GetAtomWithIdx(idx).SetFormalCharge(1) + return mol + + +def valid_mol(x: Optional[Chem.Mol]) -> Optional[Chem.Mol]: + if x is None: + # RDKit wasn't able to create the mol + return None + smi = Chem.MolToSmiles(x, isomericSmiles=True) + if len(smi) == 0 or '.' in smi: + # Mol is empty or fragmented + return None + reloaded = Chem.MolFromSmiles(smi) + # if smiles is invalid - it will be None, otherwise mol is valid + return reloaded + + +def check_valency(mol: Chem.Mol) -> Tuple[bool, List[int]]: + """Checks that no atoms in the mol have exceeded their possible + valency. Returns True if no valency issues, False otherwise + plus information about problematic atom. + """ + try: + Chem.SanitizeMol(mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_PROPERTIES) + return True, None + except ValueError as e: + e = str(e) + p = e.find('#') + e_sub = e[p:] + atomid_valence = list(map(int, re.findall(r'\d+', e_sub))) + return False, atomid_valence + + +def correct_mol(mol: Chem.Mol) -> Chem.Mol: + flag, atomid_valence = check_valency(mol) + while not flag: + assert len(atomid_valence) == 2 + idx = atomid_valence[0] + v = atomid_valence[1] + queue = [] + for b in mol.GetAtomWithIdx(idx).GetBonds(): + queue.append( + (b.GetIdx(), int(b.GetBondType()), b.GetBeginAtomIdx(), b.GetEndAtomIdx()) + ) + queue.sort(key=lambda tup: tup[1], reverse=True) + if len(queue) > 0: + start = queue[0][2] + end = queue[0][3] + t = queue[0][1] - 1 + mol.RemoveBond(start, end) + if t >= 1: + mol.AddBond(start, end, CODE_TO_BOND[t]) + flag, atomid_valence = check_valency(mol) + + # if mol is fragmented, select the largest fragment + mols = Chem.GetMolFrags(mol, asMols=True) + mol = max(mols, key=lambda m: m.GetNumAtoms()) + + return mol + + +def predictions_to_smiles(adj: torch.Tensor, x: torch.Tensor, config: Config) -> List[str]: + x, adj = postprocess_predictions(x, adj, config=config) + valid = [Chem.MolToSmiles(construct_mol(x_elem, adj_elem), isomericSmiles=True) + for x_elem, adj_elem in zip(x, adj)] + return valid + + +def check_validity(molecules: List[Chem.Mol]) -> dict: + valid = [valid_mol(mol) for mol in molecules] + valid = [mol for mol in valid if mol is not None] + + n_mols = len(molecules) + valid_ratio = len(valid) / n_mols + valid_smiles = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in valid] + unique_smiles = list(set(valid_smiles)) + unique_ratio = 0. + if len(valid) > 0: + unique_ratio = len(unique_smiles) / len(valid) + valid_mols = [Chem.MolFromSmiles(s) for s in valid_smiles] + abs_unique_ratio = len(unique_smiles) / n_mols + + results = dict() + results['valid_mols'] = valid_mols + results['valid_smiles'] = valid_smiles + results['valid_ratio'] = valid_ratio * 100 + results['unique_ratio'] = unique_ratio * 100 + results['abs_unique_ratio'] = abs_unique_ratio * 100 + + return results + + +def check_novelty(gen_smiles: List[str], train_smiles: List[str], n_generated_mols: int): + if len(gen_smiles) == 0: + novel_ratio = 0. + abs_novel_ratio = 0. + else: + duplicates = [1 for mol in gen_smiles if mol in train_smiles] + novel = len(gen_smiles) - sum(duplicates) + novel_ratio = novel * 100. / len(gen_smiles) + abs_novel_ratio = novel * 100. 
/ n_generated_mols + return novel_ratio, abs_novel_ratio + + +def _to_numpy_array(a): + if isinstance(a, torch.Tensor): + a = a.cpu().detach().numpy() + elif isinstance(a, np.ndarray): + pass + else: + raise TypeError("a ({}) is not a torch.Tensor".format(type(a))) + return a diff --git a/training/benchmarks/moflow/pytorch/model/__init__.py b/training/benchmarks/moflow/pytorch/model/__init__.py new file mode 100644 index 000000000..5d61ffbda --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/__init__.py @@ -0,0 +1,6 @@ +from model.model import MoFlow + + +def create_model(config): + model = MoFlow(config) + return model diff --git a/training/benchmarks/moflow/pytorch/model/basic.py b/training/benchmarks/moflow/pytorch/model/basic.py new file mode 100644 index 000000000..d52c3680b --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/basic.py @@ -0,0 +1,194 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
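The evaluation metrics above compose as follows; this is an illustrative sketch with toy SMILES (not benchmark output), assuming the helpers are importable as `misc.utils`.

```python
from rdkit import Chem

from misc.utils import check_novelty, check_validity

# Toy generated set: two valid molecules plus one failed decode (None).
mols = [Chem.MolFromSmiles('CCO'), Chem.MolFromSmiles('c1ccccc1'), None]

stats = check_validity(mols)   # valid_ratio, unique_ratio, abs_unique_ratio
novel_ratio, abs_novel_ratio = check_novelty(
    stats['valid_smiles'],     # canonical SMILES of the valid molecules
    train_smiles=['CCO'],      # pretend training set: 'CCO' is not novel
    n_generated_mols=len(mols),
)
```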
+ + +import math +from typing import Tuple +import numpy as np +from scipy import linalg as la +import torch +from torch import nn +from torch.nn import functional as F + +from runtime.distributed_utils import get_world_size, reduce_tensor + + +class ActNorm(nn.Module): + def __init__(self, num_channels, num_dims, channels_dim=1): + super().__init__() + self.num_channels = num_channels + self.num_dims = num_dims + self.channels_dim = channels_dim + self.shape = [1] * num_dims + self.shape[channels_dim] = num_channels + self.loc = nn.Parameter(torch.zeros(*self.shape)) + self.scale = nn.Parameter(torch.ones(*self.shape)) + + self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8)) + self.register_buffer('num_elements', torch.tensor(0, dtype=torch.uint8)) + + @torch.jit.ignore + def initialize(self, input): + if self.initialized.item() == 1: + return + + dims = list(input.shape[1:]) + del dims[self.channels_dim -1] + + num_elems = math.prod(dims) + permutation = [self.channels_dim] + [i for i in range(self.num_dims) if i != self.channels_dim] + with torch.no_grad(): + + flatten = input.permute(*permutation).contiguous().view(self.num_channels, -1) + mean = flatten.mean(1).view(self.shape) + std = flatten.std(1).view(self.shape) + + num_gpus = get_world_size() + mean = reduce_tensor(mean, num_gpus) + std = reduce_tensor(std, num_gpus) + self.loc.data.copy_(-mean) + self.scale.data.copy_(1 / (std + 1e-6)) + self.initialized.fill_(1) + self.num_elements.fill_(num_elems) + + def forward(self, input): + log_abs = torch.log(torch.abs(self.scale)) + logdet = self.num_elements * torch.sum(log_abs) + return self.scale * (input + self.loc), logdet + + @torch.jit.export + def reverse(self, output): + return output / self.scale - self.loc + + +class InvConv2d(nn.Module): + def __init__(self, in_channel): + super().__init__() + + weight = torch.randn(in_channel, in_channel) + q, _ = torch.qr(weight) + weight = q.unsqueeze(2).unsqueeze(3) + self.weight = nn.Parameter(weight) + + def forward(self, input): + _, _, height, width = input.shape + + out = F.conv2d(input, self.weight) + logdet = ( + height * width * torch.slogdet(self.weight.squeeze().double())[1].float() + ) + + return out, logdet + + def reverse(self, output): + return F.conv2d( + output, self.weight.squeeze().inverse().unsqueeze(2).unsqueeze(3) + ) + + +class InvConv2dLU(nn.Module): + def __init__(self, in_channel): + super().__init__() + + weight = np.random.randn(in_channel, in_channel) + q, _ = la.qr(weight) + w_p, w_l, w_u = la.lu(q.astype(np.float32)) + w_s = np.diag(w_u) + w_u = np.triu(w_u, 1) + u_mask = np.triu(np.ones_like(w_u), 1) + l_mask = u_mask.T + + w_p = torch.from_numpy(w_p) + w_l = torch.from_numpy(w_l).contiguous() + w_s = torch.from_numpy(w_s) + w_u = torch.from_numpy(w_u) + + self.register_buffer('w_p', w_p) + self.register_buffer('u_mask', torch.from_numpy(u_mask)) + self.register_buffer('l_mask', torch.from_numpy(l_mask)) + self.register_buffer('s_sign', torch.sign(w_s)) + self.register_buffer('l_eye', torch.eye(l_mask.shape[0])) + self.w_l = nn.Parameter(w_l) + self.w_s = nn.Parameter(torch.log(torch.abs(w_s))) + self.w_u = nn.Parameter(w_u) + + def forward(self, input): + _, _, height, width = input.shape + + weight = self.calc_weight() + + out = F.conv2d(input, weight) + logdet = height * width * torch.sum(self.w_s) + + return out, logdet + + def calc_weight(self): + weight = ( + self.w_p + @ (self.w_l * self.l_mask + self.l_eye) + @ ((self.w_u * self.u_mask) + torch.diag(self.s_sign * 
torch.exp(self.w_s))) + ) + + return weight.unsqueeze(2).unsqueeze(3) + + def reverse(self, output): + weight = self.calc_weight() + dtype = weight.dtype + weight = weight.float() + weight_inv = weight.squeeze().inverse().unsqueeze(2).unsqueeze(3) + weight_inv = weight_inv.to(dtype=dtype) + + return F.conv2d(output, weight_inv) + + +class GraphConv(nn.Module): + def __init__(self, in_channels, out_channels, num_atoms, num_edge_type=4): + super(GraphConv, self).__init__() + + self.graph_linear_self = nn.Linear(in_channels, out_channels) + self.graph_linear_edge = nn.Linear(in_channels, out_channels * num_edge_type) + self.num_edge_type = num_edge_type + self.in_ch = in_channels + self.out_ch = out_channels + self.num_atoms = num_atoms + + def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + adj, nodes = graph + hs = self.graph_linear_self(nodes) + m = self.graph_linear_edge(nodes) + m = m.view(-1, self.num_atoms, self.out_ch, self.num_edge_type) + hr = torch.einsum('bemn,bnce->bmc', adj, m) + hr = hr.unsqueeze(2) + return hs + hr diff --git a/training/benchmarks/moflow/pytorch/model/coupling.py b/training/benchmarks/moflow/pytorch/model/coupling.py new file mode 100644 index 000000000..a97c9b55a --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/coupling.py @@ -0,0 +1,196 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
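A quick sanity sketch for the invertible 1x1 convolution above: forward followed by reverse should reproduce the input, and the log-determinant reduces to height * width * sum(w_s). The shapes are arbitrary assumptions.

```python
import torch

from model.basic import InvConv2dLU

conv = InvConv2dLU(in_channel=8)
x = torch.randn(2, 8, 4, 4)

y, logdet = conv(x)      # logdet: scalar tensor, H * W * sum(w_s)
x_rec = conv.reverse(y)  # applies the explicit matrix inverse

assert torch.allclose(x, x_rec, atol=1e-4)
assert logdet.dim() == 0
```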
+ + +from typing import Tuple +import torch +import torch.nn as nn +from torch.nn.functional import logsigmoid + +from model.basic import GraphConv + + +def sigmoid_inverse(x): + """Calculates 1/sigmoid(x) in a more numerically stable way""" + return 1 + torch.exp(-x) + + +class AffineCoupling(nn.Module): + def __init__(self, in_channel, hidden_channels, mask_swap=False): # hidden_channels, e.g. (512, 512), replaces the original filter_size=512 + super(AffineCoupling, self).__init__() + + self.mask_swap = mask_swap + last_h = in_channel // 2 + vh = tuple(hidden_channels) + layers = [] + for h in vh: + layers.append(nn.Conv2d(last_h, h, kernel_size=3, padding=1)) + layers.append(nn.BatchNorm2d(h)) + layers.append(nn.ReLU(inplace=True)) + last_h = h + layers.append(nn.Conv2d(last_h, in_channel, kernel_size=3, padding=1)) + self.layers = nn.Sequential(*layers) + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + in_a, in_b = input.chunk(2, 1) # (2,12,32,32) --> (2,6,32,32), (2,6,32,32) + + if self.mask_swap: + in_a, in_b = in_b, in_a + + s_logits, t = self._s_t_function(in_a) + s = torch.sigmoid(s_logits) + out_b = (in_b + t) * s + logdet = torch.sum(logsigmoid(s_logits).reshape(input.shape[0], -1), 1) + + if self.mask_swap: + result = torch.cat([out_b, in_a], 1) + else: + result = torch.cat([in_a, out_b], 1) + + return result, logdet + + @torch.jit.export + def reverse(self, output: torch.Tensor) -> torch.Tensor: + out_a, out_b = output.chunk(2, 1) + if self.mask_swap: + out_a, out_b = out_b, out_a + + s_logits, t = self._s_t_function(out_a) + s_inverse = sigmoid_inverse(s_logits) + in_b = out_b * s_inverse - t + + if self.mask_swap: + result = torch.cat([in_b, out_a], 1) + else: + result = torch.cat([out_a, in_b], 1) + + return result + + def _s_t_function(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + h = self.layers(x) + s_logits, t = h.chunk(2, 1) + return s_logits, t + + +class ConvCouplingBlock(nn.Module): + def __init__(self, in_dim: int, out_dim: int, n_node: int) -> None: + super().__init__() + self.graph_conv = GraphConv(in_dim, out_dim, n_node) + self.bn = nn.BatchNorm2d(n_node) + self.relu = nn.ReLU(inplace=True) + + def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + adj, nodes = graph + h = self.graph_conv(graph) + h = h.to(memory_format=torch.channels_last) + h = self.bn(h) + h = self.relu(h) + return adj, h + + +class LinCouplingBlock(nn.Module): + def __init__(self, in_dim: int, out_dim: int, n_node: int) -> None: + super().__init__() + self.lin = nn.Linear(in_dim, out_dim) + self.bn = nn.BatchNorm2d(n_node) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + h = self.lin(x) + h = h.to(memory_format=torch.channels_last) + h = self.bn(h) + h = self.relu(h) + return h + + +class GraphAffineCoupling(nn.Module): + def __init__(self, n_node, in_dim, hidden_dim_dict, masked_row): + super(GraphAffineCoupling, self).__init__() + self.n_node = n_node + self.in_dim = in_dim + self.hidden_dim_dict = hidden_dim_dict + self.masked_row = masked_row + + self.hidden_dim_gnn = hidden_dim_dict['gnn'] + self.hidden_dim_linear = hidden_dim_dict['linear'] + + conv_layers = [] + last_dim = in_dim + for out_dim in self.hidden_dim_gnn: + conv_layers.append(ConvCouplingBlock(last_dim, out_dim, n_node)) + last_dim = out_dim + self.net_conv = nn.ModuleList(conv_layers) + + lin_layers = [] + for out_dim in self.hidden_dim_linear: +
lin_layers.append(LinCouplingBlock(last_dim, out_dim, n_node)) + last_dim = out_dim + lin_layers.append(nn.Linear(last_dim, in_dim*2)) + self.net_lin = nn.Sequential(*lin_layers) + + mask = torch.ones(n_node, in_dim) + mask[masked_row, :] = 0 # rows in masked_row are kept unchanged and are fed to _s_t to update the remaining rows + self.register_buffer('mask', mask) + + def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + adj, input = graph + masked_x = self.mask * input + masked_x_sq = masked_x.unsqueeze(2) + s_logits, t = self._s_t_function((adj, masked_x_sq)) + s = torch.sigmoid(s_logits) + out = masked_x + (1-self.mask) * (input + t) * s + logdet = torch.sum(logsigmoid(s_logits).reshape(input.shape[0], -1), 1) + return out, logdet + + @torch.jit.export + def reverse(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + adj, output = graph + masked_y = self.mask * output + masked_y_sq = masked_y.unsqueeze(2) + s_logits, t = self._s_t_function((adj, masked_y_sq)) + s_inverse = sigmoid_inverse(s_logits) + input = masked_y + (1 - self.mask) * (output * s_inverse - t) + return input + + def _s_t_function(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + for l in self.net_conv: + graph = l(graph) + adj, h = graph + h = self.net_lin(h) + h = h.squeeze(2) + s_logits, t = h.chunk(2, dim=-1) + + return s_logits, t diff --git a/training/benchmarks/moflow/pytorch/model/glow.py b/training/benchmarks/moflow/pytorch/model/glow.py new file mode 100644 index 000000000..d5e944d6a --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/glow.py @@ -0,0 +1,270 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE.
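Before the Glow blocks below assemble these pieces, a round-trip sketch for the affine coupling above: forward shifts and scales one half of the channels conditioned on the other half, and reverse undoes it exactly (eval mode keeps BatchNorm deterministic). The sizes are illustrative.

```python
import torch

from model.coupling import AffineCoupling

coupling = AffineCoupling(in_channel=12, hidden_channels=(64, 64)).eval()
x = torch.randn(2, 12, 8, 8)

z, logdet = coupling(x)      # logdet = sum(logsigmoid(s_logits)) per sample
x_rec = coupling.reverse(z)  # uses sigmoid_inverse to undo the scaling

assert torch.allclose(x, x_rec, atol=1e-5)
assert logdet.shape == (2,)
```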
+ + +from typing import Tuple +import torch +import torch.nn as nn + +from model.basic import ActNorm, InvConv2dLU, InvConv2d +from model.coupling import AffineCoupling, GraphAffineCoupling + + +class Flow(nn.Module): + def __init__(self, in_channel, hidden_channels, conv_lu=2, mask_swap=False): + super(Flow, self).__init__() + + # More stable to support more flows + self.actnorm = ActNorm(num_channels=in_channel, num_dims=4) + + if conv_lu == 0: + self.invconv = InvConv2d(in_channel) + elif conv_lu == 1: + self.invconv = InvConv2dLU(in_channel) + elif conv_lu == 2: + self.invconv = None + else: + raise ValueError("conv_lu in {0,1,2}, 0:InvConv2d, 1:InvConv2dLU, 2:none-just swap to update in coupling") + + self.coupling = AffineCoupling(in_channel, hidden_channels, mask_swap=mask_swap) + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + out, logdet = self.actnorm(input) + if self.invconv is not None: + out, det1 = self.invconv(out) + else: + det1 = 0 + out, det2 = self.coupling(out) + + logdet = logdet + det1 + if det2 is not None: + logdet = logdet + det2 + + return out, logdet + + @torch.jit.export + def reverse(self, output: torch.Tensor) -> torch.Tensor: + input = self.coupling.reverse(output) + if self.invconv is not None: + input = self.invconv.reverse(input) + input = self.actnorm.reverse(input) + + return input + + +class FlowOnGraph(nn.Module): + def __init__(self, n_node, in_dim, hidden_dim_dict, masked_row): + super(FlowOnGraph, self).__init__() + self.n_node = n_node + self.in_dim = in_dim + self.hidden_dim_dict = hidden_dim_dict + self.masked_row = masked_row + self.actnorm = ActNorm(num_channels=n_node, num_dims=3) + self.coupling = GraphAffineCoupling(n_node, in_dim, hidden_dim_dict, masked_row) + + def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + adj, input = graph + out, logdet = self.actnorm(input) + det1 = 0 + out, det2 = self.coupling((adj, out)) + + logdet = logdet + det1 + if det2 is not None: + logdet = logdet + det2 + return out, logdet + + @torch.jit.export + def reverse(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + adj, output = graph + input = self.coupling.reverse((adj, output)) + input = self.actnorm.reverse(input) + return input + + +class Block(nn.Module): + def __init__(self, in_channel, n_flow, squeeze_fold, hidden_channels, conv_lu=2): + super(Block, self).__init__() + self.squeeze_fold = squeeze_fold + squeeze_dim = in_channel * self.squeeze_fold * self.squeeze_fold + + self.flows = nn.ModuleList() + for i in range(n_flow): + if conv_lu in (0, 1): + self.flows.append(Flow(squeeze_dim, hidden_channels, + conv_lu=conv_lu, mask_swap=False)) + else: + self.flows.append(Flow(squeeze_dim, hidden_channels, + conv_lu=2, mask_swap=bool(i % 2))) + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + out = self._squeeze(input) + logdet = 0 + + for flow in self.flows: + out, det = flow(out) + logdet = logdet + det + + out = self._unsqueeze(out) + return out, logdet + + @torch.jit.export + def reverse(self, output: torch.Tensor) -> torch.Tensor: + input = self._squeeze(output) + + for flow in self.flows[::-1]: + input = flow.reverse(input) + + unsqueezed = self._unsqueeze(input) + return unsqueezed + + def _squeeze(self, x: torch.Tensor) -> torch.Tensor: + """Trade spatial extent for channels. In forward direction, convert each + 1x4x4 volume of input into a 4x1x1 volume of output. 
+
+        Args:
+            x (torch.Tensor): Input to squeeze.
+
+        Returns:
+            out (torch.Tensor): Squeezed tensor with `squeeze_fold**2` times
+                more channels and correspondingly smaller spatial extent.
+        """
+        assert len(x.shape) == 4
+        b_size, n_channel, height, width = x.shape
+        fold = self.squeeze_fold
+
+        squeezed = x.view(b_size, n_channel, height // fold, fold, width // fold, fold)
+        squeezed = squeezed.permute(0, 1, 3, 5, 2, 4).contiguous()
+        out = squeezed.view(b_size, n_channel * fold * fold, height // fold, width // fold)
+        return out
+
+    def _unsqueeze(self, x: torch.Tensor) -> torch.Tensor:
+        """Inverse of _squeeze: trade channels back for spatial extent."""
+        assert len(x.shape) == 4
+        b_size, n_channel, height, width = x.shape
+        fold = self.squeeze_fold
+        unsqueezed = x.view(b_size, n_channel // (fold * fold), fold, fold, height, width)
+        unsqueezed = unsqueezed.permute(0, 1, 4, 2, 5, 3).contiguous()
+        out = unsqueezed.view(b_size, n_channel // (fold * fold), height * fold, width * fold)
+        return out
+
+
+class BlockOnGraph(nn.Module):
+    def __init__(self, n_node, in_dim, hidden_dim_dict, n_flow, mask_row_size=1, mask_row_stride=1):
+        super(BlockOnGraph, self).__init__()
+        assert 0 < mask_row_size < n_node
+        self.flows = nn.ModuleList()
+        for i in range(n_flow):
+            start = i * mask_row_stride
+            masked_row = [r % n_node for r in range(start, start + mask_row_size)]
+            self.flows.append(FlowOnGraph(n_node, in_dim, hidden_dim_dict, masked_row=masked_row))
+
+    def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
+        adj, input = graph
+        out = input
+        logdet = 0
+        for flow in self.flows:
+            out, det = flow((adj, out))
+            logdet = logdet + det
+        return out, logdet
+
+    @torch.jit.export
+    def reverse(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+        adj, output = graph
+        input = output
+        for flow in self.flows[::-1]:
+            input = flow.reverse((adj, input))
+        return input
+
+
+class Glow(nn.Module):
+    def __init__(self, in_channel, n_flow, n_block, squeeze_fold, hidden_channel, conv_lu=2):
+        super(Glow, self).__init__()
+
+        self.blocks = nn.ModuleList()
+        n_channel = in_channel
+        for _ in range(n_block):
+            self.blocks.append(Block(n_channel, n_flow, squeeze_fold, hidden_channel, conv_lu=conv_lu))
+
+    def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        logdet = 0
+        out = input
+
+        for block in self.blocks:
+            out, det = block(out)
+            logdet = logdet + det
+
+        return out, logdet
+
+    @torch.jit.export
+    def reverse(self, z: torch.Tensor) -> torch.Tensor:
+        h = z
+        for block in self.blocks[::-1]:
+            h = block.reverse(h)
+
+        return h
+
+
+class GlowOnGraph(nn.Module):
+    def __init__(self, n_node, in_dim, hidden_dim_dict, n_flow, n_block,
+                 mask_row_size_list=(2,), mask_row_stride_list=(1,)):
+        super(GlowOnGraph, self).__init__()
+
+        assert len(mask_row_size_list) == n_block or len(mask_row_size_list) == 1
+        assert len(mask_row_stride_list) == n_block or len(mask_row_stride_list) == 1
+        if len(mask_row_size_list) == 1:
+            mask_row_size_list = mask_row_size_list * n_block
+        if len(mask_row_stride_list) == 1:
+            mask_row_stride_list = mask_row_stride_list * n_block
+        self.blocks = nn.ModuleList()
+        for i in range(n_block):
+            mask_row_size = mask_row_size_list[i]
+            mask_row_stride = mask_row_stride_list[i]
+            self.blocks.append(BlockOnGraph(n_node, in_dim, hidden_dim_dict, n_flow, mask_row_size, mask_row_stride))
+
+    def forward(self, adj: torch.Tensor, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        logdet = 0
+        out = x
+        for block in self.blocks:
+            out, det = block((adj, out))
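+            # each block contributes its log-determinant to the total
+            # log-likelihood of the flow
+            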
logdet = logdet + det + return out, logdet + + @torch.jit.export + def reverse(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + adj, z = graph + input = z + for i, block in enumerate(self.blocks[::-1]): + input = block.reverse((adj, input)) + + return input diff --git a/training/benchmarks/moflow/pytorch/model/model.py b/training/benchmarks/moflow/pytorch/model/model.py new file mode 100644 index 000000000..8cf6e0078 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/model.py @@ -0,0 +1,251 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +import math +import torch +import torch.nn as nn + +from misc.config import Config +from model.glow import Glow, GlowOnGraph + +def gaussian_nll(x, mean, ln_var): + """Computes the negative log-likelihood of a Gaussian distribution. + + Given two variable ``mean`` representing :math:`\\mu` and ``ln_var`` + representing :math:`\\log(\\sigma^2)`, this function computes in + elementwise manner the negative log-likelihood of :math:`x` on a + Gaussian distribution :math:`N(\\mu, S)`, + + .. math:: + + -\\log N(x; \\mu, \\sigma^2) = + \\log\\left(\\sqrt{(2\\pi)^D |S|}\\right) + + \\frac{1}{2}(x - \\mu)^\\top S^{-1}(x - \\mu), + + where :math:`D` is a dimension of :math:`x` and :math:`S` is a diagonal + matrix where :math:`S_{ii} = \\sigma_i^2`. + + Args: + x: Input variable. + mean: Mean of a Gaussian distribution, :math:`\\mu`. + ln_var: Logarithm of variance of a Gaussian distribution, + :math:`\\log(\\sigma^2)`. + + Returns: + torch.Tensor: + Negative log-likelihood. 
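+
+    Example (illustrative): at x = mean with ln_var = 0, the elementwise value
+    is log(2*pi)/2, approximately 0.9189, the NLL of a standard normal at its mode.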
+ """ + + x_prec = torch.exp(-ln_var) + x_diff = x - mean + x_power = (x_diff * x_diff) * x_prec * -0.5 + loss = (ln_var + math.log(2 * (math.pi))) / 2 - x_power + return loss + + +class MoFlowLoss(nn.Module): + def __init__(self, config: Config) -> None: + super().__init__() + self.b_n_type = config.num_edge_features + self.a_n_node = config.max_num_nodes + self.a_n_type = config.num_node_features + self.b_size = self.a_n_node * self.a_n_node * self.b_n_type + self.a_size = self.a_n_node * self.a_n_type + + if config.model_config.learn_dist: + self.ln_var = nn.Parameter(torch.zeros(1)) + else: + self.register_buffer('ln_var', torch.zeros(1)) + + def forward(self, h, adj_h, sum_log_det_jacs_x, sum_log_det_jacs_adj): + z = [h, adj_h] + logdet = [sum_log_det_jacs_x, sum_log_det_jacs_adj] + + device = z[0].device + dtype = z[0].dtype + z[0] = z[0].reshape(z[0].shape[0],-1) + z[1] = z[1].reshape(z[1].shape[0], -1) + + logdet[0] = logdet[0] - self.a_size * math.log(2.) + logdet[1] = logdet[1] - self.b_size * math.log(2.) + ln_var_adj = self.ln_var * torch.ones([self.b_size], device=device, dtype=dtype) + ln_var_x = self.ln_var * torch.ones([self.a_size], device=device, dtype=dtype) + nll_adj = torch.mean( + torch.sum(gaussian_nll(z[1], torch.zeros(self.b_size, device=device, dtype=dtype), ln_var_adj), dim=1) + - logdet[1]) + nll_adj = nll_adj / (self.b_size * math.log(2.)) # the negative log likelihood per dim with log base 2 + + nll_x = torch.mean(torch.sum( + gaussian_nll(z[0], torch.zeros(self.a_size, device=device, dtype=dtype), ln_var_x), + dim=1) - logdet[0]) + nll_x = nll_x / (self.a_size * math.log(2.)) # the negative log likelihood per dim with log base 2 + + return nll_x, nll_adj + + +class MoFlow(nn.Module): + def __init__(self, config: Config): + super(MoFlow, self).__init__() + self.config = config + self.b_n_type = config.num_edge_features + self.a_n_node = config.max_num_nodes + self.a_n_type = config.num_node_features + self.b_size = self.a_n_node * self.a_n_node * self.b_n_type + self.a_size = self.a_n_node * self.a_n_type + self.noise_scale = config.model_config.noise_scale + + self.bond_model = Glow( + in_channel=self.b_n_type, + n_flow=config.model_config.bond_config.n_flow, + n_block=config.model_config.bond_config.n_block, + squeeze_fold=config.model_config.bond_config.n_squeeze, + hidden_channel=config.model_config.bond_config.hidden_ch, + conv_lu=config.model_config.bond_config.conv_lu + ) + + self.atom_model = GlowOnGraph( + n_node=self.a_n_node, + in_dim=self.a_n_type, + hidden_dim_dict={ + 'gnn': config.model_config.atom_config.hidden_gnn, + 'linear': config.model_config.atom_config.hidden_lin + }, + n_flow=config.model_config.atom_config.n_flow, + n_block=config.model_config.atom_config.n_block, + mask_row_size_list=config.model_config.atom_config.mask_row_size_list, + mask_row_stride_list=config.model_config.atom_config.mask_row_stride_list, + ) + + self._cuda_graphs = dict() + self.atom_stream = None + self.bond_stream = None + + @torch.jit.ignore + def forward(self, adj: torch.Tensor, x: torch.Tensor, with_cuda_graph: bool = False): + """ + :param adj: (256,4,9,9) + :param x: (256,9,5) + :return: + """ + if with_cuda_graph and self.atom_stream is None: + self.atom_stream = torch.cuda.Stream() + self.bond_stream = torch.cuda.Stream() + h = x + # add uniform noise to node feature matrices + if self.training: + if self.noise_scale == 0: + h = h/2.0 - 0.5 + torch.rand_like(x) * 0.4 + else: + h = h + torch.rand_like(x) * self.noise_scale + if with_cuda_graph: + if 
self.atom_model not in self._cuda_graphs: + h, sum_log_det_jacs_x = self._forward_graph(self.atom_model, adj, h) + else: + self.atom_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self.atom_stream): + h, sum_log_det_jacs_x = self._forward_graph(self.atom_model, adj, h) + else: + h, sum_log_det_jacs_x = self.atom_model(adj, h) + + # add uniform noise to adjacency tensors + if self.training: + if self.noise_scale == 0: + adj_bond = adj/2.0 - 0.5 + torch.rand_like(adj) * 0.4 + else: + adj_bond = adj + torch.rand_like(adj) * self.noise_scale + else: + adj_bond = adj + if with_cuda_graph: + if self.bond_model not in self._cuda_graphs: + adj_h, sum_log_det_jacs_adj = self._forward_graph(self.bond_model, adj_bond) + else: + self.bond_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self.bond_stream): + adj_h, sum_log_det_jacs_adj = self._forward_graph(self.bond_model, adj_bond) + else: + adj_h, sum_log_det_jacs_adj = self.bond_model(adj_bond) + if with_cuda_graph: + torch.cuda.current_stream().wait_stream(self.atom_stream) + torch.cuda.current_stream().wait_stream(self.bond_stream) + return h, adj_h, sum_log_det_jacs_x, sum_log_det_jacs_adj + + @torch.jit.export + def reverse(self, z): + """ + Returns a molecule, given its latent vector. + :param z: latent vector. Shape: [B, N*N*M + N*T] + B = Batch size, N = number of atoms, M = number of bond types, + T = number of atom types (Carbon, Oxygen etc.) + :return: adjacency matrix and feature matrix of a molecule + """ + batch_size = z.shape[0] + z_x = z[:, :self.a_size] + z_adj = z[:, self.a_size:] + + h_adj = z_adj.reshape(batch_size, self.b_n_type, self.a_n_node, self.a_n_node) + h_adj = h_adj.to(memory_format=torch.channels_last) + h_adj = self.bond_model.reverse(h_adj) + + if self.noise_scale == 0: + h_adj = (h_adj + 0.5) * 2 + adj = h_adj + adj = adj + adj.permute(0, 1, 3, 2) + adj = adj / 2 + adj = adj.softmax(dim=1) + max_bond = adj.max(dim=1).values.reshape(batch_size, -1, self.a_n_node, self.a_n_node) + adj = torch.floor(adj / max_bond) + + adj = adj.to(memory_format=torch.channels_last) + h_x = z_x.reshape(batch_size, self.a_n_node, self.a_n_type) + h_x = self.atom_model.reverse((adj, h_x)) + if self.noise_scale == 0: + h_x = (h_x + 0.5) * 2 + return adj, h_x + + @torch.jit.ignore + def _forward_graph(self, model, *args): + if model not in self._cuda_graphs: + if torch.distributed.is_initialized(): + torch.distributed.barrier() + torch.cuda.synchronize() + self._cuda_graphs[model] = torch.cuda.make_graphed_callables( + model, + args, + ) + torch.cuda.synchronize() + if torch.distributed.is_initialized(): + torch.distributed.barrier() + return self._cuda_graphs[model](*args) diff --git a/training/benchmarks/moflow/pytorch/model/utils.py b/training/benchmarks/moflow/pytorch/model/utils.py new file mode 100644 index 000000000..6f9233040 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/utils.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+from typing import Iterable
+import torch
+
+def initialize_module(module: torch.nn.Module, inputs: Iterable[torch.Tensor]) -> None:
+    """Use the given sample input to initialize the module.
+    The module must implement a method called `initialize` that takes a list of input tensors.
+    """
+    assert hasattr(module, 'initialize')
+    assert len(inputs) == 1, f'{len(inputs)} inputs'
+    assert module.initialized.item() == 0, 'module already initialized'
+    module.initialize(*inputs)
+    assert module.initialized.item() == 1, 'module failed to initialize'
+
+
+def initialize(model: torch.nn.Module, single_batch: Iterable[torch.Tensor]) -> None:
+    """Initialize all sub-modules in the model given the sample input batch."""
+    hooks = []
+    for name, module in model.named_modules():
+        if hasattr(module, 'initialize'):
+            logging.info(f'marking {name} for initialization')
+            hook = module.register_forward_pre_hook(initialize_module)
+            hooks.append(hook)
+    _ = model(*single_batch)
+    logging.info('all modules initialized, removing hooks')
+    for hook in hooks:
+        hook.remove()
diff --git a/training/benchmarks/moflow/pytorch/optimizers/__init__.py b/training/benchmarks/moflow/pytorch/optimizers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/training/benchmarks/moflow/pytorch/run_pretraining.py b/training/benchmarks/moflow/pytorch/run_pretraining.py
new file mode 100755
index 000000000..07a216567
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/run_pretraining.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2023 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+# Standard library
+import os
+import sys
+import time
+from typing import Any, Tuple
+
+# Third-party libraries
+import torch
+
+# Append the benchmarks directory to sys.path
+CURR_PATH = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.abspath(os.path.join(CURR_PATH,
+                                             "../../")))  # benchmarks directory
+# Local modules
+import config
+from driver import Event, dist_pytorch
+from driver.helper import InitHelper
+
+# Import the required modules, methods and variables. Names are kept
+# consistent across cases; the implementations may differ.
+from runtime.logger import MetricsLogger, PerformanceLogger, setup_logging
+from runtime.common import get_newest_checkpoint, load_state
+from train import trainer_adapter
+from train.evaluator import Evaluator
+from train.trainer import Trainer
+from train.training_state import TrainingState
+from dataloaders.dataloader import build_datasets, build_train_dataloader
+
+logger = None
+
+torch._C._jit_set_autocast_mode(True)
+
+
+def main() -> Tuple[Any, Any]:
+    global logger
+    global config
+
+    # init
+    init_helper = InitHelper(config)
+    model_driver = init_helper.init_driver(globals(), locals())
+    args = model_driver.config
+    dist_pytorch.init_dist_training_env(args)
+    dist_pytorch.barrier(args.vendor)
+    model_driver.event(Event.INIT_START)
+
+    torch.cuda.set_stream(torch.cuda.Stream())
+    os.makedirs(args.results_dir, exist_ok=True)
+
+    args.distributed = dist_pytorch.get_world_size() > 1
+    # logger
+    logger = model_driver.logger
+    dllogger = setup_logging(args)
+    world_size = args.n_device
+    perf_logger, acc_logger = None, None
+    if args.local_rank == 0:
+        perf_logger = PerformanceLogger(dllogger,
+                                        args.train_batch_size * world_size,
+                                        args.warmup_steps)
+        acc_logger = MetricsLogger(dllogger)
+
+    train_dataset, _ = build_datasets(args)
+    train_dataloader = build_train_dataloader(args, train_dataset)
+
+    if args.save_epochs == -1:
+        args.save_epochs = args.epochs
+    if args.eval_epochs == -1:
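+        # -1 means "defer to the run length": fall back to the total number
+        # of epochs (and, just below, to epochs * steps-per-epoch for --steps)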
+        args.eval_epochs = args.epochs
+    if args.steps == -1:
+        args.steps = args.epochs * len(train_dataloader)
+
+    seed = args.seed
+    init_helper.set_seed(seed, args.vendor)
+
+    # Create the TrainingState object
+    training_state = TrainingState()
+
+    # Build the trainer; it depends on the evaluator and the TrainingState object
+    evaluator = Evaluator()
+    trainer = Trainer(
+        driver=model_driver,
+        adapter=trainer_adapter,
+        evaluator=evaluator,
+        training_state=training_state,
+        device=args.device,
+        args=args,
+        perf_logger=perf_logger,
+        acc_logger=acc_logger,
+        train_dataloader=train_dataloader,
+    )
+    training_state._trainer = trainer
+
+    # Set up the distributed environment, then trainer.init()
+    dist_pytorch.barrier(args.vendor)
+    trainer.init()
+    dist_pytorch.barrier(args.vendor)
+
+    if not args.do_train:
+        return args, training_state
+
+    model_driver.event(Event.INIT_END)
+
+    # TRAIN_START
+    dist_pytorch.barrier(args.vendor)
+    model_driver.event(Event.TRAIN_START)
+    train_start_time = time.time()
+
+    snapshot_path = get_newest_checkpoint(args.results_dir)
+    dist_pytorch.main_proc_print(f"snapshot_path: {snapshot_path}")
+
+    first_epoch, step = 0, 0
+    if snapshot_path is not None:
+        snapshot_epoch, ln_var = load_state(snapshot_path,
+                                            trainer.model_callable,
+                                            optimizer=trainer.optimizer,
+                                            device=args.device)
+        trainer.loss_callable.ln_var = torch.nn.Parameter(torch.tensor(ln_var))
+        first_epoch = snapshot_epoch + 1
+        step = first_epoch * len(train_dataloader)
+
+    if first_epoch > args.epochs:
+        dist_pytorch.main_proc_print(
+            f'Model was already trained for {first_epoch} epochs, skipping pretraining'
+        )
+
+    # Training loop
+    epoch = first_epoch
+    while epoch < args.epochs:
+        training_state.epoch = epoch
+        step = trainer.train_one_epoch(train_dataloader, step)
+        epoch += 1
+        if step >= args.steps:
+            break
+    # TRAIN_END event
+    training_state.train_time = time.time() - train_start_time
+    model_driver.event(Event.TRAIN_END)
+
+    if args.local_rank == 0:
+        # The same report for each epoch
+        acc_logger.summarize(step=tuple())
+        perf_logger.summarize(step=tuple())
+
+    res = evaluator.evaluate(args, trainer.config, acc_logger)
+    dist_pytorch.main_proc_print(f"evaluate results: {res}")
+    training_state.nuv = res['nuv']
+    if training_state.nuv >= args.target_nuv:
+        dist_pytorch.main_proc_print(
+            f"converged_success. eval_nuv: {training_state.nuv}, target_nuv: {args.target_nuv}"
+        )
+        training_state.converged_success()
+
+    return config, training_state
+
+
+if __name__ == "__main__":
+    start = time.time()
+    config_update, state = main()
+    if not dist_pytorch.is_main_process():
+        sys.exit(0)
+
+    # Write the training information to the log
+    e2e_time = time.time() - start
+    if config_update.do_train:
+        finished_info = {
+            "e2e_time": e2e_time,
+            "train_time": state.train_time,
+            "train_no_eval_time": state.no_eval_time,
+            "pure_training_computing_time": state.pure_compute_time,
+            "throughput(ips)_raw": round(state.num_trained_samples / state.train_time, 2),
+            "throughput(ips)_no_eval": round(state.num_trained_samples / state.no_eval_time, 2),
+            "throughput(ips)_pure_compute": round(state.num_trained_samples / state.pure_compute_time, 2),
+            "converged": state.converged,
+            "final_nuv": state.nuv,
+        }
+    else:
+        finished_info = {"e2e_time": e2e_time}
+    logger.log(Event.FINISHED, message=finished_info, stacklevel=0)
diff --git a/training/benchmarks/moflow/pytorch/runtime/__init__.py b/training/benchmarks/moflow/pytorch/runtime/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/training/benchmarks/moflow/pytorch/runtime/arguments.py b/training/benchmarks/moflow/pytorch/runtime/arguments.py
new file mode 100644
index 000000000..d5b8c58c5
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/runtime/arguments.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import os
+
+from misc.config import CONFIGS
+from runtime.logger import LOGGING_LEVELS
+
+
+PARSER = argparse.ArgumentParser()
+PARSER.add_argument('--data_dir', type=str, default='/data', help='Location for the dataset.')
+PARSER.add_argument('--config_name', type=str, default='zinc250k', choices=list(CONFIGS),
+                    help='The config to choose. This parameter allows one to switch between different datasets '
+                    'and their dedicated configurations of the neural network. By default, a pre-defined "zinc250k" config is used.')
+PARSER.add_argument('--results_dir', type=str, default='/results', help='Directory where checkpoints are stored.')
+PARSER.add_argument('--predictions_path', type=str, default='/results/predictions.smi',
+                    help='Path to store generated molecules. If an empty string is provided, predictions will not be '
+                    'saved (useful for benchmarking and debugging).')
+PARSER.add_argument('--log_path', type=str, default=None,
+                    help='Path for DLLogger log. This file will contain information about the speed and '
+                    'accuracy of the model during training and inference. Note that if the file '
+                    'already exists, new logs will be added at the end.')
+PARSER.add_argument('--log_interval', type=int, default=20, help='Frequency for writing logs, expressed in steps.')
+PARSER.add_argument('--warmup_steps', type=int, default=20,
+                    help='Number of warmup steps. 
This value is used for benchmarking and for CUDA graph capture.') +PARSER.add_argument('--steps', type=int, default=-1, + help='Number of steps used for training/inference. This parameter allows finishing ' + 'training earlier than the specified number of epochs. If used with inference, ' + 'it allows generating more molecules (by default only a single batch of molecules is generated).') +PARSER.add_argument('--save_epochs', type=int, default=5, + help='Frequency for saving checkpoints, expressed in epochs. If -1 is provided, checkpoints will not be saved.') +PARSER.add_argument('--eval_epochs', type=int, default=5, + help='Evaluation frequency, expressed in epochs. If -1 is provided, an evaluation will not be performed.') +PARSER.add_argument('--learning_rate', type=float, default=0.0005, help='Base learning rate.') +PARSER.add_argument('--beta1', type=float, default=0.9, help='beta1 parameter for the optimizer.') +PARSER.add_argument('--beta2', type=float, default=0.99, help='beta2 parameter for the optimizer.') +PARSER.add_argument('--clip', type=float, default=1, help='Gradient clipping norm.') +PARSER.add_argument('--epochs', type=int, default=300, + help='Number of training epochs. Note that you can finish training mid-epoch by using "--steps" flag.') +PARSER.add_argument('--batch_size', type=int, default=512, help='Batch size per GPU.') +PARSER.add_argument('--num_workers', type=int, default=4, help='Number of workers in the data loader.') +PARSER.add_argument('--seed', type=int, default=1, help='Random seed used to initialize the distributed loaders.') +PARSER.add_argument('--local_rank', default=os.environ.get('LOCAL_RANK', 0), type=int, + help='rank of the GPU, used to launch distributed training. This argument is specified ' + 'automatically by `torchrun` and does not have to be provided by the user.') +PARSER.add_argument('--temperature', type=float, default=0.3, help='Temperature used for sampling.') +PARSER.add_argument('--val_batch_size', type=int, default=100, help='Number of molecules to generate during validation step.') +PARSER.add_argument('--allow_untrained', action='store_true', + help='Allow sampling molecules from an untrained network. Useful for performance benchmarking or debugging purposes.') +PARSER.add_argument('--correct_validity', action='store_true', help='Apply validity correction after the generation of the molecules.') +PARSER.add_argument('--amp', action='store_true', help='Use Automatic Mixed Precision.') +PARSER.add_argument('--cuda_graph', action='store_true', help='Capture GPU kernels with CUDA graphs. This option allows to speed up training.') +PARSER.add_argument('--jit', action='store_true', help='Compile the model with `torch.jit.script`. Can be used to speed up training or inference.') +PARSER.add_argument('--verbosity', type=int, default=1, choices=list(LOGGING_LEVELS), + help='Verbosity level. Specify the following values: 0, 1, 2, 3, where 0 means minimal ' + 'verbosity (errors only) and 3 - maximal (debugging).') diff --git a/training/benchmarks/moflow/pytorch/runtime/common.py b/training/benchmarks/moflow/pytorch/runtime/common.py new file mode 100644 index 000000000..3d630089d --- /dev/null +++ b/training/benchmarks/moflow/pytorch/runtime/common.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from glob import glob
+import logging
+import os
+from typing import List, Optional, Tuple
+import torch
+
+from model.model import MoFlow
+
+
+CHECKPOINT_PATTERN = 'model_snapshot_epoch_%s'
+
+
+def _sort_checkpoints(paths: List[str]) -> List[str]:
+    return sorted(paths, key=lambda x: int(x.split('_')[-1]))
+
+
+def save_state(dir: str, model: MoFlow, optimizer: torch.optim.Optimizer, ln_var: float, epoch: int, keep: int = 1) -> None:
+    """Save the training state in a given dir. This checkpoint can be used to resume training or to run
+    inference with the trained model. This function keeps up to `keep` newest checkpoints and removes the older ones.
+    """
+    save_path = os.path.join(dir, CHECKPOINT_PATTERN % (epoch + 1))
+    state = {
+        'model': model.state_dict(),
+        'optimizer': optimizer.state_dict(),
+        'ln_var': ln_var,
+        'epoch': epoch,
+    }
+    torch.save(state, save_path)
+
+    if keep > 0:
+        filenames = glob(os.path.join(dir, CHECKPOINT_PATTERN % '*'))
+        if len(filenames) <= keep:
+            return
+
+        to_del = _sort_checkpoints(filenames)[:-keep]
+        for path in to_del:
+            os.remove(path)
+
+
+def load_state(path: str, model: MoFlow, device: torch.device, optimizer: Optional[torch.optim.Optimizer] = None) -> Tuple[int, float]:
+    """Load the model's and optimizer's state from a given file.
+    This function returns the number of epochs the model was trained for and the natural
+    logarithm of the variance of the latent-space distribution.
+    """
+    state = torch.load(path, map_location=device)
+    model.load_state_dict(state['model'])
+    if optimizer is not None:
+        optimizer.load_state_dict(state['optimizer'])
+    return state['epoch'], state['ln_var']
+
+
+def get_newest_checkpoint(model_dir: str, validate: bool = True) -> Optional[str]:
+    """Find the newest checkpoint in a given directory.
+    If validate is set to True, this function also verifies that the file can be loaded and
+    selects an older checkpoint if necessary.
+    """
+    filenames = glob(os.path.join(model_dir, CHECKPOINT_PATTERN % '*'))
+    if len(filenames) == 0:
+        logging.info('No checkpoints available')
+        return None
+
+    paths = _sort_checkpoints(filenames)
+    if validate:
+        for latest_path in paths[::-1]:
+            try:
+                torch.load(latest_path, map_location='cpu')
+                break
+            except Exception:
+                logging.info(f'Checkpoint {latest_path} is corrupted')
+        else:
+            logging.info('All available checkpoints were corrupted')
+            return None
+
+    else:
+        latest_path = paths[-1]
+
+    logging.info(f'Found checkpoint {latest_path}')
+    return latest_path
diff --git a/training/benchmarks/moflow/pytorch/runtime/distributed_utils.py b/training/benchmarks/moflow/pytorch/runtime/distributed_utils.py
new file mode 100644
index 000000000..67ca67e16
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/runtime/distributed_utils.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import os + +import torch +import torch.distributed as dist + + +def get_device(local_rank: int) -> torch.device: + if torch.cuda.is_available(): + torch.cuda.set_device(local_rank % torch.cuda.device_count()) + device = torch.device("cuda") + else: + device = torch.device("cpu") + logging.warning("not using a(ny) GPU(s)!") + return device + + +def get_world_size() -> int: + return int(os.environ.get("WORLD_SIZE", 1)) + + +def reduce_tensor(tensor: torch.Tensor, num_gpus: int) -> torch.Tensor: + if num_gpus > 1: + rt = tensor.clone() + dist.all_reduce(rt, op=dist.ReduceOp.SUM) + if rt.is_floating_point(): + rt = rt / num_gpus + else: + rt = rt // num_gpus + return rt + return tensor + + +def init_distributed() -> bool: + world_size = int(os.environ.get("WORLD_SIZE", 1)) + distributed = world_size > 1 + if distributed: + backend = "nccl" if torch.cuda.is_available() else "gloo" + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" # Needed for CUDA graphs + dist.init_process_group(backend=backend, init_method="env://") + assert dist.is_initialized() + + if get_rank() == 0: + logging.info(f"Distributed initialized. World size: {world_size}") + return distributed + + +def get_rank() -> int: + """ + Gets distributed rank or returns zero if distributed is not initialized. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = 0 + return rank diff --git a/training/benchmarks/moflow/pytorch/runtime/generate.py b/training/benchmarks/moflow/pytorch/runtime/generate.py new file mode 100644 index 000000000..de9369494 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/runtime/generate.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
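+
+# Sampling note (editor's sketch of what `infer` below does): it draws
+# z ~ N(mu, sigma^2 I) with sigma = temperature * sqrt(exp(ln_var)), then
+# runs the flow in reverse to decode z into an adjacency tensor and a
+# node-feature matrix.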
+ + +from typing import Optional, Tuple +import numpy as np + +from torch.cuda.amp import autocast +import torch + +from misc.config import CONFIGS, Config + +from model.model import MoFlow +from misc.utils import convert_predictions_to_mols, postprocess_predictions +from runtime.arguments import PARSER +from runtime.common import get_newest_checkpoint, load_state +from runtime.distributed_utils import get_device +from runtime.logger import PerformanceLogger, setup_logging + + +def infer(model: MoFlow, config: Config, device: torch.device, *, + ln_var: float = 0, temp: float = 0.6, mu: Optional[torch.Tensor] = None, + batch_size: int = 20) -> Tuple[np.ndarray, np.ndarray]: + + if mu is None: + mu = torch.zeros(config.z_dim, dtype=torch.float32, device=device) + + sigma = temp * np.sqrt(np.exp(ln_var)) + with torch.no_grad(): + z = torch.normal(mu.reshape(-1, config.z_dim).repeat((batch_size, 1)), sigma) + adj, x = model.reverse(z) + x, adj = postprocess_predictions(x, adj, config=config) + + return adj, x + + +if __name__ == '__main__': + from rdkit import RDLogger + RDLogger.DisableLog('rdApp.*') + + args = PARSER.parse_args() + logger = setup_logging(args) + perf_logger = PerformanceLogger(logger, args.train_batch_size, args.warmup_steps, mode='generate') + if args.predictions_path: + from rdkit.Chem import SmilesWriter + smiles_writer = SmilesWriter(args.predictions_path) + + snapshot_path = get_newest_checkpoint(args.results_dir) + config = CONFIGS[args.config_name] + model = MoFlow(config) + + device = get_device(args.local_rank) + if snapshot_path is not None: + epoch, ln_var = load_state(snapshot_path, model, device=device) + elif args.allow_untrained: + epoch, ln_var = 0, 0 + else: + raise RuntimeError('Generating molecules from an untrained network! ' + 'If this was intentional, pass --allow_untrained flag.') + model.to(device=device, memory_format=torch.channels_last) + model.eval() + if args.jit: + model.atom_model = torch.jit.script(model.atom_model) + model.bond_model = torch.jit.script(model.bond_model) + + + if args.steps == -1: + args.steps = 1 + + with autocast(enabled=args.amp): + for i in range(args.steps): + perf_logger.update() + results = infer( + model, config, ln_var=ln_var, temp=args.temperature, batch_size=args.batch_size, + device=device) + + if (i + 1) % args.log_interval == 0: + perf_logger.summarize(step=(0, i, i)) + if args.predictions_path: + mols_batch = convert_predictions_to_mols(*results, correct_validity=args.correct_validity) + for mol in mols_batch: + smiles_writer.write(mol) + + perf_logger.summarize(step=tuple()) + if args.predictions_path: + smiles_writer.close() diff --git a/training/benchmarks/moflow/pytorch/runtime/logger.py b/training/benchmarks/moflow/pytorch/runtime/logger.py new file mode 100644 index 000000000..b597d26de --- /dev/null +++ b/training/benchmarks/moflow/pytorch/runtime/logger.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
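+
+# Usage sketch (illustrative, mirroring run_pretraining.py):
+#     perf = PerformanceLogger(dllogger, batch_size=512, warmup_steps=20)
+#     for batch in loader:
+#         perf.update()             # timestamps are collected after warmup
+#     perf.summarize(step=tuple())  # logs throughput and latency percentiles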
+ + +from abc import ABC, abstractmethod +import logging +import time + +import dllogger +from dllogger import JSONStreamBackend, StdOutBackend, Verbosity +import numpy as np + + +LOGGING_LEVELS = dict(enumerate([logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG])) + + +def get_dllogger(args): + backends = [] + if args.local_rank == 0: + backends.append(StdOutBackend(Verbosity.VERBOSE)) + if args.log_path is not None: + backends.append(JSONStreamBackend(Verbosity.VERBOSE, args.log_path, append=True)) + dllogger.init(backends=backends) + return dllogger + + +def setup_logging(args): + logging.basicConfig( + format='%(asctime)s %(levelname)s:\t%(message)s', datefmt='%H:%M:%S', level=LOGGING_LEVELS[args.verbosity], force=True + ) + return get_dllogger(args) + + +class BaseLogger(ABC): + @abstractmethod + def update(self, **kwargs) -> None: + pass + + @abstractmethod + def process_stats(self) -> dict: + return {} + + @abstractmethod + def reset(self) -> None: + pass + + def summarize(self, step: tuple): + stats = self.process_stats() + if len(stats) == 0: + logging.warn('Empty stats for logging, skipping') + return + self.logger.log(step=step, data=stats) + self.logger.flush() + return stats + + +class PerformanceLogger(BaseLogger): + def __init__(self, logger, batch_size: int, warmup_steps: int = 100, mode: str = 'train'): + self.logger = logger + self.batch_size = batch_size + self.warmup_steps = warmup_steps + self._step = 0 + self._timestamps = [] + self.mode = mode + + def update(self, **kwargs) -> None: + self._step += 1 + if self._step >= self.warmup_steps: + self._timestamps.append(time.time()) + + def reset(self) -> None: + self._step = 0 + self._timestamps = [] + + def process_stats(self) -> dict: + if len(self._timestamps) < 2: + logging.warn('Cannot process performance stats - less than 2 measurements collected') + return {} + + timestamps = np.asarray(self._timestamps) + deltas = np.diff(timestamps) + throughput = (self.batch_size / deltas).mean() + stats = { + f'throughput_{self.mode}': throughput, + f'latency_{self.mode}_mean': deltas.mean(), + f'total_time_{self.mode}': timestamps[-1] - timestamps[0], + } + for level in [90, 95, 99]: + stats.update({f'latency_{self.mode}_{level}': np.percentile(deltas, level)}) + + return stats + + +class MetricsLogger(BaseLogger): + def __init__(self, logger, mode: str = 'train'): + self.logger = logger + self.mode = mode + self._metrics_dict = {} + + def update(self, metrics: dict, **kwargs) -> None: + for metrics_name, metric_val in metrics.items(): + if metrics_name not in self._metrics_dict: + self._metrics_dict[metrics_name] = [] + self._metrics_dict[metrics_name].append(float(metric_val)) + + def reset(self) -> None: + self._metrics_dict = {} + + def process_stats(self) -> dict: + stats = {} + for metric_name, metric_val in self._metrics_dict.items(): + stats[metric_name] = np.mean(metric_val) + return stats diff --git a/training/benchmarks/moflow/pytorch/train/__init__.py b/training/benchmarks/moflow/pytorch/train/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/training/benchmarks/moflow/pytorch/train/evaluator.py b/training/benchmarks/moflow/pytorch/train/evaluator.py new file mode 100755 index 000000000..6af54fb72 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/train/evaluator.py @@ -0,0 +1,110 @@ +import os +import sys +import torch +import numpy as np +from functools import partial + +from torch.cuda.amp import autocast + +from misc.utils import check_validity, convert_predictions_to_mols, 
predictions_to_smiles, check_novelty +from runtime.common import get_newest_checkpoint, load_state +from data.data_loader import NumpyTupleDataset +from data import transform +from runtime.generate import infer +from model.model import MoFlow + +# add benchmarks directory to sys.path +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import dist_pytorch + + +class Evaluator: + + def __init__(self): + pass + + def evaluate(self, args, config, acc_logger): + device = args.device + snapshot_path = get_newest_checkpoint(args.results_dir) + args.jit = True + model = MoFlow(config) + + dist_pytorch.main_proc_print(f"snapshot_path: {snapshot_path}") + + if snapshot_path is not None: + epoch, ln_var = load_state(snapshot_path, model, device=device) + elif args.allow_untrained: + epoch, ln_var = 0, 0 + else: + raise RuntimeError( + 'Generating molecules from an untrained network! ' + 'If this was intentional, pass --allow_untrained flag.') + dist_pytorch.main_proc_print(f"ln_var ===> {ln_var}") + + model.to(device) + model.eval() + + if args.steps == -1: + args.steps = 1 + else: + args.steps = 100 + + dist_pytorch.main_proc_print(f"ln_var ===> {ln_var}. args.steps:{args.steps}") + + valid_idx = transform.get_val_ids(config, args.data_dir) + dataset = NumpyTupleDataset.load( + os.path.join(args.data_dir, config.dataset_config.dataset_file), + transform=partial(transform.transform_fn, config=config), + ) + train_idx = [t for t in range(len(dataset)) if t not in valid_idx] + n_train = len(train_idx) + train_dataset = torch.utils.data.Subset(dataset, train_idx) + train_x = torch.Tensor(np.array([a[0] for a in train_dataset])) + train_adj = torch.Tensor(np.array([a[1] for a in train_dataset])) + + train_smiles = set(predictions_to_smiles(train_adj, train_x, config)) + + metrics = dict() + with autocast(enabled=args.amp): + for i in range(args.steps): + results = infer(model, + config, + ln_var=ln_var, + temp=args.temperature, + batch_size=args.train_batch_size, + device=device) + + mols_batch = convert_predictions_to_mols( + *results, + correct_validity=args.correct_validity, + ) + validity_info = check_validity(mols_batch) + + novel_r, abs_novel_r = check_novelty( + validity_info['valid_smiles'], + train_smiles, + len(mols_batch), + ) + _, nuv = check_novelty( + list(set(validity_info['valid_smiles'])), + train_smiles, + len(mols_batch), + ) + metrics = { + 'validity': validity_info['valid_ratio'], + 'novelty': novel_r, + 'uniqueness': validity_info['unique_ratio'], + 'abs_novelty': abs_novel_r, + 'abs_uniqueness': validity_info['abs_unique_ratio'], + 'nuv': nuv, + } + dist_pytorch.main_proc_print(f"step:{i} metrics:{metrics}") + if args.local_rank == 0: + acc_logger.update(metrics) + + if args.local_rank == 0: + stats = acc_logger.summarize(step=tuple()) + return stats + + return None diff --git a/training/benchmarks/moflow/pytorch/train/trainer.py b/training/benchmarks/moflow/pytorch/train/trainer.py new file mode 100755 index 000000000..ab9a95d5b --- /dev/null +++ b/training/benchmarks/moflow/pytorch/train/trainer.py @@ -0,0 +1,237 @@ +# Copyright (c) 2023 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +import time +import os +import sys +import argparse +from typing import Dict + +import torch +import torch.utils.data +from torch.types import Device +from torch.cuda.amp import autocast + +from model import create_model +from model.model import MoFlow, MoFlowLoss +from model.utils import initialize +from train.evaluator import Evaluator +from train.training_state import TrainingState +from runtime.generate import infer +from runtime.distributed_utils import reduce_tensor +from runtime.common import save_state +from misc.utils import check_validity, convert_predictions_to_mols +from misc.config import CONFIGS, Config +# add benchmarks directory to sys.path +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import Driver, dist_pytorch + + +class Trainer: + + def __init__(self, driver: Driver, adapter, evaluator: Evaluator, + training_state: TrainingState, device: Device, args, + perf_logger, acc_logger, train_dataloader): + super(Trainer, self).__init__() + self.driver = driver + self.adapter = adapter + self.training_state = training_state + self.device = device + self.evaluator = evaluator + self.args = args + self.perf_logger = perf_logger + self.acc_logger = acc_logger + self.train_dataloader = train_dataloader + + self.scaler = None + self.config = None + self.clip_grad = None + self.loss_module = None + self.model_callable = None + self.loss_callable = None + self.model = None + + def init(self): + args = self.args + dist_pytorch.main_proc_print("Init progress:") + self.config = CONFIGS[self.args.config_name] + self.model = create_model(self.config) + self.model.to(self.device) + device = args.device + model = self.model + x, adj, *_ = next(iter(self.train_dataloader)) + x = x.to(device) + adj = adj.to(device) + with autocast(enabled=args.amp): + initialize(self.model, (adj, x)) + + model.to(memory_format=torch.channels_last) + adj.to(memory_format=torch.channels_last) + + if args.jit: + model.bond_model = torch.jit.script(model.bond_model) + model.atom_model = torch.jit.script(model.atom_model) + + # make one pass in both directions to make sure that model works + with torch.no_grad(): + _ = model(adj, x) + _ = model.reverse( + torch.randn(args.train_batch_size, + self.config.z_dim, + device=device)) + + self.model = self.adapter.convert_model(self.model) + self.model = self.adapter.model_to_fp16(self.model, self.args) + self.model = self.adapter.model_to_ddp(self.model, self.args) + self.loss_module = MoFlowLoss(self.config) + self.loss_module.to(self.device) + self.loss_module = self.adapter.model_to_ddp(self.loss_module, + self.args) + self.model_callable, self.loss_callable = self._get_callables() + + self.optimizer = self.adapter.create_optimizer(self.model, self.args, + self.loss_module) + self.scaler = self.adapter.create_grad_scaler(self.args) + self.clip_grad = self.adapter.create_clip_grad() + + def _get_callables(self): + args = self.args + is_distributed = args.distributed + model_callable, loss_callable = None, None + if is_distributed: + model_callable = self.model.module + loss_callable = self.loss_module.module + else: + model_callable = self.model + loss_callable = self.loss_module + return model_callable, loss_callable + + def train_one_epoch(self, train_dataloader, step) -> int: + model = self.model + args = self.args + device = self.device + epoch = self.training_state.epoch + clip_grad_norm_ = self.clip_grad 
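+        # clip_grad_norm_ is supplied by the vendor trainer_adapter; on NVIDIA it
+        # is apex.contrib.clip_grad.clip_grad_norm_
+        # (see training/nvidia/moflow-pytorch/extern/trainer_adapter.py)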
+ world_size = args.n_device + local_rank = self.args.local_rank + is_distributed = args.distributed + + + if local_rank == 0: + self.acc_logger.reset() + + print(f"Epoch: {epoch}") + if args.distributed: + train_dataloader.sampler.set_epoch(epoch) + + model.train() + noeval_start_time = time.time() + + for i, batch in enumerate(train_dataloader): + if local_rank == 0: + self.perf_logger.update() + step += 1 + self.optimizer.zero_grad() + x = batch[0].to(device) + adj = batch[1].to(device=device, memory_format=torch.channels_last) + + pure_compute_start_time = time.time() + + # Forward, backward and optimize + with_cuda_graph = (args.cuda_graph and step >= args.warmup_steps + and x.size(0) == args.train_batch_size) + + with autocast(enabled=args.amp, cache_enabled=not with_cuda_graph): + output = model(adj, x, with_cuda_graph=with_cuda_graph) + nll_x, nll_adj = self.loss_module(*output) + loss = nll_x + nll_adj + + if args.amp: + self.scaler.scale(loss).backward() + self.scaler.unscale_(self.optimizer) + clip_grad_norm_(model.parameters(), args.clip) + self.scaler.step(self.optimizer) + self.scaler.update() + else: + loss.backward() + clip_grad_norm_(model.parameters(), args.clip) + self.optimizer.step() + + self.training_state.pure_compute_time += time.time() - pure_compute_start_time + + # Print log info + if (i + 1) % args.log_interval == 0: + nll_x_value = reduce_tensor(nll_x, world_size).item() + nll_adj_value = reduce_tensor(nll_adj, world_size).item() + loss_value = nll_x_value + nll_adj_value + + if local_rank == 0: + self.acc_logger.update({ + 'loglik': loss_value, + 'nll_x': nll_x_value, + 'nll_adj': nll_adj_value + }) + + self.acc_logger.summarize(step=(epoch, i, i)) + self.perf_logger.summarize(step=(epoch, i, i)) + + + if step >= args.steps: + break + + self.training_state.num_trained_samples += len(train_dataloader.dataset) + self.training_state.no_eval_time += time.time() - noeval_start_time + + if (epoch + 1) % args.eval_epochs == 0: + with autocast(enabled=args.amp): + metrics = run_validation(self.model, self.config, + self.loss_callable.ln_var.item(), args, + is_distributed, world_size, device) + dist_pytorch.main_proc_print(f"epoch:{epoch+1}, metrics:{metrics}") + + if local_rank == 0: + self.acc_logger.update(metrics) + + # The same report for each epoch + if local_rank == 0: + self.acc_logger.summarize(step=(epoch, )) + self.perf_logger.summarize(step=(epoch, )) + + # Save the model checkpoints + if (epoch + 1) % args.save_epochs == 0: + if local_rank == 0 or not is_distributed: + save_state(args.results_dir, + self.model_callable, + self.optimizer, + self.loss_callable.ln_var.item(), + epoch, + keep=5) + return step + +def run_validation(model: MoFlow, config: Config, ln_var: float, + args: argparse.Namespace, is_distributed: bool, + world_size: int, device: torch.device) -> Dict[str, float]: + model.eval() + model_callable = model.module if is_distributed else model + + result = infer(model_callable, + config, + device=device, + ln_var=ln_var, + batch_size=args.eval_batch_size, + temp=args.temperature) + mols = convert_predictions_to_mols(*result, + correct_validity=args.correct_validity) + validity_info = check_validity(mols) + valid_ratio = torch.tensor(validity_info['valid_ratio'], + dtype=torch.float32, + device=device) + unique_ratio = torch.tensor(validity_info['unique_ratio'], + dtype=torch.float32, + device=device) + valid_value = reduce_tensor(valid_ratio, world_size).detach().cpu().numpy() + unique_value = reduce_tensor(unique_ratio, + 
world_size).detach().cpu().numpy()
+    model.train()
+    return {'valid': valid_value, 'unique': unique_value}
diff --git a/training/benchmarks/moflow/pytorch/train/trainer_adapter.py b/training/benchmarks/moflow/pytorch/train/trainer_adapter.py
new file mode 100755
index 000000000..2d40170be
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/train/trainer_adapter.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2023 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+import os
+import sys
+
+import torch
+import torch.distributed as dist
+from torch import nn
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+CURR_PATH = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
+from driver.dist_pytorch import main_proc_print
+
+
+# The three factories below are stubs that the vendor's extern/trainer_adapter.py overrides.
+def create_optimizer(model, args, loss_module):
+    pass
+
+
+def create_clip_grad():
+    pass
+
+
+def create_grad_scaler(args):
+    pass
+
+
+def convert_model(model: nn.Module) -> nn.Module:
+    return model
+
+
+def model_to_fp16(model: nn.Module, args) -> nn.Module:
+    # To prevent OOM for model sizes that cannot fit in GPU memory in full
+    # precision. Note that the cast below is to bfloat16 (not float16), which
+    # keeps float32's dynamic range at reduced precision.
+    if args.fp16:
+        main_proc_print(" > use fp16 (model cast to bfloat16)...")
+        model.to(torch.bfloat16)
+    return model
+
+
+def model_to_ddp(model: nn.Module, args) -> nn.Module:
+    if dist.is_available() and dist.is_initialized():
+        model = DDP(model, device_ids=[args.local_rank])
+    return model
diff --git a/training/benchmarks/moflow/pytorch/train/training_state.py b/training/benchmarks/moflow/pytorch/train/training_state.py
new file mode 100755
index 000000000..35e6dd53d
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/train/training_state.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2023 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+from dataclasses import dataclass
+
+
+@dataclass
+class TrainingState:
+    _trainer = None
+    _status = 'aborted'  # later set to 'success' if the termination criteria are met
+
+    global_steps = 0
+
+    loss: float = 0.0
+    nuv: float = 0.0
+
+    epoch: int = 1
+
+    end_training: bool = False
+    converged: bool = False
+
+    train_time = 0.001
+    no_eval_time = 0.001
+    pure_compute_time = 0.001
+
+    num_trained_samples = 0
+
+    def status(self):
+        """get status"""
+        if self.converged:
+            self._status = "success"
+        return self._status
+
+    def converged_success(self):
+        """converged success"""
+        self.end_training = True
+        self.converged = True
diff --git a/training/nvidia/moflow-pytorch/README.md b/training/nvidia/moflow-pytorch/README.md
new file mode 100644
index 000000000..a7fccc451
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/README.md
@@ -0,0 +1,51 @@
+### 1. Dataset preparation
+
+[ZINC 250k](../../benchmarks/moflow/pytorch/README.md#dataset)
+
+### 2. NVIDIA GPU configuration and run information
+#### Environment setup
+- ##### Hardware
+  - Machine model: NVIDIA DGX A100 (40G)
+  - Accelerator model: NVIDIA_A100-SXM4-40GB
+  - CPU model: AMD EPYC7742-64core@1.5G
+  - Multi-node network type and bandwidth: InfiniBand, 200Gb/s
+
+- ##### Software
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.4.0-113-generic
+  - Accelerator driver version: 470.129.06
+  - Docker version: 20.10.16
+  - Training framework version: pytorch-1.13.0a0+936e930
+  - Dependency versions:
+    - cuda: 11.4
+
+
+### 3. Results
+
+* Common metrics
+
+| Metric | Value | Notes |
+| -------------- | ----------------------- | ----------------------------------------------------------- |
+| Task category | DrugDiscovery | |
+| Model | MoFlow | |
+| Dataset | ZINC 250k | |
+| Data precision | precision, see "Performance metrics" | one of tf32/amp/fp16 |
+| Hyperparameter overrides | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware when measuring throughput |
+| Hardware | NVIDIA A100 | |
+| Device memory usage | mem, see "Performance metrics" | commonly called "GPU memory", in GiB |
+| End-to-end time | e2e_time, see "Performance metrics" | total time + Perf initialization time etc. |
+| Overall throughput | p_whole, see "Performance metrics" | samples actually trained divided by total time (performance_whole) |
+| Training throughput | p_train, see "Performance metrics" | excludes the evaluation time at the end of each epoch |
+| **Compute throughput** | **p_core, see "Performance metrics"** | excludes data-IO time (p_core > p_train > p_whole) |
+| Training result | nuv, see "Performance metrics" | percentage of all generated molecules that are Novel, Unique and Valid |
+| Additional modifications | none | |
+
+* Performance metrics
+
+| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_nuv | mem |
+| ----------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | --------- | --------- |
+| A100 single node 8 GPUs (1x8) | amp | / | 3114 | 24008 | 27165 | 30343 | 78.4 | 11.8/40.0 |
+| A100 single node 8 GPUs (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | 76 | 34.6/40.0 |
+| A100 single GPU (1x1) | amp | bs=256,lr=0.8 | / | 4447 | 4542 | 4819 | / | 10.9/40.0 |
+| A100 two nodes x 8 GPUs (2x8) | amp | bs=256,lr=0.8 | / | 10576 | 11085 | 11874 | / | 27.9/40.0 |
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x1.py b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
new file mode 100644
index 000000000..47f1f7d43
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
@@ -0,0 +1,4 @@
+lr = 0.0001
+train_batch_size = 512
+eval_batch_size = 100
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x8.py b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
new file mode 100644
index 000000000..9ead7fcdb
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
@@ -0,0 +1,4 @@
+lr = 0.0005
+train_batch_size = 512
+eval_batch_size = 100
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x2x8.py b/training/nvidia/moflow-pytorch/config/config_A100x2x8.py
new file mode 100644
index 000000000..9049954f7
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x2x8.py
@@ -0,0 +1,3 @@
+lr = 0.003
+train_batch_size = 3072
+eval_batch_size = 600
\ No newline at end of file
diff --git a/training/nvidia/moflow-pytorch/config/environment_variables.sh b/training/nvidia/moflow-pytorch/config/environment_variables.sh
new file mode 100755
index 000000000..c035c8297
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/environment_variables.sh
@@ -0,0 +1,8 @@
+# =================================================
+# Export variables
+# =================================================
+# Add your own proxy here if you have problems connecting with github.com
+export http_proxy="10.1.0.34:7890"
+export https_proxy="10.1.0.34:7890"
+
+export OMP_NUM_THREADS=8
diff --git a/training/nvidia/moflow-pytorch/config/requirements.txt b/training/nvidia/moflow-pytorch/config/requirements.txt
new file mode 100644
index 000000000..d58f2e8a9
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/requirements.txt
@@ -0,0 +1,2 @@
+rdkit
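+# dllogger is NVIDIA's structured logging helper used by runtime/logger.py;
+# it is installed straight from GitHub below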
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x1.py b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
new file mode 100644
index 000000000..47f1f7d43
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
@@ -0,0 +1,4 @@
+lr = 0.0001
+train_batch_size = 512
+eval_batch_size = 100
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x8.py b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
new file mode 100644
index 000000000..9ead7fcdb
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
@@ -0,0 +1,4 @@
+lr = 0.0005
+train_batch_size = 512
+eval_batch_size = 100
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x2x8.py b/training/nvidia/moflow-pytorch/config/config_A100x2x8.py
new file mode 100644
index 000000000..9049954f7
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x2x8.py
@@ -0,0 +1,3 @@
+lr = 0.003
+train_batch_size = 3072
+eval_batch_size = 600
\ No newline at end of file
diff --git a/training/nvidia/moflow-pytorch/config/environment_variables.sh b/training/nvidia/moflow-pytorch/config/environment_variables.sh
new file mode 100755
index 000000000..c035c8297
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/environment_variables.sh
@@ -0,0 +1,8 @@
+# =================================================
+# Export variables
+# =================================================
+# Add your own proxy here if you have problems connecting with github.com
+export http_proxy="10.1.0.34:7890"
+export https_proxy="10.1.0.34:7890"
+
+export OMP_NUM_THREADS=8
diff --git a/training/nvidia/moflow-pytorch/config/requirements.txt b/training/nvidia/moflow-pytorch/config/requirements.txt
new file mode 100644
index 000000000..d58f2e8a9
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/requirements.txt
@@ -0,0 +1,2 @@
+rdkit
+git+https://github.com/NVIDIA/dllogger#egg=dllogger
\ No newline at end of file
diff --git a/training/nvidia/moflow-pytorch/extern/trainer_adapter.py b/training/nvidia/moflow-pytorch/extern/trainer_adapter.py
new file mode 100644
index 000000000..aff774f81
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/extern/trainer_adapter.py
@@ -0,0 +1,17 @@
+import torch
+from apex.optimizers import FusedAdam as Adam
+from apex.contrib.clip_grad import clip_grad_norm_
+
+
+def create_optimizer(model, args, loss_module):
+    optimizer = Adam((*model.parameters(), *loss_module.parameters()), lr=args.lr, betas=(args.beta1, args.beta2))
+    return optimizer
+
+
+def create_clip_grad():
+    return clip_grad_norm_
+
+
+def create_grad_scaler(args):
+    """create_grad_scaler for mixed precision training"""
+    scaler = torch.cuda.amp.GradScaler() if args.amp else None
+    return scaler
\ No newline at end of file
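The vendor adapter above wires apex's FusedAdam, apex's fused gradient clipping, and an optional torch.cuda.amp.GradScaler into the benchmark. For context, a scaler produced this way is normally consumed with the standard torch.cuda.amp recipe; the sketch below shows that pattern under illustrative names (the benchmark's real loop lives in train/trainer.py and differs in detail):

```python
import torch


def training_step(model, loss_module, optimizer, scaler, clip_grad, adj, x,
                  max_norm: float = 1.0):
    """One illustrative AMP step using the objects built by trainer_adapter."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=scaler is not None):
        output = model(adj, x)
        loss = loss_module(*output)  # assumes the loss module unpacks model outputs
    if scaler is not None:
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # so clipping sees unscaled gradients
        clip_grad(model.parameters(), max_norm)
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        clip_grad(model.parameters(), max_norm)
        optimizer.step()
    return loss.item()
```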
From 308bc2e74c564b6ed694d15b34a115e2b3ae982a Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Wed, 17 Jan 2024 17:29:04 +0800
Subject: [PATCH 02/12] update readme

---
 .../benchmarks/moflow/pytorch/config/_base.py |  2 +-
 training/nvidia/moflow-pytorch/README.md      | 18 ++++++++++++------
 .../moflow-pytorch/config/config_A100x1x1.py  |  5 ++---
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py
index b1ff28e92..00dc8cc4d 100755
--- a/training/benchmarks/moflow/pytorch/config/_base.py
+++ b/training/benchmarks/moflow/pytorch/config/_base.py
@@ -36,7 +36,7 @@
 train_batch_size: int = 512
 eval_batch_size: int = 100
 
-target_nuv: float = 85.0
+target_nuv: float = 80
 
 # Frequency for saving checkpoints, expressed in epochs. If -1 is provided, checkpoints will not be saved.
 save_epochs: int = 5
diff --git a/training/nvidia/moflow-pytorch/README.md b/training/nvidia/moflow-pytorch/README.md
index a7fccc451..841b99c67 100644
--- a/training/nvidia/moflow-pytorch/README.md
+++ b/training/nvidia/moflow-pytorch/README.md
@@ -42,10 +42,16 @@
 
 * Performance metrics
 
-| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_nuv | mem |
-| ----------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | --------- | --------- |
-| A100 single node, 8 cards (1x8) | amp | / | 3114 | 24008 | 27165 | 30343 | 78.4 | 11.8/40.0 |
-| A100 single node, 8 cards (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | 76 | 34.6/40.0 |
-| A100 single node, 1 card (1x1) | amp | bs=256,lr=0.8 | / | 4447 | 4542 | 4819 | / | 10.9/40.0 |
-| A100 two nodes, 8 cards each (2x8) | amp | bs=256,lr=0.8 | / | 10576 | 11085 | 11874 | / | 27.9/40.0 |
+| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_nuv | mem |
+| ----------------- | --------- | ----------------- | -------- | ------- | ------- | ------ | --------- | --------- |
+| A100 single node, 8 cards (1x8) | amp | / | 3098 | 24128 | 27514 | 30696 | 78.4 | 11.8/40.0 |
+| A100 single node, 8 cards (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | 76.1 | 34.6/40.0 |
+| A100 single node, 1 card (1x1) | amp | bs=3584,lr=0.0001 | / | 5810 | 5992 | 6387 | / | 37.9/40.0 |
+| A100 two nodes, 8 cards each (2x8) | amp | bs=3072,lr=0.0005 | / | 47655 | 63957 | 90228 | / | 34.5/40.0 |
+
+
+
+> Note
+> The [NUV](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow#results) reported in the original repository is the average over **20** runs with **20** different random seeds.
+> Because of limited machine resources, we could not run such a large number of validation experiments; the results here may therefore differ somewhat from those values.
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x1.py b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
index 47f1f7d43..15500667d 100644
--- a/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
@@ -1,4 +1,3 @@
 lr = 0.0001
-train_batch_size = 512
-eval_batch_size = 100
-
+train_batch_size = 3584
+eval_batch_size = 100
\ No newline at end of file

From 11e50f349c96ae84573d9f9534758335f368fae1 Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Wed, 17 Jan 2024 17:32:23 +0800
Subject: [PATCH 03/12] add case example for test_conf

---
 .../nvidia/moflow-pytorch/config/environment_variables.sh | 4 ++--
 training/run_benchmarks/config/test_conf.py               | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/training/nvidia/moflow-pytorch/config/environment_variables.sh b/training/nvidia/moflow-pytorch/config/environment_variables.sh
index c035c8297..0a9382166 100755
--- a/training/nvidia/moflow-pytorch/config/environment_variables.sh
+++ b/training/nvidia/moflow-pytorch/config/environment_variables.sh
@@ -2,7 +2,7 @@
 # Export variables
 # =================================================
 # Add your own proxy here if you have problems connecting with github.com
-export http_proxy="10.1.0.34:7890"
-export https_proxy="10.1.0.34:7890"
+# export http_proxy="127.0.0.1:7890"
+# export https_proxy="127.0.0.1:7890"
 
 export OMP_NUM_THREADS=8
"/raid/dataset/ImageNet2012/tf_records/", + "moflow:pytorch_1.13:A100:1:1:1": "/raid/dataset/MoFlow/data/", # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", From 2cb2eceae19ae621ed86772dbcf3bd168ee8668b Mon Sep 17 00:00:00 2001 From: zhouyu Date: Wed, 17 Jan 2024 17:34:59 +0800 Subject: [PATCH 04/12] change to comment --- training/run_benchmarks/config/test_conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 254585727..4aa5e1051 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -80,7 +80,7 @@ # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech", # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", # "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/", - "moflow:pytorch_1.13:A100:1:1:1": "/raid/dataset/MoFlow/data/", + # "moflow:pytorch_1.13:A100:1:1:1": "/raid/dataset/MoFlow/data/", # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", From 2dc11409d488de6b7621f1922b59ea5b380209a9 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Fri, 19 Jan 2024 10:29:58 +0800 Subject: [PATCH 05/12] rdkit add version --- training/nvidia/moflow-pytorch/config/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/nvidia/moflow-pytorch/config/requirements.txt b/training/nvidia/moflow-pytorch/config/requirements.txt index d58f2e8a9..768ed454f 100644 --- a/training/nvidia/moflow-pytorch/config/requirements.txt +++ b/training/nvidia/moflow-pytorch/config/requirements.txt @@ -1,2 +1,2 @@ -rdkit +rdkit==2023.9.3 git+https://github.com/NVIDIA/dllogger#egg=dllogger \ No newline at end of file From abee0464fa6a96442a29aa3da6538aa47c300d1c Mon Sep 17 00:00:00 2001 From: zhouyu Date: Fri, 23 Feb 2024 13:09:41 +0800 Subject: [PATCH 06/12] add jit & cuda_graph to mutable_params, overwritten by vendors are allowed --- training/benchmarks/moflow/pytorch/config/mutable_params.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/benchmarks/moflow/pytorch/config/mutable_params.py b/training/benchmarks/moflow/pytorch/config/mutable_params.py index 7dcf8652a..ec54460d8 100755 --- a/training/benchmarks/moflow/pytorch/config/mutable_params.py +++ b/training/benchmarks/moflow/pytorch/config/mutable_params.py @@ -1,5 +1,6 @@ mutable_params = [ 'vendor', 'data_dir', 'lr', 'train_batch_size', 'eval_batch_size', 'do_train', 'amp', 'fp16', 'distributed', 'dist_backend', 'num_workers', - 'device', 'cudnn_benchmark', 'cudnn_deterministic' + 'device', 'cudnn_benchmark', 'cudnn_deterministic', + 'jit', 'cuda_graph' ] From 9d2324396b5e5d1e815b304819a5a6ccaad99a41 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Mon, 26 Feb 2024 10:14:27 +0800 Subject: [PATCH 07/12] rename config_name to dataset_name --- training/benchmarks/moflow/pytorch/config/_base.py | 2 +- training/benchmarks/moflow/pytorch/dataloaders/dataloader.py | 4 ++-- training/benchmarks/moflow/pytorch/runtime/arguments.py | 2 +- training/benchmarks/moflow/pytorch/runtime/generate.py | 2 +- training/benchmarks/moflow/pytorch/train/trainer.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py index 00dc8cc4d..9475338b6 100755 --- a/training/benchmarks/moflow/pytorch/config/_base.py +++ 
From 9d2324396b5e5d1e815b304819a5a6ccaad99a41 Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Mon, 26 Feb 2024 10:14:27 +0800
Subject: [PATCH 07/12] rename config_name to dataset_name

---
 training/benchmarks/moflow/pytorch/config/_base.py           | 2 +-
 training/benchmarks/moflow/pytorch/dataloaders/dataloader.py | 4 ++--
 training/benchmarks/moflow/pytorch/runtime/arguments.py      | 2 +-
 training/benchmarks/moflow/pytorch/runtime/generate.py       | 2 +-
 training/benchmarks/moflow/pytorch/train/trainer.py          | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py
index 00dc8cc4d..9475338b6 100755
--- a/training/benchmarks/moflow/pytorch/config/_base.py
+++ b/training/benchmarks/moflow/pytorch/config/_base.py
@@ -14,7 +14,7 @@
 # =========================================================
 # The config to choose. This parameter allows one to switch between different datasets
 # and their dedicated configurations of the neural network. By default, a pre-defined "zinc250k" config is used.
-config_name: str = "zinc250k"
+dataset_name: str = "zinc250k"
 
 # Number of workers in the data loader.
 num_workers: int = 4
diff --git a/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py b/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py
index a9591b20f..d1e8d27c2 100644
--- a/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py
+++ b/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py
@@ -12,8 +12,8 @@
 
 def build_datasets(args):
     # Model configuration
-    assert args.config_name in CONFIGS
-    config = CONFIGS[args.config_name]
+    assert args.dataset_name in CONFIGS
+    config = CONFIGS[args.dataset_name]
     data_file = config.dataset_config.dataset_file
     transform_fn = functools.partial(transform.transform_fn, config=config)
     valid_idx = transform.get_val_ids(config, args.data_dir)
diff --git a/training/benchmarks/moflow/pytorch/runtime/arguments.py b/training/benchmarks/moflow/pytorch/runtime/arguments.py
index d5b8c58c5..5a48fe4d2 100644
--- a/training/benchmarks/moflow/pytorch/runtime/arguments.py
+++ b/training/benchmarks/moflow/pytorch/runtime/arguments.py
@@ -22,7 +22,7 @@
 PARSER = argparse.ArgumentParser()
 PARSER.add_argument('--data_dir', type=str, default='/data', help='Location for the dataset.')
-PARSER.add_argument('--config_name', type=str, default='zinc250k', choices=list(CONFIGS),
+PARSER.add_argument('--dataset_name', type=str, default='zinc250k', choices=list(CONFIGS),
                     help='The config to choose. This parameter allows one to switch between different datasets '
                          'and their dedicated configurations of the neural network. 
By default, a pre-defined "zinc250k" config is used.') PARSER.add_argument('--results_dir', type=str, default='/results', help='Directory where checkpoints are stored.') diff --git a/training/benchmarks/moflow/pytorch/runtime/generate.py b/training/benchmarks/moflow/pytorch/runtime/generate.py index de9369494..5e03f1db4 100644 --- a/training/benchmarks/moflow/pytorch/runtime/generate.py +++ b/training/benchmarks/moflow/pytorch/runtime/generate.py @@ -57,7 +57,7 @@ def infer(model: MoFlow, config: Config, device: torch.device, *, smiles_writer = SmilesWriter(args.predictions_path) snapshot_path = get_newest_checkpoint(args.results_dir) - config = CONFIGS[args.config_name] + config = CONFIGS[args.dataset_name] model = MoFlow(config) device = get_device(args.local_rank) diff --git a/training/benchmarks/moflow/pytorch/train/trainer.py b/training/benchmarks/moflow/pytorch/train/trainer.py index ab9a95d5b..98de66380 100755 --- a/training/benchmarks/moflow/pytorch/train/trainer.py +++ b/training/benchmarks/moflow/pytorch/train/trainer.py @@ -55,7 +55,7 @@ def __init__(self, driver: Driver, adapter, evaluator: Evaluator, def init(self): args = self.args dist_pytorch.main_proc_print("Init progress:") - self.config = CONFIGS[self.args.config_name] + self.config = CONFIGS[self.args.dataset_name] self.model = create_model(self.config) self.model.to(self.device) device = args.device From 01097740327bac8aed89cecba4f8beb2b035fc0b Mon Sep 17 00:00:00 2001 From: zhouyu Date: Mon, 26 Feb 2024 13:09:53 +0800 Subject: [PATCH 08/12] set time statistic variables to 0 --- training/benchmarks/moflow/pytorch/train/training_state.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/training/benchmarks/moflow/pytorch/train/training_state.py b/training/benchmarks/moflow/pytorch/train/training_state.py index 35e6dd53d..4a45ab93d 100755 --- a/training/benchmarks/moflow/pytorch/train/training_state.py +++ b/training/benchmarks/moflow/pytorch/train/training_state.py @@ -19,10 +19,9 @@ class TrainingState: end_training: bool = False converged: bool = False - train_time = 0.001 - no_eval_time = 0.001 - pure_compute_time = 0.001 - + train_time = 0 + no_eval_time = 0 + pure_compute_time = 0 num_trained_samples = 0 def status(self): From eb1fd00fd754e76abe208b72c1fe3b03cd141ecd Mon Sep 17 00:00:00 2001 From: zhouyu Date: Sat, 2 Mar 2024 19:11:20 +0800 Subject: [PATCH 09/12] update seed and target_nuv --- .../benchmarks/moflow/pytorch/config/_base.py | 4 +- .../moflow/pytorch/run_pretraining.py | 7 +-- .../moflow/pytorch/train/evaluator.py | 17 +++---- .../moflow/pytorch/train/trainer.py | 44 +++++++++++-------- .../moflow-pytorch/config/config_A100x1x8.py | 1 + .../moflow-pytorch/config/requirements.txt | 2 +- 6 files changed, 38 insertions(+), 37 deletions(-) diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py index 9475338b6..26f9ab84c 100755 --- a/training/benchmarks/moflow/pytorch/config/_base.py +++ b/training/benchmarks/moflow/pytorch/config/_base.py @@ -36,10 +36,10 @@ train_batch_size: int = 512 eval_batch_size: int = 100 -target_nuv: float = 80 +target_nuv: float = 87.9 # Frequency for saving checkpoints, expressed in epochs. If -1 is provided, checkpoints will not be saved. -save_epochs: int = 5 +save_epochs: int = 50 # Evaluation frequency, expressed in epochs. If -1 is provided, an evaluation will not be performed. 
eval_epochs: int = 5
diff --git a/training/benchmarks/moflow/pytorch/run_pretraining.py b/training/benchmarks/moflow/pytorch/run_pretraining.py
index 07a216567..88a68e90a 100755
--- a/training/benchmarks/moflow/pytorch/run_pretraining.py
+++ b/training/benchmarks/moflow/pytorch/run_pretraining.py
@@ -97,6 +97,8 @@
     dist_pytorch.barrier(args.vendor)
 
     if not args.do_train:
+        res = evaluator.evaluate(args, trainer.config, acc_logger)
+        dist_pytorch.main_proc_print(f"evaluate results: {res}")
         return args, training_state
 
     model_driver.event(Event.INIT_END)
@@ -122,7 +124,6 @@
     first_epoch = 0
     step = 0
-
     if first_epoch > args.epochs:
         dist_pytorch.main_proc_print(
             f'Model was already trained for {first_epoch} epochs, skip pretraining'
         )
@@ -132,9 +133,9 @@
     epoch = first_epoch
     while epoch < args.epochs:
         training_state.epoch = epoch
-        step = trainer.train_one_epoch(train_dataloader, step)
+        trainer.train_one_epoch(train_dataloader, step)
         epoch += 1
-        if step >= args.steps:
+        if training_state.global_steps >= args.steps:
             break
 
     # TRAIN_END event
     training_state.train_time = time.time() - train_start_time
diff --git a/training/benchmarks/moflow/pytorch/train/evaluator.py b/training/benchmarks/moflow/pytorch/train/evaluator.py
index 6af54fb72..7bb0b39c0 100755
--- a/training/benchmarks/moflow/pytorch/train/evaluator.py
+++ b/training/benchmarks/moflow/pytorch/train/evaluator.py
@@ -40,7 +40,6 @@
             raise RuntimeError(
                 'Generating molecules from an untrained network! '
                 'If this was intentional, pass --allow_untrained flag.')
-        dist_pytorch.main_proc_print(f"ln_var ===> {ln_var}")
         model.to(device)
         model.eval()
 
@@ -48,7 +47,7 @@
         if args.steps == -1:
             args.steps = 1
         else:
-            args.steps = 100
+            args.steps = 1000
         dist_pytorch.main_proc_print(f"ln_var ===> {ln_var}. 
args.steps:{args.steps}") @@ -58,7 +57,6 @@ def evaluate(self, args, config, acc_logger): transform=partial(transform.transform_fn, config=config), ) train_idx = [t for t in range(len(dataset)) if t not in valid_idx] - n_train = len(train_idx) train_dataset = torch.utils.data.Subset(dataset, train_idx) train_x = torch.Tensor(np.array([a[0] for a in train_dataset])) train_adj = torch.Tensor(np.array([a[1] for a in train_dataset])) @@ -80,7 +78,6 @@ def evaluate(self, args, config, acc_logger): correct_validity=args.correct_validity, ) validity_info = check_validity(mols_batch) - novel_r, abs_novel_r = check_novelty( validity_info['valid_smiles'], train_smiles, @@ -99,12 +96,8 @@ def evaluate(self, args, config, acc_logger): 'abs_uniqueness': validity_info['abs_unique_ratio'], 'nuv': nuv, } - dist_pytorch.main_proc_print(f"step:{i} metrics:{metrics}") - if args.local_rank == 0: - acc_logger.update(metrics) - - if args.local_rank == 0: - stats = acc_logger.summarize(step=tuple()) - return stats + dist_pytorch.main_proc_print("metrics", i, metrics) + acc_logger.update(metrics) - return None + stats = acc_logger.summarize(step=tuple()) + return stats diff --git a/training/benchmarks/moflow/pytorch/train/trainer.py b/training/benchmarks/moflow/pytorch/train/trainer.py index 98de66380..e2d9db2f8 100755 --- a/training/benchmarks/moflow/pytorch/train/trainer.py +++ b/training/benchmarks/moflow/pytorch/train/trainer.py @@ -59,24 +59,23 @@ def init(self): self.model = create_model(self.config) self.model.to(self.device) device = args.device - model = self.model x, adj, *_ = next(iter(self.train_dataloader)) x = x.to(device) adj = adj.to(device) with autocast(enabled=args.amp): initialize(self.model, (adj, x)) - model.to(memory_format=torch.channels_last) + self.model.to(memory_format=torch.channels_last) adj.to(memory_format=torch.channels_last) if args.jit: - model.bond_model = torch.jit.script(model.bond_model) - model.atom_model = torch.jit.script(model.atom_model) + self.model.bond_model = torch.jit.script(self.model.bond_model) + self.model.atom_model = torch.jit.script(self.model.atom_model) # make one pass in both directions to make sure that model works with torch.no_grad(): - _ = model(adj, x) - _ = model.reverse( + _ = self.model(adj, x) + _ = self.model.reverse( torch.randn(args.train_batch_size, self.config.z_dim, device=device)) @@ -107,7 +106,7 @@ def _get_callables(self): loss_callable = self.loss_module return model_callable, loss_callable - def train_one_epoch(self, train_dataloader, step) -> int: + def train_one_epoch(self, train_dataloader, step): model = self.model args = self.args device = self.device @@ -117,6 +116,8 @@ def train_one_epoch(self, train_dataloader, step) -> int: local_rank = self.args.local_rank is_distributed = args.distributed + if step > 0 and step > self.training_state.global_steps: + self.training_state.global_steps = step if local_rank == 0: self.acc_logger.reset() @@ -131,7 +132,7 @@ def train_one_epoch(self, train_dataloader, step) -> int: for i, batch in enumerate(train_dataloader): if local_rank == 0: self.perf_logger.update() - step += 1 + self.training_state.global_steps += 1 self.optimizer.zero_grad() x = batch[0].to(device) adj = batch[1].to(device=device, memory_format=torch.channels_last) @@ -139,8 +140,10 @@ def train_one_epoch(self, train_dataloader, step) -> int: pure_compute_start_time = time.time() # Forward, backward and optimize - with_cuda_graph = (args.cuda_graph and step >= args.warmup_steps - and x.size(0) == args.train_batch_size) + 
with_cuda_graph = (
+                args.cuda_graph
+                and self.training_state.global_steps >= args.warmup_steps
+                and x.size(0) == args.train_batch_size)
 
             with autocast(enabled=args.amp, cache_enabled=not with_cuda_graph):
                 output = model(adj, x, with_cuda_graph=with_cuda_graph)
@@ -158,7 +161,8 @@
                 clip_grad_norm_(model.parameters(), args.clip)
             self.optimizer.step()
 
-            self.training_state.pure_compute_time += time.time() - pure_compute_start_time
+            self.training_state.pure_compute_time += time.time(
+            ) - pure_compute_start_time
 
             # Print log info
             if (i + 1) % args.log_interval == 0:
@@ -176,19 +180,21 @@
                     self.acc_logger.summarize(step=(epoch, i, i))
                     self.perf_logger.summarize(step=(epoch, i, i))
-
-            if step >= args.steps:
+            if self.training_state.global_steps >= args.steps:
                 break
 
-        self.training_state.num_trained_samples += len(train_dataloader.dataset)
+        self.training_state.num_trained_samples += len(
+            train_dataloader.dataset)
         self.training_state.no_eval_time += time.time() - noeval_start_time
 
         if (epoch + 1) % args.eval_epochs == 0:
             with autocast(enabled=args.amp):
                 metrics = run_validation(self.model, self.config,
-                                         self.loss_callable.ln_var.item(), args,
-                                         is_distributed, world_size, device)
+                                         self.loss_callable.ln_var.item(),
+                                         args, is_distributed, world_size,
+                                         device)
-            dist_pytorch.main_proc_print(f"epoch:{epoch+1}, metrics:{metrics}")
+            dist_pytorch.main_proc_print(
+                f"epoch:{epoch+1}, metrics:{metrics}")
 
             if local_rank == 0:
                 self.acc_logger.update(metrics)
@@ -206,8 +212,8 @@
                     self.optimizer,
                     self.loss_callable.ln_var.item(),
                     epoch,
-                    keep=5)
-        return step
+                    keep=3)
+
 
 def run_validation(model: MoFlow, config: Config, ln_var: float,
                    args: argparse.Namespace, is_distributed: bool,
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x8.py b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
index 9ead7fcdb..d7c74636d 100644
--- a/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
@@ -2,3 +2,4 @@
 train_batch_size = 512
 eval_batch_size = 100
 
+seed = 42
\ No newline at end of file
diff --git a/training/nvidia/moflow-pytorch/config/requirements.txt b/training/nvidia/moflow-pytorch/config/requirements.txt
index 768ed454f..441b965e1 100644
--- a/training/nvidia/moflow-pytorch/config/requirements.txt
+++ b/training/nvidia/moflow-pytorch/config/requirements.txt
@@ -1,2 +1,2 @@
-rdkit==2023.9.3
+rdkit-pypi
 git+https://github.com/NVIDIA/dllogger#egg=dllogger
\ No newline at end of file
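The seed = 42 pinned above matters because, as the README note added two patches later explains, the nuv result is sensitive to sampling randomness. A sketch of the kind of global seeding this implies (the benchmark's own seeding helper may differ in detail):

```python
import random

import numpy as np
import torch


def set_seed(seed: int, rank: int = 0) -> None:
    """Seed every RNG the training loop touches. Offsetting by rank keeps
    distributed workers decorrelated while the run stays reproducible."""
    random.seed(seed + rank)
    np.random.seed(seed + rank)
    torch.manual_seed(seed + rank)
    torch.cuda.manual_seed_all(seed + rank)
```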
From bfc3d1dc478c8c6d2dde7622adaac81555bb94a2 Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Sat, 2 Mar 2024 19:18:17 +0800
Subject: [PATCH 10/12] update 1x8 result for official bs

---
 training/nvidia/moflow-pytorch/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/nvidia/moflow-pytorch/README.md b/training/nvidia/moflow-pytorch/README.md
index 841b99c67..bf006dc8e 100644
--- a/training/nvidia/moflow-pytorch/README.md
+++ b/training/nvidia/moflow-pytorch/README.md
@@ -44,8 +44,8 @@
 
 | Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_nuv | mem |
 | ----------------- | --------- | ----------------- | -------- | ------- | ------- | ------ | --------- | --------- |
-| A100 single node, 8 cards (1x8) | amp | / | 3098 | 24128 | 27514 | 30696 | 78.4 | 11.8/40.0 |
-| A100 single node, 8 cards (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | 76.1 | 34.6/40.0 |
+| A100 single node, 8 cards (1x8) | amp | / | 3220 | 27023 | 27519 | 30789 | 88.45 | 11.8/40.0 |
+| A100 single node, 8 cards (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | / | 34.6/40.0 |
 | A100 single node, 1 card (1x1) | amp | bs=3584,lr=0.0001 | / | 5810 | 5992 | 6387 | / | 37.9/40.0 |
 | A100 two nodes, 8 cards each (2x8) | amp | bs=3072,lr=0.0005 | / | 47655 | 63957 | 90228 | / | 34.5/40.0 |

From 9d09a230fa1848fcd211314da1c5c96697303981 Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Sat, 2 Mar 2024 19:30:13 +0800
Subject: [PATCH 11/12] update notice for readme

---
 training/nvidia/moflow-pytorch/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/nvidia/moflow-pytorch/README.md b/training/nvidia/moflow-pytorch/README.md
index bf006dc8e..9fa9609be 100644
--- a/training/nvidia/moflow-pytorch/README.md
+++ b/training/nvidia/moflow-pytorch/README.md
@@ -53,5 +53,6 @@
 
 > Note
 > The [NUV](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow#results) reported in the original repository is the average over **20** runs with **20** different random seeds.
-> Because of limited machine resources, we could not run such a large number of validation experiments; the results here may therefore differ somewhat from those values.
+> This model's results are quite sensitive to randomness: the seed, the sampling temperature, and similar settings all affect the nuv value; see the [training-stability-test](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow#training-stability-test) section for details.
+> If a vendor run does not converge on the first attempt, it should be retried **several times**.

From 67bd369debff0dfb3b452cdee942c9bb0264d87d Mon Sep 17 00:00:00 2001
From: Zhou Yu
Date: Sat, 2 Mar 2024 19:31:38 +0800
Subject: [PATCH 12/12] Update test_conf.py

---
 training/run_benchmarks/config/test_conf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index 4aa5e1051..ee609e952 100644
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -80,7 +80,7 @@
     # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech",
     # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/",
     # "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/",
-    # "moflow:pytorch_1.13:A100:1:1:1": "/raid/dataset/MoFlow/data/",
+    # "moflow:pytorch_1.13:A100:1:8:1": "/raid/dataset/MoFlow/data/",
 
     # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/",
@@ -152,4 +152,4 @@
     # "faster_rcnn:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/",
     # "retinanet:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/",
     # "resnet50:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
-    # "swin_transformer:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
\ No newline at end of file
+    # "swin_transformer:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
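Finally, the test_conf case keys edited throughout this series encode the run topology. The parser below illustrates the assumed layout, model:framework:hardware:nnodes:nproc_per_node:repeats (field names are inferred from the config_A100x1x8.py-style file naming, not taken from FlagPerf documentation):

```python
from dataclasses import dataclass


@dataclass
class CaseKey:
    model: str
    framework: str
    hardware: str
    nnodes: int
    nproc_per_node: int
    repeats: int


def parse_case_key(key: str) -> CaseKey:
    model, framework, hardware, nnodes, nproc, repeats = key.split(":")
    return CaseKey(model, framework, hardware, int(nnodes), int(nproc),
                   int(repeats))


# "moflow:pytorch_1.13:A100:1:8:1" -> one node, eight A100s, run once
print(parse_case_key("moflow:pytorch_1.13:A100:1:8:1"))
```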