From 6e298f74fa634c77489b8f285b58ed4d2d345435 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Wed, 17 Jan 2024 15:26:48 +0800 Subject: [PATCH 01/12] add MoFlow std case --- training/benchmarks/moflow/pytorch/README.md | 55 ++++ .../benchmarks/moflow/pytorch/__init__.py | 0 .../moflow/pytorch/config/__init__.py | 2 + .../benchmarks/moflow/pytorch/config/_base.py | 105 +++++++ .../moflow/pytorch/config/mutable_params.py | 5 + .../moflow/pytorch/data/__init__.py | 0 .../moflow/pytorch/data/data_frame_parser.py | 109 +++++++ .../moflow/pytorch/data/data_loader.py | 110 +++++++ .../moflow/pytorch/data/encoding.py | 139 +++++++++ .../moflow/pytorch/data/transform.py | 85 ++++++ .../moflow/pytorch/dataloaders/__init__.py | 0 .../moflow/pytorch/dataloaders/dataloader.py | 68 +++++ .../benchmarks/moflow/pytorch/misc/config.py | 142 +++++++++ .../benchmarks/moflow/pytorch/misc/utils.py | 211 ++++++++++++++ .../moflow/pytorch/model/__init__.py | 6 + .../benchmarks/moflow/pytorch/model/basic.py | 194 +++++++++++++ .../moflow/pytorch/model/coupling.py | 196 +++++++++++++ .../benchmarks/moflow/pytorch/model/glow.py | 270 ++++++++++++++++++ .../benchmarks/moflow/pytorch/model/model.py | 251 ++++++++++++++++ .../benchmarks/moflow/pytorch/model/utils.py | 42 +++ .../moflow/pytorch/optimizers/__init__.py | 0 .../moflow/pytorch/run_pretraining.py | 192 +++++++++++++ .../moflow/pytorch/runtime/__init__.py | 0 .../moflow/pytorch/runtime/arguments.py | 69 +++++ .../moflow/pytorch/runtime/common.py | 93 ++++++ .../pytorch/runtime/distributed_utils.py | 71 +++++ .../moflow/pytorch/runtime/generate.py | 97 +++++++ .../moflow/pytorch/runtime/logger.py | 124 ++++++++ .../moflow/pytorch/train/__init__.py | 0 .../moflow/pytorch/train/evaluator.py | 110 +++++++ .../moflow/pytorch/train/trainer.py | 237 +++++++++++++++ .../moflow/pytorch/train/trainer_adapter.py | 44 +++ .../moflow/pytorch/train/training_state.py | 37 +++ training/nvidia/moflow-pytorch/README.md | 51 ++++ .../moflow-pytorch/config/config_A100x1x1.py | 4 + .../moflow-pytorch/config/config_A100x1x8.py | 4 + .../moflow-pytorch/config/config_A100x2x8.py | 3 + .../config/environment_variables.sh | 8 + .../moflow-pytorch/config/requirements.txt | 2 + .../moflow-pytorch/extern/trainer_adapter.py | 17 ++ 40 files changed, 3153 insertions(+) create mode 100644 training/benchmarks/moflow/pytorch/README.md create mode 100644 training/benchmarks/moflow/pytorch/__init__.py create mode 100755 training/benchmarks/moflow/pytorch/config/__init__.py create mode 100755 training/benchmarks/moflow/pytorch/config/_base.py create mode 100755 training/benchmarks/moflow/pytorch/config/mutable_params.py create mode 100644 training/benchmarks/moflow/pytorch/data/__init__.py create mode 100644 training/benchmarks/moflow/pytorch/data/data_frame_parser.py create mode 100644 training/benchmarks/moflow/pytorch/data/data_loader.py create mode 100644 training/benchmarks/moflow/pytorch/data/encoding.py create mode 100644 training/benchmarks/moflow/pytorch/data/transform.py create mode 100644 training/benchmarks/moflow/pytorch/dataloaders/__init__.py create mode 100644 training/benchmarks/moflow/pytorch/dataloaders/dataloader.py create mode 100644 training/benchmarks/moflow/pytorch/misc/config.py create mode 100644 training/benchmarks/moflow/pytorch/misc/utils.py create mode 100644 training/benchmarks/moflow/pytorch/model/__init__.py create mode 100644 training/benchmarks/moflow/pytorch/model/basic.py create mode 100644 training/benchmarks/moflow/pytorch/model/coupling.py create mode 100644 
training/benchmarks/moflow/pytorch/model/glow.py create mode 100644 training/benchmarks/moflow/pytorch/model/model.py create mode 100644 training/benchmarks/moflow/pytorch/model/utils.py create mode 100644 training/benchmarks/moflow/pytorch/optimizers/__init__.py create mode 100755 training/benchmarks/moflow/pytorch/run_pretraining.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/__init__.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/arguments.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/common.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/distributed_utils.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/generate.py create mode 100644 training/benchmarks/moflow/pytorch/runtime/logger.py create mode 100755 training/benchmarks/moflow/pytorch/train/__init__.py create mode 100755 training/benchmarks/moflow/pytorch/train/evaluator.py create mode 100755 training/benchmarks/moflow/pytorch/train/trainer.py create mode 100755 training/benchmarks/moflow/pytorch/train/trainer_adapter.py create mode 100755 training/benchmarks/moflow/pytorch/train/training_state.py create mode 100644 training/nvidia/moflow-pytorch/README.md create mode 100644 training/nvidia/moflow-pytorch/config/config_A100x1x1.py create mode 100644 training/nvidia/moflow-pytorch/config/config_A100x1x8.py create mode 100644 training/nvidia/moflow-pytorch/config/config_A100x2x8.py create mode 100755 training/nvidia/moflow-pytorch/config/environment_variables.sh create mode 100644 training/nvidia/moflow-pytorch/config/requirements.txt create mode 100644 training/nvidia/moflow-pytorch/extern/trainer_adapter.py diff --git a/training/benchmarks/moflow/pytorch/README.md b/training/benchmarks/moflow/pytorch/README.md new file mode 100644 index 000000000..ad7ee15bc --- /dev/null +++ b/training/benchmarks/moflow/pytorch/README.md @@ -0,0 +1,55 @@ + +## Model Introduction +MoFlow is a model for molecule generation that leverages Normalizing Flows. Normalizing Flows are a class of generative neural networks that directly model the probability density of the data. They consist of a sequence of invertible transformations that convert input data following some hard-to-model distribution into a latent code that follows a normal distribution, which can then be easily used for sampling. + +MoFlow was first introduced by Chengxi Zang et al. in the paper ["MoFlow: An Invertible Flow Model for Generating Molecular Graphs"](https://arxiv.org/pdf/2006.10137.pdf). + + + +## Model source code +This repository includes software from [MoFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow), +licensed under the Apache License, Version 2.0. + +Some of the files in this directory were modified by BAAI in 2024 to support FlagPerf. + +## Dataset +### getting the data +The [original source code repository](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow#getting-the-data) contains a prepare_datasets.sh script that automatically downloads and processes the dataset. By default, the data is downloaded to the /data/ directory in the container. +```bash +bash prepare_datasets.sh +``` +### preprocess the dataset +Start a container built from the [Dockerfile](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/DrugDiscovery/MoFlow/Dockerfile), enter it, and execute the following script to preprocess the dataset.
+```bash +python3 scripts/data_preprocess.py +``` + +### dataset structures +Preview the directory structure of the data directory: +```bash +tree . +``` + +``` +. +├── valid_idx_zinc250k.json +├── zinc250k.csv +└── zinc250k_relgcn_kekulized_ggnp.npz +``` + +| FileName | Size(Bytes) | MD5 | +| ---------------------------------- | ----------- | -------------------------------- | +| valid_idx_zinc250k.json | 187832 | f8045b49a413c31136a0645d30c0b846 | +| zinc250k.csv | 23736231 | cd330eafb7a2cc413b3c9cafaf3efece | +| zinc250k_relgcn_kekulized_ggnp.npz | 375680462 | c91985e309a9f76457169859dbe1e662 | + + +## Checkpoint +- None + +## AI Frameworks && Accelerators supports + +| | Pytorch | Paddle | TensorFlow2 | +| ---------- | ------------------------------------------ | ------ | ----------- | +| Nvidia GPU | [✅](../../nvidia/moflow-pytorch/README.md) | N/A | N/A | diff --git a/training/benchmarks/moflow/pytorch/__init__.py b/training/benchmarks/moflow/pytorch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/moflow/pytorch/config/__init__.py b/training/benchmarks/moflow/pytorch/config/__init__.py new file mode 100755 index 000000000..96e0aae70 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/config/__init__.py @@ -0,0 +1,2 @@ +from ._base import * +from .mutable_params import mutable_params diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py new file mode 100755 index 000000000..b1ff28e92 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/config/_base.py @@ -0,0 +1,105 @@ +# DO NOT MODIFY THESE REQUIRED PARAMETERS + +# Required parameters +vendor: str = None +data_dir: str = None +name: str = "moflow" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True + +# Optional parameters + +# ========================================================= +# data +# ========================================================= +# The config to choose. This parameter allows one to switch between different datasets +# and their dedicated configurations of the neural network. By default, a pre-defined "zinc250k" config is used. +config_name: str = "zinc250k" +# Number of workers in the data loader. +num_workers: int = 4 + +# ========================================================= +# optimizer +# ========================================================= +# Base learning rate. +lr: float = 0.0005 +# beta1 parameter for the optimizer. +beta1: float = 0.9 +# beta2 parameter for the optimizer. +beta2: float = 0.99 +# Gradient clipping norm. +clip: float = 1.0 +# ========================================================= +# train && evaluate +# ========================================================= +# Batch size per GPU for training. +train_batch_size: int = 512 +eval_batch_size: int = 100 + +target_nuv: float = 85.0 + +# Frequency for saving checkpoints, expressed in epochs. If -1 is provided, checkpoints will not be saved. +save_epochs: int = 5 +# Evaluation frequency, expressed in epochs. If -1 is provided, an evaluation will not be performed. +eval_epochs: int = 5 + +# Number of warmup steps. This value is used for benchmarking and for CUDA graph capture. +warmup_steps: int = 20 +# Number of steps used for training/inference. This parameter allows finishing +# training earlier than the specified number of epochs. +# If used with inference, it allows generating more molecules (by default only a single batch of molecules is generated). +steps: int = -1 +# Temperature used for sampling.
+temperature: float = 0.3 +first_epoch: int = 0 +epochs: int = 300 + +allow_untrained = False + +do_train = True +fp16 = False +amp: bool = True +distributed: bool = True + +# Directory where checkpoints are stored +results_dir: str = "moflow_results" +# Path to store generated molecules. If an empty string is provided, predictions will not be saved (useful for benchmarking and debugging). +# predictions_path: str = "moflow_results/predictions.smi" +# ========================================================= +# experiment +# ========================================================= +# Compile the model with `torch.jit.script`. Can be used to speed up training or inference. +jit: bool = False +# Capture GPU kernels with CUDA graphs. This option can speed up training. +cuda_graph: bool = True +# Verbosity level. Specify the following values: 0, 1, 2, 3, where 0 means minimal verbosity (errors only) and 3 means maximal (debugging). +verbosity: int = 1 +# Path for DLLogger log. This file will contain information about the speed and accuracy of the model during training and inference. +# Note that if the file already exists, new logs will be added at the end. +log_path: str = "moflow_results/moflow.json" +# Frequency for writing logs, expressed in steps. +log_interval: int = 20 + + +# Apply validity correction after the generation of the molecules. +correct_validity: bool = False +# ========================================================= +# utils +# ========================================================= +# Random seed used to initialize the distributed loaders +seed: int = 1 +dist_backend: str = 'nccl' + +device: str = None + +# ========================================================= +# for driver +# ========================================================= +# rank of the GPU, used to launch distributed training. +local_rank: int = -1 +use_env: bool = True +log_freq: int = 500 +print_freq: int = 500 +n_device: int = 1 +sync_bn: bool = False +gradient_accumulation_steps: int = 1 diff --git a/training/benchmarks/moflow/pytorch/config/mutable_params.py b/training/benchmarks/moflow/pytorch/config/mutable_params.py new file mode 100755 index 000000000..7dcf8652a --- /dev/null +++ b/training/benchmarks/moflow/pytorch/config/mutable_params.py @@ -0,0 +1,5 @@ +mutable_params = [ + 'vendor', 'data_dir', 'lr', 'train_batch_size', 'eval_batch_size', + 'do_train', 'amp', 'fp16', 'distributed', 'dist_backend', 'num_workers', + 'device', 'cudnn_benchmark', 'cudnn_deterministic' +] diff --git a/training/benchmarks/moflow/pytorch/data/__init__.py b/training/benchmarks/moflow/pytorch/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/moflow/pytorch/data/data_frame_parser.py b/training/benchmarks/moflow/pytorch/data/data_frame_parser.py new file mode 100644 index 000000000..ba76fc439 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/data/data_frame_parser.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +from logging import getLogger +import traceback +from typing import List + +import numpy as np +import pandas as pd +from rdkit import Chem +from tqdm import tqdm + +from moflow.data.encoding import MolEncoder, EncodingError +from moflow.data.data_loader import NumpyTupleDataset + + +class DataFrameParser: + """ + DataFrameParser parses a pandas DataFrame containing SMILES strings and, optionally, some additional features. + + Args: + encoder (MolEncoder): encoder instance + labels (list): names of the label columns to load + smiles_col (str): name of the SMILES column + """ + + def __init__(self, encoder: MolEncoder, + labels: List[str], + smiles_col: str = 'smiles'): + super(DataFrameParser, self).__init__() + self.labels = labels + self.smiles_col = smiles_col + self.logger = getLogger(__name__) + self.encoder = encoder + + def parse(self, df: pd.DataFrame) -> NumpyTupleDataset: + """Parse a DataFrame using `encoder` and prepare a dataset instance + + Labels are extracted from the `labels` columns and input features are + extracted from the SMILES strings in the `smiles_col` column. + """ + all_nodes = [] + all_edges = [] + + total_count = df.shape[0] + fail_count = 0 + success_count = 0 + for smiles in tqdm(df[self.smiles_col], total=df.shape[0]): + try: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + fail_count += 1 + continue + # Note that a SMILES expression is not unique; + # we obtain the canonical SMILES here. + nodes, edges = self.encoder.encode_mol(mol) + + except EncodingError: + fail_count += 1 + continue + except Exception as e: + self.logger.warning('parse(), type: {}, {}' + .format(type(e).__name__, e.args)) + self.logger.info(traceback.format_exc()) + fail_count += 1 + continue + all_nodes.append(nodes) + all_edges.append(edges) + success_count += 1 + + result = [np.array(all_nodes), np.array(all_edges), *(df[label_col].values for label_col in self.labels)] + self.logger.info('Preprocess finished.
FAIL {}, SUCCESS {}, TOTAL {}' + .format(fail_count, success_count, total_count)) + + dataset = NumpyTupleDataset(result) + return dataset diff --git a/training/benchmarks/moflow/pytorch/data/data_loader.py b/training/benchmarks/moflow/pytorch/data/data_loader.py new file mode 100644 index 000000000..28f9378ca --- /dev/null +++ b/training/benchmarks/moflow/pytorch/data/data_loader.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +import os +import logging +from typing import Any, Callable, Iterable, Optional, Tuple + +import numpy as np +from torch.utils.data import Dataset + + +class NumpyTupleDataset(Dataset): + """Dataset of a tuple of datasets. + + It combines multiple datasets into one dataset. Each example is represented + by a tuple whose ``i``-th item corresponds to the ``i``-th dataset, + and each ``i``-th dataset is expected to be an instance of numpy.ndarray. + + Args: + datasets: Underlying datasets. The ``i``-th one is used for the + ``i``-th item of each example. All datasets must have the same + length.
+ transform: An optional function applied to an item before it is returned. + """ + + def __init__(self, datasets: Iterable[np.ndarray], transform: Optional[Callable] = None) -> None: + if not datasets: + raise ValueError('no datasets are given') + length = len(datasets[0]) + for i, dataset in enumerate(datasets): + if len(dataset) != length: + raise ValueError( + 'dataset of the index {} has a wrong length'.format(i)) + self._datasets = datasets + self._length = length + self.transform = transform + + def __len__(self) -> int: + return self._length + + def __getitem__(self, index: int) -> Tuple[Any]: + item = [dataset[index] for dataset in self._datasets] + + if self.transform: + item = self.transform(item) + return item + + def get_datasets(self) -> Tuple[np.ndarray]: + return self._datasets + + + def save(self, filepath: str) -> None: + """Save the dataset to `filepath` in npz format + + Args: + filepath (str): filepath to save dataset. It is recommended to end + with '.npz' extension. + """ + np.savez(filepath, *self._datasets) + logging.info('Save {} done.'.format(filepath)) + + @classmethod + def load(cls, filepath: str, transform: Optional[Callable] = None): + logging.info('Loading file {}'.format(filepath)) + if not os.path.exists(filepath): + raise ValueError('Invalid filepath {} for dataset'.format(filepath)) + load_data = np.load(filepath) + result = [] + i = 0 + while True: + key = 'arr_{}'.format(i) + if key in load_data.keys(): + result.append(load_data[key]) + i += 1 + else: + break + return cls(result, transform) diff --git a/training/benchmarks/moflow/pytorch/data/encoding.py b/training/benchmarks/moflow/pytorch/data/encoding.py new file mode 100644 index 000000000..d3d71fde9 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/data/encoding.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE.
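As context for how the preprocessed `.npz` file is consumed, here is a minimal usage sketch for the `NumpyTupleDataset` container defined above in `data_loader.py`. The toy arrays and file name are hypothetical; the real benchmark loads `zinc250k_relgcn_kekulized_ggnp.npz`.

```python
import numpy as np

from data.data_loader import NumpyTupleDataset

# Two parallel arrays of equal length become one dataset whose items
# are (nodes, adj) pairs; save()/load() round-trip through np.savez.
nodes = np.zeros((100, 40), dtype=np.uint8)        # toy atom codes
adjs = np.zeros((100, 4, 40, 40), dtype=np.uint8)  # toy bond codes

dataset = NumpyTupleDataset([nodes, adjs])
assert len(dataset) == 100
nodes0, adj0 = dataset[0]

dataset.save('toy.npz')                       # stored as arr_0, arr_1, ...
reloaded = NumpyTupleDataset.load('toy.npz')  # rebuilt in the same order
```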
+ + +from typing import Tuple +import numpy as np +from rdkit import Chem + +from moflow.config import BOND_TO_CODE, DUMMY_CODE + + +class MolEncoder: + """Encodes atoms and the adjacency matrix. + + Args: + out_size (int): It specifies the size of the arrays returned by + `encode_mol`. + If the number of atoms in the molecule is less than this value, + the returned arrays are padded to this fixed size. + """ + + def __init__(self, out_size: int): + super(MolEncoder, self).__init__() + self.out_size = out_size + + def encode_mol(self, mol: Chem.Mol) -> Tuple[np.ndarray, np.ndarray]: + """Encode a molecule into input features + + Args: + mol (Mol): molecule to encode + + Returns: + Tuple of the atomic-number array and the edge-type adjacency matrix. + """ + mol = self._standardize_mol(mol) + self._check_num_atoms(mol) + atom_array = self.construct_atomic_number_array(mol) + adj_array = self.construct_discrete_edge_matrix(mol) + return atom_array, adj_array + + def _standardize_mol(self, mol: Chem.Mol) -> Chem.Mol: + canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=False, + canonical=True) + mol = Chem.MolFromSmiles(canonical_smiles) + Chem.Kekulize(mol) + return mol + + def _check_num_atoms(self, mol: Chem.Mol) -> None: + """Check number of atoms in `mol` does not exceed `out_size`""" + num_atoms = mol.GetNumAtoms() + if num_atoms > self.out_size: + raise EncodingError(f'Number of atoms in mol {num_atoms} exceeds num_max_atoms {self.out_size}') + + + def construct_atomic_number_array(self, mol: Chem.Mol) -> np.ndarray: + """Returns atomic numbers of the atoms constituting a molecule. + + Args: + mol (rdkit.Chem.Mol): Input molecule. + + Returns: + numpy.ndarray: an array consisting of atomic numbers + of atoms in the molecule. + """ + + atom_list = [a.GetAtomicNum() for a in mol.GetAtoms()] + n_atom = len(atom_list) + if self.out_size < n_atom: + raise EncodingError(f'out_size {self.out_size} is smaller than number of atoms in mol {n_atom}') + atom_array = np.full(self.out_size, DUMMY_CODE, dtype=np.uint8) + atom_array[:n_atom] = atom_list + return atom_array + + + def construct_discrete_edge_matrix(self, mol: Chem.Mol) -> np.ndarray: + """Returns the edge-type dependent adjacency matrix of the given molecule. + + Args: + mol (rdkit.Chem.Mol): Input molecule. + + Returns: + adj_array (numpy.ndarray): The adjacency matrix of the input molecule. + It is a symmetric 2-dimensional array with shape (out_size, out_size), + filled with integers representing bond types. If two atoms are not + connected, DUMMY_CODE is used instead. + """ + if mol is None: + raise EncodingError('mol is None') + n_atom = mol.GetNumAtoms() + + if self.out_size < n_atom: + raise EncodingError(f'out_size {self.out_size} is smaller than number of atoms in mol {n_atom}') + + adjs = np.full((self.out_size, self.out_size), DUMMY_CODE, dtype=np.uint8) + + for bond in mol.GetBonds(): + bond_type = bond.GetBondType() + # we need to use code here - bond types are rdkit objects + code = BOND_TO_CODE[bond_type] + i = bond.GetBeginAtomIdx() + j = bond.GetEndAtomIdx() + adjs[[i, j], [j, i]] = code + return adjs + + +class EncodingError(Exception): + pass diff --git a/training/benchmarks/moflow/pytorch/data/transform.py b/training/benchmarks/moflow/pytorch/data/transform.py new file mode 100644 index 000000000..da068e9b0 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/data/transform.py @@ -0,0 +1,85 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +import json +import logging +import numpy as np +import os +from typing import Dict, Tuple + +from misc.config import CODE_TO_BOND, DUMMY_CODE, Config + + +def _onehot(data: np.ndarray, codes_dict: Dict[int, int], dtype=np.float32) -> np.ndarray: + shape = [len(codes_dict), *data.shape] + encoded = np.zeros(shape, dtype=dtype) + for obj_key, code in codes_dict.items(): + encoded[code, data == obj_key] = 1 + return encoded + + +def encode_nodes(atomic_nums: np.ndarray, config: Config) -> np.ndarray: + padded_data = np.full(config.max_num_nodes, DUMMY_CODE, dtype=np.uint8) + padded_data[:len(atomic_nums)] = atomic_nums + encoded = _onehot(padded_data, config.dataset_config.atomic_to_code).T + return encoded + + +def encode_edges(adj: np.ndarray, config: Config) -> np.ndarray: + padded_data = np.full((config.max_num_nodes, config.max_num_nodes), DUMMY_CODE, dtype=np.uint8) + n, m = adj.shape + assert n == m, 'adjacency matrix should be square' + padded_data[:n, :n] = adj + # we already store codes in the file - bond types are rdkit objects + encoded = _onehot(padded_data, {k:k for k in CODE_TO_BOND}) + return encoded + + +def transform_fn(data: Tuple[np.ndarray], config: Config) -> Tuple[np.ndarray]: + node, adj, *labels = data + node = encode_nodes(node, config) + adj = encode_edges(adj, config) + return (node, adj, *labels) + + +def get_val_ids(config: Config, data_dir: str): + file_path = os.path.join(data_dir, config.dataset_config.valid_idx_file) + logging.info('loading train/valid split information from: {}'.format(file_path)) + with open(file_path) as json_data: + data = json.load(json_data) + + val_ids = [int(idx)-1 for idx in data] + return val_ids diff --git a/training/benchmarks/moflow/pytorch/dataloaders/__init__.py b/training/benchmarks/moflow/pytorch/dataloaders/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py b/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py new file mode 100644
index 000000000..a9591b20f --- /dev/null +++ b/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py @@ -0,0 +1,68 @@ +import os +import functools + +import torch +from torch.utils.data.distributed import DistributedSampler + +from data.data_loader import NumpyTupleDataset +from data import transform +from misc.config import CONFIGS + + +def build_datasets(args): + + # Model configuration + assert args.config_name in CONFIGS + config = CONFIGS[args.config_name] + data_file = config.dataset_config.dataset_file + transform_fn = functools.partial(transform.transform_fn, config=config) + valid_idx = transform.get_val_ids(config, args.data_dir) + + # Datasets: + data_file_path = os.path.join(args.data_dir, data_file) + print(f"data_file_path: {data_file_path}") + + dataset = NumpyTupleDataset.load( + os.path.join(args.data_dir, data_file), + transform=transform_fn, + ) + if len(valid_idx) == 0: + raise ValueError('Empty validation set!') + else: + print(f"valid_idx size: {len(valid_idx)}") + + train_idx = [t for t in range(len(dataset)) if t not in valid_idx] + train_dataset = torch.utils.data.Subset(dataset, train_idx) + test_dataset = torch.utils.data.Subset(dataset, valid_idx) + return train_dataset, test_dataset + + + +def _get_sampler(args, train_dataset): + if args.distributed: + sampler = DistributedSampler(train_dataset, + seed=args.seed, + drop_last=False) + else: + sampler = None + return sampler + + +def build_train_dataloader(args, train_dataset): + sampler = _get_sampler(args, train_dataset) + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=sampler is None, + sampler=sampler, + num_workers=args.num_workers, + drop_last=True, + ) + + if args.distributed: + train_dataloader.sampler.set_epoch(-1) + return train_dataloader + + +def build_eval_dataloader(args, eval_dataset): + return None diff --git a/training/benchmarks/moflow/pytorch/misc/config.py b/training/benchmarks/moflow/pytorch/misc/config.py new file mode 100644 index 000000000..8bf4d07c4 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/misc/config.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
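Before the dataset configuration below, a sketch of how the loader helpers above fit together. The `SimpleNamespace` stands in for the parsed benchmark config; only the fields actually read by `build_datasets` and `build_train_dataloader` are filled in, and the values shown are assumptions.

```python
from types import SimpleNamespace

from dataloaders.dataloader import build_datasets, build_train_dataloader

# Hypothetical stand-in for the benchmark's parsed config.
args = SimpleNamespace(
    config_name='zinc250k',   # must be a key of misc.config.CONFIGS
    data_dir='/data',         # directory holding the preprocessed .npz
    train_batch_size=512,
    num_workers=4,
    distributed=False,
    seed=1,
)

train_dataset, test_dataset = build_datasets(args)
train_loader = build_train_dataloader(args, train_dataset)
nodes, adj, *labels = next(iter(train_loader))  # one one-hot encoded batch
```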
+ + +from dataclasses import asdict, dataclass, field +import json +from typing import Dict, List, Optional + +from rdkit import Chem + + +_VALID_IDX_FILE = 'valid_idx_{}.json' +_CSV_FILE = '{}.csv' +_DATASET_FILE = '{}_relgcn_kekulized_ggnp.npz' + +DUMMY_CODE = 0 +CODE_TO_BOND = dict(enumerate([ + 'DUMMY', + Chem.rdchem.BondType.SINGLE, + Chem.rdchem.BondType.DOUBLE, + Chem.rdchem.BondType.TRIPLE, +])) +BOND_TO_CODE = {v: k for k, v in CODE_TO_BOND.items()} +ATOM_VALENCY = {6: 4, 7: 3, 8: 2, 9: 1, 15: 3, 16: 2, 17: 1, 35: 1, 53: 1} + + +@dataclass +class DatasetConfig: + dataset_name: str + atomic_num_list: List[int] + max_num_atoms: int + labels: List[str] + smiles_col: str + code_to_atomic: Dict[int, int] = field(init=False) + atomic_to_code: Dict[int, int] = field(init=False) + valid_idx_file: str = field(init=False) + csv_file: str = field(init=False) + dataset_file: str = field(init=False) + + def __post_init__(self): + self.valid_idx_file = _VALID_IDX_FILE.format(self.dataset_name) + self.csv_file = _CSV_FILE.format(self.dataset_name) + self.dataset_file = _DATASET_FILE.format(self.dataset_name) + + self.code_to_atomic = dict(enumerate(sorted([DUMMY_CODE] + self.atomic_num_list))) + self.atomic_to_code = {v: k for k, v in self.code_to_atomic.items()} + + +@dataclass +class AtomFlowConfig: + n_flow: int + hidden_gnn: List[int] + hidden_lin: List[int] + n_block: int = 1 + mask_row_size_list: List[int] = field(default_factory=lambda: [1]) + mask_row_stride_list: List[int] = field(default_factory=lambda: [1]) + +@dataclass +class BondFlowConfig: + hidden_ch: List[int] + conv_lu: int + n_squeeze: int + n_block: int = 1 + n_flow: int = 10 + + +@dataclass +class ModelConfig: + atom_config: AtomFlowConfig + bond_config: BondFlowConfig + noise_scale: float = 0.6 + learn_dist: bool = True + +@dataclass +class Config: + dataset_config: DatasetConfig + model_config: ModelConfig + max_num_nodes: Optional[int] = None + num_node_features: Optional[int] = None + num_edge_features: int = len(CODE_TO_BOND) + z_dim: int = field(init=False) + + def __post_init__(self): + if self.max_num_nodes is None: + self.max_num_nodes = self.dataset_config.max_num_atoms + if self.num_node_features is None: + self.num_node_features = len(self.dataset_config.code_to_atomic) + bonds_dim = self.max_num_nodes * self.max_num_nodes * self.num_edge_features + atoms_dim = self.max_num_nodes * self.num_node_features + self.z_dim = bonds_dim + atoms_dim + + + def save(self, path): + self.path = path + with open(path, 'w') as f: + json.dump(asdict(self), f, indent=4, sort_keys=True) + + @classmethod + def load(cls, path): + with open(path, 'r') as f: + data = json.load(f) + return cls(**data) + + def __repr__(self) -> str: + return json.dumps(asdict(self), indent=4, separators=(',', ': ')) + + +ZINC250K_CONFIG = Config( + max_num_nodes=40, + dataset_config=DatasetConfig( + dataset_name='zinc250k', + atomic_num_list=[6, 7, 8, 9, 15, 16, 17, 35, 53], + max_num_atoms=38, + labels=['logP', 'qed', 'SAS'], + smiles_col='smiles', + ), + model_config=ModelConfig( + AtomFlowConfig( + n_flow=38, + hidden_gnn=[256], + hidden_lin=[512, 64], + ), + BondFlowConfig( + n_squeeze=20, + hidden_ch=[512, 512], + conv_lu=2 + ), + ) +) + +CONFIGS = {'zinc250k': ZINC250K_CONFIG} diff --git a/training/benchmarks/moflow/pytorch/misc/utils.py b/training/benchmarks/moflow/pytorch/misc/utils.py new file mode 100644 index 000000000..465b2f6ab --- /dev/null +++ b/training/benchmarks/moflow/pytorch/misc/utils.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022, 
NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +import re +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +from rdkit import Chem +import torch + +from misc.config import Config, ATOM_VALENCY, CODE_TO_BOND, DUMMY_CODE + + +def postprocess_predictions(x: Union[torch.Tensor, np.ndarray], adj: Union[torch.Tensor, np.ndarray], config: Config) -> Tuple[np.ndarray, np.ndarray]: + assert x.ndim == 3 and adj.ndim == 4, 'expected batched predictions' + n = config.dataset_config.max_num_atoms + adj = adj[:, :, :n, :n] + x = x[:, :n] + + atoms = torch.argmax(x, dim=2) + atoms = _to_numpy_array(atoms) + + adj = torch.argmax(adj, dim=1) + adj = _to_numpy_array(adj) + + decoded = np.zeros_like(atoms) + for code, atomic_num in config.dataset_config.code_to_atomic.items(): + decoded[atoms == code] = atomic_num + + return decoded, adj + + +def convert_predictions_to_mols(adj: np.ndarray, x: np.ndarray, correct_validity: bool = False) -> List[Chem.Mol]: + molecules = [construct_mol(x_elem, adj_elem) for x_elem, adj_elem in zip(x, adj)] + + if correct_validity: + molecules = [correct_mol(mol) for mol in molecules] + return molecules + + +def construct_mol(atoms: np.ndarray, adj: np.ndarray) -> Chem.Mol: + from rdkit import RDLogger + RDLogger.DisableLog('rdApp.*') + atoms_exist = (atoms != 0) + atoms = atoms[atoms_exist] + adj = adj[atoms_exist][:, atoms_exist] + + mol = Chem.RWMol() + + for atom in atoms: + mol.AddAtom(Chem.Atom(int(atom))) + + for start, end in zip(*np.where(adj != DUMMY_CODE)): + if start > end: + mol.AddBond(int(start), int(end), CODE_TO_BOND[int(adj[start, end])]) + # add a formal charge to the atom, e.g. [O+], [N+], [S+]; + # [O-], [N-], [S-], [NH+] etc. are not supported.
+ flag, atomid_valence = check_valency(mol) + if flag: + continue + else: + assert len(atomid_valence) == 2 + idx = atomid_valence[0] + v = atomid_valence[1] + an = mol.GetAtomWithIdx(idx).GetAtomicNum() + if an in (7, 8, 16) and (v - ATOM_VALENCY[an]) == 1: + mol.GetAtomWithIdx(idx).SetFormalCharge(1) + return mol + + +def valid_mol(x: Optional[Chem.Mol]) -> Optional[Chem.Mol]: + if x is None: + # RDKit wasn't able to create the mol + return None + smi = Chem.MolToSmiles(x, isomericSmiles=True) + if len(smi) == 0 or '.' in smi: + # Mol is empty or fragmented + return None + reloaded = Chem.MolFromSmiles(smi) + # if smiles is invalid - it will be None, otherwise mol is valid + return reloaded + + +def check_valency(mol: Chem.Mol) -> Tuple[bool, List[int]]: + """Checks that no atoms in the mol have exceeded their possible + valency. Returns True if no valency issues, False otherwise + plus information about problematic atom. + """ + try: + Chem.SanitizeMol(mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_PROPERTIES) + return True, None + except ValueError as e: + e = str(e) + p = e.find('#') + e_sub = e[p:] + atomid_valence = list(map(int, re.findall(r'\d+', e_sub))) + return False, atomid_valence + + +def correct_mol(mol: Chem.Mol) -> Chem.Mol: + flag, atomid_valence = check_valency(mol) + while not flag: + assert len(atomid_valence) == 2 + idx = atomid_valence[0] + v = atomid_valence[1] + queue = [] + for b in mol.GetAtomWithIdx(idx).GetBonds(): + queue.append( + (b.GetIdx(), int(b.GetBondType()), b.GetBeginAtomIdx(), b.GetEndAtomIdx()) + ) + queue.sort(key=lambda tup: tup[1], reverse=True) + if len(queue) > 0: + start = queue[0][2] + end = queue[0][3] + t = queue[0][1] - 1 + mol.RemoveBond(start, end) + if t >= 1: + mol.AddBond(start, end, CODE_TO_BOND[t]) + flag, atomid_valence = check_valency(mol) + + # if mol is fragmented, select the largest fragment + mols = Chem.GetMolFrags(mol, asMols=True) + mol = max(mols, key=lambda m: m.GetNumAtoms()) + + return mol + + +def predictions_to_smiles(adj: torch.Tensor, x: torch.Tensor, config: Config) -> List[str]: + x, adj = postprocess_predictions(x, adj, config=config) + valid = [Chem.MolToSmiles(construct_mol(x_elem, adj_elem), isomericSmiles=True) + for x_elem, adj_elem in zip(x, adj)] + return valid + + +def check_validity(molecules: List[Chem.Mol]) -> dict: + valid = [valid_mol(mol) for mol in molecules] + valid = [mol for mol in valid if mol is not None] + + n_mols = len(molecules) + valid_ratio = len(valid) / n_mols + valid_smiles = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in valid] + unique_smiles = list(set(valid_smiles)) + unique_ratio = 0. + if len(valid) > 0: + unique_ratio = len(unique_smiles) / len(valid) + valid_mols = [Chem.MolFromSmiles(s) for s in valid_smiles] + abs_unique_ratio = len(unique_smiles) / n_mols + + results = dict() + results['valid_mols'] = valid_mols + results['valid_smiles'] = valid_smiles + results['valid_ratio'] = valid_ratio * 100 + results['unique_ratio'] = unique_ratio * 100 + results['abs_unique_ratio'] = abs_unique_ratio * 100 + + return results + + +def check_novelty(gen_smiles: List[str], train_smiles: List[str], n_generated_mols: int): + if len(gen_smiles) == 0: + novel_ratio = 0. + abs_novel_ratio = 0. + else: + duplicates = [1 for mol in gen_smiles if mol in train_smiles] + novel = len(gen_smiles) - sum(duplicates) + novel_ratio = novel * 100. / len(gen_smiles) + abs_novel_ratio = novel * 100. 
/ n_generated_mols + return novel_ratio, abs_novel_ratio + + +def _to_numpy_array(a): + if isinstance(a, torch.Tensor): + a = a.cpu().detach().numpy() + elif isinstance(a, np.ndarray): + pass + else: + raise TypeError("a ({}) is not a torch.Tensor".format(type(a))) + return a diff --git a/training/benchmarks/moflow/pytorch/model/__init__.py b/training/benchmarks/moflow/pytorch/model/__init__.py new file mode 100644 index 000000000..5d61ffbda --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/__init__.py @@ -0,0 +1,6 @@ +from model.model import MoFlow + + +def create_model(config): + model = MoFlow(config) + return model diff --git a/training/benchmarks/moflow/pytorch/model/basic.py b/training/benchmarks/moflow/pytorch/model/basic.py new file mode 100644 index 000000000..d52c3680b --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/basic.py @@ -0,0 +1,194 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
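The evaluation metrics above compose as follows; this is an illustrative sketch with toy SMILES (not benchmark output), assuming the helpers are importable as `misc.utils`.

```python
from rdkit import Chem

from misc.utils import check_novelty, check_validity

# Toy generated set: two valid molecules plus one failed decode (None).
mols = [Chem.MolFromSmiles('CCO'), Chem.MolFromSmiles('c1ccccc1'), None]

stats = check_validity(mols)   # valid_ratio, unique_ratio, abs_unique_ratio
novel_ratio, abs_novel_ratio = check_novelty(
    stats['valid_smiles'],     # canonical SMILES of the valid molecules
    train_smiles=['CCO'],      # pretend training set: 'CCO' is not novel
    n_generated_mols=len(mols),
)
```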
+ + +import math +from typing import Tuple +import numpy as np +from scipy import linalg as la +import torch +from torch import nn +from torch.nn import functional as F + +from runtime.distributed_utils import get_world_size, reduce_tensor + + +class ActNorm(nn.Module): + def __init__(self, num_channels, num_dims, channels_dim=1): + super().__init__() + self.num_channels = num_channels + self.num_dims = num_dims + self.channels_dim = channels_dim + self.shape = [1] * num_dims + self.shape[channels_dim] = num_channels + self.loc = nn.Parameter(torch.zeros(*self.shape)) + self.scale = nn.Parameter(torch.ones(*self.shape)) + + self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8)) + self.register_buffer('num_elements', torch.tensor(0, dtype=torch.uint8)) + + @torch.jit.ignore + def initialize(self, input): + if self.initialized.item() == 1: + return + + dims = list(input.shape[1:]) + del dims[self.channels_dim -1] + + num_elems = math.prod(dims) + permutation = [self.channels_dim] + [i for i in range(self.num_dims) if i != self.channels_dim] + with torch.no_grad(): + + flatten = input.permute(*permutation).contiguous().view(self.num_channels, -1) + mean = flatten.mean(1).view(self.shape) + std = flatten.std(1).view(self.shape) + + num_gpus = get_world_size() + mean = reduce_tensor(mean, num_gpus) + std = reduce_tensor(std, num_gpus) + self.loc.data.copy_(-mean) + self.scale.data.copy_(1 / (std + 1e-6)) + self.initialized.fill_(1) + self.num_elements.fill_(num_elems) + + def forward(self, input): + log_abs = torch.log(torch.abs(self.scale)) + logdet = self.num_elements * torch.sum(log_abs) + return self.scale * (input + self.loc), logdet + + @torch.jit.export + def reverse(self, output): + return output / self.scale - self.loc + + +class InvConv2d(nn.Module): + def __init__(self, in_channel): + super().__init__() + + weight = torch.randn(in_channel, in_channel) + q, _ = torch.qr(weight) + weight = q.unsqueeze(2).unsqueeze(3) + self.weight = nn.Parameter(weight) + + def forward(self, input): + _, _, height, width = input.shape + + out = F.conv2d(input, self.weight) + logdet = ( + height * width * torch.slogdet(self.weight.squeeze().double())[1].float() + ) + + return out, logdet + + def reverse(self, output): + return F.conv2d( + output, self.weight.squeeze().inverse().unsqueeze(2).unsqueeze(3) + ) + + +class InvConv2dLU(nn.Module): + def __init__(self, in_channel): + super().__init__() + + weight = np.random.randn(in_channel, in_channel) + q, _ = la.qr(weight) + w_p, w_l, w_u = la.lu(q.astype(np.float32)) + w_s = np.diag(w_u) + w_u = np.triu(w_u, 1) + u_mask = np.triu(np.ones_like(w_u), 1) + l_mask = u_mask.T + + w_p = torch.from_numpy(w_p) + w_l = torch.from_numpy(w_l).contiguous() + w_s = torch.from_numpy(w_s) + w_u = torch.from_numpy(w_u) + + self.register_buffer('w_p', w_p) + self.register_buffer('u_mask', torch.from_numpy(u_mask)) + self.register_buffer('l_mask', torch.from_numpy(l_mask)) + self.register_buffer('s_sign', torch.sign(w_s)) + self.register_buffer('l_eye', torch.eye(l_mask.shape[0])) + self.w_l = nn.Parameter(w_l) + self.w_s = nn.Parameter(torch.log(torch.abs(w_s))) + self.w_u = nn.Parameter(w_u) + + def forward(self, input): + _, _, height, width = input.shape + + weight = self.calc_weight() + + out = F.conv2d(input, weight) + logdet = height * width * torch.sum(self.w_s) + + return out, logdet + + def calc_weight(self): + weight = ( + self.w_p + @ (self.w_l * self.l_mask + self.l_eye) + @ ((self.w_u * self.u_mask) + torch.diag(self.s_sign * 
torch.exp(self.w_s))) + ) + + return weight.unsqueeze(2).unsqueeze(3) + + def reverse(self, output): + weight = self.calc_weight() + dtype = weight.dtype + weight = weight.float() + weight_inv = weight.squeeze().inverse().unsqueeze(2).unsqueeze(3) + weight_inv = weight_inv.to(dtype=dtype) + + return F.conv2d(output, weight_inv) + + +class GraphConv(nn.Module): + def __init__(self, in_channels, out_channels, num_atoms, num_edge_type=4): + super(GraphConv, self).__init__() + + self.graph_linear_self = nn.Linear(in_channels, out_channels) + self.graph_linear_edge = nn.Linear(in_channels, out_channels * num_edge_type) + self.num_edge_type = num_edge_type + self.in_ch = in_channels + self.out_ch = out_channels + self.num_atoms = num_atoms + + def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + adj, nodes = graph + hs = self.graph_linear_self(nodes) + m = self.graph_linear_edge(nodes) + m = m.view(-1, self.num_atoms, self.out_ch, self.num_edge_type) + hr = torch.einsum('bemn,bnce->bmc', adj, m) + hr = hr.unsqueeze(2) + return hs + hr diff --git a/training/benchmarks/moflow/pytorch/model/coupling.py b/training/benchmarks/moflow/pytorch/model/coupling.py new file mode 100644 index 000000000..a97c9b55a --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/coupling.py @@ -0,0 +1,196 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
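A quick sanity sketch for the invertible 1x1 convolution above: forward followed by reverse should reproduce the input, and the log-determinant reduces to height * width * sum(w_s). The shapes are arbitrary assumptions.

```python
import torch

from model.basic import InvConv2dLU

conv = InvConv2dLU(in_channel=8)
x = torch.randn(2, 8, 4, 4)

y, logdet = conv(x)      # logdet: scalar tensor, H * W * sum(w_s)
x_rec = conv.reverse(y)  # applies the explicit matrix inverse

assert torch.allclose(x, x_rec, atol=1e-4)
assert logdet.dim() == 0
```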
+ + +from typing import Tuple +import torch +import torch.nn as nn +from torch.nn.functional import logsigmoid + +from model.basic import GraphConv + + +def sigmoid_inverse(x): + """Calculates 1/sigmoid(x) in a more numerically stable way""" + return 1 + torch.exp(-x) + + +class AffineCoupling(nn.Module): + def __init__(self, in_channel, hidden_channels, mask_swap=False): # hidden_channels, e.g. (512, 512), replaces the original filter_size=512 + super(AffineCoupling, self).__init__() + + self.mask_swap = mask_swap + last_h = in_channel // 2 + vh = tuple(hidden_channels) + layers = [] + for h in vh: + layers.append(nn.Conv2d(last_h, h, kernel_size=3, padding=1)) + layers.append(nn.BatchNorm2d(h)) + layers.append(nn.ReLU(inplace=True)) + last_h = h + layers.append(nn.Conv2d(last_h, in_channel, kernel_size=3, padding=1)) + self.layers = nn.Sequential(*layers) + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + in_a, in_b = input.chunk(2, 1) # (2,12,32,32) --> (2,6,32,32), (2,6,32,32) + + if self.mask_swap: + in_a, in_b = in_b, in_a + + s_logits, t = self._s_t_function(in_a) + s = torch.sigmoid(s_logits) + out_b = (in_b + t) * s + logdet = torch.sum(logsigmoid(s_logits).reshape(input.shape[0], -1), 1) + + if self.mask_swap: + result = torch.cat([out_b, in_a], 1) + else: + result = torch.cat([in_a, out_b], 1) + + return result, logdet + + @torch.jit.export + def reverse(self, output: torch.Tensor) -> torch.Tensor: + out_a, out_b = output.chunk(2, 1) + if self.mask_swap: + out_a, out_b = out_b, out_a + + s_logits, t = self._s_t_function(out_a) + s_inverse = sigmoid_inverse(s_logits) + in_b = out_b * s_inverse - t + + if self.mask_swap: + result = torch.cat([in_b, out_a], 1) + else: + result = torch.cat([out_a, in_b], 1) + + return result + + def _s_t_function(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + h = self.layers(x) + s_logits, t = h.chunk(2, 1) + return s_logits, t + + +class ConvCouplingBlock(nn.Module): + def __init__(self, in_dim: int, out_dim: int, n_node: int) -> None: + super().__init__() + self.graph_conv = GraphConv(in_dim, out_dim, n_node) + self.bn = nn.BatchNorm2d(n_node) + self.relu = nn.ReLU(inplace=True) + + def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + adj, nodes = graph + h = self.graph_conv(graph) + h = h.to(memory_format=torch.channels_last) + h = self.bn(h) + h = self.relu(h) + return adj, h + + +class LinCouplingBlock(nn.Module): + def __init__(self, in_dim: int, out_dim: int, n_node: int) -> None: + super().__init__() + self.lin = nn.Linear(in_dim, out_dim) + self.bn = nn.BatchNorm2d(n_node) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + h = self.lin(x) + h = h.to(memory_format=torch.channels_last) + h = self.bn(h) + h = self.relu(h) + return h + + +class GraphAffineCoupling(nn.Module): + def __init__(self, n_node, in_dim, hidden_dim_dict, masked_row): + super(GraphAffineCoupling, self).__init__() + self.n_node = n_node + self.in_dim = in_dim + self.hidden_dim_dict = hidden_dim_dict + self.masked_row = masked_row + + self.hidden_dim_gnn = hidden_dim_dict['gnn'] + self.hidden_dim_linear = hidden_dim_dict['linear'] + + conv_layers = [] + last_dim = in_dim + for out_dim in self.hidden_dim_gnn: + conv_layers.append(ConvCouplingBlock(last_dim, out_dim, n_node)) + last_dim = out_dim + self.net_conv = nn.ModuleList(conv_layers) + + lin_layers = [] + for out_dim in self.hidden_dim_linear: +
lin_layers.append(LinCouplingBlock(last_dim, out_dim, n_node)) + last_dim = out_dim + lin_layers.append(nn.Linear(last_dim, in_dim*2)) + self.net_lin = nn.Sequential(*lin_layers) + + mask = torch.ones(n_node, in_dim) + mask[masked_row, :] = 0 # rows in masked_row are kept unchanged and are fed to _s_t to update the remaining rows + self.register_buffer('mask', mask) + + def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + adj, input = graph + masked_x = self.mask * input + masked_x_sq = masked_x.unsqueeze(2) + s_logits, t = self._s_t_function((adj, masked_x_sq)) + s = torch.sigmoid(s_logits) + out = masked_x + (1-self.mask) * (input + t) * s + logdet = torch.sum(logsigmoid(s_logits).reshape(input.shape[0], -1), 1) + return out, logdet + + @torch.jit.export + def reverse(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + adj, output = graph + masked_y = self.mask * output + masked_y_sq = masked_y.unsqueeze(2) + s_logits, t = self._s_t_function((adj, masked_y_sq)) + s_inverse = sigmoid_inverse(s_logits) + input = masked_y + (1 - self.mask) * (output * s_inverse - t) + return input + + def _s_t_function(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + for l in self.net_conv: + graph = l(graph) + adj, h = graph + h = self.net_lin(h) + h = h.squeeze(2) + s_logits, t = h.chunk(2, dim=-1) + + return s_logits, t diff --git a/training/benchmarks/moflow/pytorch/model/glow.py b/training/benchmarks/moflow/pytorch/model/glow.py new file mode 100644 index 000000000..d5e944d6a --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/glow.py @@ -0,0 +1,270 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE.
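Before the Glow blocks below assemble these pieces, a round-trip sketch for the affine coupling above: forward shifts and scales one half of the channels conditioned on the other half, and reverse undoes it exactly (eval mode keeps BatchNorm deterministic). The sizes are illustrative.

```python
import torch

from model.coupling import AffineCoupling

coupling = AffineCoupling(in_channel=12, hidden_channels=(64, 64)).eval()
x = torch.randn(2, 12, 8, 8)

z, logdet = coupling(x)      # logdet = sum(logsigmoid(s_logits)) per sample
x_rec = coupling.reverse(z)  # uses sigmoid_inverse to undo the scaling

assert torch.allclose(x, x_rec, atol=1e-5)
assert logdet.shape == (2,)
```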
+ + +from typing import Tuple +import torch +import torch.nn as nn + +from model.basic import ActNorm, InvConv2dLU, InvConv2d +from model.coupling import AffineCoupling, GraphAffineCoupling + + +class Flow(nn.Module): + def __init__(self, in_channel, hidden_channels, conv_lu=2, mask_swap=False): + super(Flow, self).__init__() + + # More stable to support more flows + self.actnorm = ActNorm(num_channels=in_channel, num_dims=4) + + if conv_lu == 0: + self.invconv = InvConv2d(in_channel) + elif conv_lu == 1: + self.invconv = InvConv2dLU(in_channel) + elif conv_lu == 2: + self.invconv = None + else: + raise ValueError("conv_lu in {0,1,2}, 0:InvConv2d, 1:InvConv2dLU, 2:none-just swap to update in coupling") + + self.coupling = AffineCoupling(in_channel, hidden_channels, mask_swap=mask_swap) + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + out, logdet = self.actnorm(input) + if self.invconv is not None: + out, det1 = self.invconv(out) + else: + det1 = 0 + out, det2 = self.coupling(out) + + logdet = logdet + det1 + if det2 is not None: + logdet = logdet + det2 + + return out, logdet + + @torch.jit.export + def reverse(self, output: torch.Tensor) -> torch.Tensor: + input = self.coupling.reverse(output) + if self.invconv is not None: + input = self.invconv.reverse(input) + input = self.actnorm.reverse(input) + + return input + + +class FlowOnGraph(nn.Module): + def __init__(self, n_node, in_dim, hidden_dim_dict, masked_row): + super(FlowOnGraph, self).__init__() + self.n_node = n_node + self.in_dim = in_dim + self.hidden_dim_dict = hidden_dim_dict + self.masked_row = masked_row + self.actnorm = ActNorm(num_channels=n_node, num_dims=3) + self.coupling = GraphAffineCoupling(n_node, in_dim, hidden_dim_dict, masked_row) + + def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + adj, input = graph + out, logdet = self.actnorm(input) + det1 = 0 + out, det2 = self.coupling((adj, out)) + + logdet = logdet + det1 + if det2 is not None: + logdet = logdet + det2 + return out, logdet + + @torch.jit.export + def reverse(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + adj, output = graph + input = self.coupling.reverse((adj, output)) + input = self.actnorm.reverse(input) + return input + + +class Block(nn.Module): + def __init__(self, in_channel, n_flow, squeeze_fold, hidden_channels, conv_lu=2): + super(Block, self).__init__() + self.squeeze_fold = squeeze_fold + squeeze_dim = in_channel * self.squeeze_fold * self.squeeze_fold + + self.flows = nn.ModuleList() + for i in range(n_flow): + if conv_lu in (0, 1): + self.flows.append(Flow(squeeze_dim, hidden_channels, + conv_lu=conv_lu, mask_swap=False)) + else: + self.flows.append(Flow(squeeze_dim, hidden_channels, + conv_lu=2, mask_swap=bool(i % 2))) + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + out = self._squeeze(input) + logdet = 0 + + for flow in self.flows: + out, det = flow(out) + logdet = logdet + det + + out = self._unsqueeze(out) + return out, logdet + + @torch.jit.export + def reverse(self, output: torch.Tensor) -> torch.Tensor: + input = self._squeeze(output) + + for flow in self.flows[::-1]: + input = flow.reverse(input) + + unsqueezed = self._unsqueeze(input) + return unsqueezed + + def _squeeze(self, x: torch.Tensor) -> torch.Tensor: + """Trade spatial extent for channels. In forward direction, convert each + 1x4x4 volume of input into a 4x1x1 volume of output. 
+
+        Args:
+            x (torch.Tensor): Input to squeeze.
+
+        Returns:
+            out (torch.Tensor): Squeezed tensor with `squeeze_fold**2` times
+                more channels and correspondingly smaller spatial extent.
+        """
+        assert len(x.shape) == 4
+        b_size, n_channel, height, width = x.shape
+        fold = self.squeeze_fold
+
+        squeezed = x.view(b_size, n_channel, height // fold, fold, width // fold, fold)
+        squeezed = squeezed.permute(0, 1, 3, 5, 2, 4).contiguous()
+        out = squeezed.view(b_size, n_channel * fold * fold, height // fold, width // fold)
+        return out
+
+    def _unsqueeze(self, x: torch.Tensor) -> torch.Tensor:
+        """Inverse of _squeeze: trade channels back for spatial extent."""
+        assert len(x.shape) == 4
+        b_size, n_channel, height, width = x.shape
+        fold = self.squeeze_fold
+        unsqueezed = x.view(b_size, n_channel // (fold * fold), fold, fold, height, width)
+        unsqueezed = unsqueezed.permute(0, 1, 4, 2, 5, 3).contiguous()
+        out = unsqueezed.view(b_size, n_channel // (fold * fold), height * fold, width * fold)
+        return out
+
+
+class BlockOnGraph(nn.Module):
+    def __init__(self, n_node, in_dim, hidden_dim_dict, n_flow, mask_row_size=1, mask_row_stride=1):
+        super(BlockOnGraph, self).__init__()
+        assert 0 < mask_row_size < n_node
+        self.flows = nn.ModuleList()
+        for i in range(n_flow):
+            start = i * mask_row_stride
+            masked_row = [r % n_node for r in range(start, start + mask_row_size)]
+            self.flows.append(FlowOnGraph(n_node, in_dim, hidden_dim_dict, masked_row=masked_row))
+
+    def forward(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
+        adj, input = graph
+        out = input
+        logdet = 0
+        for flow in self.flows:
+            out, det = flow((adj, out))
+            logdet = logdet + det
+        return out, logdet
+
+    @torch.jit.export
+    def reverse(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+        adj, output = graph
+        input = output
+        for flow in self.flows[::-1]:
+            input = flow.reverse((adj, input))
+        return input
+
+
+class Glow(nn.Module):
+    def __init__(self, in_channel, n_flow, n_block, squeeze_fold, hidden_channel, conv_lu=2):
+        super(Glow, self).__init__()
+
+        self.blocks = nn.ModuleList()
+        n_channel = in_channel
+        for _ in range(n_block):
+            self.blocks.append(Block(n_channel, n_flow, squeeze_fold, hidden_channel, conv_lu=conv_lu))
+
+    def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        logdet = 0
+        out = input
+
+        for block in self.blocks:
+            out, det = block(out)
+            logdet = logdet + det
+
+        return out, logdet
+
+    @torch.jit.export
+    def reverse(self, z: torch.Tensor) -> torch.Tensor:
+        h = z
+        for block in self.blocks[::-1]:
+            h = block.reverse(h)
+
+        return h
+
+
+class GlowOnGraph(nn.Module):
+    def __init__(self, n_node, in_dim, hidden_dim_dict, n_flow, n_block,
+                 mask_row_size_list=(2,), mask_row_stride_list=(1,)):
+        super(GlowOnGraph, self).__init__()
+
+        assert len(mask_row_size_list) == n_block or len(mask_row_size_list) == 1
+        assert len(mask_row_stride_list) == n_block or len(mask_row_stride_list) == 1
+        if len(mask_row_size_list) == 1:
+            mask_row_size_list = mask_row_size_list * n_block
+        if len(mask_row_stride_list) == 1:
+            mask_row_stride_list = mask_row_stride_list * n_block
+        self.blocks = nn.ModuleList()
+        for i in range(n_block):
+            mask_row_size = mask_row_size_list[i]
+            mask_row_stride = mask_row_stride_list[i]
+            self.blocks.append(BlockOnGraph(n_node, in_dim, hidden_dim_dict, n_flow, mask_row_size, mask_row_stride))
+
+    def forward(self, adj: torch.Tensor, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        logdet = 0
+        out = x
+        for block in self.blocks:
+            out, det = block((adj, out))
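+            # each block contributes its log-determinant to the total
+            # log-likelihood of the flow
+            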
logdet = logdet + det + return out, logdet + + @torch.jit.export + def reverse(self, graph: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + adj, z = graph + input = z + for i, block in enumerate(self.blocks[::-1]): + input = block.reverse((adj, input)) + + return input diff --git a/training/benchmarks/moflow/pytorch/model/model.py b/training/benchmarks/moflow/pytorch/model/model.py new file mode 100644 index 000000000..8cf6e0078 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/model.py @@ -0,0 +1,251 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2020 Chengxi Zang +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +import math +import torch +import torch.nn as nn + +from misc.config import Config +from model.glow import Glow, GlowOnGraph + +def gaussian_nll(x, mean, ln_var): + """Computes the negative log-likelihood of a Gaussian distribution. + + Given two variable ``mean`` representing :math:`\\mu` and ``ln_var`` + representing :math:`\\log(\\sigma^2)`, this function computes in + elementwise manner the negative log-likelihood of :math:`x` on a + Gaussian distribution :math:`N(\\mu, S)`, + + .. math:: + + -\\log N(x; \\mu, \\sigma^2) = + \\log\\left(\\sqrt{(2\\pi)^D |S|}\\right) + + \\frac{1}{2}(x - \\mu)^\\top S^{-1}(x - \\mu), + + where :math:`D` is a dimension of :math:`x` and :math:`S` is a diagonal + matrix where :math:`S_{ii} = \\sigma_i^2`. + + Args: + x: Input variable. + mean: Mean of a Gaussian distribution, :math:`\\mu`. + ln_var: Logarithm of variance of a Gaussian distribution, + :math:`\\log(\\sigma^2)`. + + Returns: + torch.Tensor: + Negative log-likelihood. 
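+
+    Example (illustrative): at x = mean with ln_var = 0, the elementwise value
+    is log(2*pi)/2, approximately 0.9189, the NLL of a standard normal at its mode.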
+ """ + + x_prec = torch.exp(-ln_var) + x_diff = x - mean + x_power = (x_diff * x_diff) * x_prec * -0.5 + loss = (ln_var + math.log(2 * (math.pi))) / 2 - x_power + return loss + + +class MoFlowLoss(nn.Module): + def __init__(self, config: Config) -> None: + super().__init__() + self.b_n_type = config.num_edge_features + self.a_n_node = config.max_num_nodes + self.a_n_type = config.num_node_features + self.b_size = self.a_n_node * self.a_n_node * self.b_n_type + self.a_size = self.a_n_node * self.a_n_type + + if config.model_config.learn_dist: + self.ln_var = nn.Parameter(torch.zeros(1)) + else: + self.register_buffer('ln_var', torch.zeros(1)) + + def forward(self, h, adj_h, sum_log_det_jacs_x, sum_log_det_jacs_adj): + z = [h, adj_h] + logdet = [sum_log_det_jacs_x, sum_log_det_jacs_adj] + + device = z[0].device + dtype = z[0].dtype + z[0] = z[0].reshape(z[0].shape[0],-1) + z[1] = z[1].reshape(z[1].shape[0], -1) + + logdet[0] = logdet[0] - self.a_size * math.log(2.) + logdet[1] = logdet[1] - self.b_size * math.log(2.) + ln_var_adj = self.ln_var * torch.ones([self.b_size], device=device, dtype=dtype) + ln_var_x = self.ln_var * torch.ones([self.a_size], device=device, dtype=dtype) + nll_adj = torch.mean( + torch.sum(gaussian_nll(z[1], torch.zeros(self.b_size, device=device, dtype=dtype), ln_var_adj), dim=1) + - logdet[1]) + nll_adj = nll_adj / (self.b_size * math.log(2.)) # the negative log likelihood per dim with log base 2 + + nll_x = torch.mean(torch.sum( + gaussian_nll(z[0], torch.zeros(self.a_size, device=device, dtype=dtype), ln_var_x), + dim=1) - logdet[0]) + nll_x = nll_x / (self.a_size * math.log(2.)) # the negative log likelihood per dim with log base 2 + + return nll_x, nll_adj + + +class MoFlow(nn.Module): + def __init__(self, config: Config): + super(MoFlow, self).__init__() + self.config = config + self.b_n_type = config.num_edge_features + self.a_n_node = config.max_num_nodes + self.a_n_type = config.num_node_features + self.b_size = self.a_n_node * self.a_n_node * self.b_n_type + self.a_size = self.a_n_node * self.a_n_type + self.noise_scale = config.model_config.noise_scale + + self.bond_model = Glow( + in_channel=self.b_n_type, + n_flow=config.model_config.bond_config.n_flow, + n_block=config.model_config.bond_config.n_block, + squeeze_fold=config.model_config.bond_config.n_squeeze, + hidden_channel=config.model_config.bond_config.hidden_ch, + conv_lu=config.model_config.bond_config.conv_lu + ) + + self.atom_model = GlowOnGraph( + n_node=self.a_n_node, + in_dim=self.a_n_type, + hidden_dim_dict={ + 'gnn': config.model_config.atom_config.hidden_gnn, + 'linear': config.model_config.atom_config.hidden_lin + }, + n_flow=config.model_config.atom_config.n_flow, + n_block=config.model_config.atom_config.n_block, + mask_row_size_list=config.model_config.atom_config.mask_row_size_list, + mask_row_stride_list=config.model_config.atom_config.mask_row_stride_list, + ) + + self._cuda_graphs = dict() + self.atom_stream = None + self.bond_stream = None + + @torch.jit.ignore + def forward(self, adj: torch.Tensor, x: torch.Tensor, with_cuda_graph: bool = False): + """ + :param adj: (256,4,9,9) + :param x: (256,9,5) + :return: + """ + if with_cuda_graph and self.atom_stream is None: + self.atom_stream = torch.cuda.Stream() + self.bond_stream = torch.cuda.Stream() + h = x + # add uniform noise to node feature matrices + if self.training: + if self.noise_scale == 0: + h = h/2.0 - 0.5 + torch.rand_like(x) * 0.4 + else: + h = h + torch.rand_like(x) * self.noise_scale + if with_cuda_graph: + if 
self.atom_model not in self._cuda_graphs: + h, sum_log_det_jacs_x = self._forward_graph(self.atom_model, adj, h) + else: + self.atom_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self.atom_stream): + h, sum_log_det_jacs_x = self._forward_graph(self.atom_model, adj, h) + else: + h, sum_log_det_jacs_x = self.atom_model(adj, h) + + # add uniform noise to adjacency tensors + if self.training: + if self.noise_scale == 0: + adj_bond = adj/2.0 - 0.5 + torch.rand_like(adj) * 0.4 + else: + adj_bond = adj + torch.rand_like(adj) * self.noise_scale + else: + adj_bond = adj + if with_cuda_graph: + if self.bond_model not in self._cuda_graphs: + adj_h, sum_log_det_jacs_adj = self._forward_graph(self.bond_model, adj_bond) + else: + self.bond_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self.bond_stream): + adj_h, sum_log_det_jacs_adj = self._forward_graph(self.bond_model, adj_bond) + else: + adj_h, sum_log_det_jacs_adj = self.bond_model(adj_bond) + if with_cuda_graph: + torch.cuda.current_stream().wait_stream(self.atom_stream) + torch.cuda.current_stream().wait_stream(self.bond_stream) + return h, adj_h, sum_log_det_jacs_x, sum_log_det_jacs_adj + + @torch.jit.export + def reverse(self, z): + """ + Returns a molecule, given its latent vector. + :param z: latent vector. Shape: [B, N*N*M + N*T] + B = Batch size, N = number of atoms, M = number of bond types, + T = number of atom types (Carbon, Oxygen etc.) + :return: adjacency matrix and feature matrix of a molecule + """ + batch_size = z.shape[0] + z_x = z[:, :self.a_size] + z_adj = z[:, self.a_size:] + + h_adj = z_adj.reshape(batch_size, self.b_n_type, self.a_n_node, self.a_n_node) + h_adj = h_adj.to(memory_format=torch.channels_last) + h_adj = self.bond_model.reverse(h_adj) + + if self.noise_scale == 0: + h_adj = (h_adj + 0.5) * 2 + adj = h_adj + adj = adj + adj.permute(0, 1, 3, 2) + adj = adj / 2 + adj = adj.softmax(dim=1) + max_bond = adj.max(dim=1).values.reshape(batch_size, -1, self.a_n_node, self.a_n_node) + adj = torch.floor(adj / max_bond) + + adj = adj.to(memory_format=torch.channels_last) + h_x = z_x.reshape(batch_size, self.a_n_node, self.a_n_type) + h_x = self.atom_model.reverse((adj, h_x)) + if self.noise_scale == 0: + h_x = (h_x + 0.5) * 2 + return adj, h_x + + @torch.jit.ignore + def _forward_graph(self, model, *args): + if model not in self._cuda_graphs: + if torch.distributed.is_initialized(): + torch.distributed.barrier() + torch.cuda.synchronize() + self._cuda_graphs[model] = torch.cuda.make_graphed_callables( + model, + args, + ) + torch.cuda.synchronize() + if torch.distributed.is_initialized(): + torch.distributed.barrier() + return self._cuda_graphs[model](*args) diff --git a/training/benchmarks/moflow/pytorch/model/utils.py b/training/benchmarks/moflow/pytorch/model/utils.py new file mode 100644 index 000000000..6f9233040 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/model/utils.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+from typing import Iterable
+import torch
+
+def initialize_module(module: torch.nn.Module, inputs: Iterable[torch.Tensor]) -> None:
+    """Use the given sample input to initialize the module.
+    The module must implement a method called `initialize` that takes a list of input tensors.
+    """
+    assert hasattr(module, 'initialize')
+    assert len(inputs) == 1, f'{len(inputs)} inputs'
+    assert module.initialized.item() == 0, 'module already initialized'
+    module.initialize(*inputs)
+    assert module.initialized.item() == 1, 'module failed to initialize'
+
+
+def initialize(model: torch.nn.Module, single_batch: Iterable[torch.Tensor]) -> None:
+    """Initialize all sub-modules in the model given the sample input batch."""
+    hooks = []
+    for name, module in model.named_modules():
+        if hasattr(module, 'initialize'):
+            logging.info(f'marking {name} for initialization')
+            hook = module.register_forward_pre_hook(initialize_module)
+            hooks.append(hook)
+    _ = model(*single_batch)
+    logging.info('all modules initialized, removing hooks')
+    for hook in hooks:
+        hook.remove()
diff --git a/training/benchmarks/moflow/pytorch/optimizers/__init__.py b/training/benchmarks/moflow/pytorch/optimizers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/training/benchmarks/moflow/pytorch/run_pretraining.py b/training/benchmarks/moflow/pytorch/run_pretraining.py
new file mode 100755
index 000000000..07a216567
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/run_pretraining.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2023 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+# Standard library
+import os
+import sys
+import time
+from typing import Any, Tuple
+
+# Third-party libraries
+import torch
+
+# Append the benchmarks directory to sys.path
+CURR_PATH = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.abspath(os.path.join(CURR_PATH,
+                                             "../../")))  # benchmarks directory
+# Local modules
+import config
+from driver import Event, dist_pytorch
+from driver.helper import InitHelper
+
+# Import the required modules, methods and variables. Names are kept
+# consistent across cases; the implementations may differ.
+from runtime.logger import MetricsLogger, PerformanceLogger, setup_logging
+from runtime.common import get_newest_checkpoint, load_state
+from train import trainer_adapter
+from train.evaluator import Evaluator
+from train.trainer import Trainer
+from train.training_state import TrainingState
+from dataloaders.dataloader import build_datasets, build_train_dataloader
+
+logger = None
+
+torch._C._jit_set_autocast_mode(True)
+
+
+def main() -> Tuple[Any, Any]:
+    global logger
+    global config
+
+    # init
+    init_helper = InitHelper(config)
+    model_driver = init_helper.init_driver(globals(), locals())
+    args = model_driver.config
+    dist_pytorch.init_dist_training_env(args)
+    dist_pytorch.barrier(args.vendor)
+    model_driver.event(Event.INIT_START)
+
+    torch.cuda.set_stream(torch.cuda.Stream())
+    os.makedirs(args.results_dir, exist_ok=True)
+
+    args.distributed = dist_pytorch.get_world_size() > 1
+    # logger
+    logger = model_driver.logger
+    dllogger = setup_logging(args)
+    world_size = args.n_device
+    perf_logger, acc_logger = None, None
+    if args.local_rank == 0:
+        perf_logger = PerformanceLogger(dllogger,
+                                        args.train_batch_size * world_size,
+                                        args.warmup_steps)
+        acc_logger = MetricsLogger(dllogger)
+
+    train_dataset, _ = build_datasets(args)
+    train_dataloader = build_train_dataloader(args, train_dataset)
+
+    if args.save_epochs == -1:
+        args.save_epochs = args.epochs
+    if args.eval_epochs == -1:
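+        # -1 means "defer to the run length": fall back to the total number
+        # of epochs (and, just below, to epochs * steps-per-epoch for --steps)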
+        args.eval_epochs = args.epochs
+    if args.steps == -1:
+        args.steps = args.epochs * len(train_dataloader)
+
+    seed = args.seed
+    init_helper.set_seed(seed, args.vendor)
+
+    # Create the TrainingState object
+    training_state = TrainingState()
+
+    # Build the trainer; it depends on the evaluator and the TrainingState object
+    evaluator = Evaluator()
+    trainer = Trainer(
+        driver=model_driver,
+        adapter=trainer_adapter,
+        evaluator=evaluator,
+        training_state=training_state,
+        device=args.device,
+        args=args,
+        perf_logger=perf_logger,
+        acc_logger=acc_logger,
+        train_dataloader=train_dataloader,
+    )
+    training_state._trainer = trainer
+
+    # Set up the distributed environment, then trainer.init()
+    dist_pytorch.barrier(args.vendor)
+    trainer.init()
+    dist_pytorch.barrier(args.vendor)
+
+    if not args.do_train:
+        return args, training_state
+
+    model_driver.event(Event.INIT_END)
+
+    # TRAIN_START
+    dist_pytorch.barrier(args.vendor)
+    model_driver.event(Event.TRAIN_START)
+    train_start_time = time.time()
+
+    snapshot_path = get_newest_checkpoint(args.results_dir)
+    dist_pytorch.main_proc_print(f"snapshot_path: {snapshot_path}")
+
+    first_epoch, step = 0, 0
+    if snapshot_path is not None:
+        snapshot_epoch, ln_var = load_state(snapshot_path,
+                                            trainer.model_callable,
+                                            optimizer=trainer.optimizer,
+                                            device=args.device)
+        trainer.loss_callable.ln_var = torch.nn.Parameter(torch.tensor(ln_var))
+        first_epoch = snapshot_epoch + 1
+        step = first_epoch * len(train_dataloader)
+
+    if first_epoch > args.epochs:
+        dist_pytorch.main_proc_print(
+            f'Model was already trained for {first_epoch} epochs, skipping pretraining'
+        )
+
+    # Training loop
+    epoch = first_epoch
+    while epoch < args.epochs:
+        training_state.epoch = epoch
+        step = trainer.train_one_epoch(train_dataloader, step)
+        epoch += 1
+        if step >= args.steps:
+            break
+    # TRAIN_END event
+    training_state.train_time = time.time() - train_start_time
+    model_driver.event(Event.TRAIN_END)
+
+    if args.local_rank == 0:
+        # The same report for each epoch
+        acc_logger.summarize(step=tuple())
+        perf_logger.summarize(step=tuple())
+
+    res = evaluator.evaluate(args, trainer.config, acc_logger)
+    dist_pytorch.main_proc_print(f"evaluate results: {res}")
+    training_state.nuv = res['nuv']
+    if training_state.nuv >= args.target_nuv:
+        dist_pytorch.main_proc_print(
+            f"converged_success. eval_nuv: {training_state.nuv}, target_nuv: {args.target_nuv}"
+        )
+        training_state.converged_success()
+
+    return config, training_state
+
+
+if __name__ == "__main__":
+    start = time.time()
+    config_update, state = main()
+    if not dist_pytorch.is_main_process():
+        sys.exit(0)
+
+    # Write the training information to the log
+    e2e_time = time.time() - start
+    if config_update.do_train:
+        finished_info = {
+            "e2e_time": e2e_time,
+            "train_time": state.train_time,
+            "train_no_eval_time": state.no_eval_time,
+            "pure_training_computing_time": state.pure_compute_time,
+            "throughput(ips)_raw": round(state.num_trained_samples / state.train_time, 2),
+            "throughput(ips)_no_eval": round(state.num_trained_samples / state.no_eval_time, 2),
+            "throughput(ips)_pure_compute": round(state.num_trained_samples / state.pure_compute_time, 2),
+            "converged": state.converged,
+            "final_nuv": state.nuv,
+        }
+    else:
+        finished_info = {"e2e_time": e2e_time}
+    logger.log(Event.FINISHED, message=finished_info, stacklevel=0)
diff --git a/training/benchmarks/moflow/pytorch/runtime/__init__.py b/training/benchmarks/moflow/pytorch/runtime/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/training/benchmarks/moflow/pytorch/runtime/arguments.py b/training/benchmarks/moflow/pytorch/runtime/arguments.py
new file mode 100644
index 000000000..d5b8c58c5
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/runtime/arguments.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import os
+
+from misc.config import CONFIGS
+from runtime.logger import LOGGING_LEVELS
+
+
+PARSER = argparse.ArgumentParser()
+PARSER.add_argument('--data_dir', type=str, default='/data', help='Location for the dataset.')
+PARSER.add_argument('--config_name', type=str, default='zinc250k', choices=list(CONFIGS),
+                    help='The config to choose. This parameter allows one to switch between different datasets '
+                    'and their dedicated configurations of the neural network. By default, a pre-defined "zinc250k" config is used.')
+PARSER.add_argument('--results_dir', type=str, default='/results', help='Directory where checkpoints are stored.')
+PARSER.add_argument('--predictions_path', type=str, default='/results/predictions.smi',
+                    help='Path to store generated molecules. If an empty string is provided, predictions will not be '
+                    'saved (useful for benchmarking and debugging).')
+PARSER.add_argument('--log_path', type=str, default=None,
+                    help='Path for DLLogger log. This file will contain information about the speed and '
+                    'accuracy of the model during training and inference. Note that if the file '
+                    'already exists, new logs will be added at the end.')
+PARSER.add_argument('--log_interval', type=int, default=20, help='Frequency for writing logs, expressed in steps.')
+PARSER.add_argument('--warmup_steps', type=int, default=20,
+                    help='Number of warmup steps. 
This value is used for benchmarking and for CUDA graph capture.') +PARSER.add_argument('--steps', type=int, default=-1, + help='Number of steps used for training/inference. This parameter allows finishing ' + 'training earlier than the specified number of epochs. If used with inference, ' + 'it allows generating more molecules (by default only a single batch of molecules is generated).') +PARSER.add_argument('--save_epochs', type=int, default=5, + help='Frequency for saving checkpoints, expressed in epochs. If -1 is provided, checkpoints will not be saved.') +PARSER.add_argument('--eval_epochs', type=int, default=5, + help='Evaluation frequency, expressed in epochs. If -1 is provided, an evaluation will not be performed.') +PARSER.add_argument('--learning_rate', type=float, default=0.0005, help='Base learning rate.') +PARSER.add_argument('--beta1', type=float, default=0.9, help='beta1 parameter for the optimizer.') +PARSER.add_argument('--beta2', type=float, default=0.99, help='beta2 parameter for the optimizer.') +PARSER.add_argument('--clip', type=float, default=1, help='Gradient clipping norm.') +PARSER.add_argument('--epochs', type=int, default=300, + help='Number of training epochs. Note that you can finish training mid-epoch by using "--steps" flag.') +PARSER.add_argument('--batch_size', type=int, default=512, help='Batch size per GPU.') +PARSER.add_argument('--num_workers', type=int, default=4, help='Number of workers in the data loader.') +PARSER.add_argument('--seed', type=int, default=1, help='Random seed used to initialize the distributed loaders.') +PARSER.add_argument('--local_rank', default=os.environ.get('LOCAL_RANK', 0), type=int, + help='rank of the GPU, used to launch distributed training. This argument is specified ' + 'automatically by `torchrun` and does not have to be provided by the user.') +PARSER.add_argument('--temperature', type=float, default=0.3, help='Temperature used for sampling.') +PARSER.add_argument('--val_batch_size', type=int, default=100, help='Number of molecules to generate during validation step.') +PARSER.add_argument('--allow_untrained', action='store_true', + help='Allow sampling molecules from an untrained network. Useful for performance benchmarking or debugging purposes.') +PARSER.add_argument('--correct_validity', action='store_true', help='Apply validity correction after the generation of the molecules.') +PARSER.add_argument('--amp', action='store_true', help='Use Automatic Mixed Precision.') +PARSER.add_argument('--cuda_graph', action='store_true', help='Capture GPU kernels with CUDA graphs. This option allows to speed up training.') +PARSER.add_argument('--jit', action='store_true', help='Compile the model with `torch.jit.script`. Can be used to speed up training or inference.') +PARSER.add_argument('--verbosity', type=int, default=1, choices=list(LOGGING_LEVELS), + help='Verbosity level. Specify the following values: 0, 1, 2, 3, where 0 means minimal ' + 'verbosity (errors only) and 3 - maximal (debugging).') diff --git a/training/benchmarks/moflow/pytorch/runtime/common.py b/training/benchmarks/moflow/pytorch/runtime/common.py new file mode 100644 index 000000000..3d630089d --- /dev/null +++ b/training/benchmarks/moflow/pytorch/runtime/common.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from glob import glob
+import logging
+import os
+from typing import List, Optional, Tuple
+import torch
+
+from model.model import MoFlow
+
+
+CHECKPOINT_PATTERN = 'model_snapshot_epoch_%s'
+
+
+def _sort_checkpoints(paths: List[str]) -> List[str]:
+    return sorted(paths, key=lambda x: int(x.split('_')[-1]))
+
+
+def save_state(dir: str, model: MoFlow, optimizer: torch.optim.Optimizer, ln_var: float, epoch: int, keep: int = 1) -> None:
+    """Save the training state in a given dir. This checkpoint can be used to resume training or to run
+    inference with the trained model. This function keeps up to `keep` newest checkpoints and removes the older ones.
+    """
+    save_path = os.path.join(dir, CHECKPOINT_PATTERN % (epoch + 1))
+    state = {
+        'model': model.state_dict(),
+        'optimizer': optimizer.state_dict(),
+        'ln_var': ln_var,
+        'epoch': epoch,
+    }
+    torch.save(state, save_path)
+
+    if keep > 0:
+        filenames = glob(os.path.join(dir, CHECKPOINT_PATTERN % '*'))
+        if len(filenames) <= keep:
+            return
+
+        to_del = _sort_checkpoints(filenames)[:-keep]
+        for path in to_del:
+            os.remove(path)
+
+
+def load_state(path: str, model: MoFlow, device: torch.device, optimizer: Optional[torch.optim.Optimizer] = None) -> Tuple[int, float]:
+    """Load the model's and optimizer's state from a given file.
+    This function returns the number of epochs the model was trained for and the natural
+    logarithm of the variance of the latent-space distribution.
+    """
+    state = torch.load(path, map_location=device)
+    model.load_state_dict(state['model'])
+    if optimizer is not None:
+        optimizer.load_state_dict(state['optimizer'])
+    return state['epoch'], state['ln_var']
+
+
+def get_newest_checkpoint(model_dir: str, validate: bool = True) -> Optional[str]:
+    """Find the newest checkpoint in a given directory.
+    If validate is set to True, this function also verifies that the file can be loaded and
+    selects an older checkpoint if necessary.
+    """
+    filenames = glob(os.path.join(model_dir, CHECKPOINT_PATTERN % '*'))
+    if len(filenames) == 0:
+        logging.info('No checkpoints available')
+        return None
+
+    paths = _sort_checkpoints(filenames)
+    if validate:
+        for latest_path in paths[::-1]:
+            try:
+                torch.load(latest_path, map_location='cpu')
+                break
+            except Exception:
+                logging.info(f'Checkpoint {latest_path} is corrupted')
+        else:
+            logging.info('All available checkpoints were corrupted')
+            return None
+
+    else:
+        latest_path = paths[-1]
+
+    logging.info(f'Found checkpoint {latest_path}')
+    return latest_path
diff --git a/training/benchmarks/moflow/pytorch/runtime/distributed_utils.py b/training/benchmarks/moflow/pytorch/runtime/distributed_utils.py
new file mode 100644
index 000000000..67ca67e16
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/runtime/distributed_utils.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import os + +import torch +import torch.distributed as dist + + +def get_device(local_rank: int) -> torch.device: + if torch.cuda.is_available(): + torch.cuda.set_device(local_rank % torch.cuda.device_count()) + device = torch.device("cuda") + else: + device = torch.device("cpu") + logging.warning("not using a(ny) GPU(s)!") + return device + + +def get_world_size() -> int: + return int(os.environ.get("WORLD_SIZE", 1)) + + +def reduce_tensor(tensor: torch.Tensor, num_gpus: int) -> torch.Tensor: + if num_gpus > 1: + rt = tensor.clone() + dist.all_reduce(rt, op=dist.ReduceOp.SUM) + if rt.is_floating_point(): + rt = rt / num_gpus + else: + rt = rt // num_gpus + return rt + return tensor + + +def init_distributed() -> bool: + world_size = int(os.environ.get("WORLD_SIZE", 1)) + distributed = world_size > 1 + if distributed: + backend = "nccl" if torch.cuda.is_available() else "gloo" + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" # Needed for CUDA graphs + dist.init_process_group(backend=backend, init_method="env://") + assert dist.is_initialized() + + if get_rank() == 0: + logging.info(f"Distributed initialized. World size: {world_size}") + return distributed + + +def get_rank() -> int: + """ + Gets distributed rank or returns zero if distributed is not initialized. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = 0 + return rank diff --git a/training/benchmarks/moflow/pytorch/runtime/generate.py b/training/benchmarks/moflow/pytorch/runtime/generate.py new file mode 100644 index 000000000..de9369494 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/runtime/generate.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
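+
+# Sampling note (editor's sketch of what `infer` below does): it draws
+# z ~ N(mu, sigma^2 I) with sigma = temperature * sqrt(exp(ln_var)), then
+# runs the flow in reverse to decode z into an adjacency tensor and a
+# node-feature matrix.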
+ + +from typing import Optional, Tuple +import numpy as np + +from torch.cuda.amp import autocast +import torch + +from misc.config import CONFIGS, Config + +from model.model import MoFlow +from misc.utils import convert_predictions_to_mols, postprocess_predictions +from runtime.arguments import PARSER +from runtime.common import get_newest_checkpoint, load_state +from runtime.distributed_utils import get_device +from runtime.logger import PerformanceLogger, setup_logging + + +def infer(model: MoFlow, config: Config, device: torch.device, *, + ln_var: float = 0, temp: float = 0.6, mu: Optional[torch.Tensor] = None, + batch_size: int = 20) -> Tuple[np.ndarray, np.ndarray]: + + if mu is None: + mu = torch.zeros(config.z_dim, dtype=torch.float32, device=device) + + sigma = temp * np.sqrt(np.exp(ln_var)) + with torch.no_grad(): + z = torch.normal(mu.reshape(-1, config.z_dim).repeat((batch_size, 1)), sigma) + adj, x = model.reverse(z) + x, adj = postprocess_predictions(x, adj, config=config) + + return adj, x + + +if __name__ == '__main__': + from rdkit import RDLogger + RDLogger.DisableLog('rdApp.*') + + args = PARSER.parse_args() + logger = setup_logging(args) + perf_logger = PerformanceLogger(logger, args.train_batch_size, args.warmup_steps, mode='generate') + if args.predictions_path: + from rdkit.Chem import SmilesWriter + smiles_writer = SmilesWriter(args.predictions_path) + + snapshot_path = get_newest_checkpoint(args.results_dir) + config = CONFIGS[args.config_name] + model = MoFlow(config) + + device = get_device(args.local_rank) + if snapshot_path is not None: + epoch, ln_var = load_state(snapshot_path, model, device=device) + elif args.allow_untrained: + epoch, ln_var = 0, 0 + else: + raise RuntimeError('Generating molecules from an untrained network! ' + 'If this was intentional, pass --allow_untrained flag.') + model.to(device=device, memory_format=torch.channels_last) + model.eval() + if args.jit: + model.atom_model = torch.jit.script(model.atom_model) + model.bond_model = torch.jit.script(model.bond_model) + + + if args.steps == -1: + args.steps = 1 + + with autocast(enabled=args.amp): + for i in range(args.steps): + perf_logger.update() + results = infer( + model, config, ln_var=ln_var, temp=args.temperature, batch_size=args.batch_size, + device=device) + + if (i + 1) % args.log_interval == 0: + perf_logger.summarize(step=(0, i, i)) + if args.predictions_path: + mols_batch = convert_predictions_to_mols(*results, correct_validity=args.correct_validity) + for mol in mols_batch: + smiles_writer.write(mol) + + perf_logger.summarize(step=tuple()) + if args.predictions_path: + smiles_writer.close() diff --git a/training/benchmarks/moflow/pytorch/runtime/logger.py b/training/benchmarks/moflow/pytorch/runtime/logger.py new file mode 100644 index 000000000..b597d26de --- /dev/null +++ b/training/benchmarks/moflow/pytorch/runtime/logger.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
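+
+# Usage sketch (illustrative, mirroring run_pretraining.py):
+#     perf = PerformanceLogger(dllogger, batch_size=512, warmup_steps=20)
+#     for batch in loader:
+#         perf.update()             # timestamps are collected after warmup
+#     perf.summarize(step=tuple())  # logs throughput and latency percentiles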
+ + +from abc import ABC, abstractmethod +import logging +import time + +import dllogger +from dllogger import JSONStreamBackend, StdOutBackend, Verbosity +import numpy as np + + +LOGGING_LEVELS = dict(enumerate([logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG])) + + +def get_dllogger(args): + backends = [] + if args.local_rank == 0: + backends.append(StdOutBackend(Verbosity.VERBOSE)) + if args.log_path is not None: + backends.append(JSONStreamBackend(Verbosity.VERBOSE, args.log_path, append=True)) + dllogger.init(backends=backends) + return dllogger + + +def setup_logging(args): + logging.basicConfig( + format='%(asctime)s %(levelname)s:\t%(message)s', datefmt='%H:%M:%S', level=LOGGING_LEVELS[args.verbosity], force=True + ) + return get_dllogger(args) + + +class BaseLogger(ABC): + @abstractmethod + def update(self, **kwargs) -> None: + pass + + @abstractmethod + def process_stats(self) -> dict: + return {} + + @abstractmethod + def reset(self) -> None: + pass + + def summarize(self, step: tuple): + stats = self.process_stats() + if len(stats) == 0: + logging.warn('Empty stats for logging, skipping') + return + self.logger.log(step=step, data=stats) + self.logger.flush() + return stats + + +class PerformanceLogger(BaseLogger): + def __init__(self, logger, batch_size: int, warmup_steps: int = 100, mode: str = 'train'): + self.logger = logger + self.batch_size = batch_size + self.warmup_steps = warmup_steps + self._step = 0 + self._timestamps = [] + self.mode = mode + + def update(self, **kwargs) -> None: + self._step += 1 + if self._step >= self.warmup_steps: + self._timestamps.append(time.time()) + + def reset(self) -> None: + self._step = 0 + self._timestamps = [] + + def process_stats(self) -> dict: + if len(self._timestamps) < 2: + logging.warn('Cannot process performance stats - less than 2 measurements collected') + return {} + + timestamps = np.asarray(self._timestamps) + deltas = np.diff(timestamps) + throughput = (self.batch_size / deltas).mean() + stats = { + f'throughput_{self.mode}': throughput, + f'latency_{self.mode}_mean': deltas.mean(), + f'total_time_{self.mode}': timestamps[-1] - timestamps[0], + } + for level in [90, 95, 99]: + stats.update({f'latency_{self.mode}_{level}': np.percentile(deltas, level)}) + + return stats + + +class MetricsLogger(BaseLogger): + def __init__(self, logger, mode: str = 'train'): + self.logger = logger + self.mode = mode + self._metrics_dict = {} + + def update(self, metrics: dict, **kwargs) -> None: + for metrics_name, metric_val in metrics.items(): + if metrics_name not in self._metrics_dict: + self._metrics_dict[metrics_name] = [] + self._metrics_dict[metrics_name].append(float(metric_val)) + + def reset(self) -> None: + self._metrics_dict = {} + + def process_stats(self) -> dict: + stats = {} + for metric_name, metric_val in self._metrics_dict.items(): + stats[metric_name] = np.mean(metric_val) + return stats diff --git a/training/benchmarks/moflow/pytorch/train/__init__.py b/training/benchmarks/moflow/pytorch/train/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/training/benchmarks/moflow/pytorch/train/evaluator.py b/training/benchmarks/moflow/pytorch/train/evaluator.py new file mode 100755 index 000000000..6af54fb72 --- /dev/null +++ b/training/benchmarks/moflow/pytorch/train/evaluator.py @@ -0,0 +1,110 @@ +import os +import sys +import torch +import numpy as np +from functools import partial + +from torch.cuda.amp import autocast + +from misc.utils import check_validity, convert_predictions_to_mols, 
predictions_to_smiles, check_novelty +from runtime.common import get_newest_checkpoint, load_state +from data.data_loader import NumpyTupleDataset +from data import transform +from runtime.generate import infer +from model.model import MoFlow + +# add benchmarks directory to sys.path +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import dist_pytorch + + +class Evaluator: + + def __init__(self): + pass + + def evaluate(self, args, config, acc_logger): + device = args.device + snapshot_path = get_newest_checkpoint(args.results_dir) + args.jit = True + model = MoFlow(config) + + dist_pytorch.main_proc_print(f"snapshot_path: {snapshot_path}") + + if snapshot_path is not None: + epoch, ln_var = load_state(snapshot_path, model, device=device) + elif args.allow_untrained: + epoch, ln_var = 0, 0 + else: + raise RuntimeError( + 'Generating molecules from an untrained network! ' + 'If this was intentional, pass --allow_untrained flag.') + dist_pytorch.main_proc_print(f"ln_var ===> {ln_var}") + + model.to(device) + model.eval() + + if args.steps == -1: + args.steps = 1 + else: + args.steps = 100 + + dist_pytorch.main_proc_print(f"ln_var ===> {ln_var}. args.steps:{args.steps}") + + valid_idx = transform.get_val_ids(config, args.data_dir) + dataset = NumpyTupleDataset.load( + os.path.join(args.data_dir, config.dataset_config.dataset_file), + transform=partial(transform.transform_fn, config=config), + ) + train_idx = [t for t in range(len(dataset)) if t not in valid_idx] + n_train = len(train_idx) + train_dataset = torch.utils.data.Subset(dataset, train_idx) + train_x = torch.Tensor(np.array([a[0] for a in train_dataset])) + train_adj = torch.Tensor(np.array([a[1] for a in train_dataset])) + + train_smiles = set(predictions_to_smiles(train_adj, train_x, config)) + + metrics = dict() + with autocast(enabled=args.amp): + for i in range(args.steps): + results = infer(model, + config, + ln_var=ln_var, + temp=args.temperature, + batch_size=args.train_batch_size, + device=device) + + mols_batch = convert_predictions_to_mols( + *results, + correct_validity=args.correct_validity, + ) + validity_info = check_validity(mols_batch) + + novel_r, abs_novel_r = check_novelty( + validity_info['valid_smiles'], + train_smiles, + len(mols_batch), + ) + _, nuv = check_novelty( + list(set(validity_info['valid_smiles'])), + train_smiles, + len(mols_batch), + ) + metrics = { + 'validity': validity_info['valid_ratio'], + 'novelty': novel_r, + 'uniqueness': validity_info['unique_ratio'], + 'abs_novelty': abs_novel_r, + 'abs_uniqueness': validity_info['abs_unique_ratio'], + 'nuv': nuv, + } + dist_pytorch.main_proc_print(f"step:{i} metrics:{metrics}") + if args.local_rank == 0: + acc_logger.update(metrics) + + if args.local_rank == 0: + stats = acc_logger.summarize(step=tuple()) + return stats + + return None diff --git a/training/benchmarks/moflow/pytorch/train/trainer.py b/training/benchmarks/moflow/pytorch/train/trainer.py new file mode 100755 index 000000000..ab9a95d5b --- /dev/null +++ b/training/benchmarks/moflow/pytorch/train/trainer.py @@ -0,0 +1,237 @@ +# Copyright (c) 2023 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +import time +import os +import sys +import argparse +from typing import Dict + +import torch +import torch.utils.data +from torch.types import Device +from torch.cuda.amp import autocast + +from model import create_model +from model.model import MoFlow, MoFlowLoss +from model.utils import initialize +from train.evaluator import Evaluator +from train.training_state import TrainingState +from runtime.generate import infer +from runtime.distributed_utils import reduce_tensor +from runtime.common import save_state +from misc.utils import check_validity, convert_predictions_to_mols +from misc.config import CONFIGS, Config +# add benchmarks directory to sys.path +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import Driver, dist_pytorch + + +class Trainer: + + def __init__(self, driver: Driver, adapter, evaluator: Evaluator, + training_state: TrainingState, device: Device, args, + perf_logger, acc_logger, train_dataloader): + super(Trainer, self).__init__() + self.driver = driver + self.adapter = adapter + self.training_state = training_state + self.device = device + self.evaluator = evaluator + self.args = args + self.perf_logger = perf_logger + self.acc_logger = acc_logger + self.train_dataloader = train_dataloader + + self.scaler = None + self.config = None + self.clip_grad = None + self.loss_module = None + self.model_callable = None + self.loss_callable = None + self.model = None + + def init(self): + args = self.args + dist_pytorch.main_proc_print("Init progress:") + self.config = CONFIGS[self.args.config_name] + self.model = create_model(self.config) + self.model.to(self.device) + device = args.device + model = self.model + x, adj, *_ = next(iter(self.train_dataloader)) + x = x.to(device) + adj = adj.to(device) + with autocast(enabled=args.amp): + initialize(self.model, (adj, x)) + + model.to(memory_format=torch.channels_last) + adj.to(memory_format=torch.channels_last) + + if args.jit: + model.bond_model = torch.jit.script(model.bond_model) + model.atom_model = torch.jit.script(model.atom_model) + + # make one pass in both directions to make sure that model works + with torch.no_grad(): + _ = model(adj, x) + _ = model.reverse( + torch.randn(args.train_batch_size, + self.config.z_dim, + device=device)) + + self.model = self.adapter.convert_model(self.model) + self.model = self.adapter.model_to_fp16(self.model, self.args) + self.model = self.adapter.model_to_ddp(self.model, self.args) + self.loss_module = MoFlowLoss(self.config) + self.loss_module.to(self.device) + self.loss_module = self.adapter.model_to_ddp(self.loss_module, + self.args) + self.model_callable, self.loss_callable = self._get_callables() + + self.optimizer = self.adapter.create_optimizer(self.model, self.args, + self.loss_module) + self.scaler = self.adapter.create_grad_scaler(self.args) + self.clip_grad = self.adapter.create_clip_grad() + + def _get_callables(self): + args = self.args + is_distributed = args.distributed + model_callable, loss_callable = None, None + if is_distributed: + model_callable = self.model.module + loss_callable = self.loss_module.module + else: + model_callable = self.model + loss_callable = self.loss_module + return model_callable, loss_callable + + def train_one_epoch(self, train_dataloader, step) -> int: + model = self.model + args = self.args + device = self.device + epoch = self.training_state.epoch + clip_grad_norm_ = self.clip_grad 
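+        # clip_grad_norm_ is supplied by the vendor trainer_adapter; on NVIDIA it
+        # is apex.contrib.clip_grad.clip_grad_norm_
+        # (see training/nvidia/moflow-pytorch/extern/trainer_adapter.py)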
+ world_size = args.n_device + local_rank = self.args.local_rank + is_distributed = args.distributed + + + if local_rank == 0: + self.acc_logger.reset() + + print(f"Epoch: {epoch}") + if args.distributed: + train_dataloader.sampler.set_epoch(epoch) + + model.train() + noeval_start_time = time.time() + + for i, batch in enumerate(train_dataloader): + if local_rank == 0: + self.perf_logger.update() + step += 1 + self.optimizer.zero_grad() + x = batch[0].to(device) + adj = batch[1].to(device=device, memory_format=torch.channels_last) + + pure_compute_start_time = time.time() + + # Forward, backward and optimize + with_cuda_graph = (args.cuda_graph and step >= args.warmup_steps + and x.size(0) == args.train_batch_size) + + with autocast(enabled=args.amp, cache_enabled=not with_cuda_graph): + output = model(adj, x, with_cuda_graph=with_cuda_graph) + nll_x, nll_adj = self.loss_module(*output) + loss = nll_x + nll_adj + + if args.amp: + self.scaler.scale(loss).backward() + self.scaler.unscale_(self.optimizer) + clip_grad_norm_(model.parameters(), args.clip) + self.scaler.step(self.optimizer) + self.scaler.update() + else: + loss.backward() + clip_grad_norm_(model.parameters(), args.clip) + self.optimizer.step() + + self.training_state.pure_compute_time += time.time() - pure_compute_start_time + + # Print log info + if (i + 1) % args.log_interval == 0: + nll_x_value = reduce_tensor(nll_x, world_size).item() + nll_adj_value = reduce_tensor(nll_adj, world_size).item() + loss_value = nll_x_value + nll_adj_value + + if local_rank == 0: + self.acc_logger.update({ + 'loglik': loss_value, + 'nll_x': nll_x_value, + 'nll_adj': nll_adj_value + }) + + self.acc_logger.summarize(step=(epoch, i, i)) + self.perf_logger.summarize(step=(epoch, i, i)) + + + if step >= args.steps: + break + + self.training_state.num_trained_samples += len(train_dataloader.dataset) + self.training_state.no_eval_time += time.time() - noeval_start_time + + if (epoch + 1) % args.eval_epochs == 0: + with autocast(enabled=args.amp): + metrics = run_validation(self.model, self.config, + self.loss_callable.ln_var.item(), args, + is_distributed, world_size, device) + dist_pytorch.main_proc_print(f"epoch:{epoch+1}, metrics:{metrics}") + + if local_rank == 0: + self.acc_logger.update(metrics) + + # The same report for each epoch + if local_rank == 0: + self.acc_logger.summarize(step=(epoch, )) + self.perf_logger.summarize(step=(epoch, )) + + # Save the model checkpoints + if (epoch + 1) % args.save_epochs == 0: + if local_rank == 0 or not is_distributed: + save_state(args.results_dir, + self.model_callable, + self.optimizer, + self.loss_callable.ln_var.item(), + epoch, + keep=5) + return step + +def run_validation(model: MoFlow, config: Config, ln_var: float, + args: argparse.Namespace, is_distributed: bool, + world_size: int, device: torch.device) -> Dict[str, float]: + model.eval() + model_callable = model.module if is_distributed else model + + result = infer(model_callable, + config, + device=device, + ln_var=ln_var, + batch_size=args.eval_batch_size, + temp=args.temperature) + mols = convert_predictions_to_mols(*result, + correct_validity=args.correct_validity) + validity_info = check_validity(mols) + valid_ratio = torch.tensor(validity_info['valid_ratio'], + dtype=torch.float32, + device=device) + unique_ratio = torch.tensor(validity_info['unique_ratio'], + dtype=torch.float32, + device=device) + valid_value = reduce_tensor(valid_ratio, world_size).detach().cpu().numpy() + unique_value = reduce_tensor(unique_ratio, + 
world_size).detach().cpu().numpy()
+    model.train()
+    return {'valid': valid_value, 'unique': unique_value}
diff --git a/training/benchmarks/moflow/pytorch/train/trainer_adapter.py b/training/benchmarks/moflow/pytorch/train/trainer_adapter.py
new file mode 100755
index 000000000..2d40170be
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/train/trainer_adapter.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2023 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+import os
+import sys
+
+import torch
+import torch.distributed as dist
+from torch import nn
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+CURR_PATH = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
+from driver.dist_pytorch import main_proc_print
+
+
+# The three factories below are stubs that the vendor's extern/trainer_adapter.py overrides.
+def create_optimizer(model, args, loss_module):
+    pass
+
+
+def create_clip_grad():
+    pass
+
+
+def create_grad_scaler(args):
+    pass
+
+
+def convert_model(model: nn.Module) -> nn.Module:
+    return model
+
+
+def model_to_fp16(model: nn.Module, args) -> nn.Module:
+    # To prevent OOM for model sizes that cannot fit in GPU memory in full
+    # precision. Note that the cast below is to bfloat16 (not float16), which
+    # keeps float32's dynamic range at reduced precision.
+    if args.fp16:
+        main_proc_print(" > use fp16 (model cast to bfloat16)...")
+        model.to(torch.bfloat16)
+    return model
+
+
+def model_to_ddp(model: nn.Module, args) -> nn.Module:
+    if dist.is_available() and dist.is_initialized():
+        model = DDP(model, device_ids=[args.local_rank])
+    return model
diff --git a/training/benchmarks/moflow/pytorch/train/training_state.py b/training/benchmarks/moflow/pytorch/train/training_state.py
new file mode 100755
index 000000000..35e6dd53d
--- /dev/null
+++ b/training/benchmarks/moflow/pytorch/train/training_state.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2023 BAAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+from dataclasses import dataclass
+
+
+@dataclass
+class TrainingState:
+    _trainer = None
+    _status = 'aborted'  # later set to 'success' if the termination criteria are met
+
+    global_steps = 0
+
+    loss: float = 0.0
+    nuv: float = 0.0
+
+    epoch: int = 1
+
+    end_training: bool = False
+    converged: bool = False
+
+    train_time = 0.001
+    no_eval_time = 0.001
+    pure_compute_time = 0.001
+
+    num_trained_samples = 0
+
+    def status(self):
+        """get status"""
+        if self.converged:
+            self._status = "success"
+        return self._status
+
+    def converged_success(self):
+        """converged success"""
+        self.end_training = True
+        self.converged = True
diff --git a/training/nvidia/moflow-pytorch/README.md b/training/nvidia/moflow-pytorch/README.md
new file mode 100644
index 000000000..a7fccc451
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/README.md
@@ -0,0 +1,51 @@
+### 1. Dataset preparation
+
+[ZINC 250k](../../benchmarks/moflow/pytorch/README.md#dataset)
+
+### 2. NVIDIA GPU configuration and run information
+#### Environment setup
+- ##### Hardware
+  - Machine model: NVIDIA DGX A100 (40G)
+  - Accelerator model: NVIDIA_A100-SXM4-40GB
+  - CPU model: AMD EPYC7742-64core@1.5G
+  - Multi-node network type and bandwidth: InfiniBand, 200Gb/s
+
+- ##### Software
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.4.0-113-generic
+  - Accelerator driver version: 470.129.06
+  - Docker version: 20.10.16
+  - Training framework version: pytorch-1.13.0a0+936e930
+  - Dependency versions:
+    - cuda: 11.4
+
+
+### 3. Results
+
+* Common metrics
+
+| Metric | Value | Notes |
+| -------------- | ----------------------- | ----------------------------------------------------------- |
+| Task category | DrugDiscovery | |
+| Model | MoFlow | |
+| Dataset | ZINC 250k | |
+| Data precision | precision, see "Performance metrics" | one of tf32/amp/fp16 |
+| Hyperparameter overrides | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware when measuring throughput |
+| Hardware | NVIDIA A100 | |
+| Device memory usage | mem, see "Performance metrics" | commonly called "GPU memory", in GiB |
+| End-to-end time | e2e_time, see "Performance metrics" | total time + Perf initialization time etc. |
+| Overall throughput | p_whole, see "Performance metrics" | samples actually trained divided by total time (performance_whole) |
+| Training throughput | p_train, see "Performance metrics" | excludes the evaluation time at the end of each epoch |
+| **Compute throughput** | **p_core, see "Performance metrics"** | excludes data-IO time (p_core > p_train > p_whole) |
+| Training result | nuv, see "Performance metrics" | percentage of all generated molecules that are Novel, Unique and Valid |
+| Additional modifications | none | |
+
+* Performance metrics
+
+| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_nuv | mem |
+| ----------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | --------- | --------- |
+| A100 single node 8 GPUs (1x8) | amp | / | 3114 | 24008 | 27165 | 30343 | 78.4 | 11.8/40.0 |
+| A100 single node 8 GPUs (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | 76 | 34.6/40.0 |
+| A100 single GPU (1x1) | amp | bs=256,lr=0.8 | / | 4447 | 4542 | 4819 | / | 10.9/40.0 |
+| A100 two nodes x 8 GPUs (2x8) | amp | bs=256,lr=0.8 | / | 10576 | 11085 | 11874 | / | 27.9/40.0 |
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x1.py b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
new file mode 100644
index 000000000..47f1f7d43
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
@@ -0,0 +1,4 @@
+lr = 0.0001
+train_batch_size = 512
+eval_batch_size = 100
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x8.py b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
new file mode 100644
index 000000000..9ead7fcdb
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
@@ -0,0 +1,4 @@
+lr = 0.0005
+train_batch_size = 512
+eval_batch_size = 100
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x2x8.py b/training/nvidia/moflow-pytorch/config/config_A100x2x8.py
new file mode 100644
index 000000000..9049954f7
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x2x8.py
@@ -0,0 +1,3 @@
+lr = 0.003
+train_batch_size = 3072
+eval_batch_size = 600
\ No newline at end of file
diff --git a/training/nvidia/moflow-pytorch/config/environment_variables.sh b/training/nvidia/moflow-pytorch/config/environment_variables.sh
new file mode 100755
index 000000000..c035c8297
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/environment_variables.sh
@@ -0,0 +1,8 @@
+# =================================================
+# Export variables
+# =================================================
+# Add your own proxy here if you have problems connecting with github.com
+export http_proxy="10.1.0.34:7890"
+export https_proxy="10.1.0.34:7890"
+
+export OMP_NUM_THREADS=8
diff --git a/training/nvidia/moflow-pytorch/config/requirements.txt b/training/nvidia/moflow-pytorch/config/requirements.txt
new file mode 100644
index 000000000..d58f2e8a9
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/requirements.txt
@@ -0,0 +1,2 @@
+rdkit
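+# dllogger is NVIDIA's structured logging helper used by runtime/logger.py;
+# it is installed straight from GitHub below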
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x1.py b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
new file mode 100644
index 000000000..47f1f7d43
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
@@ -0,0 +1,4 @@
+lr = 0.0001
+train_batch_size = 512
+eval_batch_size = 100
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x8.py b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
new file mode 100644
index 000000000..9ead7fcdb
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
@@ -0,0 +1,4 @@
+lr = 0.0005
+train_batch_size = 512
+eval_batch_size = 100
+
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x2x8.py b/training/nvidia/moflow-pytorch/config/config_A100x2x8.py
new file mode 100644
index 000000000..9049954f7
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/config_A100x2x8.py
@@ -0,0 +1,3 @@
+lr = 0.003
+train_batch_size = 3072
+eval_batch_size = 600
\ No newline at end of file
diff --git a/training/nvidia/moflow-pytorch/config/environment_variables.sh b/training/nvidia/moflow-pytorch/config/environment_variables.sh
new file mode 100755
index 000000000..c035c8297
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/environment_variables.sh
@@ -0,0 +1,8 @@
+# =================================================
+# Export variables
+# =================================================
+# Add your own proxy here if you have problems connecting with github.com
+export http_proxy="10.1.0.34:7890"
+export https_proxy="10.1.0.34:7890"
+
+export OMP_NUM_THREADS=8
diff --git a/training/nvidia/moflow-pytorch/config/requirements.txt b/training/nvidia/moflow-pytorch/config/requirements.txt
new file mode 100644
index 000000000..d58f2e8a9
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/config/requirements.txt
@@ -0,0 +1,2 @@
+rdkit
+git+https://github.com/NVIDIA/dllogger#egg=dllogger
\ No newline at end of file
diff --git a/training/nvidia/moflow-pytorch/extern/trainer_adapter.py b/training/nvidia/moflow-pytorch/extern/trainer_adapter.py
new file mode 100644
index 000000000..aff774f81
--- /dev/null
+++ b/training/nvidia/moflow-pytorch/extern/trainer_adapter.py
@@ -0,0 +1,17 @@
+import torch
+from apex.optimizers import FusedAdam as Adam
+from apex.contrib.clip_grad import clip_grad_norm_
+
+
+def create_optimizer(model, args, loss_module):
+    optimizer = Adam((*model.parameters(), *loss_module.parameters()), lr=args.lr, betas=(args.beta1, args.beta2))
+    return optimizer
+
+
+def create_clip_grad():
+    return clip_grad_norm_
+
+
+def create_grad_scaler(args):
+    """create_grad_scaler for mixed precision training"""
+    scaler = torch.cuda.amp.GradScaler() if args.amp else None
+    return scaler
\ No newline at end of file
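The vendor adapter above wires apex's FusedAdam, apex's fused gradient clipping, and an optional torch.cuda.amp.GradScaler into the benchmark. For context, a scaler produced this way is normally consumed with the standard torch.cuda.amp recipe; the sketch below shows that pattern under illustrative names (the benchmark's real loop lives in train/trainer.py and differs in detail):

```python
import torch


def training_step(model, loss_module, optimizer, scaler, clip_grad, adj, x,
                  max_norm: float = 1.0):
    """One illustrative AMP step using the objects built by trainer_adapter."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=scaler is not None):
        output = model(adj, x)
        loss = loss_module(*output)  # assumes the loss module unpacks model outputs
    if scaler is not None:
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # so clipping sees unscaled gradients
        clip_grad(model.parameters(), max_norm)
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        clip_grad(model.parameters(), max_norm)
        optimizer.step()
    return loss.item()
```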
From 308bc2e74c564b6ed694d15b34a115e2b3ae982a Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Wed, 17 Jan 2024 17:29:04 +0800
Subject: [PATCH 02/12] update readme

---
 .../benchmarks/moflow/pytorch/config/_base.py |  2 +-
 training/nvidia/moflow-pytorch/README.md      | 18 ++++++++++++------
 .../moflow-pytorch/config/config_A100x1x1.py  |  5 ++---
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py
index b1ff28e92..00dc8cc4d 100755
--- a/training/benchmarks/moflow/pytorch/config/_base.py
+++ b/training/benchmarks/moflow/pytorch/config/_base.py
@@ -36,7 +36,7 @@
 train_batch_size: int = 512
 eval_batch_size: int = 100
 
-target_nuv: float = 85.0
+target_nuv: float = 80
 
 # Frequency for saving checkpoints, expressed in epochs. If -1 is provided, checkpoints will not be saved.
 save_epochs: int = 5
diff --git a/training/nvidia/moflow-pytorch/README.md b/training/nvidia/moflow-pytorch/README.md
index a7fccc451..841b99c67 100644
--- a/training/nvidia/moflow-pytorch/README.md
+++ b/training/nvidia/moflow-pytorch/README.md
@@ -42,10 +42,16 @@
 
 * Performance metrics
 
-| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_nuv | mem |
-| ----------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | --------- | --------- |
-| A100 single node, 8 cards (1x8) | amp | / | 3114 | 24008 | 27165 | 30343 | 78.4 | 11.8/40.0 |
-| A100 single node, 8 cards (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | 76 | 34.6/40.0 |
-| A100 single node, 1 card (1x1) | amp | bs=256,lr=0.8 | / | 4447 | 4542 | 4819 | / | 10.9/40.0 |
-| A100 two nodes, 8 cards each (2x8) | amp | bs=256,lr=0.8 | / | 10576 | 11085 | 11874 | / | 27.9/40.0 |
+| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_nuv | mem |
+| ----------------- | --------- | ----------------- | -------- | ------- | ------- | ------ | --------- | --------- |
+| A100 single node, 8 cards (1x8) | amp | / | 3098 | 24128 | 27514 | 30696 | 78.4 | 11.8/40.0 |
+| A100 single node, 8 cards (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | 76.1 | 34.6/40.0 |
+| A100 single node, 1 card (1x1) | amp | bs=3584,lr=0.0001 | / | 5810 | 5992 | 6387 | / | 37.9/40.0 |
+| A100 two nodes, 8 cards each (2x8) | amp | bs=3072,lr=0.0005 | / | 47655 | 63957 | 90228 | / | 34.5/40.0 |
+
+
+
+> Note
+> The [NUV](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow#results) reported in the original repository is the average over **20** runs with **20** different random seeds.
+> Because of limited machine resources, we could not run such a large number of validation experiments; the results here may therefore differ somewhat from those values.
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x1.py b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
index 47f1f7d43..15500667d 100644
--- a/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x1.py
@@ -1,4 +1,3 @@
 lr = 0.0001
-train_batch_size = 512
-eval_batch_size = 100
-
+train_batch_size = 3584
+eval_batch_size = 100
\ No newline at end of file

From 11e50f349c96ae84573d9f9534758335f368fae1 Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Wed, 17 Jan 2024 17:32:23 +0800
Subject: [PATCH 03/12] add case example for test_conf

---
 .../nvidia/moflow-pytorch/config/environment_variables.sh | 4 ++--
 training/run_benchmarks/config/test_conf.py               | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/training/nvidia/moflow-pytorch/config/environment_variables.sh b/training/nvidia/moflow-pytorch/config/environment_variables.sh
index c035c8297..0a9382166 100755
--- a/training/nvidia/moflow-pytorch/config/environment_variables.sh
+++ b/training/nvidia/moflow-pytorch/config/environment_variables.sh
@@ -2,7 +2,7 @@
 # Export variables
 # =================================================
 # Add your own proxy here if you have problems connecting with github.com
-export http_proxy="10.1.0.34:7890"
-export https_proxy="10.1.0.34:7890"
+# export http_proxy="127.0.0.1:7890"
+# export https_proxy="127.0.0.1:7890"
 
 export OMP_NUM_THREADS=8
"/raid/dataset/ImageNet2012/tf_records/", + "moflow:pytorch_1.13:A100:1:1:1": "/raid/dataset/MoFlow/data/", # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", From 2cb2eceae19ae621ed86772dbcf3bd168ee8668b Mon Sep 17 00:00:00 2001 From: zhouyu Date: Wed, 17 Jan 2024 17:34:59 +0800 Subject: [PATCH 04/12] change to comment --- training/run_benchmarks/config/test_conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 254585727..4aa5e1051 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -80,7 +80,7 @@ # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech", # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", # "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/", - "moflow:pytorch_1.13:A100:1:1:1": "/raid/dataset/MoFlow/data/", + # "moflow:pytorch_1.13:A100:1:1:1": "/raid/dataset/MoFlow/data/", # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", From 2dc11409d488de6b7621f1922b59ea5b380209a9 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Fri, 19 Jan 2024 10:29:58 +0800 Subject: [PATCH 05/12] rdkit add version --- training/nvidia/moflow-pytorch/config/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/nvidia/moflow-pytorch/config/requirements.txt b/training/nvidia/moflow-pytorch/config/requirements.txt index d58f2e8a9..768ed454f 100644 --- a/training/nvidia/moflow-pytorch/config/requirements.txt +++ b/training/nvidia/moflow-pytorch/config/requirements.txt @@ -1,2 +1,2 @@ -rdkit +rdkit==2023.9.3 git+https://github.com/NVIDIA/dllogger#egg=dllogger \ No newline at end of file From abee0464fa6a96442a29aa3da6538aa47c300d1c Mon Sep 17 00:00:00 2001 From: zhouyu Date: Fri, 23 Feb 2024 13:09:41 +0800 Subject: [PATCH 06/12] add jit & cuda_graph to mutable_params, overwritten by vendors are allowed --- training/benchmarks/moflow/pytorch/config/mutable_params.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/benchmarks/moflow/pytorch/config/mutable_params.py b/training/benchmarks/moflow/pytorch/config/mutable_params.py index 7dcf8652a..ec54460d8 100755 --- a/training/benchmarks/moflow/pytorch/config/mutable_params.py +++ b/training/benchmarks/moflow/pytorch/config/mutable_params.py @@ -1,5 +1,6 @@ mutable_params = [ 'vendor', 'data_dir', 'lr', 'train_batch_size', 'eval_batch_size', 'do_train', 'amp', 'fp16', 'distributed', 'dist_backend', 'num_workers', - 'device', 'cudnn_benchmark', 'cudnn_deterministic' + 'device', 'cudnn_benchmark', 'cudnn_deterministic', + 'jit', 'cuda_graph' ] From 9d2324396b5e5d1e815b304819a5a6ccaad99a41 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Mon, 26 Feb 2024 10:14:27 +0800 Subject: [PATCH 07/12] rename config_name to dataset_name --- training/benchmarks/moflow/pytorch/config/_base.py | 2 +- training/benchmarks/moflow/pytorch/dataloaders/dataloader.py | 4 ++-- training/benchmarks/moflow/pytorch/runtime/arguments.py | 2 +- training/benchmarks/moflow/pytorch/runtime/generate.py | 2 +- training/benchmarks/moflow/pytorch/train/trainer.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py index 00dc8cc4d..9475338b6 100755 --- a/training/benchmarks/moflow/pytorch/config/_base.py +++ 
From 9d2324396b5e5d1e815b304819a5a6ccaad99a41 Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Mon, 26 Feb 2024 10:14:27 +0800
Subject: [PATCH 07/12] rename config_name to dataset_name

---
 training/benchmarks/moflow/pytorch/config/_base.py           | 2 +-
 training/benchmarks/moflow/pytorch/dataloaders/dataloader.py | 4 ++--
 training/benchmarks/moflow/pytorch/runtime/arguments.py      | 2 +-
 training/benchmarks/moflow/pytorch/runtime/generate.py       | 2 +-
 training/benchmarks/moflow/pytorch/train/trainer.py          | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py
index 00dc8cc4d..9475338b6 100755
--- a/training/benchmarks/moflow/pytorch/config/_base.py
+++ b/training/benchmarks/moflow/pytorch/config/_base.py
@@ -14,7 +14,7 @@
 # =========================================================
 # The config to choose. This parameter allows one to switch between different datasets
 # and their dedicated configurations of the neural network. By default, a pre-defined "zinc250k" config is used.
-config_name: str = "zinc250k"
+dataset_name: str = "zinc250k"
 
 # Number of workers in the data loader.
 num_workers: int = 4
diff --git a/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py b/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py
index a9591b20f..d1e8d27c2 100644
--- a/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py
+++ b/training/benchmarks/moflow/pytorch/dataloaders/dataloader.py
@@ -12,8 +12,8 @@
 
 def build_datasets(args):
     # Model configuration
-    assert args.config_name in CONFIGS
-    config = CONFIGS[args.config_name]
+    assert args.dataset_name in CONFIGS
+    config = CONFIGS[args.dataset_name]
     data_file = config.dataset_config.dataset_file
     transform_fn = functools.partial(transform.transform_fn, config=config)
     valid_idx = transform.get_val_ids(config, args.data_dir)
diff --git a/training/benchmarks/moflow/pytorch/runtime/arguments.py b/training/benchmarks/moflow/pytorch/runtime/arguments.py
index d5b8c58c5..5a48fe4d2 100644
--- a/training/benchmarks/moflow/pytorch/runtime/arguments.py
+++ b/training/benchmarks/moflow/pytorch/runtime/arguments.py
@@ -22,7 +22,7 @@
 PARSER = argparse.ArgumentParser()
 PARSER.add_argument('--data_dir', type=str, default='/data', help='Location for the dataset.')
-PARSER.add_argument('--config_name', type=str, default='zinc250k', choices=list(CONFIGS),
+PARSER.add_argument('--dataset_name', type=str, default='zinc250k', choices=list(CONFIGS),
                     help='The config to choose. This parameter allows one to switch between different datasets '
                          'and their dedicated configurations of the neural network. 
By default, a pre-defined "zinc250k" config is used.') PARSER.add_argument('--results_dir', type=str, default='/results', help='Directory where checkpoints are stored.') diff --git a/training/benchmarks/moflow/pytorch/runtime/generate.py b/training/benchmarks/moflow/pytorch/runtime/generate.py index de9369494..5e03f1db4 100644 --- a/training/benchmarks/moflow/pytorch/runtime/generate.py +++ b/training/benchmarks/moflow/pytorch/runtime/generate.py @@ -57,7 +57,7 @@ def infer(model: MoFlow, config: Config, device: torch.device, *, smiles_writer = SmilesWriter(args.predictions_path) snapshot_path = get_newest_checkpoint(args.results_dir) - config = CONFIGS[args.config_name] + config = CONFIGS[args.dataset_name] model = MoFlow(config) device = get_device(args.local_rank) diff --git a/training/benchmarks/moflow/pytorch/train/trainer.py b/training/benchmarks/moflow/pytorch/train/trainer.py index ab9a95d5b..98de66380 100755 --- a/training/benchmarks/moflow/pytorch/train/trainer.py +++ b/training/benchmarks/moflow/pytorch/train/trainer.py @@ -55,7 +55,7 @@ def __init__(self, driver: Driver, adapter, evaluator: Evaluator, def init(self): args = self.args dist_pytorch.main_proc_print("Init progress:") - self.config = CONFIGS[self.args.config_name] + self.config = CONFIGS[self.args.dataset_name] self.model = create_model(self.config) self.model.to(self.device) device = args.device From 01097740327bac8aed89cecba4f8beb2b035fc0b Mon Sep 17 00:00:00 2001 From: zhouyu Date: Mon, 26 Feb 2024 13:09:53 +0800 Subject: [PATCH 08/12] set time statistic variables to 0 --- training/benchmarks/moflow/pytorch/train/training_state.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/training/benchmarks/moflow/pytorch/train/training_state.py b/training/benchmarks/moflow/pytorch/train/training_state.py index 35e6dd53d..4a45ab93d 100755 --- a/training/benchmarks/moflow/pytorch/train/training_state.py +++ b/training/benchmarks/moflow/pytorch/train/training_state.py @@ -19,10 +19,9 @@ class TrainingState: end_training: bool = False converged: bool = False - train_time = 0.001 - no_eval_time = 0.001 - pure_compute_time = 0.001 - + train_time = 0 + no_eval_time = 0 + pure_compute_time = 0 num_trained_samples = 0 def status(self): From eb1fd00fd754e76abe208b72c1fe3b03cd141ecd Mon Sep 17 00:00:00 2001 From: zhouyu Date: Sat, 2 Mar 2024 19:11:20 +0800 Subject: [PATCH 09/12] update seed and target_nuv --- .../benchmarks/moflow/pytorch/config/_base.py | 4 +- .../moflow/pytorch/run_pretraining.py | 7 +-- .../moflow/pytorch/train/evaluator.py | 17 +++---- .../moflow/pytorch/train/trainer.py | 44 +++++++++++-------- .../moflow-pytorch/config/config_A100x1x8.py | 1 + .../moflow-pytorch/config/requirements.txt | 2 +- 6 files changed, 38 insertions(+), 37 deletions(-) diff --git a/training/benchmarks/moflow/pytorch/config/_base.py b/training/benchmarks/moflow/pytorch/config/_base.py index 9475338b6..26f9ab84c 100755 --- a/training/benchmarks/moflow/pytorch/config/_base.py +++ b/training/benchmarks/moflow/pytorch/config/_base.py @@ -36,10 +36,10 @@ train_batch_size: int = 512 eval_batch_size: int = 100 -target_nuv: float = 80 +target_nuv: float = 87.9 # Frequency for saving checkpoints, expressed in epochs. If -1 is provided, checkpoints will not be saved. -save_epochs: int = 5 +save_epochs: int = 50 # Evaluation frequency, expressed in epochs. If -1 is provided, an evaluation will not be performed. 
eval_epochs: int = 5
diff --git a/training/benchmarks/moflow/pytorch/run_pretraining.py b/training/benchmarks/moflow/pytorch/run_pretraining.py
index 07a216567..88a68e90a 100755
--- a/training/benchmarks/moflow/pytorch/run_pretraining.py
+++ b/training/benchmarks/moflow/pytorch/run_pretraining.py
@@ -97,6 +97,8 @@
     dist_pytorch.barrier(args.vendor)
 
     if not args.do_train:
+        res = evaluator.evaluate(args, trainer.config, acc_logger)
+        dist_pytorch.main_proc_print(f"evaluate results: {res}")
         return args, training_state
 
     model_driver.event(Event.INIT_END)
@@ -122,7 +124,6 @@
     first_epoch = 0
     step = 0
-
     if first_epoch > args.epochs:
         dist_pytorch.main_proc_print(
             f'Model was already trained for {first_epoch} epochs, skip pretraining'
         )
@@ -132,9 +133,9 @@
     epoch = first_epoch
     while epoch < args.epochs:
         training_state.epoch = epoch
-        step = trainer.train_one_epoch(train_dataloader, step)
+        trainer.train_one_epoch(train_dataloader, step)
         epoch += 1
-        if step >= args.steps:
+        if training_state.global_steps >= args.steps:
             break
 
     # TRAIN_END event
     training_state.train_time = time.time() - train_start_time
diff --git a/training/benchmarks/moflow/pytorch/train/evaluator.py b/training/benchmarks/moflow/pytorch/train/evaluator.py
index 6af54fb72..7bb0b39c0 100755
--- a/training/benchmarks/moflow/pytorch/train/evaluator.py
+++ b/training/benchmarks/moflow/pytorch/train/evaluator.py
@@ -40,7 +40,6 @@
             raise RuntimeError(
                 'Generating molecules from an untrained network! '
                 'If this was intentional, pass --allow_untrained flag.')
-        dist_pytorch.main_proc_print(f"ln_var ===> {ln_var}")
         model.to(device)
         model.eval()
 
@@ -48,7 +47,7 @@
         if args.steps == -1:
             args.steps = 1
         else:
-            args.steps = 100
+            args.steps = 1000
         dist_pytorch.main_proc_print(f"ln_var ===> {ln_var}. 
args.steps:{args.steps}") @@ -58,7 +57,6 @@ def evaluate(self, args, config, acc_logger): transform=partial(transform.transform_fn, config=config), ) train_idx = [t for t in range(len(dataset)) if t not in valid_idx] - n_train = len(train_idx) train_dataset = torch.utils.data.Subset(dataset, train_idx) train_x = torch.Tensor(np.array([a[0] for a in train_dataset])) train_adj = torch.Tensor(np.array([a[1] for a in train_dataset])) @@ -80,7 +78,6 @@ def evaluate(self, args, config, acc_logger): correct_validity=args.correct_validity, ) validity_info = check_validity(mols_batch) - novel_r, abs_novel_r = check_novelty( validity_info['valid_smiles'], train_smiles, @@ -99,12 +96,8 @@ def evaluate(self, args, config, acc_logger): 'abs_uniqueness': validity_info['abs_unique_ratio'], 'nuv': nuv, } - dist_pytorch.main_proc_print(f"step:{i} metrics:{metrics}") - if args.local_rank == 0: - acc_logger.update(metrics) - - if args.local_rank == 0: - stats = acc_logger.summarize(step=tuple()) - return stats + dist_pytorch.main_proc_print("metrics", i, metrics) + acc_logger.update(metrics) - return None + stats = acc_logger.summarize(step=tuple()) + return stats diff --git a/training/benchmarks/moflow/pytorch/train/trainer.py b/training/benchmarks/moflow/pytorch/train/trainer.py index 98de66380..e2d9db2f8 100755 --- a/training/benchmarks/moflow/pytorch/train/trainer.py +++ b/training/benchmarks/moflow/pytorch/train/trainer.py @@ -59,24 +59,23 @@ def init(self): self.model = create_model(self.config) self.model.to(self.device) device = args.device - model = self.model x, adj, *_ = next(iter(self.train_dataloader)) x = x.to(device) adj = adj.to(device) with autocast(enabled=args.amp): initialize(self.model, (adj, x)) - model.to(memory_format=torch.channels_last) + self.model.to(memory_format=torch.channels_last) adj.to(memory_format=torch.channels_last) if args.jit: - model.bond_model = torch.jit.script(model.bond_model) - model.atom_model = torch.jit.script(model.atom_model) + self.model.bond_model = torch.jit.script(self.model.bond_model) + self.model.atom_model = torch.jit.script(self.model.atom_model) # make one pass in both directions to make sure that model works with torch.no_grad(): - _ = model(adj, x) - _ = model.reverse( + _ = self.model(adj, x) + _ = self.model.reverse( torch.randn(args.train_batch_size, self.config.z_dim, device=device)) @@ -107,7 +106,7 @@ def _get_callables(self): loss_callable = self.loss_module return model_callable, loss_callable - def train_one_epoch(self, train_dataloader, step) -> int: + def train_one_epoch(self, train_dataloader, step): model = self.model args = self.args device = self.device @@ -117,6 +116,8 @@ def train_one_epoch(self, train_dataloader, step) -> int: local_rank = self.args.local_rank is_distributed = args.distributed + if step > 0 and step > self.training_state.global_steps: + self.training_state.global_steps = step if local_rank == 0: self.acc_logger.reset() @@ -131,7 +132,7 @@ def train_one_epoch(self, train_dataloader, step) -> int: for i, batch in enumerate(train_dataloader): if local_rank == 0: self.perf_logger.update() - step += 1 + self.training_state.global_steps += 1 self.optimizer.zero_grad() x = batch[0].to(device) adj = batch[1].to(device=device, memory_format=torch.channels_last) @@ -139,8 +140,10 @@ def train_one_epoch(self, train_dataloader, step) -> int: pure_compute_start_time = time.time() # Forward, backward and optimize - with_cuda_graph = (args.cuda_graph and step >= args.warmup_steps - and x.size(0) == args.train_batch_size) + 
with_cuda_graph = (
+                args.cuda_graph
+                and self.training_state.global_steps >= args.warmup_steps
+                and x.size(0) == args.train_batch_size)
 
             with autocast(enabled=args.amp, cache_enabled=not with_cuda_graph):
                 output = model(adj, x, with_cuda_graph=with_cuda_graph)
@@ -158,7 +161,8 @@
                 clip_grad_norm_(model.parameters(), args.clip)
             self.optimizer.step()
 
-            self.training_state.pure_compute_time += time.time() - pure_compute_start_time
+            self.training_state.pure_compute_time += time.time(
+            ) - pure_compute_start_time
 
             # Print log info
             if (i + 1) % args.log_interval == 0:
@@ -176,19 +180,21 @@
                     self.acc_logger.summarize(step=(epoch, i, i))
                     self.perf_logger.summarize(step=(epoch, i, i))
-
-            if step >= args.steps:
+            if self.training_state.global_steps >= args.steps:
                 break
 
-        self.training_state.num_trained_samples += len(train_dataloader.dataset)
+        self.training_state.num_trained_samples += len(
+            train_dataloader.dataset)
         self.training_state.no_eval_time += time.time() - noeval_start_time
 
         if (epoch + 1) % args.eval_epochs == 0:
             with autocast(enabled=args.amp):
                 metrics = run_validation(self.model, self.config,
-                                         self.loss_callable.ln_var.item(), args,
-                                         is_distributed, world_size, device)
+                                         self.loss_callable.ln_var.item(),
+                                         args, is_distributed, world_size,
+                                         device)
-            dist_pytorch.main_proc_print(f"epoch:{epoch+1}, metrics:{metrics}")
+            dist_pytorch.main_proc_print(
+                f"epoch:{epoch+1}, metrics:{metrics}")
 
             if local_rank == 0:
                 self.acc_logger.update(metrics)
@@ -206,8 +212,8 @@
                     self.optimizer,
                     self.loss_callable.ln_var.item(),
                     epoch,
-                    keep=5)
-        return step
+                    keep=3)
+
 
 def run_validation(model: MoFlow, config: Config, ln_var: float,
                    args: argparse.Namespace, is_distributed: bool,
diff --git a/training/nvidia/moflow-pytorch/config/config_A100x1x8.py b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
index 9ead7fcdb..d7c74636d 100644
--- a/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
+++ b/training/nvidia/moflow-pytorch/config/config_A100x1x8.py
@@ -2,3 +2,4 @@
 train_batch_size = 512
 eval_batch_size = 100
 
+seed = 42
\ No newline at end of file
diff --git a/training/nvidia/moflow-pytorch/config/requirements.txt b/training/nvidia/moflow-pytorch/config/requirements.txt
index 768ed454f..441b965e1 100644
--- a/training/nvidia/moflow-pytorch/config/requirements.txt
+++ b/training/nvidia/moflow-pytorch/config/requirements.txt
@@ -1,2 +1,2 @@
-rdkit==2023.9.3
+rdkit-pypi
 git+https://github.com/NVIDIA/dllogger#egg=dllogger
\ No newline at end of file
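The seed = 42 pinned above matters because, as the README note added two patches later explains, the nuv result is sensitive to sampling randomness. A sketch of the kind of global seeding this implies (the benchmark's own seeding helper may differ in detail):

```python
import random

import numpy as np
import torch


def set_seed(seed: int, rank: int = 0) -> None:
    """Seed every RNG the training loop touches. Offsetting by rank keeps
    distributed workers decorrelated while the run stays reproducible."""
    random.seed(seed + rank)
    np.random.seed(seed + rank)
    torch.manual_seed(seed + rank)
    torch.cuda.manual_seed_all(seed + rank)
```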
From bfc3d1dc478c8c6d2dde7622adaac81555bb94a2 Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Sat, 2 Mar 2024 19:18:17 +0800
Subject: [PATCH 10/12] update 1x8 result for official bs

---
 training/nvidia/moflow-pytorch/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/nvidia/moflow-pytorch/README.md b/training/nvidia/moflow-pytorch/README.md
index 841b99c67..bf006dc8e 100644
--- a/training/nvidia/moflow-pytorch/README.md
+++ b/training/nvidia/moflow-pytorch/README.md
@@ -44,8 +44,8 @@
 
 | Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_nuv | mem |
 | ----------------- | --------- | ----------------- | -------- | ------- | ------- | ------ | --------- | --------- |
-| A100 single node, 8 cards (1x8) | amp | / | 3098 | 24128 | 27514 | 30696 | 78.4 | 11.8/40.0 |
-| A100 single node, 8 cards (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | 76.1 | 34.6/40.0 |
+| A100 single node, 8 cards (1x8) | amp | / | 3220 | 27023 | 27519 | 30789 | 88.45 | 11.8/40.0 |
+| A100 single node, 8 cards (1x8) | amp | bs=3072,lr=0.003 | / | 31279 | 38043 | 46823 | / | 34.6/40.0 |
 | A100 single node, 1 card (1x1) | amp | bs=3584,lr=0.0001 | / | 5810 | 5992 | 6387 | / | 37.9/40.0 |
 | A100 two nodes, 8 cards each (2x8) | amp | bs=3072,lr=0.0005 | / | 47655 | 63957 | 90228 | / | 34.5/40.0 |

From 9d09a230fa1848fcd211314da1c5c96697303981 Mon Sep 17 00:00:00 2001
From: zhouyu
Date: Sat, 2 Mar 2024 19:30:13 +0800
Subject: [PATCH 11/12] update notice for readme

---
 training/nvidia/moflow-pytorch/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/nvidia/moflow-pytorch/README.md b/training/nvidia/moflow-pytorch/README.md
index bf006dc8e..9fa9609be 100644
--- a/training/nvidia/moflow-pytorch/README.md
+++ b/training/nvidia/moflow-pytorch/README.md
@@ -53,5 +53,6 @@
 
 > Note
 > The [NUV](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow#results) reported in the original repository is the average over **20** runs with **20** different random seeds.
-> Because of limited machine resources, we could not run such a large number of validation experiments; the results here may therefore differ somewhat from those values.
+> This model's results are quite sensitive to randomness: the seed, the sampling temperature, and similar settings all affect the nuv value; see the [training-stability-test](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/DrugDiscovery/MoFlow#training-stability-test) section for details.
+> If a vendor run does not converge on the first attempt, it should be retried **several times**.

From 67bd369debff0dfb3b452cdee942c9bb0264d87d Mon Sep 17 00:00:00 2001
From: Zhou Yu
Date: Sat, 2 Mar 2024 19:31:38 +0800
Subject: [PATCH 12/12] Update test_conf.py

---
 training/run_benchmarks/config/test_conf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index 4aa5e1051..ee609e952 100644
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -80,7 +80,7 @@
     # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech",
     # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/",
     # "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/",
-    # "moflow:pytorch_1.13:A100:1:1:1": "/raid/dataset/MoFlow/data/",
+    # "moflow:pytorch_1.13:A100:1:8:1": "/raid/dataset/MoFlow/data/",
 
     # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/",
@@ -152,4 +152,4 @@
     # "faster_rcnn:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/",
     # "retinanet:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/",
     # "resnet50:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
-    # "swin_transformer:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
\ No newline at end of file
+    # "swin_transformer:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
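Finally, the test_conf case keys edited throughout this series encode the run topology. The parser below illustrates the assumed layout, model:framework:hardware:nnodes:nproc_per_node:repeats (field names are inferred from the config_A100x1x8.py-style file naming, not taken from FlagPerf documentation):

```python
from dataclasses import dataclass


@dataclass
class CaseKey:
    model: str
    framework: str
    hardware: str
    nnodes: int
    nproc_per_node: int
    repeats: int


def parse_case_key(key: str) -> CaseKey:
    model, framework, hardware, nnodes, nproc, repeats = key.split(":")
    return CaseKey(model, framework, hardware, int(nnodes), int(nproc),
                   int(repeats))


# "moflow:pytorch_1.13:A100:1:8:1" -> one node, eight A100s, run once
print(parse_case_key("moflow:pytorch_1.13:A100:1:8:1"))
```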