Initial commit

chao1224 · Aug 30, 2023 · 8f31c0e · 8f31c0e
1 parent 93493be
commit 8f31c0e
Show file tree

Hide file tree

Showing 83 changed files with 17,386 additions and 1 deletion.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,51 @@
+# Build environment for MoleculeSTM on top of NVIDIA's PyTorch 22.01 container.
+FROM nvcr.io/nvidia/pytorch:22.01-py3 AS base
+
+# Create a new user with a home directory and bash as the login shell.
+RUN useradd -ms /bin/bash shengchaol
+
+# Switching to this user is disabled, so every step below runs as the
+# base image's default user.
+# USER shengchaol
+
+# Set the working directory.
+WORKDIR /home/shengchaol
+
+# Open up the home directory and the tool/installation directories in a single
+# layer (one recursive pass instead of five separate layers).
+# NOTE(review): mode 777 makes system directories world-writable; it is kept
+# unchanged here because later steps or runtime users may rely on it.
+RUN chmod -R 777 \
+        /home/shengchaol \
+        /usr/bin \
+        /bin \
+        /usr/local \
+        /opt/conda
+
+# Pin the interpreter before installing anything that depends on it.
+RUN conda install -y python=3.7
+
+RUN conda install -y -c rdkit rdkit=2020.09.1.0
+RUN conda install -y -c conda-forge -c pytorch pytorch=1.9.1
+
+RUN conda install -y -c pyg -c conda-forge pyg
+
+# General-purpose Python dependencies; --no-cache-dir keeps pip's download
+# cache out of the image layer.
+RUN pip install --no-cache-dir \
+        requests \
+        tqdm \
+        matplotlib \
+        spacy
+
+# for SciBert
+RUN conda install -y boto3
+RUN pip install --no-cache-dir transformers
+
+# for MoleculeNet
+RUN pip install --no-cache-dir ogb
+
+# install pysmilesutils
+RUN python -m pip install --no-cache-dir git+https://github.com/MolecularAI/pysmilesutils.git
+
+RUN pip install --no-cache-dir deepspeed
+
+# install Megatron (bundled inside the MolBART repository)
+RUN cd /tmp \
+    && git clone https://github.com/MolecularAI/MolBART.git --branch megatron-molbart-with-zinc \
+    && cd /tmp/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism \
+    && pip install --no-cache-dir .
+
+# install apex with its C++ and CUDA extensions; clone and build share one
+# layer so a cached clone can never be paired with a fresh build step.
+RUN cd /tmp \
+    && git clone https://github.com/chao1224/apex.git \
+    && cd /tmp/apex/ \
+    && pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+
+
+# Expose the port for Jupyter (documentation only; it is not published automatically).
+EXPOSE 8888
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,64 @@
+NVIDIA Source Code License for MoleculeSTM
+
+1. Definitions
+
+“Licensor” means any person or entity that distributes its Work.
+
+“Software” means the original work of authorship made available under this License.
+
+“Work” means the Software and any additions to or derivative works of the Software that are made available under
+this License.
+
+The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under
+U.S. copyright law; provided, however, that for the purposes of this License, derivative works shall not include
+works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work.
+
+Works, including the Software, are “made available” under this License by including in or with the Work either
+(a) a copyright notice referencing the applicability of this License to the Work, or (b) a copy of this License.
+
+2. License Grant
+
+2.1 Copyright Grant. Subject to the terms and conditions of this License, each Licensor grants to you a perpetual,
+worldwide, non-exclusive, royalty-free, copyright license to reproduce, prepare derivative works of, publicly
+display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form.
+
+3. Limitations
+
+3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this License, (b) you
+include a complete copy of this License with your distribution, and (c) you retain without modification any
+copyright, patent, trademark, or attribution notices that are present in the Work.
+
+3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and
+distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use
+limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works
+that are subject to Your Terms. Notwithstanding Your Terms, this License (including the redistribution
+requirements in Section 3.1) will continue to apply to the Work itself.
+
+3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use
+non-commercially. Notwithstanding the foregoing, NVIDIA and its affiliates may use the Work and any derivative
+works commercially. As used herein, “non-commercially” means for research or evaluation purposes only.
+
+3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim,
+cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then
+your rights under this License from such Licensor (including the grant in Section 2.1) will terminate immediately.
+
+3.5 Trademarks. This License does not grant any rights to use any Licensor’s or its affiliates’ names, logos,
+or trademarks, except as necessary to reproduce the notices described in this License.
+
+3.6 Termination. If you violate any term of this License, then your rights under this License (including the
+grant in Section 2.1) will terminate immediately.
+
+4. Disclaimer of Warranty.
+
+THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING
+WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU
+BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
+
+5. Limitation of Liability.
+
+EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING
+NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
+INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR
+INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR
+DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
diff --git a/MoleculeSTM/__init__.py b/MoleculeSTM/__init__.py
diff --git a/MoleculeSTM/backup/downstream_language_edit_step_00_check_reconstruction.py b/MoleculeSTM/backup/downstream_language_edit_step_00_check_reconstruction.py
@@ -0,0 +1,106 @@
+import argparse
+import os
+import numpy as np
+from rdkit import Chem
+from rdkit.Chem import Descriptors
+
+import torch
+from torch.utils.data import DataLoader as torch_DataLoader
+
+from MoleculeSTM.utils import freeze_network
+from MoleculeSTM.datasets import ZINC15_Datasets_Only_SMILES, PubChem_Datasets_Only_SMILES
+from MoleculeSTM.models.mega_molbart.mega_mol_bart import MegaMolBART
+
+props = [
+    "qed", "MolWt", "MolLogP", "TPSA",
+    "HeavyAtomCount", "NumAromaticRings", "NumHAcceptors", "NumHDonors",  "NumRotatableBonds"
+]
+props = [
+    "MolWt", "MolLogP"
+]
+prop_pred = [(n, func) for n, func in Descriptors.descList if n.split("_")[-1] in props]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--device", type=int, default=0)
+    parser.add_argument("--verbose", type=int, default=1)
+    parser.add_argument("--dataspace_path", type=str, default="../../Datasets")
+    parser.add_argument("--dataset", type=str, default="ZINC15")
+    parser.add_argument("--molecule_type", type=str, default="MegaMolBART", choices=["MegaMolBART", "Graph"])
+
+    ########## for MoleculeSTM ##########
+    parser.add_argument("--CLIP_input_model_dir", type=str, default="../../pretrained_model")
+    parser.add_argument("--SSL_emb_dim", type=int, default=256)
+
+    ########## for generation ##########
+    parser.add_argument("--generation_model_dir", type=str, default="../../Datasets/pretrained_MegaMolBART/checkpoints")
+
+    ########## for optimization ##########
+    parser.add_argument("--batch_size", type=int, default=64)
+    parser.add_argument("--num_workers", type=int, default=8)
+
+    args = parser.parse_args()
+    print(args)
+
+    # This is loading from the pretarined_MegaMolBART
+    MegaMolBART_wrapper = MegaMolBART(input_dir=args.generation_model_dir, output_dir=None)
+    molecule_model_generation = MegaMolBART_wrapper.model
+    print("Loading from pretrained MegaMolBART ({}).".format(args.generation_model_dir))
+    molecule_dim_generation = 256
+
+    device = torch.device("cuda:" + str(args.device)) \
+        if torch.cuda.is_available() else torch.device("cpu")
+    molecule_model_generation = molecule_model_generation.to(device)
+
+    np.random.seed(args.seed)
+    torch.random.manual_seed(args.seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(args.seed)
+    device = torch.device("cuda:" + str(args.device)) \
+        if torch.cuda.is_available() else torch.device("cpu")
+
+    freeze_network(molecule_model_generation)
+    molecule_model_generation.eval()
+
+    if args.molecule_type == "MegaMolBART":
+        if args.dataset == "ZINC15":
+            dataset_root = os.path.join(args.dataspace_path, "ZINC15_data")
+            dataset = ZINC15_Datasets_Only_SMILES(dataset_root)
+        elif "PubChem" in args.dataset:
+            dataset_root = os.path.join(args.dataspace_path, "PubChem_data")
+            dataset = PubChem_Datasets_Only_SMILES(dataset_root)
+        else:
+            raise Exception
+        dataloader_class = torch_DataLoader
+    else:
+        raise Exception
+
+    dataloader = dataloader_class(dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
+
+    for batch_idx, batch in enumerate(dataloader):
+        SMILES_list = batch
+        print("SMILES_list", SMILES_list)
+
+        for original_SMILES in SMILES_list:
+            mol = Chem.MolFromSmiles(original_SMILES)
+            for name, func in prop_pred:
+                value = func(mol)
+                print("{}: {}".format(name, value))
+            canon_original_SMILES = Chem.MolToSmiles(mol)
+
+            latent_code_init, pad_mask_init = MegaMolBART_wrapper.smileslist2embedding_model_given(molecule_model_generation, [original_SMILES])  # [pad, B, d], [pad, B]
+            print("latent_code:\t", latent_code_init[0, :, :5])
+
+            latent_code_init, pad_mask_init = MegaMolBART_wrapper.smileslist2embedding_model_given(molecule_model_generation, [canon_original_SMILES])  # [pad, B, d], [pad, B]
+            print("latent_code:\t", latent_code_init[0, :, :5])
+
+            generated_SMILES = MegaMolBART_wrapper.inverse_transform([latent_code_init], pad_mask_init.bool().cuda(), k=1, sanitize=True)
+            print("original SMILES:          \t", original_SMILES)
+            print("original SMILES (canon):  \t", canon_original_SMILES)
+            print("reconstructured SMILES:   \t", generated_SMILES[0])
+            print()
+
+        if batch_idx >= 9:
+            break