-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
83 changed files
with
17,386 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Development image for MoleculeSTM, built on NVIDIA's PyTorch container.
FROM nvcr.io/nvidia/pytorch:22.01-py3 AS base

# Create a new user with a home directory and bash as the login shell.
RUN useradd -ms /bin/bash shengchaol

# The image keeps running as root; uncomment to switch to the new user.
# USER shengchaol

# Set working directory.
WORKDIR /home/shengchaol

# NOTE(review): world-writable system directories are a security risk.
# Presumably this lets a non-root runtime user modify the environment --
# confirm it is still needed and narrow it if possible.
# Kept before the installs below, as in the original ordering, and done in a
# single layer instead of five.
RUN chmod -R 777 \
        /home/shengchaol \
        /usr/bin \
        /bin \
        /usr/local \
        /opt/conda

# Core scientific stack. The installs run in the original order; the conda
# caches are cleaned in the same layer so they do not end up in the image.
RUN conda install -y python=3.7 \
    && conda install -y -c rdkit rdkit=2020.09.1.0 \
    && conda install -y -c conda-forge -c pytorch pytorch=1.9.1 \
    && conda install -y -c pyg -c conda-forge pyg \
    && conda clean -ay

# General-purpose Python utilities.
RUN pip install --no-cache-dir \
        matplotlib \
        requests \
        spacy \
        tqdm

# for SciBert
RUN conda install -y boto3 \
    && conda clean -ay \
    && pip install --no-cache-dir transformers

# for MoleculeNet
RUN pip install --no-cache-dir ogb

# install pysmilesutils
RUN python -m pip install --no-cache-dir git+https://github.com/MolecularAI/pysmilesutils.git

RUN pip install --no-cache-dir deepspeed

# install Megatron (the clone is removed in the same layer once installed)
RUN git clone https://github.com/MolecularAI/MolBART.git --branch megatron-molbart-with-zinc /tmp/MolBART \
    && pip install --no-cache-dir /tmp/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism \
    && rm -rf /tmp/MolBART

# install apex with its C++ and CUDA extensions (clone removed after install)
RUN git clone https://github.com/chao1224/apex.git /tmp/apex \
    && pip install -v --disable-pip-version-check --no-cache-dir \
        --global-option="--cpp_ext" --global-option="--cuda_ext" /tmp/apex/ \
    && rm -rf /tmp/apex

# Expose port for Jupyter (documentation only; publish it with `docker run -p`).
EXPOSE 8888
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
NVIDIA Source Code License for MoleculeSTM | ||
|
||
1. Definitions | ||
|
||
“Licensor” means any person or entity that distributes its Work. | ||
|
||
“Software” means the original work of authorship made available under this License. | ||
|
||
“Work” means the Software and any additions to or derivative works of the Software that are made available under | ||
this License. | ||
|
||
The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under | ||
U.S. copyright law; provided, however, that for the purposes of this License, derivative works shall not include | ||
works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work. | ||
|
||
Works, including the Software, are “made available” under this License by including in or with the Work either | ||
(a) a copyright notice referencing the applicability of this License to the Work, or (b) a copy of this License. | ||
|
||
2. License Grant | ||
|
||
2.1 Copyright Grant. Subject to the terms and conditions of this License, each Licensor grants to you a perpetual, | ||
worldwide, non-exclusive, royalty-free, copyright license to reproduce, prepare derivative works of, publicly | ||
display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form. | ||
|
||
3. Limitations | ||
|
||
3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this License, (b) you | ||
include a complete copy of this License with your distribution, and (c) you retain without modification any | ||
copyright, patent, trademark, or attribution notices that are present in the Work. | ||
|
||
3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and | ||
distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use | ||
limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works | ||
that are subject to Your Terms. Notwithstanding Your Terms, this License (including the redistribution | ||
requirements in Section 3.1) will continue to apply to the Work itself. | ||
|
||
3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use | ||
non-commercially. Notwithstanding the foregoing, NVIDIA and its affiliates may use the Work and any derivative | ||
works commercially. As used herein, “non-commercially” means for research or evaluation purposes only. | ||
|
||
3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, | ||
cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then | ||
your rights under this License from such Licensor (including the grant in Section 2.1) will terminate immediately. | ||
|
||
3.5 Trademarks. This License does not grant any rights to use any Licensor’s or its affiliates’ names, logos, | ||
or trademarks, except as necessary to reproduce the notices described in this License. | ||
|
||
3.6 Termination. If you violate any term of this License, then your rights under this License (including the | ||
grant in Section 2.1) will terminate immediately. | ||
|
||
4. Disclaimer of Warranty. | ||
|
||
THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING | ||
WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU | ||
BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. | ||
|
||
5. Limitation of Liability. | ||
|
||
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING | ||
NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, | ||
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR | ||
INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR | ||
DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN | ||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. |
Empty file.
106 changes: 106 additions & 0 deletions
106
MoleculeSTM/backup/downstream_language_edit_step_00_check_reconstruction.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import argparse | ||
import os | ||
import numpy as np | ||
from rdkit import Chem | ||
from rdkit.Chem import Descriptors | ||
|
||
import torch | ||
from torch.utils.data import DataLoader as torch_DataLoader | ||
|
||
from MoleculeSTM.utils import freeze_network | ||
from MoleculeSTM.datasets import ZINC15_Datasets_Only_SMILES, PubChem_Datasets_Only_SMILES | ||
from MoleculeSTM.models.mega_molbart.mega_mol_bart import MegaMolBART | ||
|
||
# Full set of descriptor names this script was written to report.
# Kept for reference only: it is not used to build `prop_pred`.
all_props = [
    "qed", "MolWt", "MolLogP", "TPSA",
    "HeavyAtomCount", "NumAromaticRings", "NumHAcceptors", "NumHDonors", "NumRotatableBonds"
]
# Descriptors that are actually evaluated and printed for each molecule.
props = [
    "MolWt", "MolLogP"
]
# (name, function) pairs from RDKit's descriptor table whose name, taken
# after its last "_", appears in `props`.
prop_pred = [(n, func) for n, func in Descriptors.descList if n.split("_")[-1] in props]
|
||
|
||
if __name__ == "__main__":
    # Sanity check for the generative model: encode SMILES strings from the
    # chosen dataset with MegaMolBART, decode them again, and print the
    # original, canonical and reconstructed SMILES next to each other.
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--verbose", type=int, default=1)
    parser.add_argument("--dataspace_path", type=str, default="../../Datasets")
    parser.add_argument("--dataset", type=str, default="ZINC15")
    parser.add_argument("--molecule_type", type=str, default="MegaMolBART", choices=["MegaMolBART", "Graph"])

    ########## for MoleculeSTM ##########
    parser.add_argument("--CLIP_input_model_dir", type=str, default="../../pretrained_model")
    parser.add_argument("--SSL_emb_dim", type=int, default=256)

    ########## for generation ##########
    parser.add_argument("--generation_model_dir", type=str, default="../../Datasets/pretrained_MegaMolBART/checkpoints")

    ########## for optimization ##########
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--num_workers", type=int, default=8)

    args = parser.parse_args()
    print(args)

    # Resolve the target device once; fall back to CPU when CUDA is unavailable.
    if torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    # This is loading from the pretrained_MegaMolBART
    MegaMolBART_wrapper = MegaMolBART(input_dir=args.generation_model_dir, output_dir=None)
    molecule_model_generation = MegaMolBART_wrapper.model
    print("Loading from pretrained MegaMolBART ({}).".format(args.generation_model_dir))
    molecule_model_generation = molecule_model_generation.to(device)

    # Seed the RNGs (after model loading, as in the original flow).
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

    # The generator is only evaluated here, never trained.
    freeze_network(molecule_model_generation)
    molecule_model_generation.eval()

    if args.molecule_type != "MegaMolBART":
        raise ValueError(
            "molecule_type '{}' is not supported by this script".format(args.molecule_type))

    if args.dataset == "ZINC15":
        dataset_root = os.path.join(args.dataspace_path, "ZINC15_data")
        dataset = ZINC15_Datasets_Only_SMILES(dataset_root)
    elif "PubChem" in args.dataset:
        dataset_root = os.path.join(args.dataspace_path, "PubChem_data")
        dataset = PubChem_Datasets_Only_SMILES(dataset_root)
    else:
        raise ValueError("Unsupported dataset: {}".format(args.dataset))

    dataloader = torch_DataLoader(
        dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)

    for batch_idx, batch in enumerate(dataloader):
        SMILES_list = batch
        print("SMILES_list", SMILES_list)

        for original_SMILES in SMILES_list:
            mol = Chem.MolFromSmiles(original_SMILES)
            if mol is None:
                # RDKit returns None for SMILES it cannot parse; skip them
                # instead of crashing inside the descriptor functions.
                print("Skipping invalid SMILES: \t", original_SMILES)
                print()
                continue

            for name, func in prop_pred:
                value = func(mol)
                print("{}: {}".format(name, value))
            canon_original_SMILES = Chem.MolToSmiles(mol)

            # Embedding of the SMILES exactly as stored in the dataset,
            # printed for comparison with the canonical form below.
            latent_code_raw, _ = MegaMolBART_wrapper.smileslist2embedding_model_given(
                molecule_model_generation, [original_SMILES])  # [pad, B, d], [pad, B]
            print("latent_code:\t", latent_code_raw[0, :, :5])

            # Embedding of the canonical SMILES; this one is decoded.
            latent_code_init, pad_mask_init = MegaMolBART_wrapper.smileslist2embedding_model_given(
                molecule_model_generation, [canon_original_SMILES])  # [pad, B, d], [pad, B]
            print("latent_code:\t", latent_code_init[0, :, :5])

            # Keep the mask on the selected device rather than whatever the
            # default CUDA device is, so --device and the CPU fallback work.
            generated_SMILES = MegaMolBART_wrapper.inverse_transform(
                [latent_code_init], pad_mask_init.bool().to(device), k=1, sanitize=True)
            print("original SMILES: \t", original_SMILES)
            print("original SMILES (canon): \t", canon_original_SMILES)
            print("reconstructured SMILES: \t", generated_SMILES[0])
            print()

        # Only the first ten batches are inspected.
        if batch_idx >= 9:
            break
Oops, something went wrong.