Commit

initial commit
lhumbeck authored Jul 19, 2021
1 parent ee58f56 commit 9fedcd3
Showing 8 changed files with 920 additions and 0 deletions.
375 changes: 375 additions & 0 deletions chemfold/analysis.py

Large diffs are not rendered by default.

63 changes: 63 additions & 0 deletions chemfold/analyze.py
@@ -0,0 +1,63 @@
import argparse
import logging
import sys
import analysis as cfa

#logging
shandler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s')
shandler.setFormatter(formatter)
shandler.setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(shandler)


parser = argparse.ArgumentParser(description="Suite of different folding methods for chemical structures suitable for privacy-preserving federated learning, plus analysis scripts.")
#parser.add_argument("--x", help="Descriptor file (matrix market or numpy)", type=str, required=True)
parser.add_argument("--inp", help="Folder with input files, e.g., descriptor file (matrix market or numpy, distances, y matrix, sparse chem model folder + json summary,Matrix of ECFP features [compounds x ecfp], .mtx file)", type=str, required=True)
parser.add_argument("--analysis", help="which analysis to run options are: [all, performance, imbalance, similarity]", choices=['all','performance','imbalance','similarity'], default='all')
parser.add_argument("--out", help="Output directory for the clusters file (.npy)", type=str, required=True)

#performance
parser.add_argument("--psc", help="Path to sparsechem models sparsechem/models", type=str, required=False)
parser.add_argument("--baseline_prefix", help="Baseline prefix", type=str, default='random')

#similarity
parser.add_argument("--maxsample", help="Maximal number of compound pairs to sample", type=int, default=10000000)
parser.add_argument("--batchsize", help="Number of compound pairs to precessed in one batch", type=int, default=50000)
parser.add_argument("--numbins", help="Number of bins of the histogram", type=int, default=10)
parser.add_argument("--precision", help="Maximal tolerated change of intra fold ratio per similarity bin between to batches to reach convergence",type=float,default=0.02)
parser.add_argument("--minpop", help="Minimal population of intra-fold samples per similarity bin to required to reach convergence",type=int, default=3)
parser.add_argument("--rseed",help="random seed to use", type=int, default = 123456)

args = parser.parse_args()
logger.info(args)


#call of analysis functions

if args.analysis:
    if args.analysis == 'imbalance' or args.analysis == 'all':
        #balance
        cfa.balance(args.inp, args.out)


    if args.analysis == 'performance' or args.analysis == 'all':
        #performance
        #Beforehand, models have to be trained with the baseline and the alternative fold splitting using SparseChem.
        #We recognize the conditions by the prefix given as an option during the training job submission. This prefix is found in the model json files. All fold models trained under the same conditions are expected to have the same prefix. We now declare which condition is the baseline.

        #TODO: add an example call of sparsechem train
        #cd ./examples/chembl
        #python train.py --prefix [specify folding method here]


        #Analysis of the effect of the fold splits on model performance, using random folding as the baseline. It assumes you have created the models for 5 validation folds and reads in the json files created by SparseChem. Each folding scheme should use a different sparsechem --prefix to distinguish them.
        cfa.performance(args.psc, args.baseline_prefix, args.out)


    if args.analysis == 'similarity' or args.analysis == 'all':
        #similarity
        #This analyzes the fraction of randomly chosen compound pairs per similarity bin that come from the same fold. Ideally the fraction of intra-fold pairs for high-similarity bins should be as high as possible.
        cfa.similarity(args.inp, args.out, args.maxsample, args.batchsize, args.numbins, args.precision, args.minpop, args.rseed)
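
For reference, a minimal sketch (not part of this commit) of driving the same analyses without the CLI wrapper, assuming chemfold/analysis.py exposes balance, performance and similarity with the signatures used above; the folder paths are hypothetical:

import analysis as cfa  # same import as at the top of analyze.py

inp, out = './tuner_output', './chemfold_analysis'              # hypothetical input/output folders
cfa.balance(inp, out)                                           # fold imbalance of data and labels
cfa.performance('./sparsechem/models', 'random', out)           # fold-split effect on performance vs. the 'random' baseline
cfa.similarity(inp, out, 10000000, 50000, 10, 0.02, 3, 123456)  # positional arguments mirror the argparse defaults above
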
84 changes: 84 additions & 0 deletions chemfold/random_split.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""random_split.py: Assigns a random fold number to each input.
Assigns a random fold number to each input of the ML model, so that the resulting distribution of data and labels over the folds and the model performance can be compared with other methods such as LFC, LSH or a scaffold-based fold splitting. This script is part of the folding single-partner study of WP1 of the MELLODDY project. It is highly inspired by the script 'split_by_scaffold.py' by Ansgar Schuffenhauer.
"""
#ATTENTION! WARNING! This is a beta version. Use at your own risk!


import argparse
import numpy as np
import pandas as pd
import json
import hashlib
import random
import hmac
import os, sys, logging

__author__ = "Lina Humbeck, Boehringer Ingelheim Pharma GmbH & Co.KG"
__contributors__ = "Ansgar Schuffenhauer, Novartis"

shandler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s')
shandler.setFormatter(formatter)
shandler.setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(shandler)

def hashed_fold_scaffold(scaff, secret, nfolds=5):
    scaff = str(scaff).encode("ASCII")
    #h = sha256([scaff], secret)
    m = hmac.new(secret, b'', hashlib.sha256)
    m.update(scaff)
    random.seed(m.digest(), version=2)
    return random.randint(0, nfolds - 1)
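
# Example (added note, not in the original commit): because the HMAC digest of the input
# seeds Python's RNG, the fold assignment is deterministic for a fixed secret, e.g.
#   hashed_fold_scaffold('c1ccccc1', b'partner_key') == hashed_fold_scaffold('c1ccccc1', b'partner_key')  # always True
# while changing the secret generally moves compounds to different folds.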

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='compute random fold vectors')
    parser.add_argument('--infolder', help="Input folder, working directory of melloddy_tuner, expected to contain results and results_tmp subfolders", type=str, required=True)
    parser.add_argument('--out', help="output folder", type=str, required=True)
    parser.add_argument('--params_file', help='path to parameters.json file', required=True)

    args = parser.parse_args()
    logger.info(args)

    with open(args.params_file) as cnf_f:
        params = json.load(cnf_f)

    #read standardized structures
    fname = os.path.join(args.infolder,'results_tmp','standardization','T2_standardized.csv')
    t2_standardized = pd.read_csv(fname)
    logger.info('read in standardized structure file: {}'.format(fname))

    t2_unique = t2_standardized[['canonical_smiles']].drop_duplicates()
    t2_scaff = t2_standardized.merge(t2_unique[['canonical_smiles']], on='canonical_smiles')
    logger.info('extracted unique compounds: {0}'.format(len(t2_scaff)))

    #now we need to merge the T5 table, to map the compounds to unique descriptor IDs
    t5 = pd.read_csv(os.path.join(args.infolder,'results_tmp','descriptors','mapping_table_T5.csv'))
    t2_joined = t2_scaff.merge(t5, on='input_compound_id')
    t6_scaff = t2_joined.groupby('descriptor_vector_id')['canonical_smiles'].min()

    #now we need to join to the final file with continuous descriptor vector IDs
    t2_t11 = pd.read_csv(os.path.join(args.infolder,'results','T11.csv'))
    t2_t11_joined = t2_t11.join(t6_scaff, on='descriptor_vector_id')

    ## creating output path if it does not exist
    os.makedirs(args.out, exist_ok=True)

    #now we need to generate the hash-based fold assignments
    key = params['key']['key'].encode("ASCII")
    nfolds = params['lsh']['nfolds']
    t2_t11_joined['fold'] = t2_t11_joined['canonical_smiles'].apply(lambda x: hashed_fold_scaffold(x, secret=key, nfolds=nfolds))
    t2_t11_joined.to_csv(os.path.join(args.out,'T2_T11_random_folds.csv'))
    logger.info('written out csv file with fold info')

    #now create numpy arrays for folds
    random_fold = np.zeros(len(t2_t11_joined['cont_descriptor_vector_id']))
    random_fold[t2_t11_joined['cont_descriptor_vector_id']] = t2_t11_joined['fold']

    np.save(os.path.join(args.out,'random_folds.npy'), random_fold)
    logger.info('written out numpy arrays with folds')
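
The scatter assignment above turns the per-compound fold labels into a dense numpy vector indexed by cont_descriptor_vector_id. A toy illustration (not part of this commit) of the same fancy-indexing pattern, with invented values:

import numpy as np

cont_ids = np.array([0, 1, 2, 3, 4])   # cont_descriptor_vector_id values
fold_ids = np.array([2, 0, 4, 1, 3])   # fold assigned to each descriptor
dense = np.zeros(len(cont_ids))
dense[cont_ids] = fold_ids              # dense[i] now holds the fold of descriptor i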

235 changes: 235 additions & 0 deletions chemfold/scaffold_network.py
@@ -0,0 +1,235 @@
import argparse
import numpy as np
import pandas as pd
import pickle

import json

import hashlib
import random
import hmac

from rdkit import Chem
from rdkit.Chem.Scaffolds import rdScaffoldNetwork, MurckoScaffold
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import PandasTools

import os, sys, logging

def has_unusual_ringsize(mol):
    return len([len(x) for x in mol.GetRingInfo().AtomRings() if len(x) > 6 or len(x) < 5]) > 0

def has_macrocycle(mol):
    return len([len(x) for x in mol.GetRingInfo().AtomRings() if len(x) > 9]) > 0

def murcko_scaff_smiles(mol_smiles):
    mol = Chem.MolFromSmiles(mol_smiles)
    if mol is not None:
        return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))
    else:
        return None

def hashed_fold_scaffold(scaff, secret, nfolds=5):
    scaff = str(scaff).encode("ASCII")
    #h = sha256([scaff], secret)  # redundant: the digest is recomputed below
    m = hmac.new(secret, b'', hashlib.sha256)
    m.update(scaff)
    random.seed(m.digest(), version=2)
    return random.randint(0, nfolds - 1)

def sha256(inputs, secret):
    """
    Keyed hashing (HMAC) function using Python's built-in hmac and hashlib packages.
    We are using SHA256 (it is equal in security to SHA512).
    :param inputs: input strings
    :param secret: given pharma partner key
    :return:
    """
    m = hmac.new(secret, b'', hashlib.sha256)
    for i in inputs:
        m.update(i)
    return m.digest()


if __name__ == "__main__":

    shandler = logging.StreamHandler(stream=sys.stdout)
    formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s')
    shandler.setFormatter(formatter)
    shandler.setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    logger.addHandler(shandler)


    parser = argparse.ArgumentParser(description='compute fold vectors based on scaffolds')
    parser.add_argument('--infolder', help="Input folder, working directory of melloddy_tuner, expected to contain results and results_tmp subfolders", type=str, required=True)
    parser.add_argument('--out', help="output folder", type=str, required=True)
    parser.add_argument('--params_file', help='path to parameters.json file', required=True)
    parser.add_argument('--sn_flattenIsotopes', help='controls flattenIsotopes parameter of scaffold network', type=bool, default=True)
    parser.add_argument('--sn_includeGenericBondScaffolds', help='controls includeGenericBondScaffolds parameter of scaffold network', type=bool, default=False)
    parser.add_argument('--sn_includeGenericScaffolds', help='controls includeGenericScaffolds parameter of scaffold network', type=bool, default=False)
    parser.add_argument('--sn_includeScaffoldsWithAttachments', help='controls includeScaffoldsWithAttachments parameter of scaffold network', type=bool, default=False)
    parser.add_argument('--sn_includeScaffoldsWithoutAttachments', help='controls includeScaffoldsWithoutAttachments parameter of scaffold network', type=bool, default=True)
    parser.add_argument('--sn_pruneBeforeFragmenting', help='controls pruneBeforeFragmenting parameter of scaffold network', type=bool, default=True)
    parser.add_argument('--nrings', help="preferred number of rings for the scaffold, defines the cut-level of the scaffold network", type=int, default=3)


    args = parser.parse_args()
    logger.info(args)

    with open(args.params_file) as cnf_f:
        params = json.load(cnf_f)

    snparams = rdScaffoldNetwork.ScaffoldNetworkParams()
    snparams.flattenIsotopes = args.sn_flattenIsotopes
    snparams.includeGenericBondScaffolds = args.sn_includeGenericBondScaffolds
    snparams.includeGenericScaffolds = args.sn_includeGenericScaffolds
    snparams.includeScaffoldsWithAttachments = args.sn_includeScaffoldsWithAttachments
    snparams.includeScaffoldsWithoutAttachments = args.sn_includeScaffoldsWithoutAttachments
    snparams.pruneBeforeFragmenting = args.sn_pruneBeforeFragmenting



    #read standardized structures
    fname = os.path.join(args.infolder,'results_tmp','standardization','T2_standardized.csv')
    t2_standardized = pd.read_csv(fname)
    logger.info('read in standardized structure file: {}'.format(fname))

    #calculate Murcko scaffold
    t2_standardized['murcko_scaff_smiles'] = t2_standardized['canonical_smiles'].apply(murcko_scaff_smiles)
    logger.info('created murcko scaff smiles column')

    #create unique Murcko scaffolds dataframe
    t2_murcko_unique = t2_standardized[['murcko_scaff_smiles']].drop_duplicates()
    logger.info('extracted unique murcko scaffolds: {0}'.format(len(t2_murcko_unique)))


    #create a scaffold network for each unique Murcko scaffold
    sn_dict = {}
    failed_list = []
    for row in t2_murcko_unique.itertuples():
        logger.debug('processing\t|{}|'.format(row.murcko_scaff_smiles))
        if row.murcko_scaff_smiles is not None and row.murcko_scaff_smiles != '' and pd.notnull(row.murcko_scaff_smiles):
            mol = Chem.MolFromSmiles(row.murcko_scaff_smiles)
            if mol is not None:
                try:
                    sn_dict[row.murcko_scaff_smiles] = rdScaffoldNetwork.CreateScaffoldNetwork([mol], snparams)
                except:
                    failed_list.append(row.murcko_scaff_smiles)
            else:
                failed_list.append(row.murcko_scaff_smiles)
        else:
            failed_list.append(row.murcko_scaff_smiles)
    logger.info('generated scaffold network')

    ## creating output path if it does not exist
    os.makedirs(args.out, exist_ok=True)

    #save the scaffold networks
    with open(os.path.join(args.out,'T2_sn_dict.pkl'),'wb') as pklf:
        pickle.dump(sn_dict, pklf)
    logger.info('written pickle file for scaffold networks')

    #with open(os.path.join(args.out,'T2_sn_dict.pkl'),'rb') as pklf:
    #    sn_dict = pickle.load(pklf)
    #logger.info('read pickle file for scaffold networks')

    #write out failed list
    logger.info('encountered {} murcko scaffolds failing in scaffold network'.format(len(failed_list)))
    fname = os.path.join(args.out,'failed_sn_murcko_scaffolds.txt')
    pd.Series(failed_list).to_csv(fname, sep='\t')

    #make a unique data frame of scaffold network nodes, and calculate properties
    all_nodes = set([])
    for sn in sn_dict.values():
        all_nodes |= set([str(n) for n in sn.nodes])
    logger.info('extracted nodes')
    logger.info('number of nodes {}'.format(len(all_nodes)))

    node_df = pd.DataFrame({'node_smiles': list(all_nodes)})
    PandasTools.AddMoleculeColumnToFrame(node_df, 'node_smiles', 'mol', includeFingerprints=False)
    node_df['num_rings'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumRings)
    node_df['num_rings_delta'] = (node_df['num_rings'] - args.nrings).abs()
    node_df['num_rbonds'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumRotatableBonds)
    node_df['num_hrings'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumHeterocycles)
    node_df['num_arings'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumAromaticRings)
    node_df['num_bridge'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumBridgeheadAtoms)
    node_df['num_spiro'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumSpiroAtoms)
    node_df['has_macrocyle'] = node_df['mol'].apply(has_macrocycle)
    node_df['has_unusual_ring_size'] = node_df['mol'].apply(has_unusual_ringsize)
    logger.info('calculated node properties')
    #the following lines define the precedence of nodes, we aim at ~3 ring scaffolds
    priority_cols = ['num_rings_delta','has_macrocyle','num_rbonds','num_bridge','num_spiro','has_unusual_ring_size','num_hrings','num_arings','node_smiles']
    priority_asc = [True, False, True, False, False, False, False, True, True]
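    # (added note, not part of the original commit) sort_values below compares these columns
    # left to right, each in the direction given by the matching entry of priority_asc
    # (True = ascending); the node that sorts first gets the lowest 'priority' value and is
    # later picked as the preferred scaffold via idxmin.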
    node_df.sort_values(priority_cols, ascending=priority_asc, inplace=True)
    node_df['priority'] = np.arange(len(node_df))
    node_df.set_index('node_smiles', inplace=True)
    logger.info('sorted nodes by priority')

    with open(os.path.join(args.out,'T2_sn_nodes.pkl'),'wb') as pklf:
        pickle.dump(node_df, pklf)
    logger.info('written pickle file for node table')
    #with open(os.path.join(args.out,'T2_sn_nodes.pkl'),'rb') as pklf:
    #    node_df = pickle.load(pklf)
    #logger.info('read pickle file for node table')

    #assign for each Murcko scaffold the preferred sn scaffold
    for row in t2_murcko_unique.itertuples():
        if row.murcko_scaff_smiles in sn_dict:
            nodelist = [str(n) for n in sn_dict[row.murcko_scaff_smiles].nodes]
            node_data = node_df.loc[nodelist]
            t2_murcko_unique.loc[row.Index,'sn_scaff_smiles'] = node_data['priority'].idxmin()
    logger.info('assigned preferred scaffolds')

    with open(os.path.join(args.out,'T2_sn_murcko_unique.pkl'),'wb') as pklf:
        pickle.dump(t2_murcko_unique, pklf)
    logger.info('written pickle file for unique Murcko scaffolds')

    #join back to original scaffolds
    t2_scaff = t2_standardized.merge(t2_murcko_unique[['murcko_scaff_smiles','sn_scaff_smiles']], on='murcko_scaff_smiles')
    t2_scaff.to_csv(os.path.join(args.out,'T2_sn_scaff.csv'))
    logger.info('written scaffold assignment')

    #now we need to merge the T5 table, to get scaffold assignments to unique descriptor IDs
    t5 = pd.read_csv(os.path.join(args.infolder,'results_tmp','descriptors','mapping_table_T5.csv'))
    t2_joined = t2_scaff.merge(t5, on='input_compound_id')

    #in rare cases of collisions (different scaffolds, but identical descriptors) we break the ties by alphabetical precedence
    t6_scaff = t2_joined.groupby('descriptor_vector_id')[['murcko_scaff_smiles','sn_scaff_smiles']].min()
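    # Illustrative example (not part of the original commit): .min() on a string column keeps
    # the lexicographically smallest value per group, which is the alphabetical tie-break, e.g.
    #   pd.DataFrame({'descriptor_vector_id': [7, 7],
    #                 'murcko_scaff_smiles': ['c1ccncc1', 'c1ccccc1']}
    #                ).groupby('descriptor_vector_id').min()
    #   # -> keeps 'c1ccccc1', since 'c1ccccc1' < 'c1ccncc1'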

    #identify collisions: different scaffold, same descriptor vector
    nunique_murcko = t2_joined.groupby('descriptor_vector_id')['murcko_scaff_smiles'].nunique()
    collisions_murcko = t2_joined[t2_joined['descriptor_vector_id'].isin(nunique_murcko[nunique_murcko > 1].index)].sort_values('descriptor_vector_id')
    collisions_murcko.to_csv(os.path.join(args.out,'collisions_murcko.txt'), sep='\t')
    logger.info('Colliding Murcko scaffolds observed on {0} records and written out'.format(len(collisions_murcko)))

    nunique_sn = t2_joined.groupby('descriptor_vector_id')['sn_scaff_smiles'].nunique()
    collisions_sn = t2_joined[t2_joined['descriptor_vector_id'].isin(nunique_sn[nunique_sn > 1].index)].sort_values('descriptor_vector_id')
    collisions_sn.to_csv(os.path.join(args.out,'collisions_sn.txt'), sep='\t')
    logger.info('Colliding SN scaffolds observed on {0} records and written out'.format(len(collisions_sn)))

    #now we need to join to the final file with continuous descriptor vector IDs
    t2_t11 = pd.read_csv(os.path.join(args.infolder,'results','T11.csv'))
    t2_t11_joined = t2_t11.join(t6_scaff, on='descriptor_vector_id')

    #now we need to generate the hashes for the scaffolds
    key = params['key']['key'].encode("ASCII")
    nfolds = params['lsh']['nfolds']
    t2_t11_joined['sn_scaff_fold'] = t2_t11_joined['sn_scaff_smiles'].apply(lambda x: hashed_fold_scaffold(x, secret=key, nfolds=nfolds))
    t2_t11_joined['murcko_scaff_fold'] = t2_t11_joined['murcko_scaff_smiles'].apply(lambda x: hashed_fold_scaffold(x, secret=key, nfolds=nfolds))
    t2_t11_joined.to_csv(os.path.join(args.out,'T2_T11_scaff_folds.csv'))
    logger.info('written out csv file with fold info')

    #now create numpy arrays for folds
    fold_sn = np.zeros(len(t2_t11_joined['cont_descriptor_vector_id']))
    fold_sn[t2_t11_joined['cont_descriptor_vector_id']] = t2_t11_joined['sn_scaff_fold']
    fold_murcko = np.zeros(len(t2_t11_joined['cont_descriptor_vector_id']))
    fold_murcko[t2_t11_joined['cont_descriptor_vector_id']] = t2_t11_joined['murcko_scaff_fold']

    np.save(os.path.join(args.out,'murcko_scaff_folds.npy'), fold_murcko)
    np.save(os.path.join(args.out,'sn_scaff_folds.npy'), fold_sn)
    logger.info('written out numpy arrays with folds')
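
For orientation, a minimal sketch (not part of this commit) of how a downstream script could consume one of the fold vectors written above to hold out a single validation fold; the descriptor matrix path and file name are hypothetical:

import numpy as np
import scipy.sparse

folds = np.load('out/murcko_scaff_folds.npy')        # fold id per cont_descriptor_vector_id
X = scipy.sparse.load_npz('tuner_out/X_ecfp.npz')    # hypothetical descriptor matrix, rows aligned with folds
va_fold = 0
X_train, X_valid = X[folds != va_fold], X[folds == va_fold]
print('train rows: {}, validation rows: {}'.format(X_train.shape[0], X_valid.shape[0]))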
