Commit

initial commit
lhumbeck authored Jul 19, 2021
1 parent ee58f56 commit 9fedcd3
Showing 8 changed files with 920 additions and 0 deletions.
375 changes: 375 additions & 0 deletions chemfold/analysis.py

Large diffs are not rendered by default.

63 changes: 63 additions & 0 deletions chemfold/analyze.py
@@ -0,0 +1,63 @@
import argparse
import logging
import sys
import analysis as cfa

#logging
shandler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s')
shandler.setFormatter(formatter)
shandler.setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(shandler)


parser = argparse.ArgumentParser(description="Suite of different folding methods for chemical structures suitable for privacy-preserving federated learning, plus analysis scripts.")
#parser.add_argument("--x", help="Descriptor file (matrix market or numpy)", type=str, required=True)
parser.add_argument("--inp", help="Folder with input files, e.g., descriptor file (matrix market or numpy, distances, y matrix, sparse chem model folder + json summary,Matrix of ECFP features [compounds x ecfp], .mtx file)", type=str, required=True)
parser.add_argument("--analysis", help="which analysis to run options are: [all, performance, imbalance, similarity]", choices=['all','performance','imbalance','similarity'], default='all')
parser.add_argument("--out", help="Output directory for the clusters file (.npy)", type=str, required=True)

#performance
parser.add_argument("--psc", help="Path to sparsechem models sparsechem/models", type=str, required=False)
parser.add_argument("--baseline_prefix", help="Baseline prefix", type=str, default='random')

#similarity
parser.add_argument("--maxsample", help="Maximal number of compound pairs to sample", type=int, default=10000000)
parser.add_argument("--batchsize", help="Number of compound pairs to precessed in one batch", type=int, default=50000)
parser.add_argument("--numbins", help="Number of bins of the histogram", type=int, default=10)
parser.add_argument("--precision", help="Maximal tolerated change of intra fold ratio per similarity bin between to batches to reach convergence",type=float,default=0.02)
parser.add_argument("--minpop", help="Minimal population of intra-fold samples per similarity bin to required to reach convergence",type=int, default=3)
parser.add_argument("--rseed",help="random seed to use", type=int, default = 123456)

args = parser.parse_args()
logger.info(args)


#call of analysis functions

if args.analysis:
    if args.analysis == 'imbalance' or args.analysis == 'all':
        #balance
        cfa.balance(args.inp, args.out)


    if args.analysis == 'performance' or args.analysis == 'all':
        #performance
        #Beforehand, models have to be trained with the baseline and the alternative fold splitting using SparseChem.
        #We recognize the conditions by the prefix given as an option during the training job submission. This prefix is found in the model json files. All fold models trained under the same conditions are expected to have the same prefix. We now declare which condition is the baseline.

        #TODO: add an example call of sparsechem train
        #cd ./examples/chembl
        #python train.py --prefix [specify folding method here]


        #Analysis of the effect of the fold splits on model performance, using random folding as the baseline. It assumes you have created the models for 5 validation folds and reads in the json files created by SparseChem. Each folding scheme should use a different sparsechem --prefix to distinguish them.
        cfa.performance(args.psc, args.baseline_prefix, args.out)


    if args.analysis == 'similarity' or args.analysis == 'all':
        #similarity
        #This analyzes the fraction of randomly chosen compound pairs per similarity bin that come from the same fold. Ideally the fraction of intra-fold pairs for high-similarity bins should be as high as possible.
        cfa.similarity(args.inp, args.out, args.maxsample, args.batchsize, args.numbins, args.precision, args.minpop, args.rseed)
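
For reference, a minimal sketch (not part of this commit) of driving the same analyses without the CLI wrapper, assuming chemfold/analysis.py exposes balance, performance and similarity with the signatures used above; the folder paths are hypothetical:

import analysis as cfa  # same import as at the top of analyze.py

inp, out = './tuner_output', './chemfold_analysis'              # hypothetical input/output folders
cfa.balance(inp, out)                                           # fold imbalance of data and labels
cfa.performance('./sparsechem/models', 'random', out)           # fold-split effect on performance vs. the 'random' baseline
cfa.similarity(inp, out, 10000000, 50000, 10, 0.02, 3, 123456)  # positional arguments mirror the argparse defaults above
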
84 changes: 84 additions & 0 deletions chemfold/random_split.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""random_split.py: Assigns a random fold number to each input.
Assigns a random fold number to each input of the ML model, so that the resulting distribution of data and labels over the folds and the model performance can be compared with other methods such as LFC, LSH or a scaffold-based fold splitting. This script is part of the folding single-partner study of WP1 of the MELLODDY project. It is highly inspired by the script 'split_by_scaffold.py' by Ansgar Schuffenhauer.
"""
#ATTENTION! WARNING! This is a beta version. Use at your own risk!


import argparse
import numpy as np
import pandas as pd
import json
import hashlib
import random
import hmac
import os, sys, logging

__author__ = "Lina Humbeck, Boehringer Ingelheim Pharma GmbH & Co.KG"
__contributors__ = "Ansgar Schuffenhauer, Novartis"

shandler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s')
shandler.setFormatter(formatter)
shandler.setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(shandler)

def hashed_fold_scaffold(scaff, secret, nfolds=5):
    scaff = str(scaff).encode("ASCII")
    #h = sha256([scaff], secret)
    m = hmac.new(secret, b'', hashlib.sha256)
    m.update(scaff)
    random.seed(m.digest(), version=2)
    return random.randint(0, nfolds - 1)
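
# Example (added note, not in the original commit): because the HMAC digest of the input
# seeds Python's RNG, the fold assignment is deterministic for a fixed secret, e.g.
#   hashed_fold_scaffold('c1ccccc1', b'partner_key') == hashed_fold_scaffold('c1ccccc1', b'partner_key')  # always True
# while changing the secret generally moves compounds to different folds.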

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='compute random fold vectors')
    parser.add_argument('--infolder', help="Input folder, working directory of melloddy_tuner, expected to contain results and results_tmp subfolders", type=str, required=True)
    parser.add_argument('--out', help="output folder", type=str, required=True)
    parser.add_argument('--params_file', help='path to parameters.json file', required=True)

    args = parser.parse_args()
    logger.info(args)

    with open(args.params_file) as cnf_f:
        params = json.load(cnf_f)

    #read standardized structures
    fname = os.path.join(args.infolder,'results_tmp','standardization','T2_standardized.csv')
    t2_standardized = pd.read_csv(fname)
    logger.info('read in standardized structure file: {}'.format(fname))

    t2_unique = t2_standardized[['canonical_smiles']].drop_duplicates()
    t2_scaff = t2_standardized.merge(t2_unique[['canonical_smiles']], on='canonical_smiles')
    logger.info('extracted unique compounds: {0}'.format(len(t2_scaff)))

    #now we need to merge the T5 table, to map the compounds to unique descriptor IDs
    t5 = pd.read_csv(os.path.join(args.infolder,'results_tmp','descriptors','mapping_table_T5.csv'))
    t2_joined = t2_scaff.merge(t5, on='input_compound_id')
    t6_scaff = t2_joined.groupby('descriptor_vector_id')['canonical_smiles'].min()

    #now we need to join to the final file with continuous descriptor vector IDs
    t2_t11 = pd.read_csv(os.path.join(args.infolder,'results','T11.csv'))
    t2_t11_joined = t2_t11.join(t6_scaff, on='descriptor_vector_id')

    ## creating output path if it does not exist
    os.makedirs(args.out, exist_ok=True)

    #now we need to generate the hash-based fold assignments
    key = params['key']['key'].encode("ASCII")
    nfolds = params['lsh']['nfolds']
    t2_t11_joined['fold'] = t2_t11_joined['canonical_smiles'].apply(lambda x: hashed_fold_scaffold(x, secret=key, nfolds=nfolds))
    t2_t11_joined.to_csv(os.path.join(args.out,'T2_T11_random_folds.csv'))
    logger.info('written out csv file with fold info')

    #now create numpy arrays for folds
    random_fold = np.zeros(len(t2_t11_joined['cont_descriptor_vector_id']))
    random_fold[t2_t11_joined['cont_descriptor_vector_id']] = t2_t11_joined['fold']

    np.save(os.path.join(args.out,'random_folds.npy'), random_fold)
    logger.info('written out numpy arrays with folds')
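
The scatter assignment above turns the per-compound fold labels into a dense numpy vector indexed by cont_descriptor_vector_id. A toy illustration (not part of this commit) of the same fancy-indexing pattern, with invented values:

import numpy as np

cont_ids = np.array([0, 1, 2, 3, 4])   # cont_descriptor_vector_id values
fold_ids = np.array([2, 0, 4, 1, 3])   # fold assigned to each descriptor
dense = np.zeros(len(cont_ids))
dense[cont_ids] = fold_ids              # dense[i] now holds the fold of descriptor i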

235 changes: 235 additions & 0 deletions chemfold/scaffold_network.py
@@ -0,0 +1,235 @@
import argparse
import numpy as np
import pandas as pd
import pickle

import json

import hashlib
import random
import hmac

from rdkit import Chem
from rdkit.Chem.Scaffolds import rdScaffoldNetwork, MurckoScaffold
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import PandasTools

import os, sys, logging

def has_unusual_ringsize(mol):
    return len([len(x) for x in mol.GetRingInfo().AtomRings() if len(x) > 6 or len(x) < 5]) > 0

def has_macrocycle(mol):
    return len([len(x) for x in mol.GetRingInfo().AtomRings() if len(x) > 9]) > 0

def murcko_scaff_smiles(mol_smiles):
    mol = Chem.MolFromSmiles(mol_smiles)
    if mol is not None:
        return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))
    else:
        return None

def hashed_fold_scaffold(scaff, secret, nfolds=5):
    scaff = str(scaff).encode("ASCII")
    #h = sha256([scaff], secret)  # redundant: the digest is recomputed below
    m = hmac.new(secret, b'', hashlib.sha256)
    m.update(scaff)
    random.seed(m.digest(), version=2)
    return random.randint(0, nfolds - 1)

def sha256(inputs, secret):
    """
    Keyed hashing (HMAC) function using Python's built-in hmac and hashlib packages.
    We are using SHA256 (it is equal in security to SHA512).
    :param inputs: input strings
    :param secret: given pharma partner key
    :return:
    """
    m = hmac.new(secret, b'', hashlib.sha256)
    for i in inputs:
        m.update(i)
    return m.digest()


if __name__ == "__main__":

    shandler = logging.StreamHandler(stream=sys.stdout)
    formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s')
    shandler.setFormatter(formatter)
    shandler.setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    logger.addHandler(shandler)


    parser = argparse.ArgumentParser(description='compute fold vectors based on scaffolds')
    parser.add_argument('--infolder', help="Input folder, working directory of melloddy_tuner, expected to contain results and results_tmp subfolders", type=str, required=True)
    parser.add_argument('--out', help="output folder", type=str, required=True)
    parser.add_argument('--params_file', help='path to parameters.json file', required=True)
    parser.add_argument('--sn_flattenIsotopes', help='controls flattenIsotopes parameter of scaffold network', type=bool, default=True)
    parser.add_argument('--sn_includeGenericBondScaffolds', help='controls includeGenericBondScaffolds parameter of scaffold network', type=bool, default=False)
    parser.add_argument('--sn_includeGenericScaffolds', help='controls includeGenericScaffolds parameter of scaffold network', type=bool, default=False)
    parser.add_argument('--sn_includeScaffoldsWithAttachments', help='controls includeScaffoldsWithAttachments parameter of scaffold network', type=bool, default=False)
    parser.add_argument('--sn_includeScaffoldsWithoutAttachments', help='controls includeScaffoldsWithoutAttachments parameter of scaffold network', type=bool, default=True)
    parser.add_argument('--sn_pruneBeforeFragmenting', help='controls pruneBeforeFragmenting parameter of scaffold network', type=bool, default=True)
    parser.add_argument('--nrings', help="preferred number of rings for the scaffold, defines the cut-level of the scaffold network", type=int, default=3)


    args = parser.parse_args()
    logger.info(args)

    with open(args.params_file) as cnf_f:
        params = json.load(cnf_f)

    snparams = rdScaffoldNetwork.ScaffoldNetworkParams()
    snparams.flattenIsotopes = args.sn_flattenIsotopes
    snparams.includeGenericBondScaffolds = args.sn_includeGenericBondScaffolds
    snparams.includeGenericScaffolds = args.sn_includeGenericScaffolds
    snparams.includeScaffoldsWithAttachments = args.sn_includeScaffoldsWithAttachments
    snparams.includeScaffoldsWithoutAttachments = args.sn_includeScaffoldsWithoutAttachments
    snparams.pruneBeforeFragmenting = args.sn_pruneBeforeFragmenting



    #read standardized structures
    fname = os.path.join(args.infolder,'results_tmp','standardization','T2_standardized.csv')
    t2_standardized = pd.read_csv(fname)
    logger.info('read in standardized structure file: {}'.format(fname))

    #calculate Murcko scaffold
    t2_standardized['murcko_scaff_smiles'] = t2_standardized['canonical_smiles'].apply(murcko_scaff_smiles)
    logger.info('created murcko scaff smiles column')

    #create unique Murcko scaffolds dataframe
    t2_murcko_unique = t2_standardized[['murcko_scaff_smiles']].drop_duplicates()
    logger.info('extracted unique murcko scaffolds: {0}'.format(len(t2_murcko_unique)))


    #create a scaffold network for each unique Murcko scaffold
    sn_dict = {}
    failed_list = []
    for row in t2_murcko_unique.itertuples():
        logger.debug('processing\t|{}|'.format(row.murcko_scaff_smiles))
        if row.murcko_scaff_smiles is not None and row.murcko_scaff_smiles != '' and pd.notnull(row.murcko_scaff_smiles):
            mol = Chem.MolFromSmiles(row.murcko_scaff_smiles)
            if mol is not None:
                try:
                    sn_dict[row.murcko_scaff_smiles] = rdScaffoldNetwork.CreateScaffoldNetwork([mol], snparams)
                except:
                    failed_list.append(row.murcko_scaff_smiles)
            else:
                failed_list.append(row.murcko_scaff_smiles)
        else:
            failed_list.append(row.murcko_scaff_smiles)
    logger.info('generated scaffold network')

    ## creating output path if it does not exist
    os.makedirs(args.out, exist_ok=True)

    #save the scaffold networks
    with open(os.path.join(args.out,'T2_sn_dict.pkl'),'wb') as pklf:
        pickle.dump(sn_dict, pklf)
    logger.info('written pickle file for scaffold networks')

    #with open(os.path.join(args.out,'T2_sn_dict.pkl'),'rb') as pklf:
    #    sn_dict = pickle.load(pklf)
    #logger.info('read pickle file for scaffold networks')

    #write out failed list
    logger.info('encountered {} murcko scaffolds failing in scaffold network'.format(len(failed_list)))
    fname = os.path.join(args.out,'failed_sn_murcko_scaffolds.txt')
    pd.Series(failed_list).to_csv(fname, sep='\t')

    #make a unique data frame of scaffold network nodes, and calculate properties
    all_nodes = set([])
    for sn in sn_dict.values():
        all_nodes |= set([str(n) for n in sn.nodes])
    logger.info('extracted nodes')
    logger.info('number of nodes {}'.format(len(all_nodes)))

    node_df = pd.DataFrame({'node_smiles': list(all_nodes)})
    PandasTools.AddMoleculeColumnToFrame(node_df, 'node_smiles', 'mol', includeFingerprints=False)
    node_df['num_rings'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumRings)
    node_df['num_rings_delta'] = (node_df['num_rings'] - args.nrings).abs()
    node_df['num_rbonds'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumRotatableBonds)
    node_df['num_hrings'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumHeterocycles)
    node_df['num_arings'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumAromaticRings)
    node_df['num_bridge'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumBridgeheadAtoms)
    node_df['num_spiro'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumSpiroAtoms)
    node_df['has_macrocyle'] = node_df['mol'].apply(has_macrocycle)
    node_df['has_unusual_ring_size'] = node_df['mol'].apply(has_unusual_ringsize)
    logger.info('calculated node properties')
    #the following lines define the precedence of nodes, we aim at ~3 ring scaffolds
    priority_cols = ['num_rings_delta','has_macrocyle','num_rbonds','num_bridge','num_spiro','has_unusual_ring_size','num_hrings','num_arings','node_smiles']
    priority_asc = [True, False, True, False, False, False, False, True, True]
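    # (added note, not part of the original commit) sort_values below compares these columns
    # left to right, each in the direction given by the matching entry of priority_asc
    # (True = ascending); the node that sorts first gets the lowest 'priority' value and is
    # later picked as the preferred scaffold via idxmin.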
    node_df.sort_values(priority_cols, ascending=priority_asc, inplace=True)
    node_df['priority'] = np.arange(len(node_df))
    node_df.set_index('node_smiles', inplace=True)
    logger.info('sorted nodes by priority')

    with open(os.path.join(args.out,'T2_sn_nodes.pkl'),'wb') as pklf:
        pickle.dump(node_df, pklf)
    logger.info('written pickle file for node table')
    #with open(os.path.join(args.out,'T2_sn_nodes.pkl'),'rb') as pklf:
    #    node_df = pickle.load(pklf)
    #logger.info('read pickle file for node table')

    #assign for each Murcko scaffold the preferred sn scaffold
    for row in t2_murcko_unique.itertuples():
        if row.murcko_scaff_smiles in sn_dict:
            nodelist = [str(n) for n in sn_dict[row.murcko_scaff_smiles].nodes]
            node_data = node_df.loc[nodelist]
            t2_murcko_unique.loc[row.Index,'sn_scaff_smiles'] = node_data['priority'].idxmin()
    logger.info('assigned preferred scaffolds')

    with open(os.path.join(args.out,'T2_sn_murcko_unique.pkl'),'wb') as pklf:
        pickle.dump(t2_murcko_unique, pklf)
    logger.info('written pickle file for unique Murcko scaffolds')

    #join back to original scaffolds
    t2_scaff = t2_standardized.merge(t2_murcko_unique[['murcko_scaff_smiles','sn_scaff_smiles']], on='murcko_scaff_smiles')
    t2_scaff.to_csv(os.path.join(args.out,'T2_sn_scaff.csv'))
    logger.info('written scaffold assignment')

    #now we need to merge the T5 table, to get scaffold assignments to unique descriptor IDs
    t5 = pd.read_csv(os.path.join(args.infolder,'results_tmp','descriptors','mapping_table_T5.csv'))
    t2_joined = t2_scaff.merge(t5, on='input_compound_id')

    #in rare cases of collisions (different scaffolds, but identical descriptors) we break the ties by alphabetical precedence
    t6_scaff = t2_joined.groupby('descriptor_vector_id')[['murcko_scaff_smiles','sn_scaff_smiles']].min()
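    # Illustrative example (not part of the original commit): .min() on a string column keeps
    # the lexicographically smallest value per group, which is the alphabetical tie-break, e.g.
    #   pd.DataFrame({'descriptor_vector_id': [7, 7],
    #                 'murcko_scaff_smiles': ['c1ccncc1', 'c1ccccc1']}
    #                ).groupby('descriptor_vector_id').min()
    #   # -> keeps 'c1ccccc1', since 'c1ccccc1' < 'c1ccncc1'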

    #identify collisions: different scaffold, same descriptor vector
    nunique_murcko = t2_joined.groupby('descriptor_vector_id')['murcko_scaff_smiles'].nunique()
    collisions_murcko = t2_joined[t2_joined['descriptor_vector_id'].isin(nunique_murcko[nunique_murcko > 1].index)].sort_values('descriptor_vector_id')
    collisions_murcko.to_csv(os.path.join(args.out,'collisions_murcko.txt'), sep='\t')
    logger.info('Colliding Murcko scaffolds observed on {0} records and written out'.format(len(collisions_murcko)))

    nunique_sn = t2_joined.groupby('descriptor_vector_id')['sn_scaff_smiles'].nunique()
    collisions_sn = t2_joined[t2_joined['descriptor_vector_id'].isin(nunique_sn[nunique_sn > 1].index)].sort_values('descriptor_vector_id')
    collisions_sn.to_csv(os.path.join(args.out,'collisions_sn.txt'), sep='\t')
    logger.info('Colliding SN scaffolds observed on {0} records and written out'.format(len(collisions_sn)))

    #now we need to join to the final file with continuous descriptor vector IDs
    t2_t11 = pd.read_csv(os.path.join(args.infolder,'results','T11.csv'))
    t2_t11_joined = t2_t11.join(t6_scaff, on='descriptor_vector_id')

    #now we need to generate the hashes for the scaffolds
    key = params['key']['key'].encode("ASCII")
    nfolds = params['lsh']['nfolds']
    t2_t11_joined['sn_scaff_fold'] = t2_t11_joined['sn_scaff_smiles'].apply(lambda x: hashed_fold_scaffold(x, secret=key, nfolds=nfolds))
    t2_t11_joined['murcko_scaff_fold'] = t2_t11_joined['murcko_scaff_smiles'].apply(lambda x: hashed_fold_scaffold(x, secret=key, nfolds=nfolds))
    t2_t11_joined.to_csv(os.path.join(args.out,'T2_T11_scaff_folds.csv'))
    logger.info('written out csv file with fold info')

    #now create numpy arrays for folds
    fold_sn = np.zeros(len(t2_t11_joined['cont_descriptor_vector_id']))
    fold_sn[t2_t11_joined['cont_descriptor_vector_id']] = t2_t11_joined['sn_scaff_fold']
    fold_murcko = np.zeros(len(t2_t11_joined['cont_descriptor_vector_id']))
    fold_murcko[t2_t11_joined['cont_descriptor_vector_id']] = t2_t11_joined['murcko_scaff_fold']

    np.save(os.path.join(args.out,'murcko_scaff_folds.npy'), fold_murcko)
    np.save(os.path.join(args.out,'sn_scaff_folds.npy'), fold_sn)
    logger.info('written out numpy arrays with folds')
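
For orientation, a minimal sketch (not part of this commit) of how a downstream script could consume one of the fold vectors written above to hold out a single validation fold; the descriptor matrix path and file name are hypothetical:

import numpy as np
import scipy.sparse

folds = np.load('out/murcko_scaff_folds.npy')        # fold id per cont_descriptor_vector_id
X = scipy.sparse.load_npz('tuner_out/X_ecfp.npz')    # hypothetical descriptor matrix, rows aligned with folds
va_fold = 0
X_train, X_valid = X[folds != va_fold], X[folds == va_fold]
print('train rows: {}, validation rows: {}'.format(X_train.shape[0], X_valid.shape[0]))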
