diff --git a/chemfold/analysis.py b/chemfold/analysis.py new file mode 100644 index 0000000..b33a89d --- /dev/null +++ b/chemfold/analysis.py @@ -0,0 +1,375 @@ +import pandas as pd +import numpy as np +import glob +import json +import re +import os, sys, logging +import argparse +from scipy.sparse import csr_matrix, coo_matrix +from pandas import Series, DataFrame +from scipy.stats import binom, fisher_exact, ks_2samp, chi2_contingency, norm +from statsmodels import robust +import scipy.io as sio + + +#logging +shandler = logging.StreamHandler(stream=sys.stdout) +formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s') +shandler.setFormatter(formatter) +shandler.setLevel(logging.INFO) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(shandler) + + +# statistic functions + +# Determine whether the the folds are diverging significantly in terms of their count of actives +def chi2_contingency_pval_act(row): + try: + u = row.unstack().astype(int) + usum = u.sum() + chi2 = chi2_contingency(u.loc[:,usum>0].values) + return chi2[1] + except: + print(usum) + raise + +# Determine whether the distribution of datapoints per fold diverges signficantly from random +def chi2_contingency_pval_split(row): + try: + obs = row.unstack().sum().astype(int) + n = len(obs) + expected = np.repeat(np.round(obs.sum()/n).astype(int),n) + chi2 = chi2_contingency(np.array([obs,expected])) + return(chi2[1]) + except: + print(obs) + raise + +# Obtain standard deviation and median absolute deviation of the datapoint fraction assigned to the folds per task +def std_fold_split(row): + u = row.unstack().astype(int) + usum = u.sum() + return (usum/usum.sum()).std() + +def mad_fold_split(row): + u = row.unstack().astype(int) + usum = u.sum() + return robust.mad(usum/usum.sum()) + +# Obtain standard deviation and median absolute deviation of the fraction of actives in each of the folds per task +def std_fold_act(row): + u = row.unstack().astype(int) + usum = u.sum() + u.loc[:,usum>0] + uratio = u.loc[1,usum>0]/usum[usum>0] + return uratio.std() + +def mad_fold_act(row): + u = row.unstack().astype(int) + usum = u.sum() + u.loc[:,usum>0] + uratio = u.loc[1,usum>0]/usum[usum>0] + return robust.mad(uratio) + + +#sample N pairs of compounds from the Xmatrix an calculate the tanimoto similarity for each pair +def tc_pair_sample(X, folds, Nsample): + X1_sample = np.random.choice(X.shape[0], Nsample, replace=True) + X2_sample = np.random.choice(X.shape[0], Nsample, replace=True) + mask = (X1_sample != X2_sample) + X1_sample = X1_sample[mask] + X2_sample = X2_sample[mask] + X1 = np.array(X[X1_sample].todense()) + X2 = np.array(X[X2_sample].todense()) + X1_sum = X1.sum(axis=1) + X2_sum = X2.sum(axis=1) + X12_sum = (X1*X2).sum(axis=1) + tc = X12_sum/(X1_sum+X2_sum-X12_sum) + fold_match = (folds[X1_sample] == folds[X2_sample]) + return fold_match, tc + +#sample at maximimum Maxsample pairs from X in batches of size Nbatch +#bin distance into Nbins equidistant bins +#stop sampling when the change in the fraction of intra fold pairs for all distance bins drops below Precision +#and the each bin has at least Minpop intra-fold pairs +def batch_tc_pair_hist_conv(X, folds, Nbins=20, Precision=0.01, Minpop = 3, Maxsample=10000000, Nbatch=50000): + tc_hist_same_fold = np.zeros(Nbins,dtype=int) + tc_hist_different_fold = np.zeros(Nbins,dtype=int) + intra_fold_fraction = np.zeros(Nbins,dtype=float) + converged = False + samples = 0 + bin_lower_limits = np.arange(0.0,1.0,1/Nbins) + 
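+ # Sampling loop: each pass draws Nbatch random compound pairs, accumulates Tanimoto similarity histograms separately for intra-fold and inter-fold pairs, and recomputes the per-bin intra-fold fraction.
+ # Convergence requires the per-bin change to drop below Precision and every bin to hold at least Minpop intra-fold pairs; sampling also stops once Maxsample pairs have been drawn.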
while ((not converged) and (samples <= Maxsample)): + fmatch_b, tc_b = tc_pair_sample(X, folds, Nbatch) + tc_hist_same_fold += np.histogram(tc_b[fmatch_b],bins=Nbins,range=(0.0,1.0))[0] + tc_hist_different_fold += np.histogram(tc_b[~fmatch_b],bins=Nbins,range=(0.0,1.0))[0] + new_f = tc_hist_same_fold /np.clip((tc_hist_same_fold + tc_hist_different_fold),a_min=1,a_max=None) + delta_max = np.absolute(new_f - intra_fold_fraction).max() + samples += Nbatch + intra_fold_fraction = new_f + minimal_bin_population = tc_hist_same_fold.min() + converged = (delta_max < Precision) and (minimal_bin_population >= Minpop) + logger.info('''Samples: {0}, max change = {1}, minimal bin population {2} --> Converged: {3}, exceeded Maxsamples ({4}): {5}'''.format(samples,delta_max,minimal_bin_population,converged,Maxsample,samples > Maxsample)) + if not converged: + logger.warning('Failed to reach convergence after sampling of {0} pairs'.format(samples)) + return tc_hist_same_fold, tc_hist_different_fold, intra_fold_fraction, bin_lower_limits + + +#folding method assessment functions +def balance(ptuner,pfold): + """ + This function analyzes the split imbalance for tasks with a given fold split and a y-matrix. + + For input: + A combination of: + 1. the y-matrix as generated by MELLODDY TUNER in the files_4_ml folder and + 2. a numpy file with the folds (as produced, for example, by the sphere exclusion code, or by MELLODDY TUNER using LSH in the files_4_ml folder) is needed. + This function generates the counts and will take some time for this. + + It will output: + 1. a file listing imbalance metrics per task + 2. a file aggregating this imbalance information per task data volume + Both files are written to the output location of the folding method. + + The following imbalance metrics are calculated for each task: + 1. Standard deviation of the fraction of data points allocated to each fold + 2. Median absolute deviation of the fraction of data points allocated to each fold + 3. Standard deviation of the fraction of +1 labels (active) within the data points allocated to each fold + 4. Median absolute deviation of the fraction of +1 labels (active) within the data points allocated to each fold + 5. Boolean flag indicating whether the task falls below 10 data points for either label (+1 or -1) in any fold. Tasks not meeting this requirement are excluded from performance evaluation + 6. p-value from chi2 statistics comparing the number of data points allocated to each fold with the even distribution expected from a random split. This will detect whether an imbalance indicated by 1. is statistically significant + 7. p-value from chi2 statistics comparing the number of actives and inactives allocated to each fold. This will detect whether an imbalance as indicated by 3. is statistically significant. + 8. a combined boolean split_unbalanced flag indicating a substantial deviation of > 0.05 in 1. and a significant p-value as in 6. below 0.05 + 9. a combined boolean act_unbalanced flag indicating a substantial deviation of > 0.05 in 3. and a significant p-value as in 7.
below 0.05 + + """ + + print('Data and label balance analysis started') + + # Read fold split counts + # read from the y matrix and the fold vector + y_mat = sio.mmread(os.path.join(ptuner,'files_4_ml','T10_y.mtx')) + logger.info('read in y data') + + # make a sparse datafrme from the csr matrix + y_df_sp = pd.DataFrame.sparse.from_spmatrix(y_mat) + + foldings = glob.glob(os.path.join(pfold,'*folds.npy')) + + for method in foldings: + prefix = os.path.split(method)[1].split('_')[0] + folds = np.load(os.path.join(pfold,prefix+'_folds.npy')) + logger.info('read in fold data for {0}'.format(prefix)) + print('Running imbalance check on: ' + prefix) + + # determine the number of folds from the folds file + fold_vals = np.arange(folds.max()+1) + + # iteratite over the folds and over the two possible values +1 / -1 and do the count statistics. This hasn't been optimized and will take some time. It might be faster, if coversion to a dense dataframe were to be used + count_ser = pd.concat({f:pd.concat({1:(y_df_sp.loc[folds==f] == 1).sum(axis=0),-1:(y_df_sp.loc[folds==f] == -1).sum(axis=0)}) for f in fold_vals}) + count_df = count_ser.reset_index() + count_df.columns = ['fold_id','class_label','cont_classification_task_id','label_counts'] + + # The follwing operation merges clustering information with cluster to fold mapping only the fold_data series to be returned to client. On top of this the server communicates the overall fold ratios. + count_unstacked = count_df.set_index(['cont_classification_task_id','class_label','fold_id']).sort_index().unstack(['class_label','fold_id']) + count_unstacked['sum'] = count_unstacked['label_counts'].sum(axis=1) + count_unstacked['sum_neg'] = count_unstacked[('label_counts',-1)].sum(axis=1) + count_unstacked['sum_pos'] = count_unstacked[('label_counts',1)].sum(axis=1) + count_unstacked['pos_rate'] = count_unstacked['sum_pos'] / count_unstacked['sum'] + + + # Apply statistic functions + count_unstacked['chi2_contingency_p_split'] = count_unstacked['label_counts'].apply(chi2_contingency_pval_split,axis=1) + count_unstacked['chi2_contingency_p_act'] = count_unstacked['label_counts'].apply(chi2_contingency_pval_act,axis=1) + count_unstacked['below_10'] = (count_unstacked['label_counts'] < 10).any(axis=1) + count_unstacked['below_05'] = (count_unstacked['label_counts'] < 5).any(axis=1) + count_unstacked['std_fold_split'] = count_unstacked['label_counts'].apply(std_fold_split,axis=1) + count_unstacked['mad_fold_split'] = count_unstacked['label_counts'].apply(mad_fold_split,axis=1) + count_unstacked['std_fold_act'] = count_unstacked['label_counts'].apply(std_fold_act,axis=1) + count_unstacked['mad_fold_act'] = count_unstacked['label_counts'].apply(mad_fold_act,axis=1) + sum_labels = np.power(10,np.arange(1,np.ceil(np.log10(count_unstacked['sum'].max())))) + sum_bins= np.power(10,np.arange(1,np.ceil(np.log10(count_unstacked['sum'].max()))+1)) + count_unstacked['size_bin_lower_limit'] = pd.cut(count_unstacked['sum'],bins=sum_bins,labels=sum_labels) + + summary_data = count_unstacked.drop('label_counts',axis=1) + summary_data.columns = summary_data.columns.droplevel(['class_label','class_label']) + + # For assigning the category unbalanced_split/unbalanced_act we require statistical signficance and a meaningfull effect size, that is standard_deviations + summary_data['chi2_contingency_p_split_signif'] = (summary_data['chi2_contingency_p_split']<0.05) + summary_data['chi2_contingency_p_act_signif'] = (summary_data['chi2_contingency_p_act']<0.05) + 
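+ # Flag a task as unbalanced only when both criteria hold: a chi2 p-value below 0.05 and a standard deviation of the per-fold fractions above 0.05 (data point fractions for split_unbalanced, active fractions for act_unbalanced).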
summary_data['split_unbalanced'] = (summary_data['chi2_contingency_p_split']<0.05) & (summary_data['std_fold_split']>0.05) + summary_data['act_unbalanced'] = (summary_data['chi2_contingency_p_act']<0.05) & (summary_data['std_fold_act']>0.05) + + + # Create aggregate fractions by size bin + sizes = summary_data.groupby(['size_bin_lower_limit']).size() + stats_by_size = pd.concat({'fraction_below_10': summary_data[summary_data['below_10']==True].groupby(['size_bin_lower_limit']).size()/sizes, + 'fraction_below_05': summary_data[summary_data['below_05']==True].groupby(['size_bin_lower_limit']).size()/sizes, + 'fraction_act_unbalanced': summary_data[summary_data['act_unbalanced']==True].groupby(['size_bin_lower_limit']).size()/sizes, + 'fraction_split_unbalanced': summary_data[summary_data['split_unbalanced']==True].groupby(['size_bin_lower_limit']).size()/sizes},axis=1) + + stats_by_size['mean_std_fold_act'] = summary_data.groupby('size_bin_lower_limit')['std_fold_act'].mean() + stats_by_size['mean_std_fold_split'] = summary_data.groupby('size_bin_lower_limit')['std_fold_split'].mean() + + stats_by_size['count'] = sizes + logger.info('aggregated by size bin') + + # Write out results + summary_data.to_csv(os.path.join(pfold,prefix+'_split_stats.txt'), sep='\t') + stats_by_size.to_csv(os.path.join(pfold,prefix+'_split_stats_overview.txt'), sep='\t') + + print('Imbalance analysis results (split_stats.txt, split_stats_overview.txt) were written to: ' + pfold + ' .') + logger.info('DONE') + + +def performance(psc, baseline_prefix, pfold): + """ + This function analyzes the performance difference of a folding method compared to a baseline fold splitting. + + For input: + 1. path to the sparsechem folder, which includes subfolders for every folding method to be analyzed, including the models folder and the performance json files + + It will output: + 1. an absolute performance summary csv + 2. a delta performance summary csv + 3.
mean delta performances for size bins + + """ + + print('Performance analysis started') + + #get raw results and do calculations + fnames = glob.glob(os.path.join(psc,'*.json')) + + + results_agg = {} + results_agg2 = {} + results = {} + conf = {} + conf2 = {} + + for ix, fname in enumerate(fnames): + f = open(fname, 'r') + data = json.load(f) + f.close() + results_agg[ix] = pd.read_json(data['validation']['classification_agg'], typ='series') + results[ix] = pd.read_json(data['validation']['classification']) + + conf[ix] = pd.Series(data['conf']) + conf[ix]['fname'] = fname + + prefix = conf[ix]['prefix'] + logger.info('read in performance data for {0}'.format(prefix)) + print('Running performance analysis on: ' + prefix) + + conf_df = pd.concat(conf,axis=0).unstack() + conf_df.index.names = ['model_index'] + + results_df = pd.concat(results) + results_df.index.names = ['model_index','continuous_task_id'] + results_agg_df = pd.concat(results_agg).unstack() + results_agg_df.index.names = ['model_index'] + logger.info('read in performance data') + + mean = results_agg_df.join(conf_df[['prefix','fold_va']]).groupby('prefix').mean() + mean_delta = mean.loc[mean.index != baseline_prefix] - mean.loc[baseline_prefix] + std = results_agg_df.join(conf_df[['prefix','fold_va']]).groupby('prefix').std() + z = mean_delta/np.sqrt(std.loc[std.index != baseline_prefix]**2 + std.loc[baseline_prefix]**2) + p_val_worse = pd.DataFrame(norm.cdf(z),columns=z.columns,index=z.index) + + score_cols = ['roc_auc_score','auc_pr','avg_prec_score','f1_max','kappa'] + + result_agg_delta = pd.concat({'delta_baseline_mean': mean_delta[score_cols], 'p-value':p_val_worse[score_cols]},axis=1,sort=True) + + res_detail = results_df.join(conf_df[['prefix','fold_va']]).reset_index().set_index(['prefix','continuous_task_id','fold_va']).drop('model_index',axis=1) + res_detail_mean = res_detail[score_cols].groupby(level=['prefix','continuous_task_id'])[score_cols].mean() + res_detail_std = res_detail.groupby(level=['prefix','continuous_task_id'])[score_cols].std() + mean_detail_delta = res_detail_mean.loc[res_detail_mean.index.get_level_values('prefix') != baseline_prefix] - res_detail_mean.loc[baseline_prefix] + z_detail = mean_detail_delta/np.sqrt(res_detail_std.loc[res_detail_std.index.get_level_values('prefix') != baseline_prefix]**2 + res_detail_std.loc[baseline_prefix]**2) + p_val_worse_detail = pd.DataFrame(norm.cdf(z_detail),columns=z_detail.columns,index=z_detail.index) + + count_cols = ['num_pos','num_neg'] + counts = res_detail[count_cols].groupby(level=['prefix','continuous_task_id']).sum() + counts['below_10'] = (res_detail[count_cols].groupby(level=['prefix','continuous_task_id']).min()<10).any(axis=1) + counts['total'] = counts['num_pos'] + counts['num_neg'] + + result_detail_delta = pd.concat({'delta_baseline_mean': mean_detail_delta, \ + 'p-value': p_val_worse_detail,\ + 'counts':counts.loc[counts.index.get_level_values('prefix') != baseline_prefix]},axis=1,sort=True) + + sum_labels = np.power(10,np.arange(1,np.ceil(np.log10(result_detail_delta[('counts','total')].max())))) + sum_bins= np.power(10,np.arange(1,np.ceil(np.log10(result_detail_delta[('counts','total')].max()))+1)) + result_detail_delta[('counts','size_bin_lower_limit')] = pd.cut(result_detail_delta[('counts','total')],bins=sum_bins,labels=sum_labels).astype(int) + + counts_total = result_detail_delta.reset_index().groupby(['prefix',('counts','size_bin_lower_limit')]).size() + counts_worse =
result_detail_delta.loc[result_detail_delta[('p-value','auc_pr')]<0.05].reset_index().groupby(['prefix',('counts','size_bin_lower_limit')]).size() + + fraction_worse = counts_worse.reindex(counts_total.index).fillna(0)/counts_total + mean_delta_aucpr = result_detail_delta.reset_index().groupby(['prefix',('counts','size_bin_lower_limit')]).mean()[('delta_baseline_mean','auc_pr')] + + + # Write out results + result_agg_delta.to_csv(os.path.join(pfold,'performance_summary_total.txt'),sep='\t') + + result_detail_delta_4csv = result_detail_delta.copy() + result_detail_delta_4csv.columns = [' '.join(col).strip() for col in result_detail_delta_4csv.columns.values] + result_detail_delta_4csv.to_csv(os.path.join(pfold,'performance_delta_detail.txt'),sep='\t') + + + summary_by_size = pd.concat({'mean_delta_auc_pr':mean_delta_aucpr,'fraction_significantly_worse':fraction_worse},axis=1) + summary_by_size.index.names = ['split_method','size_bin_lower_limit'] + summary_by_size.to_csv(os.path.join(pfold,'performance_summary_by_size.txt'),sep='\t') + + print('Performance analysis results (performance_summary_total.txt, performance_delta_detail.txt, performance_summary_by_size.txt) were written to: ' + pfold + '.') + logger.info('DONE') + + +def similarity(ptuner,pfold,maxsample,batchsize,numbins,precision,minpop,rseed): + + """ + This function creates Tanimoto similarity histograms for randomly sampled compound pairs, split into intra-fold and inter-fold pairs. Sampling will take some time, please be patient. + + For input: + 1. path to the MELLODDY TUNER output containing the matrix of ECFP features [compounds x ecfp] (files_4_ml/T11_x.mtx) + 2. the folder containing the fold vectors (*_folds.npy) + + It will output: + 1. a similarity histogram file per folding method, listing for each similarity bin the number of intra-fold and inter-fold pairs, the intra-fold fraction and a binomial log survival value + + """ + + print('Similarity analysis started') + + # Read input files: folds and fingerprints (ecfp6) + logger.info('read in X data (ecfp)') + X = sio.mmread(os.path.join(ptuner,'files_4_ml','T11_x.mtx')) + X = X.tocsr() + + foldings = glob.glob(os.path.join(pfold,'*folds.npy')) + + for method in foldings: + prefix = os.path.split(method)[1].split('_')[0] + folds = np.load(os.path.join(pfold,prefix+'_folds.npy')) + logger.info('read in fold data for {0}'.format(prefix)) + print('Running similarity analysis on: ' + prefix) + + #set the random seed + np.random.seed(rseed) + + #run the sampling + tc_hist_same_fold, tc_hist_different_fold, intra_fold_fraction, bin_lower_limits = \ + batch_tc_pair_hist_conv(X, folds, Nbins=numbins, Precision=precision, Minpop=minpop, Maxsample = maxsample, Nbatch = batchsize) + logger.info('finished pair sampling') + #calculate significance from deviation of expected random value for Fraction_Intra_fold (0.2 for 5 folds) + p_intra_fold = tc_hist_same_fold.sum() / (tc_hist_same_fold.sum() + tc_hist_different_fold.sum()) + binom_pval = binom.logsf(k=tc_hist_same_fold.astype(int)-1,n=tc_hist_same_fold.astype(int)+tc_hist_different_fold.astype(int), p=p_intra_fold) + hist_df = pd.DataFrame({'tc_similarity_bin': bin_lower_limits, 'Num_intra_fold_pairs':tc_hist_same_fold, \ + 'Num_inter_fold_pairs': tc_hist_different_fold, 'Fraction_Intra_fold':intra_fold_fraction, \ + '1-binom.logsf':binom_pval}) + #write out results + hist_df.to_csv(os.path.join(pfold,prefix+'_similarity_histogram.txt'),sep='\t',header=True) + print('Similarity analysis result was written to: ' + os.path.join(pfold,prefix+'_similarity_histogram.txt') + ' .') + logger.info('DONE') + diff --git a/chemfold/analyze.py b/chemfold/analyze.py new file mode 100644 index 0000000..5981625 --- /dev/null +++ b/chemfold/analyze.py @@ -0,0 +1,63 @@ +import argparse +import logging +import
sys +import analysis as cfa + +#logging +shandler = logging.StreamHandler(stream=sys.stdout) +formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s') +shandler.setFormatter(formatter) +shandler.setLevel(logging.INFO) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(shandler) + + +parser = argparse.ArgumentParser(description="Suite of different folding methods for chemical structures suitable for privacy-preserving federated learning and analysis scripts.") +#parser.add_argument("--x", help="Descriptor file (matrix market or numpy)", type=str, required=True) +parser.add_argument("--inp", help="Folder with input files, e.g., descriptor file (matrix market or numpy, distances, y matrix, sparse chem model folder + json summary,Matrix of ECFP features [compounds x ecfp], .mtx file)", type=str, required=True) +parser.add_argument("--analysis", help="which analysis to run options are: [all, performance, imbalance, similarity]", choices=['all','performance','imbalance','similarity'], default='all') +parser.add_argument("--out", help="Output directory for the clusters file (.npy)", type=str, required=True) + +#performance +parser.add_argument("--psc", help="Path to sparsechem models sparsechem/models", type=str, required=False) +parser.add_argument("--baseline_prefix", help="Baseline prefix", type=str, default='random') + +#similarity +parser.add_argument("--maxsample", help="Maximal number of compound pairs to sample", type=int, default=10000000) +parser.add_argument("--batchsize", help="Number of compound pairs to precessed in one batch", type=int, default=50000) +parser.add_argument("--numbins", help="Number of bins of the histogram", type=int, default=10) +parser.add_argument("--precision", help="Maximal tolerated change of intra fold ratio per similarity bin between to batches to reach convergence",type=float,default=0.02) +parser.add_argument("--minpop", help="Minimal population of intra-fold samples per similarity bin to required to reach convergence",type=int, default=3) +parser.add_argument("--rseed",help="random seed to use", type=int, default = 123456) + +args = parser.parse_args() +logger.info(args) + + +#call of analysis functions + +if args.analysis: + if args.analysis == 'imbalance' or args.analysis == 'all': + #balance + cfa.balance(args.inp,args.out) + + + if args.analysis == 'performance' or args.analysis == 'all': + #performance + #before a model has to be trained with the baseline and the alternative fold splitting using SparseChem + # We recognize the conditions by the prefix given as option during the training job submission. This prefix is found in the model json files. All fold models under the same conditions are expected to have the same prefix. We now declare which condition is the baseline. + + #TODO: add an example call of sparsechem train + #cd ./examples/chembl + #python train.py --prefix [specify folding method here] + + + #Analysis of the effect of the fold splits on model performance, using random folding as the baseline. It assumes you have created the models for 5 validation folds and reads in the json files created by sparsechem. Each folding scheme should use a diffrent sparsechem --prefix to distinguish them. + cfa.performance(args.psc, args.baseline_prefix, args.out) + + + if args.analysis == 'similarity' or args.analysis == 'all': + #similarity + #This analyzes the fraction of randomly chosen compound pairs per similariuy bins that come from the same fold. 
Ideally the fraction on intra-fold pairs for high similarity bins should be as high as possible. + cfa.similarity(args.inp, args.out,args.maxsample,args.batchsize,args.numbins,args.precision,args.minpop,args.rseed) diff --git a/chemfold/random_split.py b/chemfold/random_split.py new file mode 100644 index 0000000..fba51ce --- /dev/null +++ b/chemfold/random_split.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""random_split.py: Assigns a random fold number to each input. + +Assigns a random fold number to each input for the ML model to compare the generated distribution of data and labels over the folds and the resulting performance with other methods like LFC, LSH or a scaffold-based fold splitting. This script is part of the folding single partner study of WP1 from the MELLODDY project. It is highly inspired by the script 'split_by_scaffold.py by Ansgar Schuffenhauer. +""" +#ATTENTION! WARNING! This is a beta version. Use on your own risk! + + +import argparse +import numpy as np +import pandas as pd +import json +import hashlib +import random +import hmac +import os, sys, logging + +__author__ = "Lina Humbeck, Boehringer Ingelheim Pharma GmbH & Co.KG" +__contributors__ = "Ansgar Schuffenhauer, Novartis" + +shandler = logging.StreamHandler(stream=sys.stdout) +formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s') +shandler.setFormatter(formatter) +shandler.setLevel(logging.INFO) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(shandler) + +def hashed_fold_scaffold(scaff, secret, nfolds = 5): + scaff = str(scaff).encode("ASCII") + #h = sha256([scaff], secret) + m = hmac.new(secret, b'', hashlib.sha256) + m.update(scaff) + random.seed(m.digest(), version=2) + return random.randint(0, nfolds - 1) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='compute fold vectors based on Scaffolds') + parser.add_argument('--infolder', help="Input folder, working directory of melloddy_tuner, expected to conatin results and results_tmp subfolder ", type=str, required=True) + parser.add_argument('--out', help="output folder", type=str, required=True) + parser.add_argument('--params_file',help='path to parameters.json file',required=True) + + args = parser.parse_args() + logger.info(args) + + with open(args.params_file) as cnf_f: + params = json.load(cnf_f) + + #read standardized structures + fname = os.path.join(args.infolder,'results_tmp','standardization','T2_standardized.csv') + t2_standardized = pd.read_csv(fname) + logger.info('read in standardized structure file: {}'.format(fname)) + + t2_unique = t2_standardized[['canonical_smiles']].drop_duplicates() + t2_scaff = t2_standardized.merge(t2_unique[['canonical_smiles']], on='canonical_smiles') + logger.info('extracted unique compounds:{0}'.format(len(t2_scaff))) + + #now need to merg the T5 table, to get scaffold assignments to unique descriptors IDs + t5 = pd.read_csv(os.path.join(args.infolder,'results_tmp','descriptors','mapping_table_T5.csv')) + t2_joined = t2_scaff.merge(t5, on='input_compound_id') + t6_scaff = t2_joined.groupby('descriptor_vector_id')['canonical_smiles'].min() + + #now we need to join to final file with continuous descriptor vector Ids + t2_t11 = pd.read_csv(os.path.join(args.infolder,'results','T11.csv')) + t2_t11_joined = t2_t11.join(t6_scaff,on='descriptor_vector_id') + + ## creating output path if it does not exist + os.makedirs(args.out, exist_ok=True) + + #now we need to generate the hashes for the 
scaffolds + key = params['key']['key'].encode("ASCII") + nfolds = params['lsh']['nfolds'] + t2_t11_joined['fold'] = t2_t11_joined['canonical_smiles'].apply(lambda x : hashed_fold_scaffold(x, secret = key, nfolds = nfolds)) + t2_t11_joined.to_csv(os.path.join(args.out,'T2_T11_random_folds.csv')) + logger.info('written out csv file with fold info') + + #now create numpy arrays for folds + random_fold = np.zeros(len(t2_t11_joined['cont_descriptor_vector_id'])) + random_fold[t2_t11_joined['cont_descriptor_vector_id']] = t2_t11_joined['fold'] + + np.save(os.path.join(args.out,'random_folds.npy'),random_fold) + logger.info('written out numpy arrays with folds') + diff --git a/chemfold/scaffold_network.py b/chemfold/scaffold_network.py new file mode 100644 index 0000000..68fd3ba --- /dev/null +++ b/chemfold/scaffold_network.py @@ -0,0 +1,235 @@ +import argparse +import numpy as np +import pandas as pd +import pickle + +import json + +import hashlib +import random +import hmac + +from rdkit import Chem +from rdkit.Chem.Scaffolds import rdScaffoldNetwork, MurckoScaffold +from rdkit.Chem import rdMolDescriptors +from rdkit.Chem import PandasTools + +import os, sys, logging + +def has_unusual_ringsize(mol): + return len([len(x) for x in mol.GetRingInfo().AtomRings() if len(x)>6 or len(x)<5 ])>0 + +def has_macrocycle(mol): + return len([len(x) for x in mol.GetRingInfo().AtomRings() if len(x)>9])>0 + +def murcko_scaff_smiles(mol_smiles): + mol = Chem.MolFromSmiles(mol_smiles) + if mol is not None: + return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol)) + else: + return None + +def hashed_fold_scaffold(scaff, secret, nfolds = 5): + scaff = str(scaff).encode("ASCII") + h = sha256([scaff], secret) + m = hmac.new(secret, b'', hashlib.sha256) + m.update(scaff) + random.seed(m.digest(), version=2) + return random.randint(0, nfolds - 1) + +def sha256(inputs, secret): + """ + Encryption function using python's pre-installed packages HMAC and hashlib. + We are using SHA256 (it is equal in security to SHA512). 
+ :param inputs: input strings + :param secret: given pharma partner key + :return: + """ + m = hmac.new(secret, b'', hashlib.sha256) + for i in inputs: + m.update(i) + return m.digest() + + +if __name__ == "__main__": + + shandler = logging.StreamHandler(stream=sys.stdout) + formatter = logging.Formatter('%(asctime)s -%(name)s - %(levelname)s - %(message)s') + shandler.setFormatter(formatter) + shandler.setLevel(logging.INFO) + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + logger.addHandler(shandler) + + + parser = argparse.ArgumentParser(description='compute fold vectors based on Scaffolds') + parser.add_argument('--infolder', help="Input folder, working directory of melloddy_tuner, expected to conatin results and results_tmp subfolder ", type=str, required=True) + parser.add_argument('--out', help="output folder", type=str, required=True) + parser.add_argument('--params_file',help='path to parameters.json file',required=True) + parser.add_argument('--sn_flattenIsotopes',help = 'controls flattenIsotopes parameter of scaffold network',type=bool,default=True) + parser.add_argument('--sn_includeGenericBondScaffolds',help = 'controls includeGenericBondScaffolds parameter of scaffold network',type=bool,default=False) + parser.add_argument('--sn_includeGenericScaffolds',help = 'controls includeGenericScaffolds parameter of scaffold network',type=bool,default=False) + parser.add_argument('--sn_includeScaffoldsWithAttachments',help = 'controls includeScaffoldsWithAttachments parameter of scaffold network',type=bool,default=False) + parser.add_argument('--sn_includeScaffoldsWithoutAttachments',help = 'controls includeScaffoldsWithoutAttachments parameter of scaffold network',type=bool,default=True) + parser.add_argument('--sn_pruneBeforeFragmenting',help = 'controls pruneBeforeFragmenting parameter of scaffold network',type=bool,default=True) + parser.add_argument('--nrings', help="preferred number of rings for scaffold, defines the cut-level of scaffold network", type=int, default=3) + + + args = parser.parse_args() + logger.info(args) + + with open(args.params_file) as cnf_f: + params = json.load(cnf_f) + + snparams = rdScaffoldNetwork.ScaffoldNetworkParams() + snparams.flattenIsotopes=args.sn_flattenIsotopes + snparams.includeGenericBondScaffolds=args.sn_includeGenericBondScaffolds + snparams.includeGenericScaffolds=args.sn_includeGenericScaffolds + snparams.includeScaffoldsWithAttachments = args.sn_includeScaffoldsWithAttachments + snparams.includeScaffoldsWithoutAttachments = args.sn_includeScaffoldsWithoutAttachments + snparams.pruneBeforeFragmenting = args.sn_pruneBeforeFragmenting + + + + #read standardized structures + fname = os.path.join(args.infolder,'results_tmp','standardization','T2_standardized.csv') + t2_standardized = pd.read_csv(fname) + logger.info('read in standardized structure file: {}'.format(fname)) + + #calculate Murcko Scaffold + t2_standardized['murcko_scaff_smiles'] = t2_standardized['canonical_smiles'].apply(murcko_scaff_smiles) + logger.info('created murcko scaff smiles column') + + #create unique Murcko Scaffolds dataframe + t2_murcko_unique = t2_standardized[['murcko_scaff_smiles']].drop_duplicates() + logger.info('extracted unique murcko scaffolds:{0}'.format(len(t2_murcko_unique))) + + + + #create scaffold network for each unique Murcko scaffold + sn_dict = {} + failed_list = [] + for row in t2_murcko_unique.itertuples(): + logger.debug('processing\t|{}|'.format(row.murcko_scaff_smiles)) + if row.murcko_scaff_smiles is not None and 
row.murcko_scaff_smiles != '' and pd.notnull( row.murcko_scaff_smiles): + mol = Chem.MolFromSmiles(row.murcko_scaff_smiles) + if mol is not None: + try: + sn_dict[row.murcko_scaff_smiles] = rdScaffoldNetwork.CreateScaffoldNetwork([mol],snparams) + except: + failed_list.append(row.murcko_scaff_smiles) + else: + failed_list.append(row.murcko_scaff_smiles) + else: + failed_list.append(row.murcko_scaff_smiles) + logger.info('generated scaffold network') + + ## creating output path if it does not exist + os.makedirs(args.out, exist_ok=True) + + #save the scaffold netowrks + with open(os.path.join(args.out,'T2_sn_dict.pkl'),'wb') as pklf: + pickle.dump(sn_dict,pklf) + logger.info('written pickle file for scaffold networks') + + #with open(os.path.join(args.out,'T2_sn_dict.pkl'),'rb') as pklf: + # sn_dict = pickle.load(pklf) + #logger.info('read pickle file for scaffold networks') + + #write out failed list + logger.info('encountered {} murcko scaffolds failing in scaffold network'.format(len(failed_list))) + fname = os.path.join(args.out,'failed_sn_murcko_scaffolds.txt') + pd.Series(failed_list).to_csv(fname,sep='\t') + + #make a unique data frame of scaffold network nodes, and clacukate properties + all_nodes = set([]) + for sn in sn_dict.values(): + all_nodes |= set([str(n) for n in sn.nodes]) + logger.info('extracted nodes') + logger.info('number of nodes {}'.format(len(all_nodes))) + + node_df = pd.DataFrame({'node_smiles':list(all_nodes)}) + PandasTools.AddMoleculeColumnToFrame(node_df,'node_smiles','mol',includeFingerprints=False) + node_df['num_rings'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumRings) + node_df['num_rings_delta'] = (node_df['num_rings'] -args.nrings).abs() + node_df['num_rbonds'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumRotatableBonds) + node_df['num_hrings'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumHeterocycles) + node_df['num_arings'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumAromaticRings) + node_df['num_bridge'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumBridgeheadAtoms) + node_df['num_spiro'] = node_df['mol'].apply(Chem.rdMolDescriptors.CalcNumSpiroAtoms) + node_df['has_macrocyle'] = node_df['mol'].apply(has_macrocycle) + node_df['has_unusual_ring_size'] = node_df['mol'].apply(has_unusual_ringsize) + logger.info('calculated node properties') + + #the follwing lines define the precedence of nodes, we aim at ~3 ring scaffolds + priority_cols =['num_rings_delta','has_macrocyle','num_rbonds','num_bridge','num_spiro','has_unusual_ring_size','num_hrings', 'num_arings','node_smiles'] + priority_asc = [True,False,True,False,False,False,False,True,True] + node_df.sort_values(priority_cols, ascending = priority_asc, inplace =True) + node_df['priority'] = np.arange(len(node_df)) + node_df.set_index('node_smiles',inplace=True) + logger.info('sorted nodes by priority') + + with open(os.path.join(args.out,'T2_sn_nodes.pkl'),'wb') as pklf: + pickle.dump(node_df,pklf) + logger.info('written pickle file for node table') + #with open(args.out,'T2_sn_nodes.pkl'),'rb') as pklf: + # node_df = pickle.load(pklf) + #logger.info('read pickle file for node table') + + #assign for eac Murcko scaffold the preferred sn scaffolds + for row in t2_murcko_unique.itertuples(): + if row.murcko_scaff_smiles in sn_dict: + nodelist = [str(n) for n in sn_dict[row.murcko_scaff_smiles].nodes] + node_data = node_df.loc[nodelist] + t2_murcko_unique.loc[row.Index,'sn_scaff_smiles'] = node_data['priority'].idxmin() + logger.info('assigned preferred scaffolds') + 
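+ # sn_scaff_smiles now holds, per Murcko scaffold, the network node with the lowest priority index, i.e. the node ranked first by the precedence rules above; with the default nrings=3 a larger Murcko scaffold will typically resolve to one of its 3-ring sub-scaffolds (smallest num_rings_delta), remaining ties being broken by the other columns down to the SMILES string.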
+ with open(os.path.join(args.out,'T2_sn_murcko_unique.pkl'),'wb') as pklf: + pickle.dump(t2_murcko_unique,pklf) + logger.info('written pickle file for unique Murcko scaffolds') + + #join back to original scaffolds + t2_scaff = t2_standardized.merge(t2_murcko_unique[['murcko_scaff_smiles','sn_scaff_smiles']],on='murcko_scaff_smiles') + t2_scaff.to_csv(os.path.join(args.out,'T2_sn_scaff.csv')) + logger.info('written scaffold assignment') + + #now need to merg the T5 table, to get scaffold assignments to unique descriptors IDs + t5 = pd.read_csv(os.path.join(args.infolder,'results_tmp','descriptors','mapping_table_T5.csv')) + t2_joined = t2_scaff.merge(t5,on='input_compound_id') + + # in rare cases of collisions 8diffeent scaffolds, but ientical descriptors we brea the ties by alpabetic precedence + t6_scaff = t2_joined.groupby('descriptor_vector_id')[['murcko_scaff_smiles','sn_scaff_smiles']].min() + + #identify collisions: different scaffold, same descriptor vector + nunique_murcko = t2_joined.groupby('descriptor_vector_id')['murcko_scaff_smiles'].nunique() + collisions_murcko = t2_joined[t2_joined['descriptor_vector_id'].isin(nunique_murcko[nunique_murcko > 1].index)].sort_values('descriptor_vector_id') + collisions_murcko.to_csv(os.path.join(args.out,'collisions_murcko.txt'),sep='\t') + logger.info('Colliding Murcko scaffolds observed on {0} records and written out'.format(len(collisions_murcko)) ) + + nunique_sn = t2_joined.groupby('descriptor_vector_id')['sn_scaff_smiles'].nunique() + collisions_sn = t2_joined[t2_joined['descriptor_vector_id'].isin(nunique_sn[nunique_sn > 1].index)].sort_values('descriptor_vector_id') + collisions_sn.to_csv(os.path.join(args.out,'collisions_sn.txt'),sep='\t') + logger.info('Colliding SN scaffolds observed on {0} records and written out'.format(len(collisions_sn)) ) + + #now we need to join to final file with continuous descriptor vector Ids + t2_t11 = pd.read_csv(os.path.join(args.infolder,'results','T11.csv')) + t2_t11_joined = t2_t11.join(t6_scaff,on='descriptor_vector_id') + + #now we need to generate the hashes for the scaffolds + key = params['key']['key'].encode("ASCII") + nfolds = params['lsh']['nfolds'] + t2_t11_joined['sn_scaff_fold'] = t2_t11_joined['sn_scaff_smiles'].apply(lambda x : hashed_fold_scaffold(x, secret = key, nfolds=nfolds)) + t2_t11_joined['murcko_scaff_fold'] = t2_t11_joined['murcko_scaff_smiles'].apply(lambda x : hashed_fold_scaffold(x, secret = key, nfolds = nfolds)) + t2_t11_joined.to_csv(os.path.join(args.out,'T2_T11_scaff_folds.csv')) + logger.info('written out csv file with fold info') + + #now create numpy arrays for folds + fold_sn = np.zeros(len(t2_t11_joined['cont_descriptor_vector_id'])) + fold_sn[t2_t11_joined['cont_descriptor_vector_id']] = t2_t11_joined['sn_scaff_fold'] + fold_murcko = np.zeros(len(t2_t11_joined['cont_descriptor_vector_id'])) + fold_murcko[t2_t11_joined['cont_descriptor_vector_id']] = t2_t11_joined['murcko_scaff_fold'] + + np.save(os.path.join(args.out,'murcko_scaff_folds.npy'),fold_murcko) + np.save(os.path.join(args.out,'sn_scaff_folds.npy'),fold_sn) + logger.info('written out numpy arrays with folds') + diff --git a/chemfold/sphere.pyx b/chemfold/sphere.pyx new file mode 100644 index 0000000..cfb6b2e --- /dev/null +++ b/chemfold/sphere.pyx @@ -0,0 +1,125 @@ +# distutils: language = c++ + +from libcpp.vector cimport vector +import numpy as np +import scipy.sparse + +def se_cluster(X, double radius): + """ + Performs sphere exclusion (leader follower) clustering. 
+ With an additional step of assigning each sample to the closest cluster. + Args: + X csr matrix with binary values + radius clustering radius + Returns clustering assignment and center ids. + """ + cdef vector[double] dist + cdef int i, j, k, row, col, closest + + cdef int [:] x_indptr = X.indptr + cdef int [:] x_indices = X.indices + #cdef double [:] x_data = X.data + + cdef int X_shape0 = X.shape[0] + cdef int X_shape1 = X.shape[1] + + ## algorithm part + cdef long [:] Xnorm = np.array((X != 0).sum(axis = 1)).flatten() + cdef double [:] dists = np.zeros(X_shape0, dtype=np.float64) + + cdef vector[ vector[int] ] centers + centers.resize(X_shape1) + cdef vector[double] Cnorm + + cdef vector[int] center_ids + + cdef long [:] clusters = np.zeros(X.shape[0], dtype = int) - 1 + cdef long [:] perm_ids = np.random.permutation(X.shape[0]) + + cdef double min_dist, tmp_dist + cdef int num_changed + + for i in range(X_shape0): + if i % 10000 == 0: + print(f"Row {i}.") + row = perm_ids[i] + + ## computing distances to all centers + dists[0 : Cnorm.size()] = 0.0 + for j in range(x_indptr[row], x_indptr[row+1]): + col = x_indices[j] + for k in range(centers[col].size()): + dists[ centers[col][k] ] += 1.0 + + closest = -1 + min_dist = radius + for j in range(Cnorm.size()): + dists[j] = 1.0 - dists[j] / (Xnorm[row] + Cnorm[j] - dists[j]) + if dists[j] < min_dist: + min_dist = dists[j] + closest = j + + if closest >= 0: + clusters[row] = closest + continue + + ## create a new cluster + k = Cnorm.size() + for j in range(x_indptr[row], x_indptr[row+1]): + centers[ x_indices[j] ].push_back(k) + clusters[row] = k + Cnorm.push_back(Xnorm[row]) + center_ids.push_back(row) + + print("Reassigning compounds to the closest clusters.") + num_changed = 0 + for row in range(X_shape0): + if row % 10000 == 0: + print(f"Row {row}.") + dists[0 : Cnorm.size()] = 0.0 + ## compute distances to all clusters, assign to the closest + for j in range(x_indptr[row], x_indptr[row+1]): + col = x_indices[j] + for k in range(centers[col].size()): + dists[ centers[col][k] ] += 1.0 + + closest = -1 + min_dist = radius + 1e-5 + for j in range(Cnorm.size()): + tmp_dist = 1.0 - dists[j] / (Xnorm[row] + Cnorm[j] - dists[j]) + if tmp_dist < min_dist: + min_dist = tmp_dist + closest = j + if min_dist == 0: + ## best possible + break + if (closest >= 0) and (clusters[row] != closest): + clusters[row] = closest + num_changed += 1 + + print(f"Reassignment changed {num_changed} assignments.") + print(f"Total {len(center_ids)} clusters.") + + return np.asarray(clusters), np.asarray(center_ids) + + +def hierarchical_clustering(X, dists): + """ + Args: + X compound matrix in CSR + dists list of (increasing) distances + Sequentially clusters with each distance in dists, returns final cluster ids + """ + assert isinstance(X, scipy.sparse.csr_matrix), "X should be csr_matrix (scipy.sparse)" + assert all(dists[i] < dists[i+1] + for i in range(len(dists) - 1)), "dists must be a list of increasing values" + + cl0 = np.arange(X.shape[0]) + Xc = X + for dist in dists: + cl, cent = se_cluster(Xc, dist) + Xc = Xc[cent] + cl0 = cl[cl0] + + return cl0 + diff --git a/chemfold/sphere_exclusion.py b/chemfold/sphere_exclusion.py new file mode 100644 index 0000000..51d856f --- /dev/null +++ b/chemfold/sphere_exclusion.py @@ -0,0 +1,24 @@ +import chemfold as cf + +import numpy as np +import scipy.sparse +import argparse + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Sphere exclusion on the given X") + parser.add_argument("--x", help="Molecule
descriptor file, e.g., ECFPs (matrix market or numpy)", type=str, required=True) + parser.add_argument("--out", help="Output file for the clusters (.npy)", type=str, required=True) + parser.add_argument("--dists", nargs="+", help="Distances", type=float, required=True) + + args = parser.parse_args() + print(args) + + print(f"Loading '{args.x}'.") + X = cf.load_sparse(args.x).tocsr() + + print("Clustering.") + # hierarchical clustering at the user-supplied distances, e.g. --dists 0.5 0.6 for a two-step clustering + cl_hier = cf.hierarchical_clustering(X, dists=args.dists) + + np.save(args.out, cl_hier) + print(f"Saved clustering into '{args.out}'.") diff --git a/chemfold/utils.py b/chemfold/utils.py new file mode 100644 index 0000000..e23bc9e --- /dev/null +++ b/chemfold/utils.py @@ -0,0 +1,13 @@ +import scipy.io +import numpy as np + +def load_sparse(filename): + """Loads a sparse matrix from a Matrix Market (.mtx) or Numpy (.npy) file.""" + if filename is None: + return None + if filename.endswith('.mtx'): + return scipy.io.mmread(filename).tocsr() + elif filename.endswith('.npy'): + return np.load(filename, allow_pickle=True).item().tocsr() + raise ValueError(f"Loading '{filename}' failed. It must have a suffix '.mtx' or '.npy'.") + diff --git a/chemfold/version.py b/chemfold/version.py new file mode 100644 index 0000000..5becc17 --- /dev/null +++ b/chemfold/version.py @@ -0,0 +1 @@ +__version__ = "1.0.0"
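For reference, a minimal usage sketch of the new scripts (all paths and the parameters.json location are placeholders, not part of this change); it assumes a MELLODDY TUNER working directory with results, results_tmp and files_4_ml subfolders, and SparseChem models trained with one prefix per folding method:
#1) derive fold vectors, e.g. random and scaffold-based
#python chemfold/random_split.py --infolder <tuner_workdir> --out <fold_dir> --params_file <path>/parameters.json
#python chemfold/scaffold_network.py --infolder <tuner_workdir> --out <fold_dir> --params_file <path>/parameters.json
#2) optionally, sphere-exclusion clustering of the descriptor matrix
#python chemfold/sphere_exclusion.py --x <tuner_workdir>/files_4_ml/T11_x.mtx --out <fold_dir>/se_clusters.npy --dists 0.5 0.6
#3) run the imbalance, performance and similarity analyses against the 'random' baseline
#python chemfold/analyze.py --inp <tuner_workdir> --out <fold_dir> --psc <sparsechem>/models --baseline_prefix random --analysis all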