From 7a64ca7865750acedefa9f44cc603d9edbdbdfff Mon Sep 17 00:00:00 2001
From: BeGeiger
Date: Tue, 7 Feb 2023 11:27:46 -0500
Subject: [PATCH] Fixed indentation

Also replaced sorted() with np.sort() to improve performance.
---
 musical/preprocessing.py | 467 ++++++++++++++++++++-------------------
 1 file changed, 234 insertions(+), 233 deletions(-)

diff --git a/musical/preprocessing.py b/musical/preprocessing.py
index 74000f9..44a89fc 100644
--- a/musical/preprocessing.py
+++ b/musical/preprocessing.py
@@ -16,17 +16,18 @@ def sort_with_indices(x):
     return x[indices_sorted], indices_sorted
 
 
-def gini(x, assume_sorted=True):
+def gini(x, input_sorted=True):
     """
-    x is assumed to be increasing
+    Calculate the Gini coefficient of x. If x is not sorted (increasing), set input_sorted=False.
     """
-    if not assume_sorted:
+    if not input_sorted:
         x = np.sort(x)
+
     n = len(x)
-    aux = [xi * (2 * i - n + 1) for i, xi in enumerate(x)]
-    scaling = 1 / (n * sum(x))
+    aux = x * (2 * np.arange(n) - n + 1)
+    scaling = 1 / (n * np.sum(x))
 
-    return scaling * sum(aux)
+    return scaling * np.sum(aux)
 
 
 def n_remove_gini(x, gini_delta, thresh):
@@ -91,7 +92,7 @@ def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005):
     n_samples = H.shape[1]
 
     # normalize the exposures
-    H = H /np.sum(X, axis=0)
+    H = H / np.sum(X, axis=0)
 
     # Gini coefficients of normalized signature exposures
     gini_coeffs = np.array([gini(np.sort(h)) for h in H])
@@ -106,13 +107,13 @@
         n_remove = n_remove_gini(sorted_h, gini_delta, .8)
         to_keep, to_remove = np.split(sorted_h_indices, [-n_remove]) if n_remove else (sorted_h_indices, np.empty(0))
 
-        samples_to_keep[sig_index] = np.array(sorted(to_keep))
+        samples_to_keep[sig_index] = np.sort(to_keep)
         samples_to_remove |= set(to_remove)
 
     X_to_keep = {sig_index: X[:, samples] for sig_index, samples in samples_to_keep.items()}
 
     samples_to_keep_all = set(range(n_samples)) - samples_to_remove
-    samples_to_keep_all = np.array(sorted(samples_to_keep_all))
+    samples_to_keep_all = np.sort(list(samples_to_keep_all))
 
     results = (samples_to_keep, X_to_keep, samples_to_keep_all)
 
@@ -120,230 +121,230 @@
 def identify_distinct_cluster(X, H, frac_thresh=0.05):
-    """Identify distinct clusters from the cohort based on exposures.
-
-    Here 'distinct' means that there are signatures with high exposures in this cluster
-    and low exposures in other samples. This function will only do 2-cluster separations.
-    One can run this function iteratively until there are no separations.
-
-    Parameters:
-    ----------
-    X : array-like of shape (n_features, n_samples)
-        Input data matrix.
-
-    H : array-like of shape (n_components, n_samples)
-        Exposure matrix.
-
-    frac_thresh : float, default 0.05
-        Threshold below which exposure fractions will be considered zero.
-
-    Returns:
-    ----------
-    k : int, 1 | 2
-        Number of clusters. If 1, there isn't a distinct cluster, thus no separation is done.
-        If 2, there is a distinct cluster, thus the cohort is separated into two clusters.
-
-    clusters : list of numpy arrays
-        The indices of samples in the clusters.
-
-    Xs : list of numpy arrays
-        Samples in the clusters.
-
-    distinct : list of bool, or None
-        When there is a distinct cluster, this attribute informs whether the first or the second cluster is distinct, or both.
-
-    Notes:
-    ----------
-    1. Consider using .utils.differential_tail_test(). The logic in identifying cluster-specific signatures is basically
-       a differential tail test.
-    2. Note that compared to Doga's original code, the output now is always either 1 or 2 clusters. Also, the nonspecific signatures
-       are not used anymore.
-    """
-    if (type(X) != np.ndarray) or (not np.issubdtype(X.dtype, np.floating)):
-        X = np.array(X).astype(float)
-    if (type(H) != np.ndarray) or (not np.issubdtype(H.dtype, np.floating)):
-        H = np.array(H).astype(float)
-    n_components, n_samples = H.shape
-
-    ### Normalize H
-    # Several differences from previous codes here:
-    # 1. Previous codes modify H in place, i.e., the outside H that is passed into this function will be modified.
-    #    This is not desired behavior.
-    # 2. Previous codes use X to normalize H. Now H itself is used for the normalization. This should not matter too much.
-    # 3. Now, after setting small exposures to 0, we renormalize so that each sample is summed to one. This is conceptually
-    #    appealing if we want to use cosine distances later.
-    H = normalize(H, norm='l1', axis=0)
-    H[H < frac_thresh] = 0.0
-    H = normalize(H, norm='l1', axis=0)
-
-    ### Clustering
-    d_square_form, cluster_membership = hierarchical_cluster(H, 2, metric='cosine', linkage_method='average')
-    clust1_index = np.arange(0, n_samples)[(cluster_membership == 1)]
-    clust2_index = np.arange(0, n_samples)[(cluster_membership == 2)]
-    H_clust1 = H[:, clust1_index]
-    H_clust2 = H[:, clust2_index]
-
-    ### Identify cluster-specific signatures
-    n_clust1 = len(clust1_index) # Number of samples in cluster 1
-    n_clust2 = len(clust2_index) # Number of samples in cluster 2
-    n_clust1_pos = np.sum(H_clust1 > 0, 1) # Number of samples in cluster 1 that have positive exposures of the signature, one per signature
-    n_clust2_pos = np.sum(H_clust2 > 0, 1) # Number of samples in cluster 2 that have positive exposures of the signature, one per signature
-    pvalue_clust1_greater = [] # p-values for whether the signature has greater exposures in cluster 1 than in cluster 2, one per signature
-    pvalue_clust2_greater = [] # p-values for whether the signature has greater exposures in cluster 2 than in cluster 1, one per signature
-    for h1, h2 in zip(H_clust1, H_clust2):
-        try:
-            p = stats.mannwhitneyu(h1, h2, alternative='greater')[1]
-        except: # E.g., when the exposure of a signature is all 0 in the dataset. This could happen in later iterations of running the function.
-            p = 1.0
-        pvalue_clust1_greater.append(p)
-        try:
-            p = stats.mannwhitneyu(h2, h1, alternative='greater')[1]
-        except:
-            p = 1.0
-        pvalue_clust2_greater.append(p)
-    pvalue_clust1_greater = np.array(pvalue_clust1_greater)
-    pvalue_clust2_greater = np.array(pvalue_clust2_greater)
-    # A signature is considered to be specific to cluster 1, if
-    # (the exposure is significantly greater in cluster 1 and less than 10% of samples in cluster 2 have positive exposures) or
-    # (more than 90% of samples in cluster 1 and less than 10% of samples in cluster 2 have positive exposures)
-    sigs_clust1_specific = np.arange(0, n_components)[np.logical_or(
-        np.logical_and(pvalue_clust1_greater < 0.05, n_clust2_pos/n_clust2 < 0.1),
-        np.logical_and(n_clust1_pos/n_clust1 > 0.9, n_clust2_pos/n_clust2 < 0.1)
-    )]
-    sigs_clust2_specific = np.arange(0, n_components)[np.logical_or(
-        np.logical_and(pvalue_clust2_greater < 0.05, n_clust1_pos/n_clust1 < 0.1),
-        np.logical_and(n_clust2_pos/n_clust2 > 0.9, n_clust1_pos/n_clust1 < 0.1)
-    )]
-    # If a signature has positive exposures in more than 10% of samples in both cluster 1 and 2, it is considered non-specific.
-    #sigs_nonspecific = np.arange(0, n_components)[
-    #    np.logical_and(n_clust1_pos/n_clust1 > 0.1, n_clust2_pos/n_clust2 > 0.1)
-    #]
-
-    ### Calculate mean exposures for cluster-specific and nonspecific signatures in the 2 clusters separately.
-    if len(sigs_clust1_specific) > 0:
-        mean_fraction_clust1_sigs_in_clust1 = np.mean(np.sum(H_clust1[sigs_clust1_specific, :], 0))
-        mean_fraction_clust1_sigs_in_clust2 = np.mean(np.sum(H_clust2[sigs_clust1_specific, :], 0))
-    else:
-        mean_fraction_clust1_sigs_in_clust1 = np.nan
-        mean_fraction_clust1_sigs_in_clust2 = np.nan
-    if len(sigs_clust2_specific) > 0:
-        mean_fraction_clust2_sigs_in_clust1 = np.mean(np.sum(H_clust1[sigs_clust2_specific, :], 0))
-        mean_fraction_clust2_sigs_in_clust2 = np.mean(np.sum(H_clust2[sigs_clust2_specific, :], 0))
-    else:
-        mean_fraction_clust2_sigs_in_clust1 = np.nan
-        mean_fraction_clust2_sigs_in_clust2 = np.nan
-    #if len(sigs_nonspecific) > 0:
-    #    mean_fraction_nonspecific_sigs_in_clust1 = np.mean(np.sum(H_clust1[sigs_nonspecific, :], 0))
-    #    mean_fraction_nonspecific_sigs_in_clust2 = np.mean(np.sum(H_clust2[sigs_nonspecific, :], 0))
-    #else:
-    #    mean_fraction_nonspecific_sigs_in_clust1 = np.nan
-    #    mean_fraction_nonspecific_sigs_in_clust2 = np.nan
-
-    ### Determine whether there is a distinct cluster.
-    # Cluster 1 is considered distinct, if cluster 1 specific signatures contribute less than 0.05 in cluster 2 and more than 0.3 in cluster 1.
-    if ((mean_fraction_clust1_sigs_in_clust2 < 0.05 and mean_fraction_clust1_sigs_in_clust1 > 0.3) and
-        (mean_fraction_clust2_sigs_in_clust1 < 0.05 and mean_fraction_clust2_sigs_in_clust2 > 0.3)):
-        Xs = [X[:, clust1_index], X[:, clust2_index]]
-        clusters = [clust1_index, clust2_index]
-        k = 2
-        distinct = [True, True]
-    elif (mean_fraction_clust1_sigs_in_clust2 < 0.05 and mean_fraction_clust1_sigs_in_clust1 > 0.3):
-        Xs = [X[:, clust1_index], X[:, clust2_index]]
-        clusters = [clust1_index, clust2_index]
-        k = 2
-        distinct = [True, False]
-    elif (mean_fraction_clust2_sigs_in_clust1 < 0.05 and mean_fraction_clust2_sigs_in_clust2 > 0.3):
-        Xs = [X[:, clust1_index], X[:, clust2_index]]
-        clusters = [clust1_index, clust2_index]
-        k = 2
-        distinct = [False, True]
-    else:
-        Xs = [X]
-        clusters = [np.arange(0, n_samples)]
-        k = 1
-        distinct = None
-
-    return k, clusters, Xs, distinct
+    """Identify distinct clusters from the cohort based on exposures.
+
+    Here 'distinct' means that there are signatures with high exposures in this cluster
+    and low exposures in other samples. This function will only do 2-cluster separations.
+    One can run this function iteratively until there are no separations.
+
+    Parameters:
+    ----------
+    X : array-like of shape (n_features, n_samples)
+        Input data matrix.
+
+    H : array-like of shape (n_components, n_samples)
+        Exposure matrix.
+
+    frac_thresh : float, default 0.05
+        Threshold below which exposure fractions will be considered zero.
+
+    Returns:
+    ----------
+    k : int, 1 | 2
+        Number of clusters. If 1, there isn't a distinct cluster, thus no separation is done.
+        If 2, there is a distinct cluster, thus the cohort is separated into two clusters.
+
+    clusters : list of numpy arrays
+        The indices of samples in the clusters.
+
+    Xs : list of numpy arrays
+        Samples in the clusters.
+
+    distinct : list of bool, or None
+        When there is a distinct cluster, this attribute informs whether the first or the second cluster is distinct, or both.
+
+    Notes:
+    ----------
+    1. Consider using .utils.differential_tail_test(). The logic in identifying cluster-specific signatures is basically
+       a differential tail test.
+    2. Note that compared to Doga's original code, the output now is always either 1 or 2 clusters. Also, the nonspecific signatures
+       are not used anymore.
+    """
+    if (type(X) != np.ndarray) or (not np.issubdtype(X.dtype, np.floating)):
+        X = np.array(X).astype(float)
+    if (type(H) != np.ndarray) or (not np.issubdtype(H.dtype, np.floating)):
+        H = np.array(H).astype(float)
+    n_components, n_samples = H.shape
+
+    ### Normalize H
+    # Several differences from previous codes here:
+    # 1. Previous codes modify H in place, i.e., the outside H that is passed into this function will be modified.
+    #    This is not desired behavior.
+    # 2. Previous codes use X to normalize H. Now H itself is used for the normalization. This should not matter too much.
+    # 3. Now, after setting small exposures to 0, we renormalize so that each sample is summed to one. This is conceptually
+    #    appealing if we want to use cosine distances later.
+    H = normalize(H, norm='l1', axis=0)
+    H[H < frac_thresh] = 0.0
+    H = normalize(H, norm='l1', axis=0)
+
+    ### Clustering
+    d_square_form, cluster_membership = hierarchical_cluster(H, 2, metric='cosine', linkage_method='average')
+    clust1_index = np.arange(0, n_samples)[(cluster_membership == 1)]
+    clust2_index = np.arange(0, n_samples)[(cluster_membership == 2)]
+    H_clust1 = H[:, clust1_index]
+    H_clust2 = H[:, clust2_index]
+
+    ### Identify cluster-specific signatures
+    n_clust1 = len(clust1_index) # Number of samples in cluster 1
+    n_clust2 = len(clust2_index) # Number of samples in cluster 2
+    n_clust1_pos = np.sum(H_clust1 > 0, 1) # Number of samples in cluster 1 that have positive exposures of the signature, one per signature
+    n_clust2_pos = np.sum(H_clust2 > 0, 1) # Number of samples in cluster 2 that have positive exposures of the signature, one per signature
+    pvalue_clust1_greater = [] # p-values for whether the signature has greater exposures in cluster 1 than in cluster 2, one per signature
+    pvalue_clust2_greater = [] # p-values for whether the signature has greater exposures in cluster 2 than in cluster 1, one per signature
+    for h1, h2 in zip(H_clust1, H_clust2):
+        try:
+            p = stats.mannwhitneyu(h1, h2, alternative='greater')[1]
+        except: # E.g., when the exposure of a signature is all 0 in the dataset. This could happen in later iterations of running the function.
+            p = 1.0
+        pvalue_clust1_greater.append(p)
+        try:
+            p = stats.mannwhitneyu(h2, h1, alternative='greater')[1]
+        except:
+            p = 1.0
+        pvalue_clust2_greater.append(p)
+    pvalue_clust1_greater = np.array(pvalue_clust1_greater)
+    pvalue_clust2_greater = np.array(pvalue_clust2_greater)
+    # A signature is considered to be specific to cluster 1, if
+    # (the exposure is significantly greater in cluster 1 and less than 10% of samples in cluster 2 have positive exposures) or
+    # (more than 90% of samples in cluster 1 and less than 10% of samples in cluster 2 have positive exposures)
+    sigs_clust1_specific = np.arange(0, n_components)[np.logical_or(
+        np.logical_and(pvalue_clust1_greater < 0.05, n_clust2_pos/n_clust2 < 0.1),
+        np.logical_and(n_clust1_pos/n_clust1 > 0.9, n_clust2_pos/n_clust2 < 0.1)
+    )]
+    sigs_clust2_specific = np.arange(0, n_components)[np.logical_or(
+        np.logical_and(pvalue_clust2_greater < 0.05, n_clust1_pos/n_clust1 < 0.1),
+        np.logical_and(n_clust2_pos/n_clust2 > 0.9, n_clust1_pos/n_clust1 < 0.1)
+    )]
+    # If a signature has positive exposures in more than 10% of samples in both cluster 1 and 2, it is considered non-specific.
+    #sigs_nonspecific = np.arange(0, n_components)[
+    #    np.logical_and(n_clust1_pos/n_clust1 > 0.1, n_clust2_pos/n_clust2 > 0.1)
+    #]
+
+    ### Calculate mean exposures for cluster-specific and nonspecific signatures in the 2 clusters separately.
+    if len(sigs_clust1_specific) > 0:
+        mean_fraction_clust1_sigs_in_clust1 = np.mean(np.sum(H_clust1[sigs_clust1_specific, :], 0))
+        mean_fraction_clust1_sigs_in_clust2 = np.mean(np.sum(H_clust2[sigs_clust1_specific, :], 0))
+    else:
+        mean_fraction_clust1_sigs_in_clust1 = np.nan
+        mean_fraction_clust1_sigs_in_clust2 = np.nan
+    if len(sigs_clust2_specific) > 0:
+        mean_fraction_clust2_sigs_in_clust1 = np.mean(np.sum(H_clust1[sigs_clust2_specific, :], 0))
+        mean_fraction_clust2_sigs_in_clust2 = np.mean(np.sum(H_clust2[sigs_clust2_specific, :], 0))
+    else:
+        mean_fraction_clust2_sigs_in_clust1 = np.nan
+        mean_fraction_clust2_sigs_in_clust2 = np.nan
+    #if len(sigs_nonspecific) > 0:
+    #    mean_fraction_nonspecific_sigs_in_clust1 = np.mean(np.sum(H_clust1[sigs_nonspecific, :], 0))
+    #    mean_fraction_nonspecific_sigs_in_clust2 = np.mean(np.sum(H_clust2[sigs_nonspecific, :], 0))
+    #else:
+    #    mean_fraction_nonspecific_sigs_in_clust1 = np.nan
+    #    mean_fraction_nonspecific_sigs_in_clust2 = np.nan
+
+    ### Determine whether there is a distinct cluster.
+    # Cluster 1 is considered distinct, if cluster 1 specific signatures contribute less than 0.05 in cluster 2 and more than 0.3 in cluster 1.
+    if ((mean_fraction_clust1_sigs_in_clust2 < 0.05 and mean_fraction_clust1_sigs_in_clust1 > 0.3) and
+        (mean_fraction_clust2_sigs_in_clust1 < 0.05 and mean_fraction_clust2_sigs_in_clust2 > 0.3)):
+        Xs = [X[:, clust1_index], X[:, clust2_index]]
+        clusters = [clust1_index, clust2_index]
+        k = 2
+        distinct = [True, True]
+    elif (mean_fraction_clust1_sigs_in_clust2 < 0.05 and mean_fraction_clust1_sigs_in_clust1 > 0.3):
+        Xs = [X[:, clust1_index], X[:, clust2_index]]
+        clusters = [clust1_index, clust2_index]
+        k = 2
+        distinct = [True, False]
+    elif (mean_fraction_clust2_sigs_in_clust1 < 0.05 and mean_fraction_clust2_sigs_in_clust2 > 0.3):
+        Xs = [X[:, clust1_index], X[:, clust2_index]]
+        clusters = [clust1_index, clust2_index]
+        k = 2
+        distinct = [False, True]
+    else:
+        Xs = [X]
+        clusters = [np.arange(0, n_samples)]
+        k = 1
+        distinct = None
+
+    return k, clusters, Xs, distinct
 
 
 def stratify_samples(X, H=None, sil_thresh=0.9,
-                     max_k=20, nrefs=50, metric='cosine', linkage_method='average', ref_method='a'):
-    """Stratify samples by clustering with automatic selection of cluster number.
-
-    If H is provided, H will be used for clustering. Otherwise, X will be used.
-
-    Parameters:
-    ----------
-    X : array-like of shape (n_features, n_samples)
-        Input data matrix.
-
-    H : array-like of shape (n_components, n_samples)
-        Optional exposure matrix.
-
-    sil_thresh : float, default 0.9
-        Silhouette score threshold. After determining the optimal number of clusters (k) by OptimalK,
-        if the optimal k is greater than 1, we check the per-cluster silhouette scores. If there is at least
-        one cluster with a silhouette score > sil_thresh, we accept the clustering into k clusters. Otherwise,
-        we reject the clustering, and take k = 1 instead. This helps distinguish distinct clusters such as
-        MMRD vs. MMRP, versus not-so-distinct clusters such as APOBEC vs. non-APOBEC.
-
-    max_k : int, default 20
-        Maximum number of clusters to be tested.
-
-    nrefs : int
-        Used in OptimalK. Number of reference datasets to be simulated.
-
-    Returns:
-    ----------
-    k : int
-        The optimal number of clusters. If k == 1, then no stratification is done.
-
-    clusters : list of numpy arrays
-        The indices of samples in distinct clusters. For example, the exposures of distinct clusters
-        can be obtained by [H[:, indices] for indices in clusters]
-
-    Xs : list of numpy arrays
-        Samples in distinct clusters.
-
-    optimalK : object of OptimalK class
-        Contains information about the clustering. For example, use optimalK.plot() to visualize the selection curves.
-
-    Notes:
-    ----------
-    1. Currently, sil_thresh is used in the way that if there is at least one cluster with silhouette score > sil_thresh, we
-       take the entire clustering returned by OptimalK. One may argue that we need k - 1 clusters with silhouette score > sil_thresh,
-       i.e., at most 1 cluster with silhouette score < sil_thresh, to accept the full clustering. Otherwise a smaller k might be
-       more appropriate. This is a bit more complicated. So we ignore this for now.
-    """
-
-    if H is None:
-        data = normalize(X, norm='l1', axis=0)
-    else:
-        data = normalize(H, norm='l1', axis=0)
-
-    n_samples = data.shape[1]
-    # Clustering with automatic selection of cluster number
-    optimalK = OptimalK(data, max_k=max_k, nrefs=nrefs, metric=metric, linkage_method=linkage_method, ref_method=ref_method)
-    # Gather results
-    k = optimalK.k # Number of clusters
-    if k > 1:
-        # If k > 1, we check per-cluster silhouette scores.
-        # If at least one cluster has silhouette score > sil_thresh, we accept the clustering.
-        # Otherwise we reject it.
-        if np.any(optimalK.silscorek_percluster[k] > sil_thresh):
-            cluster_membership = optimalK.cluster_membership
-            clusters = []
-            Xs = []
-            for i in sorted(list(set(cluster_membership))):
-                indices = np.arange(0, n_samples)[cluster_membership == i]
-                clusters.append(indices)
-                Xs.append(X[:, indices])
-        else:
-            k = 1
-            clusters = [np.arange(0, n_samples)]
-            Xs = [X]
-    else:
-        k = 1
-        clusters = [np.arange(0, n_samples)]
-        Xs = [X]
-    return k, clusters, Xs, optimalK
+                     max_k=20, nrefs=50, metric='cosine', linkage_method='average', ref_method='a'):
+    """Stratify samples by clustering with automatic selection of cluster number.
+
+    If H is provided, H will be used for clustering. Otherwise, X will be used.
+
+    Parameters:
+    ----------
+    X : array-like of shape (n_features, n_samples)
+        Input data matrix.
+
+    H : array-like of shape (n_components, n_samples)
+        Optional exposure matrix.
+
+    sil_thresh : float, default 0.9
+        Silhouette score threshold. After determining the optimal number of clusters (k) by OptimalK,
+        if the optimal k is greater than 1, we check the per-cluster silhouette scores. If there is at least
+        one cluster with a silhouette score > sil_thresh, we accept the clustering into k clusters. Otherwise,
+        we reject the clustering, and take k = 1 instead. This helps distinguish distinct clusters such as
+        MMRD vs. MMRP, versus not-so-distinct clusters such as APOBEC vs. non-APOBEC.
+
+    max_k : int, default 20
+        Maximum number of clusters to be tested.
+
+    nrefs : int
+        Used in OptimalK. Number of reference datasets to be simulated.
+
+    Returns:
+    ----------
+    k : int
+        The optimal number of clusters. If k == 1, then no stratification is done.
+
+    clusters : list of numpy arrays
+        The indices of samples in distinct clusters. For example, the exposures of distinct clusters
+        can be obtained by [H[:, indices] for indices in clusters]
+
+    Xs : list of numpy arrays
+        Samples in distinct clusters.
+
+    optimalK : object of OptimalK class
+        Contains information about the clustering. For example, use optimalK.plot() to visualize the selection curves.
+
+    Notes:
+    ----------
+    1. Currently, sil_thresh is used in the way that if there is at least one cluster with silhouette score > sil_thresh, we
+       take the entire clustering returned by OptimalK. One may argue that we need k - 1 clusters with silhouette score > sil_thresh,
+       i.e., at most 1 cluster with silhouette score < sil_thresh, to accept the full clustering. Otherwise a smaller k might be
+       more appropriate. This is a bit more complicated. So we ignore this for now.
+    """
+
+    if H is None:
+        data = normalize(X, norm='l1', axis=0)
+    else:
+        data = normalize(H, norm='l1', axis=0)
+
+    n_samples = data.shape[1]
+    # Clustering with automatic selection of cluster number
+    optimalK = OptimalK(data, max_k=max_k, nrefs=nrefs, metric=metric, linkage_method=linkage_method, ref_method=ref_method)
+    # Gather results
+    k = optimalK.k # Number of clusters
+    if k > 1:
+        # If k > 1, we check per-cluster silhouette scores.
+        # If at least one cluster has silhouette score > sil_thresh, we accept the clustering.
+        # Otherwise we reject it.
+        if np.any(optimalK.silscorek_percluster[k] > sil_thresh):
+            cluster_membership = optimalK.cluster_membership
+            clusters = []
+            Xs = []
+            for i in sorted(list(set(cluster_membership))):
+                indices = np.arange(0, n_samples)[cluster_membership == i]
+                clusters.append(indices)
+                Xs.append(X[:, indices])
+        else:
+            k = 1
+            clusters = [np.arange(0, n_samples)]
+            Xs = [X]
+    else:
+        k = 1
+        clusters = [np.arange(0, n_samples)]
+        Xs = [X]
+    return k, clusters, Xs, optimalK
\ No newline at end of file
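Note on the gini() change above: it only vectorizes the arithmetic; the formula G = sum_i x_i * (2i - n + 1) / (n * sum(x)) for x sorted in increasing order is unchanged. A minimal sanity check that the old list-comprehension version and the new NumPy version agree (the helper names below are illustrative, not part of the module):

import numpy as np

def gini_listcomp(x):
    # Old implementation: pure-Python loop over the sorted values.
    n = len(x)
    aux = [xi * (2 * i - n + 1) for i, xi in enumerate(x)]
    return sum(aux) / (n * sum(x))

def gini_vectorized(x):
    # New implementation: identical formula using NumPy array operations.
    n = len(x)
    return np.sum(x * (2 * np.arange(n) - n + 1)) / (n * np.sum(x))

x = np.sort(np.random.default_rng(0).gamma(2.0, size=1000))
assert np.isclose(gini_listcomp(x), gini_vectorized(x))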
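The hunks in remove_samples_based_on_gini() keep its interface intact: samples_to_keep[sig_index] is now produced by np.sort() instead of np.array(sorted(...)), and the function still builds the results tuple (samples_to_keep, X_to_keep, samples_to_keep_all). A usage sketch, assuming the function returns that tuple unchanged and using synthetic inputs:

import numpy as np
from musical.preprocessing import remove_samples_based_on_gini

rng = np.random.default_rng(0)
X = rng.poisson(5.0, size=(96, 50)).astype(float)  # hypothetical count matrix (n_features, n_samples)
H = rng.gamma(1.0, size=(5, 50))                   # hypothetical exposure matrix (n_components, n_samples)

samples_to_keep, X_to_keep, samples_to_keep_all = remove_samples_based_on_gini(H, X)
X_filtered = X[:, samples_to_keep_all]  # cohort with the Gini-flagged samples dropped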
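identify_distinct_cluster() only ever performs 2-cluster separations; per its docstring it is meant to be run iteratively until no further separation occurs. One way to drive that loop is sketched below. Re-slicing the same H per cluster is our simplification; in practice one might prefer to refit exposures on each subcohort:

import numpy as np
from musical.preprocessing import identify_distinct_cluster

def peel_distinct_clusters(X, H, frac_thresh=0.05):
    # Repeatedly split off distinct clusters until every subcohort returns k == 1.
    queue = [(X, H, np.arange(X.shape[1]))]
    final = []
    while queue:
        Xi, Hi, idx = queue.pop()
        k, clusters, Xs, distinct = identify_distinct_cluster(Xi, Hi, frac_thresh=frac_thresh)
        if k == 1:
            final.append(idx)  # no distinct cluster left; keep as one group
        else:
            for c, Xc in zip(clusters, Xs):
                queue.append((Xc, Hi[:, c], idx[c]))
    return final  # list of index arrays into the original cohort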
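Finally, a short usage sketch for stratify_samples(), using only what its docstring documents (X and H as in the earlier sketch):

from musical.preprocessing import stratify_samples

k, clusters, Xs, optimalK = stratify_samples(X, H=H, sil_thresh=0.9, max_k=10)
if k > 1:
    # Exposures per cluster, as suggested in the docstring.
    exposures = [H[:, indices] for indices in clusters]
optimalK.plot()  # visualize the cluster-number selection curves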