From 59b68f7f47a224a0b973e23fb72cd9c08c743e89 Mon Sep 17 00:00:00 2001 From: BeGeiger Date: Mon, 6 Feb 2023 21:44:32 -0500 Subject: [PATCH 1/3] Bug fixes remove_samples_based_on_gini The function remove_samples_based_on_gini caused an error when the exposure matrix H or mutation type matix X were pandas dataframes. Several bugs in the logic of the implementation were fixed. A detailed docstring was added. --- musical/preprocessing.py | 159 ++++++++++++++++++++++++++------------- 1 file changed, 108 insertions(+), 51 deletions(-) diff --git a/musical/preprocessing.py b/musical/preprocessing.py index 3738170..7506fa3 100644 --- a/musical/preprocessing.py +++ b/musical/preprocessing.py @@ -7,58 +7,115 @@ from .cluster import OptimalK, hierarchical_cluster +def sort_with_indices(x): + """ + x has to be a numpy array + """ + indices_sorted = np.argsort(x) + + return x[indices_sorted], indices_sorted + + def gini(x): - mad = np.abs(np.subtract.outer(x, x)).mean() - rmad = mad/np.mean(x) - g = 0.5 * rmad - return g - -def remove_samples_based_on_gini(H, X, gini_baseline = 0.65, gini_delta = 0.005, per_signature = True): - gini_vec = [] - gini_vec = np.array(gini_vec) - for h in H: - h_norm = h/np.sum(X, axis = 0) - gini_this = gini(h_norm) - gini_vec = np.append(gini_vec, gini_this) - - inds_columns_to_check = np.where(gini_vec > gini_baseline) - - indices_to_remove = [] - indices_to_remove = np.array(inds_columns_to_check) - - list_indices_to_keep = {} -# for i in np.array(inds_columns_to_check).tolist()[0]: - for i in inds_columns_to_check[0]: - h_norm = H[i,:]/np.sum(X, axis = 0) - index = h_norm.size - delta = 1 - while delta > gini_delta: - gini_this = gini(np.sort(h_norm)[1:index]) - gini_bef = gini(np.sort(h_norm)[1:(index - 1)]) - delta = gini_this - gini_bef - index = index - 1 - if index < np.around(h_norm.size * 0.8): - break - to_keep = np.where(h_norm < np.sort(h_norm)[index]) - to_remove = np.where(h_norm >= np.sort(h_norm)[index]) - list_indices_to_keep[i] = to_keep - indices_to_remove = np.append(indices_to_remove, to_remove) - - indices_to_remove = np.unique(indices_to_remove) - - if per_signature: - list_X = {} - indices_to_keep_all = np.array(range(0, H.shape[1])) - for i in inds_columns_to_check[0]: - X_this = X[:,list_indices_to_keep[i][0]] - list_X[i] = X_this - indices_this = list_indices_to_keep[i][0] - indices_to_keep_all = np.array([i for i in indices_to_keep_all if i in indices_this]) - return(list_X, inds_columns_to_check, list_indices_to_keep, indices_to_keep_all) - else: - X_this = np.delete(X, indices_to_remove, axis = 1) - indices_to_keep = np.array([i for i in range(0, index) if i not in indices_to_remove]) - return(X_this, indices_to_keep) + """ + x is assumed to be increasing + """ + n = len(x) + aux = [xi * (2 * i - n + 1) for i, xi in enumerate(x)] + scaling = 1 / (n * sum(x)) + + return scaling * sum(aux) + + +def n_remove_gini(x, gini_delta, thresh): + """ + Identify how many of the largest values of x significantly contribute to its Gini coefficient. + Do not identify more than (1 - thresh) * 100 % of x. + x is assumed to be increasing. + """ + n_remove = 0 + max_n_remove = np.round((1 - thresh) * len(x)) + + gini_old = gini(x) + gini_new = gini(x[:-1]) + n_remove = 0 + + while gini_old - gini_new > gini_delta and n_remove < max_n_remove: + + n_remove += 1 + gini_old = gini_new + gini_new = gini(x[: - n_remove - 1]) + + return n_remove + + +def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005): + """ + Identify signatures with unequal exposures. A signature is said to have unequal exposures if the + Gini coefficient of the sample exposures is higher than a given threshold. + For these signatures, the samples causing the gini coefficient to be high are also identified. + + Input: + ------ + H: np.ndarray + The exposure matrix of shape (n_signatures, n_samples) + + X: np.ndarray + The mutation count matrix of shape (n_features, n_samples) + + gini_baseline: float + Signatures with exposures having a higher Gini coefficient than 'gini_baseline' are identified + as having unequal exposures + + gini_delta: float + Per signature with unequal exposure, a sample is identified as a sample significanlty contributing + the high Gini coefficient if removing it decreases the Gini coefficient by at least 'gini_delta' + + Output: + ------ + samples_to_keep: dict + keys: indices of signatures with unequal exposures + values: corresponding sample indices that do not (!) cause the Gini coefficient to be high + + X_to_keep: dict + keys: indices of signatures with unequal exposures + values: mutation count matix subsetted to the samples that do not (!) cause the Gini coefficient to be high + + samples_to_keep_all: np.ndarray + List of sample indices not significantly causing the Gini coefficient of any signature with unequal exposure to be high + """ + H, X = np.array(H), np.array(X) + + n_samples = H.shape[1] + + # normalize the exposures + H = H /np.sum(X, axis=0) + + # Gini coefficients of normalized signature exposures + gini_coeffs = np.array([gini(sorted(h)) for h in H]) + sigs_to_check = np.where(gini_coeffs > gini_baseline)[0] + + samples_to_keep = {} + samples_to_remove = set() + + for sig_index in sigs_to_check: + + sorted_h, sorted_h_indices = sort_with_indices(H[sig_index,:]) + n_remove = n_remove_gini(sorted_h, gini_delta, .8) + + to_keep, to_remove = np.split(sorted_h_indices, [-n_remove]) if n_remove else (sorted_h_indices, np.empty(0)) + samples_to_keep[sig_index] = np.array(sorted(to_keep)) + samples_to_remove |= set(to_remove) + + X_to_keep = {sig_index: X[:, samples] for sig_index, samples in samples_to_keep.items()} + + samples_to_keep_all = set(range(n_samples)) - samples_to_remove + samples_to_keep_all = np.array(sorted(samples_to_keep_all)) + + results = (samples_to_keep, X_to_keep, samples_to_keep_all) + + return results + def identify_distinct_cluster(X, H, frac_thresh=0.05): """Identify distinct clusters from the cohort based on exposures. From 199e68b9e8a701ca33b686ae1b757a0d2dc72b16 Mon Sep 17 00:00:00 2001 From: BeGeiger Date: Mon, 6 Feb 2023 21:53:16 -0500 Subject: [PATCH 2/3] Indentation fixed --- musical/preprocessing.py | 42 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/musical/preprocessing.py b/musical/preprocessing.py index 7506fa3..ee3825d 100644 --- a/musical/preprocessing.py +++ b/musical/preprocessing.py @@ -23,7 +23,7 @@ def gini(x): n = len(x) aux = [xi * (2 * i - n + 1) for i, xi in enumerate(x)] scaling = 1 / (n * sum(x)) - + return scaling * sum(aux) @@ -35,17 +35,17 @@ def n_remove_gini(x, gini_delta, thresh): """ n_remove = 0 max_n_remove = np.round((1 - thresh) * len(x)) - + gini_old = gini(x) gini_new = gini(x[:-1]) n_remove = 0 while gini_old - gini_new > gini_delta and n_remove < max_n_remove: - + n_remove += 1 gini_old = gini_new gini_new = gini(x[: - n_remove - 1]) - + return n_remove @@ -54,66 +54,66 @@ def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005): Identify signatures with unequal exposures. A signature is said to have unequal exposures if the Gini coefficient of the sample exposures is higher than a given threshold. For these signatures, the samples causing the gini coefficient to be high are also identified. - + Input: ------ H: np.ndarray The exposure matrix of shape (n_signatures, n_samples) - + X: np.ndarray The mutation count matrix of shape (n_features, n_samples) - + gini_baseline: float Signatures with exposures having a higher Gini coefficient than 'gini_baseline' are identified as having unequal exposures - + gini_delta: float Per signature with unequal exposure, a sample is identified as a sample significanlty contributing the high Gini coefficient if removing it decreases the Gini coefficient by at least 'gini_delta' - + Output: ------ samples_to_keep: dict keys: indices of signatures with unequal exposures values: corresponding sample indices that do not (!) cause the Gini coefficient to be high - + X_to_keep: dict keys: indices of signatures with unequal exposures values: mutation count matix subsetted to the samples that do not (!) cause the Gini coefficient to be high - + samples_to_keep_all: np.ndarray List of sample indices not significantly causing the Gini coefficient of any signature with unequal exposure to be high """ H, X = np.array(H), np.array(X) - + n_samples = H.shape[1] - + # normalize the exposures H = H /np.sum(X, axis=0) - + # Gini coefficients of normalized signature exposures gini_coeffs = np.array([gini(sorted(h)) for h in H]) sigs_to_check = np.where(gini_coeffs > gini_baseline)[0] - + samples_to_keep = {} samples_to_remove = set() - + for sig_index in sigs_to_check: - + sorted_h, sorted_h_indices = sort_with_indices(H[sig_index,:]) n_remove = n_remove_gini(sorted_h, gini_delta, .8) - + to_keep, to_remove = np.split(sorted_h_indices, [-n_remove]) if n_remove else (sorted_h_indices, np.empty(0)) samples_to_keep[sig_index] = np.array(sorted(to_keep)) samples_to_remove |= set(to_remove) X_to_keep = {sig_index: X[:, samples] for sig_index, samples in samples_to_keep.items()} - + samples_to_keep_all = set(range(n_samples)) - samples_to_remove samples_to_keep_all = np.array(sorted(samples_to_keep_all)) - + results = (samples_to_keep, X_to_keep, samples_to_keep_all) - + return results From 01b78e563a658bd7ae0bc4fd3aa061bdc7255120 Mon Sep 17 00:00:00 2001 From: Hu Jin Date: Mon, 6 Feb 2023 22:14:26 -0500 Subject: [PATCH 3/3] Minor change to allow wider use of gini() --- musical/preprocessing.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/musical/preprocessing.py b/musical/preprocessing.py index ee3825d..74000f9 100644 --- a/musical/preprocessing.py +++ b/musical/preprocessing.py @@ -16,10 +16,12 @@ def sort_with_indices(x): return x[indices_sorted], indices_sorted -def gini(x): +def gini(x, assume_sorted=True): """ x is assumed to be increasing """ + if not assume_sorted: + x = np.sort(x) n = len(x) aux = [xi * (2 * i - n + 1) for i, xi in enumerate(x)] scaling = 1 / (n * sum(x)) @@ -39,7 +41,7 @@ def n_remove_gini(x, gini_delta, thresh): gini_old = gini(x) gini_new = gini(x[:-1]) n_remove = 0 - + while gini_old - gini_new > gini_delta and n_remove < max_n_remove: n_remove += 1 @@ -68,7 +70,7 @@ def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005): as having unequal exposures gini_delta: float - Per signature with unequal exposure, a sample is identified as a sample significanlty contributing + Per signature with unequal exposure, a sample is identified as a sample significanlty contributing the high Gini coefficient if removing it decreases the Gini coefficient by at least 'gini_delta' Output: @@ -92,7 +94,7 @@ def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005): H = H /np.sum(X, axis=0) # Gini coefficients of normalized signature exposures - gini_coeffs = np.array([gini(sorted(h)) for h in H]) + gini_coeffs = np.array([gini(np.sort(h)) for h in H]) sigs_to_check = np.where(gini_coeffs > gini_baseline)[0] samples_to_keep = {} @@ -106,7 +108,7 @@ def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005): to_keep, to_remove = np.split(sorted_h_indices, [-n_remove]) if n_remove else (sorted_h_indices, np.empty(0)) samples_to_keep[sig_index] = np.array(sorted(to_keep)) samples_to_remove |= set(to_remove) - + X_to_keep = {sig_index: X[:, samples] for sig_index, samples in samples_to_keep.items()} samples_to_keep_all = set(range(n_samples)) - samples_to_remove @@ -313,12 +315,12 @@ def stratify_samples(X, H=None, sil_thresh=0.9, i.e., at most 1 cluster with silhouette score < sil_thresh, to accept the full clustering. Otherwise a smaller k might be more appropriate. This is a bit more complicated. So we ignore this for now. """ - + if H is None: data = normalize(X, norm='l1', axis=0) else: data = normalize(H, norm='l1', axis=0) - + n_samples = data.shape[1] # Clustering with automatic selection of cluster number optimalK = OptimalK(data, max_k=max_k, nrefs=nrefs, metric=metric, linkage_method=linkage_method, ref_method=ref_method) @@ -327,7 +329,7 @@ def stratify_samples(X, H=None, sil_thresh=0.9, if k > 1: # If k > 1, we check per-cluster silhouette scores. # If at least one cluster has silhouette score > sil_thresh, we accept the clustering. - # Otherwise we reject it. + # Otherwise we reject it. if np.any(optimalK.silscorek_percluster[k] > sil_thresh): cluster_membership = optimalK.cluster_membership clusters = []