From 59b68f7f47a224a0b973e23fb72cd9c08c743e89 Mon Sep 17 00:00:00 2001
From: BeGeiger <benedikt-geiger@gmx.de>
Date: Mon, 6 Feb 2023 21:44:32 -0500
Subject: [PATCH 1/3] Bug fixes remove_samples_based_on_gini

The function remove_samples_based_on_gini caused an error when the exposure matrix H or mutation type matix X were pandas dataframes. Several bugs in the logic of the implementation were fixed. A detailed docstring was added.
---
 musical/preprocessing.py | 159 ++++++++++++++++++++++++++-------------
 1 file changed, 108 insertions(+), 51 deletions(-)

diff --git a/musical/preprocessing.py b/musical/preprocessing.py
index 3738170..7506fa3 100644
--- a/musical/preprocessing.py
+++ b/musical/preprocessing.py
@@ -7,58 +7,115 @@
 from .cluster import OptimalK, hierarchical_cluster
 
 
+def sort_with_indices(x):
+	"""
+	x has to be a numpy array
+	"""
+	indices_sorted = np.argsort(x)
+
+	return x[indices_sorted], indices_sorted
+
+
 def gini(x):
-    mad = np.abs(np.subtract.outer(x, x)).mean()
-    rmad = mad/np.mean(x)
-    g = 0.5 * rmad
-    return g
-
-def remove_samples_based_on_gini(H, X, gini_baseline = 0.65, gini_delta = 0.005, per_signature = True):
-    gini_vec = []
-    gini_vec = np.array(gini_vec)
-    for h in H:
-        h_norm = h/np.sum(X, axis = 0)
-        gini_this = gini(h_norm)
-        gini_vec = np.append(gini_vec, gini_this)
-
-    inds_columns_to_check = np.where(gini_vec > gini_baseline)
-
-    indices_to_remove = []
-    indices_to_remove = np.array(inds_columns_to_check)
-
-    list_indices_to_keep = {}
-#    for i in np.array(inds_columns_to_check).tolist()[0]:
-    for i in inds_columns_to_check[0]:
-        h_norm = H[i,:]/np.sum(X, axis = 0)
-        index = h_norm.size
-        delta = 1
-        while delta > gini_delta:
-            gini_this = gini(np.sort(h_norm)[1:index])
-            gini_bef = gini(np.sort(h_norm)[1:(index - 1)])
-            delta = gini_this - gini_bef
-            index = index - 1
-            if index < np.around(h_norm.size * 0.8):
-                break
-        to_keep = np.where(h_norm < np.sort(h_norm)[index])
-        to_remove = np.where(h_norm >= np.sort(h_norm)[index])
-        list_indices_to_keep[i] = to_keep
-        indices_to_remove = np.append(indices_to_remove, to_remove)
-
-    indices_to_remove = np.unique(indices_to_remove)
-
-    if per_signature:
-        list_X = {}
-        indices_to_keep_all = np.array(range(0, H.shape[1]))
-        for i in inds_columns_to_check[0]:
-            X_this = X[:,list_indices_to_keep[i][0]]
-            list_X[i] = X_this
-            indices_this = list_indices_to_keep[i][0]
-            indices_to_keep_all = np.array([i for i in indices_to_keep_all if i in indices_this])
-        return(list_X, inds_columns_to_check, list_indices_to_keep, indices_to_keep_all)
-    else:
-        X_this = np.delete(X, indices_to_remove, axis = 1)
-        indices_to_keep = np.array([i for i in range(0, index)  if i not in indices_to_remove])
-        return(X_this, indices_to_keep)
+	"""
+	x is assumed to be increasing
+	"""
+	n = len(x)
+	aux = [xi * (2 * i - n + 1) for i, xi in enumerate(x)]
+	scaling = 1 / (n * sum(x))
+	
+	return scaling * sum(aux)
+
+
+def n_remove_gini(x, gini_delta, thresh):
+	"""
+	Identify how many of the largest values of x significantly contribute to its Gini coefficient.
+	Do not identify more than (1 - thresh) * 100 % of x.
+	x is assumed to be increasing.
+	"""
+	n_remove = 0
+	max_n_remove = np.round((1 - thresh) * len(x))
+	
+	gini_old = gini(x)
+	gini_new = gini(x[:-1])
+	n_remove = 0
+	
+	while gini_old - gini_new > gini_delta and n_remove < max_n_remove:
+	
+		n_remove += 1
+		gini_old = gini_new
+		gini_new = gini(x[: - n_remove - 1])
+	
+	return n_remove
+
+
+def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005):
+	"""
+	Identify signatures with unequal exposures. A signature is said to have unequal exposures if the
+	Gini coefficient of the sample exposures is higher than a given threshold.
+	For these signatures, the samples causing the gini coefficient to be high are also identified.
+	
+	Input:
+	------
+	H: np.ndarray
+		The exposure matrix of shape (n_signatures, n_samples)
+	
+	X: np.ndarray
+		The mutation count matrix of shape (n_features, n_samples)
+	
+	gini_baseline: float
+		Signatures with exposures having a higher Gini coefficient than 'gini_baseline' are identified
+		as having unequal exposures
+	
+	gini_delta: float
+		Per signature with unequal exposure, a sample is identified as a sample significanlty contributing 
+		the high Gini coefficient if removing it decreases the Gini coefficient by at least 'gini_delta'
+	
+	Output:
+	------
+	samples_to_keep: dict
+		keys: indices of signatures with unequal exposures
+		values: corresponding sample indices that do not (!) cause the Gini coefficient to be high
+	
+	X_to_keep: dict
+		keys: indices of signatures with unequal exposures
+		values: mutation count matix subsetted to the samples that do not (!) cause the Gini coefficient to be high
+	
+	samples_to_keep_all: np.ndarray
+		List of sample indices not significantly causing the Gini coefficient of any signature with unequal exposure to be high
+	"""
+	H, X = np.array(H), np.array(X)
+	
+	n_samples = H.shape[1]
+	
+	# normalize the exposures
+	H = H /np.sum(X, axis=0)
+	
+	# Gini coefficients of normalized signature exposures
+	gini_coeffs = np.array([gini(sorted(h)) for h in H])
+	sigs_to_check = np.where(gini_coeffs > gini_baseline)[0]
+	
+	samples_to_keep = {}
+	samples_to_remove = set()
+	
+	for sig_index in sigs_to_check:
+		
+		sorted_h, sorted_h_indices = sort_with_indices(H[sig_index,:])
+		n_remove = n_remove_gini(sorted_h, gini_delta, .8)
+		
+		to_keep, to_remove = np.split(sorted_h_indices, [-n_remove]) if n_remove else (sorted_h_indices, np.empty(0))
+		samples_to_keep[sig_index] = np.array(sorted(to_keep))
+		samples_to_remove |= set(to_remove)
+	
+	X_to_keep = {sig_index: X[:, samples] for sig_index, samples in samples_to_keep.items()}
+		
+	samples_to_keep_all = set(range(n_samples)) - samples_to_remove
+	samples_to_keep_all = np.array(sorted(samples_to_keep_all))
+	
+	results = (samples_to_keep, X_to_keep, samples_to_keep_all)
+	
+	return results
+
 
 def identify_distinct_cluster(X, H, frac_thresh=0.05):
     """Identify distinct clusters from the cohort based on exposures.

From 199e68b9e8a701ca33b686ae1b757a0d2dc72b16 Mon Sep 17 00:00:00 2001
From: BeGeiger <benedikt-geiger@gmx.de>
Date: Mon, 6 Feb 2023 21:53:16 -0500
Subject: [PATCH 2/3] Indentation fixed

---
 musical/preprocessing.py | 42 ++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/musical/preprocessing.py b/musical/preprocessing.py
index 7506fa3..ee3825d 100644
--- a/musical/preprocessing.py
+++ b/musical/preprocessing.py
@@ -23,7 +23,7 @@ def gini(x):
 	n = len(x)
 	aux = [xi * (2 * i - n + 1) for i, xi in enumerate(x)]
 	scaling = 1 / (n * sum(x))
-	
+
 	return scaling * sum(aux)
 
 
@@ -35,17 +35,17 @@ def n_remove_gini(x, gini_delta, thresh):
 	"""
 	n_remove = 0
 	max_n_remove = np.round((1 - thresh) * len(x))
-	
+
 	gini_old = gini(x)
 	gini_new = gini(x[:-1])
 	n_remove = 0
 	
 	while gini_old - gini_new > gini_delta and n_remove < max_n_remove:
-	
+
 		n_remove += 1
 		gini_old = gini_new
 		gini_new = gini(x[: - n_remove - 1])
-	
+
 	return n_remove
 
 
@@ -54,66 +54,66 @@ def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005):
 	Identify signatures with unequal exposures. A signature is said to have unequal exposures if the
 	Gini coefficient of the sample exposures is higher than a given threshold.
 	For these signatures, the samples causing the gini coefficient to be high are also identified.
-	
+
 	Input:
 	------
 	H: np.ndarray
 		The exposure matrix of shape (n_signatures, n_samples)
-	
+
 	X: np.ndarray
 		The mutation count matrix of shape (n_features, n_samples)
-	
+
 	gini_baseline: float
 		Signatures with exposures having a higher Gini coefficient than 'gini_baseline' are identified
 		as having unequal exposures
-	
+
 	gini_delta: float
 		Per signature with unequal exposure, a sample is identified as a sample significanlty contributing 
 		the high Gini coefficient if removing it decreases the Gini coefficient by at least 'gini_delta'
-	
+
 	Output:
 	------
 	samples_to_keep: dict
 		keys: indices of signatures with unequal exposures
 		values: corresponding sample indices that do not (!) cause the Gini coefficient to be high
-	
+
 	X_to_keep: dict
 		keys: indices of signatures with unequal exposures
 		values: mutation count matix subsetted to the samples that do not (!) cause the Gini coefficient to be high
-	
+
 	samples_to_keep_all: np.ndarray
 		List of sample indices not significantly causing the Gini coefficient of any signature with unequal exposure to be high
 	"""
 	H, X = np.array(H), np.array(X)
-	
+
 	n_samples = H.shape[1]
-	
+
 	# normalize the exposures
 	H = H /np.sum(X, axis=0)
-	
+
 	# Gini coefficients of normalized signature exposures
 	gini_coeffs = np.array([gini(sorted(h)) for h in H])
 	sigs_to_check = np.where(gini_coeffs > gini_baseline)[0]
-	
+
 	samples_to_keep = {}
 	samples_to_remove = set()
-	
+
 	for sig_index in sigs_to_check:
-		
+
 		sorted_h, sorted_h_indices = sort_with_indices(H[sig_index,:])
 		n_remove = n_remove_gini(sorted_h, gini_delta, .8)
-		
+
 		to_keep, to_remove = np.split(sorted_h_indices, [-n_remove]) if n_remove else (sorted_h_indices, np.empty(0))
 		samples_to_keep[sig_index] = np.array(sorted(to_keep))
 		samples_to_remove |= set(to_remove)
 	
 	X_to_keep = {sig_index: X[:, samples] for sig_index, samples in samples_to_keep.items()}
-		
+
 	samples_to_keep_all = set(range(n_samples)) - samples_to_remove
 	samples_to_keep_all = np.array(sorted(samples_to_keep_all))
-	
+
 	results = (samples_to_keep, X_to_keep, samples_to_keep_all)
-	
+
 	return results
 
 

From 01b78e563a658bd7ae0bc4fd3aa061bdc7255120 Mon Sep 17 00:00:00 2001
From: Hu Jin <jinhu.stanley@gmail.com>
Date: Mon, 6 Feb 2023 22:14:26 -0500
Subject: [PATCH 3/3] Minor change to allow wider use of gini()

---
 musical/preprocessing.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/musical/preprocessing.py b/musical/preprocessing.py
index ee3825d..74000f9 100644
--- a/musical/preprocessing.py
+++ b/musical/preprocessing.py
@@ -16,10 +16,12 @@ def sort_with_indices(x):
 	return x[indices_sorted], indices_sorted
 
 
-def gini(x):
+def gini(x, assume_sorted=True):
 	"""
 	x is assumed to be increasing
 	"""
+	if not assume_sorted:
+		x = np.sort(x)
 	n = len(x)
 	aux = [xi * (2 * i - n + 1) for i, xi in enumerate(x)]
 	scaling = 1 / (n * sum(x))
@@ -39,7 +41,7 @@ def n_remove_gini(x, gini_delta, thresh):
 	gini_old = gini(x)
 	gini_new = gini(x[:-1])
 	n_remove = 0
-	
+
 	while gini_old - gini_new > gini_delta and n_remove < max_n_remove:
 
 		n_remove += 1
@@ -68,7 +70,7 @@ def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005):
 		as having unequal exposures
 
 	gini_delta: float
-		Per signature with unequal exposure, a sample is identified as a sample significanlty contributing 
+		Per signature with unequal exposure, a sample is identified as a sample significanlty contributing
 		the high Gini coefficient if removing it decreases the Gini coefficient by at least 'gini_delta'
 
 	Output:
@@ -92,7 +94,7 @@ def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005):
 	H = H /np.sum(X, axis=0)
 
 	# Gini coefficients of normalized signature exposures
-	gini_coeffs = np.array([gini(sorted(h)) for h in H])
+	gini_coeffs = np.array([gini(np.sort(h)) for h in H])
 	sigs_to_check = np.where(gini_coeffs > gini_baseline)[0]
 
 	samples_to_keep = {}
@@ -106,7 +108,7 @@ def remove_samples_based_on_gini(H, X, gini_baseline=.65, gini_delta=.005):
 		to_keep, to_remove = np.split(sorted_h_indices, [-n_remove]) if n_remove else (sorted_h_indices, np.empty(0))
 		samples_to_keep[sig_index] = np.array(sorted(to_keep))
 		samples_to_remove |= set(to_remove)
-	
+
 	X_to_keep = {sig_index: X[:, samples] for sig_index, samples in samples_to_keep.items()}
 
 	samples_to_keep_all = set(range(n_samples)) - samples_to_remove
@@ -313,12 +315,12 @@ def stratify_samples(X, H=None, sil_thresh=0.9,
         i.e., at most 1 cluster with silhouette score < sil_thresh, to accept the full clustering. Otherwise a smaller k might be
         more appropriate. This is a bit more complicated. So we ignore this for now.
     """
-    
+
     if H is None:
         data = normalize(X, norm='l1', axis=0)
     else:
         data = normalize(H, norm='l1', axis=0)
-        
+
     n_samples = data.shape[1]
     # Clustering with automatic selection of cluster number
     optimalK = OptimalK(data, max_k=max_k, nrefs=nrefs, metric=metric, linkage_method=linkage_method, ref_method=ref_method)
@@ -327,7 +329,7 @@ def stratify_samples(X, H=None, sil_thresh=0.9,
     if k > 1:
         # If k > 1, we check per-cluster silhouette scores.
         # If at least one cluster has silhouette score > sil_thresh, we accept the clustering.
-        # Otherwise we reject it. 
+        # Otherwise we reject it.
         if np.any(optimalK.silscorek_percluster[k] > sil_thresh):
             cluster_membership = optimalK.cluster_membership
             clusters = []