From b27da44ec29e414311bb8eab24cde3f9904b0e69 Mon Sep 17 00:00:00 2001 From: jrudar Date: Tue, 11 Jul 2023 20:35:02 -0400 Subject: [PATCH] - Updated LANDMark version - TreeOrdination can now take advantage of LANDMark's proximity measures (both using terminal nodes as features or all nodes in the decision path as features) - Data is cast into the np.float32 dtype for the CLRClosureTransformer --- TreeOrdination/TreeOrdination.py | 28 ++++++++++++++++---------- TreeOrdination/transformers_treeord.py | 8 ++++---- environment.yml | 2 +- pyproject.toml | 4 ++-- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/TreeOrdination/TreeOrdination.py b/TreeOrdination/TreeOrdination.py index 5e9af40..9ebde97 100644 --- a/TreeOrdination/TreeOrdination.py +++ b/TreeOrdination/TreeOrdination.py @@ -5,6 +5,9 @@ from sklearn.base import ClassifierMixin, BaseEstimator, clone from sklearn.decomposition import PCA +from scipy.sparse import csr_array +from scipy.sparse import hstack as sp_hstack + from umap import UMAP from LANDMark import LANDMarkClassifier @@ -61,6 +64,7 @@ def __init__( feature_names, resampler=None, metric="hamming", + prox_method = "terminal", supervised_clf=ExtraTreesClassifier(1024), proxy_model = ExtraTreesRegressor(1024), landmark_model = LANDMarkClassifier(160, use_nnet=False, n_jobs = 8), @@ -76,6 +80,7 @@ def __init__( self.resampler = resampler self.metric = metric + self.prox_method = prox_method self.supervised_clf = supervised_clf self.proxy_model = proxy_model @@ -117,20 +122,22 @@ def get_initial_embedding(self, X): # Get proximity X_trf = resampler.transform(X) - self.LM_emb.append(model.proximity(X_trf)) + + # Update Overall Proximity + if i > 0: + self.LM_emb = sp_hstack((self.LM_emb, model.proximity(X_trf, self.prox_method))) + else: + self.LM_emb = model.proximity(X_trf, self.prox_method) # Save the resampler self.transformers.append(resampler) - # Get Overall Proximity - self.LM_emb = np.hstack(self.LM_emb) - # Get Embeddings self.UMAP_trf = UMAP( n_neighbors=self.n_neighbors, n_components=15, min_dist=self.min_dist, - metric=self.metric, + metric="hamming", densmap=False, ).fit(self.LM_emb) @@ -248,7 +255,7 @@ def plot_projection(self, X, y, ax_1=0, ax_2=1, use_approx=True, trf_type = "PCA def predict_proba(self, X): - tree_emb = self.emb_transform(X, "LM") + tree_emb = self.emb_transform(X, "UMAP") P = self.p_model.predict_proba(tree_emb) @@ -281,11 +288,10 @@ def emb_transform(self, X, trf_type = "PCA"): transformer = self.transformers[i] # Get proximity - proximity = model.proximity(transformer.transform(X)) - - tree_emb.append(proximity) - - tree_emb = np.hstack(tree_emb) + if i != 0: + tree_emb = sp_hstack((tree_emb, model.proximity(transformer.transform(X), self.prox_method))) + else: + tree_emb = model.proximity(transformer.transform(X), self.prox_method) if trf_type == "LM": return tree_emb diff --git a/TreeOrdination/transformers_treeord.py b/TreeOrdination/transformers_treeord.py index 367095d..6d596a1 100644 --- a/TreeOrdination/transformers_treeord.py +++ b/TreeOrdination/transformers_treeord.py @@ -16,18 +16,18 @@ def __init__(self, do_clr=False, delta=None): def fit_transform(self, X, y=None, **kwargs): if self.do_clr: - return clr(multiplicative_replacement(closure(X), delta=self.delta)) + return clr(multiplicative_replacement(closure(X), delta=self.delta)).astype(np.float32) else: - return closure(X) + return closure(X).astype(np.float32) def transform(self, X, y=None, **kwargs): if self.do_clr: - return clr(multiplicative_replacement(closure(X), delta=self.delta)) + return clr(multiplicative_replacement(closure(X), delta=self.delta)).astype(np.float32) else: - return closure(X) + return closure(X).astype(np.float32) class ResampleRandomizeTransform(BaseEstimator, TransformerMixin): diff --git a/environment.yml b/environment.yml index 2c63fed..a5e50c2 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ dependencies: - python >=3.8 - pip - pip: - - LANDMarkClassifier >= 2.0.0 + - LANDMarkClassifier >= 2.1.0 - numpy == 1.23.5 - scikit-learn >= 1.1.2 - scikit-bio >= 0.5.8 diff --git a/pyproject.toml b/pyproject.toml index a91b19b..66925eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = ["hatchling"] [project] name = "TreeOrdination" -version = "1.3.2" +version = "1.3.4" authors = [ {name = "Josip Rudar", email = "rudarj@uoguelph.ca"}, {name = "G. Brian Golding"}, @@ -40,7 +40,7 @@ dependencies = [ "umap-learn >= 0.5.3", "seaborn", "shap >= 0.40.0", - "LANDMarkClassifier >= 2.0.4" + "LANDMarkClassifier >= 2.1.0" ] [tool.hatch.metadata]