Skip to content

Commit

Permalink
Merge pull request #13 from jrudar/Cascade
Browse files Browse the repository at this point in the history
Preservation of Proximity Information Within LANDMark Trees
  • Loading branch information
jrudar authored Jul 12, 2023
2 parents 4c386c1 + f00efbb commit 99f5b05
Show file tree
Hide file tree
Showing 15 changed files with 1,389 additions and 275 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,5 @@ ENV/
.ruff_cache
/.vs/LANDMark/v16
/.vs/LANDMark/config
/.vs
/.vs
/notebooks/Untitled.ipynb
94 changes: 80 additions & 14 deletions LANDMark/LANDMark.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@

from typing import Optional, List

from scipy.sparse import csr_array, issparse

class LANDMarkClassifier(BaseEstimator, ClassifierMixin):

class LANDMarkClassifier(BaseEstimator, ClassifierMixin):
def __init__(
self,
n_estimators: int = 64,
Expand All @@ -27,14 +28,16 @@ def __init__(
use_oracle: bool = True,
use_lm_l2: bool = True,
use_lm_l1: bool = True,
minority_sz_lm: int = 6,
use_nnet: bool = True,
nnet_min_samples: int = 32,
minority_sz_nnet: int = 6,
use_etc: bool = True,
etc_max_depth: int = 5,
etc_max_trees: int = 128,
resampler = None,
resampler=None,
use_cascade: bool = False,
n_jobs: int = 4
n_jobs: int = 4,
):
# Tree construction parameters
self.n_estimators = n_estimators
Expand All @@ -47,8 +50,10 @@ def __init__(
self.use_oracle = use_oracle
self.use_lm_l2 = use_lm_l2
self.use_lm_l1 = use_lm_l1
self.minority_sz_lm = minority_sz_lm
self.use_nnet = use_nnet
self.nnet_min_samples = nnet_min_samples
self.minority_sz_nnet = minority_sz_nnet
self.use_etc = use_etc
self.etc_max_depth = etc_max_depth
self.etc_max_trees = etc_max_trees
Expand Down Expand Up @@ -85,12 +90,15 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
use_oracle=self.use_oracle,
use_lm_l2=self.use_lm_l2,
use_lm_l1=self.use_lm_l1,
minority_sz_lm=self.minority_sz_lm,
use_nnet=self.use_nnet,
nnet_min_samples=self.nnet_min_samples,
minority_sz_nnet=self.minority_sz_nnet,
use_etc=self.use_etc,
etc_max_depth=self.etc_max_depth,
etc_max_trees=self.etc_max_trees,
resampler=self.resampler
resampler=self.resampler,
use_cascade=self.use_cascade,
),
n_estimators=self.n_estimators,
class_names=self.classes_,
Expand Down Expand Up @@ -140,23 +148,78 @@ def score(self, X: np.ndarray, y: np.ndarray) -> float:

return score

def proximity(self, X: np.ndarray, prox_type: str = "path") -> csr_array:
    """Embed the samples in X using the nodes they reach in each tree.

    Parameters
    ----------
    X : np.ndarray
        Samples to embed. A 1-D array is treated as a single sample.
    prox_type : str, default="path"
        "terminal" horizontally stacks the matrices returned by each
        tree's ``proximity`` method. "path" builds a binary
        sample-by-node indicator matrix with one column per node across
        all trees of the forest.

    Returns
    -------
    csr_array
        Sparse ``uint8`` embedding with one row per sample.

    Raises
    ------
    ValueError
        If ``prox_type`` is neither "terminal" nor "path".
    """
    check_is_fitted(self, attributes=["classes_", "estimators_"])

    trees = self.estimators_.estimators_

    if prox_type == "terminal":
        emb = np.hstack([tree.proximity(X, prox_type) for tree in trees])

        return csr_array(emb.astype(np.uint8))

    if prox_type != "path":
        raise ValueError(
            f"'prox_type' must be 'terminal' or 'path', got {prox_type!r}."
        )

    # Each tree returns a mapping of sample index -> nodes for that sample
    embs = [tree.proximity(X, prox_type) for tree in trees]

    # The node -> column mapping is built once and reused on later calls so
    # that embeddings of different inputs share the same column layout.
    # NOTE(review): the cache is never cleared here; confirm that fit()
    # resets ``node_set`` when the forest is refitted.
    if not hasattr(self, "node_set"):
        all_nodes = set()
        for tree in trees:
            all_nodes.update(tree.all_nodes)

        self.node_set = {node: i for i, node in enumerate(all_nodes)}

    # A 1-D input is a single sample regardless of whether the mapping was
    # already cached (previously only the cached branch handled this).
    n_samples = 1 if X.ndim == 1 else X.shape[0]
    emb = np.zeros(shape=(n_samples, len(self.node_set)), dtype=np.uint8)

    for tree_emb in embs:
        for sample, nodes in tree_emb.items():
            for node in nodes:
                emb[sample, self.node_set[node]] = 1

    return csr_array(emb)

def _check_params(
self, X: np.ndarray, y: np.ndarray
) -> List[np.ndarray, np.ndarray]:
SUPPORTED_IMPURITY = {"gain", "gain-ratio", "tsallis", "tsallis-gain-ratio"}

# Check that X and y meet the minimum requirements
X_conv, y_conv = check_X_y(X, y, accept_sparse=False)
X_conv, y_conv = check_X_y(X, y, accept_sparse=True)

if not issparse(X_conv):
sparsity = 1.0 - (np.count_nonzero(X_conv) / X_conv.size)

if sparsity >= 0.9:
X_conv = csr_array(X_conv)

if not isinstance(self.n_estimators, int):
raise TypeError("'n_estimators' must be an integer.")
Expand All @@ -174,9 +237,11 @@ def _check_params(self, X: np.ndarray, y: np.ndarray) -> List[np.ndarray, np.nda

if isinstance(self.max_depth, type(None)):
pass

elif isinstance(self.max_depth, int):
if self.max_depth <= 0:
raise ValueError("'max_depth' must be an greater than zero.")

else:
raise TypeError("'max_depth' must be an integer greater than zero or None.")

Expand All @@ -192,6 +257,7 @@ def _check_params(self, X: np.ndarray, y: np.ndarray) -> List[np.ndarray, np.nda
if isinstance(self.min_gain, float):
if self.min_gain < 0:
raise ValueError("'min_gain' must be greater than or equal to zero.")

else:
raise TypeError("'min_gain' must be float.")

Expand Down Expand Up @@ -233,7 +299,7 @@ def _check_params(self, X: np.ndarray, y: np.ndarray) -> List[np.ndarray, np.nda

if not isinstance(self.use_etc, bool):
raise TypeError("'use_etc' must be True or False.")

if isinstance(self.etc_max_depth, int):
if self.etc_max_depth <= 0:
raise ValueError("'etc_max_depth' must be greater than zero.")
Expand All @@ -259,7 +325,7 @@ def _check_params(self, X: np.ndarray, y: np.ndarray) -> List[np.ndarray, np.nda
if isinstance(self.resampler, type(None)):
pass

elif hasattr(self.resampler, "fit_transform") == False:
elif hasattr(self.resampler, "fit_transform") is False:
raise ValueError("'resampler' must have a 'fit_transform(X, y)' function.")

return X_conv, y_conv
8 changes: 3 additions & 5 deletions LANDMark/lm_dtree_clfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import numpy as np


class ETClassifier(ClassifierMixin, BaseEstimator):
def __init__(self, n_feat=0.8, max_depth=5, max_trees=128):
self.n_feat = n_feat
Expand All @@ -29,26 +30,23 @@ def fit(self, X, y):

self.classes_, y_counts = np.unique(y_re, return_counts=True)

clf_1 = ExtraTreesClassifier(
clf = ExtraTreesClassifier(
n_estimators=self.max_trees, max_depth=self.max_depth
)

self.model_type = "nonlinear_etc"

self.clf_model = clf_1.fit(X_re, y_re)
self.clf_model = clf.fit(X_re, y_re)

return self, self.decision_function(X)

def predict(self, X):
    """Return the fitted model's predictions for X.

    Only the columns of X indexed by ``self.features`` are passed to
    ``self.clf_model``.
    """
    selected = X[:, self.features]
    return self.clf_model.predict(selected)

def predict_proba(self, X):
    """Return the fitted model's class probabilities for X.

    Only the columns of X indexed by ``self.features`` are passed to
    ``self.clf_model``.
    """
    selected = X[:, self.features]
    return self.clf_model.predict_proba(selected)

def decision_function(self, X):
    """Map the model's class probabilities to +1 / -1 decisions.

    Every probability strictly greater than 0.5 becomes 1 and every other
    entry becomes -1; the result has the same shape as the output of
    ``predict_proba``.
    """
    proba = self.clf_model.predict_proba(X[:, self.features])
    above_half = proba > 0.5

    return np.where(above_half, 1, -1)


49 changes: 19 additions & 30 deletions LANDMark/lm_linear_clfs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
import logging
import os

import warnings
from sklearn.exceptions import ConvergenceWarning

Expand All @@ -12,24 +9,21 @@
from sklearn.linear_model import (
RidgeClassifierCV,
LogisticRegressionCV,
LogisticRegression,
SGDClassifier,
RidgeClassifier,
)
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

from random import choice
from sklearn.model_selection import StratifiedKFold

from math import ceil


class LMClassifier(ClassifierMixin, BaseEstimator):
def __init__(self, model_type, n_feat=0.8, minority=6, use_etc_split=True):
    """Store the hyper-parameters of the linear model wrapper.

    Parameters
    ----------
    model_type : str
        Name of the linear model that ``fit`` trains (for example
        "lr_l2", "lr_l1", "sgd_l2", "ridge" or "lsvc").
    n_feat : float, default=0.8
        Stored as ``self.n_feat``.
        NOTE(review): presumably the fraction of features sampled in
        ``fit``; confirm against the full source.
    minority : int, default=6
        Threshold compared against the scaled smallest class count in
        ``fit``; a model is trained only when that count exceeds it.
    use_etc_split : bool, default=True
        Stored as ``self.use_etc_split``.
        NOTE(review): not read by the visible code; confirm intended use.
    """
    self.model_type = model_type
    self.n_feat = n_feat
    self.minority = minority
    # Every constructor argument must be kept as a same-named attribute,
    # otherwise BaseEstimator.get_params()/clone() fail on this estimator.
    self.use_etc_split = use_etc_split

def fit(self, X, y):
if X.shape[1] >= 4:
Expand All @@ -48,19 +42,21 @@ def fit(self, X, y):

self.classes_, y_counts = np.unique(y_re, return_counts=True)

self.y_min = min(y_counts)
if self.y_min > 6:
self.y_min = min(y_counts) * 0.8

if self.y_min > self.minority:
if self.model_type == "lr_l2":
self.clf = LogisticRegressionCV(max_iter=2000, cv=5).fit(X_re, y_re)
self.clf = LogisticRegressionCV(
max_iter=2000, cv=StratifiedKFold(5)
).fit(X_re, y_re)

elif self.model_type == "lr_l1":
solver = "liblinear"
if X.shape[0] >= 500:
solver = "saga"

self.clf = LogisticRegressionCV(
max_iter=2000, cv=5, solver=solver, penalty="l1"
max_iter=2000, cv=StratifiedKFold(5), solver=solver, penalty="l1"
).fit(X_re, y_re)

elif self.model_type == "sgd_l2":
Expand All @@ -70,7 +66,7 @@ def fit(self, X, y):
"alpha": [0.001, 0.01, 0.1, 1.0, 10, 100],
"loss": ["hinge", "modified_huber"],
},
cv=5,
cv=StratifiedKFold(5),
).fit(X_re, y_re)

self.clf = self.cv.best_estimator_
Expand All @@ -82,41 +78,34 @@ def fit(self, X, y):
"alpha": [0.001, 0.01, 0.1, 1.0, 10, 100],
"loss": ["hinge", "modified_huber"],
},
cv=5,
cv=StratifiedKFold(5),
).fit(X_re, y_re)

self.clf = self.cv.best_estimator_

elif self.model_type == "ridge":
self.clf = RidgeClassifierCV(
alphas=(0.001, 0.01, 0.1, 1.0, 10, 100, 1000), cv=5
alphas=(0.001, 0.01, 0.1, 1.0, 10, 100, 1000), cv=StratifiedKFold(5)
).fit(X_re, y_re)

elif self.model_type == "lsvc":
self.cv = GridSearchCV(
LinearSVC(max_iter=2000),
param_grid={"C": [0.001, 0.01, 0.1, 1.0, 10, 100]},
cv=5,
cv=StratifiedKFold(5),
).fit(X_re, y_re)

self.clf = self.cv.best_estimator_

else:
self.clf = ExtraTreesClassifier(n_estimators = 128, max_depth = 1)

self.clf.fit(X_re, y_re)
return self, self.decision_function(X)

return self, self.decision_function(X)
# Otherwise use an Extra Trees Classifier or Nothing
else:
return self, None

def predict(self, X):
    """Return the fitted linear model's predictions for X.

    Only the columns of X indexed by ``self.features`` are passed to
    ``self.clf``.
    """
    selected = X[:, self.features]
    return self.clf.predict(selected)

def decision_function(self, X):
    """Return the fitted linear model's decision scores for X.

    Only the columns of X indexed by ``self.features`` are passed to
    ``self.clf``. The earlier probability-thresholding fallback is gone
    because ``fit`` no longer trains a substitute model when the minority
    class is too small; in that case ``fit`` returns ``None`` in place of
    the decision values.
    NOTE(review): ``self.clf`` appears to be unset in that case, so this
    method would fail; confirm callers check for the ``None`` first.
    """
    return self.clf.decision_function(X[:, self.features])
Loading

0 comments on commit 99f5b05

Please sign in to comment.