-
Notifications
You must be signed in to change notification settings - Fork 0
/
custom_random_forest.py
123 lines (107 loc) · 4.31 KB
/
custom_random_forest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from multiprocessing import Pool
from typing import Any, List, Optional, Tuple

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from sklearn.tree import DecisionTreeClassifier
class RandomForestClassifierCustom(BaseEstimator):
    """
    Custom implementation of a Random Forest classifier.

    Every tree is a ``DecisionTreeClassifier`` fitted on a bootstrap sample of
    the rows and on a random subset of the columns. The forest's class
    probabilities are the plain average of the per-tree probabilities.

    Attributes:
        classes_ (np.ndarray): Sorted unique labels seen by ``fit``.
        trees (list): Fitted trees, one per estimator (empty before ``fit``).
        feat_ids_by_tree (list): For each tree, the column indices of ``X``
            it was trained on (empty before ``fit``).
    """

    def __init__(
        self,
        n_estimators: int = 10,
        max_depth: Optional[int] = None,
        max_features: Optional[int] = None,
        random_state: Optional[int] = 42,
    ):
        """
        Initializes the RandomForestClassifierCustom instance.

        Args:
            n_estimators (int): The number of trees in the forest.
            max_depth (int | None): The maximum depth of the trees; ``None``
                is passed through to ``DecisionTreeClassifier`` unchanged.
            max_features (int | None): Number of columns sampled (without
                replacement) for each tree. ``None`` means all columns.
            random_state (int | None): Base seed. Tree ``i`` is seeded with
                ``random_state + i``; ``None`` leaves every tree unseeded.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []  # List to store each tree in the forest.
        self.feat_ids_by_tree = []  # List to store feature indices for each tree.

    def _map_jobs(self, func, tasks: list, n_jobs: Optional[int]) -> list:
        """
        Applies ``func`` to every task, serially or in a process pool.

        Args:
            func: Callable taking a single task tuple.
            tasks (list): Task tuples, one per tree.
            n_jobs (int | None): ``1`` or ``None`` runs in the current
                process; ``-1`` lets ``Pool`` pick its default worker count;
                any other value is handed to ``Pool`` as the worker count.

        Returns:
            list: Results in the same order as ``tasks``.
        """
        if n_jobs is None or n_jobs == 1:
            return [func(task) for task in tasks]
        # Pool(None) uses its own default number of worker processes.
        processes = None if n_jobs == -1 else n_jobs
        with Pool(processes) as pool:
            return pool.map(func, tasks)

    def _fit_tree(
        self,
        args: Tuple[Any, np.ndarray, Optional[int], Optional[int], Optional[int]],
    ) -> Tuple[DecisionTreeClassifier, np.ndarray]:
        """
        Fits a single decision tree using a subset of data and features.

        Args:
            args (tuple): ``(X, y, max_features, max_depth, random_state)``.
                ``max_features=None`` selects every column of ``X``.

        Returns:
            A tuple containing the fitted tree and the feature indices used.
        """
        X, y, max_features, max_depth, random_state = args
        n_samples, n_features = X.shape
        if max_features is None:
            # size=None would make choice() return a scalar and collapse the
            # column slice to 1-D, so "no limit" is spelled out explicitly.
            max_features = n_features

        # A private RandomState yields the same draws as seeding the global
        # generator with the same seed, without clobbering the caller's
        # global NumPy RNG state.
        rng = np.random.RandomState(random_state)
        feat_ids = rng.choice(n_features, size=max_features, replace=False)
        pseudo_ids = rng.choice(n_samples, size=n_samples, replace=True)

        pseudo_X = X[pseudo_ids, :][:, feat_ids]
        pseudo_y = y[pseudo_ids]

        dt_clf = DecisionTreeClassifier(
            max_depth=max_depth, max_features=max_features, random_state=random_state
        )
        dt_clf.fit(pseudo_X, pseudo_y)
        return dt_clf, feat_ids

    def fit(
        self, X: np.ndarray, y: np.ndarray, n_jobs: int = 1
    ) -> "RandomForestClassifierCustom":
        """
        Fits the Random Forest classifier to the training data.

        Args:
            X (np.ndarray): Training features, indexed as ``X[rows, :][:, cols]``.
            y (np.ndarray): Training labels.
            n_jobs (int): Number of worker processes used for fitting
                (``1`` = serial, ``-1`` = ``Pool`` default).

        Returns:
            self: The instance of RandomForestClassifierCustom.

        Raises:
            ValueError: If ``n_estimators`` is smaller than 1.
        """
        if self.n_estimators < 1:
            raise ValueError(
                f"n_estimators must be at least 1, got {self.n_estimators}"
            )

        # Positional indexing below (y[pseudo_ids]) needs a real array; X is
        # left untouched so any 2-D container supporting that indexing works.
        y = np.asarray(y)
        self.classes_ = np.unique(y)

        args = [
            (
                X,
                y,
                self.max_features,
                self.max_depth,
                None if self.random_state is None else self.random_state + i,
            )
            for i in range(self.n_estimators)
        ]
        results = self._map_jobs(self._fit_tree, args, n_jobs)

        trees, feat_ids_by_tree = zip(*results)
        self.trees = list(trees)
        self.feat_ids_by_tree = list(feat_ids_by_tree)
        return self

    def _predict_tree_proba(
        self, args: Tuple[DecisionTreeClassifier, np.ndarray, np.ndarray]
    ) -> np.ndarray:
        """
        Predicts class probabilities for a given tree and feature set.

        Args:
            args (tuple): Contains the decision tree, feature indices, and the test dataset.

        Returns:
            np.ndarray: The predicted probabilities. When the forest's
            ``classes_`` is known, columns follow that order and classes the
            tree never saw get probability 0.
        """
        tree, feat_ids, X = args
        proba = tree.predict_proba(X[:, feat_ids])

        forest_classes = getattr(self, "classes_", None)
        if forest_classes is None or np.array_equal(tree.classes_, forest_classes):
            return proba

        # The tree's bootstrap sample missed at least one class, so its output
        # has fewer columns. classes_ comes from np.unique and is sorted, so
        # searchsorted maps each tree class to its column in the forest.
        aligned = np.zeros((proba.shape[0], len(forest_classes)), dtype=proba.dtype)
        aligned[:, np.searchsorted(forest_classes, tree.classes_)] = proba
        return aligned

    def predict_proba(self, X: np.ndarray, n_jobs: int = 1) -> np.ndarray:
        """
        Predicts class probabilities by averaging over all trees in the forest.

        Args:
            X (np.ndarray): Test features.
            n_jobs (int): Number of jobs to run in parallel
                (``1`` = serial, ``-1`` = ``Pool`` default).

        Returns:
            np.ndarray: The average predicted probabilities across all trees.

        Raises:
            NotFittedError: If called before ``fit``.
        """
        if not self.trees:
            raise NotFittedError(
                "This RandomForestClassifierCustom instance is not fitted yet. "
                "Call 'fit' before using this estimator."
            )

        args = [
            (tree, feat_ids, X)
            for tree, feat_ids in zip(self.trees, self.feat_ids_by_tree)
        ]
        results = self._map_jobs(self._predict_tree_proba, args, n_jobs)

        proba_sum = sum(results)
        return proba_sum / len(self.trees)

    def predict(self, X: np.ndarray, n_jobs: int = 1) -> np.ndarray:
        """
        Predicts the most probable class label for each sample.

        Args:
            X (np.ndarray): Test features.
            n_jobs (int): Number of jobs to run in parallel.

        Returns:
            np.ndarray: For each row of ``X``, the entry of ``classes_`` with
            the highest averaged probability.

        Raises:
            NotFittedError: If called before ``fit``.
        """
        proba = self.predict_proba(X, n_jobs=n_jobs)
        return self.classes_[np.argmax(proba, axis=1)]