Skip to content

Commit 7413ded

Browse files
authored
Add Random Forest Classifier implementation from scratch
Implements Random Forest Classifier with: - Decision Tree base learners from scratch - Bootstrap sampling (bagging) - Random feature selection at splits - Majority voting aggregation - Clear docstrings and example usage Part of implementation for issue #13537
1 parent c79034c commit 7413ded

File tree

1 file changed

+326
-0
lines changed

1 file changed

+326
-0
lines changed
Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
"""Random Forest Classifier implementation from scratch.
2+
3+
This module implements a Random Forest Classifier using:
4+
- Decision Tree base learners built from scratch
5+
- Bootstrap sampling (bagging)
6+
- Random feature selection at splits
7+
- Majority voting for aggregation
8+
"""
9+
10+
import numpy as np
11+
from collections import Counter
12+
13+
14+
class DecisionTreeClassifier:
    """A Decision Tree Classifier built from scratch.

    Splits are chosen by maximising information gain (entropy-based).
    The fitted tree is stored as nested dicts: internal nodes are
    ``{'leaf': False, 'feature': i, 'threshold': t, 'left': ..., 'right': ...}``
    and leaves are ``{'leaf': True, 'value': label}``.

    Attributes:
        max_depth: Maximum depth of the tree.
        min_samples_split: Minimum samples required to split a node.
        n_features: Number of features to consider for the best split.
        tree: The built tree structure (None until ``fit`` is called).
    """

    def __init__(self, max_depth=10, min_samples_split=2, n_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_features = n_features
        self.tree = None

    def fit(self, X, y):
        """Build the decision tree.

        Args:
            X: Training features, shape (n_samples, n_features).
            y: Training labels, shape (n_samples,). Labels must be
               non-negative integers (entropy uses ``np.bincount``).
        """
        # Clamp the number of candidate features to what X actually has.
        self.n_features = X.shape[1] if not self.n_features else min(self.n_features, X.shape[1])
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively grow the decision tree."""
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # Stopping criteria: depth limit reached, node is pure, or too few samples.
        if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
            return {'leaf': True, 'value': self._most_common_label(y)}

        # Consider a random subset of features for the best split.
        feat_idxs = np.random.choice(n_features, self.n_features, replace=False)
        best_feat, best_thresh = self._best_split(X, y, feat_idxs)

        # No split with positive gain: make a leaf.
        if best_feat is None:
            return {'leaf': True, 'value': self._most_common_label(y)}

        # Partition samples and grow the subtrees.
        left_idxs = X[:, best_feat] <= best_thresh
        right_idxs = ~left_idxs

        left = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)

        return {
            'leaf': False,
            'feature': best_feat,
            'threshold': best_thresh,
            'left': left,
            'right': right
        }

    def _best_split(self, X, y, feat_idxs):
        """Find the feature/threshold with the highest information gain.

        Returns ``(None, None)`` when no candidate split yields a strictly
        positive gain, so the caller produces a leaf.

        BUGFIX: the original started from ``best_gain = -1``, so a
        zero-gain split was still selected. With a constant feature the
        only threshold is its maximum value, which sends every sample to
        the left child; recursing on the empty right child then crashed
        ``_most_common_label`` with an IndexError on the empty array.
        """
        best_gain = 0.0  # require strictly positive gain
        split_idx, split_thresh = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            # Candidate thresholds: every distinct value of the feature.
            for threshold in np.unique(X_column):
                gain = self._information_gain(y, X_column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, threshold):
        """Calculate information gain of splitting X_column at threshold."""
        parent_entropy = self._entropy(y)

        left_idxs = X_column <= threshold
        right_idxs = ~left_idxs

        n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)
        # A one-sided split carries no information.
        if n_left == 0 or n_right == 0:
            return 0

        # Weighted average entropy of the two children.
        n = len(y)
        e_left = self._entropy(y[left_idxs])
        e_right = self._entropy(y[right_idxs])
        child_entropy = (n_left / n) * e_left + (n_right / n) * e_right

        return parent_entropy - child_entropy

    def _entropy(self, y):
        """Entropy of a label distribution (labels must be non-negative ints)."""
        ps = np.bincount(y) / len(y)
        # Skip zero-probability classes: 0 * log2(0) is defined as 0 here.
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        """Return the most common label (ties broken by first occurrence)."""
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        """Predict class labels for samples in X.

        Args:
            X: Features, shape (n_samples, n_features).

        Returns:
            Predicted labels, shape (n_samples,).
        """
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from the root to a leaf for a single sample."""
        if node['leaf']:
            return node['value']

        if x[node['feature']] <= node['threshold']:
            return self._traverse_tree(x, node['left'])
        return self._traverse_tree(x, node['right'])
147+
148+
149+
class RandomForestClassifier:
    """Random Forest Classifier built from scratch.

    Random Forest is an ensemble learning method that constructs multiple
    decision trees during training and outputs the mode of the classes
    (classification) of the individual trees.

    Features:
    - Bootstrap sampling (bagging) to create diverse trees
    - Random feature selection at each split
    - Majority voting for final predictions

    Attributes:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
        min_samples_split: Minimum samples required to split a node.
        n_features: Number of features to consider for best split.
        trees: List of trained decision trees.

    Example:
        >>> from sklearn.datasets import make_classification
        >>> from sklearn.model_selection import train_test_split
        >>> from sklearn.metrics import accuracy_score
        >>>
        >>> X, y = make_classification(n_samples=1000, n_features=20,
        ...                            n_informative=15, n_redundant=5,
        ...                            random_state=42)
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     X, y, test_size=0.2, random_state=42)
        >>>
        >>> rf = RandomForestClassifier(n_estimators=10, max_depth=10)
        >>> rf.fit(X_train, y_train)
        >>> y_pred = rf.predict(X_test)
        >>> print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    """

    def __init__(self, n_estimators=100, max_depth=10, min_samples_split=2, n_features=None):
        """Initialize Random Forest Classifier.

        Args:
            n_estimators: Number of trees in the forest (default: 100).
            max_depth: Maximum depth of each tree (default: 10).
            min_samples_split: Minimum samples required to split (default: 2).
            n_features: Number of features to consider for best split.
                If None, uses sqrt(n_features) (default: None).
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_features = n_features
        self.trees = []

    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y).

        Args:
            X: Training features, shape (n_samples, n_features).
            y: Training labels, shape (n_samples,).

        Returns:
            self: Fitted classifier.
        """
        self.trees = []

        # Default to sqrt(total features) -- the standard heuristic for
        # classification forests. max(1, ...) guards against a 0-feature
        # degenerate case.
        if self.n_features is None:
            self.n_features = max(1, int(np.sqrt(X.shape[1])))

        for _ in range(self.n_estimators):
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                n_features=self.n_features
            )
            # Each tree trains on its own bootstrap sample for diversity.
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

        return self

    def _bootstrap_sample(self, X, y):
        """Create a bootstrap sample from the dataset.

        Bootstrap sampling draws n_samples indices with replacement, so
        each tree sees a different (overlapping) view of the data.

        Args:
            X: Features, shape (n_samples, n_features).
            y: Labels, shape (n_samples,).

        Returns:
            Tuple of (X_sample, y_sample) bootstrap samples.
        """
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def predict(self, X):
        """Predict class labels for samples in X.

        Uses majority voting: each tree votes for a class, and the
        class with the most votes becomes the final prediction.

        Args:
            X: Features, shape (n_samples, n_features).

        Returns:
            Predicted labels, shape (n_samples,).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        # BUGFIX: predicting before fit previously failed with an obscure
        # numpy axis error on the empty prediction array; fail fast with a
        # clear message instead.
        if not self.trees:
            raise RuntimeError("RandomForestClassifier is not fitted; call fit() first.")

        # Shape (n_estimators, n_samples): one row of votes per tree.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])

        # Transpose to (n_samples, n_estimators) so each row holds the
        # votes for one sample, then take the majority vote per row.
        tree_preds = np.swapaxes(tree_preds, 0, 1)
        return np.array([self._most_common_label(votes) for votes in tree_preds])

    def _most_common_label(self, y):
        """Return the most common label (majority vote)."""
        return Counter(y).most_common(1)[0][0]
277+
278+
279+
if __name__ == "__main__":
    # Demonstration on a synthetic dataset (requires scikit-learn).
    from sklearn.datasets import make_classification
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.model_selection import train_test_split

    def _demo():
        """Train the forest on synthetic data and print an evaluation report."""
        print("Random Forest Classifier - Example Usage")
        print("=" * 50)

        # Build a synthetic classification problem.
        features, labels = make_classification(
            n_samples=1000,
            n_features=20,
            n_informative=15,
            n_redundant=5,
            random_state=42,
        )
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=42
        )

        print(f"Training samples: {X_train.shape[0]}")
        print(f"Test samples: {X_test.shape[0]}")
        print(f"Number of features: {X_train.shape[1]}")
        print()

        # Fit the ensemble.
        print("Training Random Forest Classifier...")
        forest = RandomForestClassifier(
            n_estimators=10, max_depth=10, min_samples_split=2
        )
        forest.fit(X_train, y_train)
        print("Training complete!")
        print()

        # Evaluate on the held-out split.
        predictions = forest.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print(f"Accuracy: {accuracy:.4f}")
        print()
        print("Classification Report:")
        print(classification_report(y_test, predictions))

    _demo()

0 commit comments

Comments
 (0)