Skip to content

Commit e39a2ce

Browse files
authored
Add Random Forest Regressor implementation from scratch
- Implemented DecisionTreeRegressor with MSE-based splitting - Implemented RandomForestRegressor with bootstrap aggregating - Added comprehensive docstrings and examples - Includes doctest and demo usage with sklearn metrics - Completes issue #13537 alongside the classifier implementation
1 parent 7413ded commit e39a2ce

File tree

1 file changed

+370
-0
lines changed

1 file changed

+370
-0
lines changed
Lines changed: 370 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,370 @@
1+
"""Random Forest Regressor implementation from scratch."""
2+
3+
import numpy as np
4+
from collections import Counter
5+
6+
7+
class DecisionTreeRegressor:
    """
    A simple decision tree regressor implementation.

    At each internal node the tree greedily picks the (feature, threshold)
    pair that minimises the weighted variance (MSE) of the two children;
    each leaf stores the mean target value of the samples it holds.

    Parameters
    ----------
    max_depth : int, optional (default=None)
        The maximum depth of the tree.  ``None`` grows nodes until they
        are pure or hold fewer than ``min_samples_split`` samples.
    min_samples_split : int, optional (default=2)
        The minimum number of samples required to split an internal node.

    Examples
    --------
    >>> X = np.array([[1], [2], [3], [4], [5]])
    >>> y = np.array([1.5, 2.5, 3.5, 4.5, 5.5])
    >>> tree = DecisionTreeRegressor(max_depth=2)
    >>> _ = tree.fit(X, y)
    >>> predictions = tree.predict(X)
    >>> np.allclose(predictions, y, atol=0.5)
    True
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        # Root node of the fitted tree (nested dicts); set by fit().
        self.tree = None

    def fit(self, X, y):
        """
        Build a decision tree regressor from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,)
            The target values.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Accept any array-like (lists, tuples, ...): _grow_tree relies on
        # ndarray features such as .shape and boolean-mask indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self._grow_tree(X, y)
        return self

    def _grow_tree(self, X, y, depth=0):
        """
        Recursively grow the decision tree.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training samples reaching this node.
        y : ndarray of shape (n_samples,)
            Target values reaching this node.
        depth : int, optional (default=0)
            Current depth of the tree.

        Returns
        -------
        node : dict
            Either a leaf ``{"value": mean}`` or an internal node
            ``{"feature", "threshold", "left", "right"}``.
        """
        n_samples, n_features = X.shape

        # Stopping criteria: depth limit reached, too few samples to split,
        # or the node is already pure (single unique target value).
        if (
            depth == self.max_depth
            or n_samples < self.min_samples_split
            or len(np.unique(y)) == 1
        ):
            return {"value": np.mean(y)}

        # Find the best split; None means no split reduces the MSE
        # (e.g. all feature values identical).
        best_split = self._best_split(X, y, n_features)
        if best_split is None:
            return {"value": np.mean(y)}

        # Recursively build the tree on the two halves of the split.
        left_indices = X[:, best_split["feature"]] <= best_split["threshold"]
        right_indices = ~left_indices

        left_subtree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._grow_tree(
            X[right_indices], y[right_indices], depth + 1
        )

        return {
            "feature": best_split["feature"],
            "threshold": best_split["threshold"],
            "left": left_subtree,
            "right": right_subtree,
        }

    def _best_split(self, X, y, n_features):
        """
        Find the best feature and threshold to split on.

        Every unique feature value is tried as a candidate threshold; the
        split with the lowest weighted child MSE wins.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training samples.
        y : ndarray of shape (n_samples,)
            Target values.
        n_features : int
            Number of features to consider.

        Returns
        -------
        best_split : dict or None
            ``{"feature", "threshold"}`` for the best split, or None if no
            candidate produces two non-empty children.
        """
        best_mse = float("inf")
        best_split = None

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = X[:, feature] <= threshold
                right_indices = ~left_indices

                # Skip degenerate splits that leave one side empty
                # (always the case for the maximum feature value).
                if np.sum(left_indices) == 0 or np.sum(right_indices) == 0:
                    continue

                mse = self._calculate_mse(
                    y[left_indices], y[right_indices], len(y)
                )

                if mse < best_mse:
                    best_mse = mse
                    best_split = {"feature": feature, "threshold": threshold}

        return best_split

    def _calculate_mse(self, left_y, right_y, n_samples):
        """
        Calculate weighted mean squared error for a split.

        The per-side MSE is the variance of the targets (squared deviation
        from the side's mean), weighted by each side's share of samples.

        Parameters
        ----------
        left_y : array-like
            Target values in the left split.
        right_y : array-like
            Target values in the right split.
        n_samples : int
            Total number of samples across both sides.

        Returns
        -------
        mse : float
            Weighted mean squared error.
        """
        n_left, n_right = len(left_y), len(right_y)
        # Guard against empty sides (np.var([]) would warn and return nan);
        # _best_split already skips empty splits, so this is defensive.
        mse_left = np.var(left_y) if n_left > 0 else 0
        mse_right = np.var(right_y) if n_right > 0 else 0
        return (n_left / n_samples) * mse_left + (n_right / n_samples) * mse_right

    def predict(self, X):
        """
        Predict target values for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            The predicted values.
        """
        X = np.asarray(X)
        return np.array([self._predict_sample(sample, self.tree) for sample in X])

    def _predict_sample(self, sample, tree):
        """
        Predict the target value for a single sample.

        Walks the tree from the given node down to a leaf, following the
        split direction of each internal node.

        Parameters
        ----------
        sample : array-like
            A single sample.
        tree : dict
            The decision tree node to start from.

        Returns
        -------
        prediction : float
            The predicted value (the reached leaf's stored mean).
        """
        # Leaves are the only nodes carrying a "value" key.
        if "value" in tree:
            return tree["value"]

        if sample[tree["feature"]] <= tree["threshold"]:
            return self._predict_sample(sample, tree["left"])
        return self._predict_sample(sample, tree["right"])
205+
class RandomForestRegressor:
    """
    Random Forest Regressor implementation from scratch.

    A random forest is an ensemble of decision trees, generally trained via
    the bagging method. The predictions are made by averaging the predictions
    of individual trees.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of trees in the forest.
    max_depth : int, optional (default=None)
        The maximum depth of the trees.
    min_samples_split : int, optional (default=2)
        The minimum number of samples required to split an internal node.
    max_features : int, str or None, optional (default='sqrt')
        The number of features to consider when looking for the best split.
        - If int, then consider max_features features at each split
          (capped at the number of available features).
        - If 'sqrt', then max_features=sqrt(n_features).
        - If None, then max_features=n_features.
    random_state : int or None, optional (default=None)
        Controls the randomness of the estimator.

    Examples
    --------
    >>> X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
    >>> y = np.array([1.5, 2.5, 3.5, 4.5, 5.5])
    >>> rf = RandomForestRegressor(n_estimators=5, max_depth=2, random_state=42)
    >>> _ = rf.fit(X, y)
    >>> predictions = rf.predict(X)
    >>> len(predictions) == len(y)
    True
    >>> np.all((predictions >= y.min()) & (predictions <= y.max()))
    True
    """

    def __init__(
        self,
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        max_features="sqrt",
        random_state=None,
    ):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.random_state = random_state
        # List of (fitted_tree, feature_indices) pairs; filled by fit().
        self.trees = []

    def fit(self, X, y):
        """
        Build a random forest regressor from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,)
            The target values.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Use a private RandomState instead of np.random.seed() so fitting
        # does not reseed the global NumPy RNG as a hidden side effect.
        # For integer seeds this produces the same draws as the old code.
        rng = np.random.RandomState(self.random_state)
        X = np.asarray(X)
        y = np.asarray(y)

        n_samples, n_features = X.shape

        # Resolve max_features to a concrete per-tree feature count.
        if self.max_features == "sqrt":
            max_features = int(np.sqrt(n_features))
        elif self.max_features is None:
            max_features = n_features
        else:
            # Cap user-supplied counts so choice(..., replace=False)
            # cannot raise when max_features > n_features.
            max_features = min(self.max_features, n_features)

        self.trees = []
        for _ in range(self.n_estimators):
            # Bootstrap sampling: draw n_samples rows with replacement.
            indices = rng.choice(n_samples, n_samples, replace=True)
            X_bootstrap = X[indices]
            y_bootstrap = y[indices]

            # Feature sampling: each tree trains on a random feature subset.
            feature_indices = rng.choice(
                n_features, max_features, replace=False
            )
            X_bootstrap = X_bootstrap[:, feature_indices]

            # Train decision tree on the bootstrapped sample.
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth, min_samples_split=self.min_samples_split
            )
            tree.fit(X_bootstrap, y_bootstrap)

            # Keep the feature subset so predict() can slice X the same way.
            self.trees.append((tree, feature_indices))

        return self

    def predict(self, X):
        """
        Predict target values for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            The predicted values (average of all tree predictions).
        """
        X = np.asarray(X)
        # Each tree predicts on the feature subset it was trained on.
        predictions = [
            tree.predict(X[:, feature_indices])
            for tree, feature_indices in self.trees
        ]

        # Average predictions from all trees.
        return np.mean(predictions, axis=0)
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    def _demo():
        """Train the scratch RandomForestRegressor on synthetic data and report metrics."""
        from sklearn.datasets import make_regression
        from sklearn.metrics import mean_squared_error, r2_score
        from sklearn.model_selection import train_test_split

        # Synthetic regression problem: 200 samples, 5 features (3 informative).
        features, targets = make_regression(
            n_samples=200, n_features=5, n_informative=3, noise=10, random_state=42
        )

        # Hold out 30% of the data for evaluation.
        train_x, test_x, train_y, test_y = train_test_split(
            features, targets, test_size=0.3, random_state=42
        )

        # Fit the from-scratch forest and predict on the held-out set.
        rf_regressor = RandomForestRegressor(
            n_estimators=10, max_depth=5, random_state=42
        )
        rf_regressor.fit(train_x, train_y)
        predicted = rf_regressor.predict(test_x)

        # Report standard regression metrics.
        mse = mean_squared_error(test_y, predicted)
        r2 = r2_score(test_y, predicted)
        print(f"Mean Squared Error: {mse:.2f}")
        print(f"R² Score: {r2:.2f}")
        print(f"Number of trees: {len(rf_regressor.trees)}")

    _demo()

0 commit comments

Comments
 (0)