Skip to content

Commit e0ef096

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent e39a2ce commit e0ef096

File tree

2 files changed

51 additions and 57 deletions (lines changed)

2 files changed

51 additions and 57 deletions (lines changed)

machine_learning/random_forest_classifier.py

Lines changed: 47 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,9 @@
1313

1414
class DecisionTreeClassifier:
1515
"""A Decision Tree Classifier built from scratch.
16-
16+
1717
This tree uses information gain (entropy-based) for splitting decisions.
18-
18+
1919
Attributes:
2020
max_depth: Maximum depth of the tree
2121
min_samples_split: Minimum samples required to split a node
@@ -31,12 +31,14 @@ def __init__(self, max_depth=10, min_samples_split=2, n_features=None):
3131

3232
def fit(self, X, y):
3333
"""Build the decision tree.
34-
34+
3535
Args:
3636
X: Training features, shape (n_samples, n_features)
3737
y: Training labels, shape (n_samples,)
3838
"""
39-
self.n_features = X.shape[1] if not self.n_features else min(self.n_features, X.shape[1])
39+
self.n_features = (
40+
X.shape[1] if not self.n_features else min(self.n_features, X.shape[1])
41+
)
4042
self.tree = self._grow_tree(X, y)
4143

4244
def _grow_tree(self, X, y, depth=0):
@@ -45,17 +47,21 @@ def _grow_tree(self, X, y, depth=0):
4547
n_labels = len(np.unique(y))
4648

4749
# Stopping criteria
48-
if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
50+
if (
51+
depth >= self.max_depth
52+
or n_labels == 1
53+
or n_samples < self.min_samples_split
54+
):
4955
leaf_value = self._most_common_label(y)
50-
return {'leaf': True, 'value': leaf_value}
56+
return {"leaf": True, "value": leaf_value}
5157

5258
# Find best split
5359
feat_idxs = np.random.choice(n_features, self.n_features, replace=False)
5460
best_feat, best_thresh = self._best_split(X, y, feat_idxs)
5561

5662
if best_feat is None:
5763
leaf_value = self._most_common_label(y)
58-
return {'leaf': True, 'value': leaf_value}
64+
return {"leaf": True, "value": leaf_value}
5965

6066
# Split the data
6167
left_idxs = X[:, best_feat] <= best_thresh
@@ -66,11 +72,11 @@ def _grow_tree(self, X, y, depth=0):
6672
right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)
6773

6874
return {
69-
'leaf': False,
70-
'feature': best_feat,
71-
'threshold': best_thresh,
72-
'left': left,
73-
'right': right
75+
"leaf": False,
76+
"feature": best_feat,
77+
"threshold": best_thresh,
78+
"left": left,
79+
"right": right,
7480
}
7581

7682
def _best_split(self, X, y, feat_idxs):
@@ -127,44 +133,44 @@ def _most_common_label(self, y):
127133

128134
def predict(self, X):
129135
"""Predict class labels for samples in X.
130-
136+
131137
Args:
132138
X: Features, shape (n_samples, n_features)
133-
139+
134140
Returns:
135141
Predicted labels, shape (n_samples,)
136142
"""
137143
return np.array([self._traverse_tree(x, self.tree) for x in X])
138144

139145
def _traverse_tree(self, x, node):
140146
"""Traverse the tree to make a prediction for a single sample."""
141-
if node['leaf']:
142-
return node['value']
147+
if node["leaf"]:
148+
return node["value"]
143149

144-
if x[node['feature']] <= node['threshold']:
145-
return self._traverse_tree(x, node['left'])
146-
return self._traverse_tree(x, node['right'])
150+
if x[node["feature"]] <= node["threshold"]:
151+
return self._traverse_tree(x, node["left"])
152+
return self._traverse_tree(x, node["right"])
147153

148154

149155
class RandomForestClassifier:
150156
"""Random Forest Classifier built from scratch.
151-
157+
152158
Random Forest is an ensemble learning method that constructs multiple
153159
decision trees during training and outputs the mode of the classes
154160
(classification) of the individual trees.
155-
161+
156162
Features:
157163
- Bootstrap sampling (bagging) to create diverse trees
158164
- Random feature selection at each split
159165
- Majority voting for final predictions
160-
166+
161167
Attributes:
162168
n_estimators: Number of trees in the forest
163169
max_depth: Maximum depth of each tree
164170
min_samples_split: Minimum samples required to split a node
165171
n_features: Number of features to consider for best split
166172
trees: List of trained decision trees
167-
173+
168174
Example:
169175
>>> from sklearn.datasets import make_classification
170176
>>> from sklearn.model_selection import train_test_split
@@ -186,9 +192,11 @@ class RandomForestClassifier:
186192
>>> print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
187193
"""
188194

189-
def __init__(self, n_estimators=100, max_depth=10, min_samples_split=2, n_features=None):
195+
def __init__(
196+
self, n_estimators=100, max_depth=10, min_samples_split=2, n_features=None
197+
):
190198
"""Initialize Random Forest Classifier.
191-
199+
192200
Args:
193201
n_estimators: Number of trees in the forest (default: 100)
194202
max_depth: Maximum depth of each tree (default: 10)
@@ -204,17 +212,17 @@ def __init__(self, n_estimators=100, max_depth=10, min_samples_split=2, n_featur
204212

205213
def fit(self, X, y):
206214
"""Build a forest of trees from the training set (X, y).
207-
215+
208216
Args:
209217
X: Training features, shape (n_samples, n_features)
210218
y: Training labels, shape (n_samples,)
211-
219+
212220
Returns:
213221
self: Fitted classifier
214222
"""
215223
self.trees = []
216224
n_features = X.shape[1]
217-
225+
218226
# Default to sqrt of total features if not specified
219227
if self.n_features is None:
220228
self.n_features = int(np.sqrt(n_features))
@@ -223,24 +231,24 @@ def fit(self, X, y):
223231
tree = DecisionTreeClassifier(
224232
max_depth=self.max_depth,
225233
min_samples_split=self.min_samples_split,
226-
n_features=self.n_features
234+
n_features=self.n_features,
227235
)
228236
X_sample, y_sample = self._bootstrap_sample(X, y)
229237
tree.fit(X_sample, y_sample)
230238
self.trees.append(tree)
231-
239+
232240
return self
233241

234242
def _bootstrap_sample(self, X, y):
235243
"""Create a bootstrap sample from the dataset.
236-
244+
237245
Bootstrap sampling randomly samples with replacement from the dataset.
238246
This creates diverse training sets for each tree.
239-
247+
240248
Args:
241249
X: Features, shape (n_samples, n_features)
242250
y: Labels, shape (n_samples,)
243-
251+
244252
Returns:
245253
X_sample: Bootstrap sample of features
246254
y_sample: Bootstrap sample of labels
@@ -251,19 +259,19 @@ def _bootstrap_sample(self, X, y):
251259

252260
def predict(self, X):
253261
"""Predict class labels for samples in X.
254-
262+
255263
Uses majority voting: each tree votes for a class, and the
256264
class with the most votes becomes the final prediction.
257-
265+
258266
Args:
259267
X: Features, shape (n_samples, n_features)
260-
268+
261269
Returns:
262270
Predicted labels, shape (n_samples,)
263271
"""
264272
# Get predictions from all trees
265273
tree_preds = np.array([tree.predict(X) for tree in self.trees])
266-
274+
267275
# Majority voting: transpose to get predictions per sample
268276
# then find most common prediction for each sample
269277
tree_preds = np.swapaxes(tree_preds, 0, 1)
@@ -287,11 +295,7 @@ def _most_common_label(self, y):
287295

288296
# Generate sample classification dataset
289297
X, y = make_classification(
290-
n_samples=1000,
291-
n_features=20,
292-
n_informative=15,
293-
n_redundant=5,
294-
random_state=42
298+
n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42
295299
)
296300

297301
# Split the data
@@ -307,9 +311,7 @@ def _most_common_label(self, y):
307311
# Train Random Forest Classifier
308312
print("Training Random Forest Classifier...")
309313
rf_classifier = RandomForestClassifier(
310-
n_estimators=10,
311-
max_depth=10,
312-
min_samples_split=2
314+
n_estimators=10, max_depth=10, min_samples_split=2
313315
)
314316
rf_classifier.fit(X_train, y_train)
315317
print("Training complete!")

machine_learning/random_forest_regressor.py

Lines changed: 4 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -88,9 +88,7 @@ def _grow_tree(self, X, y, depth=0):
8888
right_indices = ~left_indices
8989

9090
left_subtree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
91-
right_subtree = self._grow_tree(
92-
X[right_indices], y[right_indices], depth + 1
93-
)
91+
right_subtree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)
9492

9593
return {
9694
"feature": best_split["feature"],
@@ -129,9 +127,7 @@ def _best_split(self, X, y, n_features):
129127
if np.sum(left_indices) == 0 or np.sum(right_indices) == 0:
130128
continue
131129

132-
mse = self._calculate_mse(
133-
y[left_indices], y[right_indices], len(y)
134-
)
130+
mse = self._calculate_mse(y[left_indices], y[right_indices], len(y))
135131

136132
if mse < best_mse:
137133
best_mse = mse
@@ -292,9 +288,7 @@ def fit(self, X, y):
292288
y_bootstrap = y[indices]
293289

294290
# Feature sampling
295-
feature_indices = np.random.choice(
296-
n_features, max_features, replace=False
297-
)
291+
feature_indices = np.random.choice(n_features, max_features, replace=False)
298292
X_bootstrap = X_bootstrap[:, feature_indices]
299293

300294
# Train decision tree
@@ -353,9 +347,7 @@ def predict(self, X):
353347
)
354348

355349
# Train the Random Forest Regressor
356-
rf_regressor = RandomForestRegressor(
357-
n_estimators=10, max_depth=5, random_state=42
358-
)
350+
rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42)
359351
rf_regressor.fit(X_train, y_train)
360352

361353
# Make predictions

0 commit comments

Comments (0)