|
| 1 | +"""Random Forest Classifier implementation from scratch. |
| 2 | +
|
| 3 | +This module implements a Random Forest Classifier using: |
| 4 | +- Decision Tree base learners built from scratch |
| 5 | +- Bootstrap sampling (bagging) |
| 6 | +- Random feature selection at splits |
| 7 | +- Majority voting for aggregation |
| 8 | +""" |
| 9 | + |
| 10 | +import numpy as np |
| 11 | +from collections import Counter |
| 12 | + |
| 13 | + |
class DecisionTreeClassifier:
    """A Decision Tree Classifier built from scratch.

    Splits are chosen greedily by maximizing information gain
    (entropy reduction). The fitted tree is stored as nested dicts with
    keys ``'leaf'``, ``'value'``, ``'feature'``, ``'threshold'``,
    ``'left'`` and ``'right'``.

    Attributes:
        max_depth: Maximum depth of the tree.
        min_samples_split: Minimum samples required to split a node.
        n_features: Number of features considered at each split.
        tree: The built tree structure (None until ``fit`` is called).
    """

    def __init__(self, max_depth=10, min_samples_split=2, n_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_features = n_features
        self.tree = None

    def fit(self, X, y):
        """Build the decision tree.

        Args:
            X: Training features, shape (n_samples, n_features).
            y: Training labels, shape (n_samples,). Labels may be any
                hashable values (ints, strings, ...), not only
                non-negative integers.
        """
        # Explicit None check (not truthiness); cap at the number of
        # features actually present in the data.
        if self.n_features is None:
            self.n_features = X.shape[1]
        else:
            self.n_features = min(self.n_features, X.shape[1])
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively grow the tree; returns a node dict."""
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # Stopping criteria: depth limit, pure node, or too few samples.
        if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
            return {'leaf': True, 'value': self._most_common_label(y)}

        # Random feature subset — this is what decorrelates forest trees.
        feat_idxs = np.random.choice(n_features, self.n_features, replace=False)
        best_feat, best_thresh = self._best_split(X, y, feat_idxs)

        # No split with positive gain exists: make a leaf.
        if best_feat is None:
            return {'leaf': True, 'value': self._most_common_label(y)}

        # Partition samples on the chosen feature/threshold.
        left_idxs = X[:, best_feat] <= best_thresh
        right_idxs = ~left_idxs

        return {
            'leaf': False,
            'feature': best_feat,
            'threshold': best_thresh,
            'left': self._grow_tree(X[left_idxs], y[left_idxs], depth + 1),
            'right': self._grow_tree(X[right_idxs], y[right_idxs], depth + 1),
        }

    def _best_split(self, X, y, feat_idxs):
        """Find the feature and threshold with the highest information gain.

        Returns:
            (feature_index, threshold), or (None, None) when no split
            yields strictly positive gain.
        """
        # Require strictly positive gain. Starting at 0 (not -1) rejects
        # degenerate zero-gain splits (e.g. a constant feature), which
        # would otherwise create an empty child and crash downstream.
        best_gain = 0.0
        split_idx, split_thresh = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)

            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, threshold):
        """Calculate information gain of splitting at ``threshold``."""
        parent_entropy = self._entropy(y)

        # Candidate children.
        left_idxs = X_column <= threshold
        right_idxs = ~left_idxs

        n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)
        # A one-sided "split" carries no information.
        if n_left == 0 or n_right == 0:
            return 0

        # Weighted average entropy of the children.
        n = len(y)
        e_left = self._entropy(y[left_idxs])
        e_right = self._entropy(y[right_idxs])
        child_entropy = (n_left / n) * e_left + (n_right / n) * e_right

        return parent_entropy - child_entropy

    def _entropy(self, y):
        """Calculate entropy of a label distribution.

        Uses np.unique counts, so labels may be arbitrary values
        (np.bincount would require non-negative integers).
        """
        _, counts = np.unique(y, return_counts=True)
        ps = counts / len(y)
        # counts > 0 by construction, so log2 is always finite here.
        return -np.sum(ps * np.log2(ps))

    def _most_common_label(self, y):
        """Return the most common label."""
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        """Predict class labels for samples in X.

        Args:
            X: Features, shape (n_samples, n_features)

        Returns:
            Predicted labels, shape (n_samples,)
        """
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        """Traverse the tree to make a prediction for a single sample."""
        if node['leaf']:
            return node['value']

        if x[node['feature']] <= node['threshold']:
            return self._traverse_tree(x, node['left'])
        return self._traverse_tree(x, node['right'])
| 147 | + |
| 148 | + |
class RandomForestClassifier:
    """Random Forest Classifier built from scratch.

    An ensemble of decision trees, each trained on a bootstrap sample of
    the data with a random subset of features considered at every split.
    Final predictions are made by majority vote across the trees.

    Features:
        - Bootstrap sampling (bagging) to create diverse trees
        - Random feature selection at each split
        - Majority voting for final predictions

    Attributes:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
        min_samples_split: Minimum samples required to split a node.
        n_features: Number of features to consider for best split.
        trees: List of trained decision trees (empty until ``fit``).

    Example:
        >>> rf = RandomForestClassifier(n_estimators=10, max_depth=10)
        >>> rf.fit(X_train, y_train)
        >>> y_pred = rf.predict(X_test)
    """

    def __init__(self, n_estimators=100, max_depth=10, min_samples_split=2, n_features=None):
        """Initialize Random Forest Classifier.

        Args:
            n_estimators: Number of trees in the forest (default: 100)
            max_depth: Maximum depth of each tree (default: 10)
            min_samples_split: Minimum samples required to split (default: 2)
            n_features: Number of features to consider for best split.
                If None, uses sqrt(n_features) (default: None)
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_features = n_features
        self.trees = []

    def fit(self, X, y):
        """Build a forest of trees from the training set (X, y).

        Args:
            X: Training features, shape (n_samples, n_features)
            y: Training labels, shape (n_samples,)

        Returns:
            self: Fitted classifier
        """
        self.trees = []
        n_features = X.shape[1]

        # Standard RF heuristic for classification: sqrt of total features.
        if self.n_features is None:
            self.n_features = int(np.sqrt(n_features))

        for _ in range(self.n_estimators):
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                n_features=self.n_features
            )
            # Each tree sees its own bootstrap resample of the data.
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

        return self

    def _bootstrap_sample(self, X, y):
        """Create a bootstrap sample from the dataset.

        Bootstrap sampling draws n_samples indices with replacement,
        which gives each tree a diverse training set.

        Args:
            X: Features, shape (n_samples, n_features)
            y: Labels, shape (n_samples,)

        Returns:
            X_sample: Bootstrap sample of features
            y_sample: Bootstrap sample of labels
        """
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def predict(self, X):
        """Predict class labels for samples in X.

        Uses majority voting: each tree votes for a class, and the
        class with the most votes becomes the final prediction.

        Args:
            X: Features, shape (n_samples, n_features)

        Returns:
            Predicted labels, shape (n_samples,)

        Raises:
            RuntimeError: If called before ``fit`` (otherwise the empty
                prediction array fails later with an obscure numpy error).
        """
        if not self.trees:
            raise RuntimeError("RandomForestClassifier is not fitted; call fit() first.")

        # shape (n_estimators, n_samples): one row of votes per tree.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])

        # Transpose to (n_samples, n_estimators) so each row holds all
        # votes for one sample, then take the majority per row.
        tree_preds = np.swapaxes(tree_preds, 0, 1)
        y_pred = [self._most_common_label(tree_pred) for tree_pred in tree_preds]
        return np.array(y_pred)

    def _most_common_label(self, y):
        """Return the most common label (majority vote)."""
        counter = Counter(y)
        return counter.most_common(1)[0][0]
| 277 | + |
| 278 | + |
if __name__ == "__main__":
    # Demo: train and evaluate the from-scratch forest on synthetic data.
    from sklearn.datasets import make_classification
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.model_selection import train_test_split

    print("Random Forest Classifier - Example Usage")
    print("=" * 50)

    # Build a synthetic binary classification problem.
    features, labels = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        random_state=42,
    )

    # Hold out 20% of the samples for evaluation.
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    print(f"Training samples: {train_X.shape[0]}")
    print(f"Test samples: {test_X.shape[0]}")
    print(f"Number of features: {train_X.shape[1]}")
    print()

    # Fit the ensemble.
    print("Training Random Forest Classifier...")
    forest = RandomForestClassifier(
        n_estimators=10,
        max_depth=10,
        min_samples_split=2,
    )
    forest.fit(train_X, train_y)
    print("Training complete!")
    print()

    # Predict on the held-out set and report metrics.
    predictions = forest.predict(test_X)

    print(f"Accuracy: {accuracy_score(test_y, predictions):.4f}")
    print()
    print("Classification Report:")
    print(classification_report(test_y, predictions))