kylie genner
professional
Asked 6/3/2024
Idyllic Answers
Here is an implementation of a random forest from scratch in Python:
import numpy as np

class RandomForest:
    def __init__(self, n_estimators, max_depth, min_samples_split, min_impurity_decrease):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity_decrease = min_impurity_decrease
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTreeClassifier(max_depth=self.max_depth,
                                          min_samples_split=self.min_samples_split,
                                          min_impurity_decrease=self.min_impurity_decrease)
            # Bootstrap sample: draw len(X) rows with replacement for each tree.
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            tree.fit(X[idx], y[idx])
            self.trees.append(tree)

    def predict(self, X):
        # One row of predictions per tree, then a majority vote per sample.
        all_preds = np.array([tree.predict(X) for tree in self.trees])
        def majority(col):
            values, counts = np.unique(col, return_counts=True)
            return values[counts.argmax()]
        return np.apply_along_axis(majority, 0, all_preds)

class DecisionTreeClassifier:
    def __init__(self, max_depth, min_samples_split, min_impurity_decrease):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity_decrease = min_impurity_decrease
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, 0)

    def predict(self, X):
        return np.array([self._traverse(x, self.tree) for x in X])

    def _build_tree(self, X, y, depth):
        if depth < self.max_depth and len(y) >= self.min_samples_split:
            # Consider a random subset of roughly log2(n_features) features per
            # split; this is what decorrelates the trees in the forest.
            n_feats = max(1, int(np.log2(X.shape[1])))
            feat_idxs = np.random.choice(X.shape[1], n_feats, replace=False)
            best_feat, best_val, best_gain = self._find_best_split(X, y, feat_idxs)
            if best_feat is not None and best_gain > self.min_impurity_decrease:
                left_X, right_X, left_y, right_y = self._split_data(X, y, best_feat, best_val)
                return {'feature': best_feat, 'value': best_val,
                        'left': self._build_tree(left_X, left_y, depth + 1),
                        'right': self._build_tree(right_X, right_y, depth + 1)}
        # Leaf node: store the majority class label.
        values, counts = np.unique(y, return_counts=True)
        return values[counts.argmax()]

    def _find_best_split(self, X, y, feat_idxs):
        best_feat, best_val, best_gain = None, None, -1.0
        parent_gini = gini(y)
        for feat in feat_idxs:
            X_column = X[:, feat]
            for threshold in np.unique(X_column):
                left_idxs, right_idxs = X_column <= threshold, X_column > threshold
                left_y, right_y = y[left_idxs], y[right_idxs]
                if len(left_y) == 0 or len(right_y) == 0:
                    continue  # this threshold does not actually split the data
                # Impurity decrease: parent Gini minus the weighted child Ginis.
                gain = (parent_gini
                        - (len(left_y) / len(y)) * gini(left_y)
                        - (len(right_y) / len(y)) * gini(right_y))
                if gain > best_gain:
                    best_feat, best_val, best_gain = feat, threshold, gain
        return best_feat, best_val, best_gain

    def _split_data(self, X, y, feat, val):
        left_idxs, right_idxs = X[:, feat] <= val, X[:, feat] > val
        return X[left_idxs], X[right_idxs], y[left_idxs], y[right_idxs]

    def _traverse(self, x, node):
        # Internal nodes are dicts; leaves hold a plain class label.
        if not isinstance(node, dict):
            return node
        if x[node['feature']] <= node['value']:
            return self._traverse(x, node['left'])
        return self._traverse(x, node['right'])

def gini(y):
    # Gini impurity: 1 minus the sum of squared class proportions.
    if len(y) == 0:
        return 0.0
    return 1 - np.sum(np.array([np.mean(y == c) for c in np.unique(y)]) ** 2)
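For intuition, Gini impurity is 1 - sum_c p_c**2, where p_c is the proportion of class c in the node: it is 0 for a pure node and 0.5 for an evenly mixed binary node. A quick check with the gini function above:

print(gini(np.array([0, 0, 0, 0])))  # 0.0   -> pure node
print(gini(np.array([0, 0, 1, 1])))  # 0.5   -> evenly mixed binary node
print(gini(np.array([0, 1, 1, 1])))  # 0.375 -> 1 - (0.25**2 + 0.75**2)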
# Example usage (synthetic data here, just for demonstration):
np.random.seed(0)
X = np.random.rand(100, 8)                # your feature data, shape (n_samples, n_features)
y = (X[:, 0] + X[:, 1] > 1).astype(int)   # your target data: integer class labels
rf = RandomForest(n_estimators=10, max_depth=5, min_samples_split=2, min_impurity_decrease=0.01)
rf.fit(X, y)
predictions = rf.predict(X)
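If you have scikit-learn installed, a rough sanity check is to train the library's RandomForestClassifier on the same synthetic data and compare training accuracy. Both models are randomized, so expect similar but not identical numbers:

from sklearn.ensemble import RandomForestClassifier

sk_rf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)
sk_rf.fit(X, y)
print("from scratch :", np.mean(rf.predict(X) == y))
print("scikit-learn :", np.mean(sk_rf.predict(X) == y))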
This implementation includes the following:
- A RandomForest class that trains multiple decision trees on bootstrap samples and combines their predictions by majority vote.
- A DecisionTreeClassifier class that builds a decision tree with the recursive method _build_tree.
- A _find_best_split method that finds the feature and threshold giving the largest impurity decrease.
- A _split_data method that splits the data on the chosen feature and threshold.
- A gini function that calculates the Gini impurity of a set of labels.

Note that this is a basic implementation; you may want to add more features, such as handling categorical variables, missing values, and hyperparameter tuning (a small tuning sketch follows below).
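For the hyperparameter tuning mentioned above, here is a minimal grid-search sketch over two of the forest's parameters, reusing X and y from the usage example. It scores on the training data only to keep the example short; in practice you would evaluate each candidate on a held-out validation set or with cross-validation:

best_score, best_params = -1.0, None
for n_estimators in (5, 10, 25):          # candidate values are illustrative
    for max_depth in (3, 5, 8):
        model = RandomForest(n_estimators=n_estimators, max_depth=max_depth,
                             min_samples_split=2, min_impurity_decrease=0.01)
        model.fit(X, y)
        score = np.mean(model.predict(X) == y)  # training accuracy, for brevity
        if score > best_score:
            best_score, best_params = score, (n_estimators, max_depth)
print("best (n_estimators, max_depth):", best_params, "score:", best_score)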