Implement Out-of-Bag (OOB) score calculation for a Random Forest. Each tree is trained on a bootstrap sample, so about 37% of data is left out. Use these out-of-bag samples for each tree to compute an unbiased estimate of the model's performance.
import numpy as np
from collections import Counter
from typing import Dict
def gini_impurity(y: np.ndarray) -> float:
if len(y) == 0:
return 0.0
_, counts = np.unique(y, return_counts=True)
probs = counts / len(y)
return 1.0 - np.sum(probs ** 2)
def build_simple_tree(X, y, max_depth, max_features):
if max_depth == 0 or len(np.unique(y)) == 1 or len(y) < 2:
vals, counts = np.unique(y, return_counts=True)
return {"leaf": True, "value": vals[np.argmax(counts)]}
feats = np.random.choice(X.shape[1], min(max_features, X.shape[1]), replace=False)
best_feat, best_thresh, best_gain = -1, 0, 0
parent_imp = gini_impurity(y)
for f in feats:
for t in np.unique(X[:, f]):
lm = X[:, f] <= t
if lm.sum() == 0 or (~lm).sum() == 0:
continue
gain = parent_imp - (lm.sum() * gini_impurity(y[lm]) + (~lm).sum() * gini_impurity(y[~lm])) / len(y)
if gain > best_gain:
best_gain, best_feat, best_thresh = gain, f, t
if best_feat == -1:
vals, counts = np.unique(y, return_counts=True)
return {"leaf": True, "value": vals[np.argmax(counts)]}
lm = X[:, best_feat] <= best_thresh
return {
"leaf": False, "feat": best_feat, "thresh": best_thresh,
"left": build_simple_tree(X[lm], y[lm], max_depth - 1, max_features),
"right": build_simple_tree(X[~lm], y[~lm], max_depth - 1, max_features)
}
def predict_tree(tree, x):
if tree["leaf"]:
return tree["value"]
if x[tree["feat"]] <= tree["thresh"]:
return predict_tree(tree["left"], x)
return predict_tree(tree["right"], x)
def oob_score(
X: np.ndarray, y: np.ndarray,
n_trees: int = 50, max_depth: int = 5, seed: int = 42
) -> Dict:
np.random.seed(seed)
n = len(y)
max_features = max(1, int(np.sqrt(X.shape[1])))
oob_votes = [[] for _ in range(n)]
for _ in range(n_trees):
bag = np.random.choice(n, n, replace=True)
oob_mask = np.ones(n, dtype=bool)
oob_mask[bag] = False
tree = build_simple_tree(X[bag], y[bag], max_depth, max_features)
for i in np.where(oob_mask)[0]:
pred = predict_tree(tree, X[i])
oob_votes[i].append(pred)
correct = 0
counted = 0
for i in range(n):
if oob_votes[i]:
counter = Counter(oob_votes[i])
majority = counter.most_common(1)[0][0]
if majority == y[i]:
correct += 1
counted += 1
score = correct / counted if counted > 0 else 0.0
return {
"oob_score": round(score, 4),
"samples_with_oob": counted,
"total_samples": n
}