Implement Out-of-Bag Score Calculation

#345 · Machine Learning · Medium

Problem

Implement Out-of-Bag (OOB) score calculation for a Random Forest. Each tree is trained on a bootstrap sample, so about 37% of data is left out. Use these out-of-bag samples for each tree to compute an unbiased estimate of the model's performance.

Solution

import numpy as np
from collections import Counter
from typing import Dict

def gini_impurity(y: np.ndarray) -> float:
    if len(y) == 0:
        return 0.0
    _, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)
    return 1.0 - np.sum(probs ** 2)

def build_simple_tree(X, y, max_depth, max_features):
    if max_depth == 0 or len(np.unique(y)) == 1 or len(y) < 2:
        vals, counts = np.unique(y, return_counts=True)
        return {"leaf": True, "value": vals[np.argmax(counts)]}

    feats = np.random.choice(X.shape[1], min(max_features, X.shape[1]), replace=False)
    best_feat, best_thresh, best_gain = -1, 0, 0
    parent_imp = gini_impurity(y)

    for f in feats:
        for t in np.unique(X[:, f]):
            lm = X[:, f] <= t
            if lm.sum() == 0 or (~lm).sum() == 0:
                continue
            gain = parent_imp - (lm.sum() * gini_impurity(y[lm]) + (~lm).sum() * gini_impurity(y[~lm])) / len(y)
            if gain > best_gain:
                best_gain, best_feat, best_thresh = gain, f, t

    if best_feat == -1:
        vals, counts = np.unique(y, return_counts=True)
        return {"leaf": True, "value": vals[np.argmax(counts)]}

    lm = X[:, best_feat] <= best_thresh
    return {
        "leaf": False, "feat": best_feat, "thresh": best_thresh,
        "left": build_simple_tree(X[lm], y[lm], max_depth - 1, max_features),
        "right": build_simple_tree(X[~lm], y[~lm], max_depth - 1, max_features)
    }

def predict_tree(tree, x):
    if tree["leaf"]:
        return tree["value"]
    if x[tree["feat"]] <= tree["thresh"]:
        return predict_tree(tree["left"], x)
    return predict_tree(tree["right"], x)

def oob_score(
    X: np.ndarray, y: np.ndarray,
    n_trees: int = 50, max_depth: int = 5, seed: int = 42
) -> Dict:
    np.random.seed(seed)
    n = len(y)
    max_features = max(1, int(np.sqrt(X.shape[1])))
    oob_votes = [[] for _ in range(n)]

    for _ in range(n_trees):
        bag = np.random.choice(n, n, replace=True)
        oob_mask = np.ones(n, dtype=bool)
        oob_mask[bag] = False

        tree = build_simple_tree(X[bag], y[bag], max_depth, max_features)

        for i in np.where(oob_mask)[0]:
            pred = predict_tree(tree, X[i])
            oob_votes[i].append(pred)

    correct = 0
    counted = 0
    for i in range(n):
        if oob_votes[i]:
            counter = Counter(oob_votes[i])
            majority = counter.most_common(1)[0][0]
            if majority == y[i]:
                correct += 1
            counted += 1

    score = correct / counted if counted > 0 else 0.0
    return {
        "oob_score": round(score, 4),
        "samples_with_oob": counted,
        "total_samples": n
    }

Explanation

For each tree, draw a bootstrap sample. About 37% of samples are not drawn (out-of-bag).
Train a decision tree on the bootstrap sample. Use the OOB samples for that tree to collect predictions.
Each data point accumulates OOB predictions across all trees where it was not in the training set.
The OOB prediction for each point is the majority vote. The OOB score is the fraction of correct OOB predictions.

Complexity

Time: O(T (n f * d + n_oob)) where T is trees, n is samples, f is features, d is depth
Space: O(n * T) for OOB vote storage

← #344 #346 →