Implement Bagging Classifier from Scratch

#307 · Machine Learning · Hard

Problem

Implement a bagging (Bootstrap Aggregating) classifier from scratch. Train multiple base classifiers on bootstrap samples of the training data and aggregate predictions using majority voting.

Solution

import numpy as np
from collections import Counter

class BaggingClassifier:
    def __init__(self, base_classifier_class, n_estimators: int = 10,
                 sample_fraction: float = 1.0, **base_params):
        self.base_classifier_class = base_classifier_class
        self.n_estimators = n_estimators
        self.sample_fraction = sample_fraction
        self.base_params = base_params
        self.estimators = []
        self.oob_indices = []

    def _bootstrap_sample(self, X: np.ndarray, y: np.ndarray):
        n = len(y)
        sample_size = int(n * self.sample_fraction)
        indices = np.random.choice(n, size=sample_size, replace=True)
        oob = list(set(range(n)) - set(indices))
        return X[indices], y[indices], oob

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.estimators = []
        self.oob_indices = []

        for _ in range(self.n_estimators):
            X_boot, y_boot, oob = self._bootstrap_sample(X, y)
            clf = self.base_classifier_class(**self.base_params)
            clf.fit(X_boot, y_boot)
            self.estimators.append(clf)
            self.oob_indices.append(oob)

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        all_preds = np.array([clf.predict(X) for clf in self.estimators])
        # all_preds shape: (n_estimators, n_samples)
        final_preds = []
        for i in range(X.shape[0]):
            votes = all_preds[:, i].tolist()
            counter = Counter(votes)
            final_preds.append(counter.most_common(1)[0][0])
        return np.array(final_preds)

    def oob_score(self, X: np.ndarray, y: np.ndarray) -> float:
        n = len(y)
        oob_preds = {}
        for est_idx, (clf, oob) in enumerate(zip(self.estimators, self.oob_indices)):
            if len(oob) == 0:
                continue
            preds = clf.predict(X[oob])
            for sample_idx, pred in zip(oob, preds):
                if sample_idx not in oob_preds:
                    oob_preds[sample_idx] = []
                oob_preds[sample_idx].append(pred)

        correct = 0
        total = 0
        for idx, preds in oob_preds.items():
            majority = Counter(preds).most_common(1)[0][0]
            if majority == y[idx]:
                correct += 1
            total += 1

        return correct / total if total > 0 else 0.0

Explanation

Bootstrap sampling: for each estimator, draw n samples with replacement from the training set. About 36.8% of samples are left out (out-of-bag).
Training: fit an independent base classifier on each bootstrap sample.
Prediction: collect votes from all estimators and use majority voting.
OOB score: each sample is evaluated only by estimators that did not see it during training, providing a built-in cross-validation estimate.
Bagging reduces variance without increasing bias, working especially well with high-variance base learners like decision trees.

Complexity

Time: O(n_estimators * T_base) for training, where T_base is the base classifier training time
Space: O(n_estimators * S_base) where S_base is the base classifier size

← #306 #308 →