← back

Implement Bagging Classifier from Scratch

#307 · Machine Learning · Hard

⊣ Solve on deep-ml.com

Problem

Implement a bagging (Bootstrap Aggregating) classifier from scratch. Train multiple base classifiers on bootstrap samples of the training data and aggregate predictions using majority voting.

Solution

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import numpy as np
from collections import Counter

class BaggingClassifier:
    def __init__(self, base_classifier_class, n_estimators: int = 10,
                 sample_fraction: float = 1.0, **base_params):
        self.base_classifier_class = base_classifier_class
        self.n_estimators = n_estimators
        self.sample_fraction = sample_fraction
        self.base_params = base_params
        self.estimators = []
        self.oob_indices = []

    def _bootstrap_sample(self, X: np.ndarray, y: np.ndarray):
        n = len(y)
        sample_size = int(n * self.sample_fraction)
        indices = np.random.choice(n, size=sample_size, replace=True)
        oob = list(set(range(n)) - set(indices))
        return X[indices], y[indices], oob

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.estimators = []
        self.oob_indices = []

        for _ in range(self.n_estimators):
            X_boot, y_boot, oob = self._bootstrap_sample(X, y)
            clf = self.base_classifier_class(**self.base_params)
            clf.fit(X_boot, y_boot)
            self.estimators.append(clf)
            self.oob_indices.append(oob)

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        all_preds = np.array([clf.predict(X) for clf in self.estimators])
        # all_preds shape: (n_estimators, n_samples)
        final_preds = []
        for i in range(X.shape[0]):
            votes = all_preds[:, i].tolist()
            counter = Counter(votes)
            final_preds.append(counter.most_common(1)[0][0])
        return np.array(final_preds)

    def oob_score(self, X: np.ndarray, y: np.ndarray) -> float:
        n = len(y)
        oob_preds = {}
        for est_idx, (clf, oob) in enumerate(zip(self.estimators, self.oob_indices)):
            if len(oob) == 0:
                continue
            preds = clf.predict(X[oob])
            for sample_idx, pred in zip(oob, preds):
                if sample_idx not in oob_preds:
                    oob_preds[sample_idx] = []
                oob_preds[sample_idx].append(pred)

        correct = 0
        total = 0
        for idx, preds in oob_preds.items():
            majority = Counter(preds).most_common(1)[0][0]
            if majority == y[idx]:
                correct += 1
            total += 1

        return correct / total if total > 0 else 0.0

Explanation

  1. Bootstrap sampling: for each estimator, draw n samples with replacement from the training set. About 36.8% of samples are left out (out-of-bag).
  2. Training: fit an independent base classifier on each bootstrap sample.
  3. Prediction: collect votes from all estimators and use majority voting.
  4. OOB score: each sample is evaluated only by estimators that did not see it during training, providing a built-in cross-validation estimate.
  5. Bagging reduces variance without increasing bias, working especially well with high-variance base learners like decision trees.

Complexity

  • Time: O(n_estimators * T_base) for training, where T_base is the base classifier training time
  • Space: O(n_estimators * S_base) where S_base is the base classifier size