← back

Implement Random Forest Feature Importance

#343 · Machine Learning · Medium

⊣ Solve on deep-ml.com

Problem

Implement Random Forest feature importance using the mean decrease in impurity (Gini importance). Build an ensemble of decision trees and compute how much each feature contributes to reducing impurity across all trees.

Solution

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import numpy as np
from typing import Dict, List, Tuple

def gini_impurity(y: np.ndarray) -> float:
    if len(y) == 0:
        return 0.0
    classes, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)
    return 1.0 - np.sum(probs ** 2)

def best_split(X: np.ndarray, y: np.ndarray, feature_indices: np.ndarray) -> Tuple:
    best_feat, best_thresh, best_gain = -1, 0.0, 0.0
    n = len(y)
    parent_impurity = gini_impurity(y)

    for feat in feature_indices:
        thresholds = np.unique(X[:, feat])
        for thresh in thresholds:
            left_mask = X[:, feat] <= thresh
            right_mask = ~left_mask
            if left_mask.sum() == 0 or right_mask.sum() == 0:
                continue

            left_imp = gini_impurity(y[left_mask])
            right_imp = gini_impurity(y[right_mask])
            weighted = (left_mask.sum() * left_imp + right_mask.sum() * right_imp) / n
            gain = parent_impurity - weighted

            if gain > best_gain:
                best_gain = gain
                best_feat = feat
                best_thresh = thresh

    return best_feat, best_thresh, best_gain

def build_tree(X, y, max_depth, feature_subset_size, importances, n_samples):
    if max_depth == 0 or len(np.unique(y)) == 1:
        return

    feat_indices = np.random.choice(X.shape[1], feature_subset_size, replace=False)
    feat, thresh, gain = best_split(X, y, feat_indices)

    if feat == -1:
        return

    importances[feat] += gain * len(y) / n_samples

    left_mask = X[:, feat] <= thresh
    build_tree(X[left_mask], y[left_mask], max_depth - 1, feature_subset_size, importances, n_samples)
    build_tree(X[~left_mask], y[~left_mask], max_depth - 1, feature_subset_size, importances, n_samples)

def random_forest_importance(
    X: np.ndarray,
    y: np.ndarray,
    n_trees: int = 10,
    max_depth: int = 5,
    seed: int = 42
) -> Dict[int, float]:
    np.random.seed(seed)
    n_samples, n_features = X.shape
    feature_subset_size = max(1, int(np.sqrt(n_features)))
    total_importances = np.zeros(n_features)

    for _ in range(n_trees):
        indices = np.random.choice(n_samples, n_samples, replace=True)
        X_boot, y_boot = X[indices], y[indices]
        tree_imp = np.zeros(n_features)
        build_tree(X_boot, y_boot, max_depth, feature_subset_size, tree_imp, n_samples)
        total_importances += tree_imp

    total_importances /= n_trees
    total = total_importances.sum()
    if total > 0:
        total_importances /= total

    return {i: round(float(v), 4) for i, v in enumerate(total_importances)}

Explanation

  1. For each tree, bootstrap sample the data and build a decision tree considering a random subset of features at each split (sqrt(n_features)).
  2. At each split, record the weighted impurity decrease, scaled by the fraction of samples reaching that node.
  3. Accumulate importances across all trees and normalize so they sum to 1.
  4. Features with higher importance contribute more to reducing Gini impurity on average.

Complexity

  • Time: O(T n f * d) where T is trees, n is samples, f is feature subset size, d is max depth
  • Space: O(n p + T p) where p is the number of features