Implement Random Forest Feature Importance

#343 · Machine Learning · Medium

Problem

Implement Random Forest feature importance using the mean decrease in impurity (Gini importance). Build an ensemble of decision trees and compute how much each feature contributes to reducing impurity across all trees.

Solution

import numpy as np
from typing import Dict, List, Tuple

def gini_impurity(y: np.ndarray) -> float:
    if len(y) == 0:
        return 0.0
    classes, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)
    return 1.0 - np.sum(probs ** 2)

def best_split(X: np.ndarray, y: np.ndarray, feature_indices: np.ndarray) -> Tuple:
    best_feat, best_thresh, best_gain = -1, 0.0, 0.0
    n = len(y)
    parent_impurity = gini_impurity(y)

    for feat in feature_indices:
        thresholds = np.unique(X[:, feat])
        for thresh in thresholds:
            left_mask = X[:, feat] <= thresh
            right_mask = ~left_mask
            if left_mask.sum() == 0 or right_mask.sum() == 0:
                continue

            left_imp = gini_impurity(y[left_mask])
            right_imp = gini_impurity(y[right_mask])
            weighted = (left_mask.sum() * left_imp + right_mask.sum() * right_imp) / n
            gain = parent_impurity - weighted

            if gain > best_gain:
                best_gain = gain
                best_feat = feat
                best_thresh = thresh

    return best_feat, best_thresh, best_gain

def build_tree(X, y, max_depth, feature_subset_size, importances, n_samples):
    if max_depth == 0 or len(np.unique(y)) == 1:
        return

    feat_indices = np.random.choice(X.shape[1], feature_subset_size, replace=False)
    feat, thresh, gain = best_split(X, y, feat_indices)

    if feat == -1:
        return

    importances[feat] += gain * len(y) / n_samples

    left_mask = X[:, feat] <= thresh
    build_tree(X[left_mask], y[left_mask], max_depth - 1, feature_subset_size, importances, n_samples)
    build_tree(X[~left_mask], y[~left_mask], max_depth - 1, feature_subset_size, importances, n_samples)

def random_forest_importance(
    X: np.ndarray,
    y: np.ndarray,
    n_trees: int = 10,
    max_depth: int = 5,
    seed: int = 42
) -> Dict[int, float]:
    np.random.seed(seed)
    n_samples, n_features = X.shape
    feature_subset_size = max(1, int(np.sqrt(n_features)))
    total_importances = np.zeros(n_features)

    for _ in range(n_trees):
        indices = np.random.choice(n_samples, n_samples, replace=True)
        X_boot, y_boot = X[indices], y[indices]
        tree_imp = np.zeros(n_features)
        build_tree(X_boot, y_boot, max_depth, feature_subset_size, tree_imp, n_samples)
        total_importances += tree_imp

    total_importances /= n_trees
    total = total_importances.sum()
    if total > 0:
        total_importances /= total

    return {i: round(float(v), 4) for i, v in enumerate(total_importances)}

Explanation

For each tree, bootstrap sample the data and build a decision tree considering a random subset of features at each split (sqrt(n_features)).
At each split, record the weighted impurity decrease, scaled by the fraction of samples reaching that node.
Accumulate importances across all trees and normalize so they sum to 1.
Features with higher importance contribute more to reducing Gini impurity on average.

Complexity

Time: O(T n f * d) where T is trees, n is samples, f is feature subset size, d is max depth
Space: O(n p + T p) where p is the number of features

← #342 #344 →