#343 · Machine Learning · Medium
⊣ Solve on deep-ml.comImplement Random Forest feature importance using the mean decrease in impurity (Gini importance). Build an ensemble of decision trees and compute how much each feature contributes to reducing impurity across all trees.
import numpy as np
from typing import Dict, List, Tuple
def gini_impurity(y: np.ndarray) -> float:
if len(y) == 0:
return 0.0
classes, counts = np.unique(y, return_counts=True)
probs = counts / len(y)
return 1.0 - np.sum(probs ** 2)
def best_split(X: np.ndarray, y: np.ndarray, feature_indices: np.ndarray) -> Tuple:
best_feat, best_thresh, best_gain = -1, 0.0, 0.0
n = len(y)
parent_impurity = gini_impurity(y)
for feat in feature_indices:
thresholds = np.unique(X[:, feat])
for thresh in thresholds:
left_mask = X[:, feat] <= thresh
right_mask = ~left_mask
if left_mask.sum() == 0 or right_mask.sum() == 0:
continue
left_imp = gini_impurity(y[left_mask])
right_imp = gini_impurity(y[right_mask])
weighted = (left_mask.sum() * left_imp + right_mask.sum() * right_imp) / n
gain = parent_impurity - weighted
if gain > best_gain:
best_gain = gain
best_feat = feat
best_thresh = thresh
return best_feat, best_thresh, best_gain
def build_tree(X, y, max_depth, feature_subset_size, importances, n_samples):
if max_depth == 0 or len(np.unique(y)) == 1:
return
feat_indices = np.random.choice(X.shape[1], feature_subset_size, replace=False)
feat, thresh, gain = best_split(X, y, feat_indices)
if feat == -1:
return
importances[feat] += gain * len(y) / n_samples
left_mask = X[:, feat] <= thresh
build_tree(X[left_mask], y[left_mask], max_depth - 1, feature_subset_size, importances, n_samples)
build_tree(X[~left_mask], y[~left_mask], max_depth - 1, feature_subset_size, importances, n_samples)
def random_forest_importance(
X: np.ndarray,
y: np.ndarray,
n_trees: int = 10,
max_depth: int = 5,
seed: int = 42
) -> Dict[int, float]:
np.random.seed(seed)
n_samples, n_features = X.shape
feature_subset_size = max(1, int(np.sqrt(n_features)))
total_importances = np.zeros(n_features)
for _ in range(n_trees):
indices = np.random.choice(n_samples, n_samples, replace=True)
X_boot, y_boot = X[indices], y[indices]
tree_imp = np.zeros(n_features)
build_tree(X_boot, y_boot, max_depth, feature_subset_size, tree_imp, n_samples)
total_importances += tree_imp
total_importances /= n_trees
total = total_importances.sum()
if total > 0:
total_importances /= total
return {i: round(float(v), 4) for i, v in enumerate(total_importances)}