← back

A/B Test Statistical Analysis for Model Comparison

#269 · MLOps · Hard

⊣ Solve on deep-ml.com

Problem

Perform a rigorous A/B test statistical analysis to compare two ML models. Given performance metrics from both models (e.g., accuracy, latency), compute the statistical significance, confidence intervals, effect size, and recommend whether to deploy the challenger model.

Solution

Use a two-sample t-test (Welch's) for each metric, compute Cohen's d effect size, and build confidence intervals.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import math

def ab_test_analysis(
    control: list[float],
    treatment: list[float],
    alpha: float = 0.05,
    min_effect_size: float = 0.0,
) -> dict:
    n_c = len(control)
    n_t = len(treatment)

    mean_c = sum(control) / n_c
    mean_t = sum(treatment) / n_t

    var_c = sum((x - mean_c) ** 2 for x in control) / (n_c - 1) if n_c > 1 else 0
    var_t = sum((x - mean_t) ** 2 for x in treatment) / (n_t - 1) if n_t > 1 else 0

    # Welch's t-test
    se = math.sqrt(var_c / n_c + var_t / n_t) if (var_c + var_t) > 0 else 1e-12
    t_stat = (mean_t - mean_c) / se

    # Welch-Satterthwaite degrees of freedom
    num = (var_c / n_c + var_t / n_t) ** 2
    denom = ((var_c / n_c) ** 2 / (n_c - 1) + (var_t / n_t) ** 2 / (n_t - 1)) if (n_c > 1 and n_t > 1) else 1
    df = num / denom if denom > 0 else 1.0

    # Approximate two-tailed p-value using normal approximation for large df
    # For a more accurate result we use a simple approximation
    z = abs(t_stat)
    # Approximation of 2-tailed p-value from t-distribution (normal approx for df > 30)
    if df > 30:
        p_value = 2.0 * (1.0 - _normal_cdf(z))
    else:
        p_value = 2.0 * (1.0 - _t_cdf(z, df))

    p_value = min(p_value, 1.0)

    # Cohen's d effect size
    pooled_std = math.sqrt(((n_c - 1) * var_c + (n_t - 1) * var_t) / (n_c + n_t - 2)) if (n_c + n_t > 2) else 1e-12
    cohens_d = (mean_t - mean_c) / pooled_std if pooled_std > 0 else 0.0

    # 95% confidence interval for the difference
    t_crit = _t_critical(alpha, df)
    ci_lower = (mean_t - mean_c) - t_crit * se
    ci_upper = (mean_t - mean_c) + t_crit * se

    significant = p_value < alpha
    practically_significant = abs(cohens_d) >= min_effect_size if min_effect_size > 0 else True
    recommend_deploy = significant and (mean_t > mean_c) and practically_significant

    return {
        "control_mean": round(mean_c, 6),
        "treatment_mean": round(mean_t, 6),
        "difference": round(mean_t - mean_c, 6),
        "t_statistic": round(t_stat, 6),
        "p_value": round(p_value, 6),
        "degrees_of_freedom": round(df, 2),
        "cohens_d": round(cohens_d, 6),
        "ci_lower": round(ci_lower, 6),
        "ci_upper": round(ci_upper, 6),
        "significant": significant,
        "recommend_deploy": recommend_deploy,
    }


def _normal_cdf(x: float) -> float:
    return 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))


def _t_cdf(t: float, df: float) -> float:
    # Regularized incomplete beta approximation
    x = df / (df + t * t)
    return 1.0 - 0.5 * _incomplete_beta(df / 2.0, 0.5, x)


def _incomplete_beta(a: float, b: float, x: float) -> float:
    # Simple continued fraction approximation (Lentz's method)
    if x < 0 or x > 1:
        return 0.0
    if x == 0 or x == 1:
        return x
    # Use the regularized incomplete beta via series expansion
    lbeta = math.lgamma(a) + math.lgamma(b) - math.lgamma(a + b)
    front = math.exp(a * math.log(x) + b * math.log(1 - x) - lbeta) / a
    # Series expansion
    s = 1.0
    term = 1.0
    for n in range(1, 200):
        term *= (n - b) * x / (n * (a + n))
        s += term
        if abs(term) < 1e-10:
            break
    return min(max(front * a * s, 0.0), 1.0)


def _t_critical(alpha: float, df: float) -> float:
    # Approximate t-critical using normal approximation + correction
    # For df > 30, use normal; otherwise a simple polynomial correction
    z = _normal_ppf(1.0 - alpha / 2.0)
    if df > 30:
        return z
    g1 = (z ** 3 + z) / 4
    g2 = (5 * z ** 5 + 16 * z ** 3 + 3 * z) / 96
    return z + g1 / df + g2 / (df ** 2)


def _normal_ppf(p: float) -> float:
    # Rational approximation for the inverse normal CDF
    if p <= 0:
        return -10.0
    if p >= 1:
        return 10.0
    if p < 0.5:
        return -_normal_ppf(1.0 - p)
    t = math.sqrt(-2.0 * math.log(1.0 - p))
    c0, c1, c2 = 2.515517, 0.802853, 0.010328
    d1, d2, d3 = 1.432788, 0.189269, 0.001308
    return t - (c0 + c1 * t + c2 * t ** 2) / (1.0 + d1 * t + d2 * t ** 2 + d3 * t ** 3)

Explanation

  1. Compute sample means and variances for both control and treatment groups.
  2. Apply Welch's t-test which does not assume equal variances. The t-statistic measures the standardized difference.
  3. Estimate the degrees of freedom using the Welch-Satterthwaite equation.
  4. Compute a p-value (two-tailed) to test whether the difference is statistically significant.
  5. Calculate Cohen's d effect size to measure practical significance.
  6. Build a confidence interval for the true difference in means.
  7. Recommend deployment if the difference is both statistically and practically significant.

Complexity

  • Time: O(n) where n is the total number of observations
  • Space: O(1) beyond the input data