Perform a rigorous A/B test statistical analysis to compare two ML models. Given performance metrics from both models (e.g., accuracy, latency), compute the statistical significance, confidence intervals, effect size, and recommend whether to deploy the challenger model.
Use a two-sample t-test (Welch's) for each metric, compute Cohen's d effect size, and build confidence intervals.
import math
def ab_test_analysis(
control: list[float],
treatment: list[float],
alpha: float = 0.05,
min_effect_size: float = 0.0,
) -> dict:
n_c = len(control)
n_t = len(treatment)
mean_c = sum(control) / n_c
mean_t = sum(treatment) / n_t
var_c = sum((x - mean_c) ** 2 for x in control) / (n_c - 1) if n_c > 1 else 0
var_t = sum((x - mean_t) ** 2 for x in treatment) / (n_t - 1) if n_t > 1 else 0
# Welch's t-test
se = math.sqrt(var_c / n_c + var_t / n_t) if (var_c + var_t) > 0 else 1e-12
t_stat = (mean_t - mean_c) / se
# Welch-Satterthwaite degrees of freedom
num = (var_c / n_c + var_t / n_t) ** 2
denom = ((var_c / n_c) ** 2 / (n_c - 1) + (var_t / n_t) ** 2 / (n_t - 1)) if (n_c > 1 and n_t > 1) else 1
df = num / denom if denom > 0 else 1.0
# Approximate two-tailed p-value using normal approximation for large df
# For a more accurate result we use a simple approximation
z = abs(t_stat)
# Approximation of 2-tailed p-value from t-distribution (normal approx for df > 30)
if df > 30:
p_value = 2.0 * (1.0 - _normal_cdf(z))
else:
p_value = 2.0 * (1.0 - _t_cdf(z, df))
p_value = min(p_value, 1.0)
# Cohen's d effect size
pooled_std = math.sqrt(((n_c - 1) * var_c + (n_t - 1) * var_t) / (n_c + n_t - 2)) if (n_c + n_t > 2) else 1e-12
cohens_d = (mean_t - mean_c) / pooled_std if pooled_std > 0 else 0.0
# 95% confidence interval for the difference
t_crit = _t_critical(alpha, df)
ci_lower = (mean_t - mean_c) - t_crit * se
ci_upper = (mean_t - mean_c) + t_crit * se
significant = p_value < alpha
practically_significant = abs(cohens_d) >= min_effect_size if min_effect_size > 0 else True
recommend_deploy = significant and (mean_t > mean_c) and practically_significant
return {
"control_mean": round(mean_c, 6),
"treatment_mean": round(mean_t, 6),
"difference": round(mean_t - mean_c, 6),
"t_statistic": round(t_stat, 6),
"p_value": round(p_value, 6),
"degrees_of_freedom": round(df, 2),
"cohens_d": round(cohens_d, 6),
"ci_lower": round(ci_lower, 6),
"ci_upper": round(ci_upper, 6),
"significant": significant,
"recommend_deploy": recommend_deploy,
}
def _normal_cdf(x: float) -> float:
return 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))
def _t_cdf(t: float, df: float) -> float:
# Regularized incomplete beta approximation
x = df / (df + t * t)
return 1.0 - 0.5 * _incomplete_beta(df / 2.0, 0.5, x)
def _incomplete_beta(a: float, b: float, x: float) -> float:
# Simple continued fraction approximation (Lentz's method)
if x < 0 or x > 1:
return 0.0
if x == 0 or x == 1:
return x
# Use the regularized incomplete beta via series expansion
lbeta = math.lgamma(a) + math.lgamma(b) - math.lgamma(a + b)
front = math.exp(a * math.log(x) + b * math.log(1 - x) - lbeta) / a
# Series expansion
s = 1.0
term = 1.0
for n in range(1, 200):
term *= (n - b) * x / (n * (a + n))
s += term
if abs(term) < 1e-10:
break
return min(max(front * a * s, 0.0), 1.0)
def _t_critical(alpha: float, df: float) -> float:
# Approximate t-critical using normal approximation + correction
# For df > 30, use normal; otherwise a simple polynomial correction
z = _normal_ppf(1.0 - alpha / 2.0)
if df > 30:
return z
g1 = (z ** 3 + z) / 4
g2 = (5 * z ** 5 + 16 * z ** 3 + 3 * z) / 96
return z + g1 / df + g2 / (df ** 2)
def _normal_ppf(p: float) -> float:
# Rational approximation for the inverse normal CDF
if p <= 0:
return -10.0
if p >= 1:
return 10.0
if p < 0.5:
return -_normal_ppf(1.0 - p)
t = math.sqrt(-2.0 * math.log(1.0 - p))
c0, c1, c2 = 2.515517, 0.802853, 0.010328
d1, d2, d3 = 1.432788, 0.189269, 0.001308
return t - (c0 + c1 * t + c2 * t ** 2) / (1.0 + d1 * t + d2 * t ** 2 + d3 * t ** 3)