"""FlameIQ statistical comparison engine.
Provides optional statistical significance testing as a complement to the
threshold-based comparator. Use when benchmark noise is high and you need
confidence that a detected regression is real rather than noise.
Supported methods (v1.0)
------------------------
1. **Mann-Whitney U test** — non-parametric, distribution-free.
Preferred for latency distributions, which are typically right-skewed.
2. **Median-based noise filter** — warmup-aware, stable central tendency.
All methods are **deterministic** given fixed inputs. No random seeds.
Mathematical specification
--------------------------
See :doc:`/specs/statistical-methodology` for the full specification.
"""
from __future__ import annotations
import logging
import math
from dataclasses import dataclass
import scipy as sp
from flameiq.core.errors import InsufficientSamplesError
logger = logging.getLogger(__name__)
#: Minimum samples required for any statistical test.
MINIMUM_SAMPLES: int = 3
[docs]
@dataclass(frozen=True)
class StatisticalResult:
"""Result of a statistical significance test."""
is_significant: bool
"""True if the difference is statistically significant."""
p_value: float
"""The test p-value. Lower = stronger evidence."""
effect_size: float
"""Cohen's *d* effect size. Positive = current > baseline."""
test_name: str
"""The test used (e.g. ``"Mann-Whitney U"``)."""
confidence_level: float
"""The confidence level (default 0.95)."""
@property
def alpha(self) -> float:
"""Significance threshold α = 1 − confidence_level."""
return round(1.0 - self.confidence_level, 10)
@property
def effect_label(self) -> str:
"""Cohen (1988) verbal label for the effect size magnitude."""
d = abs(self.effect_size)
if d < 0.2:
return "negligible"
if d < 0.5:
return "small"
if d < 0.8:
return "medium"
return "large"
[docs]
def mann_whitney_compare(
baseline_samples: list[float],
current_samples: list[float],
confidence: float = 0.95,
minimum_samples: int = MINIMUM_SAMPLES,
) -> StatisticalResult:
"""Compare two sample sets using the Mann-Whitney U test.
Tests the one-tailed hypothesis that the current distribution tends to
produce **larger** values than the baseline distribution.
This is the preferred test for latency distributions, which are
typically right-skewed and non-normal.
Args:
baseline_samples: Measurements from the baseline run.
current_samples: Measurements from the current run.
confidence: Required confidence level. Default 0.95 (95%).
minimum_samples: Minimum samples required in each group.
Returns:
A :class:`StatisticalResult` with significance, p-value, and effect size.
Raises:
:class:`~flameiq.core.errors.InsufficientSamplesError`:
If either sample set has fewer than ``minimum_samples`` entries.
References:
Mann, H. B., & Whitney, D. R. (1947). On a test of whether one of
two random variables is stochastically larger than the other.
*Annals of Mathematical Statistics*, 18(1), 50–60.
"""
if len(baseline_samples) < minimum_samples:
raise InsufficientSamplesError("baseline", len(baseline_samples), minimum_samples)
if len(current_samples) < minimum_samples:
raise InsufficientSamplesError("current", len(current_samples), minimum_samples)
alpha = 1.0 - confidence
_, p_value_raw = sp.stats.mannwhitneyu(
current_samples,
baseline_samples,
alternative="greater",
)
p_value = float(p_value_raw)
is_significant = bool(p_value < alpha)
effect = _cohens_d(baseline_samples, current_samples)
logger.debug(
"Mann-Whitney U: p=%.6f significant=%s effect_size=%.4f alpha=%.4f",
p_value,
is_significant,
effect,
alpha,
)
return StatisticalResult(
is_significant=is_significant,
p_value=round(p_value, 6),
effect_size=round(effect, 4),
test_name="Mann-Whitney U",
confidence_level=confidence,
)
def _cohens_d(group1: list[float], group2: list[float]) -> float:
"""Compute Cohen's *d* effect size.
Cohen's d = (mean₂ − mean₁) / pooled_std
Positive values indicate group2 > group1 (current > baseline).
Returns ``0.0`` if pooled standard deviation is zero.
References:
Cohen, J. (1988). *Statistical Power Analysis for the Behavioral
Sciences* (2nd ed.). Erlbaum.
"""
n1, n2 = len(group1), len(group2)
mean1 = sum(group1) / n1
mean2 = sum(group2) / n2
var1 = (sum((x - mean1) ** 2 for x in group1) / (n1 - 1)) if n1 > 1 else 0.0
var2 = (sum((x - mean2) ** 2 for x in group2) / (n2 - 1)) if n2 > 1 else 0.0
pooled_std = math.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
if pooled_std == 0.0:
return 0.0
return (mean2 - mean1) / pooled_std