# Source code for flameiq.engine.statistics

"""FlameIQ statistical comparison engine.

Provides optional statistical significance testing as a complement to the
threshold-based comparator. Use when benchmark noise is high and you need
confidence that a detected regression is real rather than noise.

Supported methods (v1.0)
------------------------

1. **Mann-Whitney U test** — non-parametric, distribution-free.
   Preferred for latency distributions, which are typically right-skewed.

2. **Median-based noise filter** — warmup-aware, stable central tendency.

All methods are **deterministic** given fixed inputs. No random seeds.

Mathematical specification
--------------------------
See :doc:`/specs/statistical-methodology` for the full specification.
"""

from __future__ import annotations

import logging
import math
from dataclasses import dataclass

import scipy as sp

from flameiq.core.errors import InsufficientSamplesError

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

#: Minimum samples required for any statistical test.
#: Used as the default ``minimum_samples`` of :func:`mann_whitney_compare`.
MINIMUM_SAMPLES: int = 3


@dataclass(frozen=True)
class StatisticalResult:
    """Result of a statistical significance test.

    Instances are immutable (``frozen=True``). The fields carry the raw
    outcome of a test; :attr:`alpha` and :attr:`effect_label` are derived
    from them on access.
    """

    is_significant: bool
    """True if the difference is statistically significant."""

    p_value: float
    """The test p-value. Lower = stronger evidence."""

    effect_size: float
    """Cohen's *d* effect size. Positive = current > baseline."""

    test_name: str
    """The test used (e.g. ``"Mann-Whitney U"``)."""

    confidence_level: float
    """The confidence level the test was evaluated at (e.g. ``0.95``)."""

    @property
    def alpha(self) -> float:
        """Significance threshold α = 1 − confidence_level.

        The subtraction is rounded to 10 decimal places so that binary
        floating-point residue (``1.0 - 0.95`` is ``0.050000000000000044``)
        does not leak into the reported threshold.
        """
        return round(1.0 - self.confidence_level, 10)

    @property
    def effect_label(self) -> str:
        """Cohen (1988) verbal label for the effect size magnitude.

        The sign of the effect is ignored; only ``abs(effect_size)`` is
        classified:

        * below 0.2 → ``"negligible"``
        * below 0.5 → ``"small"``
        * below 0.8 → ``"medium"``
        * otherwise → ``"large"``
        """
        d = abs(self.effect_size)
        if d < 0.2:
            return "negligible"
        if d < 0.5:
            return "small"
        if d < 0.8:
            return "medium"
        return "large"
def mann_whitney_compare(
    baseline_samples: list[float],
    current_samples: list[float],
    confidence: float = 0.95,
    minimum_samples: int = MINIMUM_SAMPLES,
) -> StatisticalResult:
    """Compare two sample sets using the Mann-Whitney U test.

    Tests the one-tailed hypothesis that the current distribution tends to
    produce **larger** values than the baseline distribution.

    This is the preferred test for latency distributions, which are
    typically right-skewed and non-normal.

    Args:
        baseline_samples: Measurements from the baseline run.
        current_samples: Measurements from the current run.
        confidence: Required confidence level, as a fraction in ``[0, 1]``.
            Default 0.95 (95%).
        minimum_samples: Minimum samples required in each group.

    Returns:
        A :class:`StatisticalResult` with significance, p-value (rounded to
        6 decimal places), and Cohen's *d* effect size (rounded to 4
        decimal places). The difference is significant when the unrounded
        p-value is strictly below ``alpha = 1 - confidence``.

    Raises:
        :class:`~flameiq.core.errors.InsufficientSamplesError`: If either
            sample set has fewer than ``minimum_samples`` entries.
        ValueError: If ``confidence`` is not within ``[0, 1]`` (for example
            when a percentage such as ``95`` is passed instead of ``0.95``).

    References:
        Mann, H. B., & Whitney, D. R. (1947). On a test of whether one of
        two random variables is stochastically larger than the other.
        *Annals of Mathematical Statistics*, 18(1), 50–60.
    """
    if len(baseline_samples) < minimum_samples:
        raise InsufficientSamplesError("baseline", len(baseline_samples), minimum_samples)
    if len(current_samples) < minimum_samples:
        raise InsufficientSamplesError("current", len(current_samples), minimum_samples)
    # Written as a negated chained comparison so that NaN is rejected too.
    if not 0.0 <= confidence <= 1.0:
        raise ValueError(f"confidence must be within [0, 1], got {confidence!r}.")

    # Round exactly like StatisticalResult.alpha does. Unrounded,
    # 1.0 - 0.95 evaluates to 0.050000000000000044, which lets a p-value of
    # exactly 0.05 (e.g. 1/20, the smallest exact one-sided p-value for
    # 3-vs-3 samples) slip under the strict ``p < alpha`` test and
    # contradict the alpha reported on the result.
    alpha = round(1.0 - confidence, 10)

    # Current samples go first: alternative="greater" then tests whether
    # current values tend to be larger than baseline values.
    _, p_value_raw = sp.stats.mannwhitneyu(
        current_samples,
        baseline_samples,
        alternative="greater",
    )
    p_value = float(p_value_raw)
    is_significant = bool(p_value < alpha)
    effect = _cohens_d(baseline_samples, current_samples)

    logger.debug(
        "Mann-Whitney U: p=%.6f significant=%s effect_size=%.4f alpha=%.4f",
        p_value,
        is_significant,
        effect,
        alpha,
    )

    return StatisticalResult(
        is_significant=is_significant,
        p_value=round(p_value, 6),
        effect_size=round(effect, 4),
        test_name="Mann-Whitney U",
        confidence_level=confidence,
    )
def noise_filter_median(samples: list[float], warmup: int = 0) -> float:
    """Compute a noise-resistant median, optionally discarding warmup runs.

    Args:
        samples: Raw measurement samples in collection order. The first
            ``warmup`` entries are dropped; the remainder is sorted
            internally, so its order does not matter.
        warmup: Number of leading samples to discard as warmup. Must be
            non-negative. Default 0.

    Returns:
        Median of the remaining samples. For an even count this is the
        mean of the two middle values.

    Raises:
        ValueError: If ``warmup`` is negative, or if no samples remain
            after the warmup discard.

    Notes:
        The median is more robust than the mean for noisy benchmark data
        with occasional outlier spikes.

    Examples::

        noise_filter_median([1.0, 3.0, 5.0])             # → 3.0
        noise_filter_median([99.0, 1.0, 3.0], warmup=1)  # → 2.0
    """
    # A negative value would make the slice below count from the END of the
    # list (samples[-1:] keeps only the last sample), silently producing a
    # wrong median instead of discarding leading warmup runs.
    if warmup < 0:
        raise ValueError(f"warmup must be non-negative, got {warmup}.")

    filtered = samples[warmup:]
    if not filtered:
        raise ValueError(
            f"No samples remain after discarding {warmup} warmup run(s) from {len(samples)} total."
        )

    ordered = sorted(filtered)
    count = len(ordered)
    mid = count // 2
    if count % 2 == 1:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2.0
def _cohens_d(group1: list[float], group2: list[float]) -> float: """Compute Cohen's *d* effect size. Cohen's d = (mean₂ − mean₁) / pooled_std Positive values indicate group2 > group1 (current > baseline). Returns ``0.0`` if pooled standard deviation is zero. References: Cohen, J. (1988). *Statistical Power Analysis for the Behavioral Sciences* (2nd ed.). Erlbaum. """ n1, n2 = len(group1), len(group2) mean1 = sum(group1) / n1 mean2 = sum(group2) / n2 var1 = (sum((x - mean1) ** 2 for x in group1) / (n1 - 1)) if n1 > 1 else 0.0 var2 = (sum((x - mean2) ** 2 for x in group2) / (n2 - 1)) if n2 > 1 else 0.0 pooled_std = math.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)) if pooled_std == 0.0: return 0.0 return (mean2 - mean1) / pooled_std