Source code for polyzymd.compare.statistics

"""Statistical tests for comparing simulation conditions.

This module provides statistical functions for comparing analysis results
across multiple conditions, including t-tests, ANOVA, and effect sizes.

All functions use SciPy for statistical calculations.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Sequence

import numpy as np
from numpy.typing import ArrayLike


[docs] @dataclass class TTestResult: """Result of a two-sample t-test. Attributes ---------- t_statistic : float The t-statistic p_value : float Two-tailed p-value """ t_statistic: float p_value: float @property def significant(self) -> bool: """Whether the result is significant at p < 0.05.""" return self.p_value < 0.05
[docs] def to_dict(self) -> dict: """Convert to dictionary.""" return { "t_statistic": self.t_statistic, "p_value": self.p_value, "significant": self.significant, }
[docs] @dataclass class EffectSize: """Cohen's d effect size with interpretation. Attributes ---------- cohens_d : float The effect size (positive = group1 > group2) interpretation : str Categorical interpretation: "negligible", "small", "medium", "large" direction : str For RMSF: "stabilizing" (d > 0, lower RMSF) or "destabilizing" (d < 0) """ cohens_d: float interpretation: str direction: str
[docs] def to_dict(self) -> dict: """Convert to dictionary.""" return { "cohens_d": self.cohens_d, "interpretation": self.interpretation, "direction": self.direction, }
[docs] @dataclass class ANOVAResult: """Result of one-way ANOVA. Attributes ---------- f_statistic : float The F-statistic p_value : float P-value for the test """ f_statistic: float p_value: float @property def significant(self) -> bool: """Whether the result is significant at p < 0.05.""" return self.p_value < 0.05
[docs] def to_dict(self) -> dict: """Convert to dictionary.""" return { "f_statistic": self.f_statistic, "p_value": self.p_value, "significant": self.significant, }
[docs] def independent_ttest( group1: ArrayLike, group2: ArrayLike, ) -> TTestResult: """Perform two-sample independent t-test. Tests the null hypothesis that two independent samples have identical expected values. Parameters ---------- group1 : array_like First group of values (e.g., control replicate means) group2 : array_like Second group of values (e.g., treatment replicate means) Returns ------- TTestResult Result containing t-statistic and p-value Examples -------- >>> control = [0.715, 0.693, 0.696] # No polymer RMSF >>> treatment = [0.517, 0.586] # 100% SBMA RMSF >>> result = independent_ttest(control, treatment) >>> print(f"t = {result.t_statistic:.3f}, p = {result.p_value:.4f}") """ from scipy import stats g1 = np.asarray(group1, dtype=np.float64) g2 = np.asarray(group2, dtype=np.float64) t, p = stats.ttest_ind(g1, g2) return TTestResult( t_statistic=float(t), p_value=float(p), )
[docs] def cohens_d( group1: ArrayLike, group2: ArrayLike, rmsf_mode: bool = True, ) -> EffectSize: """Compute Cohen's d effect size. Cohen's d is the difference between means divided by the pooled standard deviation. A positive d means group1 has higher values. For RMSF comparisons (rmsf_mode=True), direction is interpreted as: - d > 0 (control > treatment) = "stabilizing" (treatment reduces RMSF) - d < 0 (control < treatment) = "destabilizing" (treatment increases RMSF) Parameters ---------- group1 : array_like First group (typically control) group2 : array_like Second group (typically treatment) rmsf_mode : bool, optional If True, interpret direction for RMSF (lower = better). Default is True. Returns ------- EffectSize Effect size with interpretation Notes ----- Effect size interpretation (Cohen, 1988): - |d| < 0.2: negligible - 0.2 <= |d| < 0.5: small - 0.5 <= |d| < 0.8: medium - |d| >= 0.8: large """ g1 = np.asarray(group1, dtype=np.float64) g2 = np.asarray(group2, dtype=np.float64) n1, n2 = len(g1), len(g2) if n1 < 2 or n2 < 2: # Can't compute pooled std with < 2 samples d = 0.0 else: var1 = np.var(g1, ddof=1) var2 = np.var(g2, ddof=1) # Pooled standard deviation pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)) if pooled_std > 0: d = float((np.mean(g1) - np.mean(g2)) / pooled_std) else: d = 0.0 # Interpret magnitude d_abs = abs(d) if d_abs < 0.2: interpretation = "negligible" elif d_abs < 0.5: interpretation = "small" elif d_abs < 0.8: interpretation = "medium" else: interpretation = "large" # Interpret direction for RMSF if rmsf_mode: # For RMSF: positive d means control > treatment, so treatment stabilizes direction = "stabilizing" if d > 0 else "destabilizing" else: direction = "higher" if d > 0 else "lower" return EffectSize( cohens_d=d, interpretation=interpretation, direction=direction, )
[docs] def one_way_anova(*groups: ArrayLike) -> ANOVAResult: """Perform one-way ANOVA across multiple groups. Tests the null hypothesis that all groups have the same mean. Parameters ---------- *groups : array_like Variable number of groups to compare Returns ------- ANOVAResult Result containing F-statistic and p-value Examples -------- >>> no_poly = [0.715, 0.693, 0.696] >>> sbma = [0.517, 0.586] >>> egma = [0.558, 0.738, 0.496] >>> result = one_way_anova(no_poly, sbma, egma) >>> print(f"F = {result.f_statistic:.3f}, p = {result.p_value:.4f}") """ from scipy import stats # Convert to numpy arrays arrays = [np.asarray(g, dtype=np.float64) for g in groups] f, p = stats.f_oneway(*arrays) return ANOVAResult( f_statistic=float(f), p_value=float(p), )
[docs] def percent_change(control_mean: float, treatment_mean: float) -> float: """Calculate percent change from control. Parameters ---------- control_mean : float Mean value of control condition treatment_mean : float Mean value of treatment condition Returns ------- float Percent change: (treatment - control) / control * 100 Negative = reduction, Positive = increase """ if control_mean == 0: return 0.0 return (treatment_mean - control_mean) / control_mean * 100