Source code for polyzymd.compare.comparators.exposure

"""Exposure dynamics comparator for chaperone-like polymer-protein interaction analysis.

This module provides ExposureDynamicsComparator, which orchestrates:
1. SASA computation (MDTraj shrake_rupley, protein-only)
2. Exposure dynamics analysis (classify residues, detect chaperone events)
3. Chaperone enrichment (dual residue/atom normalization)
4. Statistical comparison of chaperone fraction across conditions

Design follows the ContactsComparator pattern:
- compare() is fully overridden (custom multi-metric flow)
- _load_or_compute() handles caching at replicate level
- Condition summaries aggregate per-replicate ExposureDynamicsResults

Registration: ``@ComparatorRegistry.register("exposure")``
"""

from __future__ import annotations

import logging
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar

import numpy as np

from polyzymd import __version__
from polyzymd.analysis.core.metric_type import MetricType
from polyzymd.analysis.core.statistics import compute_sem
from polyzymd.compare.core.base import ANOVASummary, BaseComparator, PairwiseComparison
from polyzymd.compare.core.registry import ComparatorRegistry
from polyzymd.compare.results.exposure import ExposureComparisonResult, ExposureConditionSummary
from polyzymd.compare.settings import ExposureAnalysisSettings, ExposureComparisonSettings
from polyzymd.compare.statistics import cohens_d, independent_ttest, one_way_anova, percent_change

if TYPE_CHECKING:
    from polyzymd.analysis.contacts.results import ContactResult
    from polyzymd.analysis.exposure.dynamics import ExposureDynamicsResult
    from polyzymd.analysis.exposure.enrichment import ChaperoneEnrichmentResult
    from polyzymd.analysis.sasa.trajectory import SASATrajectoryResult
    from polyzymd.compare.config import ComparisonConfig, ConditionConfig

logger = logging.getLogger("polyzymd.compare")

# Type alias for per-condition raw data dict
ExposureConditionData = dict[str, Any]


[docs] @ComparatorRegistry.register("exposure") class ExposureDynamicsComparator( BaseComparator[ ExposureAnalysisSettings, ExposureConditionData, ExposureConditionSummary, ExposureComparisonResult, ] ): """Compare chaperone-like polymer activity across simulation conditions. Combines per-frame SASA data with polymer-protein contact data to: 1. Classify each protein residue as stably exposed, stably buried, or transiently exposed. 2. Detect "chaperone events" (buried → exposed → polymer contact → re-buried) and unassisted refolding events. 3. Compute dynamic chaperone enrichment per (polymer_type, aa_group) pair with dual residue/atom normalization. 4. Statistically compare chaperone_fraction across conditions. Parameters ---------- config : ComparisonConfig Comparison configuration defining conditions. analysis_settings : ExposureAnalysisSettings Settings defining SASA and exposure parameters. comparison_settings : ExposureComparisonSettings, optional Settings for statistical comparison. Defaults to ExposureComparisonSettings() if not provided. equilibration : str, optional Equilibration time override. If None, uses config.defaults.equilibration_time. Notes ----- - This is a MEAN_BASED metric (chaperone fraction is an average over frames). - Conditions without polymer (no chaperone events possible) are excluded. - Contacts must be pre-computed (contacts_rep{n}.json must exist). - SASA is computed on demand and cached under analysis_dir/sasa/. """ comparison_type: ClassVar[str] = "exposure"
[docs] def __init__( self, config: "ComparisonConfig", analysis_settings: ExposureAnalysisSettings, comparison_settings: ExposureComparisonSettings | None = None, equilibration: str | None = None, ): super().__init__(config, analysis_settings, equilibration) self.comparison_settings = comparison_settings or ExposureComparisonSettings()
[docs] @classmethod def comparison_type_name(cls) -> str: """Return the comparison type identifier.""" return "exposure"
@property def metric_type(self) -> MetricType: """Chaperone fraction is a mean-based metric. Chaperone fraction is the fraction of exposed windows that coincide with polymer contact — an average over discrete events. The mean converges regardless of autocorrelation; uncertainty is corrected using N_eff. Returns ------- MetricType MetricType.MEAN_BASED """ return MetricType.MEAN_BASED # ======================================================================== # Override compare() — custom multi-metric flow # ========================================================================
[docs] def compare(self, recompute: bool = False) -> ExposureComparisonResult: """Run exposure dynamics comparison across all conditions. Parameters ---------- recompute : bool, optional If True, force recompute even if cached results exist. Returns ------- ExposureComparisonResult Complete comparison with statistics and rankings. """ self._recompute = recompute logger.info(f"Starting exposure dynamics comparison: {self.config.name}") logger.info(f"Conditions: {len(self.config.conditions)}") logger.info(f"Equilibration: {self.equilibration}") # Step 1: Filter conditions (exclude no-polymer) valid_conditions, excluded_conditions = self._filter_conditions() if excluded_conditions: logger.warning( f"Auto-excluding {len(excluded_conditions)} condition(s) without polymer: " f"{[c.label for c in excluded_conditions]}" ) if len(valid_conditions) < 2: raise ValueError( f"Need at least 2 conditions with polymer for exposure comparison. " f"Found {len(valid_conditions)} valid condition(s)." ) # Step 2: Load or compute for each condition condition_data: list[tuple["ConditionConfig", ExposureConditionData]] = [] for cond in valid_conditions: data = self._load_or_compute(cond, recompute) condition_data.append((cond, data)) # Step 3: Build condition summaries summaries: list[ExposureConditionSummary] = [] for cond, data in condition_data: summary = self._build_condition_summary(cond, data) summaries.append(summary) # Step 4: Determine effective control effective_control = self._get_effective_control(excluded_conditions) # Step 5: Pairwise comparisons on chaperone_fraction comparisons = self._compute_pairwise_comparisons(summaries, effective_control) # Step 6: ANOVA if 3+ conditions anova: ANOVASummary | None = None if len(summaries) >= 3: anova = self._compute_anova(summaries) # Step 7: Rankings ranked_chaperone = sorted(summaries, key=lambda s: s.mean_chaperone_fraction, reverse=True) ranked_transient = sorted(summaries, key=lambda s: s.mean_transient_fraction, reverse=True) return ExposureComparisonResult( name=self.config.name, metric="chaperone_fraction", control_label=effective_control, conditions=summaries, pairwise_comparisons=comparisons, anova=anova, ranking=[s.label for s in ranked_chaperone], ranking_by_transient_fraction=[s.label for s in ranked_transient], excluded_conditions=[c.label for c in excluded_conditions], equilibration_time=self.equilibration, created_at=datetime.now(), polyzymd_version=__version__, )
# ======================================================================== # Abstract Method Implementations # ======================================================================== def _load_or_compute( self, cond: "ConditionConfig", recompute: bool, ) -> ExposureConditionData: """Load or compute exposure dynamics for a condition. For each replicate: 1. Load ContactResult from cached JSON. 2. Compute (or load cached) SASATrajectoryResult. 3. Compute (or load cached) ExposureDynamicsResult. 4. Compute ChaperoneEnrichmentResult. Parameters ---------- cond : ConditionConfig Condition to analyze. recompute : bool Force recompute even if cached. Returns ------- ExposureConditionData Dict with per-replicate dynamics results and enrichment. """ from polyzymd.config.schema import SimulationConfig logger.info(f"Processing condition: {cond.label}") sim_config = SimulationConfig.from_yaml(cond.config) # Resolve condition-specific output directory (None in standalone mode) condition_output_dir = self._resolve_condition_output_dir(cond.label, "exposure") dynamics_per_rep: list["ExposureDynamicsResult"] = [] enrichment_per_rep: list["ChaperoneEnrichmentResult"] = [] successful_reps: list[int] = [] for rep in cond.replicates: result = self._load_or_compute_replicate( sim_config, rep, recompute, cond_config_path=Path(cond.config), condition_output_dir=condition_output_dir, ) if result is not None: dynamics, enrichment = result dynamics_per_rep.append(dynamics) enrichment_per_rep.append(enrichment) successful_reps.append(rep) if not dynamics_per_rep: raise ValueError(f"No successful replicates for condition: {cond.label}") if len(dynamics_per_rep) < len(cond.replicates): missing = set(cond.replicates) - set(successful_reps) logger.warning( f" Missing replicates for {cond.label}: {missing} " f"(using {len(dynamics_per_rep)} of {len(cond.replicates)})" ) return { "dynamics_per_rep": dynamics_per_rep, "enrichment_per_rep": enrichment_per_rep, } def _load_or_compute_replicate( self, sim_config: Any, replicate: int, recompute: bool, cond_config_path: Path | None = None, condition_output_dir: Path | None = None, ) -> tuple["ExposureDynamicsResult", "ChaperoneEnrichmentResult"] | None: """Load or compute exposure dynamics for a single replicate. Parameters ---------- sim_config : SimulationConfig Simulation configuration. replicate : int Replicate number. recompute : bool Force recompute. cond_config_path : Path, optional Condition config path for fallback result location. condition_output_dir : Path, optional Condition-specific output directory (from comparison mode). Checked first before falling back to ``projects_directory``. Returns ------- tuple[ExposureDynamicsResult, ChaperoneEnrichmentResult] or None """ from polyzymd.analysis.contacts.results import ContactResult from polyzymd.analysis.core.loader import TrajectoryLoader from polyzymd.analysis.exposure.dynamics import ( ExposureDynamicsResult, analyze_exposure_dynamics, ) from polyzymd.analysis.exposure.enrichment import compute_chaperone_enrichment from polyzymd.analysis.sasa.config import SASAConfig from polyzymd.analysis.sasa.trajectory import compute_trajectory_sasa # Locate cached ContactResult — check condition-specific contacts dir first # Since condition_output_dir is .../analysis/<label>/exposure, # contacts is at the sibling path .../analysis/<label>/contacts contacts_cond_dir: Path | None = None if condition_output_dir is not None: contacts_cond_dir = condition_output_dir.parent / "contacts" contact_result_path = self._find_contact_result( sim_config, replicate, cond_config_path=cond_config_path, condition_output_dir=contacts_cond_dir, ) if contact_result_path is None or not contact_result_path.exists(): logger.warning( f" Contacts not found for rep {replicate}: {contact_result_path}. " "Run contacts analysis first." ) return None contact_result = ContactResult.load(contact_result_path) logger.info(f" Loaded contacts for rep {replicate}: {contact_result_path}") # Use TrajectoryLoader for consistent path resolution (DRY) loader = TrajectoryLoader(sim_config) try: traj_info = loader.get_trajectory_info(replicate) except FileNotFoundError as e: logger.warning(f" Skipping replicate {replicate}: trajectory data not found. {e}") return None try: topology_path = traj_info.topology_file trajectory_paths = traj_info.trajectory_files # list[Path] — multi-segment support # Analysis directory for this replicate — use condition-specific dir if available if condition_output_dir is not None: analysis_dir = condition_output_dir / f"rep{replicate}" else: analysis_dir = ( self._get_analysis_dir(sim_config, cond_config_path) / f"rep{replicate}" ) # SASA config sasa_config = SASAConfig( exposure_threshold=self.analysis_settings.exposure_threshold, probe_radius_nm=self.analysis_settings.probe_radius_nm, n_sphere_points=self.analysis_settings.n_sphere_points, chain_id=self.analysis_settings.protein_chain, cache_sasa=True, ) # Check cached ExposureDynamicsResult dynamics_cache_path = ExposureDynamicsResult.cache_path(analysis_dir) if not recompute and dynamics_cache_path.exists(): logger.info(f" Loading cached exposure dynamics: {dynamics_cache_path}") dynamics = ExposureDynamicsResult.load(dynamics_cache_path) # Still need to compute enrichment (not cached separately) sasa_result = compute_trajectory_sasa( topology_path=topology_path, trajectory_path=trajectory_paths, config=sasa_config, analysis_dir=analysis_dir, recompute=False, # Use SASA cache if available ) enrichment = compute_chaperone_enrichment( sasa_result=sasa_result, contact_result=contact_result, polymer_resnames=self.analysis_settings.polymer_resnames, ) return dynamics, enrichment # Compute SASA logger.info(f" Computing SASA for rep {replicate}...") sasa_result = compute_trajectory_sasa( topology_path=topology_path, trajectory_path=trajectory_paths, config=sasa_config, analysis_dir=analysis_dir, recompute=recompute, ) # Compute exposure dynamics (with caching) from polyzymd.analysis.exposure.config import ExposureConfig exposure_config = ExposureConfig( transient_lower=self.analysis_settings.transient_lower, transient_upper=self.analysis_settings.transient_upper, min_event_length=self.analysis_settings.min_event_length, ) logger.info(f" Analyzing exposure dynamics for rep {replicate}...") dynamics = analyze_exposure_dynamics( sasa_result=sasa_result, contact_result=contact_result, config=exposure_config, analysis_dir=analysis_dir, recompute=recompute, ) # Compute enrichment enrichment = compute_chaperone_enrichment( sasa_result=sasa_result, contact_result=contact_result, polymer_resnames=self.analysis_settings.polymer_resnames, ) return dynamics, enrichment except Exception as e: logger.warning(f" Skipping replicate {replicate}: analysis failed with error: {e}") return None def _build_condition_summary( self, cond: "ConditionConfig", data: ExposureConditionData, ) -> ExposureConditionSummary: """Build condition summary from per-replicate exposure dynamics results. Parameters ---------- cond : ConditionConfig Condition configuration. data : ExposureConditionData Raw per-replicate data. Returns ------- ExposureConditionSummary """ dynamics_per_rep: list["ExposureDynamicsResult"] = data["dynamics_per_rep"] enrichment_per_rep: list["ChaperoneEnrichmentResult"] = data["enrichment_per_rep"] n_replicates = len(dynamics_per_rep) # Per-replicate primary metric: mean chaperone_fraction over transient residues chap_fractions: list[float] = [] transient_fractions: list[float] = [] n_transient_per_rep: list[float] = [] total_chap_events: list[float] = [] total_unassisted: list[float] = [] for dyn in dynamics_per_rep: n_total = dyn.n_residues n_transient = dyn.n_transient() n_transient_per_rep.append(float(n_transient)) transient_fractions.append(float(n_transient / n_total) if n_total > 0 else 0.0) transient_residues = dyn.transient_residues() if transient_residues: chap_frac = float(np.mean([r.chaperone_fraction for r in transient_residues])) else: chap_frac = 0.0 chap_fractions.append(chap_frac) total_chap_events.append(float(dyn.total_chaperone_events())) total_unassisted.append(float(dyn.total_unassisted_events())) mean_chap = float(np.mean(chap_fractions)) _chap_stats = compute_sem(chap_fractions) if n_replicates > 1 else None sem_chap = _chap_stats.sem if _chap_stats else 0.0 mean_transient = float(np.mean(transient_fractions)) _transient_stats = compute_sem(transient_fractions) if n_replicates > 1 else None sem_transient = _transient_stats.sem if _transient_stats else 0.0 # Aggregate enrichment: mean residue-based enrichment per (polymer_type, aa_group) enrichment_by_ptype: dict[str, dict[str, float]] = {} polymer_types_set: set[str] = set() aa_groups_set: set[str] = set() for enr in enrichment_per_rep: for e in enr.entries: polymer_types_set.add(e.polymer_type) aa_groups_set.add(e.aa_group) polymer_types = sorted(polymer_types_set) aa_groups = sorted(aa_groups_set) for ptype in polymer_types: enrichment_by_ptype[ptype] = {} for ag in aa_groups: vals = [] for enr in enrichment_per_rep: entry = enr.get(ptype, ag) if entry is not None: vals.append(entry.enrichment_residue) enrichment_by_ptype[ptype][ag] = float(np.mean(vals)) if vals else float("nan") return ExposureConditionSummary( label=cond.label, config_path=str(cond.config), n_replicates=n_replicates, replicate_values=chap_fractions, mean_chaperone_fraction=mean_chap, sem_chaperone_fraction=sem_chap, mean_transient_fraction=mean_transient, sem_transient_fraction=sem_transient, mean_n_transient=float(np.mean(n_transient_per_rep)), mean_total_chaperone_events=float(np.mean(total_chap_events)), mean_total_unassisted_events=float(np.mean(total_unassisted)), enrichment_by_polymer_type=enrichment_by_ptype, polymer_types=polymer_types, aa_groups=aa_groups, ) def _get_replicate_values(self, summary: ExposureConditionSummary) -> list[float]: """Return per-replicate chaperone fractions for statistical tests.""" return summary.replicate_values def _get_mean_value(self, summary: ExposureConditionSummary) -> float: """Return mean chaperone fraction.""" return summary.mean_chaperone_fraction @property def _direction_labels(self) -> tuple[str, str, str]: """Higher chaperone fraction = more polymer-assisted events.""" return ("decreased", "unchanged", "increased") def _rank_summaries( self, summaries: list[ExposureConditionSummary] ) -> list[ExposureConditionSummary]: """Rank by chaperone fraction (highest = most chaperone activity).""" return sorted(summaries, key=lambda s: s.mean_chaperone_fraction, reverse=True) # ======================================================================== # Polymer detection filter # ======================================================================== def _filter_conditions( self, ) -> tuple[list["ConditionConfig"], list["ConditionConfig"]]: """Exclude conditions that cannot have chaperone events. A condition is excluded if: 1. Its ``SimulationConfig.polymers`` is ``None`` or disabled — no polymer means no chaperone events are possible. 2. No pre-computed contact JSON exists for any replicate. The polymer check (1) is the primary gate and prevents the fallback file-search from accidentally matching contacts that belong to a different simulation sharing the same ``projects_directory``. Returns ------- tuple[list[ConditionConfig], list[ConditionConfig]] (valid_conditions, excluded_conditions) """ from polyzymd.config.schema import SimulationConfig valid_conditions = [] excluded_conditions = [] for cond in self.config.conditions: try: sim_config = SimulationConfig.from_yaml(cond.config) # Primary gate: conditions without polymer cannot have # chaperone events — exclude before searching for contacts. if sim_config.polymers is None or ( hasattr(sim_config.polymers, "enabled") and not sim_config.polymers.enabled ): excluded_conditions.append(cond) logger.info(f" Excluding '{cond.label}': no polymer configured") continue # Resolve condition-specific contacts dir for lookup exposure_cond_dir = self._resolve_condition_output_dir(cond.label, "exposure") contacts_cond_dir: Path | None = None if exposure_cond_dir is not None: contacts_cond_dir = exposure_cond_dir.parent / "contacts" has_contacts = False for rep in cond.replicates: result_path = self._find_contact_result( sim_config, rep, cond_config_path=Path(cond.config), condition_output_dir=contacts_cond_dir, ) if result_path and result_path.exists(): has_contacts = True break if has_contacts: valid_conditions.append(cond) else: excluded_conditions.append(cond) logger.info(f" Excluding '{cond.label}': no cached contact results found") except Exception as e: logger.warning(f" Error checking condition '{cond.label}': {e}") valid_conditions.append(cond) return valid_conditions, excluded_conditions # ======================================================================== # File location helpers # ======================================================================== def _find_contact_result( self, sim_config: Any, replicate: int, cond_config_path: Path | None = None, condition_output_dir: Path | None = None, ) -> Path | None: """Find the cached contact result JSON for a replicate. Only checks the condition-specific output directory (comparison mode). The ``projects_directory`` and ``cond_config_path`` fallbacks used by other comparators are intentionally **not** consulted here because multiple conditions can share the same ``projects_directory``. Loading contacts from a shared parent risks cross-contamination — e.g. a no-polymer control picking up contacts that belong to a polymer simulation. Parameters ---------- sim_config : SimulationConfig Simulation configuration (unused, kept for interface compat). replicate : int Replicate number. cond_config_path : Path, optional Condition config path (unused, kept for interface compat). condition_output_dir : Path, optional Condition-specific contacts output directory (from comparison mode). Returns ------- Path or None Path to the JSON file, or None if not found. """ result_filename = f"contacts_rep{replicate}.json" # Only use condition-specific path (comparison mode) if condition_output_dir is not None: cond_path = condition_output_dir / result_filename if cond_path.exists(): return cond_path return None def _get_analysis_dir( self, sim_config: Any, cond_config_path: Path | None = None, ) -> Path: """Get base analysis directory with fallback to condition config parent. Parameters ---------- sim_config : SimulationConfig Simulation configuration. cond_config_path : Path, optional Condition config path for fallback. Returns ------- Path Analysis directory path. """ from polyzymd.compare.comparators._utils import find_analysis_dir return find_analysis_dir( sim_config, analysis_subdir="analysis", cond_config_path=cond_config_path, )