Source code for polyzymd.compare.comparators.binding_free_energy

"""Binding free energy comparator via Boltzmann inversion of binding preference.

This module implements BindingFreeEnergyComparator, which converts the existing
binding preference (enrichment) data into a selectivity free energy ΔG_sel in
real units (kT, kcal/mol, or kJ/mol).

Physics
-------
In the NPT ensemble the correct thermodynamic potential is the Gibbs free energy G.
The polymer distributes its contacts across protein surface groups. Both the
observed contact distribution (contact_share) and the null reference distribution
(expected_share, proportional to each group's solvent-exposed surface area) are
proper probability distributions that sum to 1 over the partition. Boltzmann
inversion of their ratio gives the selectivity free energy:

    ΔG_sel(j) = -k_B·T · ln(contact_share_j / expected_share_j)

Because both distributions are normalized over the same partition, there is no
arbitrary constant — ΔG_sel(j) is fully determined by the data.

Because contact_share / expected_share = enrichment + 1, per replicate:

    ΔG_sel,rep = -k_B·T · ln(enrichment_rep + 1)

This is the exact Boltzmann-inverted version of the dimensionless enrichment score.

Sign convention:
    ΔG_sel < 0  →  preferential contact (observed > surface-availability reference)
    ΔG_sel > 0  →  contact avoidance (observed < surface-availability reference)
    ΔG_sel = 0  →  contacts match the surface-availability reference exactly

Differences between groups (ΔG_sel(i) - ΔG_sel(j)) give the relative selectivity.
Differences between conditions (ΔG_sel,B(j) - ΔG_sel,A(j)) give a true ΔΔG.

Temperature handling
--------------------
ΔG_sel computed at temperature T is not comparable to ΔG_sel at T' (in physical
units). Pairwise statistics are suppressed between conditions at different
simulation temperatures.

Design
------
- Consumes cached binding preference files produced by ContactsComparator /
  binding_preference.py. When cached data is missing, computes binding preference
  on-demand from per-replicate contacts_rep{N}.json files (following the same
  load-or-compute contract as every other comparator).
- Inherits BaseComparator but overrides compare() (like ContactsComparator) because
  the result type (BindingFreeEnergyResult) does not conform to BaseComparisonResult.
"""

from __future__ import annotations

import logging
import math
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar

import numpy as np

from polyzymd import __version__
from polyzymd.analysis.core.metric_type import MetricType
from polyzymd.compare.core.base import BaseComparator
from polyzymd.compare.core.registry import ComparatorRegistry
from polyzymd.compare.results.binding_free_energy import (
    BindingFreeEnergyResult,
    FreeEnergyConditionSummary,
    FreeEnergyEntry,
    FreeEnergyPairwiseEntry,
)
from polyzymd.compare.settings import (
    BindingFreeEnergyAnalysisSettings,
    BindingFreeEnergyComparisonSettings,
)
from polyzymd.compare.statistics import independent_ttest

if TYPE_CHECKING:
    from polyzymd.analysis.contacts.binding_preference import (
        AggregatedBindingPreferenceResult,
        AggregatedPartitionBindingEntry,
        BindingPreferenceResult,
    )
    from polyzymd.compare.config import ComparisonConfig, ConditionConfig

logger = logging.getLogger("polyzymd.compare")

# Type alias
BFEConditionData = dict[str, Any]



[docs]
@ComparatorRegistry.register("binding_free_energy")
class BindingFreeEnergyComparator(
    BaseComparator[
        BindingFreeEnergyAnalysisSettings,
        BFEConditionData,
        FreeEnergyConditionSummary,
        BindingFreeEnergyResult,
    ]
):
    """Compare selectivity free energy (ΔG_sel) across simulation conditions.

    Consumes cached binding preference results (produced by the contacts analysis
    layer) and converts them to selectivity free energies via Boltzmann
    inversion:

        ΔG_sel = -k_B·T · ln(contact_share / expected_share)

    Statistical comparisons are only computed between conditions that share the
    same simulation temperature. Cross-temperature pairs are flagged and their
    statistics suppressed.

    Parameters
    ----------
    config : ComparisonConfig
        Comparison configuration.
    analysis_settings : BindingFreeEnergyAnalysisSettings
        Units, surface-exposure threshold, custom partitions.
    comparison_settings : BindingFreeEnergyComparisonSettings, optional
        FDR alpha. Defaults to BindingFreeEnergyComparisonSettings().
    equilibration : str, optional
        Equilibration time override.

    Examples
    --------
    >>> config = ComparisonConfig.from_yaml("comparison.yaml")
    >>> settings = BindingFreeEnergyAnalysisSettings(units="kcal/mol")
    >>> comparator = BindingFreeEnergyComparator(config, settings)
    >>> result = comparator.compare()
    >>> print(result.units)
    kcal/mol

    Notes
    -----
    This is a MEAN_BASED metric (contact fractions are averages over frames,
    not fluctuation-based quantities).
    """

    comparison_type: ClassVar[str] = "binding_free_energy"


[docs]
    def __init__(
        self,
        config: "ComparisonConfig",
        analysis_settings: BindingFreeEnergyAnalysisSettings,
        comparison_settings: BindingFreeEnergyComparisonSettings | None = None,
        equilibration: str | None = None,
    ):
        super().__init__(config, analysis_settings, equilibration)
        self.comparison_settings = comparison_settings or BindingFreeEnergyComparisonSettings()



[docs]
    @classmethod
    def comparison_type_name(cls) -> str:
        """Return the comparison type identifier."""
        return "binding_free_energy"


    @property
    def metric_type(self) -> MetricType:
        """Contact share is a mean-based metric.

        Returns
        -------
        MetricType
            MetricType.MEAN_BASED
        """
        return MetricType.MEAN_BASED

    # =========================================================================
    # Main entry point — override compare() for custom result type
    # =========================================================================


[docs]
    def compare(self, recompute: bool = False) -> BindingFreeEnergyResult:  # type: ignore[override]
        """Run binding free energy comparison across all conditions.

        Parameters
        ----------
        recompute : bool, optional
            Ignored (binding free energy is always recomputed from cached
            binding preference data; it is fast and stateless).

        Returns
        -------
        BindingFreeEnergyResult
            Complete ΔG_sel comparison result.
        """
        logger.info(f"Starting binding free energy comparison: {self.config.name}")
        logger.info(f"Units: {self.analysis_settings.units}")
        logger.info(f"Conditions: {len(self.config.conditions)}")

        # Step 1: Load data for each condition
        condition_summaries: list[FreeEnergyConditionSummary] = []
        for cond in self.config.conditions:
            data = self._load_or_compute(cond, recompute)
            summary = self._build_condition_summary(cond, data)
            condition_summaries.append(summary)

        if not condition_summaries:
            raise ValueError("No binding preference data found for any condition.")

        # Step 2: Collect metadata
        all_polymer_types: list[str] = sorted(
            {e.polymer_type for s in condition_summaries for e in s.entries}
        )
        all_protein_groups: list[str] = sorted(
            {e.protein_group for s in condition_summaries for e in s.entries}
        )

        # Step 3: Temperature groups
        temp_groups: dict[float, list[str]] = {}
        for s in condition_summaries:
            temp_groups.setdefault(s.temperature_K, []).append(s.label)
        mixed_temperatures = len(temp_groups) > 1

        if mixed_temperatures:
            logger.warning(
                f"Conditions span {len(temp_groups)} temperatures: "
                + ", ".join(f"{t}K ({len(labels)} conds)" for t, labels in temp_groups.items())
                + ". Cross-temperature pairwise statistics will be suppressed."
            )

        # Step 4: Pairwise comparisons
        pairwise = self._compute_pairwise(condition_summaries)

        # Stringify temp_groups keys for JSON compatibility
        temp_groups_str = {str(k): v for k, v in temp_groups.items()}

        # Step 5: Build result
        surface_threshold = self.analysis_settings.surface_exposure_threshold
        if self.analysis_settings.units == "kT":
            formula = "ΔG_sel = -ln(contact_share / expected_share)  [units: k_bT]"
        else:
            formula = "ΔG_sel = -k_B·T · ln(contact_share / expected_share)"

        return BindingFreeEnergyResult(
            name=self.config.name,
            units=self.analysis_settings.units,
            formula=formula,
            mixed_temperatures=mixed_temperatures,
            temperature_groups=temp_groups_str,
            conditions=condition_summaries,
            pairwise_comparisons=pairwise,
            polymer_types=all_polymer_types,
            protein_groups=all_protein_groups,
            surface_exposure_threshold=surface_threshold,
            equilibration_time=self.equilibration,
            created_at=datetime.now(),
            polyzymd_version=__version__,
        )


    # =========================================================================
    # Abstract method implementations
    # =========================================================================

    def _load_or_compute(
        self,
        cond: "ConditionConfig",
        recompute: bool,
    ) -> BFEConditionData:
        """Load cached binding preference data, computing it if missing.

        Follows the same load-or-compute contract as every other comparator:

        1. Try to load cached binding preference results from disk.
        2. If no cached data exists and ``compute_binding_preference`` is True
           (the default), compute binding preference from per-replicate
           ``contacts_rep{N}.json`` files via the shared helper.
        3. If compute is disabled, warn and return empty data.

        The compute step requires per-replicate contacts files to already
        exist. It does **not** trigger contacts computation.

        Parameters
        ----------
        cond : ConditionConfig
            Condition to load or compute.
        recompute : bool
            If True, skip cache and always recompute.

        Returns
        -------
        dict
            Raw binding preference data and temperature.
        """
        from polyzymd.compare.comparators._binding_preference_helpers import (
            compute_condition_binding_preference,
            resolve_enzyme_pdb,
        )
        from polyzymd.config.schema import SimulationConfig

        logger.info(f"Loading binding preference for: {cond.label}")

        sim_config = SimulationConfig.from_yaml(cond.config)

        # Get temperature
        temperature_K = float(sim_config.thermodynamics.temperature)

        # Resolve condition-specific output directory (None in standalone mode)
        condition_output_dir = self._resolve_condition_output_dir(cond.label, "contacts")

        # Find analysis directory (contacts layer) — check condition dir first
        analysis_dir = self._find_contacts_analysis_dir(
            sim_config, cond, condition_output_dir=condition_output_dir
        )

        # Step 1: Try to load cached binding preference (unless recompute)
        bp_result = None
        if not recompute:
            bp_result = self._try_load_cached_binding_preference(cond, analysis_dir)

        if bp_result is not None:
            logger.info(f"  Loaded binding preference for {cond.label} at {temperature_K} K")
            return {
                "bp_result": bp_result,
                "temperature_K": temperature_K,
                "cond_label": cond.label,
                "config_path": str(cond.config),
            }

        # Step 2: Compute if enabled
        compute_enabled = getattr(self.analysis_settings, "compute_binding_preference", True)
        if compute_enabled:
            logger.info(f"  No cached data for {cond.label}, computing binding preference...")

            # Resolve settings, falling back to contacts settings if needed
            settings = self._resolve_compute_settings()

            enzyme_pdb = resolve_enzyme_pdb(
                enzyme_pdb_setting=settings["enzyme_pdb_for_sasa"],
                source_path=self.config.source_path,
                sim_config=sim_config,
            )

            if enzyme_pdb is None or not enzyme_pdb.exists():
                logger.warning(
                    f"Cannot compute binding preference for '{cond.label}': "
                    f"enzyme PDB not found. Set enzyme_pdb_for_sasa in "
                    f"binding_free_energy or contacts analysis settings."
                )
            else:
                bp_result = compute_condition_binding_preference(
                    cond=cond,
                    sim_config=sim_config,
                    analysis_dir=analysis_dir,
                    enzyme_pdb=enzyme_pdb,
                    threshold=settings["surface_exposure_threshold"],
                    include_default_aa_groups=settings["include_default_aa_groups"],
                    custom_protein_groups=settings["protein_groups"],
                    protein_partitions=settings["protein_partitions"],
                    polymer_type_selections=settings["polymer_type_selections"],
                )

            if bp_result is not None:
                logger.info(f"  Computed binding preference for {cond.label} at {temperature_K} K")
                return {
                    "bp_result": bp_result,
                    "temperature_K": temperature_K,
                    "cond_label": cond.label,
                    "config_path": str(cond.config),
                }
            else:
                logger.warning(
                    f"Failed to compute binding preference for '{cond.label}'. "
                    f"Ensure contacts_rep{{N}}.json files exist in {analysis_dir}."
                )
        else:
            logger.warning(
                f"No binding preference data found for '{cond.label}'. "
                f"Set compute_binding_preference=true or run contacts analysis "
                f"with binding preference enabled first."
            )

        return {
            "bp_result": None,
            "temperature_K": temperature_K,
            "cond_label": cond.label,
            "config_path": str(cond.config),
        }

    def _build_condition_summary(
        self,
        cond: "ConditionConfig",
        data: BFEConditionData,
    ) -> FreeEnergyConditionSummary:
        """Build ΔG_sel entries for all (polymer_type, protein_group) pairs.

        Parameters
        ----------
        cond : ConditionConfig
            Condition configuration.
        data : dict
            Raw data from _load_or_compute.

        Returns
        -------
        FreeEnergyConditionSummary
            All ΔG_sel entries for this condition.
        """
        temperature_K = data["temperature_K"]
        bp_result = data["bp_result"]
        units = self.analysis_settings.units
        if units == "kT":
            kT = 1.0  # ΔG_sel = -ln(ratio), dimensionless in units of k_bT
        else:
            kT = self.analysis_settings.k_b() * temperature_K

        if bp_result is None:
            return FreeEnergyConditionSummary(
                label=cond.label,
                config_path=str(cond.config),
                temperature_K=temperature_K,
                n_replicates=0,
                units=units,
                entries=[],
                polymer_types=[],
                protein_groups=[],
            )

        entries = self._compute_dg_entries(bp_result, kT, units, temperature_K)
        polymer_types = sorted({e.polymer_type for e in entries})
        protein_groups = sorted({e.protein_group for e in entries})
        n_replicates = max((e.n_replicates for e in entries), default=0)

        return FreeEnergyConditionSummary(
            label=cond.label,
            config_path=str(cond.config),
            temperature_K=temperature_K,
            n_replicates=n_replicates,
            units=units,
            entries=entries,
            polymer_types=polymer_types,
            protein_groups=protein_groups,
        )

    def _get_replicate_values(self, summary: FreeEnergyConditionSummary) -> list[float]:
        """Not used directly — pairwise logic uses per-entry replicate values."""
        vals = [dg for e in summary.entries for dg in e.delta_G_per_replicate if not math.isnan(dg)]
        return vals if vals else [0.0]

    def _get_mean_value(self, summary: FreeEnergyConditionSummary) -> float:
        """Return mean ΔG_sel across all valid entries."""
        return summary.primary_metric_value

    @property
    def _direction_labels(self) -> tuple[str, str, str]:
        """Negative ΔG_sel change = more favorable binding."""
        return ("more favorable", "unchanged", "less favorable")

    def _rank_summaries(
        self, summaries: list[FreeEnergyConditionSummary]
    ) -> list[FreeEnergyConditionSummary]:
        """Sort by mean ΔG_sel (most negative = most favorable first)."""
        return sorted(summaries, key=lambda s: s.primary_metric_value)

    # =========================================================================
    # ΔG_sel computation helpers
    # =========================================================================

    def _compute_dg_entries(
        self,
        bp_result: Any,
        kT: float,
        units: str,
        temperature_K: float,
    ) -> list[FreeEnergyEntry]:
        """Compute ΔG_sel entries from a binding preference result.

        Parameters
        ----------
        bp_result : AggregatedBindingPreferenceResult | BindingPreferenceResult
            Binding preference result for one condition.
        kT : float
            k_B * T in the selected units.
        units : str
            Energy units label.
        temperature_K : float
            Simulation temperature.

        Returns
        -------
        list[FreeEnergyEntry]
            One entry per (polymer_type, protein_group) pair.
        """
        from polyzymd.analysis.contacts.binding_preference import AggregatedBindingPreferenceResult

        entries: list[FreeEnergyEntry] = []

        if isinstance(bp_result, AggregatedBindingPreferenceResult):
            # Aggregated multi-replicate result
            entries = self._entries_from_aggregated(bp_result, kT, units, temperature_K)
        else:
            # Single-replicate BindingPreferenceResult
            entries = self._entries_from_single(bp_result, kT, units, temperature_K)

        return entries

    def _entries_from_aggregated(
        self,
        result: "AggregatedBindingPreferenceResult",
        kT: float,
        units: str,
        temperature_K: float,
    ) -> list[FreeEnergyEntry]:
        """Extract ΔG_sel entries from AggregatedBindingPreferenceResult."""
        entries: list[FreeEnergyEntry] = []

        bp = result.binding_preference

        # Iterate AA-class partition
        for polymer_type, partition_result in bp.aa_class_binding.items():
            for entry in partition_result.entries:
                fe = self._entry_from_agg_bp_entry(
                    entry, polymer_type, "aa_class", kT, units, temperature_K
                )
                if fe is not None:
                    entries.append(fe)

        # Iterate user-defined partitions if present
        for partition_name, partition_dict in bp.user_defined_partitions.items():
            for polymer_type, partition_result in partition_dict.items():
                for entry in partition_result.entries:
                    fe = self._entry_from_agg_bp_entry(
                        entry, polymer_type, partition_name, kT, units, temperature_K
                    )
                    if fe is not None:
                        entries.append(fe)

        return entries

    def _entry_from_agg_bp_entry(
        self,
        entry: "AggregatedPartitionBindingEntry",
        polymer_type: str,
        partition_name: str,
        kT: float,
        units: str,
        temperature_K: float,
    ) -> FreeEnergyEntry | None:
        """Convert one AggregatedPartitionBindingEntry to a FreeEnergyEntry.

        Returns None if data is insufficient (e.g., no replicates, zero shares).
        """
        cs = entry.mean_contact_share
        es = entry.expected_share
        sem_cs = entry.sem_contact_share if entry.sem_contact_share is not None else 0.0

        if cs <= 0.0 or es <= 0.0:
            return None

        enrichment_ratio = cs / es

        # ΔG_sel = -kT * ln(enrichment_ratio)
        delta_G = -kT * math.log(enrichment_ratio)

        # σ(ΔG_sel) = kT * sqrt[(σ_cs/cs)^2 + (σ_es/es)^2]
        # expected_share uncertainty: treat as zero (single PDB SASA computation)
        sigma_es = 0.0
        delta_G_unc: float | None = None
        if sem_cs > 0:
            delta_G_unc = kT * math.sqrt((sem_cs / cs) ** 2 + (sigma_es / max(es, 1e-12)) ** 2)

        # Per-replicate ΔG_sel from per_replicate_enrichments
        dg_per_rep: list[float] = []
        for enrich_rep in entry.per_replicate_enrichments:
            ratio_rep = enrich_rep + 1.0
            if ratio_rep > 0:
                dg_per_rep.append(-kT * math.log(ratio_rep))
            else:
                dg_per_rep.append(float("nan"))

        return FreeEnergyEntry(
            polymer_type=polymer_type,
            protein_group=entry.partition_element,
            partition_name=partition_name,
            contact_share=cs,
            expected_share=es,
            enrichment_ratio=enrichment_ratio,
            delta_G=delta_G,
            delta_G_uncertainty=delta_G_unc,
            delta_G_per_replicate=dg_per_rep,
            units=units,
            temperature_K=temperature_K,
            n_replicates=len(dg_per_rep),
            n_exposed_in_group=getattr(entry, "n_exposed_in_group", 0),
        )

    def _entries_from_single(
        self,
        result: "BindingPreferenceResult",
        kT: float,
        units: str,
        temperature_K: float,
    ) -> list[FreeEnergyEntry]:
        """Extract ΔG_sel entries from a single-replicate BindingPreferenceResult."""
        entries: list[FreeEnergyEntry] = []

        bp = result.binding_preference

        # Iterate AA-class partition
        for polymer_type, partition_result in bp.aa_class_binding.items():
            for entry in partition_result.entries:
                cs = entry.contact_share
                es = entry.expected_share

                if cs <= 0.0 or es <= 0.0:
                    continue

                enrichment_ratio = cs / es
                delta_G = -kT * math.log(enrichment_ratio)
                dg_per_rep = [delta_G]  # single replicate

                entries.append(
                    FreeEnergyEntry(
                        polymer_type=polymer_type,
                        protein_group=entry.partition_element,
                        partition_name="aa_class",
                        contact_share=cs,
                        expected_share=es,
                        enrichment_ratio=enrichment_ratio,
                        delta_G=delta_G,
                        delta_G_uncertainty=None,
                        delta_G_per_replicate=dg_per_rep,
                        units=units,
                        temperature_K=temperature_K,
                        n_replicates=1,
                        n_exposed_in_group=getattr(entry, "n_exposed_in_group", 0),
                    )
                )

        # Iterate user-defined partitions if present
        for partition_name, partition_dict in bp.user_defined_partitions.items():
            for polymer_type, partition_result in partition_dict.items():
                for entry in partition_result.entries:
                    cs = entry.contact_share
                    es = entry.expected_share

                    if cs <= 0.0 or es <= 0.0:
                        continue

                    enrichment_ratio = cs / es
                    delta_G = -kT * math.log(enrichment_ratio)

                    entries.append(
                        FreeEnergyEntry(
                            polymer_type=polymer_type,
                            protein_group=entry.partition_element,
                            partition_name=partition_name,
                            contact_share=cs,
                            expected_share=es,
                            enrichment_ratio=enrichment_ratio,
                            delta_G=delta_G,
                            delta_G_uncertainty=None,
                            delta_G_per_replicate=[delta_G],
                            units=units,
                            temperature_K=temperature_K,
                            n_replicates=1,
                            n_exposed_in_group=getattr(entry, "n_exposed_in_group", 0),
                        )
                    )

        return entries

    # =========================================================================
    # Pairwise comparison
    # =========================================================================

    def _compute_pairwise(
        self,
        summaries: list[FreeEnergyConditionSummary],
    ) -> list[FreeEnergyPairwiseEntry]:
        """Compute pairwise ΔΔG (= ΔG_sel,B − ΔG_sel,A) comparisons, respecting temperature grouping.

        Parameters
        ----------
        summaries : list[FreeEnergyConditionSummary]
            Condition summaries.

        Returns
        -------
        list[FreeEnergyPairwiseEntry]
            All pairwise entries (cross-temperature ones have stats suppressed).
        """
        control = self.config.control
        comparisons: list[FreeEnergyPairwiseEntry] = []

        label_to_summary = {s.label: s for s in summaries}
        labels = [s.label for s in summaries]

        # Determine whether the control has usable BFE data.
        # "No Polymer" conditions have no polymer contacts and therefore no entries.
        # When the control is missing or has no valid entries, fall back to all-pairs.
        control_has_data = (
            control is not None
            and control in label_to_summary
            and any(e.delta_G is not None for e in label_to_summary[control].entries)
        )

        if control_has_data:
            # Compare all conditions with data vs control
            summary_a = label_to_summary[control]  # type: ignore[index]
            for label_b in labels:
                if label_b == control:
                    continue
                summary_b = label_to_summary[label_b]
                # Only compare if summary_b also has data
                if not any(e.delta_G is not None for e in summary_b.entries):
                    continue
                comparisons.extend(self._compare_condition_pair(summary_a, summary_b))
        else:
            # Fall back to all-pairs among conditions that have valid ΔG_sel data
            if control is not None and control in label_to_summary:
                logger.info(
                    f"Control '{control}' has no ΔG_sel data (e.g. no polymer contacts). "
                    "Falling back to all-pairs comparison among conditions with data."
                )
            valid_labels = [
                lbl
                for lbl in labels
                if any(e.delta_G is not None for e in label_to_summary[lbl].entries)
            ]
            for i in range(len(valid_labels)):
                for j in range(i + 1, len(valid_labels)):
                    sa = label_to_summary[valid_labels[i]]
                    sb = label_to_summary[valid_labels[j]]
                    comparisons.extend(self._compare_condition_pair(sa, sb))

        return comparisons

    def _compare_condition_pair(
        self,
        summary_a: FreeEnergyConditionSummary,
        summary_b: FreeEnergyConditionSummary,
    ) -> list[FreeEnergyPairwiseEntry]:
        """Compare two conditions for all shared (polymer_type, protein_group) pairs.

        Parameters
        ----------
        summary_a : FreeEnergyConditionSummary
            First condition (typically control).
        summary_b : FreeEnergyConditionSummary
            Second condition (typically treatment).

        Returns
        -------
        list[FreeEnergyPairwiseEntry]
            One entry per shared (polymer, group) pair.
        """
        cross_temperature = not math.isclose(
            summary_a.temperature_K, summary_b.temperature_K, rel_tol=1e-3
        )

        # Find all polymer/group pairs present in both
        pairs_a = {(e.polymer_type, e.protein_group): e for e in summary_a.entries}
        pairs_b = {(e.polymer_type, e.protein_group): e for e in summary_b.entries}
        shared_pairs = sorted(set(pairs_a.keys()) & set(pairs_b.keys()))

        pairwise: list[FreeEnergyPairwiseEntry] = []

        for polymer_type, protein_group in shared_pairs:
            entry_a = pairs_a[(polymer_type, protein_group)]
            entry_b = pairs_b[(polymer_type, protein_group)]

            dg_a = entry_a.delta_G
            dg_b = entry_b.delta_G

            delta_delta_G: float | None = None
            if dg_a is not None and dg_b is not None:
                delta_delta_G = dg_b - dg_a

            # Stats: only between same-temperature conditions with enough replicates
            t_stat: float | None = None
            p_val: float | None = None

            if not cross_temperature:
                reps_a = [v for v in entry_a.delta_G_per_replicate if not math.isnan(v)]
                reps_b = [v for v in entry_b.delta_G_per_replicate if not math.isnan(v)]
                if len(reps_a) >= 2 and len(reps_b) >= 2:
                    try:
                        ttest = independent_ttest(reps_a, reps_b)
                        t_stat = ttest.t_statistic
                        p_val = ttest.p_value
                    except Exception as exc:
                        logger.debug(f"T-test failed for ({polymer_type}, {protein_group}): {exc}")

            pairwise.append(
                FreeEnergyPairwiseEntry(
                    polymer_type=polymer_type,
                    protein_group=protein_group,
                    condition_a=summary_a.label,
                    condition_b=summary_b.label,
                    temperature_a_K=summary_a.temperature_K,
                    temperature_b_K=summary_b.temperature_K,
                    cross_temperature=cross_temperature,
                    delta_G_a=dg_a,
                    delta_G_b=dg_b,
                    delta_delta_G=delta_delta_G,
                    t_statistic=t_stat,
                    p_value=p_val,
                )
            )

        return pairwise

    # =========================================================================
    # Settings resolution (BFE settings with fallback to contacts settings)
    # =========================================================================

    def _resolve_compute_settings(self) -> dict[str, Any]:
        """Resolve compute settings, falling back to contacts analysis settings.

        BFE settings may omit fields like ``enzyme_pdb_for_sasa`` that are
        typically configured in the contacts analysis section. This method
        returns a unified dict, preferring BFE settings and falling back to
        contacts settings from the same comparison.yaml.

        Returns
        -------
        dict
            Resolved settings for compute_condition_binding_preference().
        """
        bfe = self.analysis_settings

        # Try to get contacts settings for fallback
        contacts_settings = None
        if hasattr(self.config, "analysis_settings"):
            contacts_settings = self.config.analysis_settings.get("contacts")

        def _get(attr: str, default: Any = None) -> Any:
            """Get from BFE settings first, then contacts settings, then default."""
            val = getattr(bfe, attr, None)
            if val is not None:
                return val
            if contacts_settings is not None:
                val = getattr(contacts_settings, attr, None)
                if val is not None:
                    return val
            return default

        return {
            "enzyme_pdb_for_sasa": _get("enzyme_pdb_for_sasa"),
            "surface_exposure_threshold": _get("surface_exposure_threshold", 0.2),
            "include_default_aa_groups": _get("include_default_aa_groups", True),
            "protein_groups": _get("protein_groups"),
            "protein_partitions": _get("protein_partitions"),
            "polymer_type_selections": _get("polymer_type_selections"),
        }

    # =========================================================================
    # Cache loading helpers (mirrors ContactsComparator pattern)
    # =========================================================================

    def _find_contacts_analysis_dir(
        self,
        sim_config: Any,
        cond: "ConditionConfig",
        condition_output_dir: Path | None = None,
    ) -> Path:
        """Find the contacts analysis directory for a condition.

        Checks locations in order:
        1. condition_output_dir (condition-specific, comparison mode)
        2. sim_config.output.projects_directory / analysis / contacts /
        3. cond.config parent directory / analysis / contacts /

        Parameters
        ----------
        sim_config : SimulationConfig
            Simulation configuration.
        cond : ConditionConfig
            Condition configuration.
        condition_output_dir : Path, optional
            Condition-specific contacts output directory (from comparison
            mode).  Checked first before falling back to
            ``projects_directory``.

        Returns
        -------
        Path
            Analysis directory path.
        """
        # Check condition-specific path first (comparison mode)
        if condition_output_dir is not None:
            if condition_output_dir.exists():
                return condition_output_dir
            # In comparison mode, do NOT fall back to the shared
            # projects_directory — all conditions share the same path.
            # Return the condition-specific path (even if not yet created)
            # so that downstream code computes and saves there.
            return condition_output_dir

        from polyzymd.compare.comparators._utils import find_analysis_dir

        return find_analysis_dir(
            sim_config,
            analysis_subdir="analysis/contacts",
            cond_config_path=Path(cond.config),
        )

    def _try_load_cached_binding_preference(
        self,
        cond: "ConditionConfig",
        analysis_dir: Path,
    ) -> "AggregatedBindingPreferenceResult | BindingPreferenceResult | None":
        """Load cached binding preference results for a condition.

        Searches for files in order of preference:
        1. binding_preference_aggregated.json
        2. binding_preference_aggregated_reps*.json (glob)
        3. binding_preference.json (single replicate)
        4. Per-replicate binding_preference_rep{N}.json → aggregate

        Parameters
        ----------
        cond : ConditionConfig
            Condition configuration.
        analysis_dir : Path
            Contacts analysis directory.

        Returns
        -------
        AggregatedBindingPreferenceResult | BindingPreferenceResult | None
            Loaded result, or None if not found.
        """
        from polyzymd.compare.comparators._binding_preference_helpers import (
            try_load_cached_binding_preference,
        )

        return try_load_cached_binding_preference(cond, analysis_dir)