"""Binding free energy comparator via Boltzmann inversion of binding preference.
This module implements BindingFreeEnergyComparator, which converts the existing
binding preference (enrichment) data into a selectivity free energy ΔG_sel in
real units (kT, kcal/mol, or kJ/mol).
Physics
-------
In the NPT ensemble the correct thermodynamic potential is the Gibbs free energy G.
The polymer distributes its contacts across protein surface groups. Both the
observed contact distribution (contact_share) and the null reference distribution
(expected_share, proportional to each group's solvent-exposed surface area) are
proper probability distributions that sum to 1 over the partition. Boltzmann
inversion of their ratio gives the selectivity free energy:
ΔG_sel(j) = -k_B·T · ln(contact_share_j / expected_share_j)
Because both distributions are normalized over the same partition, there is no
arbitrary constant — ΔG_sel(j) is fully determined by the data.
Because contact_share / expected_share = enrichment + 1, per replicate:
ΔG_sel,rep = -k_B·T · ln(enrichment_rep + 1)
This is the exact Boltzmann-inverted version of the dimensionless enrichment score.
Sign convention:
ΔG_sel < 0 → preferential contact (observed > surface-availability reference)
ΔG_sel > 0 → contact avoidance (observed < surface-availability reference)
ΔG_sel = 0 → contacts match the surface-availability reference exactly
Differences between groups (ΔG_sel(i) - ΔG_sel(j)) give the relative selectivity.
Differences between conditions (ΔG_sel,B(j) - ΔG_sel,A(j)) give a true ΔΔG.
Temperature handling
--------------------
ΔG_sel computed at temperature T is not comparable to ΔG_sel at T' (in physical
units). Pairwise statistics are suppressed between conditions at different
simulation temperatures.
Design
------
- Consumes cached binding preference files produced by ContactsComparator /
binding_preference.py. When cached data is missing, computes binding preference
on-demand from per-replicate contacts_rep{N}.json files (following the same
load-or-compute contract as every other comparator).
- Inherits BaseComparator but overrides compare() (like ContactsComparator) because
the result type (BindingFreeEnergyResult) does not conform to BaseComparisonResult.
"""
from __future__ import annotations
import logging
import math
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar
import numpy as np
from polyzymd import __version__
from polyzymd.analysis.core.metric_type import MetricType
from polyzymd.compare.core.base import BaseComparator
from polyzymd.compare.core.registry import ComparatorRegistry
from polyzymd.compare.results.binding_free_energy import (
BindingFreeEnergyResult,
FreeEnergyConditionSummary,
FreeEnergyEntry,
FreeEnergyPairwiseEntry,
)
from polyzymd.compare.settings import (
BindingFreeEnergyAnalysisSettings,
BindingFreeEnergyComparisonSettings,
)
from polyzymd.compare.statistics import independent_ttest
if TYPE_CHECKING:
from polyzymd.analysis.contacts.binding_preference import (
AggregatedBindingPreferenceResult,
AggregatedPartitionBindingEntry,
BindingPreferenceResult,
)
from polyzymd.compare.config import ComparisonConfig, ConditionConfig
logger = logging.getLogger("polyzymd.compare")
# Type alias
BFEConditionData = dict[str, Any]
[docs]
@ComparatorRegistry.register("binding_free_energy")
class BindingFreeEnergyComparator(
BaseComparator[
BindingFreeEnergyAnalysisSettings,
BFEConditionData,
FreeEnergyConditionSummary,
BindingFreeEnergyResult,
]
):
"""Compare selectivity free energy (ΔG_sel) across simulation conditions.
Consumes cached binding preference results (produced by the contacts analysis
layer) and converts them to selectivity free energies via Boltzmann
inversion:
ΔG_sel = -k_B·T · ln(contact_share / expected_share)
Statistical comparisons are only computed between conditions that share the
same simulation temperature. Cross-temperature pairs are flagged and their
statistics suppressed.
Parameters
----------
config : ComparisonConfig
Comparison configuration.
analysis_settings : BindingFreeEnergyAnalysisSettings
Units, surface-exposure threshold, custom partitions.
comparison_settings : BindingFreeEnergyComparisonSettings, optional
FDR alpha. Defaults to BindingFreeEnergyComparisonSettings().
equilibration : str, optional
Equilibration time override.
Examples
--------
>>> config = ComparisonConfig.from_yaml("comparison.yaml")
>>> settings = BindingFreeEnergyAnalysisSettings(units="kcal/mol")
>>> comparator = BindingFreeEnergyComparator(config, settings)
>>> result = comparator.compare()
>>> print(result.units)
kcal/mol
Notes
-----
This is a MEAN_BASED metric (contact fractions are averages over frames,
not fluctuation-based quantities).
"""
comparison_type: ClassVar[str] = "binding_free_energy"
[docs]
def __init__(
self,
config: "ComparisonConfig",
analysis_settings: BindingFreeEnergyAnalysisSettings,
comparison_settings: BindingFreeEnergyComparisonSettings | None = None,
equilibration: str | None = None,
):
super().__init__(config, analysis_settings, equilibration)
self.comparison_settings = comparison_settings or BindingFreeEnergyComparisonSettings()
[docs]
@classmethod
def comparison_type_name(cls) -> str:
"""Return the comparison type identifier."""
return "binding_free_energy"
@property
def metric_type(self) -> MetricType:
"""Contact share is a mean-based metric.
Returns
-------
MetricType
MetricType.MEAN_BASED
"""
return MetricType.MEAN_BASED
# =========================================================================
# Main entry point — override compare() for custom result type
# =========================================================================
[docs]
def compare(self, recompute: bool = False) -> BindingFreeEnergyResult: # type: ignore[override]
"""Run binding free energy comparison across all conditions.
Parameters
----------
recompute : bool, optional
Ignored (binding free energy is always recomputed from cached
binding preference data; it is fast and stateless).
Returns
-------
BindingFreeEnergyResult
Complete ΔG_sel comparison result.
"""
logger.info(f"Starting binding free energy comparison: {self.config.name}")
logger.info(f"Units: {self.analysis_settings.units}")
logger.info(f"Conditions: {len(self.config.conditions)}")
# Step 1: Load data for each condition
condition_summaries: list[FreeEnergyConditionSummary] = []
for cond in self.config.conditions:
data = self._load_or_compute(cond, recompute)
summary = self._build_condition_summary(cond, data)
condition_summaries.append(summary)
if not condition_summaries:
raise ValueError("No binding preference data found for any condition.")
# Step 2: Collect metadata
all_polymer_types: list[str] = sorted(
{e.polymer_type for s in condition_summaries for e in s.entries}
)
all_protein_groups: list[str] = sorted(
{e.protein_group for s in condition_summaries for e in s.entries}
)
# Step 3: Temperature groups
temp_groups: dict[float, list[str]] = {}
for s in condition_summaries:
temp_groups.setdefault(s.temperature_K, []).append(s.label)
mixed_temperatures = len(temp_groups) > 1
if mixed_temperatures:
logger.warning(
f"Conditions span {len(temp_groups)} temperatures: "
+ ", ".join(f"{t}K ({len(labels)} conds)" for t, labels in temp_groups.items())
+ ". Cross-temperature pairwise statistics will be suppressed."
)
# Step 4: Pairwise comparisons
pairwise = self._compute_pairwise(condition_summaries)
# Stringify temp_groups keys for JSON compatibility
temp_groups_str = {str(k): v for k, v in temp_groups.items()}
# Step 5: Build result
surface_threshold = self.analysis_settings.surface_exposure_threshold
if self.analysis_settings.units == "kT":
formula = "ΔG_sel = -ln(contact_share / expected_share) [units: k_bT]"
else:
formula = "ΔG_sel = -k_B·T · ln(contact_share / expected_share)"
return BindingFreeEnergyResult(
name=self.config.name,
units=self.analysis_settings.units,
formula=formula,
mixed_temperatures=mixed_temperatures,
temperature_groups=temp_groups_str,
conditions=condition_summaries,
pairwise_comparisons=pairwise,
polymer_types=all_polymer_types,
protein_groups=all_protein_groups,
surface_exposure_threshold=surface_threshold,
equilibration_time=self.equilibration,
created_at=datetime.now(),
polyzymd_version=__version__,
)
# =========================================================================
# Abstract method implementations
# =========================================================================
def _load_or_compute(
self,
cond: "ConditionConfig",
recompute: bool,
) -> BFEConditionData:
"""Load cached binding preference data, computing it if missing.
Follows the same load-or-compute contract as every other comparator:
1. Try to load cached binding preference results from disk.
2. If no cached data exists and ``compute_binding_preference`` is True
(the default), compute binding preference from per-replicate
``contacts_rep{N}.json`` files via the shared helper.
3. If compute is disabled, warn and return empty data.
The compute step requires per-replicate contacts files to already
exist. It does **not** trigger contacts computation.
Parameters
----------
cond : ConditionConfig
Condition to load or compute.
recompute : bool
If True, skip cache and always recompute.
Returns
-------
dict
Raw binding preference data and temperature.
"""
from polyzymd.compare.comparators._binding_preference_helpers import (
compute_condition_binding_preference,
resolve_enzyme_pdb,
)
from polyzymd.config.schema import SimulationConfig
logger.info(f"Loading binding preference for: {cond.label}")
sim_config = SimulationConfig.from_yaml(cond.config)
# Get temperature
temperature_K = float(sim_config.thermodynamics.temperature)
# Resolve condition-specific output directory (None in standalone mode)
condition_output_dir = self._resolve_condition_output_dir(cond.label, "contacts")
# Find analysis directory (contacts layer) — check condition dir first
analysis_dir = self._find_contacts_analysis_dir(
sim_config, cond, condition_output_dir=condition_output_dir
)
# Step 1: Try to load cached binding preference (unless recompute)
bp_result = None
if not recompute:
bp_result = self._try_load_cached_binding_preference(cond, analysis_dir)
if bp_result is not None:
logger.info(f" Loaded binding preference for {cond.label} at {temperature_K} K")
return {
"bp_result": bp_result,
"temperature_K": temperature_K,
"cond_label": cond.label,
"config_path": str(cond.config),
}
# Step 2: Compute if enabled
compute_enabled = getattr(self.analysis_settings, "compute_binding_preference", True)
if compute_enabled:
logger.info(f" No cached data for {cond.label}, computing binding preference...")
# Resolve settings, falling back to contacts settings if needed
settings = self._resolve_compute_settings()
enzyme_pdb = resolve_enzyme_pdb(
enzyme_pdb_setting=settings["enzyme_pdb_for_sasa"],
source_path=self.config.source_path,
sim_config=sim_config,
)
if enzyme_pdb is None or not enzyme_pdb.exists():
logger.warning(
f"Cannot compute binding preference for '{cond.label}': "
f"enzyme PDB not found. Set enzyme_pdb_for_sasa in "
f"binding_free_energy or contacts analysis settings."
)
else:
bp_result = compute_condition_binding_preference(
cond=cond,
sim_config=sim_config,
analysis_dir=analysis_dir,
enzyme_pdb=enzyme_pdb,
threshold=settings["surface_exposure_threshold"],
include_default_aa_groups=settings["include_default_aa_groups"],
custom_protein_groups=settings["protein_groups"],
protein_partitions=settings["protein_partitions"],
polymer_type_selections=settings["polymer_type_selections"],
)
if bp_result is not None:
logger.info(f" Computed binding preference for {cond.label} at {temperature_K} K")
return {
"bp_result": bp_result,
"temperature_K": temperature_K,
"cond_label": cond.label,
"config_path": str(cond.config),
}
else:
logger.warning(
f"Failed to compute binding preference for '{cond.label}'. "
f"Ensure contacts_rep{{N}}.json files exist in {analysis_dir}."
)
else:
logger.warning(
f"No binding preference data found for '{cond.label}'. "
f"Set compute_binding_preference=true or run contacts analysis "
f"with binding preference enabled first."
)
return {
"bp_result": None,
"temperature_K": temperature_K,
"cond_label": cond.label,
"config_path": str(cond.config),
}
def _build_condition_summary(
self,
cond: "ConditionConfig",
data: BFEConditionData,
) -> FreeEnergyConditionSummary:
"""Build ΔG_sel entries for all (polymer_type, protein_group) pairs.
Parameters
----------
cond : ConditionConfig
Condition configuration.
data : dict
Raw data from _load_or_compute.
Returns
-------
FreeEnergyConditionSummary
All ΔG_sel entries for this condition.
"""
temperature_K = data["temperature_K"]
bp_result = data["bp_result"]
units = self.analysis_settings.units
if units == "kT":
kT = 1.0 # ΔG_sel = -ln(ratio), dimensionless in units of k_bT
else:
kT = self.analysis_settings.k_b() * temperature_K
if bp_result is None:
return FreeEnergyConditionSummary(
label=cond.label,
config_path=str(cond.config),
temperature_K=temperature_K,
n_replicates=0,
units=units,
entries=[],
polymer_types=[],
protein_groups=[],
)
entries = self._compute_dg_entries(bp_result, kT, units, temperature_K)
polymer_types = sorted({e.polymer_type for e in entries})
protein_groups = sorted({e.protein_group for e in entries})
n_replicates = max((e.n_replicates for e in entries), default=0)
return FreeEnergyConditionSummary(
label=cond.label,
config_path=str(cond.config),
temperature_K=temperature_K,
n_replicates=n_replicates,
units=units,
entries=entries,
polymer_types=polymer_types,
protein_groups=protein_groups,
)
def _get_replicate_values(self, summary: FreeEnergyConditionSummary) -> list[float]:
"""Not used directly — pairwise logic uses per-entry replicate values."""
vals = [dg for e in summary.entries for dg in e.delta_G_per_replicate if not math.isnan(dg)]
return vals if vals else [0.0]
def _get_mean_value(self, summary: FreeEnergyConditionSummary) -> float:
"""Return mean ΔG_sel across all valid entries."""
return summary.primary_metric_value
@property
def _direction_labels(self) -> tuple[str, str, str]:
"""Negative ΔG_sel change = more favorable binding."""
return ("more favorable", "unchanged", "less favorable")
def _rank_summaries(
self, summaries: list[FreeEnergyConditionSummary]
) -> list[FreeEnergyConditionSummary]:
"""Sort by mean ΔG_sel (most negative = most favorable first)."""
return sorted(summaries, key=lambda s: s.primary_metric_value)
# =========================================================================
# ΔG_sel computation helpers
# =========================================================================
def _compute_dg_entries(
self,
bp_result: Any,
kT: float,
units: str,
temperature_K: float,
) -> list[FreeEnergyEntry]:
"""Compute ΔG_sel entries from a binding preference result.
Parameters
----------
bp_result : AggregatedBindingPreferenceResult | BindingPreferenceResult
Binding preference result for one condition.
kT : float
k_B * T in the selected units.
units : str
Energy units label.
temperature_K : float
Simulation temperature.
Returns
-------
list[FreeEnergyEntry]
One entry per (polymer_type, protein_group) pair.
"""
from polyzymd.analysis.contacts.binding_preference import AggregatedBindingPreferenceResult
entries: list[FreeEnergyEntry] = []
if isinstance(bp_result, AggregatedBindingPreferenceResult):
# Aggregated multi-replicate result
entries = self._entries_from_aggregated(bp_result, kT, units, temperature_K)
else:
# Single-replicate BindingPreferenceResult
entries = self._entries_from_single(bp_result, kT, units, temperature_K)
return entries
def _entries_from_aggregated(
self,
result: "AggregatedBindingPreferenceResult",
kT: float,
units: str,
temperature_K: float,
) -> list[FreeEnergyEntry]:
"""Extract ΔG_sel entries from AggregatedBindingPreferenceResult."""
entries: list[FreeEnergyEntry] = []
bp = result.binding_preference
# Iterate AA-class partition
for polymer_type, partition_result in bp.aa_class_binding.items():
for entry in partition_result.entries:
fe = self._entry_from_agg_bp_entry(
entry, polymer_type, "aa_class", kT, units, temperature_K
)
if fe is not None:
entries.append(fe)
# Iterate user-defined partitions if present
for partition_name, partition_dict in bp.user_defined_partitions.items():
for polymer_type, partition_result in partition_dict.items():
for entry in partition_result.entries:
fe = self._entry_from_agg_bp_entry(
entry, polymer_type, partition_name, kT, units, temperature_K
)
if fe is not None:
entries.append(fe)
return entries
def _entry_from_agg_bp_entry(
self,
entry: "AggregatedPartitionBindingEntry",
polymer_type: str,
partition_name: str,
kT: float,
units: str,
temperature_K: float,
) -> FreeEnergyEntry | None:
"""Convert one AggregatedPartitionBindingEntry to a FreeEnergyEntry.
Returns None if data is insufficient (e.g., no replicates, zero shares).
"""
cs = entry.mean_contact_share
es = entry.expected_share
sem_cs = entry.sem_contact_share if entry.sem_contact_share is not None else 0.0
if cs <= 0.0 or es <= 0.0:
return None
enrichment_ratio = cs / es
# ΔG_sel = -kT * ln(enrichment_ratio)
delta_G = -kT * math.log(enrichment_ratio)
# σ(ΔG_sel) = kT * sqrt[(σ_cs/cs)^2 + (σ_es/es)^2]
# expected_share uncertainty: treat as zero (single PDB SASA computation)
sigma_es = 0.0
delta_G_unc: float | None = None
if sem_cs > 0:
delta_G_unc = kT * math.sqrt((sem_cs / cs) ** 2 + (sigma_es / max(es, 1e-12)) ** 2)
# Per-replicate ΔG_sel from per_replicate_enrichments
dg_per_rep: list[float] = []
for enrich_rep in entry.per_replicate_enrichments:
ratio_rep = enrich_rep + 1.0
if ratio_rep > 0:
dg_per_rep.append(-kT * math.log(ratio_rep))
else:
dg_per_rep.append(float("nan"))
return FreeEnergyEntry(
polymer_type=polymer_type,
protein_group=entry.partition_element,
partition_name=partition_name,
contact_share=cs,
expected_share=es,
enrichment_ratio=enrichment_ratio,
delta_G=delta_G,
delta_G_uncertainty=delta_G_unc,
delta_G_per_replicate=dg_per_rep,
units=units,
temperature_K=temperature_K,
n_replicates=len(dg_per_rep),
n_exposed_in_group=getattr(entry, "n_exposed_in_group", 0),
)
def _entries_from_single(
self,
result: "BindingPreferenceResult",
kT: float,
units: str,
temperature_K: float,
) -> list[FreeEnergyEntry]:
"""Extract ΔG_sel entries from a single-replicate BindingPreferenceResult."""
entries: list[FreeEnergyEntry] = []
bp = result.binding_preference
# Iterate AA-class partition
for polymer_type, partition_result in bp.aa_class_binding.items():
for entry in partition_result.entries:
cs = entry.contact_share
es = entry.expected_share
if cs <= 0.0 or es <= 0.0:
continue
enrichment_ratio = cs / es
delta_G = -kT * math.log(enrichment_ratio)
dg_per_rep = [delta_G] # single replicate
entries.append(
FreeEnergyEntry(
polymer_type=polymer_type,
protein_group=entry.partition_element,
partition_name="aa_class",
contact_share=cs,
expected_share=es,
enrichment_ratio=enrichment_ratio,
delta_G=delta_G,
delta_G_uncertainty=None,
delta_G_per_replicate=dg_per_rep,
units=units,
temperature_K=temperature_K,
n_replicates=1,
n_exposed_in_group=getattr(entry, "n_exposed_in_group", 0),
)
)
# Iterate user-defined partitions if present
for partition_name, partition_dict in bp.user_defined_partitions.items():
for polymer_type, partition_result in partition_dict.items():
for entry in partition_result.entries:
cs = entry.contact_share
es = entry.expected_share
if cs <= 0.0 or es <= 0.0:
continue
enrichment_ratio = cs / es
delta_G = -kT * math.log(enrichment_ratio)
entries.append(
FreeEnergyEntry(
polymer_type=polymer_type,
protein_group=entry.partition_element,
partition_name=partition_name,
contact_share=cs,
expected_share=es,
enrichment_ratio=enrichment_ratio,
delta_G=delta_G,
delta_G_uncertainty=None,
delta_G_per_replicate=[delta_G],
units=units,
temperature_K=temperature_K,
n_replicates=1,
n_exposed_in_group=getattr(entry, "n_exposed_in_group", 0),
)
)
return entries
# =========================================================================
# Pairwise comparison
# =========================================================================
def _compute_pairwise(
self,
summaries: list[FreeEnergyConditionSummary],
) -> list[FreeEnergyPairwiseEntry]:
"""Compute pairwise ΔΔG (= ΔG_sel,B − ΔG_sel,A) comparisons, respecting temperature grouping.
Parameters
----------
summaries : list[FreeEnergyConditionSummary]
Condition summaries.
Returns
-------
list[FreeEnergyPairwiseEntry]
All pairwise entries (cross-temperature ones have stats suppressed).
"""
control = self.config.control
comparisons: list[FreeEnergyPairwiseEntry] = []
label_to_summary = {s.label: s for s in summaries}
labels = [s.label for s in summaries]
# Determine whether the control has usable BFE data.
# "No Polymer" conditions have no polymer contacts and therefore no entries.
# When the control is missing or has no valid entries, fall back to all-pairs.
control_has_data = (
control is not None
and control in label_to_summary
and any(e.delta_G is not None for e in label_to_summary[control].entries)
)
if control_has_data:
# Compare all conditions with data vs control
summary_a = label_to_summary[control] # type: ignore[index]
for label_b in labels:
if label_b == control:
continue
summary_b = label_to_summary[label_b]
# Only compare if summary_b also has data
if not any(e.delta_G is not None for e in summary_b.entries):
continue
comparisons.extend(self._compare_condition_pair(summary_a, summary_b))
else:
# Fall back to all-pairs among conditions that have valid ΔG_sel data
if control is not None and control in label_to_summary:
logger.info(
f"Control '{control}' has no ΔG_sel data (e.g. no polymer contacts). "
"Falling back to all-pairs comparison among conditions with data."
)
valid_labels = [
lbl
for lbl in labels
if any(e.delta_G is not None for e in label_to_summary[lbl].entries)
]
for i in range(len(valid_labels)):
for j in range(i + 1, len(valid_labels)):
sa = label_to_summary[valid_labels[i]]
sb = label_to_summary[valid_labels[j]]
comparisons.extend(self._compare_condition_pair(sa, sb))
return comparisons
def _compare_condition_pair(
self,
summary_a: FreeEnergyConditionSummary,
summary_b: FreeEnergyConditionSummary,
) -> list[FreeEnergyPairwiseEntry]:
"""Compare two conditions for all shared (polymer_type, protein_group) pairs.
Parameters
----------
summary_a : FreeEnergyConditionSummary
First condition (typically control).
summary_b : FreeEnergyConditionSummary
Second condition (typically treatment).
Returns
-------
list[FreeEnergyPairwiseEntry]
One entry per shared (polymer, group) pair.
"""
cross_temperature = not math.isclose(
summary_a.temperature_K, summary_b.temperature_K, rel_tol=1e-3
)
# Find all polymer/group pairs present in both
pairs_a = {(e.polymer_type, e.protein_group): e for e in summary_a.entries}
pairs_b = {(e.polymer_type, e.protein_group): e for e in summary_b.entries}
shared_pairs = sorted(set(pairs_a.keys()) & set(pairs_b.keys()))
pairwise: list[FreeEnergyPairwiseEntry] = []
for polymer_type, protein_group in shared_pairs:
entry_a = pairs_a[(polymer_type, protein_group)]
entry_b = pairs_b[(polymer_type, protein_group)]
dg_a = entry_a.delta_G
dg_b = entry_b.delta_G
delta_delta_G: float | None = None
if dg_a is not None and dg_b is not None:
delta_delta_G = dg_b - dg_a
# Stats: only between same-temperature conditions with enough replicates
t_stat: float | None = None
p_val: float | None = None
if not cross_temperature:
reps_a = [v for v in entry_a.delta_G_per_replicate if not math.isnan(v)]
reps_b = [v for v in entry_b.delta_G_per_replicate if not math.isnan(v)]
if len(reps_a) >= 2 and len(reps_b) >= 2:
try:
ttest = independent_ttest(reps_a, reps_b)
t_stat = ttest.t_statistic
p_val = ttest.p_value
except Exception as exc:
logger.debug(f"T-test failed for ({polymer_type}, {protein_group}): {exc}")
pairwise.append(
FreeEnergyPairwiseEntry(
polymer_type=polymer_type,
protein_group=protein_group,
condition_a=summary_a.label,
condition_b=summary_b.label,
temperature_a_K=summary_a.temperature_K,
temperature_b_K=summary_b.temperature_K,
cross_temperature=cross_temperature,
delta_G_a=dg_a,
delta_G_b=dg_b,
delta_delta_G=delta_delta_G,
t_statistic=t_stat,
p_value=p_val,
)
)
return pairwise
# =========================================================================
# Settings resolution (BFE settings with fallback to contacts settings)
# =========================================================================
def _resolve_compute_settings(self) -> dict[str, Any]:
"""Resolve compute settings, falling back to contacts analysis settings.
BFE settings may omit fields like ``enzyme_pdb_for_sasa`` that are
typically configured in the contacts analysis section. This method
returns a unified dict, preferring BFE settings and falling back to
contacts settings from the same comparison.yaml.
Returns
-------
dict
Resolved settings for compute_condition_binding_preference().
"""
bfe = self.analysis_settings
# Try to get contacts settings for fallback
contacts_settings = None
if hasattr(self.config, "analysis_settings"):
contacts_settings = self.config.analysis_settings.get("contacts")
def _get(attr: str, default: Any = None) -> Any:
"""Get from BFE settings first, then contacts settings, then default."""
val = getattr(bfe, attr, None)
if val is not None:
return val
if contacts_settings is not None:
val = getattr(contacts_settings, attr, None)
if val is not None:
return val
return default
return {
"enzyme_pdb_for_sasa": _get("enzyme_pdb_for_sasa"),
"surface_exposure_threshold": _get("surface_exposure_threshold", 0.2),
"include_default_aa_groups": _get("include_default_aa_groups", True),
"protein_groups": _get("protein_groups"),
"protein_partitions": _get("protein_partitions"),
"polymer_type_selections": _get("polymer_type_selections"),
}
# =========================================================================
# Cache loading helpers (mirrors ContactsComparator pattern)
# =========================================================================
def _find_contacts_analysis_dir(
self,
sim_config: Any,
cond: "ConditionConfig",
condition_output_dir: Path | None = None,
) -> Path:
"""Find the contacts analysis directory for a condition.
Checks locations in order:
1. condition_output_dir (condition-specific, comparison mode)
2. sim_config.output.projects_directory / analysis / contacts /
3. cond.config parent directory / analysis / contacts /
Parameters
----------
sim_config : SimulationConfig
Simulation configuration.
cond : ConditionConfig
Condition configuration.
condition_output_dir : Path, optional
Condition-specific contacts output directory (from comparison
mode). Checked first before falling back to
``projects_directory``.
Returns
-------
Path
Analysis directory path.
"""
# Check condition-specific path first (comparison mode)
if condition_output_dir is not None:
if condition_output_dir.exists():
return condition_output_dir
# In comparison mode, do NOT fall back to the shared
# projects_directory — all conditions share the same path.
# Return the condition-specific path (even if not yet created)
# so that downstream code computes and saves there.
return condition_output_dir
from polyzymd.compare.comparators._utils import find_analysis_dir
return find_analysis_dir(
sim_config,
analysis_subdir="analysis/contacts",
cond_config_path=Path(cond.config),
)
def _try_load_cached_binding_preference(
self,
cond: "ConditionConfig",
analysis_dir: Path,
) -> "AggregatedBindingPreferenceResult | BindingPreferenceResult | None":
"""Load cached binding preference results for a condition.
Searches for files in order of preference:
1. binding_preference_aggregated.json
2. binding_preference_aggregated_reps*.json (glob)
3. binding_preference.json (single replicate)
4. Per-replicate binding_preference_rep{N}.json → aggregate
Parameters
----------
cond : ConditionConfig
Condition configuration.
analysis_dir : Path
Contacts analysis directory.
Returns
-------
AggregatedBindingPreferenceResult | BindingPreferenceResult | None
Loaded result, or None if not found.
"""
from polyzymd.compare.comparators._binding_preference_helpers import (
try_load_cached_binding_preference,
)
return try_load_cached_binding_preference(cond, analysis_dir)