Source code for polyzymd.analyses.shared.binding_preference._orchestration

"""Config-driven orchestration for binding preference workflows."""

from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Mapping, Protocol, Sequence

from ._compute import compute_binding_preference
from ._models import PolymerComposition
from ._polymer import extract_polymer_composition
from ._resolution import (
    resolve_polymer_type_selections,
    resolve_protein_group_selections,
    resolve_protein_groups_from_surface_exposure,
)

if TYPE_CHECKING:
    from MDAnalysis.core.universe import Universe

    from ._models import AggregatedBindingPreferenceResult, BindingPreferenceResult

logger = logging.getLogger(__name__)


class ConditionLike(Protocol):
    """Minimal condition protocol required by orchestration helpers.

    Attributes
    ----------
    label : str
        Condition label.
    replicates : Sequence[int]
        Replicate IDs associated with this condition.
    """

    label: str
    replicates: Sequence[int]


[docs] def compute_condition_binding_preference( cond: ConditionLike, sim_config: Any, analysis_dir: Path, *, enzyme_pdb: Path, contact_results_by_replicate: Mapping[int, Path], load_contact_result: Callable[[Path], Any], threshold: float, include_default_aa_groups: bool, custom_protein_groups: dict[str, list[int]] | None, protein_partitions: dict[str, list[str]] | None, polymer_type_selections: dict[str, str] | None, polymer_chain: str, settings_fp: str | None, ) -> "AggregatedBindingPreferenceResult | None": """Compute condition-level binding preference from contacts data. Parameters ---------- cond : ConditionLike Condition to compute. sim_config : Any Simulation config used for topology lookup. analysis_dir : Path Contacts analysis directory where outputs are saved. enzyme_pdb : Path Enzyme PDB path used for SASA filtering. contact_results_by_replicate : Mapping[int, Path] Mapping from replicate index to contacts JSON path. load_contact_result : callable Callable used to deserialize contacts results. threshold : float Relative SASA threshold for exposed residues. include_default_aa_groups : bool Whether default AA class groups are included. custom_protein_groups : dict[str, list[int]] | None Optional custom protein groups. protein_partitions : dict[str, list[str]] | None Optional user-defined protein partitions. polymer_type_selections : dict[str, str] | None Optional explicit polymer selection map. polymer_chain : str Polymer chain ID for auto-detection. settings_fp : str | None Optional settings fingerprint used for cache filenames. Returns ------- AggregatedBindingPreferenceResult | None Aggregated result, or None if computation failed. """ import MDAnalysis as mda from polyzymd.analyses.shared.loader import TrajectoryLoader from polyzymd.analyses.shared.surface_exposure import SurfaceExposureFilter from ._aggregate import aggregate_binding_preference # Compute surface exposure from enzyme structure try: exposure_filter = SurfaceExposureFilter(threshold=threshold) surface_exposure = exposure_filter.calculate(str(enzyme_pdb)) logger.debug( f"Computed surface exposure for {cond.label}: " f"{surface_exposure.exposed_count}/{surface_exposure.total_count} residues exposed" ) except (FileNotFoundError, ValueError, OSError, ImportError) as exc: logger.warning(f"Failed to compute surface exposure for {cond.label}: {exc}") return None protein_groups = resolve_protein_groups_from_surface_exposure( surface_exposure, include_default_aa_groups=include_default_aa_groups, custom_protein_groups=custom_protein_groups, ) if not protein_groups: logger.warning(f"No protein groups resolved for {cond.label}") return None polymer_composition: PolymerComposition | None = None first_rep = cond.replicates[0] if cond.replicates else 1 run_dir = sim_config.get_working_directory(first_rep) topology_path = None try: loader = TrajectoryLoader(sim_config) topology_path = loader.find_topology(run_dir) except (FileNotFoundError, ImportError): logger.warning( f"Cannot extract polymer composition for {cond.label}: " f"no topology file found in {run_dir}" ) if topology_path is not None: try: universe = mda.Universe(str(topology_path)) polymer_composition = extract_polymer_composition( universe, polymer_type_selections, polymer_chain=polymer_chain ) logger.debug( f"Extracted polymer composition for {cond.label}: " f"{polymer_composition.total_residues} residues, " f"{polymer_composition.total_heavy_atoms} heavy atoms" ) except (ValueError, OSError, AttributeError) as exc: logger.warning(f"Failed to extract polymer composition for {cond.label}: {exc}") if polymer_composition is None: polymer_composition = PolymerComposition() logger.warning( f"Using empty polymer composition for {cond.label} — enrichment ratios will be NaN" ) rep_results = [] for rep in cond.replicates: contact_path = contact_results_by_replicate.get(rep) if contact_path is None: logger.warning(f"Contacts file not found for {cond.label} rep{rep} in {analysis_dir}") continue try: contact_result = load_contact_result(contact_path) bp_result = compute_binding_preference( contact_result=contact_result, surface_exposure=surface_exposure, protein_groups=protein_groups, polymer_composition=polymer_composition, protein_partitions=protein_partitions, ) rep_results.append(bp_result) if settings_fp is not None: rep_bp_path = analysis_dir / f"binding_preference_s{settings_fp}_rep{rep}.json" else: rep_bp_path = analysis_dir / f"binding_preference_rep{rep}.json" bp_result.save(rep_bp_path) logger.debug(f"Computed and saved binding preference for {cond.label} rep{rep}") except (FileNotFoundError, json.JSONDecodeError, ValueError, KeyError, OSError) as exc: logger.warning(f"Failed to compute binding preference for {cond.label} rep{rep}: {exc}") if not rep_results: logger.warning(f"No binding preference results computed for {cond.label}") return None agg_result = aggregate_binding_preference(rep_results) rep_range = f"{min(cond.replicates)}-{max(cond.replicates)}" if settings_fp is not None: agg_path = ( analysis_dir / f"binding_preference_aggregated_s{settings_fp}_reps{rep_range}.json" ) else: agg_path = analysis_dir / f"binding_preference_aggregated_reps{rep_range}.json" agg_result.save(agg_path) logger.info( f"Computed binding preference for {cond.label}: " f"{len(rep_results)} replicates, {len(protein_groups)} protein groups" ) return agg_result
[docs] def compute_binding_preference_from_config( contact_result: Any, universe: "Universe", enzyme_pdb_path: Path | str, config: Any, ) -> BindingPreferenceResult: """Compute binding preference using contacts plugin settings. This is a convenience function that orchestrates the full binding preference computation using configuration from analysis.yaml. It: 1. Calculates surface exposure from enzyme PDB 2. Resolves protein group selections to residue IDs 3. Computes binding preference with enrichment ratios Parameters ---------- contact_result : Any Contact analysis results from trajectory universe : Universe MDAnalysis Universe (used for resolving selections) enzyme_pdb_path : Path or str Path to enzyme PDB file for SASA calculation config : Any Settings object with binding preference fields Returns ------- BindingPreferenceResult Binding preference with enrichment ratios Notes ----- This function requires rust_sasa_python to be installed for surface exposure calculation. Install with: pip install rust-sasa-python Examples -------- >>> from polyzymd.analyses.contacts import ContactsSettings >>> config = ContactsSettings( ... compute_binding_preference=True, ... surface_exposure_threshold=0.2, ... ) >>> result = compute_binding_preference_from_config( ... contact_result, universe, "enzyme.pdb", config ... ) >>> print(result.enrichment_matrix()) {'SBM': {'aromatic': 1.45, 'polar': 0.82, ...}, ...} """ from polyzymd.analyses.shared.surface_exposure import SurfaceExposureFilter logger.info("Computing binding preference from config...") # Step 1: Calculate surface exposure exposure_filter = SurfaceExposureFilter(threshold=config.surface_exposure_threshold) surface_exposure = exposure_filter.calculate(enzyme_pdb_path) logger.info( f"Surface exposure: {surface_exposure.exposed_count}/{surface_exposure.total_count} " f"residues exposed (threshold={config.surface_exposure_threshold})" ) # Step 2: Resolve protein group selections to residue IDs protein_groups = resolve_protein_group_selections(universe, config.protein_group_selections) for group_name, resids in protein_groups.items(): exposed_in_group = surface_exposure.exposed_in_selection(resids) logger.debug(f" {group_name}: {len(resids)} residues, {len(exposed_in_group)} exposed") # Step 3: Get polymer types # Extract polymer chain ID from config.polymer_selection (e.g. "chainID C" → "C"). # Falls back to default "C" if the selection doesn't match the "chainID X" pattern. _polymer_chain = "C" _ps = getattr(config, "polymer_selection", "") if _ps.startswith("chainID ") and len(_ps.split()) == 2: _polymer_chain = _ps.split()[1] polymer_types = resolve_polymer_type_selections( universe, config.polymer_type_selections, polymer_chain=_polymer_chain ) logger.info(f"Polymer types for binding preference: {polymer_types}") # Step 4: Extract polymer composition for dual normalization polymer_composition = extract_polymer_composition( universe, config.polymer_type_selections, polymer_chain=_polymer_chain ) # Step 5: Compute binding preference result = compute_binding_preference( contact_result=contact_result, surface_exposure=surface_exposure, protein_groups=protein_groups, polymer_composition=polymer_composition, polymer_types=polymer_types if polymer_types else None, protein_group_selections=config.protein_group_selections, polymer_type_selections=config.polymer_type_selections, ) return result