# Source code for polyzymd.analyses.shared.binding_preference_helpers

"""Shared helpers for computing binding preference from contacts data.

Used by the contacts, binding_free_energy, and polymer_affinity analysis
plugins to compute SASA-based binding preference enrichment.

Public functions
----------------
- find_enzyme_pdb()
- resolve_enzyme_pdb()
- try_load_cached_binding_preference()
"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol, Sequence

if TYPE_CHECKING:
    from polyzymd.analyses.shared.binding_preference import (
        AggregatedBindingPreferenceResult,
        BindingPreferenceResult,
    )

logger = logging.getLogger("polyzymd.analyses")


class ConditionLike(Protocol):
    """Structural type for the condition objects the BP helpers accept.

    Any object exposing the two attributes below satisfies this protocol;
    no inheritance from a concrete condition class is required.

    Attributes
    ----------
    label : str
        Condition label.
    replicates : Sequence[int]
        Replicate IDs associated with this condition.
    """

    # Condition label.
    label: str
    # Replicate IDs associated with this condition.
    replicates: Sequence[int]
[docs] def find_enzyme_pdb(sim_config: Any) -> Path | None: """Find enzyme PDB file from simulation config. Searches common locations relative to the project directory. Parameters ---------- sim_config : SimulationConfig Simulation configuration (must have ``output.projects_directory``). Returns ------- Path or None Path to enzyme PDB, or None if not found. Raises ------ ValueError If a glob pattern matches multiple candidate enzyme PDB files. """ import glob as glob_module project_dir = sim_config.output.projects_directory possible_paths = [ project_dir / "structures" / "enzyme.pdb", project_dir / "input" / "enzyme.pdb", project_dir.parent / "structures" / "enzyme.pdb", project_dir.parent / "enzyme.pdb", ] for path in possible_paths: if path.exists(): return path # Try glob for any PDB with "enzyme" in name patterns = [ str(project_dir / "**" / "*enzyme*.pdb"), str(project_dir.parent / "*enzyme*.pdb"), ] for pattern in patterns: matches = sorted(glob_module.glob(pattern, recursive=True)) if matches: if len(matches) > 1: raise ValueError( f"Ambiguous enzyme PDB auto-discovery for pattern '{pattern}': " f"{len(matches)} matches found: " + ", ".join(matches) ) return Path(matches[0]) return None
[docs] def resolve_enzyme_pdb( enzyme_pdb_setting: str | None, source_path: Path | None, sim_config: Any, ) -> Path | None: """Resolve the enzyme PDB path from settings or auto-discovery. Parameters ---------- enzyme_pdb_setting : str or None Explicit enzyme PDB path from analysis settings (e.g., ``enzyme_pdb_for_sasa``). If relative, resolved against *source_path*'s parent directory. source_path : Path or None Path to the comparison.yaml file (used to resolve relative paths). sim_config : Any Simulation configuration for auto-discovery fallback. Returns ------- Path or None Resolved enzyme PDB path, or None if not found. """ if enzyme_pdb_setting: if source_path: enzyme_pdb = source_path.parent / enzyme_pdb_setting else: enzyme_pdb = Path(enzyme_pdb_setting) if enzyme_pdb.exists(): return enzyme_pdb logger.warning(f"Explicit enzyme_pdb_for_sasa not found at {enzyme_pdb}") return None return find_enzyme_pdb(sim_config)
def try_load_cached_binding_preference(
    cond: ConditionLike,
    analysis_dir: Path,
    *,
    settings_fp: str | None = None,
) -> "AggregatedBindingPreferenceResult | BindingPreferenceResult | None":
    """Try to load cached binding preference results for a condition.

    Within one naming tier, files are searched in order of preference
    (``<tag>`` is empty for legacy names and ``_s{settings_fp}`` for
    fingerprinted names):

    1. ``binding_preference_aggregated<tag>.json``
    2. ``binding_preference_aggregated<tag>_reps*.json`` (glob pattern)
    3. ``binding_preference<tag>.json`` (single replicate)
    4. Per-replicate files (``binding_preference<tag>_rep{N}.json``), which
       are aggregated on the fly. Only the replicate files that exist are
       aggregated; missing replicates are skipped.

    Parameters
    ----------
    cond : ConditionLike
        Condition to load.
    analysis_dir : Path
        Analysis directory for this condition.
    settings_fp : str or None, optional
        Settings fingerprint for cache lookup. When provided, fingerprinted
        cache files are searched first, then legacy filenames.

    Returns
    -------
    AggregatedBindingPreferenceResult | BindingPreferenceResult | None
        Loaded result, or None if not found.

    Raises
    ------
    ValueError
        If more than one ``_reps*.json`` aggregated file matches in a tier.
    """
    import glob as glob_module

    from polyzymd.analyses.shared.binding_preference import (
        AggregatedBindingPreferenceResult,
        BindingPreferenceResult,
        aggregate_binding_preference,
    )

    analysis_dir = Path(analysis_dir)

    def _load_tier(
        tag: str, *, pattern_in_error: bool
    ) -> "AggregatedBindingPreferenceResult | BindingPreferenceResult | None":
        # Run the four-step lookup for one filename tier; None means no hit.

        # 1. Aggregated result under its plain name (multi-replicate).
        agg_path = analysis_dir / f"binding_preference_aggregated{tag}.json"
        if agg_path.exists():
            result = AggregatedBindingPreferenceResult.load(agg_path)
            logger.debug(f"Loaded aggregated binding preference for {cond.label}")
            return result

        # 2. Aggregated result with rep range in name (e.g., _reps1-3.json).
        # Only the "*" is a wildcard; the literal prefix is escaped so glob
        # metacharacters in the directory name are matched verbatim.
        agg_pattern = (
            glob_module.escape(str(analysis_dir / f"binding_preference_aggregated{tag}"))
            + "_reps*.json"
        )
        agg_matches = sorted(glob_module.glob(agg_pattern))
        if len(agg_matches) > 1:
            # The fingerprinted tier names the pattern in its message; the
            # legacy tier does not.
            detail = f" matching '{agg_pattern}'" if pattern_in_error else ""
            raise ValueError(
                f"Ambiguous binding preference cache for {cond.label}: "
                f"found {len(agg_matches)} files{detail}: " + ", ".join(agg_matches)
            )
        if agg_matches:
            result = AggregatedBindingPreferenceResult.load(Path(agg_matches[0]))
            logger.debug(f"Loaded aggregated binding preference for {cond.label}")
            return result

        # 3. Single replicate result.
        single_path = analysis_dir / f"binding_preference{tag}.json"
        if single_path.exists():
            result = BindingPreferenceResult.load(single_path)
            logger.debug(f"Loaded single binding preference for {cond.label}")
            return result

        # 4. Per-replicate results, aggregated here.
        rep_paths = [
            analysis_dir / f"binding_preference{tag}_rep{rep}.json" for rep in cond.replicates
        ]
        rep_results = [
            BindingPreferenceResult.load(rep_path) for rep_path in rep_paths if rep_path.exists()
        ]
        if rep_results:
            agg_result = aggregate_binding_preference(rep_results)
            logger.debug(
                f"Aggregated {len(rep_results)} replicate binding preference results "
                f"for {cond.label}"
            )
            return agg_result

        return None

    # Fingerprinted cache files take precedence over legacy filenames.
    if settings_fp is not None:
        cached = _load_tier(f"_s{settings_fp}", pattern_in_error=True)
        if cached is not None:
            return cached

    return _load_tier("", pattern_in_error=False)