Source code for polyzymd.analyses.shared.surface_exposure

"""Surface exposure filtering using SASA calculations.

Shared utility for determining which protein residues are surface-exposed
based on solvent-accessible surface area (SASA) calculations.  Surface
exposure data is consumed by multiple analysis plugins (contacts,
binding_free_energy, polymer_affinity) for binding preference analysis,
since buried residues cannot participate in polymer-protein contacts.

Uses `rust_sasa_python` for fast SASA computation on initial PDB structures.

The surface exposure threshold determines which residues are considered
"exposed" based on their relative SASA (actual SASA / max theoretical SASA).
A threshold of 0.2 means residues with at least 20% of their theoretical
maximum surface area exposed are considered surface-accessible.

Examples
--------
>>> from polyzymd.analyses.shared.surface_exposure import SurfaceExposureFilter
>>> filter = SurfaceExposureFilter(threshold=0.2)
>>> result = filter.calculate("enzyme.pdb")
>>> print(f"Found {result.exposed_count} surface-exposed residues")
>>> print(result.exposed_by_aa_class())
{'aromatic': 12, 'polar': 25, 'nonpolar': 18, ...}
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING

from polyzymd.analyses.shared.aa_classification import MAX_ASA_TABLE, get_aa_class
from polyzymd.analyses.shared.constants import DEFAULT_SURFACE_EXPOSURE_THRESHOLD

if TYPE_CHECKING:
    pass

logger = logging.getLogger(__name__)


[docs] @dataclass class ResidueExposure: """SASA data for a single protein residue. Attributes ---------- resid : int Residue ID (1-indexed, as in PDB) resname : str 3-letter residue name (e.g., "ALA", "PHE") chain_id : str Chain identifier from PDB sasa : float Calculated SASA in Angstrom^2 max_sasa : float Maximum theoretical SASA for this residue type (Tien et al. 2013) relative_sasa : float Ratio of actual to maximum SASA (0.0 to ~1.0+) is_exposed : bool True if relative_sasa >= threshold aa_class : str Amino acid classification (aromatic, polar, etc.) """ resid: int resname: str chain_id: str sasa: float max_sasa: float relative_sasa: float is_exposed: bool aa_class: str
[docs] @dataclass class SurfaceExposureResult: """Surface exposure analysis result for a protein structure. Contains SASA data for all residues and provides convenience methods for filtering and grouping by exposure status and amino acid class. Attributes ---------- residue_exposures : list[ResidueExposure] SASA data for each residue threshold : float Relative SASA threshold used (e.g., 0.2 for 20%) pdb_path : str Path to the PDB file analyzed probe_radius : float Probe radius used for SASA calculation n_points : int Number of points used for SASA calculation """ residue_exposures: list[ResidueExposure] = field(default_factory=list) threshold: float = DEFAULT_SURFACE_EXPOSURE_THRESHOLD pdb_path: str = "" probe_radius: float = 1.4 n_points: int = 1000 @property def exposed_resids(self) -> set[int]: """Set of residue IDs that are surface-exposed.""" return {r.resid for r in self.residue_exposures if r.is_exposed} @property def buried_resids(self) -> set[int]: """Set of residue IDs that are buried (not surface-exposed).""" return {r.resid for r in self.residue_exposures if not r.is_exposed} @property def all_resids(self) -> set[int]: """Set of all residue IDs.""" return {r.resid for r in self.residue_exposures} @property def exposed_count(self) -> int: """Number of surface-exposed residues.""" return len(self.exposed_resids) @property def total_count(self) -> int: """Total number of residues analyzed.""" return len(self.residue_exposures)
[docs] def exposed_by_aa_class(self) -> dict[str, int]: """Count of exposed residues per amino acid class. Returns ------- dict[str, int] Mapping from AA class to count of exposed residues. E.g., {'aromatic': 12, 'polar': 25, ...} """ counts: dict[str, int] = {} for r in self.residue_exposures: if r.is_exposed: counts[r.aa_class] = counts.get(r.aa_class, 0) + 1 return counts
[docs] def total_by_aa_class(self) -> dict[str, int]: """Count of all residues per amino acid class (exposed + buried). Returns ------- dict[str, int] Mapping from AA class to total count. """ counts: dict[str, int] = {} for r in self.residue_exposures: counts[r.aa_class] = counts.get(r.aa_class, 0) + 1 return counts
[docs] def exposed_in_selection(self, resids: set[int]) -> set[int]: """Get exposed residues within a specific selection. Parameters ---------- resids : set[int] Set of residue IDs to filter Returns ------- set[int] Intersection of resids with exposed_resids """ return self.exposed_resids & resids
[docs] def get_exposure(self, resid: int) -> ResidueExposure | None: """Get exposure data for a specific residue. Parameters ---------- resid : int Residue ID to look up Returns ------- ResidueExposure or None Exposure data, or None if resid not found """ for r in self.residue_exposures: if r.resid == resid: return r return None
[docs] def to_dict(self) -> dict: """Serialize to dictionary for JSON storage.""" return { "threshold": self.threshold, "pdb_path": self.pdb_path, "probe_radius": self.probe_radius, "n_points": self.n_points, "residue_exposures": [ { "resid": r.resid, "resname": r.resname, "chain_id": r.chain_id, "sasa": r.sasa, "max_sasa": r.max_sasa, "relative_sasa": r.relative_sasa, "is_exposed": r.is_exposed, "aa_class": r.aa_class, } for r in self.residue_exposures ], }
[docs] @classmethod def from_dict(cls, data: dict) -> "SurfaceExposureResult": """Deserialize from dictionary.""" exposures = [ ResidueExposure( resid=r["resid"], resname=r["resname"], chain_id=r["chain_id"], sasa=r["sasa"], max_sasa=r["max_sasa"], relative_sasa=r["relative_sasa"], is_exposed=r["is_exposed"], aa_class=r["aa_class"], ) for r in data.get("residue_exposures", []) ] return cls( residue_exposures=exposures, threshold=data.get("threshold", DEFAULT_SURFACE_EXPOSURE_THRESHOLD), pdb_path=data.get("pdb_path", ""), probe_radius=data.get("probe_radius", 1.4), n_points=data.get("n_points", 1000), )
[docs] class SurfaceExposureFilter: """Compute surface exposure from initial PDB structure. Uses `rust_sasa_python` for fast SASA calculation. Residues are classified as "exposed" if their relative SASA (actual/max theoretical) meets or exceeds the threshold. Parameters ---------- threshold : float Relative SASA threshold (0-1). Residues with SASA/maxSASA >= threshold are considered exposed. Default 0.2 (20% of max theoretical). probe_radius : float Probe radius for SASA calculation in Angstroms. Default 1.4 (water molecule radius). n_points : int Number of points for SASA calculation. Higher = more accurate but slower. Default 1000. Examples -------- >>> filter = SurfaceExposureFilter(threshold=0.2) >>> result = filter.calculate("enzyme.pdb") >>> exposed = result.exposed_resids >>> print(f"Found {len(exposed)} exposed residues") >>> # Get exposed residues by AA class >>> by_class = result.exposed_by_aa_class() >>> print(f"Exposed aromatics: {by_class.get('aromatic', 0)}") """
[docs] def __init__( self, threshold: float = DEFAULT_SURFACE_EXPOSURE_THRESHOLD, probe_radius: float = 1.4, n_points: int = 1000, ): if not 0.0 < threshold < 1.0: logger.warning( f"Surface exposure threshold {threshold} is outside typical range (0-1). " "This may classify too few or too many residues as exposed." ) self.threshold = threshold self.probe_radius = probe_radius self.n_points = n_points
[docs] def calculate(self, pdb_path: Path | str) -> SurfaceExposureResult: """Calculate surface exposure for all protein residues. Parameters ---------- pdb_path : Path or str Path to PDB file (typically the enzyme PDB from config). Should contain only the protein (not polymer/solvent). Returns ------- SurfaceExposureResult Surface exposure data for all residues Raises ------ FileNotFoundError If PDB file does not exist ImportError If rust_sasa_python is not installed """ # Lazy import to avoid dependency issues try: import rust_sasa_python as sasa except ImportError as e: raise ImportError( "rust_sasa_python is required for surface exposure analysis. " "Ensure rust_sasa_python is available in the PolyzyMD pixi " 'environment (for example: pixi run -e build python -c "import rust_sasa_python").' ) from e pdb_path = Path(pdb_path) if not pdb_path.exists(): raise FileNotFoundError(f"PDB file not found: {pdb_path}") logger.info(f"Calculating SASA for {pdb_path.name} (threshold={self.threshold})") # Create calculator and configure calculator = sasa.SASACalculator(str(pdb_path)) calculator = calculator.with_probe_radius(self.probe_radius) calculator = calculator.with_n_points(self.n_points) # Calculate per-residue SASA residue_sasa_values = calculator.calculate_residue() exposures = [] for res_data in residue_sasa_values: resname = res_data.residue_name resid = res_data.residue_number chain_id = getattr(res_data, "chain_id", "A") # Default to A if not present sasa_value = res_data.sasa # Get max SASA from table, fallback to 200 A^2 for unknown residues max_sasa = MAX_ASA_TABLE.get(resname.upper(), 200.0) # Calculate relative SASA relative_sasa = sasa_value / max_sasa if max_sasa > 0 else 0.0 # Determine if exposed is_exposed = relative_sasa >= self.threshold exposures.append( ResidueExposure( resid=resid, resname=resname, chain_id=chain_id, sasa=sasa_value, max_sasa=max_sasa, relative_sasa=relative_sasa, is_exposed=is_exposed, aa_class=get_aa_class(resname), ) ) result = SurfaceExposureResult( residue_exposures=exposures, threshold=self.threshold, pdb_path=str(pdb_path), probe_radius=self.probe_radius, n_points=self.n_points, ) logger.info( f"SASA analysis complete: {result.exposed_count}/{result.total_count} " f"residues are surface-exposed (>={self.threshold * 100:.0f}% relative SASA)" ) return result