Source code for polyzymd.analyses.shared.alignment

"""Trajectory alignment utilities.

This module provides reusable trajectory alignment infrastructure for
all analysis modules. The alignment is performed in-memory using
MDAnalysis and removes rotational drift and center-of-mass motion
that can add noise to distance and flexibility measurements.

Reference Modes
---------------
- **centroid**: Align to a representative aligned frame (closest to aligned
  mean structure).
  Best for measuring fluctuations around a representative equilibrium state.
- **average**: Align to mathematical average structure.
  Best for pure thermal fluctuation analysis (note: average may have
  unphysical geometry).
- **frame**: Align to a specific user-specified frame number.
  Best for comparing to a known functional conformation.
- **external**: Align to an external PDB structure (e.g., crystal structure).
  Best for measuring deviations from a catalytically competent reference
  geometry that is independent of the simulation conditions.

Usage
-----
>>> from polyzymd.analyses.shared.alignment import AlignmentConfig, align_trajectory
>>>
>>> # Create configuration
>>> config = AlignmentConfig(
...     enabled=True,
...     reference_mode="centroid",
...     selection="protein and name CA",
... )
>>>
>>> # Align trajectory in-place
>>> ref_frame = align_trajectory(universe, config, start_frame=100)
>>> # Universe is now aligned; ref_frame is the reference frame index

Notes
-----
Alignment modifies the Universe in-place. If you need the original
coordinates, load a fresh Universe after alignment.

The alignment selection should typically be stable atoms that define
the reference frame. Common choices:
- "protein and name CA": Backbone alpha-carbons (default)
- "protein and backbone": Full backbone (N, CA, C)
- "protein": All protein atoms (includes flexible side chains)

References
----------
- MDAnalysis alignment: https://docs.mdanalysis.org/stable/documentation_pages/analysis/align.html
"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import TYPE_CHECKING, Literal

from pydantic import BaseModel, Field, model_validator

from polyzymd.analyses.shared.loader import _require_mdanalysis

if TYPE_CHECKING:
    from MDAnalysis.core.universe import Universe

LOGGER = logging.getLogger(__name__)

# Type alias for reference modes (re-export from centroid for consistency)
ReferenceMode = Literal["centroid", "average", "frame", "external"]


[docs] class AlignmentConfig(BaseModel): """Configuration for trajectory alignment. This model defines how a trajectory should be aligned before analysis. By default, alignment is ENABLED to remove rotational drift and COM motion that can add noise to distance measurements. Attributes ---------- enabled : bool Whether to align the trajectory before analysis. Default is True. When enabled, the user is notified via INFO-level logging. reference_mode : ReferenceMode How to select the reference structure: - "centroid": Representative aligned frame (closest to aligned mean) - "average": Mathematical average structure - "frame": Specific frame number reference_frame : int | None Frame number (1-indexed) when reference_mode="frame". Required when mode is "frame", ignored otherwise. selection : str MDAnalysis selection string for superposition. Atoms matching this selection are used to compute the optimal rotation/translation. Default: "protein and name CA" (backbone alpha-carbons). centroid_selection : str Selection for representative-frame detection when mode="centroid". Default: "protein" (all protein atoms for better discrimination). Examples -------- >>> # Default: align to centroid using CA atoms >>> config = AlignmentConfig() >>> # Align to a specific frame >>> config = AlignmentConfig( ... reference_mode="frame", ... reference_frame=500, ... ) >>> # Disable alignment (not recommended for most analyses) >>> config = AlignmentConfig(enabled=False) """ enabled: bool = Field( default=True, description="Whether to align trajectory before analysis", ) reference_mode: ReferenceMode = Field( default="centroid", description="Reference structure mode: centroid, average, or frame", ) reference_frame: int | None = Field( default=None, description="Frame number (1-indexed) when reference_mode='frame'", ) selection: str = Field( default="protein and name CA", description="MDAnalysis selection for superposition", ) centroid_selection: str = Field( default="protein", description="Selection for representative aligned frame detection (centroid mode only)", ) reference_file: Path | None = Field( default=None, description=( "Path to an external PDB file to use as the alignment reference " "(required when reference_mode='external'). The external PDB " "must contain protein atoms that match the simulation topology." ), )
[docs] @model_validator(mode="after") def validate_reference_params(self) -> "AlignmentConfig": """Validate reference_frame and reference_file for their modes.""" if not self.enabled: return self if self.reference_mode == "frame" and self.reference_frame is None: raise ValueError( "reference_frame is required when reference_mode='frame'. " "Provide a 1-indexed frame number." ) if self.reference_mode == "external": if self.reference_file is None: raise ValueError( "reference_file is required when reference_mode='external'. " "Provide a path to the external PDB reference structure." ) ref_path = Path(self.reference_file) if not ref_path.exists(): raise ValueError( f"reference_file does not exist: {ref_path}. " "Provide a valid path to the external PDB reference structure." ) return self
[docs] def to_dict(self) -> dict: """Convert to dictionary for cache key hashing.""" d = { "enabled": self.enabled, "reference_mode": self.reference_mode, "reference_frame": self.reference_frame, "selection": self.selection, "centroid_selection": self.centroid_selection, } if self.reference_file is not None: d["reference_file"] = str(self.reference_file) return d
[docs] def align_trajectory( universe: "Universe", config: AlignmentConfig, start_frame: int = 0, stop_frame: int | None = None, ) -> int | None: """Align trajectory in-memory to a reference structure. This function performs trajectory alignment using MDAnalysis, removing rotational drift and center-of-mass motion. The alignment is performed in-place, modifying the Universe's coordinates. Parameters ---------- universe : Universe MDAnalysis Universe to align. Will be modified in-place. config : AlignmentConfig Alignment configuration specifying reference mode and selections. start_frame : int, optional First frame to consider (0-indexed). Default is 0. Frames before this are typically equilibration and should be excluded. stop_frame : int, optional Last frame (exclusive). If None, uses all frames after start_frame. Returns ------- int | None Reference frame index (0-indexed), or None if using average mode. This can be stored in results for reproducibility. Raises ------ ImportError If MDAnalysis is not available. ValueError If reference_mode is "frame" but the frame number is out of range. Notes ----- When alignment is performed, an INFO-level log message is emitted to notify the user. This ensures users are aware that their trajectory coordinates have been modified. Examples -------- >>> from polyzymd.analyses.shared.alignment import AlignmentConfig, align_trajectory >>> >>> config = AlignmentConfig(reference_mode="centroid") >>> ref_frame = align_trajectory(universe, config, start_frame=100) >>> print(f"Aligned to frame {ref_frame}") """ if not config.enabled: LOGGER.debug("Alignment disabled by configuration") return None _require_mdanalysis("trajectory alignment") from MDAnalysis.analysis import align from polyzymd.analyses.shared.centroid import find_centroid_frame n_frames = len(universe.trajectory) if stop_frame is None: stop_frame = n_frames ref_frame_idx: int | None = None if config.reference_mode == "centroid": # Find representative aligned frame LOGGER.info( f"Aligning trajectory to representative aligned frame (selection: '{config.selection}', " f"frames {start_frame}-{stop_frame})" ) ref_frame_idx = find_centroid_frame( universe, selection=config.centroid_selection, start_frame=start_frame, stop_frame=stop_frame, verbose=False, # We handle our own logging ) LOGGER.info(f"Found representative aligned frame at {ref_frame_idx} (0-indexed)") # Align to centroid frame aligner = align.AlignTraj( universe, universe, select=config.selection, ref_frame=ref_frame_idx, in_memory=True, ).run() del aligner elif config.reference_mode == "average": # Compute average structure and align to it LOGGER.info(f"Aligning trajectory to average structure (selection: '{config.selection}')") average = align.AverageStructure( universe, universe, select=config.selection, ref_frame=start_frame, # Initial reference for iterative averaging ).run() ref_universe = average.results.universe aligner = align.AlignTraj( universe, ref_universe, select=config.selection, in_memory=True, ).run() del aligner del average ref_frame_idx = None # Average is synthetic, not a real frame LOGGER.info("Aligned to average structure (synthetic reference)") elif config.reference_mode == "frame": # Use user-specified frame (convert from 1-indexed to 0-indexed) ref_frame_idx = config.reference_frame - 1 # type: ignore[operator] # Validate frame number if ref_frame_idx < 0 or ref_frame_idx >= n_frames: raise ValueError( f"reference_frame={config.reference_frame} (1-indexed) is out of range. " f"Valid range: 1 to {n_frames}" ) LOGGER.info( f"Aligning trajectory to frame {config.reference_frame} " f"(selection: '{config.selection}')" ) aligner = align.AlignTraj( universe, universe, select=config.selection, ref_frame=ref_frame_idx, in_memory=True, ).run() del aligner elif config.reference_mode == "external": # Align to an external PDB structure (e.g., catalytically competent crystal) import MDAnalysis as mda ref_path = Path(config.reference_file) # type: ignore[arg-type] LOGGER.info( f"Aligning trajectory to external PDB: {ref_path} (selection: '{config.selection}')" ) # Load external reference as a separate Universe ref_universe = mda.Universe(str(ref_path)) # Validate atom counts match for the alignment selection traj_atoms = universe.select_atoms(config.selection) ref_atoms = ref_universe.select_atoms(config.selection) if len(ref_atoms) == 0: raise ValueError( f"External PDB '{ref_path.name}' has no atoms matching " f"selection '{config.selection}'. Check that the PDB contains " f"protein atoms with matching names/residues." ) if len(traj_atoms) != len(ref_atoms): raise ValueError( f"Atom count mismatch between trajectory ({len(traj_atoms)}) " f"and external PDB ({len(ref_atoms)}) for selection " f"'{config.selection}'. The external PDB must contain " f"the same protein atoms as the simulation topology." ) # Warn if residue names don't match traj_resnames = [r.resname for r in traj_atoms.residues] ref_resnames = [r.resname for r in ref_atoms.residues] if traj_resnames != ref_resnames: mismatches = [ (i, t, r) for i, (t, r) in enumerate(zip(traj_resnames, ref_resnames)) if t != r ] LOGGER.warning( f"Residue name mismatches between trajectory and external PDB " f"({len(mismatches)} residues differ). First 5: " f"{mismatches[:5]}. Proceeding with alignment anyway." ) LOGGER.info( f"External PDB matched: {len(ref_atoms)} atoms, {len(ref_atoms.residues)} residues" ) # Align trajectory to external reference aligner = align.AlignTraj( universe, ref_universe, select=config.selection, in_memory=True, ).run() del aligner ref_frame_idx = None # External reference is not a trajectory frame LOGGER.info("Aligned to external PDB reference structure") else: raise ValueError(f"Unknown reference_mode: {config.reference_mode}") LOGGER.info("Trajectory alignment complete") return ref_frame_idx
[docs] def get_alignment_description(config: AlignmentConfig) -> str: """Generate a human-readable description of the alignment configuration. Parameters ---------- config : AlignmentConfig Alignment configuration. Returns ------- str Description suitable for documentation or result files. """ if not config.enabled: return "Alignment: disabled" if config.reference_mode == "centroid": return ( "Alignment: centroid mode (representative aligned frame), " f"selection='{config.selection}'" ) elif config.reference_mode == "average": return f"Alignment: average structure, selection='{config.selection}'" elif config.reference_mode == "frame": return ( f"Alignment: frame {config.reference_frame} (1-indexed), selection='{config.selection}'" ) elif config.reference_mode == "external": return f"Alignment: external PDB '{config.reference_file}', selection='{config.selection}'" else: return f"Alignment: {config.reference_mode}"