Source code for polyzymd.analyses.shared.centroid

"""Representative frame finding utilities.

This module provides functions to find representative frames from MD trajectories
using different methods. The representative frame is commonly used as a reference
for trajectory alignment before RMSF calculations.

Methods
-------
centroid (aligned-mean representative frame)
    Finds the frame closest to the aligned mean structure.
    Uses all protein atoms by default to capture side chain conformations.
    Best for: Finding a representative equilibrium conformation while
    removing rigid-body translation and rotation effects.

average
    Aligns to an average structure computed from all frames.
    Note: The average structure is synthetic and may have unphysical geometry
    (e.g., distorted bond lengths/angles).
    Best for: Pure mathematical measure of thermal fluctuations around the mean.

frame
    Uses a specific frame as the reference (user-specified).
    Best for: Analyzing fluctuations relative to a known functional state,
    such as a catalytically competent conformation.

See Also
--------
The RMSF best-practices documentation provides detailed guidance on when to
use each reference selection method.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import numpy as np
from numpy.typing import NDArray

if TYPE_CHECKING:
    import MDAnalysis as mda
    from MDAnalysis.core.universe import Universe

from polyzymd.analyses.shared.alignment import ReferenceMode

LOGGER = logging.getLogger(__name__)


def _kabsch_align_to_reference(
    coordinates: NDArray[np.float64],
    reference: NDArray[np.float64],
) -> NDArray[np.float64]:
    """Align coordinates to a reference with Kabsch superposition.

    Parameters
    ----------
    coordinates : NDArray[np.float64]
        Coordinates to align with shape (n_atoms, 3).
    reference : NDArray[np.float64]
        Reference coordinates with shape (n_atoms, 3).

    Returns
    -------
    NDArray[np.float64]
        Coordinates aligned to the centered reference.
    """
    centered = coordinates - np.mean(coordinates, axis=0)
    ref_centered = reference - np.mean(reference, axis=0)

    covariance = centered.T @ ref_centered
    u_matrix, _, v_t = np.linalg.svd(covariance)

    rotation = u_matrix @ v_t
    if np.linalg.det(rotation) < 0:
        u_matrix[:, -1] *= -1.0
        rotation = u_matrix @ v_t

    return centered @ rotation


def _find_frame_closest_to_aligned_mean(coordinates: NDArray[np.float64]) -> tuple[int, float]:
    """Find frame closest to aligned mean coordinates.

    Parameters
    ----------
    coordinates : NDArray[np.float64]
        Raw coordinates with shape (n_frames, n_atoms, 3).

    Returns
    -------
    tuple[int, float]
        Relative index of frame closest to the aligned mean structure and the
        corresponding RMSD to aligned mean.
    """
    if coordinates.ndim != 3:
        raise ValueError("coordinates must have shape (n_frames, n_atoms, 3)")

    n_frames = coordinates.shape[0]
    if n_frames == 1:
        return 0, 0.0

    reference = coordinates[0]
    aligned = np.empty_like(coordinates)
    for frame_idx in range(n_frames):
        aligned[frame_idx] = _kabsch_align_to_reference(coordinates[frame_idx], reference)

    mean_coordinates = np.mean(aligned, axis=0)
    rmsd_to_mean = np.sqrt(np.mean((aligned - mean_coordinates) ** 2, axis=(1, 2)))
    relative_idx = int(np.argmin(rmsd_to_mean))
    return relative_idx, float(rmsd_to_mean[relative_idx])


[docs] def find_centroid_frame( universe: "Universe", selection: str = "protein", start_frame: int = 0, stop_frame: int | None = None, verbose: bool = True, ) -> int: """Find a representative aligned frame. This function identifies the frame closest to the aligned mean structure. It first performs rigid-body alignment of each frame to a common reference, computes the mean coordinates in aligned space, then returns the trajectory frame with minimum RMSD to that aligned mean. This approach avoids contamination from translation/rotation and provides a scientifically defensible representative frame for downstream alignment and RMSF calculations. Parameters ---------- universe : MDAnalysis.Universe Universe containing the trajectory to analyze. selection : str, optional MDAnalysis selection string for atoms used to find the representative frame. Default is "protein" (all protein atoms) to capture both backbone and side chain conformations. start_frame : int, optional First frame to include in analysis (0-indexed). Default is 0. Use this to skip equilibration frames. stop_frame : int, optional Last frame to include (exclusive). Default is None (all frames). verbose : bool, optional If True, log progress messages. Default is True. Returns ------- int Index of the representative frame (0-indexed, relative to full trajectory, not to ``start_frame``). Notes ----- The algorithm aligns all candidate frames to a common reference frame, computes the aligned mean structure, then selects the frame with minimum RMSD to that mean. Using all protein atoms (default) rather than just CA atoms captures the full conformational state including side chain rotamers. Examples -------- >>> import MDAnalysis as mda >>> u = mda.Universe("topology.pdb", "trajectory.dcd") >>> # Find centroid after 100 frames of equilibration >>> centroid_idx = find_centroid_frame(u, start_frame=100) >>> print(f"Representative aligned frame: {centroid_idx}") >>> # Use only backbone atoms >>> centroid_idx = find_centroid_frame(u, selection="protein and backbone") See Also -------- find_reference_frame : High-level function supporting multiple methods """ # Select atoms for representative-frame calculation atoms = universe.select_atoms(selection) if len(atoms) == 0: from polyzymd.analyses.shared.diagnostics import get_selection_diagnostics diag = get_selection_diagnostics(universe, selection) raise ValueError(f"Selection '{selection}' matched no atoms.\n\n{diag}") if verbose: LOGGER.info( f"Finding representative aligned frame using {len(atoms)} atoms from '{selection}'" ) # Determine frame range n_frames_total = len(universe.trajectory) if stop_frame is None: stop_frame = n_frames_total # Validate frame range if start_frame < 0 or start_frame >= n_frames_total: raise ValueError(f"start_frame={start_frame} is out of range [0, {n_frames_total})") if stop_frame <= start_frame: raise ValueError(f"stop_frame={stop_frame} must be greater than start_frame={start_frame}") n_frames = stop_frame - start_frame if verbose: LOGGER.info(f"Analyzing frames {start_frame} to {stop_frame - 1} ({n_frames} frames)") # Collect coordinates for all frames in range if verbose: LOGGER.info("Collecting coordinates...") coordinates = np.empty((n_frames, len(atoms), 3), dtype=np.float64) for i, _ in enumerate(universe.trajectory[start_frame:stop_frame]): coordinates[i] = atoms.positions if verbose: LOGGER.info("Aligning frames and selecting representative frame...") relative_idx, rmsd_to_mean = _find_frame_closest_to_aligned_mean(coordinates) # Convert to absolute frame index representative_frame_idx = relative_idx + start_frame if verbose: LOGGER.info( f"Representative aligned frame: {representative_frame_idx} " f"(RMSD to aligned mean: {rmsd_to_mean:.3f} Å)" ) return representative_frame_idx
[docs] def find_reference_frame( universe: "Universe", mode: ReferenceMode = "centroid", selection: str = "protein", start_frame: int = 0, stop_frame: int | None = None, specific_frame: int | None = None, verbose: bool = True, ) -> int | None: """Find a reference frame for trajectory alignment. This is the high-level interface for selecting a reference structure for RMSF calculations. It supports multiple methods for choosing the reference, each appropriate for different scientific questions. Parameters ---------- universe : MDAnalysis.Universe Universe containing the trajectory. mode : {"centroid", "average", "frame", "external"}, optional Method for selecting the reference. Default is "centroid". - "centroid": Representative aligned frame mode. Returns the frame index closest to the aligned mean structure. - "average": Use average structure as reference. Returns None (caller should use AverageStructure). - "frame": Use a specific frame specified by `specific_frame`. Returns the specified frame index (converted to 0-indexed). - "external": Use an external PDB file as reference. Returns None (alignment handled by align_trajectory). selection : str, optional MDAnalysis selection string for atoms to use. Default is "protein". Only used for "centroid" mode. start_frame : int, optional First frame for analysis (0-indexed). Default is 0. stop_frame : int, optional Last frame for analysis (exclusive). Default is None. specific_frame : int, optional Frame index to use when mode="frame" (1-indexed, PyMOL convention). Required when mode="frame". verbose : bool, optional Log progress messages. Default is True. Returns ------- int or None Frame index (0-indexed) to use as reference, or None if mode="average" (indicating the caller should compute an average structure). Raises ------ ValueError If mode="frame" but specific_frame is not provided. If specific_frame is out of range. Examples -------- >>> # Find a representative aligned frame (equilibrium conformation) >>> ref_frame = find_reference_frame(u, mode="centroid", start_frame=100) >>> # Use average structure >>> ref_frame = find_reference_frame(u, mode="average") >>> # ref_frame is None, use AverageStructure instead >>> # Use a specific frame (e.g., catalytically competent state) >>> ref_frame = find_reference_frame(u, mode="frame", specific_frame=500) See Also -------- find_centroid_frame : Low-level representative frame selection """ if mode == "centroid": return find_centroid_frame( universe, selection=selection, start_frame=start_frame, stop_frame=stop_frame, verbose=verbose, ) elif mode == "average": if verbose: LOGGER.info("Using average structure as reference (will be computed during alignment)") return None # Caller should use AverageStructure elif mode == "frame": if specific_frame is None: raise ValueError("specific_frame is required when mode='frame'") # Convert from 1-indexed (PyMOL) to 0-indexed frame_idx = specific_frame - 1 # Validate n_frames = len(universe.trajectory) if frame_idx < 0 or frame_idx >= n_frames: raise ValueError( f"specific_frame={specific_frame} (1-indexed) is out of range. " f"Valid range: 1 to {n_frames}" ) if verbose: LOGGER.info(f"Using user-specified frame {specific_frame} (0-indexed: {frame_idx})") return frame_idx elif mode == "external": # External PDB reference — no trajectory frame to return. # Alignment to external PDB is handled by align_trajectory() in alignment.py. if verbose: LOGGER.info("Using external PDB as reference (alignment handled by align_trajectory)") return None else: raise ValueError( f"Unknown mode: {mode}. Must be 'centroid', 'average', 'frame', or 'external'" )
[docs] def get_reference_mode_description(mode: ReferenceMode) -> str: """Get a human-readable description of a reference mode. Parameters ---------- mode : {"centroid", "average", "frame", "external"} The reference mode. Returns ------- str Description of what this mode represents. """ descriptions = { "centroid": ( "Representative aligned frame (closest to aligned mean) - " "measures flexibility around a representative equilibrium conformation" ), "average": ("Average structure - pure thermal fluctuations around the mathematical mean"), "frame": ( "Specific frame - " "fluctuations relative to a user-defined reference (e.g., functional state)" ), "external": ( "External PDB structure - " "deviations from a condition-independent reference geometry " "(e.g., catalytically competent crystal structure)" ), } return descriptions.get(mode, f"Unknown mode: {mode}")