Source code for polyzymd.config.schema

"""
Configuration schema for PolyzyMD simulations.

This module defines Pydantic models for all configuration sections,
providing validation, type safety, and YAML/JSON serialization support.
"""

from __future__ import annotations

import logging
import re
import shlex
from enum import Enum
from pathlib import Path
from string import Formatter
from typing import Any, Literal, Self

from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

LOGGER = logging.getLogger(__name__)

_TOKEN_UNSAFE_CHARS = re.compile(r"[^A-Za-z0-9_-]+")
_TOKEN_UNDERSCORES = re.compile(r"_+")


def _format_safe_token(value: object) -> str:
    """Convert a value into a safe lowercase naming token.

    Parameters
    ----------
    value : object
        Value to normalize for a directory or job-name token.

    Returns
    -------
    str
        Sanitized token using only alphanumerics, underscores, and hyphens.
    """
    token = str(value).strip().lower()
    token = _TOKEN_UNSAFE_CHARS.sub("_", token)
    token = _TOKEN_UNDERSCORES.sub("_", token)
    return token.strip("_") or "unknown"


def _format_decimal_token(value: float) -> str:
    """Format a decimal value without raw decimal points.

    Parameters
    ----------
    value : float
        Numeric value to format.

    Returns
    -------
    str
        Compact token, using ``p`` as the decimal separator when needed.
    """
    formatted = f"{value:g}"
    return formatted.replace(".", "p")


[docs] class ChargeMethod(str, Enum): """Supported charge assignment methods for small molecules.""" NAGL = "nagl" ESPALOMA = "espaloma" AM1BCC = "am1bcc"
[docs] class WaterModel(str, Enum): """Supported water models.""" TIP3P = "tip3p" SPCE = "spce" TIP4P = "tip4p" TIP4PEW = "tip4pew" OPC = "opc"
[docs] class BoxShape(str, Enum): """Supported simulation box shapes.""" CUBE = "cube" RHOMBIC_DODECAHEDRON = "rhombic_dodecahedron" TRUNCATED_OCTAHEDRON = "truncated_octahedron"
[docs] class Ensemble(str, Enum): """Thermodynamic ensemble types.""" NVT = "NVT" NPT = "NPT" NVE = "NVE"
[docs] class ThermostatType(str, Enum): """Supported thermostat types.""" LANGEVIN_MIDDLE = "LangevinMiddle" LANGEVIN = "Langevin" ANDERSEN = "Andersen" NOSE_HOOVER = "NoseHoover"
[docs] class BarostatType(str, Enum): """Supported barostat types.""" MONTE_CARLO = "MC" MONTE_CARLO_ANISOTROPIC = "MCA"
[docs] class RestraintType(str, Enum): """Types of restraints that can be applied.""" FLAT_BOTTOM = "flat_bottom" HARMONIC = "harmonic" UPPER_WALL = "upper_wall" LOWER_WALL = "lower_wall"
# ============================================================================= # Enzyme Configuration # =============================================================================
[docs] class EnzymeConfig(BaseModel): """Configuration for the enzyme/protein component. Attributes: name: Identifier for the enzyme (e.g., "LipA") pdb_path: Path to the PDB file containing the enzyme structure description: Optional description of the enzyme """ name: str = Field(..., description="Enzyme identifier") pdb_path: Path = Field(..., description="Path to enzyme PDB file") description: str | None = Field(None, description="Optional description")
[docs] @field_validator("pdb_path") @classmethod def validate_pdb_path(cls, v: Path) -> Path: """Validate that PDB path has correct extension.""" if v.suffix.lower() != ".pdb": raise ValueError(f"Expected .pdb file, got {v.suffix}") return v
# ============================================================================= # Substrate/Ligand Configuration # =============================================================================
[docs] class SubstrateConfig(BaseModel): """Configuration for the docked substrate/ligand. Attributes: name: Identifier for the substrate (e.g., "Resorufin-Butyrate") sdf_path: Path to SDF file with docked conformers conformer_index: Which conformer to use (0-indexed) charge_method: Method for assigning partial charges residue_name: 3-letter residue name for topology (default: "LIG") """ name: str = Field(..., description="Substrate identifier") sdf_path: Path = Field(..., description="Path to docked conformers SDF") conformer_index: int = Field(0, ge=0, description="Index of conformer to use") charge_method: ChargeMethod = Field(ChargeMethod.NAGL, description="Charge assignment method") residue_name: str = Field("LIG", max_length=3, description="3-letter residue name")
[docs] @field_validator("sdf_path") @classmethod def validate_sdf_path(cls, v: Path) -> Path: """Validate that SDF path has correct extension.""" if v.suffix.lower() != ".sdf": raise ValueError(f"Expected .sdf file, got {v.suffix}") return v
# ============================================================================= # Polymer Packing Configuration # =============================================================================
[docs] class PolymerPackingConfig(BaseModel): """Settings for packing polymers around the solute. Controls the box size and PACKMOL behavior when packing polymers around the protein-ligand complex. Attributes: padding: Box padding around the solute in nanometers. Larger values give polymers more room and can speed up PACKMOL convergence. tolerance: Minimum molecular spacing for PACKMOL in Angstrom. movebadrandom: When ``True``, pass the ``movebadrandom`` keyword to PACKMOL. This places badly-packed molecules at random positions in the box rather than near well-packed neighbours, which improves convergence for dense or heterogeneous systems (many unique chain types). Has no effect when only a single chain type is present. Default is ``False`` (PACKMOL default behaviour is preserved). box_vectors: Optional explicit box dimensions ``[Lx, Ly, Lz]`` in nanometers. When set, overrides the auto-computed bounding box plus *padding*. The protein is centered at the midpoint of the box. Default is ``None`` (auto-compute from solute bounding box + padding). Example: >>> PolymerPackingConfig(padding=2.5, movebadrandom=True) >>> PolymerPackingConfig(box_vectors=[8.0, 10.0, 12.0]) """ padding: float = Field(2.0, gt=0.0, description="Box padding around solute (nm)") tolerance: float = Field(2.0, gt=0.0, description="PACKMOL tolerance (Angstrom)") movebadrandom: bool = Field( False, description=( "Pass the 'movebadrandom' keyword to PACKMOL. " "Improves convergence for heterogeneous polymer systems " "by placing badly-packed molecules at random box positions." ), ) box_vectors: list[float] | None = Field( None, min_length=3, max_length=3, description=( "Optional explicit box dimensions [Lx, Ly, Lz] in nanometers. " "Overrides auto-computed bounding box + padding. " "Protein is centered at the midpoint of this box." ), )
# ============================================================================= # Polymer Configuration # =============================================================================
[docs] class MonomerSpec(BaseModel): """Specification for a single monomer type in a co-polymer. For dynamic polymer generation, provide the raw (unactivated) monomer SMILES. The system will run initiation reactions to create the active fragments. Attributes: label: Single character label for this monomer (e.g., "A", "B") probability: Probability of selecting this monomer (0-1) name: Optional full name (e.g., "SBMA", "EGPMA") smiles: Raw monomer SMILES string (required for dynamic generation) residue_name: 3-character PDB residue name (auto-generated if not provided) """ label: str = Field(..., min_length=1, max_length=1, description="Monomer label") probability: float = Field(..., ge=0.0, le=1.0, description="Selection probability") name: str | None = Field(None, description="Full monomer name") smiles: str | None = Field( None, description="Raw monomer SMILES (required for dynamic generation mode)", ) residue_name: str | None = Field( None, max_length=3, description="3-char PDB residue name (auto-generated from name if not provided)", )
[docs] @model_validator(mode="after") def auto_generate_residue_name(self) -> "MonomerSpec": """Auto-generate residue name from monomer name if not provided.""" if self.residue_name is None and self.name is not None: # Use first 3 characters of name, uppercase object.__setattr__(self, "residue_name", self.name[:3].upper()) return self
[docs] class PolymerGenerationMode(str, Enum): """Mode for polymer generation.""" CACHED = "cached" # Load pre-built SDF files from disk DYNAMIC = "dynamic" # Generate polymers on-the-fly using Polymerist
[docs] class ReactionConfig(BaseModel): """Paths to reaction templates for ATRP polymer generation. These .rxn files define the chemical transformations used to create polymer fragments from raw monomer SMILES. For ATRP, this includes: - Initiation: Activates the vinyl group (e.g., chlorination) - Polymerization: Creates chain-extending fragments - Termination: Restores the alkene for chain ends You can use "default" as a special value to use the bundled ATRP methacrylate reaction templates that ship with PolyzyMD. Example: reactions: initiation: "default" polymerization: "default" termination: "default" Attributes: initiation: Path to the initiation reaction template (.rxn) or "default" polymerization: Path to the polymerization reaction template (.rxn) or "default" termination: Path to the termination reaction template (.rxn) or "default" """ initiation: Path = Field(..., description="Path to ATRP initiation .rxn file or 'default'") polymerization: Path = Field( ..., description="Path to ATRP polymerization .rxn file or 'default'" ) termination: Path = Field(..., description="Path to ATRP termination .rxn file or 'default'")
[docs] @field_validator("initiation", "polymerization", "termination", mode="before") @classmethod def resolve_default_paths(cls, v, info) -> Path: """Resolve 'default' to bundled ATRP reaction paths.""" from polyzymd.data.reactions import ( get_atrp_initiation_path, get_atrp_polymerization_path, get_atrp_termination_path, ) if isinstance(v, str) and v.lower() == "default": # Map field name to the appropriate getter field_to_getter = { "initiation": get_atrp_initiation_path, "polymerization": get_atrp_polymerization_path, "termination": get_atrp_termination_path, } getter = field_to_getter.get(info.field_name) if getter: return getter() raise ValueError(f"Unknown reaction field: {info.field_name}") # Convert string to Path if needed if isinstance(v, str): v = Path(v) # Validate .rxn extension if isinstance(v, Path) and v.suffix.lower() != ".rxn": raise ValueError(f"Expected .rxn file, got {v.suffix}") return v
[docs] class PolymerConfig(BaseModel): """Configuration for polymer components. Supports two generation modes: - "cached": Load pre-built polymer SDF files from sdf_directory - "dynamic": Generate polymers on-the-fly using Polymerist from SMILES For dynamic mode, you must provide: - SMILES for each monomer in monomers[].smiles - Reaction templates in the reactions field Attributes: enabled: Whether to include polymers in the system generation_mode: "cached" for pre-built SDFs, "dynamic" for on-the-fly generation type_prefix: Prefix for polymer type in filenames (e.g., "SBMA-EGPMA") monomers: List of monomer specifications with probabilities (and SMILES for dynamic) length: Number of monomer units per polymer chain count: Number of polymer chains to add sdf_directory: Path to pre-built polymer SDF files (for cached mode) reactions: Reaction templates for ATRP (required for dynamic mode) charger: Charge assignment method for generated polymers max_retries: Maximum retries for polymer generation (ring-piercing failures) cache_directory: Directory for caching generated polymers and fragments packing: Settings for packing polymers around the solute random_seed: Random seed for polymer sequence generation (for reproducibility) """ enabled: bool = Field(True, description="Include polymers in system") generation_mode: PolymerGenerationMode = Field( PolymerGenerationMode.CACHED, description="Polymer generation mode: 'cached' (pre-built SDFs) or 'dynamic' (generate from SMILES)", ) type_prefix: str = Field(..., description="Polymer type prefix for filenames") monomers: list[MonomerSpec] = Field(..., min_length=1, description="Monomer specifications") length: int = Field(..., ge=1, description="Monomers per chain") count: int = Field(..., ge=1, description="Number of polymer chains") sdf_directory: Path | None = Field( None, description="Directory with pre-built polymer SDFs (for cached mode)" ) reactions: ReactionConfig | None = Field( None, description="ATRP reaction templates (required for dynamic mode)" ) charger: ChargeMethod = Field( ChargeMethod.NAGL, description="Charge assignment method for generated polymers", ) max_retries: int = Field( 10, ge=1, description="Maximum retries for polymer generation (handles ring-piercing failures)", ) cache_directory: Path = Field( Path(".polymer_cache"), description="Cache directory for generated polymers and fragments" ) packing: PolymerPackingConfig = Field( default_factory=PolymerPackingConfig, description="Polymer packing settings (padding, tolerance)", ) random_seed: int | None = Field( None, description="Random seed for polymer sequence generation. If None, uses replicate number.", )
[docs] @model_validator(mode="after") def validate_probabilities_sum_to_one(self) -> "PolymerConfig": """Ensure monomer probabilities sum to 1.0.""" if self.enabled: total = sum(m.probability for m in self.monomers) if abs(total - 1.0) > 1e-6: raise ValueError(f"Monomer probabilities must sum to 1.0, got {total}") return self
[docs] @model_validator(mode="after") def validate_generation_mode_requirements(self) -> "PolymerConfig": """Validate that required fields are present for the selected generation mode.""" if not self.enabled: return self if self.generation_mode == PolymerGenerationMode.DYNAMIC: # Dynamic mode requires SMILES for all monomers missing_smiles = [m.label for m in self.monomers if m.smiles is None] if missing_smiles: raise ValueError( f"Dynamic generation mode requires 'smiles' for all monomers. " f"Missing SMILES for monomers: {missing_smiles}" ) # Dynamic mode requires reaction templates if self.reactions is None: raise ValueError( "Dynamic generation mode requires 'reactions' field with ATRP reaction templates" ) elif self.generation_mode == PolymerGenerationMode.CACHED: # Cached mode requires sdf_directory if self.sdf_directory is None: raise ValueError( "Cached generation mode requires 'sdf_directory' with pre-built polymer SDFs" ) return self
# ============================================================================= # Solvent Configuration # =============================================================================
[docs] class CoSolventSpec(BaseModel): """Specification for a co-solvent component. You must specify either ``mole_fraction`` or ``concentration``, not both. For co-solvents in the built-in library (dmso, dmf, urea, ethanol, etc.), you can omit the smiles and density fields - they will be looked up automatically. Example (library co-solvent with mole fraction): >>> CoSolventSpec(name="dmso", mole_fraction=0.10) Example (library co-solvent with concentration): >>> CoSolventSpec(name="urea", concentration=2.0) Example (custom co-solvent): >>> CoSolventSpec( ... name="my_solvent", ... smiles="CCOC(=O)C", ... mole_fraction=0.05 ... ) """ name: str = Field(..., description="Co-solvent identifier") smiles: str | None = Field(None, description="SMILES string (optional if in library)") # Specification method 1: Mole fraction mole_fraction: float | None = Field( None, gt=0.0, lt=1.0, description="Mole fraction (0-1), e.g., 0.10 for 10 mol%", ) # Specification method 2: Molar concentration concentration: float | None = Field(None, gt=0.0, description="Molar concentration (mol/L)") # Optional physical property for library metadata density: float | None = Field( None, gt=0.0, description="Optional density in g/mL for co-solvent metadata", ) residue_name: str | None = Field(None, max_length=3, description="Residue name")
[docs] @model_validator(mode="before") @classmethod def reject_volume_fraction(cls, data: Any) -> Any: """Reject removed volume-fraction configuration keys.""" if isinstance(data, dict) and "volume_fraction" in data: raise ValueError( "Co-solvent 'volume_fraction' has been removed. Use 'mole_fraction' " "for solvent mole fractions or 'concentration' for molarity; values are " "not converted automatically." ) return data
[docs] @model_validator(mode="after") def validate_and_populate(self) -> "CoSolventSpec": """Validate specification and populate missing fields from library.""" from polyzymd.data.cosolvent_library import get_cosolvent # Check that exactly one composition method is specified has_mole_fraction = self.mole_fraction is not None has_conc = self.concentration is not None if not has_mole_fraction and not has_conc: raise ValueError( f"Co-solvent '{self.name}': Must specify either 'mole_fraction' " f"or 'concentration'" ) if has_mole_fraction and has_conc: raise ValueError( f"Co-solvent '{self.name}': Cannot specify both 'mole_fraction' " f"and 'concentration' - choose one" ) # Look up from library library_data = get_cosolvent(self.name) # Populate SMILES if not provided if self.smiles is None: if library_data: object.__setattr__(self, "smiles", library_data.smiles) else: raise ValueError( f"Co-solvent '{self.name}' not in library. Please provide 'smiles' field." ) # Preserve library density as metadata when available if self.density is None and library_data: object.__setattr__(self, "density", library_data.density) # Set default residue name if self.residue_name is None: object.__setattr__(self, "residue_name", self.name[:3].upper()) return self
[docs] class PrimarySolventConfig(BaseModel): """Configuration for the primary solvent (usually water). Attributes: type: Solvent type identifier model: Water model to use (if type is "water") """ type: str = Field("water", description="Primary solvent type") model: WaterModel = Field(WaterModel.TIP3P, description="Water model")
[docs] class IonConfig(BaseModel): """Configuration for ions in the solvent. ``nacl_concentration`` is a NaCl-equivalent target for the final ion concentration. When ``neutralize`` is true, PolyzyMD adjusts Na+ and Cl- counts to cancel solute charge while keeping the achieved concentration near this target; it is not additional salt layered on top of neutralizing ions. """ neutralize: bool = Field(True, description="Neutralize system charge") nacl_concentration: float = Field( 0.0, ge=0.0, description="NaCl-equivalent target concentration (mol/L)", ) kcl_concentration: float = Field(0.0, ge=0.0, description="KCl conc. (mol/L)") mgcl2_concentration: float = Field(0.0, ge=0.0, description="MgCl2 conc. (mol/L)")
[docs] class BoxConfig(BaseModel): """Configuration for the simulation box. Attributes: padding: Distance from solute to box edge in nm shape: Box geometry target_density: Target density in g/mL tolerance: Minimum molecular spacing for PACKMOL in Angstrom """ padding: float = Field(1.2, gt=0.0, description="Box padding (nm)") shape: BoxShape = Field(BoxShape.RHOMBIC_DODECAHEDRON, description="Box shape") target_density: float = Field(1.0, gt=0.0, description="Target density (g/mL)") tolerance: float = Field(2.0, gt=0.0, description="PACKMOL tolerance (Angstrom)")
[docs] class SolventConfig(BaseModel): """Complete solvent configuration.""" primary: PrimarySolventConfig = Field( default_factory=PrimarySolventConfig, description="Primary solvent" ) co_solvents: list[CoSolventSpec] = Field(default_factory=list, description="Co-solvents") ions: IonConfig = Field(default_factory=IonConfig, description="Ion settings") box: BoxConfig = Field(default_factory=BoxConfig, description="Box settings")
[docs] @model_validator(mode="after") def validate_mole_fractions(self) -> "SolventConfig": """Ensure co-solvent mole fractions leave room for water.""" total = sum(cs.mole_fraction for cs in self.co_solvents if cs.mole_fraction is not None) if total >= 1.0: raise ValueError(f"Total co-solvent mole fraction must be < 1.0, got {total}") return self
# ============================================================================= # Restraint Configuration # =============================================================================
[docs] class AtomSelectionConfig(BaseModel): """Configuration for selecting atoms for restraints. Uses MDAnalysis-compatible selection syntax for flexibility. Attributes: selection: MDAnalysis selection string (e.g., "resid 77 and name OG") description: Optional human-readable description """ selection: str = Field(..., description="MDAnalysis selection string") description: str | None = Field(None, description="Human-readable description")
[docs] class RestraintConfig(BaseModel): """Configuration for a single restraint. Attributes: type: Type of restraint (flat_bottom, harmonic, etc.) name: Identifier for this restraint atom1: First atom selection atom2: Second atom selection distance: Target/threshold distance in Angstroms force_constant: Force constant in kJ/mol/nm^2 enabled: Whether this restraint is active """ type: RestraintType = Field(..., description="Restraint type") name: str = Field(..., description="Restraint identifier") atom1: AtomSelectionConfig = Field(..., description="First atom selection") atom2: AtomSelectionConfig = Field(..., description="Second atom selection") distance: float = Field(..., gt=0.0, description="Distance threshold (Angstrom)") force_constant: float = Field(10000.0, gt=0.0, description="Force constant (kJ/mol/nm^2)") enabled: bool = Field(True, description="Whether restraint is active")
# ============================================================================= # Thermodynamics Configuration # =============================================================================
[docs] class ThermodynamicsConfig(BaseModel): """Thermodynamic conditions for the simulation. Attributes: temperature: System temperature in Kelvin pressure: System pressure in atmospheres (for NPT) salt_concentration: Ionic strength in mol/L (deprecated, use solvent.ions) """ temperature: float = Field(..., gt=0.0, description="Temperature (K)") pressure: float = Field(1.0, gt=0.0, description="Pressure (atm)")
# ============================================================================= # Simulation Phase Configuration # =============================================================================
[docs] class SimulationPhaseConfig(BaseModel): """Configuration for a single simulation phase (equilibration or production). Attributes: ensemble: Thermodynamic ensemble (NVT, NPT) duration: Simulation duration in nanoseconds samples: Number of trajectory frames to save report_interval: Explicit reporter interval in MD steps time_step: Integration time step in femtoseconds thermostat: Thermostat type thermostat_timescale: Thermostat coupling timescale in ps barostat: Barostat type (for NPT) barostat_frequency: Barostat update frequency (steps) checkpoint_interval: Wall-time interval (seconds) between restart checkpoints for preemption resilience """ ensemble: Ensemble = Field(..., description="Thermodynamic ensemble") duration: float = Field(..., gt=0.0, description="Duration (ns)") samples: int = Field(..., ge=1, description="Trajectory frames to save") report_interval: int = Field(..., ge=1, description="Reporter interval in MD steps") time_step: float = Field(2.0, gt=0.0, description="Time step (fs)") thermostat: ThermostatType = Field( ThermostatType.LANGEVIN_MIDDLE, description="Thermostat type" ) thermostat_timescale: float = Field(1.0, gt=0.0, description="Thermostat timescale (ps)") barostat: BarostatType | None = Field(None, description="Barostat type") barostat_frequency: int = Field(25, ge=1, description="Barostat update frequency") checkpoint_interval: float = Field( ..., gt=0.0, description=( "Wall-time interval in seconds between restart checkpoints. " "Controls how frequently simulation state is saved for automatic " "restart on SLURM preemption or hard kill. Independent of " "trajectory/reporter output frequency. Must be positive so " "unexpected termination recovery remains enabled." ), )
[docs] @model_validator(mode="after") def validate_ensemble_barostat(self) -> "SimulationPhaseConfig": """Ensure NPT ensemble has a barostat configured.""" if self.ensemble == Ensemble.NPT and self.barostat is None: self.barostat = BarostatType.MONTE_CARLO return self
# ============================================================================= # Equilibration Stage Configuration (Multi-Stage Equilibration) # =============================================================================
[docs] class PositionRestraintConfig(BaseModel): """Configuration for positional restraints on an atom group. Position restraints apply a harmonic potential to keep atoms near their initial coordinates. This is commonly used during equilibration to prevent large structural changes while the system relaxes. Attributes: group: Predefined atom group name force_constant: Force constant in kJ/mol/nm^2 (4184.0 = 1.0 kcal/mol/A^2) """ group: str = Field( ..., description=( "Atom group: protein_heavy, protein_backbone, protein_calpha, " "ligand_heavy, polymer_heavy, solvent, water_only, ions_only, cosolvents_only" ), ) force_constant: float = Field( 4184.0, # 1.0 kcal/mol/A^2 in kJ/mol/nm^2 gt=0.0, description="Force constant (kJ/mol/nm^2). Default 4184.0 = 1.0 kcal/mol/A^2", )
[docs] @field_validator("group") @classmethod def validate_group_name(cls, v: str) -> str: """Validate that the group name is a recognized predefined group.""" from polyzymd.core.atom_groups import PREDEFINED_GROUPS if v not in PREDEFINED_GROUPS: raise ValueError( f"Unknown atom group: '{v}'. Valid groups: {sorted(PREDEFINED_GROUPS)}" ) return v
[docs] class EquilibrationStageConfig(BaseModel): """Configuration for a single equilibration stage. Supports two temperature modes: 1. Constant temperature: Set 'temperature' field. 2. Temperature ramping (simulated annealing): Set 'temperature_start' and 'temperature_end' fields. Position restraints can be applied to hold specific atom groups in place during the stage. Attributes: name: Stage identifier (used in output paths) duration: Stage duration in nanoseconds samples: Number of trajectory frames to save ensemble: Thermodynamic ensemble (NVT or NPT) temperature: Constant temperature in K (mutually exclusive with ramping) temperature_start: Starting temperature for ramping in K temperature_end: Ending temperature for ramping in K temperature_increment: Temperature increment per update in K temperature_interval: Time between temperature updates in fs position_restraints: List of position restraints for this stage time_step: Optional time step override in fs thermostat: Optional thermostat type override thermostat_timescale: Optional thermostat timescale override in ps barostat: Optional barostat type (for NPT ensemble) barostat_frequency: Optional barostat update frequency """ name: str = Field(..., description="Stage identifier (used in output paths)") duration: float = Field(..., gt=0.0, description="Duration (ns)") samples: int = Field(100, ge=1, description="Trajectory frames to save") ensemble: Ensemble = Field(Ensemble.NVT, description="Thermodynamic ensemble") # Constant temperature mode temperature: float | None = Field( None, gt=0.0, description="Constant temperature (K). Mutually exclusive with temperature ramping.", ) # Temperature ramping mode (simulated annealing) temperature_start: float | None = Field( None, gt=0.0, description="Starting temperature for ramping (K)" ) temperature_end: float | None = Field( None, gt=0.0, description="Ending temperature for ramping (K)" ) temperature_increment: float = Field( 1.0, gt=0.0, description="Temperature increment per update (K)" ) temperature_interval: float = Field( 1200.0, gt=0.0, description="Time between temperature updates (fs)" ) # Position restraints position_restraints: list[PositionRestraintConfig] = Field( default_factory=list, description="Position restraints for this stage" ) # Optional overrides (inherit from parent/defaults if not specified) time_step: float | None = Field(None, gt=0.0, description="Time step (fs)") thermostat: ThermostatType | None = Field(None, description="Thermostat type") thermostat_timescale: float | None = Field( None, gt=0.0, description="Thermostat timescale (ps)" ) barostat: BarostatType | None = Field(None, description="Barostat type (for NPT)") barostat_frequency: int | None = Field(None, ge=1, description="Barostat update frequency")
[docs] @model_validator(mode="after") def validate_temperature_mode(self) -> "EquilibrationStageConfig": """Ensure valid temperature specification.""" has_constant = self.temperature is not None has_start = self.temperature_start is not None has_end = self.temperature_end is not None if has_constant and (has_start or has_end): raise ValueError( "Cannot specify both 'temperature' and temperature ramping " "('temperature_start'/'temperature_end')" ) if not has_constant and not (has_start and has_end): raise ValueError( "Must specify either 'temperature' for constant temperature, " "or both 'temperature_start' and 'temperature_end' for ramping" ) if has_start and has_end and self.temperature_start > self.temperature_end: raise ValueError( f"temperature_start ({self.temperature_start}) must be <= " f"temperature_end ({self.temperature_end})" ) return self
[docs] @model_validator(mode="after") def validate_npt_barostat(self) -> "EquilibrationStageConfig": """Ensure NPT ensemble has a barostat configured.""" if self.ensemble == Ensemble.NPT and self.barostat is None: self.barostat = BarostatType.MONTE_CARLO return self
@property def is_temperature_ramping(self) -> bool: """Check if this stage uses temperature ramping.""" return self.temperature_start is not None
[docs] def get_start_temperature(self) -> float: """Get the starting temperature of this stage.""" if self.is_temperature_ramping: return self.temperature_start return self.temperature
[docs] def get_final_temperature(self) -> float: """Get the final temperature of this stage.""" if self.is_temperature_ramping: return self.temperature_end return self.temperature
[docs] class SimulationPhasesConfig(BaseModel): """Configuration for all simulation phases. Attributes: equilibration_stages: Multi-stage equilibration protocol production: Production phase settings """ model_config = ConfigDict(extra="forbid") equilibration_stages: list[EquilibrationStageConfig] | None = Field( None, description="Multi-stage equilibration protocol with position restraints" ) production: SimulationPhaseConfig = Field(..., description="Production settings")
[docs] @model_validator(mode="after") def validate_equilibration_mode(self) -> "SimulationPhasesConfig": """Ensure staged equilibration is configured.""" if self.equilibration_stages is None: raise ValueError( "PolyzyMD now requires 'equilibration_stages' in simulation_phases. " "The legacy 'equilibration' block is no longer supported. " "Convert your config to one or more staged equilibration entries." ) if len(self.equilibration_stages) == 0: raise ValueError("'equilibration_stages' must contain at least one stage") return self
@property def total_equilibration_duration(self) -> float: """Total equilibration duration in nanoseconds. Returns the sum of all stage durations. """ return sum(stage.duration for stage in self.equilibration_stages) @property def total_equilibration_samples(self) -> int: """Total equilibration trajectory samples. Returns the sum of all stage samples. """ return sum(stage.samples for stage in self.equilibration_stages)
# ============================================================================= # Output Configuration # =============================================================================
[docs] def expand_path(path: Path) -> Path: """Expand environment variables and user home in a path. Supports: - $VAR and ${VAR} syntax for environment variables - ~ for user home directory Args: path: Path that may contain environment variables Returns: Path with variables expanded """ import os expanded = os.path.expandvars(str(path)) expanded = os.path.expanduser(expanded) return Path(expanded)
[docs] class OutputConfig(BaseModel): """Configuration for simulation output. Supports separate directories for: - scripts/logs (projects_directory): Where job scripts and SLURM logs are written - simulation data (scratch_directory): Where trajectories, checkpoints go This separation allows running simulations on HPC systems where code lives in long-term storage (projects) but data is written to high-performance scratch storage. Attributes: projects_directory: Directory for scripts, configs, logs (long-term storage) scratch_directory: Directory for simulation output (high-performance storage) naming_template: Template for naming working directories job_scripts_subdir: Subdirectory name for job scripts within projects slurm_logs_subdir: Subdirectory name for SLURM logs within projects save_checkpoint: Whether to save checkpoint files save_state_data: Whether to save thermodynamic state data trajectory_format: Output trajectory format Example YAML: output: projects_directory: /projects/user/polyzymd scratch_directory: /scratch/alpine/user/simulations naming_template: "{enzyme}_{substrate}_{temperature}K_run{replicate}" """ model_config = ConfigDict(extra="forbid") # Directory configuration projects_directory: Path = Field( Path("."), description="Base directory for scripts and logs (typically in projects/long-term storage)", ) scratch_directory: Path | None = Field( None, description="Base directory for simulation output (scratch storage). If None, uses projects_directory.", ) naming_template: str = Field( "{enzyme}_{substrate}_{polymer_type}_{duration}ns_{temperature}K_run{replicate}", description="Directory naming template", ) # Subdirectory organization job_scripts_subdir: str = Field( "job_scripts", description="Subdirectory for generated job scripts", ) slurm_logs_subdir: str = Field( "slurm_logs", description="Subdirectory for SLURM output logs", ) # Output options save_checkpoint: bool = Field(True, description="Save checkpoint files") save_state_data: bool = Field(True, description="Save state data CSV") trajectory_format: str = Field("dcd", description="Trajectory file format")
[docs] @field_validator("projects_directory", "scratch_directory", mode="before") @classmethod def expand_env_vars_in_paths(cls, v: str | Path | None) -> Path | None: """Expand environment variables and ~ in path fields. Supports $USER, ${HOME}, ~/path, etc. """ if v is None: return None return expand_path(Path(v))
@property def effective_scratch_directory(self) -> Path: """Get the effective scratch directory (falls back to projects if not set).""" return self.scratch_directory if self.scratch_directory else self.projects_directory
[docs] def format_directory_name( self, enzyme: str, substrate: str, polymer_type: str, temperature: float, replicate: int | str, duration: float = 0.0, primary_solvent: str = "water_tip3p", cosolvent_composition: str = "none", solvent_composition: str | None = None, **kwargs: Any, ) -> str: """Format the directory name using the template. Parameters ---------- enzyme : str Enzyme name. substrate : str Substrate name. polymer_type : str Polymer type, or ``"none"``. temperature : float Temperature in K. replicate : int or str Replicate number or glob token. duration : float, optional Production duration in ns, by default 0.0. primary_solvent : str, optional Primary solvent token, by default ``"water_tip3p"``. cosolvent_composition : str, optional Co-solvent composition token, by default ``"none"``. solvent_composition : str or None, optional Full solvent composition token. When not provided, the primary solvent and co-solvent tokens are combined. **kwargs : Any Additional template variables. Returns ------- str Formatted directory name. """ if solvent_composition is None: solvent_composition = primary_solvent if cosolvent_composition != "none": solvent_composition = f"{primary_solvent}_{cosolvent_composition}" return self.naming_template.format( enzyme=enzyme, substrate=substrate, polymer_type=polymer_type, temperature=int(temperature), replicate=replicate, duration=int(duration), primary_solvent=primary_solvent, cosolvent_composition=cosolvent_composition, solvent_composition=solvent_composition, **kwargs, )
[docs] def get_job_scripts_directory(self) -> Path: """Get the directory for job scripts. Returns: Path to job scripts directory (within projects) """ return self.projects_directory / self.job_scripts_subdir
[docs] def get_slurm_logs_directory(self) -> Path: """Get the directory for SLURM log files. Returns: Path to SLURM logs directory (within projects) """ return self.projects_directory / self.slurm_logs_subdir
# ============================================================================= # Force Field Configuration # =============================================================================
[docs] class ForceFieldConfig(BaseModel): """Configuration for force field selection. Attributes: protein: Force field for proteins small_molecule: Force field for small molecules water: Water model force field (derived from solvent config) """ protein: str = Field("ff14sb_off_impropers_0.0.4.offxml", description="Protein force field") small_molecule: str = Field("openff-2.0.0.offxml", description="Small molecule force field")
# ============================================================================= # Engine Configuration # =============================================================================
[docs] class OpenMMEngineConfig(BaseModel): """OpenMM-specific engine settings. These settings control OpenMM platform selection and device configuration. Only relevant when ``engine`` is ``"openmm"``. Attributes: platform: OpenMM platform to use for computation. device_index: GPU device index (platform-specific). precision: Floating-point precision mode. """ platform: str = Field("CUDA", description="OpenMM compute platform") device_index: str | None = Field(None, description="GPU device index") precision: str = Field("mixed", description="Floating-point precision")
[docs] class GromacsEngineConfig(BaseModel): """GROMACS-specific engine settings. These settings control how GROMACS binaries are located and invoked, and how SLURM resources are allocated for GROMACS jobs. GROMACS users configure parallelism via ``ntmpi`` (MPI ranks) and ``ntomp`` (OpenMP threads per rank). These map directly to the ``gmx mdrun -ntmpi`` and ``-ntomp`` flags. The corresponding SLURM resources (``ntasks``, ``cpus_per_task``, ``memory``) are set to match. Set ``gpu: true`` when running a CUDA-enabled GROMACS build on a GPU node. When ``False`` (default), the SLURM script omits GPU directives entirely. Attributes: gmx_binary: Path or name of the GROMACS binary. Resolved via config > $GMX_BIN > PATH discovery if None. mdrun_flags: Additional flags passed to ``gmx mdrun`` (all stages). grompp_flags: Additional flags passed to ``gmx grompp``. module_load: Module load command for HPC (e.g. ``"module load gromacs/2024"``) ntmpi: Number of MPI ranks for ``gmx mdrun -ntmpi``. Also sets SLURM ``--ntasks``. ntomp: Number of OpenMP threads per rank for ``gmx mdrun -ntomp``. Also sets SLURM ``--cpus-per-task``. gpu: Request a GPU via SLURM. When False, the ``--gres=gpu`` directive is omitted entirely. gpus: Number of GPUs to request when ``gpu`` is True. Ignored when ``gpu`` is False. memory: SLURM ``--mem`` allocation for GROMACS jobs. """ gmx_binary: str | None = Field(None, description="GROMACS binary path or name") mdrun_flags: str = Field("", description="Extra flags for gmx mdrun (all stages)") grompp_flags: str = Field("-maxwarn 1", description="Extra flags for gmx grompp") mdrun_flags_equilibration: str | None = Field( None, description=( "Override mdrun_flags for equilibration stages only. " "When None, falls back to mdrun_flags." ), ) mdrun_flags_production: str | None = Field( None, description=( "Override mdrun_flags for production only. When None, falls back to mdrun_flags." ), ) command_prefix: str | None = Field( None, description=( "Prefix prepended to all GROMACS commands. " "Use for container wrappers, e.g. " "'singularity exec --rocm --bind $PWD /path/to/gromacs.sif'" ), ) mpi_launcher_flags: str = Field( "", description=( "Extra flags passed to the MPI launcher (mpirun). " "Only used when the binary is a real-MPI build. " "E.g. '-genv I_MPI_FABRICS shm:tcp' for Intel MPI." ), ) module_load: str | None = Field(None, description="HPC module load command") env_exports: dict[str, str] = Field( default_factory=dict, description=("Environment variables exported before GROMACS commands (key=value pairs)"), ) setup_commands: list[str] = Field( default_factory=list, description="Shell commands run after module_load and before GROMACS commands", ) ntmpi: int = Field(1, ge=1, description="MPI ranks (-ntmpi); sets SLURM --ntasks") slurm_ntasks: int | None = Field( default=None, ge=1, description=( "Override SLURM --ntasks independently of GROMACS -ntmpi. " "Advanced option for multi-node MPI+GPU workflows where scheduler " "tasks differ from GROMACS thread-MPI ranks. When None, SLURM ntasks " "is set from ntmpi (default behavior)." ), ) ntomp: int = Field( 8, ge=1, description="OpenMP threads per rank (-ntomp); sets SLURM --cpus-per-task", ) gpu: bool = Field( False, description="Request GPU via SLURM (set True for CUDA-enabled GROMACS)", ) gpus: int = Field( 1, ge=1, description="Number of GPUs to request via SLURM when gpu is True", ) memory: str = Field("16G", description="SLURM --mem allocation for GROMACS jobs") @model_validator(mode="after") def _warn_gpu_ntmpi(self) -> Self: """Warn when GPU mode is combined with multiple thread-MPI ranks. Returns ------- Self The validated config instance """ if self.gpu and self.ntmpi > 1: LOGGER.warning( "GPU GROMACS uses thread-MPI (gmx mdrun -ntmpi), not real MPI ranks. " "With ntmpi=%d and gpus=%d, ensure enough GPUs are available or " "accept GPU sharing.", self.ntmpi, self.gpus, ) return self @model_validator(mode="after") def _warn_mdrun_flags_conflict(self) -> Self: """Warn when mdrun_flags conflict with explicit ntmpi or ntomp fields. Returns ------- Self The validated config instance """ if not self.mdrun_flags: return self try: tokens = shlex.split(self.mdrun_flags) except ValueError: return self flag_map: dict[str, str] = {} for i, tok in enumerate(tokens): if tok in ("-ntmpi", "-ntomp") and i + 1 < len(tokens): flag_map[tok] = tokens[i + 1] if "-ntmpi" in flag_map: try: flag_val = int(flag_map["-ntmpi"]) if flag_val != self.ntmpi: LOGGER.warning( "mdrun_flags contains '-ntmpi %s' but config field ntmpi=%d. " "The flag string will override at runtime.", flag_map["-ntmpi"], self.ntmpi, ) except ValueError: pass if "-ntomp" in flag_map: try: flag_val = int(flag_map["-ntomp"]) if flag_val != self.ntomp: LOGGER.warning( "mdrun_flags contains '-ntomp %s' but config field ntomp=%d. " "The flag string will override at runtime.", flag_map["-ntomp"], self.ntomp, ) except ValueError: pass return self
# ============================================================================= # Main Simulation Configuration # =============================================================================
[docs] class SimulationConfig(BaseModel): """Complete simulation configuration. This is the top-level configuration model that contains all settings for a PolyzyMD simulation. Attributes: name: Simulation name/identifier description: Optional description enzyme: Enzyme configuration substrate: Substrate/ligand configuration polymers: Polymer configuration (optional) solvent: Solvent and box configuration restraints: List of restraint configurations thermodynamics: Temperature/pressure settings simulation_phases: Equilibration and production settings output: Output file settings force_field: Force field selection Example: >>> config = SimulationConfig.from_yaml("simulation.yaml") >>> print(config.enzyme.name) "LipA" """ name: str = Field(..., description="Simulation identifier") description: str | None = Field(None, description="Simulation description") enzyme: EnzymeConfig = Field(..., description="Enzyme configuration") substrate: SubstrateConfig | None = Field(None, description="Substrate configuration") polymers: PolymerConfig | None = Field(None, description="Polymer configuration") solvent: SolventConfig = Field( default_factory=SolventConfig, description="Solvent configuration" ) restraints: list[RestraintConfig] = Field( default_factory=list, description="Restraint configurations" ) thermodynamics: ThermodynamicsConfig = Field(..., description="Thermodynamic conditions") simulation_phases: SimulationPhasesConfig = Field(..., description="Phase settings") output: OutputConfig = Field(default_factory=OutputConfig, description="Output settings") force_field: ForceFieldConfig = Field( default_factory=ForceFieldConfig, description="Force field settings" ) engine: Literal["openmm", "gromacs"] = Field(..., description="Simulation engine to use") openmm: OpenMMEngineConfig = Field( default_factory=OpenMMEngineConfig, description="OpenMM engine settings" ) gromacs: GromacsEngineConfig = Field( default_factory=GromacsEngineConfig, description="GROMACS engine settings" )
[docs] @classmethod def from_yaml(cls, path: str | Path) -> "SimulationConfig": """Load configuration from a YAML file. Args: path: Path to the YAML configuration file Returns: SimulationConfig instance Raises: FileNotFoundError: If file doesn't exist ValidationError: If configuration is invalid """ from polyzymd.config.loader import load_config return load_config(path)
[docs] def to_yaml(self, path: str | Path) -> None: """Save configuration to a YAML file. Args: path: Path to save the configuration file """ from polyzymd.config.loader import save_config save_config(self, path)
def _primary_solvent_token(self) -> str: """Format the primary solvent naming token. Returns ------- str Primary solvent token for naming templates. """ primary = self.solvent.primary primary_type = _format_safe_token(primary.type) if primary_type == "water": return f"water_{_format_safe_token(primary.model.value)}" return primary_type def _cosolvent_composition_token(self) -> str: """Format the co-solvent composition naming token. Returns ------- str Co-solvent token or ``none`` when no co-solvents are configured. """ if not self.solvent.co_solvents: return "none" tokens: list[tuple[str, str]] = [] for cosolvent in self.solvent.co_solvents: name = _format_safe_token(cosolvent.name) if cosolvent.volume_fraction is not None: percent = _format_decimal_token(cosolvent.volume_fraction * 100) token = f"{name}_{percent}pctv" elif cosolvent.concentration is not None: concentration = _format_decimal_token(cosolvent.concentration) token = f"{name}_{concentration}M" else: # Validation guarantees one composition mode, but keep errors explicit raise ValueError(f"Co-solvent '{cosolvent.name}' has no composition value") tokens.append((name, token)) return "_".join(token for _, token in sorted(tokens, key=lambda item: item[0])) def _run_directory_template_values(self, replicate: int | str = 1) -> dict[str, Any]: """Build centralized values for run-directory naming templates. Parameters ---------- replicate : int or str, optional Replicate token to insert into the template, by default 1. Returns ------- dict of str to Any Template values shared by directory naming and replicate discovery. """ polymer_type = "none" if self.polymers and self.polymers.enabled: probs = {m.label: m.probability for m in self.polymers.monomers} sorted_labels = sorted(probs.keys()) composition = "_".join(f"{lbl}{probs[lbl] * 100:.0f}" for lbl in sorted_labels) polymer_type = f"{self.polymers.type_prefix}_{composition}" substrate_name = self.substrate.name if self.substrate else "apo" primary_solvent = self._primary_solvent_token() cosolvent_composition = self._cosolvent_composition_token() solvent_composition = primary_solvent if cosolvent_composition != "none": solvent_composition = f"{primary_solvent}_{cosolvent_composition}" return { "enzyme": self.enzyme.name, "substrate": substrate_name.replace("-", ""), "polymer_type": polymer_type, "temperature": int(self.thermodynamics.temperature), "replicate": replicate, "duration": int(self.simulation_phases.production.duration), "primary_solvent": primary_solvent, "cosolvent_composition": cosolvent_composition, "solvent_composition": solvent_composition, }
[docs] def format_run_directory_name(self, replicate: int | str = 1) -> str: """Format the run directory name for a replicate. Parameters ---------- replicate : int or str, optional Replicate number or glob token, by default 1. Returns ------- str Formatted run directory name. """ return self.output.format_directory_name(**self._run_directory_template_values(replicate))
[docs] def get_working_directory(self, replicate: int = 1) -> Path: """Get the working directory path for a given replicate (in scratch). This returns the path where simulation output (trajectories, checkpoints) will be written, which is in the scratch directory. Args: replicate: Replicate number Returns: Path to the working directory (in scratch) """ dir_name = self.format_run_directory_name(replicate) return self.output.effective_scratch_directory / dir_name
[docs] def get_projects_directory(self) -> Path: """Get the projects directory path. This is where scripts, configs, and logs are stored. Returns: Path to the projects directory """ return self.output.projects_directory
[docs] def discover_replicate_dirs(self) -> list[tuple[int, Path]]: """Auto-detect all replicate directories on disk. Builds a glob pattern from the naming template with ``replicate="*"`` and scans the effective scratch directory. Returns: Sorted list of ``(replicate_number, directory_path)`` tuples. """ fields = { field for _, field, _, _ in Formatter().parse(self.output.naming_template) if field } if "replicate" not in fields: raise ValueError( "Cannot discover replicate directories because output.naming_template does not " "include the {replicate} placeholder." ) glob_pattern = self.format_run_directory_name(replicate="*") regex_pattern = "^" + re.escape(glob_pattern).replace(r"\*", r"(?P<replicate>\d+)") + "$" replicate_regex = re.compile(regex_pattern) scratch = self.output.effective_scratch_directory results: list[tuple[int, Path]] = [] for path in scratch.glob(glob_pattern): if not path.is_dir(): continue match = replicate_regex.fullmatch(path.name) if match: results.append((int(match.group("replicate")), path)) results.sort(key=lambda t: t[0]) return results
def _format_run_directory_name(self, replicate: int | str) -> str: """Format the run directory name for a replicate. Args: replicate: Replicate number Returns: Formatted directory name """ return self.format_run_directory_name(replicate)
[docs] def to_signac_statepoint(self, replicate: int = 1) -> dict[str, Any]: """Convert configuration to a Signac-compatible state point dictionary. Args: replicate: Replicate number Returns: Dictionary suitable for use as a Signac state point """ statepoint = { "enzyme": self.enzyme.name, "temperature": self.thermodynamics.temperature, "replicate": replicate, } if self.substrate: statepoint["substrate"] = self.substrate.name statepoint["substrate_conformer"] = self.substrate.conformer_index if self.polymers and self.polymers.enabled: statepoint["polymer_type"] = self.polymers.type_prefix statepoint["polymer_length"] = self.polymers.length statepoint["polymer_count"] = self.polymers.count # Include monomer probabilities for monomer in self.polymers.monomers: statepoint[f"monomer_{monomer.label}_prob"] = monomer.probability else: statepoint["polymer_type"] = "none" # Include co-solvent info for cosolvent in self.solvent.co_solvents: if cosolvent.mole_fraction is not None: statepoint[f"cosolvent_{cosolvent.name}_mole_fraction"] = cosolvent.mole_fraction elif cosolvent.concentration is not None: statepoint[f"cosolvent_{cosolvent.name}_molarity"] = cosolvent.concentration return statepoint