"""
Configuration schema for PolyzyMD simulations.
This module defines Pydantic models for all configuration sections,
providing validation, type safety, and YAML/JSON serialization support.
"""
from __future__ import annotations
import logging
import shlex
from enum import Enum
from pathlib import Path
from typing import Any, Literal, Self
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
LOGGER = logging.getLogger(__name__)
[docs]
class ChargeMethod(str, Enum):
"""Supported charge assignment methods for small molecules."""
NAGL = "nagl"
ESPALOMA = "espaloma"
AM1BCC = "am1bcc"
[docs]
class WaterModel(str, Enum):
"""Supported water models."""
TIP3P = "tip3p"
SPCE = "spce"
TIP4P = "tip4p"
TIP4PEW = "tip4pew"
OPC = "opc"
[docs]
class BoxShape(str, Enum):
"""Supported simulation box shapes."""
CUBE = "cube"
RHOMBIC_DODECAHEDRON = "rhombic_dodecahedron"
TRUNCATED_OCTAHEDRON = "truncated_octahedron"
[docs]
class Ensemble(str, Enum):
"""Thermodynamic ensemble types."""
NVT = "NVT"
NPT = "NPT"
NVE = "NVE"
[docs]
class ThermostatType(str, Enum):
"""Supported thermostat types."""
LANGEVIN_MIDDLE = "LangevinMiddle"
LANGEVIN = "Langevin"
ANDERSEN = "Andersen"
NOSE_HOOVER = "NoseHoover"
[docs]
class BarostatType(str, Enum):
"""Supported barostat types."""
MONTE_CARLO = "MC"
MONTE_CARLO_ANISOTROPIC = "MCA"
[docs]
class RestraintType(str, Enum):
"""Types of restraints that can be applied."""
FLAT_BOTTOM = "flat_bottom"
HARMONIC = "harmonic"
UPPER_WALL = "upper_wall"
LOWER_WALL = "lower_wall"
# =============================================================================
# Enzyme Configuration
# =============================================================================
[docs]
class EnzymeConfig(BaseModel):
"""Configuration for the enzyme/protein component.
Attributes:
name: Identifier for the enzyme (e.g., "LipA")
pdb_path: Path to the PDB file containing the enzyme structure
description: Optional description of the enzyme
"""
name: str = Field(..., description="Enzyme identifier")
pdb_path: Path = Field(..., description="Path to enzyme PDB file")
description: str | None = Field(None, description="Optional description")
[docs]
@field_validator("pdb_path")
@classmethod
def validate_pdb_path(cls, v: Path) -> Path:
"""Validate that PDB path has correct extension."""
if v.suffix.lower() != ".pdb":
raise ValueError(f"Expected .pdb file, got {v.suffix}")
return v
# =============================================================================
# Substrate/Ligand Configuration
# =============================================================================
[docs]
class SubstrateConfig(BaseModel):
"""Configuration for the docked substrate/ligand.
Attributes:
name: Identifier for the substrate (e.g., "Resorufin-Butyrate")
sdf_path: Path to SDF file with docked conformers
conformer_index: Which conformer to use (0-indexed)
charge_method: Method for assigning partial charges
residue_name: 3-letter residue name for topology (default: "LIG")
"""
name: str = Field(..., description="Substrate identifier")
sdf_path: Path = Field(..., description="Path to docked conformers SDF")
conformer_index: int = Field(0, ge=0, description="Index of conformer to use")
charge_method: ChargeMethod = Field(ChargeMethod.NAGL, description="Charge assignment method")
residue_name: str = Field("LIG", max_length=3, description="3-letter residue name")
[docs]
@field_validator("sdf_path")
@classmethod
def validate_sdf_path(cls, v: Path) -> Path:
"""Validate that SDF path has correct extension."""
if v.suffix.lower() != ".sdf":
raise ValueError(f"Expected .sdf file, got {v.suffix}")
return v
# =============================================================================
# Polymer Packing Configuration
# =============================================================================
[docs]
class PolymerPackingConfig(BaseModel):
"""Settings for packing polymers around the solute.
Controls the box size and PACKMOL behavior when packing polymers
around the protein-ligand complex.
Attributes:
padding: Box padding around the solute in nanometers. Larger values
give polymers more room and can speed up PACKMOL convergence.
tolerance: Minimum molecular spacing for PACKMOL in Angstrom.
movebadrandom: When ``True``, pass the ``movebadrandom`` keyword to
PACKMOL. This places badly-packed molecules at random positions
in the box rather than near well-packed neighbours, which
improves convergence for dense or heterogeneous systems (many
unique chain types). Has no effect when only a single chain
type is present. Default is ``False`` (PACKMOL default
behaviour is preserved).
box_vectors: Optional explicit box dimensions ``[Lx, Ly, Lz]`` in
nanometers. When set, overrides the auto-computed bounding box
plus *padding*. The protein is centered at the midpoint of
the box. Default is ``None`` (auto-compute from solute
bounding box + padding).
Example:
>>> PolymerPackingConfig(padding=2.5, movebadrandom=True)
>>> PolymerPackingConfig(box_vectors=[8.0, 10.0, 12.0])
"""
padding: float = Field(2.0, gt=0.0, description="Box padding around solute (nm)")
tolerance: float = Field(2.0, gt=0.0, description="PACKMOL tolerance (Angstrom)")
movebadrandom: bool = Field(
False,
description=(
"Pass the 'movebadrandom' keyword to PACKMOL. "
"Improves convergence for heterogeneous polymer systems "
"by placing badly-packed molecules at random box positions."
),
)
box_vectors: list[float] | None = Field(
None,
min_length=3,
max_length=3,
description=(
"Optional explicit box dimensions [Lx, Ly, Lz] in nanometers. "
"Overrides auto-computed bounding box + padding. "
"Protein is centered at the midpoint of this box."
),
)
# =============================================================================
# Polymer Configuration
# =============================================================================
[docs]
class MonomerSpec(BaseModel):
"""Specification for a single monomer type in a co-polymer.
For dynamic polymer generation, provide the raw (unactivated) monomer SMILES.
The system will run initiation reactions to create the active fragments.
Attributes:
label: Single character label for this monomer (e.g., "A", "B")
probability: Probability of selecting this monomer (0-1)
name: Optional full name (e.g., "SBMA", "EGPMA")
smiles: Raw monomer SMILES string (required for dynamic generation)
residue_name: 3-character PDB residue name (auto-generated if not provided)
"""
label: str = Field(..., min_length=1, max_length=1, description="Monomer label")
probability: float = Field(..., ge=0.0, le=1.0, description="Selection probability")
name: str | None = Field(None, description="Full monomer name")
smiles: str | None = Field(
None,
description="Raw monomer SMILES (required for dynamic generation mode)",
)
residue_name: str | None = Field(
None,
max_length=3,
description="3-char PDB residue name (auto-generated from name if not provided)",
)
[docs]
@model_validator(mode="after")
def auto_generate_residue_name(self) -> "MonomerSpec":
"""Auto-generate residue name from monomer name if not provided."""
if self.residue_name is None and self.name is not None:
# Use first 3 characters of name, uppercase
object.__setattr__(self, "residue_name", self.name[:3].upper())
return self
[docs]
class PolymerGenerationMode(str, Enum):
"""Mode for polymer generation."""
CACHED = "cached" # Load pre-built SDF files from disk
DYNAMIC = "dynamic" # Generate polymers on-the-fly using Polymerist
[docs]
class ReactionConfig(BaseModel):
"""Paths to reaction templates for ATRP polymer generation.
These .rxn files define the chemical transformations used to create
polymer fragments from raw monomer SMILES. For ATRP, this includes:
- Initiation: Activates the vinyl group (e.g., chlorination)
- Polymerization: Creates chain-extending fragments
- Termination: Restores the alkene for chain ends
You can use "default" as a special value to use the bundled ATRP
methacrylate reaction templates that ship with PolyzyMD.
Example:
reactions:
initiation: "default"
polymerization: "default"
termination: "default"
Attributes:
initiation: Path to the initiation reaction template (.rxn) or "default"
polymerization: Path to the polymerization reaction template (.rxn) or "default"
termination: Path to the termination reaction template (.rxn) or "default"
"""
initiation: Path = Field(..., description="Path to ATRP initiation .rxn file or 'default'")
polymerization: Path = Field(
..., description="Path to ATRP polymerization .rxn file or 'default'"
)
termination: Path = Field(..., description="Path to ATRP termination .rxn file or 'default'")
[docs]
@field_validator("initiation", "polymerization", "termination", mode="before")
@classmethod
def resolve_default_paths(cls, v, info) -> Path:
"""Resolve 'default' to bundled ATRP reaction paths."""
from polyzymd.data.reactions import (
get_atrp_initiation_path,
get_atrp_polymerization_path,
get_atrp_termination_path,
)
if isinstance(v, str) and v.lower() == "default":
# Map field name to the appropriate getter
field_to_getter = {
"initiation": get_atrp_initiation_path,
"polymerization": get_atrp_polymerization_path,
"termination": get_atrp_termination_path,
}
getter = field_to_getter.get(info.field_name)
if getter:
return getter()
raise ValueError(f"Unknown reaction field: {info.field_name}")
# Convert string to Path if needed
if isinstance(v, str):
v = Path(v)
# Validate .rxn extension
if isinstance(v, Path) and v.suffix.lower() != ".rxn":
raise ValueError(f"Expected .rxn file, got {v.suffix}")
return v
[docs]
class PolymerConfig(BaseModel):
"""Configuration for polymer components.
Supports two generation modes:
- "cached": Load pre-built polymer SDF files from sdf_directory
- "dynamic": Generate polymers on-the-fly using Polymerist from SMILES
For dynamic mode, you must provide:
- SMILES for each monomer in monomers[].smiles
- Reaction templates in the reactions field
Attributes:
enabled: Whether to include polymers in the system
generation_mode: "cached" for pre-built SDFs, "dynamic" for on-the-fly generation
type_prefix: Prefix for polymer type in filenames (e.g., "SBMA-EGPMA")
monomers: List of monomer specifications with probabilities (and SMILES for dynamic)
length: Number of monomer units per polymer chain
count: Number of polymer chains to add
sdf_directory: Path to pre-built polymer SDF files (for cached mode)
reactions: Reaction templates for ATRP (required for dynamic mode)
charger: Charge assignment method for generated polymers
max_retries: Maximum retries for polymer generation (ring-piercing failures)
cache_directory: Directory for caching generated polymers and fragments
packing: Settings for packing polymers around the solute
random_seed: Random seed for polymer sequence generation (for reproducibility)
"""
enabled: bool = Field(True, description="Include polymers in system")
generation_mode: PolymerGenerationMode = Field(
PolymerGenerationMode.CACHED,
description="Polymer generation mode: 'cached' (pre-built SDFs) or 'dynamic' (generate from SMILES)",
)
type_prefix: str = Field(..., description="Polymer type prefix for filenames")
monomers: list[MonomerSpec] = Field(..., min_length=1, description="Monomer specifications")
length: int = Field(..., ge=1, description="Monomers per chain")
count: int = Field(..., ge=1, description="Number of polymer chains")
sdf_directory: Path | None = Field(
None, description="Directory with pre-built polymer SDFs (for cached mode)"
)
reactions: ReactionConfig | None = Field(
None, description="ATRP reaction templates (required for dynamic mode)"
)
charger: ChargeMethod = Field(
ChargeMethod.NAGL,
description="Charge assignment method for generated polymers",
)
max_retries: int = Field(
10,
ge=1,
description="Maximum retries for polymer generation (handles ring-piercing failures)",
)
cache_directory: Path = Field(
Path(".polymer_cache"), description="Cache directory for generated polymers and fragments"
)
packing: PolymerPackingConfig = Field(
default_factory=PolymerPackingConfig,
description="Polymer packing settings (padding, tolerance)",
)
random_seed: int | None = Field(
None,
description="Random seed for polymer sequence generation. If None, uses replicate number.",
)
[docs]
@model_validator(mode="after")
def validate_probabilities_sum_to_one(self) -> "PolymerConfig":
"""Ensure monomer probabilities sum to 1.0."""
if self.enabled:
total = sum(m.probability for m in self.monomers)
if abs(total - 1.0) > 1e-6:
raise ValueError(f"Monomer probabilities must sum to 1.0, got {total}")
return self
[docs]
@model_validator(mode="after")
def validate_generation_mode_requirements(self) -> "PolymerConfig":
"""Validate that required fields are present for the selected generation mode."""
if not self.enabled:
return self
if self.generation_mode == PolymerGenerationMode.DYNAMIC:
# Dynamic mode requires SMILES for all monomers
missing_smiles = [m.label for m in self.monomers if m.smiles is None]
if missing_smiles:
raise ValueError(
f"Dynamic generation mode requires 'smiles' for all monomers. "
f"Missing SMILES for monomers: {missing_smiles}"
)
# Dynamic mode requires reaction templates
if self.reactions is None:
raise ValueError(
"Dynamic generation mode requires 'reactions' field with ATRP reaction templates"
)
elif self.generation_mode == PolymerGenerationMode.CACHED:
# Cached mode requires sdf_directory
if self.sdf_directory is None:
raise ValueError(
"Cached generation mode requires 'sdf_directory' with pre-built polymer SDFs"
)
return self
# =============================================================================
# Solvent Configuration
# =============================================================================
[docs]
class CoSolventSpec(BaseModel):
"""Specification for a co-solvent component.
You must specify EITHER volume_fraction OR concentration, not both.
For co-solvents in the built-in library (dmso, dmf, urea, ethanol, etc.),
you can omit the smiles and density fields - they will be looked up
automatically.
Attributes:
name: Identifier for the co-solvent (e.g., "dmso")
smiles: SMILES string (optional if co-solvent is in library)
volume_fraction: Volume fraction (0-1), e.g., 0.30 for 30% v/v
concentration: Molar concentration (mol/L)
density: Density in g/mL (required for volume_fraction with custom molecules)
residue_name: 3-letter residue name (default: first 3 chars of name)
Example (library co-solvent with volume fraction):
>>> CoSolventSpec(name="dmso", volume_fraction=0.30)
Example (library co-solvent with concentration):
>>> CoSolventSpec(name="urea", concentration=2.0)
Example (custom co-solvent):
>>> CoSolventSpec(
... name="my_solvent",
... smiles="CCOC(=O)C",
... density=0.902,
... volume_fraction=0.15
... )
"""
name: str = Field(..., description="Co-solvent identifier")
smiles: str | None = Field(None, description="SMILES string (optional if in library)")
# Specification method 1: Volume fraction
volume_fraction: float | None = Field(
None,
gt=0.0,
lt=1.0,
description="Volume fraction (0-1), e.g., 0.30 for 30% v/v",
)
# Specification method 2: Molar concentration
concentration: float | None = Field(None, gt=0.0, description="Molar concentration (mol/L)")
# Physical property for volume fraction calculation
density: float | None = Field(
None,
gt=0.0,
description="Density in g/mL (required for volume_fraction with custom molecules)",
)
residue_name: str | None = Field(None, max_length=3, description="Residue name")
[docs]
@model_validator(mode="after")
def validate_and_populate(self) -> "CoSolventSpec":
"""Validate specification and populate missing fields from library."""
from polyzymd.data.cosolvent_library import get_cosolvent
# Check that exactly one of volume_fraction or concentration is specified
has_vf = self.volume_fraction is not None
has_conc = self.concentration is not None
if not has_vf and not has_conc:
raise ValueError(
f"Co-solvent '{self.name}': Must specify either 'volume_fraction' "
f"or 'concentration'"
)
if has_vf and has_conc:
raise ValueError(
f"Co-solvent '{self.name}': Cannot specify both 'volume_fraction' "
f"and 'concentration' - choose one"
)
# Look up from library
library_data = get_cosolvent(self.name)
# Populate SMILES if not provided
if self.smiles is None:
if library_data:
object.__setattr__(self, "smiles", library_data.smiles)
else:
raise ValueError(
f"Co-solvent '{self.name}' not in library. Please provide 'smiles' field."
)
# For volume_fraction, we need density
if has_vf and self.density is None:
if library_data:
object.__setattr__(self, "density", library_data.density)
else:
raise ValueError(
f"Co-solvent '{self.name}' not in library. "
f"Please provide 'density' field (g/mL) for volume_fraction calculation."
)
# Set default residue name
if self.residue_name is None:
object.__setattr__(self, "residue_name", self.name[:3].upper())
return self
[docs]
class PrimarySolventConfig(BaseModel):
"""Configuration for the primary solvent (usually water).
Attributes:
type: Solvent type identifier
model: Water model to use (if type is "water")
"""
type: str = Field("water", description="Primary solvent type")
model: WaterModel = Field(WaterModel.TIP3P, description="Water model")
[docs]
class IonConfig(BaseModel):
"""Configuration for ions in the solvent.
Attributes:
neutralize: Whether to add ions for charge neutralization
nacl_concentration: Additional NaCl concentration in mol/L
kcl_concentration: Additional KCl concentration in mol/L
mgcl2_concentration: Additional MgCl2 concentration in mol/L
"""
neutralize: bool = Field(True, description="Neutralize system charge")
nacl_concentration: float = Field(0.0, ge=0.0, description="NaCl conc. (mol/L)")
kcl_concentration: float = Field(0.0, ge=0.0, description="KCl conc. (mol/L)")
mgcl2_concentration: float = Field(0.0, ge=0.0, description="MgCl2 conc. (mol/L)")
[docs]
class BoxConfig(BaseModel):
"""Configuration for the simulation box.
Attributes:
padding: Distance from solute to box edge in nm
shape: Box geometry
target_density: Target density in g/mL
tolerance: Minimum molecular spacing for PACKMOL in Angstrom
"""
padding: float = Field(1.2, gt=0.0, description="Box padding (nm)")
shape: BoxShape = Field(BoxShape.RHOMBIC_DODECAHEDRON, description="Box shape")
target_density: float = Field(1.0, gt=0.0, description="Target density (g/mL)")
tolerance: float = Field(2.0, gt=0.0, description="PACKMOL tolerance (Angstrom)")
[docs]
class SolventConfig(BaseModel):
"""Complete solvent configuration.
Attributes:
primary: Primary solvent settings
co_solvents: List of co-solvent specifications
ions: Ion configuration
box: Box geometry settings
"""
primary: PrimarySolventConfig = Field(
default_factory=PrimarySolventConfig, description="Primary solvent"
)
co_solvents: list[CoSolventSpec] = Field(default_factory=list, description="Co-solvents")
ions: IonConfig = Field(default_factory=IonConfig, description="Ion settings")
box: BoxConfig = Field(default_factory=BoxConfig, description="Box settings")
[docs]
@model_validator(mode="after")
def validate_volume_fractions(self) -> "SolventConfig":
"""Ensure co-solvent volume fractions don't exceed 1.0."""
total = sum(cs.volume_fraction for cs in self.co_solvents if cs.volume_fraction is not None)
if total >= 1.0:
raise ValueError(f"Total co-solvent volume fraction must be < 1.0, got {total}")
return self
# =============================================================================
# Restraint Configuration
# =============================================================================
[docs]
class AtomSelectionConfig(BaseModel):
"""Configuration for selecting atoms for restraints.
Uses MDAnalysis-compatible selection syntax for flexibility.
Attributes:
selection: MDAnalysis selection string (e.g., "resid 77 and name OG")
description: Optional human-readable description
"""
selection: str = Field(..., description="MDAnalysis selection string")
description: str | None = Field(None, description="Human-readable description")
[docs]
class RestraintConfig(BaseModel):
"""Configuration for a single restraint.
Attributes:
type: Type of restraint (flat_bottom, harmonic, etc.)
name: Identifier for this restraint
atom1: First atom selection
atom2: Second atom selection
distance: Target/threshold distance in Angstroms
force_constant: Force constant in kJ/mol/nm^2
enabled: Whether this restraint is active
"""
type: RestraintType = Field(..., description="Restraint type")
name: str = Field(..., description="Restraint identifier")
atom1: AtomSelectionConfig = Field(..., description="First atom selection")
atom2: AtomSelectionConfig = Field(..., description="Second atom selection")
distance: float = Field(..., gt=0.0, description="Distance threshold (Angstrom)")
force_constant: float = Field(10000.0, gt=0.0, description="Force constant (kJ/mol/nm^2)")
enabled: bool = Field(True, description="Whether restraint is active")
# =============================================================================
# Thermodynamics Configuration
# =============================================================================
[docs]
class ThermodynamicsConfig(BaseModel):
"""Thermodynamic conditions for the simulation.
Attributes:
temperature: System temperature in Kelvin
pressure: System pressure in atmospheres (for NPT)
salt_concentration: Ionic strength in mol/L (deprecated, use solvent.ions)
"""
temperature: float = Field(..., gt=0.0, description="Temperature (K)")
pressure: float = Field(1.0, gt=0.0, description="Pressure (atm)")
# =============================================================================
# Simulation Phase Configuration
# =============================================================================
[docs]
class SimulationPhaseConfig(BaseModel):
"""Configuration for a single simulation phase (equilibration or production).
Attributes:
ensemble: Thermodynamic ensemble (NVT, NPT)
duration: Simulation duration in nanoseconds
samples: Number of trajectory frames to save
report_interval: Explicit reporter interval in MD steps
time_step: Integration time step in femtoseconds
thermostat: Thermostat type
thermostat_timescale: Thermostat coupling timescale in ps
barostat: Barostat type (for NPT)
barostat_frequency: Barostat update frequency (steps)
checkpoint_interval: Wall-time interval (seconds) between restart
checkpoints for preemption resilience
"""
ensemble: Ensemble = Field(..., description="Thermodynamic ensemble")
duration: float = Field(..., gt=0.0, description="Duration (ns)")
samples: int = Field(..., ge=1, description="Trajectory frames to save")
report_interval: int = Field(..., ge=1, description="Reporter interval in MD steps")
time_step: float = Field(2.0, gt=0.0, description="Time step (fs)")
thermostat: ThermostatType = Field(
ThermostatType.LANGEVIN_MIDDLE, description="Thermostat type"
)
thermostat_timescale: float = Field(1.0, gt=0.0, description="Thermostat timescale (ps)")
barostat: BarostatType | None = Field(None, description="Barostat type")
barostat_frequency: int = Field(25, ge=1, description="Barostat update frequency")
checkpoint_interval: float = Field(
...,
gt=0.0,
description=(
"Wall-time interval in seconds between restart checkpoints. "
"Controls how frequently simulation state is saved for automatic "
"restart on SLURM preemption or hard kill. Independent of "
"trajectory/reporter output frequency. Must be positive so "
"unexpected termination recovery remains enabled."
),
)
[docs]
@model_validator(mode="after")
def validate_ensemble_barostat(self) -> "SimulationPhaseConfig":
"""Ensure NPT ensemble has a barostat configured."""
if self.ensemble == Ensemble.NPT and self.barostat is None:
self.barostat = BarostatType.MONTE_CARLO
return self
# =============================================================================
# Equilibration Stage Configuration (Multi-Stage Equilibration)
# =============================================================================
[docs]
class PositionRestraintConfig(BaseModel):
"""Configuration for positional restraints on an atom group.
Position restraints apply a harmonic potential to keep atoms near their
initial coordinates. This is commonly used during equilibration to
prevent large structural changes while the system relaxes.
Attributes:
group: Predefined atom group name
force_constant: Force constant in kJ/mol/nm^2 (4184.0 = 1.0 kcal/mol/A^2)
"""
group: str = Field(
...,
description=(
"Atom group: protein_heavy, protein_backbone, protein_calpha, "
"ligand_heavy, polymer_heavy, solvent, water_only, ions_only, cosolvents_only"
),
)
force_constant: float = Field(
4184.0, # 1.0 kcal/mol/A^2 in kJ/mol/nm^2
gt=0.0,
description="Force constant (kJ/mol/nm^2). Default 4184.0 = 1.0 kcal/mol/A^2",
)
[docs]
@field_validator("group")
@classmethod
def validate_group_name(cls, v: str) -> str:
"""Validate that the group name is a recognized predefined group."""
from polyzymd.core.atom_groups import PREDEFINED_GROUPS
if v not in PREDEFINED_GROUPS:
raise ValueError(
f"Unknown atom group: '{v}'. Valid groups: {sorted(PREDEFINED_GROUPS)}"
)
return v
[docs]
class EquilibrationStageConfig(BaseModel):
"""Configuration for a single equilibration stage.
Supports two temperature modes:
1. Constant temperature: Set 'temperature' field.
2. Temperature ramping (simulated annealing): Set 'temperature_start'
and 'temperature_end' fields.
Position restraints can be applied to hold specific atom groups in place
during the stage.
Attributes:
name: Stage identifier (used in output paths)
duration: Stage duration in nanoseconds
samples: Number of trajectory frames to save
ensemble: Thermodynamic ensemble (NVT or NPT)
temperature: Constant temperature in K (mutually exclusive with ramping)
temperature_start: Starting temperature for ramping in K
temperature_end: Ending temperature for ramping in K
temperature_increment: Temperature increment per update in K
temperature_interval: Time between temperature updates in fs
position_restraints: List of position restraints for this stage
time_step: Optional time step override in fs
thermostat: Optional thermostat type override
thermostat_timescale: Optional thermostat timescale override in ps
barostat: Optional barostat type (for NPT ensemble)
barostat_frequency: Optional barostat update frequency
"""
name: str = Field(..., description="Stage identifier (used in output paths)")
duration: float = Field(..., gt=0.0, description="Duration (ns)")
samples: int = Field(100, ge=1, description="Trajectory frames to save")
ensemble: Ensemble = Field(Ensemble.NVT, description="Thermodynamic ensemble")
# Constant temperature mode
temperature: float | None = Field(
None,
gt=0.0,
description="Constant temperature (K). Mutually exclusive with temperature ramping.",
)
# Temperature ramping mode (simulated annealing)
temperature_start: float | None = Field(
None, gt=0.0, description="Starting temperature for ramping (K)"
)
temperature_end: float | None = Field(
None, gt=0.0, description="Ending temperature for ramping (K)"
)
temperature_increment: float = Field(
1.0, gt=0.0, description="Temperature increment per update (K)"
)
temperature_interval: float = Field(
1200.0, gt=0.0, description="Time between temperature updates (fs)"
)
# Position restraints
position_restraints: list[PositionRestraintConfig] = Field(
default_factory=list, description="Position restraints for this stage"
)
# Optional overrides (inherit from parent/defaults if not specified)
time_step: float | None = Field(None, gt=0.0, description="Time step (fs)")
thermostat: ThermostatType | None = Field(None, description="Thermostat type")
thermostat_timescale: float | None = Field(
None, gt=0.0, description="Thermostat timescale (ps)"
)
barostat: BarostatType | None = Field(None, description="Barostat type (for NPT)")
barostat_frequency: int | None = Field(None, ge=1, description="Barostat update frequency")
[docs]
@model_validator(mode="after")
def validate_temperature_mode(self) -> "EquilibrationStageConfig":
"""Ensure valid temperature specification."""
has_constant = self.temperature is not None
has_start = self.temperature_start is not None
has_end = self.temperature_end is not None
if has_constant and (has_start or has_end):
raise ValueError(
"Cannot specify both 'temperature' and temperature ramping "
"('temperature_start'/'temperature_end')"
)
if not has_constant and not (has_start and has_end):
raise ValueError(
"Must specify either 'temperature' for constant temperature, "
"or both 'temperature_start' and 'temperature_end' for ramping"
)
if has_start and has_end and self.temperature_start > self.temperature_end:
raise ValueError(
f"temperature_start ({self.temperature_start}) must be <= "
f"temperature_end ({self.temperature_end})"
)
return self
[docs]
@model_validator(mode="after")
def validate_npt_barostat(self) -> "EquilibrationStageConfig":
"""Ensure NPT ensemble has a barostat configured."""
if self.ensemble == Ensemble.NPT and self.barostat is None:
self.barostat = BarostatType.MONTE_CARLO
return self
@property
def is_temperature_ramping(self) -> bool:
"""Check if this stage uses temperature ramping."""
return self.temperature_start is not None
[docs]
def get_start_temperature(self) -> float:
"""Get the starting temperature of this stage."""
if self.is_temperature_ramping:
return self.temperature_start
return self.temperature
[docs]
def get_final_temperature(self) -> float:
"""Get the final temperature of this stage."""
if self.is_temperature_ramping:
return self.temperature_end
return self.temperature
[docs]
class SimulationPhasesConfig(BaseModel):
"""Configuration for all simulation phases.
Attributes:
equilibration_stages: Multi-stage equilibration protocol
production: Production phase settings
"""
model_config = ConfigDict(extra="forbid")
equilibration_stages: list[EquilibrationStageConfig] | None = Field(
None, description="Multi-stage equilibration protocol with position restraints"
)
production: SimulationPhaseConfig = Field(..., description="Production settings")
[docs]
@model_validator(mode="after")
def validate_equilibration_mode(self) -> "SimulationPhasesConfig":
"""Ensure staged equilibration is configured."""
if self.equilibration_stages is None:
raise ValueError(
"PolyzyMD now requires 'equilibration_stages' in simulation_phases. "
"The legacy 'equilibration' block is no longer supported. "
"Convert your config to one or more staged equilibration entries."
)
if len(self.equilibration_stages) == 0:
raise ValueError("'equilibration_stages' must contain at least one stage")
return self
@property
def total_equilibration_duration(self) -> float:
"""Total equilibration duration in nanoseconds.
Returns the sum of all stage durations.
"""
return sum(stage.duration for stage in self.equilibration_stages)
@property
def total_equilibration_samples(self) -> int:
"""Total equilibration trajectory samples.
Returns the sum of all stage samples.
"""
return sum(stage.samples for stage in self.equilibration_stages)
# =============================================================================
# Output Configuration
# =============================================================================
[docs]
def expand_path(path: Path) -> Path:
"""Expand environment variables and user home in a path.
Supports:
- $VAR and ${VAR} syntax for environment variables
- ~ for user home directory
Args:
path: Path that may contain environment variables
Returns:
Path with variables expanded
"""
import os
expanded = os.path.expandvars(str(path))
expanded = os.path.expanduser(expanded)
return Path(expanded)
[docs]
class OutputConfig(BaseModel):
"""Configuration for simulation output.
Supports separate directories for:
- scripts/logs (projects_directory): Where job scripts and SLURM logs are written
- simulation data (scratch_directory): Where trajectories, checkpoints go
This separation allows running simulations on HPC systems where code lives
in long-term storage (projects) but data is written to high-performance
scratch storage.
Attributes:
projects_directory: Directory for scripts, configs, logs (long-term storage)
scratch_directory: Directory for simulation output (high-performance storage)
naming_template: Template for naming working directories
job_scripts_subdir: Subdirectory name for job scripts within projects
slurm_logs_subdir: Subdirectory name for SLURM logs within projects
save_checkpoint: Whether to save checkpoint files
save_state_data: Whether to save thermodynamic state data
trajectory_format: Output trajectory format
Example YAML:
output:
projects_directory: /projects/user/polyzymd
scratch_directory: /scratch/alpine/user/simulations
naming_template: "{enzyme}_{substrate}_{temperature}K_run{replicate}"
"""
model_config = ConfigDict(extra="forbid")
# Directory configuration
projects_directory: Path = Field(
Path("."),
description="Base directory for scripts and logs (typically in projects/long-term storage)",
)
scratch_directory: Path | None = Field(
None,
description="Base directory for simulation output (scratch storage). If None, uses projects_directory.",
)
naming_template: str = Field(
"{enzyme}_{substrate}_{polymer_type}_{duration}ns_{temperature}K_run{replicate}",
description="Directory naming template",
)
# Subdirectory organization
job_scripts_subdir: str = Field(
"job_scripts",
description="Subdirectory for generated job scripts",
)
slurm_logs_subdir: str = Field(
"slurm_logs",
description="Subdirectory for SLURM output logs",
)
# Output options
save_checkpoint: bool = Field(True, description="Save checkpoint files")
save_state_data: bool = Field(True, description="Save state data CSV")
trajectory_format: str = Field("dcd", description="Trajectory file format")
[docs]
@field_validator("projects_directory", "scratch_directory", mode="before")
@classmethod
def expand_env_vars_in_paths(cls, v: str | Path | None) -> Path | None:
"""Expand environment variables and ~ in path fields.
Supports $USER, ${HOME}, ~/path, etc.
"""
if v is None:
return None
return expand_path(Path(v))
@property
def effective_scratch_directory(self) -> Path:
"""Get the effective scratch directory (falls back to projects if not set)."""
return self.scratch_directory if self.scratch_directory else self.projects_directory
[docs]
def get_job_scripts_directory(self) -> Path:
"""Get the directory for job scripts.
Returns:
Path to job scripts directory (within projects)
"""
return self.projects_directory / self.job_scripts_subdir
[docs]
def get_slurm_logs_directory(self) -> Path:
"""Get the directory for SLURM log files.
Returns:
Path to SLURM logs directory (within projects)
"""
return self.projects_directory / self.slurm_logs_subdir
# =============================================================================
# Force Field Configuration
# =============================================================================
[docs]
class ForceFieldConfig(BaseModel):
"""Configuration for force field selection.
Attributes:
protein: Force field for proteins
small_molecule: Force field for small molecules
water: Water model force field (derived from solvent config)
"""
protein: str = Field("ff14sb_off_impropers_0.0.4.offxml", description="Protein force field")
small_molecule: str = Field("openff-2.0.0.offxml", description="Small molecule force field")
# =============================================================================
# Engine Configuration
# =============================================================================
[docs]
class OpenMMEngineConfig(BaseModel):
"""OpenMM-specific engine settings.
These settings control OpenMM platform selection and device configuration.
Only relevant when ``engine`` is ``"openmm"``.
Attributes:
platform: OpenMM platform to use for computation.
device_index: GPU device index (platform-specific).
precision: Floating-point precision mode.
"""
platform: str = Field("CUDA", description="OpenMM compute platform")
device_index: str | None = Field(None, description="GPU device index")
precision: str = Field("mixed", description="Floating-point precision")
[docs]
class GromacsEngineConfig(BaseModel):
"""GROMACS-specific engine settings.
These settings control how GROMACS binaries are located and invoked,
and how SLURM resources are allocated for GROMACS jobs.
GROMACS users configure parallelism via ``ntmpi`` (MPI ranks) and
``ntomp`` (OpenMP threads per rank). These map directly to the
``gmx mdrun -ntmpi`` and ``-ntomp`` flags. The corresponding SLURM
resources (``ntasks``, ``cpus_per_task``, ``memory``) are set to
match.
Set ``gpu: true`` when running a CUDA-enabled GROMACS build on a
GPU node. When ``False`` (default), the SLURM script omits GPU
directives entirely.
Attributes:
gmx_binary: Path or name of the GROMACS binary.
Resolved via config > $GMX_BIN > PATH discovery if None.
mdrun_flags: Additional flags passed to ``gmx mdrun`` (all stages).
grompp_flags: Additional flags passed to ``gmx grompp``.
module_load: Module load command for HPC (e.g. ``"module load gromacs/2024"``)
ntmpi: Number of MPI ranks for ``gmx mdrun -ntmpi``.
Also sets SLURM ``--ntasks``.
ntomp: Number of OpenMP threads per rank for ``gmx mdrun -ntomp``.
Also sets SLURM ``--cpus-per-task``.
gpu: Request a GPU via SLURM. When False, the ``--gres=gpu``
directive is omitted entirely.
gpus: Number of GPUs to request when ``gpu`` is True. Ignored
when ``gpu`` is False.
memory: SLURM ``--mem`` allocation for GROMACS jobs.
"""
gmx_binary: str | None = Field(None, description="GROMACS binary path or name")
mdrun_flags: str = Field("", description="Extra flags for gmx mdrun (all stages)")
grompp_flags: str = Field("-maxwarn 1", description="Extra flags for gmx grompp")
mdrun_flags_equilibration: str | None = Field(
None,
description=(
"Override mdrun_flags for equilibration stages only. "
"When None, falls back to mdrun_flags."
),
)
mdrun_flags_production: str | None = Field(
None,
description=(
"Override mdrun_flags for production only. When None, falls back to mdrun_flags."
),
)
command_prefix: str | None = Field(
None,
description=(
"Prefix prepended to all GROMACS commands. "
"Use for container wrappers, e.g. "
"'singularity exec --rocm --bind $PWD /path/to/gromacs.sif'"
),
)
mpi_launcher_flags: str = Field(
"",
description=(
"Extra flags passed to the MPI launcher (mpirun). "
"Only used when the binary is a real-MPI build. "
"E.g. '-genv I_MPI_FABRICS shm:tcp' for Intel MPI."
),
)
module_load: str | None = Field(None, description="HPC module load command")
env_exports: dict[str, str] = Field(
default_factory=dict,
description=("Environment variables exported before GROMACS commands (key=value pairs)"),
)
setup_commands: list[str] = Field(
default_factory=list,
description="Shell commands run after module_load and before GROMACS commands",
)
ntmpi: int = Field(1, ge=1, description="MPI ranks (-ntmpi); sets SLURM --ntasks")
slurm_ntasks: int | None = Field(
default=None,
ge=1,
description=(
"Override SLURM --ntasks independently of GROMACS -ntmpi. "
"Advanced option for multi-node MPI+GPU workflows where scheduler "
"tasks differ from GROMACS thread-MPI ranks. When None, SLURM ntasks "
"is set from ntmpi (default behavior)."
),
)
ntomp: int = Field(
8,
ge=1,
description="OpenMP threads per rank (-ntomp); sets SLURM --cpus-per-task",
)
gpu: bool = Field(
False,
description="Request GPU via SLURM (set True for CUDA-enabled GROMACS)",
)
gpus: int = Field(
1,
ge=1,
description="Number of GPUs to request via SLURM when gpu is True",
)
memory: str = Field("16G", description="SLURM --mem allocation for GROMACS jobs")
@model_validator(mode="after")
def _warn_gpu_ntmpi(self) -> Self:
"""Warn when GPU mode is combined with multiple thread-MPI ranks.
Returns
-------
Self
The validated config instance
"""
if self.gpu and self.ntmpi > 1:
LOGGER.warning(
"GPU GROMACS uses thread-MPI (gmx mdrun -ntmpi), not real MPI ranks. "
"With ntmpi=%d and gpus=%d, ensure enough GPUs are available or "
"accept GPU sharing.",
self.ntmpi,
self.gpus,
)
return self
@model_validator(mode="after")
def _warn_mdrun_flags_conflict(self) -> Self:
"""Warn when mdrun_flags conflict with explicit ntmpi or ntomp fields.
Returns
-------
Self
The validated config instance
"""
if not self.mdrun_flags:
return self
try:
tokens = shlex.split(self.mdrun_flags)
except ValueError:
return self
flag_map: dict[str, str] = {}
for i, tok in enumerate(tokens):
if tok in ("-ntmpi", "-ntomp") and i + 1 < len(tokens):
flag_map[tok] = tokens[i + 1]
if "-ntmpi" in flag_map:
try:
flag_val = int(flag_map["-ntmpi"])
if flag_val != self.ntmpi:
LOGGER.warning(
"mdrun_flags contains '-ntmpi %s' but config field ntmpi=%d. "
"The flag string will override at runtime.",
flag_map["-ntmpi"],
self.ntmpi,
)
except ValueError:
pass
if "-ntomp" in flag_map:
try:
flag_val = int(flag_map["-ntomp"])
if flag_val != self.ntomp:
LOGGER.warning(
"mdrun_flags contains '-ntomp %s' but config field ntomp=%d. "
"The flag string will override at runtime.",
flag_map["-ntomp"],
self.ntomp,
)
except ValueError:
pass
return self
# =============================================================================
# Main Simulation Configuration
# =============================================================================
[docs]
class SimulationConfig(BaseModel):
"""Complete simulation configuration.
This is the top-level configuration model that contains all settings
for a PolyzyMD simulation.
Attributes:
name: Simulation name/identifier
description: Optional description
enzyme: Enzyme configuration
substrate: Substrate/ligand configuration
polymers: Polymer configuration (optional)
solvent: Solvent and box configuration
restraints: List of restraint configurations
thermodynamics: Temperature/pressure settings
simulation_phases: Equilibration and production settings
output: Output file settings
force_field: Force field selection
Example:
>>> config = SimulationConfig.from_yaml("simulation.yaml")
>>> print(config.enzyme.name)
"LipA"
"""
name: str = Field(..., description="Simulation identifier")
description: str | None = Field(None, description="Simulation description")
enzyme: EnzymeConfig = Field(..., description="Enzyme configuration")
substrate: SubstrateConfig | None = Field(None, description="Substrate configuration")
polymers: PolymerConfig | None = Field(None, description="Polymer configuration")
solvent: SolventConfig = Field(
default_factory=SolventConfig, description="Solvent configuration"
)
restraints: list[RestraintConfig] = Field(
default_factory=list, description="Restraint configurations"
)
thermodynamics: ThermodynamicsConfig = Field(..., description="Thermodynamic conditions")
simulation_phases: SimulationPhasesConfig = Field(..., description="Phase settings")
output: OutputConfig = Field(default_factory=OutputConfig, description="Output settings")
force_field: ForceFieldConfig = Field(
default_factory=ForceFieldConfig, description="Force field settings"
)
engine: Literal["openmm", "gromacs"] = Field(..., description="Simulation engine to use")
openmm: OpenMMEngineConfig = Field(
default_factory=OpenMMEngineConfig, description="OpenMM engine settings"
)
gromacs: GromacsEngineConfig = Field(
default_factory=GromacsEngineConfig, description="GROMACS engine settings"
)
[docs]
@classmethod
def from_yaml(cls, path: str | Path) -> "SimulationConfig":
"""Load configuration from a YAML file.
Args:
path: Path to the YAML configuration file
Returns:
SimulationConfig instance
Raises:
FileNotFoundError: If file doesn't exist
ValidationError: If configuration is invalid
"""
from polyzymd.config.loader import load_config
return load_config(path)
[docs]
def to_yaml(self, path: str | Path) -> None:
"""Save configuration to a YAML file.
Args:
path: Path to save the configuration file
"""
from polyzymd.config.loader import save_config
save_config(self, path)
[docs]
def get_working_directory(self, replicate: int = 1) -> Path:
"""Get the working directory path for a given replicate (in scratch).
This returns the path where simulation output (trajectories, checkpoints)
will be written, which is in the scratch directory.
Args:
replicate: Replicate number
Returns:
Path to the working directory (in scratch)
"""
dir_name = self._format_run_directory_name(replicate)
return self.output.effective_scratch_directory / dir_name
[docs]
def get_projects_directory(self) -> Path:
"""Get the projects directory path.
This is where scripts, configs, and logs are stored.
Returns:
Path to the projects directory
"""
return self.output.projects_directory
[docs]
def discover_replicate_dirs(self) -> list[tuple[int, Path]]:
"""Auto-detect all replicate directories on disk.
Builds a glob pattern from the naming template with
``replicate="*"`` and scans the effective scratch directory.
Returns:
Sorted list of ``(replicate_number, directory_path)`` tuples.
"""
import re
# Build the same template values as _format_run_directory_name
# but substitute replicate with "*" for globbing.
polymer_type = "none"
if self.polymers and self.polymers.enabled:
probs = {m.label: m.probability for m in self.polymers.monomers}
sorted_labels = sorted(probs.keys())
composition = "_".join(f"{lbl}{probs[lbl] * 100:.0f}" for lbl in sorted_labels)
polymer_type = f"{self.polymers.type_prefix}_{composition}"
substrate_name = self.substrate.name if self.substrate else "apo"
glob_pattern = self.output.naming_template.format(
enzyme=self.enzyme.name,
substrate=substrate_name.replace("-", ""),
polymer_type=polymer_type,
temperature=int(self.thermodynamics.temperature),
replicate="*",
duration=int(self.simulation_phases.production.duration),
)
scratch = self.output.effective_scratch_directory
results: list[tuple[int, Path]] = []
for path in scratch.glob(glob_pattern):
if not path.is_dir():
continue
match = re.search(r"run(\d+)$", path.name)
if match:
results.append((int(match.group(1)), path))
results.sort(key=lambda t: t[0])
return results
def _format_run_directory_name(self, replicate: int) -> str:
"""Format the run directory name for a replicate.
Args:
replicate: Replicate number
Returns:
Formatted directory name
"""
polymer_type = "none"
if self.polymers and self.polymers.enabled:
# Format polymer type with full composition info
# Sort by label (A, B, C...) alphabetically - user controls which is "A"
probs = {m.label: m.probability for m in self.polymers.monomers}
sorted_labels = sorted(probs.keys())
composition = "_".join(f"{lbl}{probs[lbl] * 100:.0f}" for lbl in sorted_labels)
polymer_type = f"{self.polymers.type_prefix}_{composition}"
substrate_name = self.substrate.name if self.substrate else "apo"
return self.output.format_directory_name(
enzyme=self.enzyme.name,
substrate=substrate_name.replace("-", ""),
polymer_type=polymer_type,
temperature=self.thermodynamics.temperature,
replicate=replicate,
duration=self.simulation_phases.production.duration,
)
[docs]
def to_signac_statepoint(self, replicate: int = 1) -> dict[str, Any]:
"""Convert configuration to a Signac-compatible state point dictionary.
Args:
replicate: Replicate number
Returns:
Dictionary suitable for use as a Signac state point
"""
statepoint = {
"enzyme": self.enzyme.name,
"temperature": self.thermodynamics.temperature,
"replicate": replicate,
}
if self.substrate:
statepoint["substrate"] = self.substrate.name
statepoint["substrate_conformer"] = self.substrate.conformer_index
if self.polymers and self.polymers.enabled:
statepoint["polymer_type"] = self.polymers.type_prefix
statepoint["polymer_length"] = self.polymers.length
statepoint["polymer_count"] = self.polymers.count
# Include monomer probabilities
for monomer in self.polymers.monomers:
statepoint[f"monomer_{monomer.label}_prob"] = monomer.probability
else:
statepoint["polymer_type"] = "none"
# Include co-solvent info
for cosolvent in self.solvent.co_solvents:
if cosolvent.volume_fraction is not None:
statepoint[f"cosolvent_{cosolvent.name}_fraction"] = cosolvent.volume_fraction
elif cosolvent.concentration is not None:
statepoint[f"cosolvent_{cosolvent.name}_molarity"] = cosolvent.concentration
return statepoint