"""
Configuration schema for PolyzyMD simulations.
This module defines Pydantic models for all configuration sections,
providing validation, type safety, and YAML/JSON serialization support.
"""
from __future__ import annotations
import logging
import re
import shlex
from enum import Enum
from pathlib import Path
from string import Formatter
from typing import Any, Literal, Self
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
LOGGER = logging.getLogger(__name__)
_TOKEN_UNSAFE_CHARS = re.compile(r"[^A-Za-z0-9_-]+")
_TOKEN_UNDERSCORES = re.compile(r"_+")
def _format_safe_token(value: object) -> str:
"""Convert a value into a safe lowercase naming token.
Parameters
----------
value : object
Value to normalize for a directory or job-name token.
Returns
-------
str
Sanitized token using only alphanumerics, underscores, and hyphens.
"""
token = str(value).strip().lower()
token = _TOKEN_UNSAFE_CHARS.sub("_", token)
token = _TOKEN_UNDERSCORES.sub("_", token)
return token.strip("_") or "unknown"
def _format_decimal_token(value: float) -> str:
"""Format a decimal value without raw decimal points.
Parameters
----------
value : float
Numeric value to format.
Returns
-------
str
Compact token, using ``p`` as the decimal separator when needed.
"""
formatted = f"{value:g}"
return formatted.replace(".", "p")
[docs]
class ChargeMethod(str, Enum):
"""Supported charge assignment methods for small molecules."""
NAGL = "nagl"
ESPALOMA = "espaloma"
AM1BCC = "am1bcc"
[docs]
class WaterModel(str, Enum):
"""Supported water models."""
TIP3P = "tip3p"
SPCE = "spce"
TIP4P = "tip4p"
TIP4PEW = "tip4pew"
OPC = "opc"
[docs]
class BoxShape(str, Enum):
"""Supported simulation box shapes."""
CUBE = "cube"
RHOMBIC_DODECAHEDRON = "rhombic_dodecahedron"
TRUNCATED_OCTAHEDRON = "truncated_octahedron"
[docs]
class Ensemble(str, Enum):
"""Thermodynamic ensemble types."""
NVT = "NVT"
NPT = "NPT"
NVE = "NVE"
[docs]
class ThermostatType(str, Enum):
"""Supported thermostat types."""
LANGEVIN_MIDDLE = "LangevinMiddle"
LANGEVIN = "Langevin"
ANDERSEN = "Andersen"
NOSE_HOOVER = "NoseHoover"
[docs]
class BarostatType(str, Enum):
"""Supported barostat types."""
MONTE_CARLO = "MC"
MONTE_CARLO_ANISOTROPIC = "MCA"
[docs]
class RestraintType(str, Enum):
"""Types of restraints that can be applied."""
FLAT_BOTTOM = "flat_bottom"
HARMONIC = "harmonic"
UPPER_WALL = "upper_wall"
LOWER_WALL = "lower_wall"
# =============================================================================
# Enzyme Configuration
# =============================================================================
[docs]
class EnzymeConfig(BaseModel):
"""Configuration for the enzyme/protein component.
Attributes:
name: Identifier for the enzyme (e.g., "LipA")
pdb_path: Path to the PDB file containing the enzyme structure
description: Optional description of the enzyme
"""
name: str = Field(..., description="Enzyme identifier")
pdb_path: Path = Field(..., description="Path to enzyme PDB file")
description: str | None = Field(None, description="Optional description")
[docs]
@field_validator("pdb_path")
@classmethod
def validate_pdb_path(cls, v: Path) -> Path:
"""Validate that PDB path has correct extension."""
if v.suffix.lower() != ".pdb":
raise ValueError(f"Expected .pdb file, got {v.suffix}")
return v
# =============================================================================
# Substrate/Ligand Configuration
# =============================================================================
[docs]
class SubstrateConfig(BaseModel):
"""Configuration for the docked substrate/ligand.
Attributes:
name: Identifier for the substrate (e.g., "Resorufin-Butyrate")
sdf_path: Path to SDF file with docked conformers
conformer_index: Which conformer to use (0-indexed)
charge_method: Method for assigning partial charges
residue_name: 3-letter residue name for topology (default: "LIG")
"""
name: str = Field(..., description="Substrate identifier")
sdf_path: Path = Field(..., description="Path to docked conformers SDF")
conformer_index: int = Field(0, ge=0, description="Index of conformer to use")
charge_method: ChargeMethod = Field(ChargeMethod.NAGL, description="Charge assignment method")
residue_name: str = Field("LIG", max_length=3, description="3-letter residue name")
[docs]
@field_validator("sdf_path")
@classmethod
def validate_sdf_path(cls, v: Path) -> Path:
"""Validate that SDF path has correct extension."""
if v.suffix.lower() != ".sdf":
raise ValueError(f"Expected .sdf file, got {v.suffix}")
return v
# =============================================================================
# Polymer Packing Configuration
# =============================================================================
[docs]
class PolymerPackingConfig(BaseModel):
"""Settings for packing polymers around the solute.
Controls the box size and PACKMOL behavior when packing polymers
around the protein-ligand complex.
Attributes:
padding: Box padding around the solute in nanometers. Larger values
give polymers more room and can speed up PACKMOL convergence.
tolerance: Minimum molecular spacing for PACKMOL in Angstrom.
movebadrandom: When ``True``, pass the ``movebadrandom`` keyword to
PACKMOL. This places badly-packed molecules at random positions
in the box rather than near well-packed neighbours, which
improves convergence for dense or heterogeneous systems (many
unique chain types). Has no effect when only a single chain
type is present. Default is ``False`` (PACKMOL default
behaviour is preserved).
box_vectors: Optional explicit box dimensions ``[Lx, Ly, Lz]`` in
nanometers. When set, overrides the auto-computed bounding box
plus *padding*. The protein is centered at the midpoint of
the box. Default is ``None`` (auto-compute from solute
bounding box + padding).
Example:
>>> PolymerPackingConfig(padding=2.5, movebadrandom=True)
>>> PolymerPackingConfig(box_vectors=[8.0, 10.0, 12.0])
"""
padding: float = Field(2.0, gt=0.0, description="Box padding around solute (nm)")
tolerance: float = Field(2.0, gt=0.0, description="PACKMOL tolerance (Angstrom)")
movebadrandom: bool = Field(
False,
description=(
"Pass the 'movebadrandom' keyword to PACKMOL. "
"Improves convergence for heterogeneous polymer systems "
"by placing badly-packed molecules at random box positions."
),
)
box_vectors: list[float] | None = Field(
None,
min_length=3,
max_length=3,
description=(
"Optional explicit box dimensions [Lx, Ly, Lz] in nanometers. "
"Overrides auto-computed bounding box + padding. "
"Protein is centered at the midpoint of this box."
),
)
# =============================================================================
# Polymer Configuration
# =============================================================================
[docs]
class MonomerSpec(BaseModel):
"""Specification for a single monomer type in a co-polymer.
For dynamic polymer generation, provide the raw (unactivated) monomer SMILES.
The system will run initiation reactions to create the active fragments.
Attributes:
label: Single character label for this monomer (e.g., "A", "B")
probability: Probability of selecting this monomer (0-1)
name: Optional full name (e.g., "SBMA", "EGPMA")
smiles: Raw monomer SMILES string (required for dynamic generation)
residue_name: 3-character PDB residue name (auto-generated if not provided)
"""
label: str = Field(..., min_length=1, max_length=1, description="Monomer label")
probability: float = Field(..., ge=0.0, le=1.0, description="Selection probability")
name: str | None = Field(None, description="Full monomer name")
smiles: str | None = Field(
None,
description="Raw monomer SMILES (required for dynamic generation mode)",
)
residue_name: str | None = Field(
None,
max_length=3,
description="3-char PDB residue name (auto-generated from name if not provided)",
)
[docs]
@model_validator(mode="after")
def auto_generate_residue_name(self) -> "MonomerSpec":
"""Auto-generate residue name from monomer name if not provided."""
if self.residue_name is None and self.name is not None:
# Use first 3 characters of name, uppercase
object.__setattr__(self, "residue_name", self.name[:3].upper())
return self
[docs]
class PolymerGenerationMode(str, Enum):
"""Mode for polymer generation."""
CACHED = "cached" # Load pre-built SDF files from disk
DYNAMIC = "dynamic" # Generate polymers on-the-fly using Polymerist
[docs]
class ReactionConfig(BaseModel):
"""Paths to reaction templates for ATRP polymer generation.
These .rxn files define the chemical transformations used to create
polymer fragments from raw monomer SMILES. For ATRP, this includes:
- Initiation: Activates the vinyl group (e.g., chlorination)
- Polymerization: Creates chain-extending fragments
- Termination: Restores the alkene for chain ends
You can use "default" as a special value to use the bundled ATRP
methacrylate reaction templates that ship with PolyzyMD.
Example:
reactions:
initiation: "default"
polymerization: "default"
termination: "default"
Attributes:
initiation: Path to the initiation reaction template (.rxn) or "default"
polymerization: Path to the polymerization reaction template (.rxn) or "default"
termination: Path to the termination reaction template (.rxn) or "default"
"""
initiation: Path = Field(..., description="Path to ATRP initiation .rxn file or 'default'")
polymerization: Path = Field(
..., description="Path to ATRP polymerization .rxn file or 'default'"
)
termination: Path = Field(..., description="Path to ATRP termination .rxn file or 'default'")
[docs]
@field_validator("initiation", "polymerization", "termination", mode="before")
@classmethod
def resolve_default_paths(cls, v, info) -> Path:
"""Resolve 'default' to bundled ATRP reaction paths."""
from polyzymd.data.reactions import (
get_atrp_initiation_path,
get_atrp_polymerization_path,
get_atrp_termination_path,
)
if isinstance(v, str) and v.lower() == "default":
# Map field name to the appropriate getter
field_to_getter = {
"initiation": get_atrp_initiation_path,
"polymerization": get_atrp_polymerization_path,
"termination": get_atrp_termination_path,
}
getter = field_to_getter.get(info.field_name)
if getter:
return getter()
raise ValueError(f"Unknown reaction field: {info.field_name}")
# Convert string to Path if needed
if isinstance(v, str):
v = Path(v)
# Validate .rxn extension
if isinstance(v, Path) and v.suffix.lower() != ".rxn":
raise ValueError(f"Expected .rxn file, got {v.suffix}")
return v
[docs]
class PolymerConfig(BaseModel):
"""Configuration for polymer components.
Supports two generation modes:
- "cached": Load pre-built polymer SDF files from sdf_directory
- "dynamic": Generate polymers on-the-fly using Polymerist from SMILES
For dynamic mode, you must provide:
- SMILES for each monomer in monomers[].smiles
- Reaction templates in the reactions field
Attributes:
enabled: Whether to include polymers in the system
generation_mode: "cached" for pre-built SDFs, "dynamic" for on-the-fly generation
type_prefix: Prefix for polymer type in filenames (e.g., "SBMA-EGPMA")
monomers: List of monomer specifications with probabilities (and SMILES for dynamic)
length: Number of monomer units per polymer chain
count: Number of polymer chains to add
sdf_directory: Path to pre-built polymer SDF files (for cached mode)
reactions: Reaction templates for ATRP (required for dynamic mode)
charger: Charge assignment method for generated polymers
max_retries: Maximum retries for polymer generation (ring-piercing failures)
cache_directory: Directory for caching generated polymers and fragments
packing: Settings for packing polymers around the solute
random_seed: Random seed for polymer sequence generation (for reproducibility)
"""
enabled: bool = Field(True, description="Include polymers in system")
generation_mode: PolymerGenerationMode = Field(
PolymerGenerationMode.CACHED,
description="Polymer generation mode: 'cached' (pre-built SDFs) or 'dynamic' (generate from SMILES)",
)
type_prefix: str = Field(..., description="Polymer type prefix for filenames")
monomers: list[MonomerSpec] = Field(..., min_length=1, description="Monomer specifications")
length: int = Field(..., ge=1, description="Monomers per chain")
count: int = Field(..., ge=1, description="Number of polymer chains")
sdf_directory: Path | None = Field(
None, description="Directory with pre-built polymer SDFs (for cached mode)"
)
reactions: ReactionConfig | None = Field(
None, description="ATRP reaction templates (required for dynamic mode)"
)
charger: ChargeMethod = Field(
ChargeMethod.NAGL,
description="Charge assignment method for generated polymers",
)
max_retries: int = Field(
10,
ge=1,
description="Maximum retries for polymer generation (handles ring-piercing failures)",
)
cache_directory: Path = Field(
Path(".polymer_cache"), description="Cache directory for generated polymers and fragments"
)
packing: PolymerPackingConfig = Field(
default_factory=PolymerPackingConfig,
description="Polymer packing settings (padding, tolerance)",
)
random_seed: int | None = Field(
None,
description="Random seed for polymer sequence generation. If None, uses replicate number.",
)
[docs]
@model_validator(mode="after")
def validate_probabilities_sum_to_one(self) -> "PolymerConfig":
"""Ensure monomer probabilities sum to 1.0."""
if self.enabled:
total = sum(m.probability for m in self.monomers)
if abs(total - 1.0) > 1e-6:
raise ValueError(f"Monomer probabilities must sum to 1.0, got {total}")
return self
[docs]
@model_validator(mode="after")
def validate_generation_mode_requirements(self) -> "PolymerConfig":
"""Validate that required fields are present for the selected generation mode."""
if not self.enabled:
return self
if self.generation_mode == PolymerGenerationMode.DYNAMIC:
# Dynamic mode requires SMILES for all monomers
missing_smiles = [m.label for m in self.monomers if m.smiles is None]
if missing_smiles:
raise ValueError(
f"Dynamic generation mode requires 'smiles' for all monomers. "
f"Missing SMILES for monomers: {missing_smiles}"
)
# Dynamic mode requires reaction templates
if self.reactions is None:
raise ValueError(
"Dynamic generation mode requires 'reactions' field with ATRP reaction templates"
)
elif self.generation_mode == PolymerGenerationMode.CACHED:
# Cached mode requires sdf_directory
if self.sdf_directory is None:
raise ValueError(
"Cached generation mode requires 'sdf_directory' with pre-built polymer SDFs"
)
return self
# =============================================================================
# Solvent Configuration
# =============================================================================
[docs]
class CoSolventSpec(BaseModel):
"""Specification for a co-solvent component.
You must specify either ``mole_fraction`` or ``concentration``, not both.
For co-solvents in the built-in library (dmso, dmf, urea, ethanol, etc.),
you can omit the smiles and density fields - they will be looked up
automatically.
Example (library co-solvent with mole fraction):
>>> CoSolventSpec(name="dmso", mole_fraction=0.10)
Example (library co-solvent with concentration):
>>> CoSolventSpec(name="urea", concentration=2.0)
Example (custom co-solvent):
>>> CoSolventSpec(
... name="my_solvent",
... smiles="CCOC(=O)C",
... mole_fraction=0.05
... )
"""
name: str = Field(..., description="Co-solvent identifier")
smiles: str | None = Field(None, description="SMILES string (optional if in library)")
# Specification method 1: Mole fraction
mole_fraction: float | None = Field(
None,
gt=0.0,
lt=1.0,
description="Mole fraction (0-1), e.g., 0.10 for 10 mol%",
)
# Specification method 2: Molar concentration
concentration: float | None = Field(None, gt=0.0, description="Molar concentration (mol/L)")
# Optional physical property for library metadata
density: float | None = Field(
None,
gt=0.0,
description="Optional density in g/mL for co-solvent metadata",
)
residue_name: str | None = Field(None, max_length=3, description="Residue name")
[docs]
@model_validator(mode="before")
@classmethod
def reject_volume_fraction(cls, data: Any) -> Any:
"""Reject removed volume-fraction configuration keys."""
if isinstance(data, dict) and "volume_fraction" in data:
raise ValueError(
"Co-solvent 'volume_fraction' has been removed. Use 'mole_fraction' "
"for solvent mole fractions or 'concentration' for molarity; values are "
"not converted automatically."
)
return data
[docs]
@model_validator(mode="after")
def validate_and_populate(self) -> "CoSolventSpec":
"""Validate specification and populate missing fields from library."""
from polyzymd.data.cosolvent_library import get_cosolvent
# Check that exactly one composition method is specified
has_mole_fraction = self.mole_fraction is not None
has_conc = self.concentration is not None
if not has_mole_fraction and not has_conc:
raise ValueError(
f"Co-solvent '{self.name}': Must specify either 'mole_fraction' "
f"or 'concentration'"
)
if has_mole_fraction and has_conc:
raise ValueError(
f"Co-solvent '{self.name}': Cannot specify both 'mole_fraction' "
f"and 'concentration' - choose one"
)
# Look up from library
library_data = get_cosolvent(self.name)
# Populate SMILES if not provided
if self.smiles is None:
if library_data:
object.__setattr__(self, "smiles", library_data.smiles)
else:
raise ValueError(
f"Co-solvent '{self.name}' not in library. Please provide 'smiles' field."
)
# Preserve library density as metadata when available
if self.density is None and library_data:
object.__setattr__(self, "density", library_data.density)
# Set default residue name
if self.residue_name is None:
object.__setattr__(self, "residue_name", self.name[:3].upper())
return self
[docs]
class PrimarySolventConfig(BaseModel):
"""Configuration for the primary solvent (usually water).
Attributes:
type: Solvent type identifier
model: Water model to use (if type is "water")
"""
type: str = Field("water", description="Primary solvent type")
model: WaterModel = Field(WaterModel.TIP3P, description="Water model")
[docs]
class IonConfig(BaseModel):
"""Configuration for ions in the solvent.
``nacl_concentration`` is a NaCl-equivalent target for the final ion
concentration. When ``neutralize`` is true, PolyzyMD adjusts Na+ and Cl-
counts to cancel solute charge while keeping the achieved concentration
near this target; it is not additional salt layered on top of neutralizing
ions.
"""
neutralize: bool = Field(True, description="Neutralize system charge")
nacl_concentration: float = Field(
0.0,
ge=0.0,
description="NaCl-equivalent target concentration (mol/L)",
)
kcl_concentration: float = Field(0.0, ge=0.0, description="KCl conc. (mol/L)")
mgcl2_concentration: float = Field(0.0, ge=0.0, description="MgCl2 conc. (mol/L)")
[docs]
class BoxConfig(BaseModel):
"""Configuration for the simulation box.
Attributes:
padding: Distance from solute to box edge in nm
shape: Box geometry
target_density: Target density in g/mL
tolerance: Minimum molecular spacing for PACKMOL in Angstrom
"""
padding: float = Field(1.2, gt=0.0, description="Box padding (nm)")
shape: BoxShape = Field(BoxShape.RHOMBIC_DODECAHEDRON, description="Box shape")
target_density: float = Field(1.0, gt=0.0, description="Target density (g/mL)")
tolerance: float = Field(2.0, gt=0.0, description="PACKMOL tolerance (Angstrom)")
[docs]
class SolventConfig(BaseModel):
"""Complete solvent configuration."""
primary: PrimarySolventConfig = Field(
default_factory=PrimarySolventConfig, description="Primary solvent"
)
co_solvents: list[CoSolventSpec] = Field(default_factory=list, description="Co-solvents")
ions: IonConfig = Field(default_factory=IonConfig, description="Ion settings")
box: BoxConfig = Field(default_factory=BoxConfig, description="Box settings")
[docs]
@model_validator(mode="after")
def validate_mole_fractions(self) -> "SolventConfig":
"""Ensure co-solvent mole fractions leave room for water."""
total = sum(cs.mole_fraction for cs in self.co_solvents if cs.mole_fraction is not None)
if total >= 1.0:
raise ValueError(f"Total co-solvent mole fraction must be < 1.0, got {total}")
return self
# =============================================================================
# Restraint Configuration
# =============================================================================
[docs]
class AtomSelectionConfig(BaseModel):
"""Configuration for selecting atoms for restraints.
Uses MDAnalysis-compatible selection syntax for flexibility.
Attributes:
selection: MDAnalysis selection string (e.g., "resid 77 and name OG")
description: Optional human-readable description
"""
selection: str = Field(..., description="MDAnalysis selection string")
description: str | None = Field(None, description="Human-readable description")
[docs]
class RestraintConfig(BaseModel):
"""Configuration for a single restraint.
Attributes:
type: Type of restraint (flat_bottom, harmonic, etc.)
name: Identifier for this restraint
atom1: First atom selection
atom2: Second atom selection
distance: Target/threshold distance in Angstroms
force_constant: Force constant in kJ/mol/nm^2
enabled: Whether this restraint is active
"""
type: RestraintType = Field(..., description="Restraint type")
name: str = Field(..., description="Restraint identifier")
atom1: AtomSelectionConfig = Field(..., description="First atom selection")
atom2: AtomSelectionConfig = Field(..., description="Second atom selection")
distance: float = Field(..., gt=0.0, description="Distance threshold (Angstrom)")
force_constant: float = Field(10000.0, gt=0.0, description="Force constant (kJ/mol/nm^2)")
enabled: bool = Field(True, description="Whether restraint is active")
# =============================================================================
# Thermodynamics Configuration
# =============================================================================
[docs]
class ThermodynamicsConfig(BaseModel):
"""Thermodynamic conditions for the simulation.
Attributes:
temperature: System temperature in Kelvin
pressure: System pressure in atmospheres (for NPT)
salt_concentration: Ionic strength in mol/L (deprecated, use solvent.ions)
"""
temperature: float = Field(..., gt=0.0, description="Temperature (K)")
pressure: float = Field(1.0, gt=0.0, description="Pressure (atm)")
# =============================================================================
# Simulation Phase Configuration
# =============================================================================
[docs]
class SimulationPhaseConfig(BaseModel):
"""Configuration for a single simulation phase (equilibration or production).
Attributes:
ensemble: Thermodynamic ensemble (NVT, NPT)
duration: Simulation duration in nanoseconds
samples: Number of trajectory frames to save
report_interval: Explicit reporter interval in MD steps
time_step: Integration time step in femtoseconds
thermostat: Thermostat type
thermostat_timescale: Thermostat coupling timescale in ps
barostat: Barostat type (for NPT)
barostat_frequency: Barostat update frequency (steps)
checkpoint_interval: Wall-time interval (seconds) between restart
checkpoints for preemption resilience
"""
ensemble: Ensemble = Field(..., description="Thermodynamic ensemble")
duration: float = Field(..., gt=0.0, description="Duration (ns)")
samples: int = Field(..., ge=1, description="Trajectory frames to save")
report_interval: int = Field(..., ge=1, description="Reporter interval in MD steps")
time_step: float = Field(2.0, gt=0.0, description="Time step (fs)")
thermostat: ThermostatType = Field(
ThermostatType.LANGEVIN_MIDDLE, description="Thermostat type"
)
thermostat_timescale: float = Field(1.0, gt=0.0, description="Thermostat timescale (ps)")
barostat: BarostatType | None = Field(None, description="Barostat type")
barostat_frequency: int = Field(25, ge=1, description="Barostat update frequency")
checkpoint_interval: float = Field(
...,
gt=0.0,
description=(
"Wall-time interval in seconds between restart checkpoints. "
"Controls how frequently simulation state is saved for automatic "
"restart on SLURM preemption or hard kill. Independent of "
"trajectory/reporter output frequency. Must be positive so "
"unexpected termination recovery remains enabled."
),
)
[docs]
@model_validator(mode="after")
def validate_ensemble_barostat(self) -> "SimulationPhaseConfig":
"""Ensure NPT ensemble has a barostat configured."""
if self.ensemble == Ensemble.NPT and self.barostat is None:
self.barostat = BarostatType.MONTE_CARLO
return self
# =============================================================================
# Equilibration Stage Configuration (Multi-Stage Equilibration)
# =============================================================================
[docs]
class PositionRestraintConfig(BaseModel):
"""Configuration for positional restraints on an atom group.
Position restraints apply a harmonic potential to keep atoms near their
initial coordinates. This is commonly used during equilibration to
prevent large structural changes while the system relaxes.
Attributes:
group: Predefined atom group name
force_constant: Force constant in kJ/mol/nm^2 (4184.0 = 1.0 kcal/mol/A^2)
"""
group: str = Field(
...,
description=(
"Atom group: protein_heavy, protein_backbone, protein_calpha, "
"ligand_heavy, polymer_heavy, solvent, water_only, ions_only, cosolvents_only"
),
)
force_constant: float = Field(
4184.0, # 1.0 kcal/mol/A^2 in kJ/mol/nm^2
gt=0.0,
description="Force constant (kJ/mol/nm^2). Default 4184.0 = 1.0 kcal/mol/A^2",
)
[docs]
@field_validator("group")
@classmethod
def validate_group_name(cls, v: str) -> str:
"""Validate that the group name is a recognized predefined group."""
from polyzymd.core.atom_groups import PREDEFINED_GROUPS
if v not in PREDEFINED_GROUPS:
raise ValueError(
f"Unknown atom group: '{v}'. Valid groups: {sorted(PREDEFINED_GROUPS)}"
)
return v
[docs]
class EquilibrationStageConfig(BaseModel):
"""Configuration for a single equilibration stage.
Supports two temperature modes:
1. Constant temperature: Set 'temperature' field.
2. Temperature ramping (simulated annealing): Set 'temperature_start'
and 'temperature_end' fields.
Position restraints can be applied to hold specific atom groups in place
during the stage.
Attributes:
name: Stage identifier (used in output paths)
duration: Stage duration in nanoseconds
samples: Number of trajectory frames to save
ensemble: Thermodynamic ensemble (NVT or NPT)
temperature: Constant temperature in K (mutually exclusive with ramping)
temperature_start: Starting temperature for ramping in K
temperature_end: Ending temperature for ramping in K
temperature_increment: Temperature increment per update in K
temperature_interval: Time between temperature updates in fs
position_restraints: List of position restraints for this stage
time_step: Optional time step override in fs
thermostat: Optional thermostat type override
thermostat_timescale: Optional thermostat timescale override in ps
barostat: Optional barostat type (for NPT ensemble)
barostat_frequency: Optional barostat update frequency
"""
name: str = Field(..., description="Stage identifier (used in output paths)")
duration: float = Field(..., gt=0.0, description="Duration (ns)")
samples: int = Field(100, ge=1, description="Trajectory frames to save")
ensemble: Ensemble = Field(Ensemble.NVT, description="Thermodynamic ensemble")
# Constant temperature mode
temperature: float | None = Field(
None,
gt=0.0,
description="Constant temperature (K). Mutually exclusive with temperature ramping.",
)
# Temperature ramping mode (simulated annealing)
temperature_start: float | None = Field(
None, gt=0.0, description="Starting temperature for ramping (K)"
)
temperature_end: float | None = Field(
None, gt=0.0, description="Ending temperature for ramping (K)"
)
temperature_increment: float = Field(
1.0, gt=0.0, description="Temperature increment per update (K)"
)
temperature_interval: float = Field(
1200.0, gt=0.0, description="Time between temperature updates (fs)"
)
# Position restraints
position_restraints: list[PositionRestraintConfig] = Field(
default_factory=list, description="Position restraints for this stage"
)
# Optional overrides (inherit from parent/defaults if not specified)
time_step: float | None = Field(None, gt=0.0, description="Time step (fs)")
thermostat: ThermostatType | None = Field(None, description="Thermostat type")
thermostat_timescale: float | None = Field(
None, gt=0.0, description="Thermostat timescale (ps)"
)
barostat: BarostatType | None = Field(None, description="Barostat type (for NPT)")
barostat_frequency: int | None = Field(None, ge=1, description="Barostat update frequency")
[docs]
@model_validator(mode="after")
def validate_temperature_mode(self) -> "EquilibrationStageConfig":
"""Ensure valid temperature specification."""
has_constant = self.temperature is not None
has_start = self.temperature_start is not None
has_end = self.temperature_end is not None
if has_constant and (has_start or has_end):
raise ValueError(
"Cannot specify both 'temperature' and temperature ramping "
"('temperature_start'/'temperature_end')"
)
if not has_constant and not (has_start and has_end):
raise ValueError(
"Must specify either 'temperature' for constant temperature, "
"or both 'temperature_start' and 'temperature_end' for ramping"
)
if has_start and has_end and self.temperature_start > self.temperature_end:
raise ValueError(
f"temperature_start ({self.temperature_start}) must be <= "
f"temperature_end ({self.temperature_end})"
)
return self
[docs]
@model_validator(mode="after")
def validate_npt_barostat(self) -> "EquilibrationStageConfig":
"""Ensure NPT ensemble has a barostat configured."""
if self.ensemble == Ensemble.NPT and self.barostat is None:
self.barostat = BarostatType.MONTE_CARLO
return self
@property
def is_temperature_ramping(self) -> bool:
"""Check if this stage uses temperature ramping."""
return self.temperature_start is not None
[docs]
def get_start_temperature(self) -> float:
"""Get the starting temperature of this stage."""
if self.is_temperature_ramping:
return self.temperature_start
return self.temperature
[docs]
def get_final_temperature(self) -> float:
"""Get the final temperature of this stage."""
if self.is_temperature_ramping:
return self.temperature_end
return self.temperature
[docs]
class SimulationPhasesConfig(BaseModel):
"""Configuration for all simulation phases.
Attributes:
equilibration_stages: Multi-stage equilibration protocol
production: Production phase settings
"""
model_config = ConfigDict(extra="forbid")
equilibration_stages: list[EquilibrationStageConfig] | None = Field(
None, description="Multi-stage equilibration protocol with position restraints"
)
production: SimulationPhaseConfig = Field(..., description="Production settings")
[docs]
@model_validator(mode="after")
def validate_equilibration_mode(self) -> "SimulationPhasesConfig":
"""Ensure staged equilibration is configured."""
if self.equilibration_stages is None:
raise ValueError(
"PolyzyMD now requires 'equilibration_stages' in simulation_phases. "
"The legacy 'equilibration' block is no longer supported. "
"Convert your config to one or more staged equilibration entries."
)
if len(self.equilibration_stages) == 0:
raise ValueError("'equilibration_stages' must contain at least one stage")
return self
@property
def total_equilibration_duration(self) -> float:
"""Total equilibration duration in nanoseconds.
Returns the sum of all stage durations.
"""
return sum(stage.duration for stage in self.equilibration_stages)
@property
def total_equilibration_samples(self) -> int:
"""Total equilibration trajectory samples.
Returns the sum of all stage samples.
"""
return sum(stage.samples for stage in self.equilibration_stages)
# =============================================================================
# Output Configuration
# =============================================================================
[docs]
def expand_path(path: Path) -> Path:
"""Expand environment variables and user home in a path.
Supports:
- $VAR and ${VAR} syntax for environment variables
- ~ for user home directory
Args:
path: Path that may contain environment variables
Returns:
Path with variables expanded
"""
import os
expanded = os.path.expandvars(str(path))
expanded = os.path.expanduser(expanded)
return Path(expanded)
[docs]
class OutputConfig(BaseModel):
"""Configuration for simulation output.
Supports separate directories for:
- scripts/logs (projects_directory): Where job scripts and SLURM logs are written
- simulation data (scratch_directory): Where trajectories, checkpoints go
This separation allows running simulations on HPC systems where code lives
in long-term storage (projects) but data is written to high-performance
scratch storage.
Attributes:
projects_directory: Directory for scripts, configs, logs (long-term storage)
scratch_directory: Directory for simulation output (high-performance storage)
naming_template: Template for naming working directories
job_scripts_subdir: Subdirectory name for job scripts within projects
slurm_logs_subdir: Subdirectory name for SLURM logs within projects
save_checkpoint: Whether to save checkpoint files
save_state_data: Whether to save thermodynamic state data
trajectory_format: Output trajectory format
Example YAML:
output:
projects_directory: /projects/user/polyzymd
scratch_directory: /scratch/alpine/user/simulations
naming_template: "{enzyme}_{substrate}_{temperature}K_run{replicate}"
"""
model_config = ConfigDict(extra="forbid")
# Directory configuration
projects_directory: Path = Field(
Path("."),
description="Base directory for scripts and logs (typically in projects/long-term storage)",
)
scratch_directory: Path | None = Field(
None,
description="Base directory for simulation output (scratch storage). If None, uses projects_directory.",
)
naming_template: str = Field(
"{enzyme}_{substrate}_{polymer_type}_{duration}ns_{temperature}K_run{replicate}",
description="Directory naming template",
)
# Subdirectory organization
job_scripts_subdir: str = Field(
"job_scripts",
description="Subdirectory for generated job scripts",
)
slurm_logs_subdir: str = Field(
"slurm_logs",
description="Subdirectory for SLURM output logs",
)
# Output options
save_checkpoint: bool = Field(True, description="Save checkpoint files")
save_state_data: bool = Field(True, description="Save state data CSV")
trajectory_format: str = Field("dcd", description="Trajectory file format")
[docs]
@field_validator("projects_directory", "scratch_directory", mode="before")
@classmethod
def expand_env_vars_in_paths(cls, v: str | Path | None) -> Path | None:
"""Expand environment variables and ~ in path fields.
Supports $USER, ${HOME}, ~/path, etc.
"""
if v is None:
return None
return expand_path(Path(v))
@property
def effective_scratch_directory(self) -> Path:
"""Get the effective scratch directory (falls back to projects if not set)."""
return self.scratch_directory if self.scratch_directory else self.projects_directory
[docs]
def get_job_scripts_directory(self) -> Path:
"""Get the directory for job scripts.
Returns:
Path to job scripts directory (within projects)
"""
return self.projects_directory / self.job_scripts_subdir
[docs]
def get_slurm_logs_directory(self) -> Path:
"""Get the directory for SLURM log files.
Returns:
Path to SLURM logs directory (within projects)
"""
return self.projects_directory / self.slurm_logs_subdir
# =============================================================================
# Force Field Configuration
# =============================================================================
[docs]
class ForceFieldConfig(BaseModel):
"""Configuration for force field selection.
Attributes:
protein: Force field for proteins
small_molecule: Force field for small molecules
water: Water model force field (derived from solvent config)
"""
protein: str = Field("ff14sb_off_impropers_0.0.4.offxml", description="Protein force field")
small_molecule: str = Field("openff-2.0.0.offxml", description="Small molecule force field")
# =============================================================================
# Engine Configuration
# =============================================================================
[docs]
class OpenMMEngineConfig(BaseModel):
"""OpenMM-specific engine settings.
These settings control OpenMM platform selection and device configuration.
Only relevant when ``engine`` is ``"openmm"``.
Attributes:
platform: OpenMM platform to use for computation.
device_index: GPU device index (platform-specific).
precision: Floating-point precision mode.
"""
platform: str = Field("CUDA", description="OpenMM compute platform")
device_index: str | None = Field(None, description="GPU device index")
precision: str = Field("mixed", description="Floating-point precision")
[docs]
class GromacsEngineConfig(BaseModel):
"""GROMACS-specific engine settings.
These settings control how GROMACS binaries are located and invoked,
and how SLURM resources are allocated for GROMACS jobs.
GROMACS users configure parallelism via ``ntmpi`` (MPI ranks) and
``ntomp`` (OpenMP threads per rank). These map directly to the
``gmx mdrun -ntmpi`` and ``-ntomp`` flags. The corresponding SLURM
resources (``ntasks``, ``cpus_per_task``, ``memory``) are set to
match.
Set ``gpu: true`` when running a CUDA-enabled GROMACS build on a
GPU node. When ``False`` (default), the SLURM script omits GPU
directives entirely.
Attributes:
gmx_binary: Path or name of the GROMACS binary.
Resolved via config > $GMX_BIN > PATH discovery if None.
mdrun_flags: Additional flags passed to ``gmx mdrun`` (all stages).
grompp_flags: Additional flags passed to ``gmx grompp``.
module_load: Module load command for HPC (e.g. ``"module load gromacs/2024"``)
ntmpi: Number of MPI ranks for ``gmx mdrun -ntmpi``.
Also sets SLURM ``--ntasks``.
ntomp: Number of OpenMP threads per rank for ``gmx mdrun -ntomp``.
Also sets SLURM ``--cpus-per-task``.
gpu: Request a GPU via SLURM. When False, the ``--gres=gpu``
directive is omitted entirely.
gpus: Number of GPUs to request when ``gpu`` is True. Ignored
when ``gpu`` is False.
memory: SLURM ``--mem`` allocation for GROMACS jobs.
"""
gmx_binary: str | None = Field(None, description="GROMACS binary path or name")
mdrun_flags: str = Field("", description="Extra flags for gmx mdrun (all stages)")
grompp_flags: str = Field("-maxwarn 1", description="Extra flags for gmx grompp")
mdrun_flags_equilibration: str | None = Field(
None,
description=(
"Override mdrun_flags for equilibration stages only. "
"When None, falls back to mdrun_flags."
),
)
mdrun_flags_production: str | None = Field(
None,
description=(
"Override mdrun_flags for production only. When None, falls back to mdrun_flags."
),
)
command_prefix: str | None = Field(
None,
description=(
"Prefix prepended to all GROMACS commands. "
"Use for container wrappers, e.g. "
"'singularity exec --rocm --bind $PWD /path/to/gromacs.sif'"
),
)
mpi_launcher_flags: str = Field(
"",
description=(
"Extra flags passed to the MPI launcher (mpirun). "
"Only used when the binary is a real-MPI build. "
"E.g. '-genv I_MPI_FABRICS shm:tcp' for Intel MPI."
),
)
module_load: str | None = Field(None, description="HPC module load command")
env_exports: dict[str, str] = Field(
default_factory=dict,
description=("Environment variables exported before GROMACS commands (key=value pairs)"),
)
setup_commands: list[str] = Field(
default_factory=list,
description="Shell commands run after module_load and before GROMACS commands",
)
ntmpi: int = Field(1, ge=1, description="MPI ranks (-ntmpi); sets SLURM --ntasks")
slurm_ntasks: int | None = Field(
default=None,
ge=1,
description=(
"Override SLURM --ntasks independently of GROMACS -ntmpi. "
"Advanced option for multi-node MPI+GPU workflows where scheduler "
"tasks differ from GROMACS thread-MPI ranks. When None, SLURM ntasks "
"is set from ntmpi (default behavior)."
),
)
ntomp: int = Field(
8,
ge=1,
description="OpenMP threads per rank (-ntomp); sets SLURM --cpus-per-task",
)
gpu: bool = Field(
False,
description="Request GPU via SLURM (set True for CUDA-enabled GROMACS)",
)
gpus: int = Field(
1,
ge=1,
description="Number of GPUs to request via SLURM when gpu is True",
)
memory: str = Field("16G", description="SLURM --mem allocation for GROMACS jobs")
@model_validator(mode="after")
def _warn_gpu_ntmpi(self) -> Self:
"""Warn when GPU mode is combined with multiple thread-MPI ranks.
Returns
-------
Self
The validated config instance
"""
if self.gpu and self.ntmpi > 1:
LOGGER.warning(
"GPU GROMACS uses thread-MPI (gmx mdrun -ntmpi), not real MPI ranks. "
"With ntmpi=%d and gpus=%d, ensure enough GPUs are available or "
"accept GPU sharing.",
self.ntmpi,
self.gpus,
)
return self
@model_validator(mode="after")
def _warn_mdrun_flags_conflict(self) -> Self:
"""Warn when mdrun_flags conflict with explicit ntmpi or ntomp fields.
Returns
-------
Self
The validated config instance
"""
if not self.mdrun_flags:
return self
try:
tokens = shlex.split(self.mdrun_flags)
except ValueError:
return self
flag_map: dict[str, str] = {}
for i, tok in enumerate(tokens):
if tok in ("-ntmpi", "-ntomp") and i + 1 < len(tokens):
flag_map[tok] = tokens[i + 1]
if "-ntmpi" in flag_map:
try:
flag_val = int(flag_map["-ntmpi"])
if flag_val != self.ntmpi:
LOGGER.warning(
"mdrun_flags contains '-ntmpi %s' but config field ntmpi=%d. "
"The flag string will override at runtime.",
flag_map["-ntmpi"],
self.ntmpi,
)
except ValueError:
pass
if "-ntomp" in flag_map:
try:
flag_val = int(flag_map["-ntomp"])
if flag_val != self.ntomp:
LOGGER.warning(
"mdrun_flags contains '-ntomp %s' but config field ntomp=%d. "
"The flag string will override at runtime.",
flag_map["-ntomp"],
self.ntomp,
)
except ValueError:
pass
return self
# =============================================================================
# Main Simulation Configuration
# =============================================================================
[docs]
class SimulationConfig(BaseModel):
"""Complete simulation configuration.
This is the top-level configuration model that contains all settings
for a PolyzyMD simulation.
Attributes:
name: Simulation name/identifier
description: Optional description
enzyme: Enzyme configuration
substrate: Substrate/ligand configuration
polymers: Polymer configuration (optional)
solvent: Solvent and box configuration
restraints: List of restraint configurations
thermodynamics: Temperature/pressure settings
simulation_phases: Equilibration and production settings
output: Output file settings
force_field: Force field selection
Example:
>>> config = SimulationConfig.from_yaml("simulation.yaml")
>>> print(config.enzyme.name)
"LipA"
"""
name: str = Field(..., description="Simulation identifier")
description: str | None = Field(None, description="Simulation description")
enzyme: EnzymeConfig = Field(..., description="Enzyme configuration")
substrate: SubstrateConfig | None = Field(None, description="Substrate configuration")
polymers: PolymerConfig | None = Field(None, description="Polymer configuration")
solvent: SolventConfig = Field(
default_factory=SolventConfig, description="Solvent configuration"
)
restraints: list[RestraintConfig] = Field(
default_factory=list, description="Restraint configurations"
)
thermodynamics: ThermodynamicsConfig = Field(..., description="Thermodynamic conditions")
simulation_phases: SimulationPhasesConfig = Field(..., description="Phase settings")
output: OutputConfig = Field(default_factory=OutputConfig, description="Output settings")
force_field: ForceFieldConfig = Field(
default_factory=ForceFieldConfig, description="Force field settings"
)
engine: Literal["openmm", "gromacs"] = Field(..., description="Simulation engine to use")
openmm: OpenMMEngineConfig = Field(
default_factory=OpenMMEngineConfig, description="OpenMM engine settings"
)
gromacs: GromacsEngineConfig = Field(
default_factory=GromacsEngineConfig, description="GROMACS engine settings"
)
[docs]
@classmethod
def from_yaml(cls, path: str | Path) -> "SimulationConfig":
"""Load configuration from a YAML file.
Args:
path: Path to the YAML configuration file
Returns:
SimulationConfig instance
Raises:
FileNotFoundError: If file doesn't exist
ValidationError: If configuration is invalid
"""
from polyzymd.config.loader import load_config
return load_config(path)
[docs]
def to_yaml(self, path: str | Path) -> None:
"""Save configuration to a YAML file.
Args:
path: Path to save the configuration file
"""
from polyzymd.config.loader import save_config
save_config(self, path)
def _primary_solvent_token(self) -> str:
"""Format the primary solvent naming token.
Returns
-------
str
Primary solvent token for naming templates.
"""
primary = self.solvent.primary
primary_type = _format_safe_token(primary.type)
if primary_type == "water":
return f"water_{_format_safe_token(primary.model.value)}"
return primary_type
def _cosolvent_composition_token(self) -> str:
"""Format the co-solvent composition naming token.
Returns
-------
str
Co-solvent token or ``none`` when no co-solvents are configured.
"""
if not self.solvent.co_solvents:
return "none"
tokens: list[tuple[str, str]] = []
for cosolvent in self.solvent.co_solvents:
name = _format_safe_token(cosolvent.name)
if cosolvent.volume_fraction is not None:
percent = _format_decimal_token(cosolvent.volume_fraction * 100)
token = f"{name}_{percent}pctv"
elif cosolvent.concentration is not None:
concentration = _format_decimal_token(cosolvent.concentration)
token = f"{name}_{concentration}M"
else:
# Validation guarantees one composition mode, but keep errors explicit
raise ValueError(f"Co-solvent '{cosolvent.name}' has no composition value")
tokens.append((name, token))
return "_".join(token for _, token in sorted(tokens, key=lambda item: item[0]))
def _run_directory_template_values(self, replicate: int | str = 1) -> dict[str, Any]:
"""Build centralized values for run-directory naming templates.
Parameters
----------
replicate : int or str, optional
Replicate token to insert into the template, by default 1.
Returns
-------
dict of str to Any
Template values shared by directory naming and replicate discovery.
"""
polymer_type = "none"
if self.polymers and self.polymers.enabled:
probs = {m.label: m.probability for m in self.polymers.monomers}
sorted_labels = sorted(probs.keys())
composition = "_".join(f"{lbl}{probs[lbl] * 100:.0f}" for lbl in sorted_labels)
polymer_type = f"{self.polymers.type_prefix}_{composition}"
substrate_name = self.substrate.name if self.substrate else "apo"
primary_solvent = self._primary_solvent_token()
cosolvent_composition = self._cosolvent_composition_token()
solvent_composition = primary_solvent
if cosolvent_composition != "none":
solvent_composition = f"{primary_solvent}_{cosolvent_composition}"
return {
"enzyme": self.enzyme.name,
"substrate": substrate_name.replace("-", ""),
"polymer_type": polymer_type,
"temperature": int(self.thermodynamics.temperature),
"replicate": replicate,
"duration": int(self.simulation_phases.production.duration),
"primary_solvent": primary_solvent,
"cosolvent_composition": cosolvent_composition,
"solvent_composition": solvent_composition,
}
[docs]
def get_working_directory(self, replicate: int = 1) -> Path:
"""Get the working directory path for a given replicate (in scratch).
This returns the path where simulation output (trajectories, checkpoints)
will be written, which is in the scratch directory.
Args:
replicate: Replicate number
Returns:
Path to the working directory (in scratch)
"""
dir_name = self.format_run_directory_name(replicate)
return self.output.effective_scratch_directory / dir_name
[docs]
def get_projects_directory(self) -> Path:
"""Get the projects directory path.
This is where scripts, configs, and logs are stored.
Returns:
Path to the projects directory
"""
return self.output.projects_directory
[docs]
def discover_replicate_dirs(self) -> list[tuple[int, Path]]:
"""Auto-detect all replicate directories on disk.
Builds a glob pattern from the naming template with
``replicate="*"`` and scans the effective scratch directory.
Returns:
Sorted list of ``(replicate_number, directory_path)`` tuples.
"""
fields = {
field for _, field, _, _ in Formatter().parse(self.output.naming_template) if field
}
if "replicate" not in fields:
raise ValueError(
"Cannot discover replicate directories because output.naming_template does not "
"include the {replicate} placeholder."
)
glob_pattern = self.format_run_directory_name(replicate="*")
regex_pattern = "^" + re.escape(glob_pattern).replace(r"\*", r"(?P<replicate>\d+)") + "$"
replicate_regex = re.compile(regex_pattern)
scratch = self.output.effective_scratch_directory
results: list[tuple[int, Path]] = []
for path in scratch.glob(glob_pattern):
if not path.is_dir():
continue
match = replicate_regex.fullmatch(path.name)
if match:
results.append((int(match.group("replicate")), path))
results.sort(key=lambda t: t[0])
return results
def _format_run_directory_name(self, replicate: int | str) -> str:
"""Format the run directory name for a replicate.
Args:
replicate: Replicate number
Returns:
Formatted directory name
"""
return self.format_run_directory_name(replicate)
[docs]
def to_signac_statepoint(self, replicate: int = 1) -> dict[str, Any]:
"""Convert configuration to a Signac-compatible state point dictionary.
Args:
replicate: Replicate number
Returns:
Dictionary suitable for use as a Signac state point
"""
statepoint = {
"enzyme": self.enzyme.name,
"temperature": self.thermodynamics.temperature,
"replicate": replicate,
}
if self.substrate:
statepoint["substrate"] = self.substrate.name
statepoint["substrate_conformer"] = self.substrate.conformer_index
if self.polymers and self.polymers.enabled:
statepoint["polymer_type"] = self.polymers.type_prefix
statepoint["polymer_length"] = self.polymers.length
statepoint["polymer_count"] = self.polymers.count
# Include monomer probabilities
for monomer in self.polymers.monomers:
statepoint[f"monomer_{monomer.label}_prob"] = monomer.probability
else:
statepoint["polymer_type"] = "none"
# Include co-solvent info
for cosolvent in self.solvent.co_solvents:
if cosolvent.mole_fraction is not None:
statepoint[f"cosolvent_{cosolvent.name}_mole_fraction"] = cosolvent.mole_fraction
elif cosolvent.concentration is not None:
statepoint[f"cosolvent_{cosolvent.name}_molarity"] = cosolvent.concentration
return statepoint