Source code for polyzymd.config.schema

"""
Configuration schema for PolyzyMD simulations.

This module defines Pydantic models for all configuration sections,
providing validation, type safety, and YAML/JSON serialization support.
"""

from __future__ import annotations

import logging
import re
import shlex
from enum import Enum
from pathlib import Path
from string import Formatter
from typing import Any, Literal, Self

from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

LOGGER = logging.getLogger(__name__)

_TOKEN_UNSAFE_CHARS = re.compile(r"[^A-Za-z0-9_-]+")
_TOKEN_UNDERSCORES = re.compile(r"_+")


def _format_safe_token(value: object) -> str:
    """Convert a value into a safe lowercase naming token.

    Parameters
    ----------
    value : object
        Value to normalize for a directory or job-name token.

    Returns
    -------
    str
        Sanitized token using only alphanumerics, underscores, and hyphens.
    """
    token = str(value).strip().lower()
    token = _TOKEN_UNSAFE_CHARS.sub("_", token)
    token = _TOKEN_UNDERSCORES.sub("_", token)
    return token.strip("_") or "unknown"


def _format_decimal_token(value: float) -> str:
    """Format a decimal value without raw decimal points.

    Parameters
    ----------
    value : float
        Numeric value to format.

    Returns
    -------
    str
        Compact token, using ``p`` as the decimal separator when needed.
    """
    formatted = f"{value:g}"
    return formatted.replace(".", "p")



[docs]
class ChargeMethod(str, Enum):
    """Supported charge assignment methods for small molecules."""

    NAGL = "nagl"
    ESPALOMA = "espaloma"
    AM1BCC = "am1bcc"




[docs]
class WaterModel(str, Enum):
    """Supported water models."""

    TIP3P = "tip3p"
    SPCE = "spce"
    TIP4P = "tip4p"
    TIP4PEW = "tip4pew"
    OPC = "opc"




[docs]
class BoxShape(str, Enum):
    """Supported simulation box shapes."""

    CUBE = "cube"
    RHOMBIC_DODECAHEDRON = "rhombic_dodecahedron"
    TRUNCATED_OCTAHEDRON = "truncated_octahedron"




[docs]
class Ensemble(str, Enum):
    """Thermodynamic ensemble types."""

    NVT = "NVT"
    NPT = "NPT"
    NVE = "NVE"




[docs]
class ThermostatType(str, Enum):
    """Supported thermostat types."""

    LANGEVIN_MIDDLE = "LangevinMiddle"
    LANGEVIN = "Langevin"
    ANDERSEN = "Andersen"
    NOSE_HOOVER = "NoseHoover"




[docs]
class BarostatType(str, Enum):
    """Supported barostat types."""

    MONTE_CARLO = "MC"
    MONTE_CARLO_ANISOTROPIC = "MCA"




[docs]
class RestraintType(str, Enum):
    """Types of restraints that can be applied."""

    FLAT_BOTTOM = "flat_bottom"
    HARMONIC = "harmonic"
    UPPER_WALL = "upper_wall"
    LOWER_WALL = "lower_wall"



# =============================================================================
# Enzyme Configuration
# =============================================================================



[docs]
class EnzymeConfig(BaseModel):
    """Configuration for the enzyme/protein component.

    Attributes:
        name: Identifier for the enzyme (e.g., "LipA")
        pdb_path: Path to the PDB file containing the enzyme structure
        description: Optional description of the enzyme
    """

    name: str = Field(..., description="Enzyme identifier")
    pdb_path: Path = Field(..., description="Path to enzyme PDB file")
    description: str | None = Field(None, description="Optional description")


[docs]
    @field_validator("pdb_path")
    @classmethod
    def validate_pdb_path(cls, v: Path) -> Path:
        """Validate that PDB path has correct extension."""
        if v.suffix.lower() != ".pdb":
            raise ValueError(f"Expected .pdb file, got {v.suffix}")
        return v




# =============================================================================
# Substrate/Ligand Configuration
# =============================================================================



[docs]
class SubstrateConfig(BaseModel):
    """Configuration for the docked substrate/ligand.

    Attributes:
        name: Identifier for the substrate (e.g., "Resorufin-Butyrate")
        sdf_path: Path to SDF file with docked conformers
        conformer_index: Which conformer to use (0-indexed)
        charge_method: Method for assigning partial charges
        residue_name: 3-letter residue name for topology (default: "LIG")
    """

    name: str = Field(..., description="Substrate identifier")
    sdf_path: Path = Field(..., description="Path to docked conformers SDF")
    conformer_index: int = Field(0, ge=0, description="Index of conformer to use")
    charge_method: ChargeMethod = Field(ChargeMethod.NAGL, description="Charge assignment method")
    residue_name: str = Field("LIG", max_length=3, description="3-letter residue name")


[docs]
    @field_validator("sdf_path")
    @classmethod
    def validate_sdf_path(cls, v: Path) -> Path:
        """Validate that SDF path has correct extension."""
        if v.suffix.lower() != ".sdf":
            raise ValueError(f"Expected .sdf file, got {v.suffix}")
        return v




# =============================================================================
# Polymer Packing Configuration
# =============================================================================



[docs]
class PolymerPackingConfig(BaseModel):
    """Settings for packing polymers around the solute.

    Controls the box size and PACKMOL behavior when packing polymers
    around the protein-ligand complex.

    Attributes:
        padding: Box padding around the solute in nanometers. Larger values
            give polymers more room and can speed up PACKMOL convergence.
        tolerance: Minimum molecular spacing for PACKMOL in Angstrom.
        movebadrandom: When ``True``, pass the ``movebadrandom`` keyword to
            PACKMOL.  This places badly-packed molecules at random positions
            in the box rather than near well-packed neighbours, which
            improves convergence for dense or heterogeneous systems (many
            unique chain types).  Has no effect when only a single chain
            type is present.  Default is ``False`` (PACKMOL default
            behaviour is preserved).
        box_vectors: Optional explicit box dimensions ``[Lx, Ly, Lz]`` in
            nanometers.  When set, overrides the auto-computed bounding box
            plus *padding*.  The protein is centered at the midpoint of
            the box.  Default is ``None`` (auto-compute from solute
            bounding box + padding).

    Example:
        >>> PolymerPackingConfig(padding=2.5, movebadrandom=True)
        >>> PolymerPackingConfig(box_vectors=[8.0, 10.0, 12.0])
    """

    padding: float = Field(2.0, gt=0.0, description="Box padding around solute (nm)")
    tolerance: float = Field(2.0, gt=0.0, description="PACKMOL tolerance (Angstrom)")
    movebadrandom: bool = Field(
        False,
        description=(
            "Pass the 'movebadrandom' keyword to PACKMOL. "
            "Improves convergence for heterogeneous polymer systems "
            "by placing badly-packed molecules at random box positions."
        ),
    )
    box_vectors: list[float] | None = Field(
        None,
        min_length=3,
        max_length=3,
        description=(
            "Optional explicit box dimensions [Lx, Ly, Lz] in nanometers. "
            "Overrides auto-computed bounding box + padding. "
            "Protein is centered at the midpoint of this box."
        ),
    )



# =============================================================================
# Polymer Configuration
# =============================================================================



[docs]
class MonomerSpec(BaseModel):
    """Specification for a single monomer type in a co-polymer.

    For dynamic polymer generation, provide the raw (unactivated) monomer SMILES.
    The system will run initiation reactions to create the active fragments.

    Attributes:
        label: Single character label for this monomer (e.g., "A", "B")
        probability: Probability of selecting this monomer (0-1)
        name: Optional full name (e.g., "SBMA", "EGPMA")
        smiles: Raw monomer SMILES string (required for dynamic generation)
        residue_name: 3-character PDB residue name (auto-generated if not provided)
    """

    label: str = Field(..., min_length=1, max_length=1, description="Monomer label")
    probability: float = Field(..., ge=0.0, le=1.0, description="Selection probability")
    name: str | None = Field(None, description="Full monomer name")
    smiles: str | None = Field(
        None,
        description="Raw monomer SMILES (required for dynamic generation mode)",
    )
    residue_name: str | None = Field(
        None,
        max_length=3,
        description="3-char PDB residue name (auto-generated from name if not provided)",
    )


[docs]
    @model_validator(mode="after")
    def auto_generate_residue_name(self) -> "MonomerSpec":
        """Auto-generate residue name from monomer name if not provided."""
        if self.residue_name is None and self.name is not None:
            # Use first 3 characters of name, uppercase
            object.__setattr__(self, "residue_name", self.name[:3].upper())
        return self





[docs]
class PolymerGenerationMode(str, Enum):
    """Mode for polymer generation."""

    CACHED = "cached"  # Load pre-built SDF files from disk
    DYNAMIC = "dynamic"  # Generate polymers on-the-fly using Polymerist




[docs]
class ReactionConfig(BaseModel):
    """Paths to reaction templates for ATRP polymer generation.

    These .rxn files define the chemical transformations used to create
    polymer fragments from raw monomer SMILES. For ATRP, this includes:
    - Initiation: Activates the vinyl group (e.g., chlorination)
    - Polymerization: Creates chain-extending fragments
    - Termination: Restores the alkene for chain ends

    You can use "default" as a special value to use the bundled ATRP
    methacrylate reaction templates that ship with PolyzyMD.

    Example:
        reactions:
          initiation: "default"
          polymerization: "default"
          termination: "default"

    Attributes:
        initiation: Path to the initiation reaction template (.rxn) or "default"
        polymerization: Path to the polymerization reaction template (.rxn) or "default"
        termination: Path to the termination reaction template (.rxn) or "default"
    """

    initiation: Path = Field(..., description="Path to ATRP initiation .rxn file or 'default'")
    polymerization: Path = Field(
        ..., description="Path to ATRP polymerization .rxn file or 'default'"
    )
    termination: Path = Field(..., description="Path to ATRP termination .rxn file or 'default'")


[docs]
    @field_validator("initiation", "polymerization", "termination", mode="before")
    @classmethod
    def resolve_default_paths(cls, v, info) -> Path:
        """Resolve 'default' to bundled ATRP reaction paths."""
        from polyzymd.data.reactions import (
            get_atrp_initiation_path,
            get_atrp_polymerization_path,
            get_atrp_termination_path,
        )

        if isinstance(v, str) and v.lower() == "default":
            # Map field name to the appropriate getter
            field_to_getter = {
                "initiation": get_atrp_initiation_path,
                "polymerization": get_atrp_polymerization_path,
                "termination": get_atrp_termination_path,
            }
            getter = field_to_getter.get(info.field_name)
            if getter:
                return getter()
            raise ValueError(f"Unknown reaction field: {info.field_name}")

        # Convert string to Path if needed
        if isinstance(v, str):
            v = Path(v)

        # Validate .rxn extension
        if isinstance(v, Path) and v.suffix.lower() != ".rxn":
            raise ValueError(f"Expected .rxn file, got {v.suffix}")

        return v





[docs]
class PolymerConfig(BaseModel):
    """Configuration for polymer components.

    Supports two generation modes:
    - "cached": Load pre-built polymer SDF files from sdf_directory
    - "dynamic": Generate polymers on-the-fly using Polymerist from SMILES

    For dynamic mode, you must provide:
    - SMILES for each monomer in monomers[].smiles
    - Reaction templates in the reactions field

    Attributes:
        enabled: Whether to include polymers in the system
        generation_mode: "cached" for pre-built SDFs, "dynamic" for on-the-fly generation
        type_prefix: Prefix for polymer type in filenames (e.g., "SBMA-EGPMA")
        monomers: List of monomer specifications with probabilities (and SMILES for dynamic)
        length: Number of monomer units per polymer chain
        count: Number of polymer chains to add
        sdf_directory: Path to pre-built polymer SDF files (for cached mode)
        reactions: Reaction templates for ATRP (required for dynamic mode)
        charger: Charge assignment method for generated polymers
        max_retries: Maximum retries for polymer generation (ring-piercing failures)
        cache_directory: Directory for caching generated polymers and fragments
        packing: Settings for packing polymers around the solute
        random_seed: Random seed for polymer sequence generation (for reproducibility)
    """

    enabled: bool = Field(True, description="Include polymers in system")
    generation_mode: PolymerGenerationMode = Field(
        PolymerGenerationMode.CACHED,
        description="Polymer generation mode: 'cached' (pre-built SDFs) or 'dynamic' (generate from SMILES)",
    )
    type_prefix: str = Field(..., description="Polymer type prefix for filenames")
    monomers: list[MonomerSpec] = Field(..., min_length=1, description="Monomer specifications")
    length: int = Field(..., ge=1, description="Monomers per chain")
    count: int = Field(..., ge=1, description="Number of polymer chains")
    sdf_directory: Path | None = Field(
        None, description="Directory with pre-built polymer SDFs (for cached mode)"
    )
    reactions: ReactionConfig | None = Field(
        None, description="ATRP reaction templates (required for dynamic mode)"
    )
    charger: ChargeMethod = Field(
        ChargeMethod.NAGL,
        description="Charge assignment method for generated polymers",
    )
    max_retries: int = Field(
        10,
        ge=1,
        description="Maximum retries for polymer generation (handles ring-piercing failures)",
    )
    cache_directory: Path = Field(
        Path(".polymer_cache"), description="Cache directory for generated polymers and fragments"
    )
    packing: PolymerPackingConfig = Field(
        default_factory=PolymerPackingConfig,
        description="Polymer packing settings (padding, tolerance)",
    )
    random_seed: int | None = Field(
        None,
        description="Random seed for polymer sequence generation. If None, uses replicate number.",
    )


[docs]
    @model_validator(mode="after")
    def validate_probabilities_sum_to_one(self) -> "PolymerConfig":
        """Ensure monomer probabilities sum to 1.0."""
        if self.enabled:
            total = sum(m.probability for m in self.monomers)
            if abs(total - 1.0) > 1e-6:
                raise ValueError(f"Monomer probabilities must sum to 1.0, got {total}")
        return self



[docs]
    @model_validator(mode="after")
    def validate_generation_mode_requirements(self) -> "PolymerConfig":
        """Validate that required fields are present for the selected generation mode."""
        if not self.enabled:
            return self

        if self.generation_mode == PolymerGenerationMode.DYNAMIC:
            # Dynamic mode requires SMILES for all monomers
            missing_smiles = [m.label for m in self.monomers if m.smiles is None]
            if missing_smiles:
                raise ValueError(
                    f"Dynamic generation mode requires 'smiles' for all monomers. "
                    f"Missing SMILES for monomers: {missing_smiles}"
                )

            # Dynamic mode requires reaction templates
            if self.reactions is None:
                raise ValueError(
                    "Dynamic generation mode requires 'reactions' field with ATRP reaction templates"
                )

        elif self.generation_mode == PolymerGenerationMode.CACHED:
            # Cached mode requires sdf_directory
            if self.sdf_directory is None:
                raise ValueError(
                    "Cached generation mode requires 'sdf_directory' with pre-built polymer SDFs"
                )

        return self




# =============================================================================
# Solvent Configuration
# =============================================================================



[docs]
class CoSolventSpec(BaseModel):
    """Specification for a co-solvent component.

    You must specify either ``mole_fraction`` or ``concentration``, not both.

    For co-solvents in the built-in library (dmso, dmf, urea, ethanol, etc.),
    you can omit the smiles and density fields - they will be looked up
    automatically.

    Example (library co-solvent with mole fraction):
        >>> CoSolventSpec(name="dmso", mole_fraction=0.10)

    Example (library co-solvent with concentration):
        >>> CoSolventSpec(name="urea", concentration=2.0)

    Example (custom co-solvent):
        >>> CoSolventSpec(
        ...     name="my_solvent",
        ...     smiles="CCOC(=O)C",
        ...     mole_fraction=0.05
        ... )
    """

    name: str = Field(..., description="Co-solvent identifier")
    smiles: str | None = Field(None, description="SMILES string (optional if in library)")

    # Specification method 1: Mole fraction
    mole_fraction: float | None = Field(
        None,
        gt=0.0,
        lt=1.0,
        description="Mole fraction (0-1), e.g., 0.10 for 10 mol%",
    )

    # Specification method 2: Molar concentration
    concentration: float | None = Field(None, gt=0.0, description="Molar concentration (mol/L)")

    # Optional physical property for library metadata
    density: float | None = Field(
        None,
        gt=0.0,
        description="Optional density in g/mL for co-solvent metadata",
    )

    residue_name: str | None = Field(None, max_length=3, description="Residue name")


[docs]
    @model_validator(mode="before")
    @classmethod
    def reject_volume_fraction(cls, data: Any) -> Any:
        """Reject removed volume-fraction configuration keys."""
        if isinstance(data, dict) and "volume_fraction" in data:
            raise ValueError(
                "Co-solvent 'volume_fraction' has been removed. Use 'mole_fraction' "
                "for solvent mole fractions or 'concentration' for molarity; values are "
                "not converted automatically."
            )
        return data



[docs]
    @model_validator(mode="after")
    def validate_and_populate(self) -> "CoSolventSpec":
        """Validate specification and populate missing fields from library."""
        from polyzymd.data.cosolvent_library import get_cosolvent

        # Check that exactly one composition method is specified
        has_mole_fraction = self.mole_fraction is not None
        has_conc = self.concentration is not None

        if not has_mole_fraction and not has_conc:
            raise ValueError(
                f"Co-solvent '{self.name}': Must specify either 'mole_fraction' "
                f"or 'concentration'"
            )
        if has_mole_fraction and has_conc:
            raise ValueError(
                f"Co-solvent '{self.name}': Cannot specify both 'mole_fraction' "
                f"and 'concentration' - choose one"
            )

        # Look up from library
        library_data = get_cosolvent(self.name)

        # Populate SMILES if not provided
        if self.smiles is None:
            if library_data:
                object.__setattr__(self, "smiles", library_data.smiles)
            else:
                raise ValueError(
                    f"Co-solvent '{self.name}' not in library. Please provide 'smiles' field."
                )

        # Preserve library density as metadata when available
        if self.density is None and library_data:
            object.__setattr__(self, "density", library_data.density)

        # Set default residue name
        if self.residue_name is None:
            object.__setattr__(self, "residue_name", self.name[:3].upper())

        return self





[docs]
class PrimarySolventConfig(BaseModel):
    """Configuration for the primary solvent (usually water).

    Attributes:
        type: Solvent type identifier
        model: Water model to use (if type is "water")
    """

    type: str = Field("water", description="Primary solvent type")
    model: WaterModel = Field(WaterModel.TIP3P, description="Water model")




[docs]
class IonConfig(BaseModel):
    """Configuration for ions in the solvent.

    ``nacl_concentration`` is a NaCl-equivalent target for the final ion
    concentration. When ``neutralize`` is true, PolyzyMD adjusts Na+ and Cl-
    counts to cancel solute charge while keeping the achieved concentration
    near this target; it is not additional salt layered on top of neutralizing
    ions.
    """

    neutralize: bool = Field(True, description="Neutralize system charge")
    nacl_concentration: float = Field(
        0.0,
        ge=0.0,
        description="NaCl-equivalent target concentration (mol/L)",
    )
    kcl_concentration: float = Field(0.0, ge=0.0, description="KCl conc. (mol/L)")
    mgcl2_concentration: float = Field(0.0, ge=0.0, description="MgCl2 conc. (mol/L)")




[docs]
class BoxConfig(BaseModel):
    """Configuration for the simulation box.

    Attributes:
        padding: Distance from solute to box edge in nm
        shape: Box geometry
        target_density: Target density in g/mL
        tolerance: Minimum molecular spacing for PACKMOL in Angstrom
    """

    padding: float = Field(1.2, gt=0.0, description="Box padding (nm)")
    shape: BoxShape = Field(BoxShape.RHOMBIC_DODECAHEDRON, description="Box shape")
    target_density: float = Field(1.0, gt=0.0, description="Target density (g/mL)")
    tolerance: float = Field(2.0, gt=0.0, description="PACKMOL tolerance (Angstrom)")




[docs]
class SolventConfig(BaseModel):
    """Complete solvent configuration."""

    primary: PrimarySolventConfig = Field(
        default_factory=PrimarySolventConfig, description="Primary solvent"
    )
    co_solvents: list[CoSolventSpec] = Field(default_factory=list, description="Co-solvents")
    ions: IonConfig = Field(default_factory=IonConfig, description="Ion settings")
    box: BoxConfig = Field(default_factory=BoxConfig, description="Box settings")


[docs]
    @model_validator(mode="after")
    def validate_mole_fractions(self) -> "SolventConfig":
        """Ensure co-solvent mole fractions leave room for water."""
        total = sum(cs.mole_fraction for cs in self.co_solvents if cs.mole_fraction is not None)
        if total >= 1.0:
            raise ValueError(f"Total co-solvent mole fraction must be < 1.0, got {total}")
        return self




# =============================================================================
# Restraint Configuration
# =============================================================================



[docs]
class AtomSelectionConfig(BaseModel):
    """Configuration for selecting atoms for restraints.

    Uses MDAnalysis-compatible selection syntax for flexibility.

    Attributes:
        selection: MDAnalysis selection string (e.g., "resid 77 and name OG")
        description: Optional human-readable description
    """

    selection: str = Field(..., description="MDAnalysis selection string")
    description: str | None = Field(None, description="Human-readable description")




[docs]
class RestraintConfig(BaseModel):
    """Configuration for a single restraint.

    Attributes:
        type: Type of restraint (flat_bottom, harmonic, etc.)
        name: Identifier for this restraint
        atom1: First atom selection
        atom2: Second atom selection
        distance: Target/threshold distance in Angstroms
        force_constant: Force constant in kJ/mol/nm^2
        enabled: Whether this restraint is active
    """

    type: RestraintType = Field(..., description="Restraint type")
    name: str = Field(..., description="Restraint identifier")
    atom1: AtomSelectionConfig = Field(..., description="First atom selection")
    atom2: AtomSelectionConfig = Field(..., description="Second atom selection")
    distance: float = Field(..., gt=0.0, description="Distance threshold (Angstrom)")
    force_constant: float = Field(10000.0, gt=0.0, description="Force constant (kJ/mol/nm^2)")
    enabled: bool = Field(True, description="Whether restraint is active")



# =============================================================================
# Thermodynamics Configuration
# =============================================================================



[docs]
class ThermodynamicsConfig(BaseModel):
    """Thermodynamic conditions for the simulation.

    Attributes:
        temperature: System temperature in Kelvin
        pressure: System pressure in atmospheres (for NPT)
        salt_concentration: Ionic strength in mol/L (deprecated, use solvent.ions)
    """

    temperature: float = Field(..., gt=0.0, description="Temperature (K)")
    pressure: float = Field(1.0, gt=0.0, description="Pressure (atm)")



# =============================================================================
# Simulation Phase Configuration
# =============================================================================



[docs]
class SimulationPhaseConfig(BaseModel):
    """Configuration for a single simulation phase (equilibration or production).

    Attributes:
        ensemble: Thermodynamic ensemble (NVT, NPT)
        duration: Simulation duration in nanoseconds
        samples: Number of trajectory frames to save
        report_interval: Explicit reporter interval in MD steps
        time_step: Integration time step in femtoseconds
        thermostat: Thermostat type
        thermostat_timescale: Thermostat coupling timescale in ps
        barostat: Barostat type (for NPT)
        barostat_frequency: Barostat update frequency (steps)
        checkpoint_interval: Wall-time interval (seconds) between restart
            checkpoints for preemption resilience
    """

    ensemble: Ensemble = Field(..., description="Thermodynamic ensemble")
    duration: float = Field(..., gt=0.0, description="Duration (ns)")
    samples: int = Field(..., ge=1, description="Trajectory frames to save")
    report_interval: int = Field(..., ge=1, description="Reporter interval in MD steps")
    time_step: float = Field(2.0, gt=0.0, description="Time step (fs)")
    thermostat: ThermostatType = Field(
        ThermostatType.LANGEVIN_MIDDLE, description="Thermostat type"
    )
    thermostat_timescale: float = Field(1.0, gt=0.0, description="Thermostat timescale (ps)")
    barostat: BarostatType | None = Field(None, description="Barostat type")
    barostat_frequency: int = Field(25, ge=1, description="Barostat update frequency")
    checkpoint_interval: float = Field(
        ...,
        gt=0.0,
        description=(
            "Wall-time interval in seconds between restart checkpoints. "
            "Controls how frequently simulation state is saved for automatic "
            "restart on SLURM preemption or hard kill. Independent of "
            "trajectory/reporter output frequency. Must be positive so "
            "unexpected termination recovery remains enabled."
        ),
    )


[docs]
    @model_validator(mode="after")
    def validate_ensemble_barostat(self) -> "SimulationPhaseConfig":
        """Ensure NPT ensemble has a barostat configured."""
        if self.ensemble == Ensemble.NPT and self.barostat is None:
            self.barostat = BarostatType.MONTE_CARLO
        return self




# =============================================================================
# Equilibration Stage Configuration (Multi-Stage Equilibration)
# =============================================================================



[docs]
class PositionRestraintConfig(BaseModel):
    """Configuration for positional restraints on an atom group.

    Position restraints apply a harmonic potential to keep atoms near their
    initial coordinates. This is commonly used during equilibration to
    prevent large structural changes while the system relaxes.

    Attributes:
        group: Predefined atom group name
        force_constant: Force constant in kJ/mol/nm^2 (4184.0 = 1.0 kcal/mol/A^2)
    """

    group: str = Field(
        ...,
        description=(
            "Atom group: protein_heavy, protein_backbone, protein_calpha, "
            "ligand_heavy, polymer_heavy, solvent, water_only, ions_only, cosolvents_only"
        ),
    )
    force_constant: float = Field(
        4184.0,  # 1.0 kcal/mol/A^2 in kJ/mol/nm^2
        gt=0.0,
        description="Force constant (kJ/mol/nm^2). Default 4184.0 = 1.0 kcal/mol/A^2",
    )


[docs]
    @field_validator("group")
    @classmethod
    def validate_group_name(cls, v: str) -> str:
        """Validate that the group name is a recognized predefined group."""
        from polyzymd.core.atom_groups import PREDEFINED_GROUPS

        if v not in PREDEFINED_GROUPS:
            raise ValueError(
                f"Unknown atom group: '{v}'. Valid groups: {sorted(PREDEFINED_GROUPS)}"
            )
        return v





[docs]
class EquilibrationStageConfig(BaseModel):
    """Configuration for a single equilibration stage.

    Supports two temperature modes:

    1. Constant temperature: Set 'temperature' field.
    2. Temperature ramping (simulated annealing): Set 'temperature_start'
       and 'temperature_end' fields.

    Position restraints can be applied to hold specific atom groups in place
    during the stage.

    Attributes:
        name: Stage identifier (used in output paths)
        duration: Stage duration in nanoseconds
        samples: Number of trajectory frames to save
        ensemble: Thermodynamic ensemble (NVT or NPT)
        temperature: Constant temperature in K (mutually exclusive with ramping)
        temperature_start: Starting temperature for ramping in K
        temperature_end: Ending temperature for ramping in K
        temperature_increment: Temperature increment per update in K
        temperature_interval: Time between temperature updates in fs
        position_restraints: List of position restraints for this stage
        time_step: Optional time step override in fs
        thermostat: Optional thermostat type override
        thermostat_timescale: Optional thermostat timescale override in ps
        barostat: Optional barostat type (for NPT ensemble)
        barostat_frequency: Optional barostat update frequency
    """

    name: str = Field(..., description="Stage identifier (used in output paths)")
    duration: float = Field(..., gt=0.0, description="Duration (ns)")
    samples: int = Field(100, ge=1, description="Trajectory frames to save")
    ensemble: Ensemble = Field(Ensemble.NVT, description="Thermodynamic ensemble")

    # Constant temperature mode
    temperature: float | None = Field(
        None,
        gt=0.0,
        description="Constant temperature (K). Mutually exclusive with temperature ramping.",
    )

    # Temperature ramping mode (simulated annealing)
    temperature_start: float | None = Field(
        None, gt=0.0, description="Starting temperature for ramping (K)"
    )
    temperature_end: float | None = Field(
        None, gt=0.0, description="Ending temperature for ramping (K)"
    )
    temperature_increment: float = Field(
        1.0, gt=0.0, description="Temperature increment per update (K)"
    )
    temperature_interval: float = Field(
        1200.0, gt=0.0, description="Time between temperature updates (fs)"
    )

    # Position restraints
    position_restraints: list[PositionRestraintConfig] = Field(
        default_factory=list, description="Position restraints for this stage"
    )

    # Optional overrides (inherit from parent/defaults if not specified)
    time_step: float | None = Field(None, gt=0.0, description="Time step (fs)")
    thermostat: ThermostatType | None = Field(None, description="Thermostat type")
    thermostat_timescale: float | None = Field(
        None, gt=0.0, description="Thermostat timescale (ps)"
    )
    barostat: BarostatType | None = Field(None, description="Barostat type (for NPT)")
    barostat_frequency: int | None = Field(None, ge=1, description="Barostat update frequency")


[docs]
    @model_validator(mode="after")
    def validate_temperature_mode(self) -> "EquilibrationStageConfig":
        """Ensure valid temperature specification."""
        has_constant = self.temperature is not None
        has_start = self.temperature_start is not None
        has_end = self.temperature_end is not None

        if has_constant and (has_start or has_end):
            raise ValueError(
                "Cannot specify both 'temperature' and temperature ramping "
                "('temperature_start'/'temperature_end')"
            )

        if not has_constant and not (has_start and has_end):
            raise ValueError(
                "Must specify either 'temperature' for constant temperature, "
                "or both 'temperature_start' and 'temperature_end' for ramping"
            )

        if has_start and has_end and self.temperature_start > self.temperature_end:
            raise ValueError(
                f"temperature_start ({self.temperature_start}) must be <= "
                f"temperature_end ({self.temperature_end})"
            )

        return self



[docs]
    @model_validator(mode="after")
    def validate_npt_barostat(self) -> "EquilibrationStageConfig":
        """Ensure NPT ensemble has a barostat configured."""
        if self.ensemble == Ensemble.NPT and self.barostat is None:
            self.barostat = BarostatType.MONTE_CARLO
        return self


    @property
    def is_temperature_ramping(self) -> bool:
        """Check if this stage uses temperature ramping."""
        return self.temperature_start is not None


[docs]
    def get_start_temperature(self) -> float:
        """Get the starting temperature of this stage."""
        if self.is_temperature_ramping:
            return self.temperature_start
        return self.temperature



[docs]
    def get_final_temperature(self) -> float:
        """Get the final temperature of this stage."""
        if self.is_temperature_ramping:
            return self.temperature_end
        return self.temperature





[docs]
class SimulationPhasesConfig(BaseModel):
    """Configuration for all simulation phases.

    Attributes:
        equilibration_stages: Multi-stage equilibration protocol
        production: Production phase settings
    """

    model_config = ConfigDict(extra="forbid")

    equilibration_stages: list[EquilibrationStageConfig] | None = Field(
        None, description="Multi-stage equilibration protocol with position restraints"
    )

    production: SimulationPhaseConfig = Field(..., description="Production settings")


[docs]
    @model_validator(mode="after")
    def validate_equilibration_mode(self) -> "SimulationPhasesConfig":
        """Ensure staged equilibration is configured."""
        if self.equilibration_stages is None:
            raise ValueError(
                "PolyzyMD now requires 'equilibration_stages' in simulation_phases. "
                "The legacy 'equilibration' block is no longer supported. "
                "Convert your config to one or more staged equilibration entries."
            )

        if len(self.equilibration_stages) == 0:
            raise ValueError("'equilibration_stages' must contain at least one stage")

        return self


    @property
    def total_equilibration_duration(self) -> float:
        """Total equilibration duration in nanoseconds.

        Returns the sum of all stage durations.
        """
        return sum(stage.duration for stage in self.equilibration_stages)

    @property
    def total_equilibration_samples(self) -> int:
        """Total equilibration trajectory samples.

        Returns the sum of all stage samples.
        """
        return sum(stage.samples for stage in self.equilibration_stages)



# =============================================================================
# Output Configuration
# =============================================================================



[docs]
def expand_path(path: Path) -> Path:
    """Expand environment variables and user home in a path.

    Supports:
    - $VAR and ${VAR} syntax for environment variables
    - ~ for user home directory

    Args:
        path: Path that may contain environment variables

    Returns:
        Path with variables expanded
    """
    import os

    expanded = os.path.expandvars(str(path))
    expanded = os.path.expanduser(expanded)
    return Path(expanded)




[docs]
class OutputConfig(BaseModel):
    """Configuration for simulation output.

    Supports separate directories for:
    - scripts/logs (projects_directory): Where job scripts and SLURM logs are written
    - simulation data (scratch_directory): Where trajectories, checkpoints go

    This separation allows running simulations on HPC systems where code lives
    in long-term storage (projects) but data is written to high-performance
    scratch storage.

    Attributes:
        projects_directory: Directory for scripts, configs, logs (long-term storage)
        scratch_directory: Directory for simulation output (high-performance storage)
        naming_template: Template for naming working directories
        job_scripts_subdir: Subdirectory name for job scripts within projects
        slurm_logs_subdir: Subdirectory name for SLURM logs within projects
        save_checkpoint: Whether to save checkpoint files
        save_state_data: Whether to save thermodynamic state data
        trajectory_format: Output trajectory format

    Example YAML:
        output:
          projects_directory: /projects/user/polyzymd
          scratch_directory: /scratch/alpine/user/simulations
          naming_template: "{enzyme}_{substrate}_{temperature}K_run{replicate}"
    """

    model_config = ConfigDict(extra="forbid")

    # Directory configuration
    projects_directory: Path = Field(
        Path("."),
        description="Base directory for scripts and logs (typically in projects/long-term storage)",
    )
    scratch_directory: Path | None = Field(
        None,
        description="Base directory for simulation output (scratch storage). If None, uses projects_directory.",
    )
    naming_template: str = Field(
        "{enzyme}_{substrate}_{polymer_type}_{duration}ns_{temperature}K_run{replicate}",
        description="Directory naming template",
    )

    # Subdirectory organization
    job_scripts_subdir: str = Field(
        "job_scripts",
        description="Subdirectory for generated job scripts",
    )
    slurm_logs_subdir: str = Field(
        "slurm_logs",
        description="Subdirectory for SLURM output logs",
    )

    # Output options
    save_checkpoint: bool = Field(True, description="Save checkpoint files")
    save_state_data: bool = Field(True, description="Save state data CSV")
    trajectory_format: str = Field("dcd", description="Trajectory file format")


[docs]
    @field_validator("projects_directory", "scratch_directory", mode="before")
    @classmethod
    def expand_env_vars_in_paths(cls, v: str | Path | None) -> Path | None:
        """Expand environment variables and ~ in path fields.

        Supports $USER, ${HOME}, ~/path, etc.
        """
        if v is None:
            return None
        return expand_path(Path(v))


    @property
    def effective_scratch_directory(self) -> Path:
        """Get the effective scratch directory (falls back to projects if not set)."""
        return self.scratch_directory if self.scratch_directory else self.projects_directory


[docs]
    def format_directory_name(
        self,
        enzyme: str,
        substrate: str,
        polymer_type: str,
        temperature: float,
        replicate: int | str,
        duration: float = 0.0,
        primary_solvent: str = "water_tip3p",
        cosolvent_composition: str = "none",
        solvent_composition: str | None = None,
        **kwargs: Any,
    ) -> str:
        """Format the directory name using the template.

        Parameters
        ----------
        enzyme : str
            Enzyme name.
        substrate : str
            Substrate name.
        polymer_type : str
            Polymer type, or ``"none"``.
        temperature : float
            Temperature in K.
        replicate : int or str
            Replicate number or glob token.
        duration : float, optional
            Production duration in ns, by default 0.0.
        primary_solvent : str, optional
            Primary solvent token, by default ``"water_tip3p"``.
        cosolvent_composition : str, optional
            Co-solvent composition token, by default ``"none"``.
        solvent_composition : str or None, optional
            Full solvent composition token. When not provided, the primary solvent
            and co-solvent tokens are combined.
        **kwargs : Any
            Additional template variables.

        Returns
        -------
        str
            Formatted directory name.
        """
        if solvent_composition is None:
            solvent_composition = primary_solvent
            if cosolvent_composition != "none":
                solvent_composition = f"{primary_solvent}_{cosolvent_composition}"

        return self.naming_template.format(
            enzyme=enzyme,
            substrate=substrate,
            polymer_type=polymer_type,
            temperature=int(temperature),
            replicate=replicate,
            duration=int(duration),
            primary_solvent=primary_solvent,
            cosolvent_composition=cosolvent_composition,
            solvent_composition=solvent_composition,
            **kwargs,
        )



[docs]
    def get_job_scripts_directory(self) -> Path:
        """Get the directory for job scripts.

        Returns:
            Path to job scripts directory (within projects)
        """
        return self.projects_directory / self.job_scripts_subdir



[docs]
    def get_slurm_logs_directory(self) -> Path:
        """Get the directory for SLURM log files.

        Returns:
            Path to SLURM logs directory (within projects)
        """
        return self.projects_directory / self.slurm_logs_subdir




# =============================================================================
# Force Field Configuration
# =============================================================================



[docs]
class ForceFieldConfig(BaseModel):
    """Configuration for force field selection.

    Attributes:
        protein: Force field for proteins
        small_molecule: Force field for small molecules
        water: Water model force field (derived from solvent config)
    """

    protein: str = Field("ff14sb_off_impropers_0.0.4.offxml", description="Protein force field")
    small_molecule: str = Field("openff-2.0.0.offxml", description="Small molecule force field")



# =============================================================================
# Engine Configuration
# =============================================================================



[docs]
class OpenMMEngineConfig(BaseModel):
    """OpenMM-specific engine settings.

    These settings control OpenMM platform selection and device configuration.
    Only relevant when ``engine`` is ``"openmm"``.

    Attributes:
        platform: OpenMM platform to use for computation.
        device_index: GPU device index (platform-specific).
        precision: Floating-point precision mode.
    """

    platform: str = Field("CUDA", description="OpenMM compute platform")
    device_index: str | None = Field(None, description="GPU device index")
    precision: str = Field("mixed", description="Floating-point precision")




[docs]
class GromacsEngineConfig(BaseModel):
    """GROMACS-specific engine settings.

    These settings control how GROMACS binaries are located and invoked,
    and how SLURM resources are allocated for GROMACS jobs.

    GROMACS users configure parallelism via ``ntmpi`` (MPI ranks) and
    ``ntomp`` (OpenMP threads per rank). These map directly to the
    ``gmx mdrun -ntmpi`` and ``-ntomp`` flags. The corresponding SLURM
    resources (``ntasks``, ``cpus_per_task``, ``memory``) are set to
    match.

    Set ``gpu: true`` when running a CUDA-enabled GROMACS build on a
    GPU node. When ``False`` (default), the SLURM script omits GPU
    directives entirely.

    Attributes:
        gmx_binary: Path or name of the GROMACS binary.
            Resolved via config > $GMX_BIN > PATH discovery if None.
        mdrun_flags: Additional flags passed to ``gmx mdrun`` (all stages).
        grompp_flags: Additional flags passed to ``gmx grompp``.
        module_load: Module load command for HPC (e.g. ``"module load gromacs/2024"``)
        ntmpi: Number of MPI ranks for ``gmx mdrun -ntmpi``.
            Also sets SLURM ``--ntasks``.
        ntomp: Number of OpenMP threads per rank for ``gmx mdrun -ntomp``.
            Also sets SLURM ``--cpus-per-task``.
        gpu: Request a GPU via SLURM. When False, the ``--gres=gpu``
            directive is omitted entirely.
        gpus: Number of GPUs to request when ``gpu`` is True. Ignored
            when ``gpu`` is False.
        memory: SLURM ``--mem`` allocation for GROMACS jobs.
    """

    gmx_binary: str | None = Field(None, description="GROMACS binary path or name")
    mdrun_flags: str = Field("", description="Extra flags for gmx mdrun (all stages)")
    grompp_flags: str = Field("-maxwarn 1", description="Extra flags for gmx grompp")
    mdrun_flags_equilibration: str | None = Field(
        None,
        description=(
            "Override mdrun_flags for equilibration stages only. "
            "When None, falls back to mdrun_flags."
        ),
    )
    mdrun_flags_production: str | None = Field(
        None,
        description=(
            "Override mdrun_flags for production only. When None, falls back to mdrun_flags."
        ),
    )
    command_prefix: str | None = Field(
        None,
        description=(
            "Prefix prepended to all GROMACS commands. "
            "Use for container wrappers, e.g. "
            "'singularity exec --rocm --bind $PWD /path/to/gromacs.sif'"
        ),
    )
    mpi_launcher_flags: str = Field(
        "",
        description=(
            "Extra flags passed to the MPI launcher (mpirun). "
            "Only used when the binary is a real-MPI build. "
            "E.g. '-genv I_MPI_FABRICS shm:tcp' for Intel MPI."
        ),
    )
    module_load: str | None = Field(None, description="HPC module load command")
    env_exports: dict[str, str] = Field(
        default_factory=dict,
        description=("Environment variables exported before GROMACS commands (key=value pairs)"),
    )
    setup_commands: list[str] = Field(
        default_factory=list,
        description="Shell commands run after module_load and before GROMACS commands",
    )
    ntmpi: int = Field(1, ge=1, description="MPI ranks (-ntmpi); sets SLURM --ntasks")
    slurm_ntasks: int | None = Field(
        default=None,
        ge=1,
        description=(
            "Override SLURM --ntasks independently of GROMACS -ntmpi. "
            "Advanced option for multi-node MPI+GPU workflows where scheduler "
            "tasks differ from GROMACS thread-MPI ranks. When None, SLURM ntasks "
            "is set from ntmpi (default behavior)."
        ),
    )
    ntomp: int = Field(
        8,
        ge=1,
        description="OpenMP threads per rank (-ntomp); sets SLURM --cpus-per-task",
    )
    gpu: bool = Field(
        False,
        description="Request GPU via SLURM (set True for CUDA-enabled GROMACS)",
    )
    gpus: int = Field(
        1,
        ge=1,
        description="Number of GPUs to request via SLURM when gpu is True",
    )
    memory: str = Field("16G", description="SLURM --mem allocation for GROMACS jobs")

    @model_validator(mode="after")
    def _warn_gpu_ntmpi(self) -> Self:
        """Warn when GPU mode is combined with multiple thread-MPI ranks.

        Returns
        -------
        Self
            The validated config instance
        """
        if self.gpu and self.ntmpi > 1:
            LOGGER.warning(
                "GPU GROMACS uses thread-MPI (gmx mdrun -ntmpi), not real MPI ranks. "
                "With ntmpi=%d and gpus=%d, ensure enough GPUs are available or "
                "accept GPU sharing.",
                self.ntmpi,
                self.gpus,
            )
        return self

    @model_validator(mode="after")
    def _warn_mdrun_flags_conflict(self) -> Self:
        """Warn when mdrun_flags conflict with explicit ntmpi or ntomp fields.

        Returns
        -------
        Self
            The validated config instance
        """
        if not self.mdrun_flags:
            return self

        try:
            tokens = shlex.split(self.mdrun_flags)
        except ValueError:
            return self

        flag_map: dict[str, str] = {}
        for i, tok in enumerate(tokens):
            if tok in ("-ntmpi", "-ntomp") and i + 1 < len(tokens):
                flag_map[tok] = tokens[i + 1]

        if "-ntmpi" in flag_map:
            try:
                flag_val = int(flag_map["-ntmpi"])
                if flag_val != self.ntmpi:
                    LOGGER.warning(
                        "mdrun_flags contains '-ntmpi %s' but config field ntmpi=%d. "
                        "The flag string will override at runtime.",
                        flag_map["-ntmpi"],
                        self.ntmpi,
                    )
            except ValueError:
                pass

        if "-ntomp" in flag_map:
            try:
                flag_val = int(flag_map["-ntomp"])
                if flag_val != self.ntomp:
                    LOGGER.warning(
                        "mdrun_flags contains '-ntomp %s' but config field ntomp=%d. "
                        "The flag string will override at runtime.",
                        flag_map["-ntomp"],
                        self.ntomp,
                    )
            except ValueError:
                pass

        return self



# =============================================================================
# Main Simulation Configuration
# =============================================================================



[docs]
class SimulationConfig(BaseModel):
    """Complete simulation configuration.

    This is the top-level configuration model that contains all settings
    for a PolyzyMD simulation.

    Attributes:
        name: Simulation name/identifier
        description: Optional description
        enzyme: Enzyme configuration
        substrate: Substrate/ligand configuration
        polymers: Polymer configuration (optional)
        solvent: Solvent and box configuration
        restraints: List of restraint configurations
        thermodynamics: Temperature/pressure settings
        simulation_phases: Equilibration and production settings
        output: Output file settings
        force_field: Force field selection

    Example:
        >>> config = SimulationConfig.from_yaml("simulation.yaml")
        >>> print(config.enzyme.name)
        "LipA"
    """

    name: str = Field(..., description="Simulation identifier")
    description: str | None = Field(None, description="Simulation description")

    enzyme: EnzymeConfig = Field(..., description="Enzyme configuration")
    substrate: SubstrateConfig | None = Field(None, description="Substrate configuration")
    polymers: PolymerConfig | None = Field(None, description="Polymer configuration")
    solvent: SolventConfig = Field(
        default_factory=SolventConfig, description="Solvent configuration"
    )
    restraints: list[RestraintConfig] = Field(
        default_factory=list, description="Restraint configurations"
    )
    thermodynamics: ThermodynamicsConfig = Field(..., description="Thermodynamic conditions")
    simulation_phases: SimulationPhasesConfig = Field(..., description="Phase settings")
    output: OutputConfig = Field(default_factory=OutputConfig, description="Output settings")
    force_field: ForceFieldConfig = Field(
        default_factory=ForceFieldConfig, description="Force field settings"
    )
    engine: Literal["openmm", "gromacs"] = Field(..., description="Simulation engine to use")
    openmm: OpenMMEngineConfig = Field(
        default_factory=OpenMMEngineConfig, description="OpenMM engine settings"
    )
    gromacs: GromacsEngineConfig = Field(
        default_factory=GromacsEngineConfig, description="GROMACS engine settings"
    )


[docs]
    @classmethod
    def from_yaml(cls, path: str | Path) -> "SimulationConfig":
        """Load configuration from a YAML file.

        Args:
            path: Path to the YAML configuration file

        Returns:
            SimulationConfig instance

        Raises:
            FileNotFoundError: If file doesn't exist
            ValidationError: If configuration is invalid
        """
        from polyzymd.config.loader import load_config

        return load_config(path)



[docs]
    def to_yaml(self, path: str | Path) -> None:
        """Save configuration to a YAML file.

        Args:
            path: Path to save the configuration file
        """
        from polyzymd.config.loader import save_config

        save_config(self, path)


    def _primary_solvent_token(self) -> str:
        """Format the primary solvent naming token.

        Returns
        -------
        str
            Primary solvent token for naming templates.
        """
        primary = self.solvent.primary
        primary_type = _format_safe_token(primary.type)
        if primary_type == "water":
            return f"water_{_format_safe_token(primary.model.value)}"
        return primary_type

    def _cosolvent_composition_token(self) -> str:
        """Format the co-solvent composition naming token.

        Returns
        -------
        str
            Co-solvent token or ``none`` when no co-solvents are configured.
        """
        if not self.solvent.co_solvents:
            return "none"

        tokens: list[tuple[str, str]] = []
        for cosolvent in self.solvent.co_solvents:
            name = _format_safe_token(cosolvent.name)
            if cosolvent.volume_fraction is not None:
                percent = _format_decimal_token(cosolvent.volume_fraction * 100)
                token = f"{name}_{percent}pctv"
            elif cosolvent.concentration is not None:
                concentration = _format_decimal_token(cosolvent.concentration)
                token = f"{name}_{concentration}M"
            else:
                # Validation guarantees one composition mode, but keep errors explicit
                raise ValueError(f"Co-solvent '{cosolvent.name}' has no composition value")
            tokens.append((name, token))

        return "_".join(token for _, token in sorted(tokens, key=lambda item: item[0]))

    def _run_directory_template_values(self, replicate: int | str = 1) -> dict[str, Any]:
        """Build centralized values for run-directory naming templates.

        Parameters
        ----------
        replicate : int or str, optional
            Replicate token to insert into the template, by default 1.

        Returns
        -------
        dict of str to Any
            Template values shared by directory naming and replicate discovery.
        """
        polymer_type = "none"
        if self.polymers and self.polymers.enabled:
            probs = {m.label: m.probability for m in self.polymers.monomers}
            sorted_labels = sorted(probs.keys())
            composition = "_".join(f"{lbl}{probs[lbl] * 100:.0f}" for lbl in sorted_labels)
            polymer_type = f"{self.polymers.type_prefix}_{composition}"

        substrate_name = self.substrate.name if self.substrate else "apo"
        primary_solvent = self._primary_solvent_token()
        cosolvent_composition = self._cosolvent_composition_token()
        solvent_composition = primary_solvent
        if cosolvent_composition != "none":
            solvent_composition = f"{primary_solvent}_{cosolvent_composition}"

        return {
            "enzyme": self.enzyme.name,
            "substrate": substrate_name.replace("-", ""),
            "polymer_type": polymer_type,
            "temperature": int(self.thermodynamics.temperature),
            "replicate": replicate,
            "duration": int(self.simulation_phases.production.duration),
            "primary_solvent": primary_solvent,
            "cosolvent_composition": cosolvent_composition,
            "solvent_composition": solvent_composition,
        }


[docs]
    def format_run_directory_name(self, replicate: int | str = 1) -> str:
        """Format the run directory name for a replicate.

        Parameters
        ----------
        replicate : int or str, optional
            Replicate number or glob token, by default 1.

        Returns
        -------
        str
            Formatted run directory name.
        """
        return self.output.format_directory_name(**self._run_directory_template_values(replicate))



[docs]
    def get_working_directory(self, replicate: int = 1) -> Path:
        """Get the working directory path for a given replicate (in scratch).

        This returns the path where simulation output (trajectories, checkpoints)
        will be written, which is in the scratch directory.

        Args:
            replicate: Replicate number

        Returns:
            Path to the working directory (in scratch)
        """
        dir_name = self.format_run_directory_name(replicate)
        return self.output.effective_scratch_directory / dir_name



[docs]
    def get_projects_directory(self) -> Path:
        """Get the projects directory path.

        This is where scripts, configs, and logs are stored.

        Returns:
            Path to the projects directory
        """
        return self.output.projects_directory



[docs]
    def discover_replicate_dirs(self) -> list[tuple[int, Path]]:
        """Auto-detect all replicate directories on disk.

        Builds a glob pattern from the naming template with
        ``replicate="*"`` and scans the effective scratch directory.

        Returns:
            Sorted list of ``(replicate_number, directory_path)`` tuples.
        """
        fields = {
            field for _, field, _, _ in Formatter().parse(self.output.naming_template) if field
        }
        if "replicate" not in fields:
            raise ValueError(
                "Cannot discover replicate directories because output.naming_template does not "
                "include the {replicate} placeholder."
            )

        glob_pattern = self.format_run_directory_name(replicate="*")
        regex_pattern = "^" + re.escape(glob_pattern).replace(r"\*", r"(?P<replicate>\d+)") + "$"
        replicate_regex = re.compile(regex_pattern)

        scratch = self.output.effective_scratch_directory
        results: list[tuple[int, Path]] = []
        for path in scratch.glob(glob_pattern):
            if not path.is_dir():
                continue
            match = replicate_regex.fullmatch(path.name)
            if match:
                results.append((int(match.group("replicate")), path))

        results.sort(key=lambda t: t[0])
        return results


    def _format_run_directory_name(self, replicate: int | str) -> str:
        """Format the run directory name for a replicate.

        Args:
            replicate: Replicate number

        Returns:
            Formatted directory name
        """
        return self.format_run_directory_name(replicate)


[docs]
    def to_signac_statepoint(self, replicate: int = 1) -> dict[str, Any]:
        """Convert configuration to a Signac-compatible state point dictionary.

        Args:
            replicate: Replicate number

        Returns:
            Dictionary suitable for use as a Signac state point
        """
        statepoint = {
            "enzyme": self.enzyme.name,
            "temperature": self.thermodynamics.temperature,
            "replicate": replicate,
        }

        if self.substrate:
            statepoint["substrate"] = self.substrate.name
            statepoint["substrate_conformer"] = self.substrate.conformer_index

        if self.polymers and self.polymers.enabled:
            statepoint["polymer_type"] = self.polymers.type_prefix
            statepoint["polymer_length"] = self.polymers.length
            statepoint["polymer_count"] = self.polymers.count
            # Include monomer probabilities
            for monomer in self.polymers.monomers:
                statepoint[f"monomer_{monomer.label}_prob"] = monomer.probability
        else:
            statepoint["polymer_type"] = "none"

        # Include co-solvent info
        for cosolvent in self.solvent.co_solvents:
            if cosolvent.mole_fraction is not None:
                statepoint[f"cosolvent_{cosolvent.name}_mole_fraction"] = cosolvent.mole_fraction
            elif cosolvent.concentration is not None:
                statepoint[f"cosolvent_{cosolvent.name}_molarity"] = cosolvent.concentration

        return statepoint