Source code for polyzymd.builders.system_builder

"""
System builder for orchestrating the complete simulation system construction.

This module coordinates all builders to create a complete, solvated
molecular system and generate the OpenFF Interchange for simulation.
"""

from __future__ import annotations

import logging
import time
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

if TYPE_CHECKING:
    from openff.interchange import Interchange
    from openff.toolkit import ForceField, Molecule, Topology

    from polyzymd.config.schema import SimulationConfig

from polyzymd.builders.enzyme import EnzymeBuilder
from polyzymd.builders.polymer import PolymerBuilder
from polyzymd.builders.solvent import SolventBuilder, SolventComposition
from polyzymd.builders.substrate import SubstrateBuilder

LOGGER = logging.getLogger(__name__)



[docs]
class SystemBuilder:
    """Orchestrator for building complete simulation systems.

    This class coordinates enzyme loading, substrate charging, polymer
    generation, solvation, and Interchange creation through the canonical
    single-call OpenFF pathway.

    Example:
        >>> builder = SystemBuilder.from_config(config)
        >>> interchange = builder.build()
        >>> # Or step by step:
        >>> builder = SystemBuilder()
        >>> builder.build_enzyme("enzyme.pdb")
        >>> builder.build_substrate("docked.sdf")
        >>> builder.build_polymers(...)
        >>> builder.solvate()
        >>> interchange = builder.create_interchange()
    """


[docs]
    def __init__(
        self,
        protein_forcefield: str = "ff14sb_off_impropers_0.0.4.offxml",
        small_molecule_forcefield: str = "openff-2.0.0.offxml",
    ) -> None:
        """Initialize the SystemBuilder.

        Args:
            protein_forcefield: Force field for proteins.
            small_molecule_forcefield: Force field for small molecules.
        """
        self._protein_ff = protein_forcefield
        self._sm_ff = small_molecule_forcefield

        # Sub-builders
        self._enzyme_builder = EnzymeBuilder()
        self._substrate_builder = SubstrateBuilder()
        self._polymer_builder: Optional[PolymerBuilder] = None
        self._solvent_builder = SolventBuilder()

        # State
        self._enzyme_topology: Optional[Topology] = None
        self._substrate_molecule: Optional[Molecule] = None
        self._polymer_molecules: List[Molecule] = []
        self._polymer_counts: List[int] = []
        self._combined_topology: Optional[Topology] = None
        self._solvated_topology: Optional[Topology] = None
        self._interchange: Optional[Interchange] = None
        self._working_dir: Optional[Path] = None

        # Molecule count tracking for PDB chain/residue assignment
        # These are set during build and used by _assign_pdb_identifiers()
        self._n_enzyme_molecules: int = 0
        self._n_substrate_molecules: int = 0
        self._n_polymer_chains: int = 0


    @property
    def interchange(self) -> Optional[Interchange]:
        """Get the created Interchange object."""
        return self._interchange

    @property
    def solvated_topology(self) -> Optional[Topology]:
        """Get the solvated topology."""
        return self._solvated_topology


[docs]
    @classmethod
    def from_config(cls, config: "SimulationConfig") -> "SystemBuilder":
        """Create a SystemBuilder from a configuration object.

        Args:
            config: SimulationConfig with all system settings.

        Returns:
            Configured SystemBuilder instance.
        """
        builder = cls(
            protein_forcefield=config.force_field.protein,
            small_molecule_forcefield=config.force_field.small_molecule,
        )
        builder._config = config
        return builder



[docs]
    def build_enzyme(self, pdb_path: Union[str, Path]) -> Topology:
        """Build the enzyme component.

        Args:
            pdb_path: Path to enzyme PDB file.

        Returns:
            Enzyme topology.
        """
        self._enzyme_topology = self._enzyme_builder.build(pdb_path)
        self._n_enzyme_molecules = 1
        return self._enzyme_topology



[docs]
    def build_substrate(
        self,
        sdf_path: Union[str, Path],
        conformer_index: int = 0,
        charge_method: str = "nagl",
        residue_name: str = "LIG",
    ) -> Molecule:
        """Build the substrate component.

        Args:
            sdf_path: Path to docked conformers SDF.
            conformer_index: Index of conformer to use.
            charge_method: Charge assignment method.
            residue_name: 3-letter residue name.

        Returns:
            Charged substrate molecule.
        """
        self._substrate_molecule = self._substrate_builder.build(
            sdf_path=sdf_path,
            conformer_index=conformer_index,
            charge_method=charge_method,
            residue_name=residue_name,
        )
        self._n_substrate_molecules = 1
        return self._substrate_molecule



[docs]
    def build_polymers(
        self,
        characters: List[str],
        probabilities: List[float],
        length: int,
        count: int,
        type_prefix: str,
        sdf_directory: Optional[Union[str, Path]] = None,
        seed: Optional[int] = None,
        # New parameters for dynamic generation
        generation_mode: str = "cached",
        monomer_smiles: Optional[Dict[str, str]] = None,
        monomer_names: Optional[Dict[str, str]] = None,
        residue_names: Optional[Dict[str, str]] = None,
        reactions: Optional[Any] = None,
        charger_type: str = "nagl",
        max_retries: int = 10,
        cache_directory: Optional[Union[str, Path]] = None,
    ) -> Tuple[List[Molecule], List[int]]:
        """Build polymer components.

        Supports two generation modes:
        - "cached": Load pre-built polymer SDFs from sdf_directory
        - "dynamic": Generate polymers on-the-fly from monomer SMILES

        Args:
            characters: Monomer labels.
            probabilities: Monomer selection probabilities.
            length: Monomers per chain.
            count: Number of polymer chains.
            type_prefix: Filename prefix.
            sdf_directory: Directory with pre-built polymer SDFs (cached mode).
            seed: Random seed for reproducibility.
            generation_mode: "cached" or "dynamic".
            monomer_smiles: Dict of monomer name -> SMILES (dynamic mode).
            monomer_names: Dict of label -> monomer name (dynamic mode).
            residue_names: Dict of monomer name -> 3-char residue name.
            reactions: ReactionConfig with ATRP reaction paths (dynamic mode).
            charger_type: Charge method ("nagl", "espaloma", "am1bcc").
            max_retries: Max retries for polymer generation.
            cache_directory: Directory for caching generated polymers.

        Returns:
            Tuple of (unique polymer molecules, counts).
        """
        self._polymer_builder = PolymerBuilder(
            characters=characters,
            probabilities=probabilities,
            length=length,
            type_prefix=type_prefix,
            sdf_directory=sdf_directory,
            cache_directory=cache_directory,
            generation_mode=generation_mode,
            monomer_smiles=monomer_smiles,
            monomer_names=monomer_names,
            residue_names=residue_names,
            reactions=reactions,
            charger_type=charger_type,
            max_retries=max_retries,
        )

        molecules, counts = self._polymer_builder.build(count=count, seed=seed)
        self._polymer_molecules = molecules
        self._polymer_counts = counts
        self._n_polymer_chains = sum(counts)

        return molecules, counts



[docs]
    def combine_solutes(self) -> Topology:
        """Combine enzyme, substrate, and polymers into a single topology.

        Returns:
            Combined topology ready for solvation.

        Raises:
            RuntimeError: If enzyme has not been built.
        """
        if self._enzyme_topology is None:
            raise RuntimeError("Enzyme must be built before combining solutes")

        LOGGER.info("Combining solute components")

        from openff.toolkit import Topology

        # Start with enzyme
        molecules = [self._enzyme_topology.molecule(0)]

        # Add substrate if present
        if self._substrate_molecule is not None:
            molecules.append(self._substrate_molecule)

        # Create combined topology
        self._combined_topology = Topology.from_molecules(molecules)

        # Re-number chains
        self._renumber_chains(self._combined_topology)

        LOGGER.info(
            f"Combined topology: {self._combined_topology.n_molecules} molecules, "
            f"{self._combined_topology.n_atoms} atoms"
        )

        return self._combined_topology



[docs]
    def pack_polymers(
        self,
        padding: float = 2.0,
        tolerance: float = 2.0,
        movebadrandom: bool = False,
        working_directory: Optional[Union[str, Path]] = None,
        box_vectors_nm: Optional[List[float]] = None,
    ) -> Topology:
        """Pack polymers around the combined solute topology.

        Args:
            padding: Box padding in nm. Larger values give polymers more room
                and can significantly speed up PACKMOL convergence.  Ignored
                when *box_vectors_nm* is provided.
            tolerance: PACKMOL tolerance in Angstrom.
            movebadrandom: When True, pass the ``movebadrandom`` keyword to
                PACKMOL. Improves convergence for dense or heterogeneous
                polymer systems (many unique chain types) by placing
                badly-packed molecules at random positions in the box.
            working_directory: Directory for PACKMOL files.
            box_vectors_nm: Optional explicit box dimensions ``[Lx, Ly, Lz]``
                in nanometers.  When provided, overrides the auto-computed
                bounding box + *padding*.  The protein is centered at the
                midpoint of this box.

        Returns:
            Topology with polymers packed.

        Raises:
            RuntimeError: If solutes not combined or no polymers built.
        """
        if self._combined_topology is None:
            self.combine_solutes()

        if not self._polymer_molecules:
            LOGGER.info("No polymers to pack, returning combined topology")
            return self._combined_topology

        import numpy as np
        from openff.units import Quantity

        from polyzymd.utils import boxvectors
        from polyzymd.utils.packmol import pack_polymers as _pack_polymers

        # Calculate box vectors — explicit override or auto from bbox + padding
        if box_vectors_nm is not None:
            LOGGER.info(
                f"Using explicit box vectors: "
                f"[{box_vectors_nm[0]:.2f}, {box_vectors_nm[1]:.2f}, "
                f"{box_vectors_nm[2]:.2f}] nm"
            )
            box_vecs = Quantity(np.diag(box_vectors_nm), "nanometer")
        else:
            padding_qty = Quantity(padding, "nanometer")
            min_box_vecs = boxvectors.get_topology_bbox(self._combined_topology)
            box_vecs = boxvectors.pad_box_vectors_uniform(min_box_vecs, padding_qty)

        LOGGER.info(
            f"Packing {sum(self._polymer_counts)} polymer chains "
            f"({len(self._polymer_molecules)} unique types), "
            f"tolerance={tolerance} A" + (" [movebadrandom]" if movebadrandom else "")
        )

        # Pack polymers using our custom PACKMOL runner (supports movebadrandom)
        packed_top = _pack_polymers(
            molecules=self._polymer_molecules,
            number_of_copies=self._polymer_counts,
            solute=self._combined_topology,
            box_vectors=box_vecs,
            tolerance_angstrom=tolerance,
            movebadrandom=movebadrandom,
            working_directory=str(working_directory) if working_directory else None,
            retain_working_files=True,
        )

        # Re-number chains
        self._renumber_chains(packed_top)

        self._combined_topology = packed_top

        LOGGER.info(f"Packed topology: {packed_top.n_atoms} atoms")

        return packed_top



[docs]
    def solvate(
        self,
        composition: Optional[SolventComposition] = None,
        padding: float = 0.9,
        box_shape: str = "rhombic_dodecahedron",
    ) -> Topology:
        """Solvate the system with water and ions.

        Args:
            composition: Solvent composition specification.
            padding: Box padding in nm.
            box_shape: Box geometry.

        Returns:
            Solvated topology.

        Raises:
            RuntimeError: If solutes not combined.
        """
        if self._combined_topology is None:
            self.combine_solutes()

        LOGGER.info("Solvating system")

        self._solvated_topology = self._solvent_builder.solvate(
            topology=self._combined_topology,
            composition=composition,
            padding=padding,
            box_shape=box_shape,
        )

        return self._solvated_topology



[docs]
    def create_interchange(self) -> Interchange:
        """Create the OpenFF Interchange for simulation.

        By default, passes the entire solvated topology to a single
        ``ForceField.create_interchange()`` call.  OpenFF's internal
        ``identical_molecule_groups`` mechanism deduplicates SMIRKS
        matching and charge assignment so that work scales with the
        number of *unique* molecule types, not total molecules.

        Pre-computed charges are supplied via ``charge_from_molecules``
        for water, polymers, substrate, and co-solvents so that no
        AM1BCC calculations are triggered at runtime.

        Returns:
            OpenFF Interchange object.

        Raises:
            RuntimeError: If system not solvated.
        """
        if self._solvated_topology is None:
            raise RuntimeError("System must be solvated before creating Interchange")

        from openff.toolkit import ForceField

        from polyzymd.data.solvent_molecules import get_solvent_molecule

        LOGGER.info("Creating Interchange")

        ff = ForceField(self._protein_ff, self._sm_ff)

        # Get water molecule with pre-computed charges for charge_from_molecules
        # Use PolyzyMD's cached water instead of Polymerist's for consistency
        water_model = "tip3p"
        if self._solvent_builder._composition:
            water_model = self._solvent_builder._composition.water_model
        water_mol = get_solvent_molecule(water_model)

        LOGGER.info("Using single-call Interchange creation")
        self._interchange = self._create_interchange_single_call(ff, water_mol)

        LOGGER.info("Interchange created successfully")

        return self._interchange


    def _build_molecule_name_mapping(self) -> Dict[str, str]:
        """Build SMILES -> friendly name mapping for logging.

        Creates a mapping from molecule SMILES to human-readable names
        for use in log messages during Interchange creation.

        Returns:
            Dictionary mapping SMILES strings to display names.

        Example output:
            {
                "[large-enzyme-smiles]": "LipA",
                "[substrate-smiles]": "ResorufinButyrate",
                "[polymer-smiles]": "EGPMA-SBMA_AAABA",
                "CS(=O)C": "dmso",
            }
        """
        name_map: Dict[str, str] = {}

        # Add enzyme name from config or default
        if self._enzyme_topology:
            enzyme_smiles = self._enzyme_topology.molecule(0).to_smiles()
            config = getattr(self, "_config", None)
            if config and hasattr(config, "enzyme") and config.enzyme:
                name_map[enzyme_smiles] = config.enzyme.name
            else:
                name_map[enzyme_smiles] = "Enzyme"

        # Add substrate name from config or default
        if self._substrate_molecule:
            substrate_smiles = self._substrate_molecule.to_smiles()
            config = getattr(self, "_config", None)
            if config and hasattr(config, "substrate") and config.substrate:
                name_map[substrate_smiles] = config.substrate.name
            else:
                name_map[substrate_smiles] = "Substrate"

        # Add polymer names: type_prefix + sequence (e.g., "EGPMA-SBMA_AAABA")
        if self._polymer_builder:
            for sequence, mol in self._polymer_builder.loaded_molecules.items():
                polymer_smiles = mol.to_smiles()
                name_map[polymer_smiles] = f"{self._polymer_builder._type_prefix}_{sequence}"

        # Add co-solvent names
        if self._solvent_builder._composition:
            for cosolvent in self._solvent_builder._composition.co_solvents:
                if cosolvent.molecule:
                    cosolvent_smiles = cosolvent.molecule.to_smiles()
                    # Use name if available, otherwise use SMILES
                    name_map[cosolvent_smiles] = cosolvent.name

        return name_map

    def _build_charge_template_mapping(self, water_mol: Molecule) -> Dict[str, Molecule]:
        """Build SMILES -> charge template molecule mapping.

        Creates a mapping from molecule SMILES to template molecules with
        pre-computed charges for use with charge_from_molecules parameter.

        Args:
            water_mol: Water molecule with pre-computed charges.

        Returns:
            Dictionary mapping SMILES strings to charged template molecules.
        """
        smiles_to_template: Dict[str, Molecule] = {}

        # Add water
        smiles_to_template[water_mol.to_smiles()] = water_mol

        # Add polymers (already have charges from SDF)
        for mol in self._polymer_molecules:
            smiles_to_template[mol.to_smiles()] = mol

        # Add substrate (already has charges from NAGL/AM1BCC)
        if self._substrate_molecule:
            smiles_to_template[self._substrate_molecule.to_smiles()] = self._substrate_molecule

        # Add co-solvents with pre-computed charges
        if self._solvent_builder._composition:
            for cosolvent in self._solvent_builder._composition.co_solvents:
                if cosolvent.molecule is not None:
                    smiles_to_template[cosolvent.molecule.to_smiles()] = cosolvent.molecule

        return smiles_to_template

    def _create_interchange_single_call(self, ff: ForceField, water_mol: Molecule) -> Interchange:
        """Create Interchange with a single ``ForceField.create_interchange()`` call.

        Passes the entire solvated topology at once.  OpenFF internally
        groups identical molecules (``Topology.identical_molecule_groups``)
        so SMIRKS matching and charge assignment scale with the number of
        *unique* molecule types, not total molecules.

        This avoids ``Interchange.combine()`` entirely — previously the
        dominant bottleneck (84 min for a 308 K-atom system).

        Args:
            ff: OpenFF ForceField.
            water_mol: Water molecule with pre-computed charges.

        Returns:
            Interchange object with box vectors set.
        """
        # Build charge templates: one per unique species
        t0 = time.perf_counter()
        charge_from: List[Molecule] = [water_mol]

        seen_smiles: set = set()
        for mol in self._polymer_molecules:
            smi = mol.to_smiles()
            if smi not in seen_smiles:
                charge_from.append(mol)
                seen_smiles.add(smi)

        if self._substrate_molecule:
            charge_from.append(self._substrate_molecule)

        if self._solvent_builder._composition:
            for cosolvent in self._solvent_builder._composition.co_solvents:
                if cosolvent.molecule is not None:
                    charge_from.append(cosolvent.molecule)

        LOGGER.info(
            f"Charge templates: {len(charge_from)} "
            f"(water + {len(seen_smiles)} polymer type(s) + substrate + co-solvents)"
        )
        LOGGER.info(f"  Template prep took {time.perf_counter() - t0:.1f}s")

        # Single parameterization call
        t1 = time.perf_counter()
        LOGGER.info(
            f"Calling ff.create_interchange() on full topology "
            f"({self._solvated_topology.n_molecules} molecules, "
            f"{self._solvated_topology.n_atoms} atoms) ..."
        )

        # Suppress per-atom INFO logging from OpenFF's LibraryCharges handler.
        # It emits one line per atom during charge assignment (~200K lines for
        # a typical solvated system), which floods logs without adding value.
        _nonbonded_logger = logging.getLogger("openff.interchange.smirnoff._nonbonded")
        _prev_level = _nonbonded_logger.level
        _nonbonded_logger.setLevel(logging.WARNING)
        try:
            interchange = ff.create_interchange(
                self._solvated_topology,
                charge_from_molecules=charge_from,
            )
        finally:
            _nonbonded_logger.setLevel(_prev_level)

        t2 = time.perf_counter()
        LOGGER.info(f"  create_interchange() completed in {t2 - t1:.1f}s")

        # Set box vectors
        interchange.box = self._solvated_topology.box_vectors

        return interchange

    def _renumber_chains(self, topology: Topology) -> None:
        """Normalize chain IDs before canonical PDB identifier assignment.

        This lightweight normalization keeps intermediate OpenFF metadata
        deterministic before ``_assign_pdb_identifiers()`` applies the
        canonical PolyzyMD chain convention during topology export.

        Args:
            topology: Topology to modify.
        """
        for i, mol in enumerate(topology.molecules):
            for atom in mol.atoms:
                atom.metadata["chain_id"] = str(i + 1)

    def _assign_pdb_identifiers(self) -> None:
        """Assign PDB-compliant chain IDs and residue numbers to the solvated topology.

        This method assigns chain IDs and residue numbers based on molecule types
        tracked during the build process. The config YAML serves as the single
        source of truth for what each molecule represents.

        Chain assignment uses FIXED letters regardless of component presence:
        - Chain A: Protein (preserves original residue numbers from input PDB)
        - Chain B: Substrate (residue 1; letter reserved even if no substrate)
        - Chain C: Polymers (preserves per-monomer residue numbers)
        - Chain D+: Solvent (overflow at 9999 residues per chain)

        Using fixed letters ensures consistency with SystemComponentInfo and
        AtomGroupResolver, which hardcode A=protein, B=substrate, C=polymer.
        Without this, absent components (e.g., no substrate) would shift later
        chain letters, causing restraints to target wrong atoms.

        This ensures every atom can be uniquely identified by the tuple
        (chain_id, residue_number, residue_name, atom_name) for downstream
        analysis tools like MDAnalysis and PyMOL.

        Raises:
            RuntimeError: If no solvated topology exists.
        """
        if self._solvated_topology is None:
            raise RuntimeError("No solvated topology. Call solvate() first.")

        CHAIN_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        mol_idx = 0

        # Fixed chain letter assignments per component type.
        # A=protein, B=substrate, C=polymer, D+=solvent — regardless of whether
        # a component is present. This ensures downstream code (SystemComponentInfo,
        # AtomGroupResolver, from_topology()) always sees the expected chain IDs.
        PROTEIN_CHAIN = "A"
        SUBSTRATE_CHAIN = "B"
        POLYMER_CHAIN = "C"
        SOLVENT_START_IDX = 3  # index of 'D' in CHAIN_LETTERS

        # 1. Protein: Always chain A, preserve original residue numbers
        if self._n_enzyme_molecules > 0:
            LOGGER.debug(f"Assigning chain {PROTEIN_CHAIN} to protein")

            for _ in range(self._n_enzyme_molecules):
                mol = self._solvated_topology.molecule(mol_idx)
                for atom in mol.atoms:
                    atom.metadata["chain_id"] = PROTEIN_CHAIN
                    # Ensure residue_number is a string (PDB loader may store as int)
                    # OpenMM's addResidue(id=...) expects a string
                    if "residue_number" in atom.metadata:
                        atom.metadata["residue_number"] = str(atom.metadata["residue_number"])
                mol_idx += 1

        # 2. Substrate: Always chain B, residue 1
        if self._n_substrate_molecules > 0:
            LOGGER.debug(f"Assigning chain {SUBSTRATE_CHAIN} to substrate")

            for _ in range(self._n_substrate_molecules):
                mol = self._solvated_topology.molecule(mol_idx)
                for atom in mol.atoms:
                    atom.metadata["chain_id"] = SUBSTRATE_CHAIN
                    atom.metadata["residue_number"] = "1"
                mol_idx += 1

        # 3. Polymers: Always chain C, continue residue numbering across chains
        if self._n_polymer_chains > 0:
            LOGGER.debug(
                f"Assigning chain {POLYMER_CHAIN} to {self._n_polymer_chains} polymer chain(s)"
            )

            # Track residue number across all polymer chains (continue, don't restart)
            polymer_residue_num = 1

            for _ in range(self._n_polymer_chains):
                mol = self._solvated_topology.molecule(mol_idx)

                # Group atoms by their current residue_number to identify monomers
                # Polymer molecules have per-monomer residue metadata from SDF
                current_monomer_residue = None

                for atom in mol.atoms:
                    atom.metadata["chain_id"] = POLYMER_CHAIN

                    # Check if this atom belongs to a new monomer
                    atom_residue = atom.metadata.get("residue_number", "0")
                    if atom_residue != current_monomer_residue:
                        # New monomer - increment our counter
                        if current_monomer_residue is not None:
                            polymer_residue_num += 1
                        current_monomer_residue = atom_residue

                    # Assign the sequential residue number
                    atom.metadata["residue_number"] = str(polymer_residue_num)

                # After processing this polymer chain, increment for next chain's first monomer
                polymer_residue_num += 1
                mol_idx += 1

        # 4. Solvent: Always starts at chain D (index 3)
        self._assign_solvent_identifiers(
            start_mol_idx=mol_idx,
            start_chain_idx=SOLVENT_START_IDX,
            chain_letters=CHAIN_LETTERS,
        )

        LOGGER.info(
            f"PDB identifiers assigned: protein={self._n_enzyme_molecules}, "
            f"substrate={self._n_substrate_molecules}, polymers={self._n_polymer_chains}, "
            f"solvent molecules start at chain {CHAIN_LETTERS[SOLVENT_START_IDX]}"
        )

    def _assign_solvent_identifiers(
        self,
        start_mol_idx: int,
        start_chain_idx: int,
        chain_letters: str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        max_residue: int = 9999,
    ) -> None:
        """Assign chain IDs and residue numbers to solvent molecules.

        Solvent molecules (water, ions, co-solvents) are numbered sequentially.
        When residue number exceeds max_residue (9999), overflow to the next
        chain letter and restart residue numbering at 1.

        Args:
            start_mol_idx: Index of first solvent molecule in topology.
            start_chain_idx: Index into chain_letters for first solvent chain.
            chain_letters: Available chain ID characters.
            max_residue: Maximum residue number before overflow (default 9999).
        """
        chain_idx = start_chain_idx
        residue_num = 1

        n_solvent = self._solvated_topology.n_molecules - start_mol_idx
        LOGGER.debug(f"Assigning identifiers to {n_solvent} solvent molecules")

        for mol_idx in range(start_mol_idx, self._solvated_topology.n_molecules):
            # Check for overflow before processing this molecule
            if residue_num > max_residue:
                chain_idx += 1
                residue_num = 1

                if chain_idx >= len(chain_letters):
                    LOGGER.warning(
                        f"Exceeded {len(chain_letters)} chain letters - cycling. "
                        "Consider using a topology format with larger chain ID capacity."
                    )
                    chain_idx = chain_idx % len(chain_letters)

            chain_id = chain_letters[chain_idx]
            mol = self._solvated_topology.molecule(mol_idx)

            for atom in mol.atoms:
                atom.metadata["chain_id"] = chain_id
                atom.metadata["residue_number"] = str(residue_num)

            residue_num += 1

        # Log summary
        n_chains_used = chain_idx - start_chain_idx + 1
        if n_chains_used > 1:
            LOGGER.info(
                f"Solvent required {n_chains_used} chains "
                f"({chain_letters[start_chain_idx]}-{chain_letters[chain_idx]})"
            )


[docs]
    def save_topology(
        self,
        path: Union[str, Path],
        topology: Optional[Topology] = None,
    ) -> None:
        """Save a topology to PDB file.

        Assigns PDB-compliant chain IDs and residue numbers before writing
        to ensure downstream analysis tools can uniquely identify all atoms.

        Args:
            path: Output path.
            topology: Topology to save (defaults to solvated).
        """
        if topology is None:
            topology = self._solvated_topology

        if topology is None:
            raise RuntimeError("No topology to save")

        # Assign PDB-compliant identifiers before writing
        # This ensures unique (chain_id, residue_number, atom_name) tuples
        if topology is self._solvated_topology:
            self._assign_pdb_identifiers()

        path = Path(path)
        topology.to_file(str(path), keep_ids=True)
        LOGGER.info(f"Saved topology to {path}")



[docs]
    def build_from_config(
        self,
        config: "SimulationConfig",
        working_dir: Optional[Union[str, Path]] = None,
        polymer_seed: Optional[int] = None,
    ) -> Interchange:
        """Build the complete system from a configuration.

        This is the main entry point for config-driven builds.

        Args:
            config: SimulationConfig with all settings.
            working_dir: Working directory for output files.
            polymer_seed: Random seed for polymer generation. This is used as a
                fallback if config.polymers.random_seed is not set.

        Returns:
            OpenFF Interchange ready for simulation.
        """
        self._working_dir = Path(working_dir) if working_dir else None

        # 1. Build enzyme
        LOGGER.info(f"Building enzyme: {config.enzyme.name}")
        self.build_enzyme(config.enzyme.pdb_path)

        # 2. Build substrate (if configured)
        if config.substrate:
            LOGGER.info(f"Building substrate: {config.substrate.name}")
            self._substrate_builder.build_from_config(config.substrate)
            self._substrate_molecule = self._substrate_builder.molecule
            self._n_substrate_molecules = 1

        # 3. Combine enzyme + substrate
        self.combine_solutes()

        # 4. Build and pack polymers (if configured)
        if config.polymers and config.polymers.enabled:
            LOGGER.info(f"Building polymers: {config.polymers.type_prefix}")

            characters = [m.label for m in config.polymers.monomers]
            probabilities = [m.probability for m in config.polymers.monomers]

            # Determine effective seed: config.random_seed takes precedence over polymer_seed
            effective_seed = config.polymers.random_seed
            if effective_seed is None:
                effective_seed = polymer_seed
            LOGGER.info(f"Using polymer random seed: {effective_seed}")

            # Extract dynamic mode parameters if applicable
            generation_mode = config.polymers.generation_mode.value
            monomer_smiles = None
            monomer_names = None
            residue_names = None
            reactions = None

            if generation_mode == "dynamic":
                # Build monomer_smiles: name -> SMILES
                monomer_smiles = {
                    m.name: m.smiles for m in config.polymers.monomers if m.smiles is not None
                }
                # Build monomer_names: label -> name
                monomer_names = {m.label: m.name for m in config.polymers.monomers}
                # Build residue_names: name -> 3-char residue name
                residue_names = {
                    m.name: m.residue_name
                    for m in config.polymers.monomers
                    if m.residue_name is not None
                }
                # Get reaction config
                reactions = config.polymers.reactions

                LOGGER.info(
                    f"Dynamic mode: monomers={list(monomer_smiles.keys())}, "
                    f"charger={config.polymers.charger.value}"
                )

            self.build_polymers(
                characters=characters,
                probabilities=probabilities,
                length=config.polymers.length,
                count=config.polymers.count,
                type_prefix=config.polymers.type_prefix,
                sdf_directory=config.polymers.sdf_directory,
                seed=effective_seed,
                generation_mode=generation_mode,
                monomer_smiles=monomer_smiles,
                monomer_names=monomer_names,
                residue_names=residue_names,
                reactions=reactions,
                charger_type=config.polymers.charger.value,
                max_retries=config.polymers.max_retries,
                cache_directory=config.polymers.cache_directory,
            )

            # Get packing config (uses defaults if not specified)
            packing = config.polymers.packing
            self.pack_polymers(
                padding=packing.padding,
                tolerance=packing.tolerance,
                movebadrandom=packing.movebadrandom,
                working_directory=self._working_dir,
            )

        # 5. Solvate
        LOGGER.info("Solvating system")
        self._solvent_builder.solvate_from_config(self._combined_topology, config.solvent)
        self._solvated_topology = self._solvent_builder.solvated_topology

        # Save solvated PDB if working dir specified
        if self._working_dir:
            self._working_dir.mkdir(parents=True, exist_ok=True)
            pdb_path = self._working_dir / "solvated_system.pdb"
            self.save_topology(pdb_path)

        # 6. Create Interchange
        self.create_interchange()

        return self._interchange



[docs]
    def get_openmm_components(self) -> Tuple[Any, Any, Any]:
        """Extract OpenMM components from the Interchange.

        Returns:
            Tuple of (topology, system, positions).

        Raises:
            RuntimeError: If Interchange not created.
        """
        if self._interchange is None:
            raise RuntimeError("Interchange not created. Call create_interchange() first.")

        from openff.interchange.interop.openmm._positions import to_openmm_positions

        t0 = time.perf_counter()
        omm_topology = self._interchange.to_openmm_topology()
        t_topo = time.perf_counter() - t0
        LOGGER.info("  to_openmm_topology: %.1fs", t_topo)

        t0 = time.perf_counter()
        omm_system = self._interchange.to_openmm(combine_nonbonded_forces=True)
        t_sys = time.perf_counter() - t0
        LOGGER.info("  to_openmm (system): %.1fs", t_sys)

        t0 = time.perf_counter()
        omm_positions = to_openmm_positions(self._interchange, include_virtual_sites=True)
        t_pos = time.perf_counter() - t0
        LOGGER.info("  to_openmm_positions: %.1fs", t_pos)

        LOGGER.info(
            "  OpenMM extraction total: %.1fs (topo=%.1f, sys=%.1f, pos=%.1f)",
            t_topo + t_sys + t_pos,
            t_topo,
            t_sys,
            t_pos,
        )

        return omm_topology, omm_system, omm_positions



[docs]
    def export_to_gromacs(
        self,
        output_dir: Union[str, Path],
        prefix: Optional[str] = None,
        gmx_command: str = "gmx",
        generate_mdps: bool = True,
    ) -> Dict[str, Any]:
        """Export the system to GROMACS format with full simulation setup.

        Generates a complete GROMACS simulation setup including:
        - .gro (coordinates) and .top (topology) files
        - MDP files for energy minimization, equilibration, and production
        - Position restraint files for equilibration stages
        - Run script for executing the full workflow

        The topology is split into separate .itp files for each molecule type
        (monolithic=False), which is cleaner for multi-component systems.

        MDP files are generated from config.yaml parameters to match OpenMM
        simulation settings (temperature, pressure, duration, etc.). OpenFF
        defaults are used for force field parameters (rcoulomb=0.9, rvdw=0.9,
        PME, etc.) to ensure 1:1 parity with OpenMM.

        Args:
            output_dir: Directory to write GROMACS files. Will be created if it
                doesn't exist.
            prefix: Filename prefix for output files. If None, generates a
                descriptive name from the config (e.g., "LipA_EGPMA-SBMA").
            gmx_command: GROMACS command/path for the run script (default "gmx").
            generate_mdps: If True (default), generate MDP files from config.
                If False, only export coordinates and topology.

        Returns:
            Dictionary with paths to generated files:
            - "gro": Path to coordinate file
            - "top": Path to topology file
            - "em_mdp": Path to energy minimization MDP (if generate_mdps=True)
            - "eq_mdps": List of equilibration MDP paths (if generate_mdps=True)
            - "prod_mdp": Path to production MDP (if generate_mdps=True)
            - "posres": Dict of position restraint files (if applicable)
            - "run_script": Path to run script (if generate_mdps=True)

        Raises:
            RuntimeError: If Interchange has not been created.

        Example:
            >>> builder = SystemBuilder.from_config(config)
            >>> builder.build_from_config(config, working_dir)
            >>> result = builder.export_to_gromacs(
            ...     output_dir="gromacs/",
            ...     prefix="my_system"
            ... )
            >>> print(f"Run: cd {result['gro'].parent} && ./{result['run_script'].name}")
        """
        if self._interchange is None:
            raise RuntimeError(
                "Interchange not created. Call create_interchange() or build_from_config() first."
            )

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Generate prefix from config if not provided
        if prefix is None:
            prefix = self._generate_gromacs_prefix()

        LOGGER.info(f"Exporting to GROMACS format: {output_dir / prefix}.*")

        # Check if we have config for MDP generation
        config = getattr(self, "_config", None)

        if generate_mdps and config is None:
            raise RuntimeError(
                "GROMACS MDP generation requires a validated SimulationConfig. "
                "Pass generate_mdps=False to export coordinates and topology only."
            )

        if generate_mdps:
            # Use GromacsExporter for full export with MDP generation
            from polyzymd.exporters.gromacs import GromacsExporter

            # Get component info for position restraints
            try:
                component_info = self.get_component_info()
            except RuntimeError:
                component_info = None
                LOGGER.warning(
                    "Could not get component info for position restraints. "
                    "Position restraint files will not be generated."
                )

            exporter = GromacsExporter(
                interchange=self._interchange,
                config=config,
                component_info=component_info,
            )

            result = exporter.export(
                output_dir=output_dir,
                prefix=prefix,
                gmx_command=gmx_command,
            )

            return result
        # Build-only export without MDP generation
        return self._export_gromacs_minimal(output_dir, prefix)


    def _export_gromacs_minimal(
        self,
        output_dir: Path,
        prefix: str,
    ) -> Dict[str, Any]:
        """Export GROMACS coordinates and topology only.

        This build-only mode lets users export ``.gro``, ``.top``, and
        ``.itp`` files and then manage downstream GROMACS inputs themselves.

        Args:
            output_dir: Output directory.
            prefix: Filename prefix.

        Returns:
            Dictionary with gro and top paths.
        """
        # Fix 0-indexed residues (GROMACS requires 1-indexed)
        self._fix_zero_indexed_residues()

        # Export using OpenFF Interchange
        output_prefix = str(output_dir / prefix)
        self._interchange.to_gromacs(
            prefix=output_prefix,
            monolithic=False,
            _merge_atom_types=True,
        )

        gro_path = output_dir / f"{prefix}.gro"
        top_path = output_dir / f"{prefix}.top"
        mdp_path = output_dir / f"{prefix}.mdp"

        LOGGER.info("Generated GROMACS files:")
        LOGGER.info(f"  Coordinates: {gro_path}")
        LOGGER.info(f"  Topology:    {top_path}")
        LOGGER.info(f"  MDP stub:    {mdp_path}")
        LOGGER.info("")
        LOGGER.info("Note: The .mdp file is a stub for single-point energy calculation.")
        LOGGER.info("For MD simulations, use generate_mdps=True with a config file,")
        LOGGER.info("or create custom MDP files.")

        return {
            "gro": gro_path,
            "top": top_path,
            "mdp_stub": mdp_path,
        }

    def _generate_gromacs_prefix(self) -> str:
        """Generate a descriptive filename prefix from the config.

        Creates a prefix in the format: {enzyme_name}_{polymer_prefix}
        Falls back to "system" if config information is not available.

        Returns:
            Filename prefix string (without extension).
        """
        parts = []

        # Get enzyme name from config
        config = getattr(self, "_config", None)
        if config and hasattr(config, "enzyme") and config.enzyme:
            parts.append(config.enzyme.name)

        # Get polymer type prefix from config
        if config and hasattr(config, "polymers") and config.polymers:
            if config.polymers.enabled and config.polymers.type_prefix:
                parts.append(config.polymers.type_prefix)

        if parts:
            return "_".join(parts)
        else:
            return "system"

    def _fix_zero_indexed_residues(self) -> None:
        """Fix 0-indexed residues in the topology for GROMACS compatibility.

        GROMACS requires residue numbers to be 1-indexed. This method checks
        all atoms in the topology and increments any 0-indexed residue numbers.
        """
        if self._interchange is None:
            return

        found_zero_indexed = False

        # First pass: check for 0-indexed residues
        for molecule in self._interchange.topology.molecules:
            for atom in molecule.atoms:
                residue_num = atom.metadata.get("residue_number")
                if residue_num is not None:
                    # Handle both string and int representations
                    if isinstance(residue_num, str):
                        try:
                            if int(residue_num) == 0:
                                found_zero_indexed = True
                                break
                        except ValueError:
                            pass
                    elif residue_num == 0:
                        found_zero_indexed = True
                        break
            if found_zero_indexed:
                break

        if not found_zero_indexed:
            LOGGER.debug("No 0-indexed residues found")
            return

        # Second pass: fix 0-indexed residues
        LOGGER.info("Fixing 0-indexed residues for GROMACS compatibility")
        for molecule in self._interchange.topology.molecules:
            for atom in molecule.atoms:
                residue_num = atom.metadata.get("residue_number")
                if residue_num is not None:
                    if isinstance(residue_num, str):
                        try:
                            atom.metadata["residue_number"] = str(int(residue_num) + 1)
                        except ValueError:
                            pass
                    else:
                        atom.metadata["residue_number"] = residue_num + 1

        LOGGER.info("Fixed all 0-indexed residues")


[docs]
    def get_component_info(self) -> "SystemComponentInfo":
        """Get system component information for atom group resolution.

        This method returns a SystemComponentInfo dataclass containing the
        atom counts and chain assignments for each system component (protein,
        substrate, polymers, solvent). This information is needed by the
        AtomGroupResolver to resolve predefined atom group names to indices.

        Returns:
            SystemComponentInfo with atom counts and chain assignments

        Raises:
            RuntimeError: If solvated topology not created.
        """
        from polyzymd.core.atom_groups import SystemComponentInfo

        if self._solvated_topology is None:
            raise RuntimeError("No solvated topology. Call solvate() first.")

        # Count atoms per component from topology
        n_protein_atoms = 0
        n_substrate_atoms = 0
        n_polymer_atoms = 0

        mol_idx = 0

        # Protein atoms
        for _ in range(self._n_enzyme_molecules):
            mol = self._solvated_topology.molecule(mol_idx)
            n_protein_atoms += mol.n_atoms
            mol_idx += 1

        # Substrate atoms
        for _ in range(self._n_substrate_molecules):
            mol = self._solvated_topology.molecule(mol_idx)
            n_substrate_atoms += mol.n_atoms
            mol_idx += 1

        # Polymer atoms
        for _ in range(self._n_polymer_chains):
            mol = self._solvated_topology.molecule(mol_idx)
            n_polymer_atoms += mol.n_atoms
            mol_idx += 1

        LOGGER.debug(
            f"Component info: protein={n_protein_atoms} atoms, "
            f"substrate={n_substrate_atoms} atoms, polymer={n_polymer_atoms} atoms"
        )

        return SystemComponentInfo(
            n_protein_atoms=n_protein_atoms,
            n_substrate_atoms=n_substrate_atoms,
            n_polymer_atoms=n_polymer_atoms,
            protein_chain_id="A",
            substrate_chain_id="B",
            polymer_chain_id="C",
            solvent_start_chain_id="D",
        )