# Source code for polyzymd.analyses.shared.config_hash

"""Config hashing for analysis cache validation.

When analysis results are cached, we store a hash of the relevant config
parameters. If the config changes, we warn the user that cached results
may be invalid.

This module provides:
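- `compute_config_hash`: Generate a hash of analysis-relevant config parameters
- `validate_config_hash`: Check if stored hash matches current config
- `settings_fingerprint`: Generate a short fingerprint of analysis settings
- `compute_cache_identity`: Combine config hash, settings fingerprint, and extra
  cache parameters into a single cache identity
- `extract_settings_fingerprint_from_path`: Read a settings fingerprint from a
  cache filename
- `validate_settings_fingerprint`: Check if a stored fingerprint matches the
  current analysis settings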

Design Decision:
    Config immutability is expected. If a user modifies config parameters
    (e.g., temperature), they should create a new project directory.
    The hash validation is a safety check, not an enforcement mechanism.
"""

import hashlib
import json
import re
import warnings
from pathlib import Path
from typing import TYPE_CHECKING

from pydantic import BaseModel

if TYPE_CHECKING:
    from collections.abc import Mapping

    from polyzymd.config.schema import SimulationConfig


# Locates a settings fingerprint embedded in a cache filename: the literal
# "_s", then exactly eight lowercase hexadecimal characters (captured as the
# named group "fp"), followed by either "_" or ".".
SETTINGS_FINGERPRINT_PATTERN = re.compile(r"_s(?P<fp>[0-9a-f]{8})(?:_|\.)")


def compute_config_hash(config: "SimulationConfig") -> str:
    """Hash the config parameters that matter for analysing trajectories.

    The hash covers:

    - the simulation name
    - enzyme name and PDB path
    - thermodynamics (temperature, pressure)
    - output locations (projects directory, effective scratch directory,
      naming template)
    - substrate name and SDF path, when a substrate is configured
    - polymer settings (type prefix, length, count, monomers), when polymers
      are configured and enabled

    Settings that do not change how a finished trajectory is interpreted
    (simulation phases, force field) are deliberately left out.

    Parameters
    ----------
    config : SimulationConfig
        PolyzyMD simulation configuration.

    Returns
    -------
    str
        The first 16 hexadecimal characters of the SHA-256 digest.

    Examples
    --------
    >>> from polyzymd.config import load_config
    >>> config = load_config("config.yaml")
    >>> hash_val = compute_config_hash(config)
    >>> len(hash_val)
    16
    """
    # Sections are gathered one at a time; key order is irrelevant because the
    # JSON below is serialized with sorted keys.
    relevant = {"name": config.name}

    enzyme = config.enzyme
    relevant["enzyme"] = {
        "name": enzyme.name,
        "pdb_path": str(enzyme.pdb_path),
    }

    thermo = config.thermodynamics
    relevant["thermodynamics"] = {
        "temperature": thermo.temperature,
        "pressure": thermo.pressure,
    }

    output = config.output
    relevant["output"] = {
        "projects_directory": str(output.projects_directory),
        "scratch_directory": str(output.effective_scratch_directory),
        "naming_template": output.naming_template,
    }

    # Optional section: substrate.
    substrate = config.substrate
    if substrate is not None:
        relevant["substrate"] = {
            "name": substrate.name,
            "sdf_path": str(substrate.sdf_path),
        }

    # Optional section: polymers, only when switched on.
    polymers = config.polymers
    if polymers is not None and polymers.enabled:
        monomer_entries = []
        for monomer in polymers.monomers:
            monomer_entries.append(
                {
                    "label": monomer.label,
                    "probability": monomer.probability,
                    "name": monomer.name,
                }
            )
        relevant["polymers"] = {
            "type_prefix": polymers.type_prefix,
            "length": polymers.length,
            "count": polymers.count,
            "monomers": monomer_entries,
        }

    # default=str stringifies any value json cannot encode natively.
    canonical = json.dumps(relevant, sort_keys=True, default=str)
    full_digest = hashlib.sha256(canonical.encode()).hexdigest()

    # A 16-character prefix is kept for brevity.
    return full_digest[:16]
def settings_fingerprint(settings: BaseModel) -> str:
    """Return a short, deterministic fingerprint of analysis settings.

    The settings are dumped with ``settings.model_dump(mode="json")``,
    serialized to JSON with sorted keys, and hashed with SHA-256. The
    fingerprint is meant to be part of a cache's identity, so a changed
    setting (a different contacts cutoff, say) yields a different value.

    Parameters
    ----------
    settings : BaseModel
        Analysis plugin settings model.

    Returns
    -------
    str
        The first 8 hexadecimal characters of the SHA-256 digest.
    """
    dumped = settings.model_dump(mode="json")
    canonical = json.dumps(dumped, sort_keys=True)

    hasher = hashlib.sha256()
    hasher.update(canonical.encode("utf-8"))
    return hasher.hexdigest()[:8]
def compute_cache_identity(
    *,
    config_hash: str,
    settings: BaseModel | None = None,
    settings_fp: str | None = None,
    cache_params: "Mapping[str, object] | None" = None,
    length: int = 12,
) -> str:
    """Derive one deterministic cache identity from config and settings.

    Three inputs are folded into the identity:

    - the simulation config hash
    - the analysis settings fingerprint
    - any extra cache parameters

    Parameters
    ----------
    config_hash : str
        Simulation config hash, as returned by :func:`compute_config_hash`.
    settings : BaseModel or None, optional
        Analysis settings model. Consulted only when ``settings_fp`` is not
        given.
    settings_fp : str or None, optional
        Precomputed settings fingerprint. When given, it is used as-is and
        ``settings`` is ignored.
    cache_params : Mapping[str, object] or None, optional
        Further inputs to the identity, such as equilibration or selection.
    length : int, optional
        Number of hex characters to return, by default 12.

    Returns
    -------
    str
        Short hexadecimal identity, safe to embed in filenames.

    Raises
    ------
    ValueError
        If both ``settings`` and ``settings_fp`` are ``None``.
    """
    fingerprint = settings_fp
    if fingerprint is None and settings is None:
        raise ValueError("Provide either settings or settings_fp to compute cache identity")
    if fingerprint is None:
        fingerprint = settings_fingerprint(settings)

    # A missing or empty mapping contributes an empty dict to the payload.
    extras = dict(cache_params) if cache_params else {}

    identity_inputs = {
        "config_hash": config_hash,
        "settings_fingerprint": fingerprint,
        "cache_params": extras,
    }
    # default=str stringifies any value json cannot encode natively.
    serialized = json.dumps(identity_inputs, sort_keys=True, default=str)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()[:length]
def extract_settings_fingerprint_from_path(cache_path: str | Path) -> str | None:
    """Pull the settings fingerprint out of a cache filename, if it has one.

    Only the final path component is searched; parent directories are
    ignored.

    Parameters
    ----------
    cache_path : str or Path
        Path to a cached result file.

    Returns
    -------
    str | None
        The 8-character fingerprint, or ``None`` when the filename does not
        encode one.
    """
    filename = Path(cache_path).name
    found = SETTINGS_FINGERPRINT_PATTERN.search(filename)
    if found is None:
        return None
    return found.group("fp")
def validate_settings_fingerprint(
    stored_fingerprint: str | None,
    current_settings: BaseModel,
    *,
    warn: bool = True,
    source: str | Path | None = None,
) -> bool:
    """Check a cached settings fingerprint against the current settings.

    Parameters
    ----------
    stored_fingerprint : str or None
        Fingerprint taken from cached result metadata or from the filename.
    current_settings : BaseModel
        Plugin settings in effect for this analysis invocation.
    warn : bool, optional
        Whether to emit a ``UserWarning`` when the fingerprint is missing or
        does not match, by default True.
    source : str or Path or None, optional
        Cache source path, included in warning text for diagnostics.

    Returns
    -------
    bool
        ``True`` when the cache is considered compatible with the current
        settings, ``False`` otherwise.

    Notes
    -----
    Legacy cache files may carry no settings fingerprint. They are accepted
    for backward compatibility, with a warning that no strict validation
    took place.
    """
    # The current fingerprint is computed up front, even for legacy caches.
    current_fingerprint = settings_fingerprint(current_settings)
    location = "" if source is None else f" ({source})"

    # Legacy cache: nothing to compare against, so accept it.
    if stored_fingerprint is None:
        if warn:
            legacy_msg = (
                "Cached analysis result is missing settings fingerprint"
                f"{location}; loading legacy cache without strict validation"
            )
            # warn() is called directly here so stacklevel=2 names our caller.
            warnings.warn(legacy_msg, UserWarning, stacklevel=2)
        return True

    if stored_fingerprint == current_fingerprint:
        return True

    if warn:
        mismatch_msg = (
            "Cached settings fingerprint mismatch detected"
            f"{location}: stored={stored_fingerprint}, current={current_fingerprint}. "
            "Recomputing analysis result for current settings."
        )
        warnings.warn(mismatch_msg, UserWarning, stacklevel=2)
    return False
def validate_config_hash(
    stored_hash: str,
    current_config: "SimulationConfig",
    warn: bool = True,
) -> bool:
    """Check whether a stored config hash matches the current config.

    A mismatch means the config differs from the one in effect when the
    analysis was performed. Possible causes:

    1. The user modified the config (bad practice - should create new project)
    2. The analysis was performed on a different config file
    3. A bug in the hashing algorithm

    Parameters
    ----------
    stored_hash : str
        Hash stored in cached analysis results.
    current_config : SimulationConfig
        Current configuration being used.
    warn : bool, optional
        If True (default), emit a ``UserWarning`` with a banner-style message
        when the hashes do not match.

    Returns
    -------
    bool
        True if the hashes match, False otherwise.

    Examples
    --------
    >>> stored_hash = loaded_result.get("config_hash", "")
    >>> config = load_config("config.yaml")
    >>> if not validate_config_hash(stored_hash, config):
    ...     print("Warning: Results may be stale!")
    """
    current_hash = compute_config_hash(current_config)
    if stored_hash == current_hash:
        return True

    if warn:
        # Build the rule once and join the lines explicitly. Mixing adjacent
        # string literals with ``* 70`` repeats the whole concatenated
        # literal, because adjacent literals are joined before ``*`` applies.
        rule = "=" * 70
        warning_msg = "\n".join(
            [
                "",  # leading newline so the banner starts on its own line
                rule,
                "WARNING: CONFIG HASH MISMATCH DETECTED",
                rule,
                f"Stored hash: {stored_hash}",
                f"Current hash: {current_hash}",
                "",
                "This indicates the config.yaml has changed since these results",
                "were computed. Cached results may be INVALID.",
                "",
                "If you intentionally changed the config, you should:",
                " 1. Create a new project directory with 'polyzymd init'",
                " 2. Run new simulations with the updated config",
                " 3. Re-run analysis on the new trajectories",
                "",
                "To recompute analysis with current config, use --recompute flag.",
                rule,
            ]
        )
        # stacklevel=2 attributes the warning to the caller of this function.
        warnings.warn(warning_msg, UserWarning, stacklevel=2)
    return False