# Source code for polyzymd.analyses.shared.aa_classification

"""Amino acid classification and SASA reference data.

This module provides centralized reference data for amino acid properties:

- Maximum accessible surface area (maxASA) from Tien et al. 2013
- Standard amino acid classification by physicochemical properties
- Default MDAnalysis selection strings for each AA class

These constants are used by:

- Protein grouping in contact analysis
- Template generation for analysis configs

References
----------
Tien MZ, Meyer AG, Sydykova DK, Spielman SJ, Wilke CO.
Maximum allowed solvent accessibilities of residues in proteins.
PLoS One. 2013 Nov 21;8(11):e80635.
doi: 10.1371/journal.pone.0080635. PMID: 24278298; PMCID: PMC3836772.
"""

from __future__ import annotations

from enum import Enum
from typing import Final

# Fixed sequence in which amino acid classes are displayed, so that plots stay
# consistent with one another. Used by contacts plotters.
# The "unknown" class is intentionally not part of this ordering.
CANONICAL_AA_CLASS_ORDER: Final[list[str]] = [
    "aromatic",
    "polar",
    "nonpolar",
    "charged_positive",
    "charged_negative",
]


class AAClass(str, Enum):
    """Standard amino acid classifications.

    Members mix in ``str``, so each one compares equal to its plain string
    value (``AAClass.POLAR == "polar"``) and can be used wherever the
    string-keyed tables in this module expect a class name.
    """

    # Side chain contains an aromatic ring.
    AROMATIC = "aromatic"
    # Uncharged side chain that can form hydrogen bonds.
    POLAR = "polar"
    # Hydrophobic side chain.
    NONPOLAR = "nonpolar"
    # Basic side chain.
    CHARGED_POSITIVE = "charged_positive"
    # Acidic side chain.
    CHARGED_NEGATIVE = "charged_negative"
    # Residue name that is not in the classification table.
    UNKNOWN = "unknown"
# =============================================================================
# Maximum Accessible Surface Area (maxASA)
# =============================================================================

# Tien et al. 2013 empirical maxASA values (Angstrom^2), keyed by upper-case
# 3-letter residue name. Only the 20 standard amino acids are listed.
# NOTE(review): these values were previously described as the maximum
# solvent-accessible surface area "in extended tripeptide conformations".
# Tien et al. publish both a theoretical and an empirical scale; confirm
# against the paper how the empirical numbers below were derived.
MAX_ASA_TABLE: Final[dict[str, float]] = {
    "ALA": 121.0,
    "ARG": 265.0,
    "ASN": 187.0,
    "ASP": 187.0,
    "CYS": 148.0,
    "GLU": 214.0,
    "GLN": 214.0,
    "GLY": 97.0,
    "HIS": 216.0,
    "ILE": 195.0,
    "LEU": 191.0,
    "LYS": 230.0,
    "MET": 203.0,
    "PHE": 228.0,
    "PRO": 154.0,
    "SER": 143.0,
    "THR": 163.0,
    "TRP": 264.0,
    "TYR": 255.0,
    "VAL": 165.0,
}

# =============================================================================
# Amino Acid Classification
# =============================================================================

# Standard classification by physicochemical properties.
# Note: Histidine is classified as aromatic (contains imidazole ring).
#
# This table is the single source of truth for class membership: the
# per-class residue lists and selection strings further down are derived from
# it, so the insertion order here fixes both the order of the classes and the
# order of the residues inside each class.
AA_CLASSIFICATION_TABLE: Final[dict[str, str]] = {
    # Aromatic (contain aromatic rings)
    "PHE": "aromatic",
    "TRP": "aromatic",
    "TYR": "aromatic",
    "HIS": "aromatic",  # Imidazole ring
    # Polar (uncharged, can form H-bonds)
    "SER": "polar",
    "THR": "polar",
    "ASN": "polar",
    "GLN": "polar",
    "CYS": "polar",
    # Nonpolar (hydrophobic)
    "ALA": "nonpolar",
    "VAL": "nonpolar",
    "ILE": "nonpolar",
    "LEU": "nonpolar",
    "MET": "nonpolar",
    "GLY": "nonpolar",
    "PRO": "nonpolar",
    # Charged positive (basic)
    "ARG": "charged_positive",
    "LYS": "charged_positive",
    # Charged negative (acidic)
    "ASP": "charged_negative",
    "GLU": "charged_negative",
}

# All standard 3-letter amino acid codes, in classification-table order.
STANDARD_AA_CODES: Final[list[str]] = list(AA_CLASSIFICATION_TABLE)


def _group_residues_by_class(table: dict[str, str]) -> dict[str, list[str]]:
    """Invert a residue -> class mapping into class -> [residues].

    Classes appear in first-seen order and residues keep their table order.
    """
    grouped: dict[str, list[str]] = {}
    for resname, aa_class in table.items():
        grouped.setdefault(aa_class, []).append(resname)
    return grouped


# =============================================================================
# MDAnalysis Selection Strings
# =============================================================================

# Residue names for each class (for programmatic access), e.g.
# AA_CLASS_RESIDUES["aromatic"] == ["PHE", "TRP", "TYR", "HIS"].
AA_CLASS_RESIDUES: Final[dict[str, list[str]]] = _group_residues_by_class(
    AA_CLASSIFICATION_TABLE
)

# Default MDAnalysis selections for each AA class, e.g.
# "protein and resname PHE TRP TYR HIS" for "aromatic".
# These are used in analysis config templates and contacts grouping.
DEFAULT_AA_CLASS_SELECTIONS: Final[dict[str, str]] = {
    aa_class: "protein and resname " + " ".join(resnames)
    for aa_class, resnames in AA_CLASS_RESIDUES.items()
}

# =============================================================================
# Helper Functions
# =============================================================================
# Common protonation-state / variant residue names mapped to the standard
# 3-letter code they are classified as. Kept at module level so the mapping
# is built once instead of on every call.
_RESNAME_VARIANTS: Final[dict[str, str]] = {
    "HIE": "HIS",
    "HID": "HIS",
    "HIP": "HIS",
    "HSE": "HIS",
    "HSD": "HIS",
    "HSP": "HIS",
    "CYSH": "CYS",
    "CYX": "CYS",
    "ASH": "ASP",
    "GLH": "GLU",
    "LYN": "LYS",
}


def get_aa_class(resname: str) -> str:
    """Get amino acid classification for a residue name.

    The name is upper-cased and stripped of surrounding whitespace, common
    protonation-state variants (e.g. ``HIE``, ``CYX``, ``ASH``) are mapped to
    their standard residue, and the result is looked up in
    ``AA_CLASSIFICATION_TABLE``.

    Parameters
    ----------
    resname : str
        3-letter amino acid code (case-insensitive)

    Returns
    -------
    str
        Classification: 'aromatic', 'polar', 'nonpolar',
        'charged_positive', 'charged_negative', or 'unknown'

    Examples
    --------
    >>> get_aa_class("PHE")
    'aromatic'
    >>> get_aa_class("lys")
    'charged_positive'
    >>> get_aa_class("HIE")
    'aromatic'
    >>> get_aa_class("UNK")
    'unknown'
    """
    normalized = resname.upper().strip()
    # Names that are not variants pass through unchanged.
    canonical = _RESNAME_VARIANTS.get(normalized, normalized)
    return AA_CLASSIFICATION_TABLE.get(canonical, "unknown")
def get_max_asa(resname: str) -> float | None:
    """Get maximum accessible surface area for a residue name.

    The name is upper-cased and stripped of surrounding whitespace before it
    is looked up in ``MAX_ASA_TABLE``.

    Parameters
    ----------
    resname : str
        3-letter amino acid code (case-insensitive)

    Returns
    -------
    float or None
        Maximum ASA in Angstrom^2, or None if residue not in table

    Examples
    --------
    >>> get_max_asa("ALA")
    121.0
    >>> get_max_asa("TRP")
    264.0
    >>> get_max_asa("UNK")  # Returns None for unknown residues
    """
    # NOTE(review): unlike get_aa_class, protonation-state variants such as
    # "HIE" or "CYX" are not mapped to their standard residue here, so they
    # return None. Confirm whether callers rely on that.
    lookup_key = resname.upper().strip()
    return MAX_ASA_TABLE.get(lookup_key)
def get_residues_for_class(aa_class: str) -> list[str]:
    """Get all residue names belonging to an amino acid class.

    Parameters
    ----------
    aa_class : str
        One of: 'aromatic', 'polar', 'nonpolar',
        'charged_positive', 'charged_negative'

    Returns
    -------
    list[str]
        List of 3-letter amino acid codes in this class. A new list is
        returned on every call.

    Raises
    ------
    ValueError
        If aa_class is not a valid classification

    Examples
    --------
    >>> get_residues_for_class("aromatic")
    ['PHE', 'TRP', 'TYR', 'HIS']
    """
    try:
        residues = AA_CLASS_RESIDUES[aa_class]
    except KeyError:
        valid = list(AA_CLASS_RESIDUES)
        raise ValueError(f"Unknown AA class '{aa_class}'. Valid: {valid}") from None
    # Return a copy so callers cannot mutate the shared module-level table.
    return residues.copy()
def get_selection_for_class(aa_class: str) -> str:
    """Get MDAnalysis selection string for an amino acid class.

    Parameters
    ----------
    aa_class : str
        One of: 'aromatic', 'polar', 'nonpolar',
        'charged_positive', 'charged_negative'

    Returns
    -------
    str
        MDAnalysis selection string

    Raises
    ------
    ValueError
        If aa_class is not a valid classification

    Examples
    --------
    >>> get_selection_for_class("aromatic")
    'protein and resname PHE TRP TYR HIS'
    """
    try:
        return DEFAULT_AA_CLASS_SELECTIONS[aa_class]
    except KeyError:
        valid = list(DEFAULT_AA_CLASS_SELECTIONS)
        raise ValueError(f"Unknown AA class '{aa_class}'. Valid: {valid}") from None