"""Binding preference result models and data containers."""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Any, Literal
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
[docs]
class PolymerComposition(BaseModel):
"""Polymer composition data extracted from trajectory.
Contains counts of residues and heavy atoms for each polymer type,
enabling normalization of enrichment ratios by polymer availability.
This data is used for dual normalization of binding preference:
- **Residue-based**: Normalizes by number of polymer residues per type.
Matches the experimental viewpoint where concentrations are specified
in terms of monomer units.
- **Atom-based**: Normalizes by number of heavy atoms (non-hydrogen) per type.
Accounts for differences in monomer size, since larger monomers have
more surface area and thus more opportunity for contacts.
Attributes
----------
residue_counts : dict[str, int]
Number of residues per polymer type (e.g., {"SBM": 50, "EGM": 50})
heavy_atom_counts : dict[str, int]
Number of heavy atoms (non-hydrogen) per polymer type.
Heavy atoms are defined as all atoms with element != 'H'.
Examples
--------
>>> composition = PolymerComposition(
... residue_counts={"SBM": 50, "EGM": 50},
... heavy_atom_counts={"SBM": 750, "EGM": 400},
... )
>>> composition.total_residues
100
>>> composition.residue_fraction("SBM")
0.5
>>> composition.heavy_atom_fraction("SBM")
0.652 # SBM has larger monomers
"""
residue_counts: dict[str, int] = Field(
default_factory=dict,
description="Number of residues per polymer type",
)
heavy_atom_counts: dict[str, int] = Field(
default_factory=dict,
description="Number of heavy atoms (non-H) per polymer type",
)
@property
def total_residues(self) -> int:
"""Total polymer residues across all types."""
return sum(self.residue_counts.values())
@property
def total_heavy_atoms(self) -> int:
"""Total heavy atoms across all polymer types."""
return sum(self.heavy_atom_counts.values())
[docs]
def residue_fraction(self, polymer_type: str) -> float:
"""Fraction of residues that are this polymer type.
Parameters
----------
polymer_type : str
Polymer type name (e.g., "SBM")
Returns
-------
float
Fraction in range [0, 1], or 0.0 if type not found
"""
total = self.total_residues
if total == 0:
return 0.0
return self.residue_counts.get(polymer_type, 0) / total
[docs]
def heavy_atom_fraction(self, polymer_type: str) -> float:
"""Fraction of heavy atoms that are this polymer type.
Parameters
----------
polymer_type : str
Polymer type name (e.g., "SBM")
Returns
-------
float
Fraction in range [0, 1], or 0.0 if type not found
"""
total = self.total_heavy_atoms
if total == 0:
return 0.0
return self.heavy_atom_counts.get(polymer_type, 0) / total
[docs]
def polymer_types(self) -> list[str]:
"""Get sorted list of polymer types in this composition."""
return sorted(set(self.residue_counts.keys()) | set(self.heavy_atom_counts.keys()))
[docs]
class BindingPreferenceEntry(BaseModel):
"""Single entry in the binding preference matrix.
Represents the binding preference metrics for one
(polymer_type, protein_group) combination.
Enrichment Interpretation (centered at zero)
--------------------------------------------
The enrichment ratio measures whether a polymer type contacts a protein
group more or less than expected based on **surface availability**.
- enrichment > 0: Preferential binding (more contacts than expected)
- +0.5 means "50% more contacts than expected"
- +1.0 means "2× as many contacts as expected"
- enrichment = 0: Neutral (contact frequency matches surface availability)
- enrichment < 0: Avoidance (fewer contacts than expected)
- -0.3 means "30% fewer contacts than expected"
- enrichment = -1: Complete avoidance (no contacts at all)
The expected share is based on protein surface availability:
expected_share = n_exposed_in_group / total_exposed_residues
This normalization answers: "Given how much of the protein surface is
aromatic/charged/etc., does this polymer type contact that surface
proportionally, more than proportionally, or less?"
Attributes
----------
polymer_type : str
Polymer residue type (e.g., "SBM", "EGM")
protein_group : str
Protein group label (e.g., "aromatic", "charged_positive")
total_contact_frames : int
Sum of contact frames across all exposed residues in this group.
mean_contact_fraction : float
Average per-residue contact fraction within this group.
n_residues_in_group : int
Total residues in this protein group (exposed + buried)
n_exposed_in_group : int
Surface-exposed residues in this group (used for enrichment)
n_residues_contacted : int
Number of exposed residues that had at least one contact
contact_share : float
Fraction of this polymer's total contacts that went to this group.
expected_share : float
Expected contact share based on surface availability
(n_exposed_in_group / total_exposed_residues)
enrichment : float | None
Zero-centered enrichment: (contact_share / expected_share) - 1
polymer_residue_count : int
Number of residues of this polymer type (metadata)
total_polymer_residues : int
Total polymer residues across all types (metadata)
polymer_heavy_atom_count : int
Number of heavy atoms for this polymer type (metadata)
total_polymer_heavy_atoms : int
Total polymer heavy atoms across all types (metadata)
"""
polymer_type: str
protein_group: str
total_contact_frames: int = Field(
description="Sum of contact frames for all exposed residues in group"
)
mean_contact_fraction: float = Field(
description="Average per-residue contact fraction in group"
)
n_residues_in_group: int = Field(description="Total residues in this protein group")
n_exposed_in_group: int = Field(description="Surface-exposed residues in group")
n_residues_contacted: int = Field(
default=0, description="Exposed residues with at least one contact"
)
contact_share: float = Field(
default=0.0, description="Fraction of polymer's contacts to this group"
)
expected_share: float = Field(
default=0.0,
description="Expected share based on protein surface availability",
)
enrichment: float | None = Field(
default=None,
description="Zero-centered enrichment: (contact_share / expected_share) - 1",
)
# Polymer composition metadata (for secondary analysis)
polymer_residue_count: int = Field(
default=0,
description="Number of residues of this polymer type (metadata)",
)
total_polymer_residues: int = Field(
default=0,
description="Total polymer residues across all types (metadata)",
)
polymer_heavy_atom_count: int = Field(
default=0,
description="Number of heavy atoms (non-H) for this polymer type (metadata)",
)
total_polymer_heavy_atoms: int = Field(
default=0,
description="Total heavy atoms across all polymer types (metadata)",
)
[docs]
class BindingPreferenceResult(BaseModel):
"""Complete binding preference analysis result.
Provides enrichment-normalized metrics for polymer-protein
binding preferences, answering questions like:
- "Does SBMA preferentially bind aromatic residues?"
- "How does EGMA's preference for charged residues compare to SBMA?"
- "Which amino acid class does this polymer type prefer?"
Enrichment values are centered at zero:
- enrichment > 0: Preferential binding
- enrichment = 0: Neutral (random chance)
- enrichment < 0: Avoidance
Attributes
----------
entries : list[BindingPreferenceEntry]
All (polymer_type × protein_group) combinations
n_frames : int
Total frames analyzed
total_exposed_residues : int
Number of surface-exposed residues considered
surface_exposure_threshold : float
SASA threshold used for surface filtering
protein_groups_used : dict[str, str]
Mapping of group name to MDAnalysis selection string
polymer_types_used : dict[str, str]
Mapping of polymer type name to MDAnalysis selection string
polymer_composition : PolymerComposition
Polymer composition data (residue/atom counts per type, metadata only)
system_coverage : SystemCoverageResult, optional
System-level coverage metrics collapsed across polymer types.
Answers: "What does this polymer mixture collectively cover?"
schema_version : int
Version for forward compatibility. Version 4 adds system_coverage.
"""
entries: list[BindingPreferenceEntry] = Field(
default_factory=list,
description="DEPRECATED: Overlapping-groups entries. Use binding_preference instead.",
)
n_frames: int = 0
total_exposed_residues: int = 0
surface_exposure_threshold: float | None = None
protein_groups_used: dict[str, str] = Field(default_factory=dict)
polymer_types_used: dict[str, str] = Field(default_factory=dict)
polymer_composition: PolymerComposition | None = Field(
default=None,
description="Polymer composition data (residue/atom counts per type, metadata only)",
)
system_coverage: "SystemCoverageResult | None" = Field(
default=None,
description="System-level coverage metrics collapsed across polymer types",
)
binding_preference: "PolymerBindingPreferenceResult | None" = Field(
default=None,
description=(
"Partition-based per-polymer binding preference. "
"contact_share sums to 1.0 within each partition for each polymer. "
"This is the primary binding preference output (v5+)."
),
)
metadata: dict[str, Any] = Field(default_factory=dict)
schema_version: int = 5 # Version 5: adds partition-based binding_preference
[docs]
def to_dataframe(self) -> "pd.DataFrame":
"""Convert to pandas DataFrame for analysis/plotting.
Returns
-------
pd.DataFrame
Columns: polymer_type, protein_group, total_contact_frames,
mean_contact_fraction, n_residues_in_group, n_exposed_in_group,
n_residues_contacted, contact_share, expected_share, enrichment
"""
import pandas as pd
return pd.DataFrame([e.model_dump() for e in self.entries])
[docs]
def enrichment_matrix(self) -> dict[str, dict[str, float]]:
"""Get enrichment as nested dict: {polymer_type: {protein_group: value}}.
Enrichment values are centered at zero and normalized by protein
surface availability:
- > 0: Preferential binding (more contacts than expected)
- = 0: Neutral (matches surface availability)
- < 0: Avoidance (fewer contacts than expected)
Returns
-------
dict[str, dict[str, float]]
Nested mapping of enrichment values.
Missing/invalid values are returned as 0.0.
Examples
--------
>>> matrix = result.enrichment_matrix()
>>> print(matrix["SBM"]["aromatic"])
0.45 # 45% more contacts than expected based on surface availability
"""
result: dict[str, dict[str, float]] = {}
for entry in self.entries:
if entry.polymer_type not in result:
result[entry.polymer_type] = {}
value = entry.enrichment
result[entry.polymer_type][entry.protein_group] = value if value is not None else 0.0
return result
[docs]
def get_enrichment(self, polymer_type: str, protein_group: str) -> float | None:
"""Get enrichment for a specific (polymer_type, protein_group) pair.
Parameters
----------
polymer_type : str
Polymer type name
protein_group : str
Protein group name
Returns
-------
float or None
Enrichment value (centered at zero), or None if pair not found.
Enrichment is based on protein surface availability:
(contact_share / expected_share) - 1
"""
for entry in self.entries:
if entry.polymer_type == polymer_type and entry.protein_group == protein_group:
return entry.enrichment
return None
[docs]
def get_entry(self, polymer_type: str, protein_group: str) -> BindingPreferenceEntry | None:
"""Get the full entry for a (polymer_type, protein_group) pair.
Parameters
----------
polymer_type : str
Polymer type name
protein_group : str
Protein group name
Returns
-------
BindingPreferenceEntry or None
Full entry, or None if not found
"""
for entry in self.entries:
if entry.polymer_type == polymer_type and entry.protein_group == protein_group:
return entry
return None
[docs]
def polymer_types(self) -> list[str]:
"""Get list of polymer types in this result."""
return sorted({e.polymer_type for e in self.entries})
[docs]
def protein_groups(self) -> list[str]:
"""Get list of protein groups in this result."""
return sorted({e.protein_group for e in self.entries})
[docs]
def save(self, path: str | Path) -> None:
"""Save to JSON file."""
Path(path).write_text(json.dumps(self.model_dump(), indent=2))
logger.info(f"Saved binding preference result to {path}")
[docs]
@classmethod
def load(cls, path: str | Path) -> "BindingPreferenceResult":
"""Load from JSON file.
Parameters
----------
path : str or Path
Path to JSON file
Returns
-------
BindingPreferenceResult
Loaded result
"""
data = json.loads(Path(path).read_text())
return cls.model_validate(data)
[docs]
class AggregatedBindingPreferenceEntry(BaseModel):
"""Aggregated binding preference for one (polymer_type, protein_group) pair.
Contains mean ± SEM across replicates for enrichment based on
protein surface availability.
Enrichment values are centered at zero:
- > 0: Preferential binding (more contacts than expected by surface area)
- = 0: Neutral (contact frequency matches surface availability)
- < 0: Avoidance (fewer contacts than expected by surface area)
The expected share is based on protein surface availability:
expected_share = n_exposed_in_group / total_exposed_residues
This normalization answers: "Given how much of the protein surface is
aromatic/charged/etc., does this polymer type contact that surface
proportionally, more than proportionally, or less?"
"""
polymer_type: str
protein_group: str
# Enrichment (surface-availability normalized)
mean_enrichment: float | None = Field(
default=None,
description="Mean enrichment across replicates (surface-normalized)",
)
sem_enrichment: float | None = Field(
default=None,
description="Standard error of enrichment",
)
per_replicate_enrichments: list[float] = Field(
default_factory=list,
description="Enrichment values from each replicate",
)
# Contact metrics
mean_contact_fraction: float = Field(
default=0.0,
description="Mean per-residue contact fraction",
)
sem_contact_fraction: float = Field(
default=0.0,
description="Standard error of contact fraction",
)
mean_contact_share: float = Field(
default=0.0,
description="Mean contact share",
)
# Expected share (from protein surface availability)
expected_share: float = Field(
default=0.0,
description="Expected contact share based on protein surface availability",
)
# Group metadata
n_exposed_in_group: int = Field(
default=0,
description="Surface-exposed residues in group",
)
n_residues_in_group: int = Field(
default=0,
description="Total residues in group",
)
n_replicates: int = Field(
default=0,
description="Number of replicates with valid data",
)
[docs]
class AggregatedBindingPreferenceResult(BaseModel):
"""Binding preference aggregated across replicates.
Contains mean ± SEM for all metrics across multiple replicates.
Enrichment is normalized by protein surface availability.
Enrichment values are centered at zero:
- > 0: Preferential binding (more contacts than expected by surface area)
- = 0: Neutral (contact frequency matches surface availability)
- < 0: Avoidance (fewer contacts than expected by surface area)
"""
entries: list[AggregatedBindingPreferenceEntry] = Field(default_factory=list)
n_replicates: int = 0
total_exposed_residues: int = 0
surface_exposure_threshold: float | None = None
protein_groups_used: dict[str, str] = Field(default_factory=dict)
polymer_types_used: dict[str, str] = Field(default_factory=dict)
polymer_composition: PolymerComposition | None = Field(
default=None,
description="Polymer composition data (residue/atom counts per type, metadata only)",
)
system_coverage: "AggregatedSystemCoverageResult | None" = Field(
default=None,
description="Aggregated system-level coverage metrics",
)
binding_preference: "AggregatedPolymerBindingPreferenceResult | None" = Field(
default=None,
description=(
"Aggregated partition-based per-polymer binding preference. "
"contact_share sums to 1.0 within each partition for each polymer. "
"This is the primary binding preference output (v5+)."
),
)
schema_version: int = 5 # Version 5: adds partition-based binding_preference
[docs]
def to_dataframe(self) -> "pd.DataFrame":
"""Convert to pandas DataFrame."""
import pandas as pd
return pd.DataFrame([e.model_dump() for e in self.entries])
[docs]
def enrichment_matrix(self) -> dict[str, dict[str, float]]:
"""Get mean enrichment as nested dict.
Enrichment values are centered at zero and normalized by protein
surface availability:
- > 0: Preferential binding (more contacts than expected)
- = 0: Neutral (matches surface availability)
- < 0: Avoidance (fewer contacts than expected)
Returns
-------
dict[str, dict[str, float]]
Nested mapping: {polymer_type: {protein_group: mean_enrichment}}.
Missing/invalid values are returned as 0.0.
"""
result: dict[str, dict[str, float]] = {}
for entry in self.entries:
if entry.polymer_type not in result:
result[entry.polymer_type] = {}
value = entry.mean_enrichment
result[entry.polymer_type][entry.protein_group] = value if value is not None else 0.0
return result
[docs]
def get_entry(
self, polymer_type: str, protein_group: str
) -> AggregatedBindingPreferenceEntry | None:
"""Get entry for a (polymer_type, protein_group) pair."""
for entry in self.entries:
if entry.polymer_type == polymer_type and entry.protein_group == protein_group:
return entry
return None
[docs]
def polymer_types(self) -> list[str]:
"""Get list of polymer types."""
return sorted({e.polymer_type for e in self.entries})
[docs]
def protein_groups(self) -> list[str]:
"""Get list of protein groups."""
return sorted({e.protein_group for e in self.entries})
[docs]
def save(self, path: str | Path) -> None:
"""Save to JSON file."""
Path(path).write_text(json.dumps(self.model_dump(), indent=2))
logger.info(f"Saved aggregated binding preference to {path}")
[docs]
@classmethod
def load(cls, path: str | Path) -> "AggregatedBindingPreferenceResult":
"""Load from JSON file."""
data = json.loads(Path(path).read_text())
return cls.model_validate(data)
[docs]
class SystemCoverageEntry(BaseModel):
"""System-level coverage entry for one protein group.
While BindingPreferenceEntry answers "What does SBMA prefer?", this entry
answers "What fraction of ALL polymer contacts in this system go to this
protein group?" — collapsing across polymer types for condition-level analysis.
Use Case
--------
Compare copolymer compositions (conditions) with each other. For example:
"Does a 70:30 SBMA:EGMA mixture cover aromatic residues differently than
a 30:70 mixture?"
Coverage Enrichment Calculation
-------------------------------
For each protein group:
coverage_share = Σ(all polymer contacts to group) / Σ(all polymer contacts)
expected_share = n_exposed_in_group / total_exposed_residues
coverage_enrichment = (coverage_share / expected_share) - 1
Interpretation (centered at zero):
- coverage_enrichment > 0: Preferential coverage (more than surface predicts)
- +0.5 means "50% more coverage than expected"
- coverage_enrichment = 0: Neutral (coverage matches surface availability)
- coverage_enrichment < 0: Under-coverage (less than surface predicts)
- -0.3 means "30% less coverage than expected"
- coverage_enrichment = -1: Complete avoidance (no coverage at all)
Attributes
----------
protein_group : str
Protein group label (e.g., "aromatic", "charged_positive")
total_contact_frames : int
Sum of contact frames from ALL polymer types to this group.
coverage_share : float
Fraction of all polymer contacts that went to this group.
expected_share : float
Expected coverage based on protein surface availability
(n_exposed_in_group / total_exposed_residues)
coverage_enrichment : float | None
Zero-centered enrichment: (coverage_share / expected_share) - 1
n_exposed_in_group : int
Surface-exposed residues in this group
n_residues_in_group : int
Total residues in this group (exposed + buried)
polymer_contributions : dict[str, float]
Breakdown of coverage by polymer type: {"SBMA": 0.35, "EGMA": 0.65}
Values sum to 1.0 (fraction of contacts to this group from each polymer)
"""
protein_group: str
total_contact_frames: int = Field(
default=0,
description="Sum of contact frames from ALL polymer types to this group",
)
coverage_share: float = Field(
default=0.0,
description="Fraction of all polymer contacts that went to this group",
)
expected_share: float = Field(
default=0.0,
description="Expected coverage based on protein surface availability",
)
coverage_enrichment: float | None = Field(
default=None,
description="Zero-centered enrichment: (coverage_share / expected_share) - 1",
)
n_exposed_in_group: int = Field(
default=0,
description="Surface-exposed residues in this group",
)
n_residues_in_group: int = Field(
default=0,
description="Total residues in this group",
)
polymer_contributions: dict[str, float] = Field(
default_factory=dict,
description="Fraction of contacts to this group from each polymer type (sums to 1.0)",
)
[docs]
class PartitionCoverageEntry(BaseModel):
"""Coverage metrics for one element in a partition.
A partition element is a mutually exclusive subset of protein residues.
Within a partition, all elements together cover the entire protein surface
exactly once (no residue is counted in multiple elements).
This ensures that:
- coverage_share sums to 1.0 across all elements in the partition
- expected_share sums to 1.0 across all elements in the partition
- enrichment is mathematically valid (no inflated denominators)
Attributes
----------
partition_element : str
Name of this partition element (e.g., "aromatic", "lid_helix_5", "rest_of_protein")
total_contact_frames : int
Sum of contact frames from ALL polymer types to residues in this element
coverage_share : float
Fraction of all polymer contacts that went to this element.
Sums to 1.0 across all elements in the partition.
expected_share : float
Expected coverage based on surface availability (n_exposed / total_exposed).
Sums to 1.0 across all elements in the partition.
coverage_enrichment : float | None
Zero-centered enrichment: (coverage_share / expected_share) - 1
n_exposed_in_element : int
Number of surface-exposed residues in this element
n_residues_in_element : int
Total residues in this element (exposed + buried)
polymer_contributions : dict[str, float]
Breakdown of coverage by polymer type (sums to 1.0 for this element)
"""
partition_element: str
total_contact_frames: int = Field(
default=0,
description="Sum of contact frames from ALL polymer types to this element",
)
coverage_share: float = Field(
default=0.0,
description="Fraction of all polymer contacts that went to this element",
)
expected_share: float = Field(
default=0.0,
description="Expected coverage based on protein surface availability",
)
coverage_enrichment: float | None = Field(
default=None,
description="Zero-centered enrichment: (coverage_share / expected_share) - 1",
)
n_exposed_in_element: int = Field(
default=0,
description="Surface-exposed residues in this element",
)
n_residues_in_element: int = Field(
default=0,
description="Total residues in this element",
)
polymer_contributions: dict[str, float] = Field(
default_factory=dict,
description="Fraction of contacts to this element from each polymer type",
)
[docs]
class PartitionCoverageResult(BaseModel):
"""Coverage analysis for a complete partition of the protein surface.
A partition divides the protein surface into mutually exclusive regions
that together cover the entire surface. This ensures valid enrichment
calculations where both coverage_share and expected_share sum to 1.0.
Partition Types
---------------
- **aa_class**: 5-way partition by amino acid class
(aromatic, polar, nonpolar, charged_positive, charged_negative)
- **binary_custom**: 2-way partition for a custom group vs rest_of_protein
(e.g., "lid_helix_5" vs "rest_of_protein")
- **combined_custom**: N+1 way partition with all non-overlapping custom groups
plus "rest_of_protein"
Attributes
----------
partition_name : str
Descriptive name (e.g., "aa_class", "lid_helix_5_vs_rest")
partition_type : str
One of: "aa_class", "binary_custom", "combined_custom"
entries : list[PartitionCoverageEntry]
Coverage metrics for each element in the partition
total_coverage_share : float
Validation check: should be ~1.0
total_expected_share : float
Validation check: should be ~1.0
"""
partition_name: str
partition_type: Literal["aa_class", "binary_custom", "combined_custom", "user_defined"]
entries: list[PartitionCoverageEntry] = Field(default_factory=list)
total_coverage_share: float = Field(
default=1.0,
description="Sum of coverage_share across elements (validation: should be ~1.0)",
)
total_expected_share: float = Field(
default=1.0,
description="Sum of expected_share across elements (validation: should be ~1.0)",
)
[docs]
def to_dataframe(self) -> "pd.DataFrame":
"""Convert to pandas DataFrame for analysis/plotting."""
import pandas as pd
return pd.DataFrame([e.model_dump() for e in self.entries])
[docs]
def coverage_enrichment_dict(self) -> dict[str, float]:
"""Get coverage enrichment as dict: {element: enrichment}."""
return {
e.partition_element: (
e.coverage_enrichment if e.coverage_enrichment is not None else 0.0
)
for e in self.entries
}
[docs]
def coverage_share_dict(self) -> dict[str, float]:
"""Get coverage shares as dict: {element: share}."""
return {e.partition_element: e.coverage_share for e in self.entries}
[docs]
def expected_share_dict(self) -> dict[str, float]:
"""Get expected shares as dict: {element: share}."""
return {e.partition_element: e.expected_share for e in self.entries}
[docs]
def get_entry(self, element_name: str) -> PartitionCoverageEntry | None:
"""Get the entry for a specific partition element."""
for entry in self.entries:
if entry.partition_element == element_name:
return entry
return None
[docs]
def element_names(self) -> list[str]:
"""Get list of partition element names."""
return [e.partition_element for e in self.entries]
[docs]
class SystemCoverageResult(BaseModel):
"""System-level coverage analysis with proper partition structure.
This result uses partitions to ensure mathematically valid enrichment
calculations. A partition divides the protein surface into mutually
exclusive regions, avoiding the overlap bug where custom groups and
AA class groups can inflate the expected_share denominator.
Partition Strategy
------------------
1. **AA Class Partition** (always computed):
5-way partition by amino acid class. Every surface residue belongs
to exactly one class.
2. **Binary Custom Partitions** (per custom group):
Each custom group is compared to "rest_of_protein". This answers:
"Does my lid_helix_5 have enriched polymer contacts vs non-lid regions?"
3. **Combined Custom Partition** (optional):
If custom groups don't overlap, all custom groups + rest_of_protein
form a single partition. If groups overlap, this is not computed
and an error is raised if explicitly requested.
4. **User-Defined Partitions** (from protein_partitions config):
Custom partitions specified by the user in the YAML config. Each
partition references groups from protein_groups and must be mutually
exclusive. 'rest_of_protein' is auto-added if the groups don't cover
all exposed protein residues. One plot per partition is generated.
Attributes
----------
aa_class_coverage : PartitionCoverageResult
5-way partition by amino acid class. Always computed.
custom_group_coverages : dict[str, PartitionCoverageResult]
Binary partitions for each custom group vs rest_of_protein.
Keys are custom group names.
combined_custom_coverage : PartitionCoverageResult | None
All custom groups + rest_of_protein as a single partition.
Only computed if custom groups don't overlap.
user_defined_partitions : dict[str, PartitionCoverageResult]
User-defined partitions from protein_partitions config.
Keys are partition names, values are the computed coverage partitions.
'rest_of_protein' is auto-added if groups don't fully cover the protein.
n_frames : int
Total frames analyzed
total_contact_frames : int
Sum of all polymer contacts across all groups
total_exposed_residues : int
Number of surface-exposed protein residues
surface_exposure_threshold : float | None
SASA threshold used for surface filtering
custom_group_selections : dict[str, str]
Custom group name to MDAnalysis selection (for metadata)
polymer_types_included : list[str]
Polymer types that contributed to coverage
has_overlapping_custom_groups : bool
True if custom groups share residues (combined partition not computed)
overlapping_group_pairs : list[tuple[str, str]]
Pairs of custom groups that overlap (for diagnostics)
schema_version : int
Schema version (2 = partition-based)
"""
aa_class_coverage: PartitionCoverageResult
custom_group_coverages: dict[str, PartitionCoverageResult] = Field(default_factory=dict)
combined_custom_coverage: PartitionCoverageResult | None = None
user_defined_partitions: dict[str, PartitionCoverageResult] = Field(
default_factory=dict,
description=(
"User-defined partitions from protein_partitions config. "
"Each partition contains mutually exclusive groups defined by the user, "
"with 'rest_of_protein' auto-added if groups don't cover all protein residues."
),
)
# Metadata
n_frames: int = 0
total_contact_frames: int = Field(
default=0,
description="Sum of all polymer contacts across all groups",
)
total_exposed_residues: int = 0
surface_exposure_threshold: float | None = None
custom_group_selections: dict[str, str] = Field(default_factory=dict)
polymer_types_included: list[str] = Field(default_factory=list)
has_overlapping_custom_groups: bool = False
overlapping_group_pairs: list[tuple[str, str]] = Field(default_factory=list)
schema_version: int = 2
[docs]
def get_aa_class_enrichment(self, aa_class: str) -> float | None:
"""Get coverage enrichment for an AA class.
Parameters
----------
aa_class : str
One of: aromatic, polar, nonpolar, charged_positive, charged_negative
Returns
-------
float | None
Coverage enrichment, or None if not found
"""
entry = self.aa_class_coverage.get_entry(aa_class)
return entry.coverage_enrichment if entry else None
[docs]
def get_custom_group_enrichment(self, group_name: str) -> float | None:
"""Get coverage enrichment for a custom group (vs rest_of_protein).
Parameters
----------
group_name : str
Custom group name (e.g., "lid_helix_5")
Returns
-------
float | None
Coverage enrichment for the custom group, or None if not found
"""
if group_name not in self.custom_group_coverages:
return None
partition = self.custom_group_coverages[group_name]
entry = partition.get_entry(group_name)
return entry.coverage_enrichment if entry else None
[docs]
def aa_class_enrichment_dict(self) -> dict[str, float]:
"""Get AA class enrichments as dict: {aa_class: enrichment}."""
return self.aa_class_coverage.coverage_enrichment_dict()
[docs]
def custom_group_enrichment_dict(self) -> dict[str, float]:
"""Get custom group enrichments as dict: {group_name: enrichment}.
Each custom group's enrichment is relative to rest_of_protein.
"""
result = {}
for group_name, partition in self.custom_group_coverages.items():
entry = partition.get_entry(group_name)
if entry and entry.coverage_enrichment is not None:
result[group_name] = entry.coverage_enrichment
else:
result[group_name] = 0.0
return result
[docs]
def aa_class_names(self) -> list[str]:
"""Get list of AA class names in canonical order."""
canonical_order = ["aromatic", "polar", "nonpolar", "charged_positive", "charged_negative"]
names = self.aa_class_coverage.element_names()
return [n for n in canonical_order if n in names]
[docs]
def custom_group_names(self) -> list[str]:
"""Get list of custom group names."""
return sorted(self.custom_group_coverages.keys())
[docs]
def user_partition_names(self) -> list[str]:
"""Get list of user-defined partition names."""
return sorted(self.user_defined_partitions.keys())
[docs]
def get_user_partition(self, partition_name: str) -> PartitionCoverageResult | None:
"""Get a user-defined partition by name.
Parameters
----------
partition_name : str
Name of the partition (e.g., "lid_helices")
Returns
-------
PartitionCoverageResult | None
The partition result, or None if not found
"""
return self.user_defined_partitions.get(partition_name)
[docs]
def save(self, path: str | Path) -> None:
"""Save to JSON file.
Parameters
----------
path : str or Path
Output path for JSON file
"""
Path(path).write_text(json.dumps(self.model_dump(), indent=2))
logger.info(f"Saved system coverage result to {path}")
[docs]
@classmethod
def load(cls, path: str | Path) -> "SystemCoverageResult":
"""Load from JSON file.
Parameters
----------
path : str or Path
Path to JSON file
Returns
-------
SystemCoverageResult
Loaded result
"""
data = json.loads(Path(path).read_text())
return cls.model_validate(data)
[docs]
class PartitionBindingEntry(BaseModel):
"""Binding metrics for one partition element for a specific polymer type.
A partition element is a mutually exclusive subset of protein residues.
Within a partition, all elements together cover the entire protein surface
exactly once (no residue is counted in multiple elements).
This entry is for a SINGLE polymer type, answering:
"What fraction of SBMA's contacts go to aromatic residues?"
This ensures that:
- contact_share sums to 1.0 across all elements in the partition (for this polymer)
- expected_share sums to 1.0 across all elements in the partition
- enrichment is mathematically valid (no inflated denominators)
Attributes
----------
partition_element : str
Name of this partition element (e.g., "aromatic", "lid_helix_5", "rest_of_protein")
polymer_type : str
Polymer type this entry is for (e.g., "SBM", "EGM")
total_contact_frames : int
Sum of contact frames from THIS polymer type to residues in this element
contact_share : float
Fraction of this polymer's contacts that went to this element.
Sums to 1.0 across all elements in the partition (for this polymer).
expected_share : float
Expected share based on surface availability (n_exposed / total_exposed).
Sums to 1.0 across all elements in the partition.
enrichment : float | None
Zero-centered enrichment: (contact_share / expected_share) - 1
n_exposed_in_element : int
Number of surface-exposed residues in this element
n_residues_in_element : int
Total residues in this element (exposed + buried)
n_residues_contacted : int
Number of exposed residues that had at least one contact from this polymer
"""
partition_element: str
polymer_type: str
total_contact_frames: int = Field(
default=0,
description="Sum of contact frames from THIS polymer type to this element",
)
contact_share: float = Field(
default=0.0,
description="Fraction of this polymer's contacts that went to this element",
)
expected_share: float = Field(
default=0.0,
description="Expected share based on protein surface availability",
)
enrichment: float | None = Field(
default=None,
description="Zero-centered enrichment: (contact_share / expected_share) - 1",
)
n_exposed_in_element: int = Field(
default=0,
description="Surface-exposed residues in this element",
)
n_residues_in_element: int = Field(
default=0,
description="Total residues in this element",
)
n_residues_contacted: int = Field(
default=0,
description="Exposed residues contacted by this polymer type",
)
[docs]
class PartitionBindingResult(BaseModel):
"""Binding preference for a complete partition for ONE polymer type.
A partition divides the protein surface into mutually exclusive regions
that together cover the entire surface. This class stores the binding
preference of a single polymer type across all partition elements.
This ensures valid enrichment calculations where both contact_share
and expected_share sum to 1.0.
Partition Types
---------------
- **aa_class**: 5-way partition by amino acid class
(aromatic, polar, nonpolar, charged_positive, charged_negative)
- **user_defined**: N+1 way partition with user-specified groups
plus "rest_of_protein" (auto-added if groups don't cover all residues)
Attributes
----------
partition_name : str
Descriptive name (e.g., "aa_class", "lid_helices")
partition_type : str
One of: "aa_class", "user_defined"
polymer_type : str
Polymer type this result is for (e.g., "SBM", "EGM")
entries : list[PartitionBindingEntry]
Binding metrics for each element in the partition
total_contact_share : float
Validation check: should be ~1.0
total_expected_share : float
Validation check: should be ~1.0
total_contact_frames : int
Total contact frames from this polymer type (across all elements)
"""
partition_name: str
partition_type: Literal["aa_class", "user_defined"]
polymer_type: str
entries: list[PartitionBindingEntry] = Field(default_factory=list)
total_contact_share: float = Field(
default=1.0,
description="Sum of contact_share across elements (validation: should be ~1.0)",
)
total_expected_share: float = Field(
default=1.0,
description="Sum of expected_share across elements (validation: should be ~1.0)",
)
total_contact_frames: int = Field(
default=0,
description="Total contact frames from this polymer type",
)
[docs]
def to_dataframe(self) -> "pd.DataFrame":
"""Convert to pandas DataFrame for analysis/plotting."""
import pandas as pd
return pd.DataFrame([e.model_dump() for e in self.entries])
[docs]
def enrichment_dict(self) -> dict[str, float]:
"""Get enrichment as dict: {element: enrichment}."""
return {
e.partition_element: (e.enrichment if e.enrichment is not None else 0.0)
for e in self.entries
}
[docs]
def expected_share_dict(self) -> dict[str, float]:
"""Get expected shares as dict: {element: share}."""
return {e.partition_element: e.expected_share for e in self.entries}
[docs]
def get_entry(self, element_name: str) -> PartitionBindingEntry | None:
"""Get the entry for a specific partition element."""
for entry in self.entries:
if entry.partition_element == element_name:
return entry
return None
[docs]
def element_names(self) -> list[str]:
"""Get list of partition element names."""
return [e.partition_element for e in self.entries]
[docs]
class PolymerBindingPreferenceResult(BaseModel):
"""Per-polymer binding preference using proper partition structure.
This result stores binding preference for ALL polymer types, with each
polymer having its own partition-based enrichment calculations.
Unlike SystemCoverageResult (which collapses all polymer contacts), this
maintains per-polymer data to answer: "Does SBMA prefer aromatic residues
more than EGMA does?"
Partition Strategy (per polymer type)
-------------------------------------
1. **AA Class Partition** (always computed):
5-way partition by amino acid class. Every surface residue belongs
to exactly one class. Each polymer type gets its own enrichment values.
2. **User-Defined Partitions** (from protein_partitions config):
Custom partitions specified by the user. Each partition references groups
from protein_groups. 'rest_of_protein' is auto-added if groups don't
cover all exposed protein residues. Each polymer type gets its own
enrichment values per partition.
Attributes
----------
aa_class_binding : dict[str, PartitionBindingResult]
AA class partition binding for each polymer type.
Keys are polymer type names (e.g., "SBM", "EGM").
user_defined_partitions : dict[str, dict[str, PartitionBindingResult]]
User-defined partitions for each polymer type.
Outer keys are partition names, inner keys are polymer types.
Example: {"lid_helices": {"SBM": ..., "EGM": ...}}
n_frames : int
Total frames analyzed
total_exposed_residues : int
Number of surface-exposed protein residues
surface_exposure_threshold : float | None
SASA threshold used for surface filtering
polymer_types : list[str]
Polymer types included in this result
polymer_composition : PolymerComposition | None
Polymer composition metadata
schema_version : int
Schema version (5 = partition-based binding preference)
"""
aa_class_binding: dict[str, PartitionBindingResult] = Field(
default_factory=dict,
description="AA class partition binding for each polymer type",
)
user_defined_partitions: dict[str, dict[str, PartitionBindingResult]] = Field(
default_factory=dict,
description=(
"User-defined partitions for each polymer type. "
"Outer key: partition name, inner key: polymer type."
),
)
# Metadata
n_frames: int = 0
total_exposed_residues: int = 0
surface_exposure_threshold: float | None = None
polymer_types: list[str] = Field(default_factory=list)
polymer_composition: PolymerComposition | None = None
protein_groups_used: dict[str, str] = Field(default_factory=dict)
schema_version: int = 5 # Version 5: partition-based per-polymer binding
[docs]
def get_aa_class_enrichment(self, polymer_type: str, aa_class: str) -> float | None:
"""Get binding enrichment for an AA class for a specific polymer.
Parameters
----------
polymer_type : str
Polymer type (e.g., "SBM")
aa_class : str
One of: aromatic, polar, nonpolar, charged_positive, charged_negative
Returns
-------
float | None
Binding enrichment, or None if not found
"""
if polymer_type not in self.aa_class_binding:
return None
entry = self.aa_class_binding[polymer_type].get_entry(aa_class)
return entry.enrichment if entry else None
[docs]
def get_user_partition_enrichment(
self, partition_name: str, polymer_type: str, element_name: str
) -> float | None:
"""Get binding enrichment for a user partition element for a specific polymer.
Parameters
----------
partition_name : str
Name of the user-defined partition (e.g., "lid_helices")
polymer_type : str
Polymer type (e.g., "SBM")
element_name : str
Element within the partition (e.g., "lid_helix_5")
Returns
-------
float | None
Binding enrichment, or None if not found
"""
if partition_name not in self.user_defined_partitions:
return None
if polymer_type not in self.user_defined_partitions[partition_name]:
return None
entry = self.user_defined_partitions[partition_name][polymer_type].get_entry(element_name)
return entry.enrichment if entry else None
[docs]
def aa_class_enrichment_matrix(self) -> dict[str, dict[str, float]]:
"""Get AA class enrichments as nested dict: {polymer_type: {aa_class: enrichment}}.
Returns
-------
dict[str, dict[str, float]]
Nested mapping of enrichment values.
"""
result: dict[str, dict[str, float]] = {}
for poly_type, partition_result in self.aa_class_binding.items():
result[poly_type] = partition_result.enrichment_dict()
return result
[docs]
def user_partition_enrichment_matrix(self, partition_name: str) -> dict[str, dict[str, float]]:
"""Get user partition enrichments as nested dict.
Parameters
----------
partition_name : str
Name of the user-defined partition
Returns
-------
dict[str, dict[str, float]]
{polymer_type: {element_name: enrichment}}
"""
if partition_name not in self.user_defined_partitions:
return {}
result: dict[str, dict[str, float]] = {}
for poly_type, partition_result in self.user_defined_partitions[partition_name].items():
result[poly_type] = partition_result.enrichment_dict()
return result
[docs]
def aa_class_names(self) -> list[str]:
"""Get list of AA class names in canonical order."""
canonical_order = ["aromatic", "polar", "nonpolar", "charged_positive", "charged_negative"]
if not self.aa_class_binding:
return []
# Get from first polymer type
first_poly = next(iter(self.aa_class_binding.values()))
names = first_poly.element_names()
return [n for n in canonical_order if n in names]
[docs]
def user_partition_names(self) -> list[str]:
"""Get list of user-defined partition names."""
return sorted(self.user_defined_partitions.keys())
[docs]
def save(self, path: str | Path) -> None:
"""Save to JSON file."""
Path(path).write_text(json.dumps(self.model_dump(), indent=2))
logger.info(f"Saved polymer binding preference result to {path}")
[docs]
@classmethod
def load(cls, path: str | Path) -> "PolymerBindingPreferenceResult":
"""Load from JSON file."""
data = json.loads(Path(path).read_text())
return cls.model_validate(data)
[docs]
class AggregatedPartitionBindingEntry(BaseModel):
"""Aggregated binding metrics for one partition element for a specific polymer type.
Contains mean ± SEM across replicates for binding preference, enabling
statistical comparison of binding enrichment across conditions.
Attributes
----------
partition_element : str
Element name (e.g., "aromatic", "lid_helix_5", "rest_of_protein")
polymer_type : str
Polymer type this entry is for (e.g., "SBM", "EGM")
mean_contact_share : float
Mean contact share across replicates
sem_contact_share : float
Standard error of contact share
mean_enrichment : float | None
Mean enrichment across replicates
sem_enrichment : float | None
Standard error of enrichment
per_replicate_enrichments : list[float]
Enrichment values from each replicate
expected_share : float
Expected share based on surface availability
n_exposed_in_element : int
Surface-exposed residues in this element
n_residues_in_element : int
Total residues in this element
n_replicates : int
Number of replicates with valid data
"""
partition_element: str
polymer_type: str
mean_contact_share: float = Field(
default=0.0,
description="Mean contact share across replicates",
)
sem_contact_share: float = Field(
default=0.0,
description="Standard error of contact share",
)
mean_enrichment: float | None = Field(
default=None,
description="Mean enrichment across replicates",
)
sem_enrichment: float | None = Field(
default=None,
description="Standard error of enrichment",
)
per_replicate_enrichments: list[float] = Field(
default_factory=list,
description="Enrichment values from each replicate",
)
expected_share: float = Field(
default=0.0,
description="Expected share based on surface availability",
)
n_exposed_in_element: int = Field(
default=0,
description="Surface-exposed residues in this element",
)
n_residues_in_element: int = Field(
default=0,
description="Total residues in this element",
)
n_replicates: int = Field(
default=0,
description="Number of replicates with valid data",
)
[docs]
class AggregatedPartitionBindingResult(BaseModel):
"""Aggregated binding preference for a partition for ONE polymer type.
Contains aggregated statistics across replicates for all partition elements.
Attributes
----------
partition_name : str
Descriptive name (e.g., "aa_class", "lid_helices")
partition_type : str
One of: "aa_class", "user_defined"
polymer_type : str
Polymer type this result is for
entries : list[AggregatedPartitionBindingEntry]
Aggregated binding metrics for each element
mean_total_contact_share : float
Mean of total_contact_share across replicates (validation: should be ~1.0)
n_replicates : int
Number of replicates
"""
partition_name: str
partition_type: Literal["aa_class", "user_defined"]
polymer_type: str
entries: list[AggregatedPartitionBindingEntry] = Field(default_factory=list)
mean_total_contact_share: float = Field(
default=1.0,
description="Mean sum of contact_share across elements (should be ~1.0)",
)
n_replicates: int = Field(default=0)
[docs]
def enrichment_dict(self) -> dict[str, float]:
"""Get mean enrichment as dict: {element: mean_enrichment}."""
return {
e.partition_element: (e.mean_enrichment if e.mean_enrichment is not None else 0.0)
for e in self.entries
}
[docs]
def element_names(self) -> list[str]:
"""Get list of partition element names."""
return [e.partition_element for e in self.entries]
[docs]
class AggregatedPolymerBindingPreferenceResult(BaseModel):
"""Aggregated per-polymer binding preference across replicates.
Contains mean ± SEM for all partition-based binding metrics.
Attributes
----------
aa_class_binding : dict[str, AggregatedPartitionBindingResult]
Aggregated AA class partition binding for each polymer type.
user_defined_partitions : dict[str, dict[str, AggregatedPartitionBindingResult]]
Aggregated user-defined partitions for each polymer type.
n_replicates : int
Number of replicates
total_exposed_residues : int
Number of surface-exposed protein residues
surface_exposure_threshold : float | None
SASA threshold used
polymer_types : list[str]
Polymer types included
schema_version : int
Schema version
"""
aa_class_binding: dict[str, AggregatedPartitionBindingResult] = Field(
default_factory=dict,
description="Aggregated AA class partition binding for each polymer type",
)
user_defined_partitions: dict[str, dict[str, AggregatedPartitionBindingResult]] = Field(
default_factory=dict,
description="Aggregated user-defined partitions for each polymer type",
)
n_replicates: int = 0
total_exposed_residues: int = 0
surface_exposure_threshold: float | None = None
polymer_types: list[str] = Field(default_factory=list)
schema_version: int = 5
[docs]
def aa_class_enrichment_matrix(self) -> dict[str, dict[str, float]]:
"""Get AA class enrichments as nested dict: {polymer_type: {aa_class: mean_enrichment}}."""
result: dict[str, dict[str, float]] = {}
for poly_type, partition_result in self.aa_class_binding.items():
result[poly_type] = partition_result.enrichment_dict()
return result
[docs]
def aa_class_names(self) -> list[str]:
"""Get list of AA class names in canonical order."""
canonical_order = ["aromatic", "polar", "nonpolar", "charged_positive", "charged_negative"]
if not self.aa_class_binding:
return []
first_poly = next(iter(self.aa_class_binding.values()))
names = first_poly.element_names()
return [n for n in canonical_order if n in names]
[docs]
def user_partition_names(self) -> list[str]:
"""Get list of user-defined partition names."""
return sorted(self.user_defined_partitions.keys())
[docs]
class AggregatedPartitionCoverageEntry(BaseModel):
"""Aggregated coverage for one partition element across replicates.
Contains mean ± SEM for coverage metrics, enabling statistical comparison
of coverage enrichment across conditions.
Attributes
----------
partition_element : str
Element name (e.g., "aromatic", "lid_helix_5", "rest_of_protein")
mean_coverage_share : float
Mean coverage share across replicates
sem_coverage_share : float
Standard error of coverage share
mean_coverage_enrichment : float | None
Mean coverage enrichment across replicates
sem_coverage_enrichment : float | None
Standard error of coverage enrichment
per_replicate_enrichments : list[float]
Coverage enrichment values from each replicate
expected_share : float
Expected coverage based on surface availability
n_exposed_in_element : int
Surface-exposed residues in this element
n_residues_in_element : int
Total residues in this element
n_replicates : int
Number of replicates with valid data
mean_polymer_contributions : dict[str, float]
Mean polymer contributions across replicates
"""
partition_element: str
mean_coverage_share: float = Field(
default=0.0,
description="Mean coverage share across replicates",
)
sem_coverage_share: float = Field(
default=0.0,
description="Standard error of coverage share",
)
mean_coverage_enrichment: float | None = Field(
default=None,
description="Mean coverage enrichment across replicates",
)
sem_coverage_enrichment: float | None = Field(
default=None,
description="Standard error of coverage enrichment",
)
per_replicate_enrichments: list[float] = Field(
default_factory=list,
description="Coverage enrichment values from each replicate",
)
expected_share: float = Field(
default=0.0,
description="Expected coverage based on surface availability",
)
n_exposed_in_element: int = Field(
default=0,
description="Surface-exposed residues in this element",
)
n_residues_in_element: int = Field(
default=0,
description="Total residues in this element",
)
n_replicates: int = Field(
default=0,
description="Number of replicates with valid data",
)
mean_polymer_contributions: dict[str, float] = Field(
default_factory=dict,
description="Mean polymer contributions across replicates",
)
[docs]
class AggregatedPartitionCoverageResult(BaseModel):
"""Aggregated coverage for a partition across replicates.
Contains mean ± SEM for all elements in the partition.
Attributes
----------
partition_name : str
Name of the partition
partition_type : str
One of: "aa_class", "binary_custom", "combined_custom"
entries : list[AggregatedPartitionCoverageEntry]
Aggregated coverage for each element
n_replicates : int
Number of replicates aggregated
"""
partition_name: str
partition_type: Literal["aa_class", "binary_custom", "combined_custom", "user_defined"]
entries: list[AggregatedPartitionCoverageEntry] = Field(default_factory=list)
n_replicates: int = 0
[docs]
def to_dataframe(self) -> "pd.DataFrame":
"""Convert to pandas DataFrame."""
import pandas as pd
return pd.DataFrame([e.model_dump() for e in self.entries])
[docs]
def coverage_enrichment_dict(self) -> dict[str, float]:
"""Get mean coverage enrichment as dict: {element: enrichment}."""
return {
e.partition_element: (
e.mean_coverage_enrichment if e.mean_coverage_enrichment is not None else 0.0
)
for e in self.entries
}
[docs]
def get_entry(self, element_name: str) -> AggregatedPartitionCoverageEntry | None:
"""Get entry for a specific partition element."""
for entry in self.entries:
if entry.partition_element == element_name:
return entry
return None
[docs]
def element_names(self) -> list[str]:
"""Get list of partition element names."""
return [e.partition_element for e in self.entries]
[docs]
class AggregatedSystemCoverageResult(BaseModel):
"""System coverage aggregated across replicates (schema v2).
Contains aggregated partition coverages with mean ± SEM statistics
for statistical comparison between conditions.
Attributes
----------
aa_class_coverage : AggregatedPartitionCoverageResult
Aggregated 5-way AA class partition
custom_group_coverages : dict[str, AggregatedPartitionCoverageResult]
Aggregated binary partitions for each custom group
combined_custom_coverage : AggregatedPartitionCoverageResult | None
Aggregated combined custom partition (if applicable)
n_replicates : int
Number of replicates aggregated
total_exposed_residues : int
Number of surface-exposed protein residues
surface_exposure_threshold : float | None
SASA threshold used for surface filtering
custom_group_selections : dict[str, str]
Custom group name to MDAnalysis selection
polymer_types_included : list[str]
Polymer types that contributed to coverage
has_overlapping_custom_groups : bool
True if custom groups share residues
schema_version : int
Schema version (2 = partition-based)
"""
aa_class_coverage: AggregatedPartitionCoverageResult
custom_group_coverages: dict[str, AggregatedPartitionCoverageResult] = Field(
default_factory=dict
)
combined_custom_coverage: AggregatedPartitionCoverageResult | None = None
user_defined_partitions: dict[str, AggregatedPartitionCoverageResult] = Field(
default_factory=dict,
description=(
"Aggregated user-defined partitions from protein_partitions config. "
"Keys are partition names, values are aggregated coverage results."
),
)
# Metadata
n_replicates: int = 0
total_exposed_residues: int = 0
surface_exposure_threshold: float | None = None
custom_group_selections: dict[str, str] = Field(default_factory=dict)
polymer_types_included: list[str] = Field(default_factory=list)
has_overlapping_custom_groups: bool = False
schema_version: int = 2
[docs]
def get_aa_class_enrichment(self, aa_class: str) -> float | None:
"""Get mean coverage enrichment for an AA class."""
entry = self.aa_class_coverage.get_entry(aa_class)
return entry.mean_coverage_enrichment if entry else None
[docs]
def get_custom_group_enrichment(self, group_name: str) -> float | None:
"""Get mean coverage enrichment for a custom group (vs rest_of_protein)."""
if group_name not in self.custom_group_coverages:
return None
partition = self.custom_group_coverages[group_name]
entry = partition.get_entry(group_name)
return entry.mean_coverage_enrichment if entry else None
[docs]
def aa_class_enrichment_dict(self) -> dict[str, float]:
"""Get AA class mean enrichments as dict: {aa_class: enrichment}."""
return self.aa_class_coverage.coverage_enrichment_dict()
[docs]
def custom_group_enrichment_dict(self) -> dict[str, float]:
"""Get custom group mean enrichments as dict: {group_name: enrichment}."""
result = {}
for group_name, partition in self.custom_group_coverages.items():
entry = partition.get_entry(group_name)
if entry and entry.mean_coverage_enrichment is not None:
result[group_name] = entry.mean_coverage_enrichment
else:
result[group_name] = 0.0
return result
[docs]
def aa_class_names(self) -> list[str]:
"""Get list of AA class names in canonical order."""
canonical_order = ["aromatic", "polar", "nonpolar", "charged_positive", "charged_negative"]
names = self.aa_class_coverage.element_names()
return [n for n in canonical_order if n in names]
[docs]
def custom_group_names(self) -> list[str]:
"""Get list of custom group names."""
return sorted(self.custom_group_coverages.keys())
[docs]
def save(self, path: str | Path) -> None:
"""Save to JSON file."""
Path(path).write_text(json.dumps(self.model_dump(), indent=2))
logger.info(f"Saved aggregated system coverage to {path}")
[docs]
@classmethod
def load(cls, path: str | Path) -> "AggregatedSystemCoverageResult":
"""Load from JSON file."""
data = json.loads(Path(path).read_text())
return cls.model_validate(data)