"""Triangulator: compare claims across the 4 PEN-STACK packages.
Each discrepancy category corresponds to a rule in config/triangulation_rules_v3.yaml.
Rules are applied to every row in the unified editor universe; results are returned
as DiscrepancyRecord objects that can be serialised to a Parquet file.
"""
from collections.abc import Iterator
from dataclasses import asdict, dataclass
from pathlib import Path
import pandas as pd
import yaml
@dataclass
class DiscrepancyRecord:
    """A single disagreement found while triangulating one entity.

    Instances are plain value holders; every field is a string so a record
    can be converted to a flat table row with ``dataclasses.asdict``.
    """

    # Identifier of the entity the discrepancy was found on.
    entity_id: str
    # Origin of the entity: "natural" | "design".
    source: str
    # Discrepancy category; matches a key in triangulation_rules_v3.yaml.
    category: str
    # Severity level: high | medium | low.
    severity: str
    # Pipe-separated names of the sources that disagree.
    sources_involved: str
    # Human-readable explanation of the discrepancy.
    details: str
class Triangulator:
    """Cross-check per-editor claims from the PEN-STACK sources.

    Each ``_rule_*`` method inspects one row of the unified editor universe and
    yields a :class:`DiscrepancyRecord` for every contradiction it finds. The
    severity of a record is read from the ``discrepancy_categories`` mapping of
    the loaded rules file.
    """

    def __init__(
        self,
        rules_path: Path = Path("config/triangulation_rules_v3.yaml"),
    ) -> None:
        """Load the rule configuration from *rules_path*.

        Args:
            rules_path: YAML file holding the triangulation rules. A plain
                string path is accepted as well.

        Raises:
            OSError: If the file cannot be read.
            yaml.YAMLError: If the file is not valid YAML.
        """
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default locale encoding.
        self.rules = yaml.safe_load(Path(rules_path).read_text(encoding="utf-8"))

    # ──────────────────────────────────────────────────────────────────────────
    # Public API
    # ──────────────────────────────────────────────────────────────────────────

    def audit(self, entity_id: str, universe: pd.DataFrame) -> list[DiscrepancyRecord]:
        """Return all discrepancy records for one entity.

        Every row whose ``entity_id`` equals *entity_id* is evaluated, so the
        result agrees with what :meth:`run_full` reports for that entity even
        when the universe holds more than one row for it.

        Args:
            entity_id: Identifier to look up in the ``entity_id`` column.
            universe: Unified editor universe, one row per editor.

        Returns:
            The records found, in row order; an empty list when the entity is
            absent or no rule fires.
        """
        matches = universe[universe["entity_id"] == entity_id]
        records: list[DiscrepancyRecord] = []
        for _, row in matches.iterrows():
            records.extend(self._apply_all_rules(row))
        return records

    def run_full(self, universe: pd.DataFrame) -> pd.DataFrame:
        """Apply all rules to the full universe; return a flat DataFrame.

        Args:
            universe: Unified editor universe, one row per editor.

        Returns:
            One row per discrepancy with the ``DiscrepancyRecord`` fields as
            columns. When nothing is found the frame is empty but still has
            those columns.
        """
        records: list[dict] = []
        for _, row in universe.iterrows():
            records.extend(asdict(rec) for rec in self._apply_all_rules(row))
        if not records:
            # Keep the schema stable so downstream code can rely on the columns.
            return pd.DataFrame(columns=list(DiscrepancyRecord.__dataclass_fields__))
        return pd.DataFrame(records)

    # ──────────────────────────────────────────────────────────────────────────
    # Rule implementations
    # ──────────────────────────────────────────────────────────────────────────

    @staticmethod
    def _flag(row: pd.Series, col: str) -> bool:
        """Read a boolean column from *row*, treating a missing value as ``False``.

        ``bool(float("nan"))`` is ``True`` and ``bool(pd.NA)`` raises, so a bare
        ``bool(row.get(col, False))`` would read an absent value as a set flag
        (or crash). Missing scalars are therefore mapped to ``False`` first.
        """
        val = row.get(col, False)
        if pd.api.types.is_scalar(val) and pd.isna(val):
            return False
        return bool(val)

    def _apply_all_rules(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """Run every rule against *row*, yielding records in a fixed rule order."""
        yield from self._rule_axis_vs_tier(row)
        yield from self._rule_mech_vs_pfam(row)
        yield from self._rule_cargo_inconsistency(row)
        yield from self._rule_evidence_gap(row)
        yield from self._rule_size_inconsistency(row)

    def _rule_axis_vs_tier(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """AXIS_VS_TIER (high): S_DSB contradicts mech-class tier_a_gate for natural editors."""
        if row.get("source") != "natural":
            return
        s_dsb = _float(row, "s_dsb")
        tier_a = self._flag(row, "tier_a_gate")
        cfg = self.rules["discrepancy_categories"]["AXIS_VS_TIER"]
        if s_dsb is None:
            # No score to compare against the tier gate.
            return
        if s_dsb >= 0.95 and not tier_a:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="AXIS_VS_TIER",
                severity=cfg["severity"],
                sources_involved="MECH_CLASS|PEN_SCORE",
                details=(
                    f"S_DSB={s_dsb:.3f} ≥ 0.95 (pen-score: DSB-free) "
                    "but tier_a_gate=False (mech-class: not IS110 Tier-A). "
                    "pen-score and mech-class disagree on DSB-avoidance classification."
                ),
            )
        if s_dsb < 0.80 and tier_a:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="AXIS_VS_TIER",
                severity=cfg["severity"],
                sources_involved="MECH_CLASS|PEN_SCORE",
                details=(
                    "tier_a_gate=True (mech-class: IS110 Tier-A) "
                    f"but S_DSB={s_dsb:.3f} < 0.80 (pen-score: unexpectedly low for IS110). "
                    "Possible OOD probe or scoring artefact."
                ),
            )

    def _rule_mech_vs_pfam(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """MECH_VS_PFAM (high): PFAM-based atlas classification disagrees with mech-class tier."""
        if row.get("source") != "natural":
            return
        atlas = self._flag(row, "atlas_system_present")
        tier_a = self._flag(row, "tier_a_gate")
        s_dsb = _float(row, "s_dsb")
        cfg = self.rules["discrepancy_categories"]["MECH_VS_PFAM"]
        # Both checks below only apply when pen-score rates the editor DSB-free.
        dsb_free = s_dsb is not None and s_dsb >= 0.95
        if not dsb_free:
            return
        # In atlas (PFAM evidence for DSB-free) but mech-class says NOT IS110
        if atlas and not tier_a:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="MECH_VS_PFAM",
                severity=cfg["severity"],
                sources_involved="GENOME_ATLAS|MECH_CLASS",
                details=(
                    "atlas_system_present=True (genome-atlas has PFAM-based entry) "
                    f"and S_DSB={s_dsb:.3f} (pen-score: DSB-free), "
                    "but tier_a_gate=False (mech-class: not IS110 Tier-A). "
                    "PFAM evidence supports DSB-free mechanism; mech-class disagrees."
                ),
            )
        # mech-class says IS110 but no atlas entry — PFAM evidence absent
        if tier_a and not atlas:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="MECH_VS_PFAM",
                severity=cfg["severity"],
                sources_involved="GENOME_ATLAS|MECH_CLASS",
                details=(
                    "tier_a_gate=True (mech-class: IS110 Tier-A) "
                    "but atlas_system_present=False (genome-atlas has no entry). "
                    "mech-class calls IS110 without supporting PFAM atlas record."
                ),
            )

    def _rule_cargo_inconsistency(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """CARGO_INCONSISTENCY (medium): intrinsic_cargo=True but S_Cargo < 0.60."""
        intrinsic = self._flag(row, "intrinsic_cargo_mechanism")
        s_cargo = _float(row, "s_cargo")
        cfg = self.rules["discrepancy_categories"]["CARGO_INCONSISTENCY"]
        if intrinsic and s_cargo is not None and s_cargo < 0.60:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                # This rule is not restricted to natural editors, so the row's
                # own source label is carried through.
                source=str(row.get("source", "unknown")),
                category="CARGO_INCONSISTENCY",
                severity=cfg["severity"],
                sources_involved="PEN_SCORE",
                details=(
                    "intrinsic_cargo_mechanism=True (metadata: native cargo delivery) "
                    f"but S_Cargo={s_cargo:.3f} < 0.60 (pen-score: limited cargo demonstrated). "
                    "Metadata flag and cargo axis score are discordant."
                ),
            )

    def _rule_evidence_gap(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """EVIDENCE_GAP (low): IS110 confirmed bridge recombinase with no cell-based evidence."""
        if row.get("source") != "natural":
            return
        tier_a = self._flag(row, "tier_a_gate")
        s_dsb = _float(row, "s_dsb")
        cell = self._flag(row, "cell_based_evidence")
        cfg = self.rules["discrepancy_categories"]["EVIDENCE_GAP"]
        dsb_free = s_dsb is not None and s_dsb >= 0.95
        if tier_a and dsb_free and not cell:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="EVIDENCE_GAP",
                severity=cfg["severity"],
                sources_involved="MECH_CLASS|PEN_SCORE",
                details=(
                    f"tier_a_gate=True AND S_DSB={s_dsb:.3f} (confirmed IS110 bridge recombinase) "
                    "but cell_based_evidence=False. The Molecular Pen hypothesis is untested "
                    "in mammalian cells for this IS110 member."
                ),
            )

    def _rule_size_inconsistency(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """SIZE_INCONSISTENCY (medium): atlas entry present but length_aa unknown in pen-score."""
        if row.get("source") != "natural":
            return
        atlas = self._flag(row, "atlas_system_present")
        length_aa = row.get("length_aa")
        cfg = self.rules["discrepancy_categories"]["SIZE_INCONSISTENCY"]
        if atlas and pd.isna(length_aa):
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="SIZE_INCONSISTENCY",
                severity=cfg["severity"],
                sources_involved="GENOME_ATLAS|PEN_SCORE",
                details=(
                    "atlas_system_present=True (genome-atlas has entry with UniProt record) "
                    "but length_aa=None in unified universe (pen-score EditorEntry does not "
                    "expose sequence length). Gate 4 deliverability falls back to split_aav "
                    "heuristic; cross-source length verification not possible."
                ),
            )
# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────
def _float(row: pd.Series, col: str) -> float | None:
val = row.get(col)
if val is None or (hasattr(val, "__class__") and pd.isna(val)):
return None
try:
return float(val)
except (TypeError, ValueError):
return None