Source code for pen_compare.triangulation.triangulator

"""Triangulator: compare claims across the 4 PEN-STACK packages.

Each discrepancy category corresponds to a rule in config/triangulation_rules_v3.yaml.
Rules are applied to every row in the unified editor universe; results are returned
as DiscrepancyRecord objects that can be serialised to a Parquet file.
"""

from collections.abc import Iterator
from dataclasses import asdict, dataclass
from pathlib import Path

import pandas as pd
import yaml


@dataclass
class DiscrepancyRecord:
    """One discrepancy found for a single entity by a triangulation rule.

    Instances are plain data holders; ``Triangulator.run_full`` converts them
    to dicts with ``dataclasses.asdict`` to build a flat DataFrame.
    """

    entity_id: str
    source: str  # "natural" | "design"
    category: str  # matches a key in triangulation_rules_v3.yaml
    severity: str  # high | medium | low
    sources_involved: str  # pipe-separated source names
    details: str  # human-readable explanation of the discrepancy
class Triangulator:
    """Apply the triangulation rules to rows of the unified editor universe.

    The rules file is parsed once at construction time.  Every rule method
    reads its severity from
    ``self.rules["discrepancy_categories"][<CATEGORY>]["severity"]`` and
    yields zero or more ``DiscrepancyRecord`` objects for a single row.
    """

    def __init__(
        self,
        rules_path: Path = Path("config/triangulation_rules_v3.yaml"),
    ) -> None:
        """Load the rule configuration.

        Args:
            rules_path: Path to the YAML rules file.
        """
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        self.rules = yaml.safe_load(rules_path.read_text(encoding="utf-8"))

    # ──────────────────────────────────────────────────────────────────────────
    # Public API
    # ──────────────────────────────────────────────────────────────────────────

    def audit(self, entity_id: str, universe: pd.DataFrame) -> list[DiscrepancyRecord]:
        """Return all discrepancy records for one entity.

        Args:
            entity_id: Value to look up in the ``entity_id`` column.
            universe: Unified universe; must contain an ``entity_id`` column.

        Returns:
            The records produced by every rule for the entity, or an empty
            list when no row matches.
        """
        matches = universe.loc[universe["entity_id"] == entity_id]
        if matches.empty:
            return []
        # NOTE(review): only the first matching row is audited; this assumes
        # entity_id is unique in the universe — confirm with the producer.
        return list(self._apply_all_rules(matches.iloc[0]))

    def run_full(self, universe: pd.DataFrame) -> pd.DataFrame:
        """Apply all rules to the full universe; return a flat DataFrame.

        Returns:
            One row per discrepancy, with one column per ``DiscrepancyRecord``
            field.  When nothing is found the frame is empty but still
            carries those columns.
        """
        records = [
            asdict(rec)
            for _, row in universe.iterrows()
            for rec in self._apply_all_rules(row)
        ]
        if not records:
            return pd.DataFrame(columns=list(DiscrepancyRecord.__dataclass_fields__))
        return pd.DataFrame(records)

    # ──────────────────────────────────────────────────────────────────────────
    # Rule implementations
    # ──────────────────────────────────────────────────────────────────────────

    @staticmethod
    def _flag(row: pd.Series, col: str) -> bool:
        """Read a boolean flag from *row*, treating absent/missing as False.

        ``bool(float("nan"))`` is True and ``bool(pd.NA)`` raises, so a plain
        ``bool(row.get(col, False))`` would turn a missing cell into a
        positive flag (or crash).  Missing cells are mapped to False here,
        matching the default already used for absent columns.
        """
        value = row.get(col, False)
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return False
        return bool(value)

    def _apply_all_rules(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """Yield the records of every rule, in a fixed rule order, for *row*."""
        yield from self._rule_axis_vs_tier(row)
        yield from self._rule_mech_vs_pfam(row)
        yield from self._rule_cargo_inconsistency(row)
        yield from self._rule_evidence_gap(row)
        yield from self._rule_size_inconsistency(row)

    def _rule_axis_vs_tier(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """AXIS_VS_TIER (high): S_DSB contradicts mech-class tier_a_gate for natural editors."""
        if row.get("source") != "natural":
            return

        s_dsb = _float(row, "s_dsb")
        tier_a = self._flag(row, "tier_a_gate")
        cfg = self.rules["discrepancy_categories"]["AXIS_VS_TIER"]

        # Without a usable score neither direction of the check can fire.
        if s_dsb is None:
            return

        if s_dsb >= 0.95 and not tier_a:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="AXIS_VS_TIER",
                severity=cfg["severity"],
                sources_involved="MECH_CLASS|PEN_SCORE",
                details=(
                    f"S_DSB={s_dsb:.3f} ≥ 0.95 (pen-score: DSB-free) "
                    "but tier_a_gate=False (mech-class: not IS110 Tier-A). "
                    "pen-score and mech-class disagree on DSB-avoidance classification."
                ),
            )

        if s_dsb < 0.80 and tier_a:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="AXIS_VS_TIER",
                severity=cfg["severity"],
                sources_involved="MECH_CLASS|PEN_SCORE",
                details=(
                    "tier_a_gate=True (mech-class: IS110 Tier-A) "
                    f"but S_DSB={s_dsb:.3f} < 0.80 (pen-score: unexpectedly low for IS110). "
                    "Possible OOD probe or scoring artefact."
                ),
            )

    def _rule_mech_vs_pfam(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """MECH_VS_PFAM (high): PFAM-based atlas classification disagrees with mech-class tier."""
        if row.get("source") != "natural":
            return

        atlas = self._flag(row, "atlas_system_present")
        tier_a = self._flag(row, "tier_a_gate")
        s_dsb = _float(row, "s_dsb")
        cfg = self.rules["discrepancy_categories"]["MECH_VS_PFAM"]

        # Both directions of this rule only apply to DSB-free scores.
        # ``not >=`` (rather than ``<``) also rejects a NaN score.
        if s_dsb is None or not s_dsb >= 0.95:
            return

        # In atlas (PFAM evidence for DSB-free) but mech-class says NOT IS110
        if atlas and not tier_a:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="MECH_VS_PFAM",
                severity=cfg["severity"],
                sources_involved="GENOME_ATLAS|MECH_CLASS",
                details=(
                    "atlas_system_present=True (genome-atlas has PFAM-based entry) "
                    f"and S_DSB={s_dsb:.3f} (pen-score: DSB-free), "
                    "but tier_a_gate=False (mech-class: not IS110 Tier-A). "
                    "PFAM evidence supports DSB-free mechanism; mech-class disagrees."
                ),
            )

        # mech-class says IS110 but no atlas entry — PFAM evidence absent
        if tier_a and not atlas:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="MECH_VS_PFAM",
                severity=cfg["severity"],
                sources_involved="GENOME_ATLAS|MECH_CLASS",
                details=(
                    "tier_a_gate=True (mech-class: IS110 Tier-A) "
                    "but atlas_system_present=False (genome-atlas has no entry). "
                    "mech-class calls IS110 without supporting PFAM atlas record."
                ),
            )

    def _rule_cargo_inconsistency(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """CARGO_INCONSISTENCY (medium): intrinsic_cargo=True but S_Cargo < 0.60."""
        # Unlike the other rules, this one is not restricted to natural rows;
        # the row's own source is copied onto the record.
        intrinsic = self._flag(row, "intrinsic_cargo_mechanism")
        s_cargo = _float(row, "s_cargo")
        cfg = self.rules["discrepancy_categories"]["CARGO_INCONSISTENCY"]

        if intrinsic and s_cargo is not None and s_cargo < 0.60:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source=str(row.get("source", "unknown")),
                category="CARGO_INCONSISTENCY",
                severity=cfg["severity"],
                sources_involved="PEN_SCORE",
                details=(
                    "intrinsic_cargo_mechanism=True (metadata: native cargo delivery) "
                    f"but S_Cargo={s_cargo:.3f} < 0.60 (pen-score: limited cargo demonstrated). "
                    "Metadata flag and cargo axis score are discordant."
                ),
            )

    def _rule_evidence_gap(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """EVIDENCE_GAP (low): IS110 confirmed bridge recombinase with no cell-based evidence."""
        if row.get("source") != "natural":
            return

        tier_a = self._flag(row, "tier_a_gate")
        s_dsb = _float(row, "s_dsb")
        cell = self._flag(row, "cell_based_evidence")
        cfg = self.rules["discrepancy_categories"]["EVIDENCE_GAP"]

        if tier_a and s_dsb is not None and s_dsb >= 0.95 and not cell:
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="EVIDENCE_GAP",
                severity=cfg["severity"],
                sources_involved="MECH_CLASS|PEN_SCORE",
                details=(
                    f"tier_a_gate=True AND S_DSB={s_dsb:.3f} (confirmed IS110 bridge recombinase) "
                    "but cell_based_evidence=False. The Molecular Pen hypothesis is untested "
                    "in mammalian cells for this IS110 member."
                ),
            )

    def _rule_size_inconsistency(self, row: pd.Series) -> Iterator[DiscrepancyRecord]:
        """SIZE_INCONSISTENCY (medium): atlas entry present but length_aa unknown in pen-score."""
        if row.get("source") != "natural":
            return

        atlas = self._flag(row, "atlas_system_present")
        length_aa = row.get("length_aa")
        cfg = self.rules["discrepancy_categories"]["SIZE_INCONSISTENCY"]

        # An absent column (None) and a NaN cell both count as "unknown".
        if atlas and pd.isna(length_aa):
            yield DiscrepancyRecord(
                entity_id=str(row["entity_id"]),
                source="natural",
                category="SIZE_INCONSISTENCY",
                severity=cfg["severity"],
                sources_involved="GENOME_ATLAS|PEN_SCORE",
                details=(
                    "atlas_system_present=True (genome-atlas has entry with UniProt record) "
                    "but length_aa=None in unified universe (pen-score EditorEntry does not "
                    "expose sequence length). Gate 4 deliverability falls back to split_aav "
                    "heuristic; cross-source length verification not possible."
                ),
            )
# ────────────────────────────────────────────────────────────────────────────── # Helpers # ────────────────────────────────────────────────────────────────────────────── def _float(row: pd.Series, col: str) -> float | None: val = row.get(col) if val is None or (hasattr(val, "__class__") and pd.isna(val)): return None try: return float(val) except (TypeError, ValueError): return None