feat(05-01): implement tiering logic and evidence summary module
- Add confidence tier classification (HIGH/MEDIUM/LOW) based on composite_score and evidence_count
- Add supporting_layers and evidence_gaps columns per gene
- Use vectorized polars expressions for performance
- Configurable thresholds for tier assignment
src/usher_pipeline/output/__init__.py (new file, 14 lines)
@@ -0,0 +1,14 @@
"""Output generation: tiered candidate classification and dual-format file writing."""

from usher_pipeline.output.evidence_summary import EVIDENCE_LAYERS, add_evidence_summary
from usher_pipeline.output.tiers import TIER_THRESHOLDS, assign_tiers

# writers.py exports will be added in Task 2

__all__ = [
    "assign_tiers",
    "TIER_THRESHOLDS",
    "add_evidence_summary",
    "EVIDENCE_LAYERS",
    # "write_candidate_output" will be added in Task 2
]
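For orientation, a minimal sketch of how the two exports above might be composed downstream; build_candidates is a hypothetical helper written for illustration, not part of this commit.

import polars as pl

from usher_pipeline.output import add_evidence_summary, assign_tiers


def build_candidates(scored_genes: pl.DataFrame) -> pl.DataFrame:
    """Tier the scored genes, then annotate each kept row with its evidence summary."""
    tiered = assign_tiers(scored_genes)    # adds confidence_tier, drops EXCLUDED rows
    return add_evidence_summary(tiered)    # adds supporting_layers / evidence_gaps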
src/usher_pipeline/output/evidence_summary.py (new file, 82 lines)
@@ -0,0 +1,82 @@
"""Per-gene evidence summary: supporting layers and gaps."""

import polars as pl

# Six evidence layer names (must match column names in scored_genes)
EVIDENCE_LAYERS = [
    "gnomad",
    "expression",
    "annotation",
    "localization",
    "animal_model",
    "literature",
]


def add_evidence_summary(df: pl.DataFrame) -> pl.DataFrame:
    """
    Add supporting_layers and evidence_gaps columns to scored genes.

    For each gene, identifies which evidence layers contributed scores
    (supporting_layers) and which layers are missing (evidence_gaps).

    Args:
        df: Polars DataFrame with columns like:
            - gene_id, gene_symbol, composite_score, evidence_count
            - gnomad_score, expression_score, annotation_score, etc. (all nullable)

    Returns:
        DataFrame with two added columns:
            - supporting_layers (str): comma-separated list of layers with non-NULL scores
            - evidence_gaps (str): comma-separated list of layers with NULL scores

    Examples:
        - Gene with gnomad, expression, annotation scores:
            supporting_layers = "gnomad,expression,annotation"
            evidence_gaps = "localization,animal_model,literature"
        - Gene with all NULL scores:
            supporting_layers = ""
            evidence_gaps = "gnomad,expression,annotation,localization,animal_model,literature"

    Notes:
        - Uses polars expressions (no pandas conversion)
        - Empty string for supporting_layers if no evidence
        - Preserves DataFrame order and all other columns
    """
    # Build supporting_layers: comma-separated list of non-NULL layers
    # Strategy: create a list column, filter nulls, join to string
    supporting_exprs = []
    gap_exprs = []

    for layer in EVIDENCE_LAYERS:
        score_col = f"{layer}_score"

        # For supporting_layers: keep layer name if score is NOT NULL, else NULL
        supporting_exprs.append(
            pl.when(pl.col(score_col).is_not_null())
            .then(pl.lit(layer))
            .otherwise(pl.lit(None))
        )

        # For evidence_gaps: keep layer name if score IS NULL, else NULL
        gap_exprs.append(
            pl.when(pl.col(score_col).is_null())
            .then(pl.lit(layer))
            .otherwise(pl.lit(None))
        )

    # Combine into list columns, drop nulls, join with comma
    result = df.with_columns(
        # supporting_layers: join all non-NULL layer names
        pl.concat_list(supporting_exprs)
        .list.drop_nulls()
        .list.join(",")
        .alias("supporting_layers"),
        # evidence_gaps: join all NULL layer names
        pl.concat_list(gap_exprs)
        .list.drop_nulls()
        .list.join(",")
        .alias("evidence_gaps"),
    )

    return result
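A minimal sketch of this function on a toy frame, assuming only the six *_score columns are required (as the loop above suggests); the gene IDs are hypothetical and the expected strings follow the docstring examples.

import polars as pl

from usher_pipeline.output.evidence_summary import add_evidence_summary

toy = pl.DataFrame(
    {
        "gene_id": ["ENSG_A", "ENSG_B"],  # hypothetical IDs, for illustration only
        "gnomad_score": [0.9, None],
        "expression_score": [0.8, None],
        "annotation_score": [0.5, None],
        "localization_score": [None, None],
        "animal_model_score": [None, None],
        "literature_score": [None, None],
    },
    schema_overrides={  # give the all-null columns an explicit float dtype
        "localization_score": pl.Float64,
        "animal_model_score": pl.Float64,
        "literature_score": pl.Float64,
    },
)

out = add_evidence_summary(toy)
# Expected, per the docstring examples:
#   ENSG_A: supporting_layers = "gnomad,expression,annotation"
#           evidence_gaps     = "localization,animal_model,literature"
#   ENSG_B: supporting_layers = "" and evidence_gaps lists all six layers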
src/usher_pipeline/output/tiers.py (new file, 83 lines)
@@ -0,0 +1,83 @@
"""Confidence tiering logic for scored candidate genes."""

import polars as pl

# Default tier thresholds from research
TIER_THRESHOLDS = {
    "HIGH": {"composite_score": 0.7, "evidence_count": 3},
    "MEDIUM": {"composite_score": 0.4, "evidence_count": 2},
    "LOW": {"composite_score": 0.2, "evidence_count": 1},
}


def assign_tiers(
    scored_df: pl.DataFrame, thresholds: dict | None = None
) -> pl.DataFrame:
    """
    Assign confidence tiers to scored genes and filter out EXCLUDED genes.

    Uses configurable thresholds to classify genes into HIGH/MEDIUM/LOW tiers
    based on composite_score and evidence_count. Genes below the LOW threshold
    are marked as EXCLUDED and filtered out.

    Args:
        scored_df: Polars DataFrame with columns:
            - gene_id (str)
            - gene_symbol (str)
            - composite_score (float, nullable)
            - evidence_count (int)
            - quality_flag (str)
            - All 6 layer score columns (nullable)
            - All 6 contribution columns (nullable)
        thresholds: Optional dict overriding TIER_THRESHOLDS. Expected format:
            {
                "HIGH": {"composite_score": float, "evidence_count": int},
                "MEDIUM": {"composite_score": float, "evidence_count": int},
                "LOW": {"composite_score": float, "evidence_count": int},
            }

    Returns:
        DataFrame with added confidence_tier column (str), sorted by
        composite_score DESC, gene_id ASC. EXCLUDED genes are filtered out.

    Notes:
        - Uses vectorized polars expressions (not row-by-row iteration)
        - Genes with NULL composite_score are always EXCLUDED
        - Deterministic sorting for reproducibility
        - Filtering happens before return (EXCLUDED rows removed)
    """
    # Use provided thresholds or defaults
    t = thresholds if thresholds is not None else TIER_THRESHOLDS

    # Extract threshold values for readability
    high_score = t["HIGH"]["composite_score"]
    high_count = t["HIGH"]["evidence_count"]
    med_score = t["MEDIUM"]["composite_score"]
    med_count = t["MEDIUM"]["evidence_count"]
    low_score = t["LOW"]["composite_score"]

    # Add confidence_tier column using vectorized when/then/otherwise chain
    result = scored_df.with_columns(
        pl.when(
            (pl.col("composite_score") >= high_score)
            & (pl.col("evidence_count") >= high_count)
        )
        .then(pl.lit("HIGH"))
        .when(
            (pl.col("composite_score") >= med_score)
            & (pl.col("evidence_count") >= med_count)
        )
        .then(pl.lit("MEDIUM"))
        .when(pl.col("composite_score") >= low_score)
        .then(pl.lit("LOW"))
        .otherwise(pl.lit("EXCLUDED"))
        .alias("confidence_tier")
    )

    # Filter out EXCLUDED genes
    result = result.filter(pl.col("confidence_tier") != "EXCLUDED")

    # Sort deterministically: composite_score DESC, gene_id ASC
    result = result.sort(["composite_score", "gene_id"], descending=[True, False])

    return result
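A minimal sketch of tier assignment with the defaults and with an override, assuming a toy table that carries only the columns this function actually touches (the real scored table has the full column set listed in the docstring); the gene IDs and the stricter thresholds are hypothetical.

import polars as pl

from usher_pipeline.output.tiers import assign_tiers

scored = pl.DataFrame(
    {
        "gene_id": ["ENSG_01", "ENSG_02", "ENSG_03", "ENSG_04"],  # hypothetical IDs
        "composite_score": [0.85, 0.55, 0.25, None],
        "evidence_count": [4, 2, 1, 0],
    }
)

# With TIER_THRESHOLDS: ENSG_01 -> HIGH, ENSG_02 -> MEDIUM, ENSG_03 -> LOW,
# ENSG_04 -> EXCLUDED (NULL composite_score) and therefore dropped.
tiered = assign_tiers(scored)

# Stricter, hypothetical thresholds in the same format as TIER_THRESHOLDS.
strict = {
    "HIGH": {"composite_score": 0.9, "evidence_count": 4},
    "MEDIUM": {"composite_score": 0.6, "evidence_count": 3},
    "LOW": {"composite_score": 0.3, "evidence_count": 1},
}
tiered_strict = assign_tiers(scored, thresholds=strict)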