feat(05-01): implement tiering logic and evidence summary module

- Add confidence tier classification (HIGH/MEDIUM/LOW) based on composite_score and evidence_count
- Add supporting_layers and evidence_gaps columns per gene
- Use vectorized polars expressions for performance
- Configurable thresholds for tier assignment
2026-02-12 03:56:42 +08:00
parent 6ab7fd1378
commit d2ef3a2b84
3 changed files with 179 additions and 0 deletions
--- a/src/usher_pipeline/output/__init__.py
+++ b/src/usher_pipeline/output/__init__.py
@@ -0,0 +1,14 @@
 """Output generation: tiered candidate classification and dual-format file writing."""
 from usher_pipeline.output.evidence_summary import EVIDENCE_LAYERS, add_evidence_summary
 from usher_pipeline.output.tiers import TIER_THRESHOLDS, assign_tiers
 # Public names of the output package: the tiering and evidence-summary
 # entry points plus their module-level configuration constants, all
 # re-exported from the submodules imported above.
 # writers.py exports will be added in Task 2
 __all__ = [
    "assign_tiers",
    "TIER_THRESHOLDS",
    "add_evidence_summary",
    "EVIDENCE_LAYERS",
    # "write_candidate_output" will be added in Task 2
 ]
--- a/src/usher_pipeline/output/evidence_summary.py
+++ b/src/usher_pipeline/output/evidence_summary.py
@@ -0,0 +1,82 @@
 """Per-gene evidence summary: supporting layers and gaps."""
 import polars as pl
 # Six evidence layer names (must match column names in scored_genes).
 # add_evidence_summary reads the column "<layer>_score" for each entry, and
 # the order of this list is the order in which layer names appear in the
 # comma-separated supporting_layers / evidence_gaps strings.
 EVIDENCE_LAYERS = [
    "gnomad",
    "expression",
    "annotation",
    "localization",
    "animal_model",
    "literature",
 ]
 def add_evidence_summary(df: pl.DataFrame) -> pl.DataFrame:
    """
    Annotate each gene with the evidence layers it has and the ones it lacks.

    For every name in EVIDENCE_LAYERS the column "<name>_score" is checked
    for NULL. Layers with a non-NULL score are collected into
    supporting_layers; layers with a NULL score are collected into
    evidence_gaps. Within each string the names keep the order of
    EVIDENCE_LAYERS.

    Args:
        df: Polars DataFrame holding one "<layer>_score" column per entry
            of EVIDENCE_LAYERS (gnomad_score, expression_score, ...). The
            score columns may contain NULLs. Any other columns (gene_id,
            gene_symbol, composite_score, evidence_count, ...) are carried
            through untouched.

    Returns:
        A DataFrame with the same rows plus two string columns:
            - supporting_layers: comma-separated layers with a non-NULL score
            - evidence_gaps: comma-separated layers with a NULL score

    Examples:
        - gnomad, expression and annotation scores present:
          supporting_layers = "gnomad,expression,annotation"
          evidence_gaps = "localization,animal_model,literature"
        - every score NULL:
          supporting_layers = ""
          evidence_gaps = "gnomad,expression,annotation,localization,animal_model,literature"

    Notes:
        - Built purely from polars expressions; no pandas round trip.
        - A gene with no evidence gets an empty string, not NULL.
    """
    def _layers_where(want_null, out_name):
        # One expression per layer: the layer's name when its score matches
        # the requested NULL-ness, NULL otherwise.
        labelled = [
            pl.when(
                pl.col(f"{name}_score").is_null()
                if want_null
                else pl.col(f"{name}_score").is_not_null()
            )
            .then(pl.lit(name))
            .otherwise(pl.lit(None))
            for name in EVIDENCE_LAYERS
        ]
        # Gather the per-layer labels into a list, discard the NULL
        # placeholders, and flatten what is left into "a,b,c".
        kept = pl.concat_list(labelled).list.drop_nulls()
        return kept.list.join(",").alias(out_name)

    present = _layers_where(False, "supporting_layers")
    missing = _layers_where(True, "evidence_gaps")
    return df.with_columns(present, missing)
--- a/src/usher_pipeline/output/tiers.py
+++ b/src/usher_pipeline/output/tiers.py
@@ -0,0 +1,83 @@
 """Confidence tiering logic for scored candidate genes."""
 import polars as pl
 # Default tier thresholds from research.
 # Keys are tier names; each value holds a composite_score threshold and an
 # evidence_count threshold. assign_tiers uses this mapping when the caller
 # passes no override, testing HIGH first, then MEDIUM, then LOW.
 TIER_THRESHOLDS = {
    "HIGH": {"composite_score": 0.7, "evidence_count": 3},
    "MEDIUM": {"composite_score": 0.4, "evidence_count": 2},
    "LOW": {"composite_score": 0.2, "evidence_count": 1},
 }
 def assign_tiers(
    scored_df: pl.DataFrame, thresholds: dict | None = None
 ) -> pl.DataFrame:
    """
    Assign confidence tiers to scored genes and filter out EXCLUDED genes.

    Each gene receives the first tier whose rule it satisfies, tested in the
    order HIGH, MEDIUM, LOW. A rule is satisfied when composite_score and
    evidence_count are both >= that tier's thresholds. Genes satisfying no
    rule are labelled EXCLUDED and removed before returning.

    Args:
        scored_df: Polars DataFrame. The columns read here are
            composite_score (float, nullable), evidence_count (int) and
            gene_id (str); all other columns (gene_symbol, quality_flag,
            layer scores, contributions, ...) pass through unchanged.
        thresholds: Optional dict used instead of TIER_THRESHOLDS.
            Expected format:
            {
                "HIGH": {"composite_score": float, "evidence_count": int},
                "MEDIUM": {"composite_score": float, "evidence_count": int},
                "LOW": {"composite_score": float, "evidence_count": int},
            }
            The LOW "evidence_count" entry may be omitted; LOW is then
            decided on composite_score alone.

    Returns:
        DataFrame with an added confidence_tier column (str) holding
        "HIGH", "MEDIUM" or "LOW", sorted by composite_score DESC then
        gene_id ASC. EXCLUDED genes are not present in the result.

    Raises:
        KeyError: if thresholds lacks a tier, or lacks a key other than
            the optional LOW "evidence_count".

    Notes:
        - Uses vectorized polars expressions (not row-by-row iteration).
        - A NULL composite_score makes every comparison NULL, so the gene
          falls through to EXCLUDED.
        - Sorting on (composite_score, gene_id) keeps output reproducible.
    """
    t = TIER_THRESHOLDS if thresholds is None else thresholds
    high, medium, low = t["HIGH"], t["MEDIUM"], t["LOW"]

    score = pl.col("composite_score")
    count = pl.col("evidence_count")
    # NOTE(review): NaN is not NULL, so a NaN composite_score is not covered
    # by the NULL rule above - confirm upstream scoring never emits NaN.

    high_rule = (score >= high["composite_score"]) & (
        count >= high["evidence_count"]
    )
    medium_rule = (score >= medium["composite_score"]) & (
        count >= medium["evidence_count"]
    )
    # The LOW evidence_count threshold used to be ignored, so a configured
    # minimum had no effect. Apply it when supplied; dicts that leave it out
    # keep the score-only rule.
    low_rule = score >= low["composite_score"]
    low_count = low.get("evidence_count")
    if low_count is not None:
        low_rule = low_rule & (count >= low_count)

    tier = (
        pl.when(high_rule)
        .then(pl.lit("HIGH"))
        .when(medium_rule)
        .then(pl.lit("MEDIUM"))
        .when(low_rule)
        .then(pl.lit("LOW"))
        .otherwise(pl.lit("EXCLUDED"))
        .alias("confidence_tier")
    )

    return (
        scored_df.with_columns(tier)
        .filter(pl.col("confidence_tier") != "EXCLUDED")
        .sort(["composite_score", "gene_id"], descending=[True, False])
    )