From d2ef3a2b84795dbd981cf03cb00f53a4121fa8b6 Mon Sep 17 00:00:00 2001 From: gbanyan Date: Thu, 12 Feb 2026 03:56:42 +0800 Subject: [PATCH] feat(05-01): implement tiering logic and evidence summary module - Add confidence tier classification (HIGH/MEDIUM/LOW) based on composite_score and evidence_count - Add supporting_layers and evidence_gaps columns per gene - Use vectorized polars expressions for performance - Configurable thresholds for tier assignment --- src/usher_pipeline/output/__init__.py | 14 ++++ src/usher_pipeline/output/evidence_summary.py | 82 ++++++++++++++++++ src/usher_pipeline/output/tiers.py | 83 +++++++++++++++++++ 3 files changed, 179 insertions(+) create mode 100644 src/usher_pipeline/output/__init__.py create mode 100644 src/usher_pipeline/output/evidence_summary.py create mode 100644 src/usher_pipeline/output/tiers.py diff --git a/src/usher_pipeline/output/__init__.py b/src/usher_pipeline/output/__init__.py new file mode 100644 index 0000000..94dedd5 --- /dev/null +++ b/src/usher_pipeline/output/__init__.py @@ -0,0 +1,14 @@ +"""Output generation: tiered candidate classification and dual-format file writing.""" + +from usher_pipeline.output.evidence_summary import EVIDENCE_LAYERS, add_evidence_summary +from usher_pipeline.output.tiers import TIER_THRESHOLDS, assign_tiers + +# writers.py exports will be added in Task 2 + +__all__ = [ + "assign_tiers", + "TIER_THRESHOLDS", + "add_evidence_summary", + "EVIDENCE_LAYERS", + # "write_candidate_output" will be added in Task 2 +] diff --git a/src/usher_pipeline/output/evidence_summary.py b/src/usher_pipeline/output/evidence_summary.py new file mode 100644 index 0000000..fb16e97 --- /dev/null +++ b/src/usher_pipeline/output/evidence_summary.py @@ -0,0 +1,82 @@ +"""Per-gene evidence summary: supporting layers and gaps.""" + +import polars as pl + +# Six evidence layer names (must match column names in scored_genes) +EVIDENCE_LAYERS = [ + "gnomad", + "expression", + "annotation", + "localization", + "animal_model", + "literature", +] + + +def add_evidence_summary(df: pl.DataFrame) -> pl.DataFrame: + """ + Add supporting_layers and evidence_gaps columns to scored genes. + + For each gene, identifies which evidence layers contributed scores + (supporting_layers) and which layers are missing (evidence_gaps). + + Args: + df: Polars DataFrame with columns like: + - gene_id, gene_symbol, composite_score, evidence_count + - gnomad_score, expression_score, annotation_score, etc. 
(all nullable) + + Returns: + DataFrame with two added columns: + - supporting_layers (str): comma-separated list of layers with non-NULL scores + - evidence_gaps (str): comma-separated list of layers with NULL scores + + Examples: + - Gene with gnomad, expression, annotation scores: + supporting_layers = "gnomad,expression,annotation" + evidence_gaps = "localization,animal_model,literature" + - Gene with all NULL scores: + supporting_layers = "" + evidence_gaps = "gnomad,expression,annotation,localization,animal_model,literature" + + Notes: + - Uses polars expressions (no pandas conversion) + - Empty string for supporting_layers if no evidence + - Preserves DataFrame order and all other columns + """ + # Build supporting_layers: comma-separated list of non-NULL layers + # Strategy: create a list column, filter nulls, join to string + supporting_exprs = [] + gap_exprs = [] + + for layer in EVIDENCE_LAYERS: + score_col = f"{layer}_score" + + # For supporting_layers: keep layer name if score is NOT NULL, else NULL + supporting_exprs.append( + pl.when(pl.col(score_col).is_not_null()) + .then(pl.lit(layer)) + .otherwise(pl.lit(None)) + ) + + # For evidence_gaps: keep layer name if score IS NULL, else NULL + gap_exprs.append( + pl.when(pl.col(score_col).is_null()) + .then(pl.lit(layer)) + .otherwise(pl.lit(None)) + ) + + # Combine into list columns, drop nulls, join with comma + result = df.with_columns( + # supporting_layers: join all non-NULL layer names + pl.concat_list(supporting_exprs) + .list.drop_nulls() + .list.join(",") + .alias("supporting_layers"), + # evidence_gaps: join all NULL layer names + pl.concat_list(gap_exprs) + .list.drop_nulls() + .list.join(",") + .alias("evidence_gaps"), + ) + + return result diff --git a/src/usher_pipeline/output/tiers.py b/src/usher_pipeline/output/tiers.py new file mode 100644 index 0000000..9beef70 --- /dev/null +++ b/src/usher_pipeline/output/tiers.py @@ -0,0 +1,83 @@ +"""Confidence tiering logic for scored candidate genes.""" + +import polars as pl + +# Default tier thresholds from research +TIER_THRESHOLDS = { + "HIGH": {"composite_score": 0.7, "evidence_count": 3}, + "MEDIUM": {"composite_score": 0.4, "evidence_count": 2}, + "LOW": {"composite_score": 0.2, "evidence_count": 1}, +} + + +def assign_tiers( + scored_df: pl.DataFrame, thresholds: dict | None = None +) -> pl.DataFrame: + """ + Assign confidence tiers to scored genes and filter out EXCLUDED genes. + + Uses configurable thresholds to classify genes into HIGH/MEDIUM/LOW tiers + based on composite_score and evidence_count. Genes below LOW threshold + are marked as EXCLUDED and filtered out. + + Args: + scored_df: Polars DataFrame with columns: + - gene_id (str) + - gene_symbol (str) + - composite_score (float, nullable) + - evidence_count (int) + - quality_flag (str) + - All 6 layer score columns (nullable) + - All 6 contribution columns (nullable) + thresholds: Optional dict overriding TIER_THRESHOLDS. Expected format: + { + "HIGH": {"composite_score": float, "evidence_count": int}, + "MEDIUM": {"composite_score": float, "evidence_count": int}, + "LOW": {"composite_score": float, "evidence_count": int}, + } + + Returns: + DataFrame with added confidence_tier column (str), sorted by + composite_score DESC, gene_id ASC. EXCLUDED genes are filtered out. 
+ + Notes: + - Uses vectorized polars expressions (not row-by-row iteration) + - Genes with NULL composite_score are always EXCLUDED + - Deterministic sorting for reproducibility + - Filtering happens before return (EXCLUDED rows removed) + """ + # Use provided thresholds or defaults + t = thresholds if thresholds is not None else TIER_THRESHOLDS + + # Extract threshold values for readability + high_score = t["HIGH"]["composite_score"] + high_count = t["HIGH"]["evidence_count"] + med_score = t["MEDIUM"]["composite_score"] + med_count = t["MEDIUM"]["evidence_count"] + low_score = t["LOW"]["composite_score"] + + # Add confidence_tier column using vectorized when/then/otherwise chain + result = scored_df.with_columns( + pl.when( + (pl.col("composite_score") >= high_score) + & (pl.col("evidence_count") >= high_count) + ) + .then(pl.lit("HIGH")) + .when( + (pl.col("composite_score") >= med_score) + & (pl.col("evidence_count") >= med_count) + ) + .then(pl.lit("MEDIUM")) + .when(pl.col("composite_score") >= low_score) + .then(pl.lit("LOW")) + .otherwise(pl.lit("EXCLUDED")) + .alias("confidence_tier") + ) + + # Filter out EXCLUDED genes + result = result.filter(pl.col("confidence_tier") != "EXCLUDED") + + # Sort deterministically: composite_score DESC, gene_id ASC + result = result.sort(["composite_score", "gene_id"], descending=[True, False]) + + return result
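
A minimal usage sketch for the two new modules, assuming a scored_genes frame with the column layout described in the docstrings above. The gene IDs, symbols, scores, and quality_flag values below are illustrative placeholders, and the contribution columns are omitted because neither function in this patch reads them; a real frame would come from the upstream scoring stage.

import polars as pl

from usher_pipeline.output import add_evidence_summary, assign_tiers

# Hypothetical scored_genes frame; values are illustrative only.
scored_genes = pl.DataFrame(
    {
        "gene_id": ["ENSG0000001", "ENSG0000002", "ENSG0000003"],
        "gene_symbol": ["MYO7A", "USH2A", "GENEX"],
        "composite_score": [0.82, 0.45, None],
        "evidence_count": [5, 2, 0],
        "quality_flag": ["ok", "ok", "no_evidence"],
        "gnomad_score": [0.9, 0.5, None],
        "expression_score": [0.8, None, None],
        "annotation_score": [0.7, 0.4, None],
        "localization_score": [0.9, None, None],
        "animal_model_score": [None, None, None],
        "literature_score": [0.6, None, None],
    }
)

# assign_tiers drops EXCLUDED rows (NULL composite_score or below the LOW
# threshold) and sorts by composite_score DESC, gene_id ASC; the summary then
# adds supporting_layers / evidence_gaps for the surviving genes.
tiered = assign_tiers(scored_genes)
summarised = add_evidence_summary(tiered)
print(
    summarised.select(
        ["gene_symbol", "confidence_tier", "supporting_layers", "evidence_gaps"]
    )
)

# Thresholds can be overridden per run, e.g. a stricter HIGH tier:
strict = assign_tiers(
    scored_genes,
    thresholds={
        "HIGH": {"composite_score": 0.8, "evidence_count": 4},
        "MEDIUM": {"composite_score": 0.5, "evidence_count": 2},
        "LOW": {"composite_score": 0.25, "evidence_count": 1},
    },
)

With the default thresholds the first gene lands in HIGH (0.82 with five supporting layers), the second in MEDIUM (0.45 with two), and the third has no composite_score, so it is filtered out before the summary step.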