feat(05-01): implement tiering logic and evidence summary module

- Add confidence tier classification (HIGH/MEDIUM/LOW) based on composite_score and evidence_count
- Add supporting_layers and evidence_gaps columns per gene
- Use vectorized polars expressions for performance
- Configurable thresholds for tier assignment
This commit is contained in:
2026-02-12 03:56:42 +08:00
parent 6ab7fd1378
commit d2ef3a2b84
3 changed files with 179 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
"""Output generation: tiered candidate classification and dual-format file writing."""
from usher_pipeline.output.evidence_summary import EVIDENCE_LAYERS, add_evidence_summary
from usher_pipeline.output.tiers import TIER_THRESHOLDS, assign_tiers
# Names re-exported by this package (what `from ... import *` exposes).
# Exports from writers.py will be added in Task 2.
__all__ = [
    "assign_tiers",
    "TIER_THRESHOLDS",
    "add_evidence_summary",
    "EVIDENCE_LAYERS",
    # "write_candidate_output" will be added in Task 2
]

View File

@@ -0,0 +1,82 @@
"""Per-gene evidence summary: supporting layers and gaps."""
import polars as pl
# Six evidence layer names. add_evidence_summary looks up each layer's score
# in the column named "<layer>_score", so every entry here must have a matching
# "<layer>_score" column in the scored-genes frame.
EVIDENCE_LAYERS = [
    "gnomad",
    "expression",
    "annotation",
    "localization",
    "animal_model",
    "literature",
]
def add_evidence_summary(df: pl.DataFrame) -> pl.DataFrame:
    """
    Annotate every gene with the evidence layers it has and the ones it lacks.

    A layer counts as *supporting* for a row when its ``<layer>_score`` value
    is non-NULL, and as a *gap* when that value is NULL. Layer names appear in
    the order they are listed in EVIDENCE_LAYERS.

    Args:
        df: Polars DataFrame of scored genes. It must contain a
            ``<layer>_score`` column (nullable) for each name in
            EVIDENCE_LAYERS; any other columns are carried through.

    Returns:
        DataFrame with two added columns:
            - supporting_layers (str): comma-separated layers with non-NULL scores
            - evidence_gaps (str): comma-separated layers with NULL scores

    Examples:
        - Gene with gnomad, expression, annotation scores:
            supporting_layers = "gnomad,expression,annotation"
            evidence_gaps = "localization,animal_model,literature"
        - Gene with all NULL scores:
            supporting_layers = ""
            evidence_gaps = "gnomad,expression,annotation,localization,animal_model,literature"

    Notes:
        - Built entirely from polars expressions (no pandas conversion)
        - supporting_layers is the empty string when a gene has no evidence
        - Row order and all existing columns are left as they were
    """

    def _joined_layer_names(present: bool) -> pl.Expr:
        # Per layer: emit the layer name where the score's null-state matches
        # what we are collecting, NULL elsewhere. The NULL placeholders are
        # dropped from the per-row list before it is joined into one string.
        tagged = []
        for name in EVIDENCE_LAYERS:
            score = pl.col(f"{name}_score")
            wanted = score.is_not_null() if present else score.is_null()
            tagged.append(
                pl.when(wanted).then(pl.lit(name)).otherwise(pl.lit(None))
            )
        return pl.concat_list(tagged).list.drop_nulls().list.join(",")

    return df.with_columns(
        _joined_layer_names(present=True).alias("supporting_layers"),
        _joined_layer_names(present=False).alias("evidence_gaps"),
    )

View File

@@ -0,0 +1,83 @@
"""Confidence tiering logic for scored candidate genes."""
import polars as pl
# Default tier thresholds from research.
# Keyed by tier name; each value holds that tier's composite_score and
# evidence_count cut-offs. assign_tiers falls back to this dict when the
# caller does not pass its own thresholds.
TIER_THRESHOLDS = {
    "HIGH": {"composite_score": 0.7, "evidence_count": 3},
    "MEDIUM": {"composite_score": 0.4, "evidence_count": 2},
    "LOW": {"composite_score": 0.2, "evidence_count": 1},
}
def assign_tiers(
    scored_df: pl.DataFrame, thresholds: dict | None = None
) -> pl.DataFrame:
    """
    Assign confidence tiers to scored genes and filter out EXCLUDED genes.

    Each gene receives the first tier whose cut-offs it satisfies, checked in
    the order HIGH, MEDIUM, LOW. A tier is satisfied when composite_score and
    evidence_count are both at or above that tier's thresholds. Genes that
    satisfy no tier are labelled EXCLUDED and removed from the result.

    Args:
        scored_df: Polars DataFrame of scored genes. This function reads:
            - composite_score (float, nullable)
            - evidence_count (int)
            - gene_id (used only as the sort tie-breaker)
            All other columns are passed through unchanged.
        thresholds: Optional dict used instead of TIER_THRESHOLDS. Expected format:
            {
                "HIGH": {"composite_score": float, "evidence_count": int},
                "MEDIUM": {"composite_score": float, "evidence_count": int},
                "LOW": {"composite_score": float, "evidence_count": int},
            }
            "evidence_count" may be omitted from the "LOW" entry, in which
            case LOW is decided on composite_score alone.

    Returns:
        DataFrame with an added confidence_tier column (str) containing
        "HIGH", "MEDIUM" or "LOW", sorted by composite_score DESC then
        gene_id ASC. EXCLUDED genes are filtered out.

    Raises:
        KeyError: If a tier entry or a required threshold key is missing from
            the thresholds dict.

    Notes:
        - Uses vectorized polars expressions (not row-by-row iteration)
        - Genes with NULL composite_score are always EXCLUDED
        - Deterministic sorting for reproducibility
    """
    # Use provided thresholds or defaults
    t = thresholds if thresholds is not None else TIER_THRESHOLDS
    high, medium, low = t["HIGH"], t["MEDIUM"], t["LOW"]

    score = pl.col("composite_score")
    count = pl.col("evidence_count")

    is_high = (score >= high["composite_score"]) & (
        count >= high["evidence_count"]
    )
    is_medium = (score >= medium["composite_score"]) & (
        count >= medium["evidence_count"]
    )
    is_low = score >= low["composite_score"]

    # The LOW evidence_count threshold was previously configured but never
    # applied. Enforce it when present; stay score-only for threshold dicts
    # that omit it so existing callers keep working.
    low_count = low.get("evidence_count")
    if low_count is not None:
        is_low = is_low & (count >= low_count)

    # A NULL comparison result does not satisfy a `when` branch, so rows with a
    # NULL composite_score fall through every tier to EXCLUDED.
    tier = (
        pl.when(is_high)
        .then(pl.lit("HIGH"))
        .when(is_medium)
        .then(pl.lit("MEDIUM"))
        .when(is_low)
        .then(pl.lit("LOW"))
        .otherwise(pl.lit("EXCLUDED"))
        .alias("confidence_tier")
    )

    return (
        scored_df.with_columns(tier)
        # Filter out EXCLUDED genes
        .filter(pl.col("confidence_tier") != "EXCLUDED")
        # Sort deterministically: composite_score DESC, gene_id ASC
        .sort(["composite_score", "gene_id"], descending=[True, False])
    )