feat(05-01): implement tiering logic and evidence summary module
- Add confidence tier classification (HIGH/MEDIUM/LOW) based on composite_score and evidence_count
- Add supporting_layers and evidence_gaps columns per gene
- Use vectorized polars expressions for performance
- Configurable thresholds for tier assignment
src/usher_pipeline/output/__init__.py (new file, 14 lines)
@@ -0,0 +1,14 @@
"""Output generation: tiered candidate classification and dual-format file writing."""

from usher_pipeline.output.evidence_summary import EVIDENCE_LAYERS, add_evidence_summary
from usher_pipeline.output.tiers import TIER_THRESHOLDS, assign_tiers

# writers.py exports will be added in Task 2

__all__ = [
    "assign_tiers",
    "TIER_THRESHOLDS",
    "add_evidence_summary",
    "EVIDENCE_LAYERS",
    # "write_candidate_output" will be added in Task 2
]
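For orientation, a minimal sketch of how the two exports above might be composed downstream; build_candidates is a hypothetical helper written for illustration, not part of this commit.

import polars as pl

from usher_pipeline.output import add_evidence_summary, assign_tiers


def build_candidates(scored_genes: pl.DataFrame) -> pl.DataFrame:
    """Tier the scored genes, then annotate each kept row with its evidence summary."""
    tiered = assign_tiers(scored_genes)    # adds confidence_tier, drops EXCLUDED rows
    return add_evidence_summary(tiered)    # adds supporting_layers / evidence_gaps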
src/usher_pipeline/output/evidence_summary.py (new file, 82 lines)
@@ -0,0 +1,82 @@
"""Per-gene evidence summary: supporting layers and gaps."""

import polars as pl

# Six evidence layer names (must match column names in scored_genes)
EVIDENCE_LAYERS = [
    "gnomad",
    "expression",
    "annotation",
    "localization",
    "animal_model",
    "literature",
]


def add_evidence_summary(df: pl.DataFrame) -> pl.DataFrame:
    """
    Add supporting_layers and evidence_gaps columns to scored genes.

    For each gene, identifies which evidence layers contributed scores
    (supporting_layers) and which layers are missing (evidence_gaps).

    Args:
        df: Polars DataFrame with columns like:
            - gene_id, gene_symbol, composite_score, evidence_count
            - gnomad_score, expression_score, annotation_score, etc. (all nullable)

    Returns:
        DataFrame with two added columns:
            - supporting_layers (str): comma-separated list of layers with non-NULL scores
            - evidence_gaps (str): comma-separated list of layers with NULL scores

    Examples:
        - Gene with gnomad, expression, annotation scores:
            supporting_layers = "gnomad,expression,annotation"
            evidence_gaps = "localization,animal_model,literature"
        - Gene with all NULL scores:
            supporting_layers = ""
            evidence_gaps = "gnomad,expression,annotation,localization,animal_model,literature"

    Notes:
        - Uses polars expressions (no pandas conversion)
        - Empty string for supporting_layers if no evidence
        - Preserves DataFrame order and all other columns
    """
    # Build supporting_layers: comma-separated list of non-NULL layers
    # Strategy: create a list column, filter nulls, join to string
    supporting_exprs = []
    gap_exprs = []

    for layer in EVIDENCE_LAYERS:
        score_col = f"{layer}_score"

        # For supporting_layers: keep layer name if score is NOT NULL, else NULL
        supporting_exprs.append(
            pl.when(pl.col(score_col).is_not_null())
            .then(pl.lit(layer))
            .otherwise(pl.lit(None))
        )

        # For evidence_gaps: keep layer name if score IS NULL, else NULL
        gap_exprs.append(
            pl.when(pl.col(score_col).is_null())
            .then(pl.lit(layer))
            .otherwise(pl.lit(None))
        )

    # Combine into list columns, drop nulls, join with comma
    result = df.with_columns(
        # supporting_layers: join all non-NULL layer names
        pl.concat_list(supporting_exprs)
        .list.drop_nulls()
        .list.join(",")
        .alias("supporting_layers"),
        # evidence_gaps: join all NULL layer names
        pl.concat_list(gap_exprs)
        .list.drop_nulls()
        .list.join(",")
        .alias("evidence_gaps"),
    )

    return result
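A minimal sketch of this function on a toy frame, assuming only the six *_score columns are required (as the loop above suggests); the gene IDs are hypothetical and the expected strings follow the docstring examples.

import polars as pl

from usher_pipeline.output.evidence_summary import add_evidence_summary

toy = pl.DataFrame(
    {
        "gene_id": ["ENSG_A", "ENSG_B"],  # hypothetical IDs, for illustration only
        "gnomad_score": [0.9, None],
        "expression_score": [0.8, None],
        "annotation_score": [0.5, None],
        "localization_score": [None, None],
        "animal_model_score": [None, None],
        "literature_score": [None, None],
    },
    schema_overrides={  # give the all-null columns an explicit float dtype
        "localization_score": pl.Float64,
        "animal_model_score": pl.Float64,
        "literature_score": pl.Float64,
    },
)

out = add_evidence_summary(toy)
# Expected, per the docstring examples:
#   ENSG_A: supporting_layers = "gnomad,expression,annotation"
#           evidence_gaps     = "localization,animal_model,literature"
#   ENSG_B: supporting_layers = "" and evidence_gaps lists all six layers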
src/usher_pipeline/output/tiers.py (new file, 83 lines)
@@ -0,0 +1,83 @@
"""Confidence tiering logic for scored candidate genes."""

import polars as pl

# Default tier thresholds from research
TIER_THRESHOLDS = {
    "HIGH": {"composite_score": 0.7, "evidence_count": 3},
    "MEDIUM": {"composite_score": 0.4, "evidence_count": 2},
    "LOW": {"composite_score": 0.2, "evidence_count": 1},
}


def assign_tiers(
    scored_df: pl.DataFrame, thresholds: dict | None = None
) -> pl.DataFrame:
    """
    Assign confidence tiers to scored genes and filter out EXCLUDED genes.

    Uses configurable thresholds to classify genes into HIGH/MEDIUM/LOW tiers
    based on composite_score and evidence_count. Genes below the LOW threshold
    are marked as EXCLUDED and filtered out.

    Args:
        scored_df: Polars DataFrame with columns:
            - gene_id (str)
            - gene_symbol (str)
            - composite_score (float, nullable)
            - evidence_count (int)
            - quality_flag (str)
            - All 6 layer score columns (nullable)
            - All 6 contribution columns (nullable)
        thresholds: Optional dict overriding TIER_THRESHOLDS. Expected format:
            {
                "HIGH": {"composite_score": float, "evidence_count": int},
                "MEDIUM": {"composite_score": float, "evidence_count": int},
                "LOW": {"composite_score": float, "evidence_count": int},
            }

    Returns:
        DataFrame with added confidence_tier column (str), sorted by
        composite_score DESC, gene_id ASC. EXCLUDED genes are filtered out.

    Notes:
        - Uses vectorized polars expressions (not row-by-row iteration)
        - Genes with NULL composite_score are always EXCLUDED
        - Deterministic sorting for reproducibility
        - Filtering happens before return (EXCLUDED rows removed)
    """
    # Use provided thresholds or defaults
    t = thresholds if thresholds is not None else TIER_THRESHOLDS

    # Extract threshold values for readability
    high_score = t["HIGH"]["composite_score"]
    high_count = t["HIGH"]["evidence_count"]
    med_score = t["MEDIUM"]["composite_score"]
    med_count = t["MEDIUM"]["evidence_count"]
    low_score = t["LOW"]["composite_score"]

    # Add confidence_tier column using vectorized when/then/otherwise chain
    result = scored_df.with_columns(
        pl.when(
            (pl.col("composite_score") >= high_score)
            & (pl.col("evidence_count") >= high_count)
        )
        .then(pl.lit("HIGH"))
        .when(
            (pl.col("composite_score") >= med_score)
            & (pl.col("evidence_count") >= med_count)
        )
        .then(pl.lit("MEDIUM"))
        .when(pl.col("composite_score") >= low_score)
        .then(pl.lit("LOW"))
        .otherwise(pl.lit("EXCLUDED"))
        .alias("confidence_tier")
    )

    # Filter out EXCLUDED genes
    result = result.filter(pl.col("confidence_tier") != "EXCLUDED")

    # Sort deterministically: composite_score DESC, gene_id ASC
    result = result.sort(["composite_score", "gene_id"], descending=[True, False])

    return result
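A minimal sketch of tier assignment with the defaults and with an override, assuming a toy table that carries only the columns this function actually touches (the real scored table has the full column set listed in the docstring); the gene IDs and the stricter thresholds are hypothetical.

import polars as pl

from usher_pipeline.output.tiers import assign_tiers

scored = pl.DataFrame(
    {
        "gene_id": ["ENSG_01", "ENSG_02", "ENSG_03", "ENSG_04"],  # hypothetical IDs
        "composite_score": [0.85, 0.55, 0.25, None],
        "evidence_count": [4, 2, 1, 0],
    }
)

# With TIER_THRESHOLDS: ENSG_01 -> HIGH, ENSG_02 -> MEDIUM, ENSG_03 -> LOW,
# ENSG_04 -> EXCLUDED (NULL composite_score) and therefore dropped.
tiered = assign_tiers(scored)

# Stricter, hypothetical thresholds in the same format as TIER_THRESHOLDS.
strict = {
    "HIGH": {"composite_score": 0.9, "evidence_count": 4},
    "MEDIUM": {"composite_score": 0.6, "evidence_count": 3},
    "LOW": {"composite_score": 0.3, "evidence_count": 1},
}
tiered_strict = assign_tiers(scored, thresholds=strict)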