feat(05-01): implement tiering logic and evidence summary module
- Add confidence tier classification (HIGH/MEDIUM/LOW) based on composite_score and evidence_count
- Add supporting_layers and evidence_gaps columns per gene
- Use vectorized polars expressions for performance
- Configurable thresholds for tier assignment
This commit is contained in:
14
src/usher_pipeline/output/__init__.py
Normal file
14
src/usher_pipeline/output/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
"""Output generation: tiered candidate classification and dual-format file writing."""
|
||||||
|
|
||||||
|
from usher_pipeline.output.evidence_summary import EVIDENCE_LAYERS, add_evidence_summary
|
||||||
|
from usher_pipeline.output.tiers import TIER_THRESHOLDS, assign_tiers
|
||||||
|
|
||||||
|
# writers.py exports will be added in Task 2
|
||||||
|
|
||||||
|
# Public API of the output package; names are re-exported from the
# tiers and evidence_summary submodules imported above.
__all__ = [
    "assign_tiers",
    "TIER_THRESHOLDS",
    "add_evidence_summary",
    "EVIDENCE_LAYERS",
    # "write_candidate_output" will be added in Task 2
]
|
||||||
82
src/usher_pipeline/output/evidence_summary.py
Normal file
82
src/usher_pipeline/output/evidence_summary.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
"""Per-gene evidence summary: supporting layers and gaps."""
|
||||||
|
|
||||||
|
import polars as pl
|
||||||
|
|
||||||
|
# The six evidence layers, in reporting order. Each name is the prefix of
# the "<layer>_score" column that add_evidence_summary reads, so the names
# must match the score columns of the scored-genes table.
EVIDENCE_LAYERS = [
    "gnomad", "expression", "annotation",
    "localization", "animal_model", "literature",
]
|
||||||
|
|
||||||
|
|
||||||
|
def add_evidence_summary(df: pl.DataFrame) -> pl.DataFrame:
    """
    Annotate each gene with the evidence layers it has and the ones it lacks.

    For every layer in EVIDENCE_LAYERS the "<layer>_score" column is checked:
    a non-NULL score counts the layer as supporting, a NULL score counts it
    as a gap. Layer names appear in EVIDENCE_LAYERS order.

    Args:
        df: Polars DataFrame containing one "<layer>_score" column per entry
            of EVIDENCE_LAYERS (gnomad_score, expression_score, ...). The
            score columns may contain NULLs. Any other columns are carried
            through untouched.

    Returns:
        The input DataFrame with two extra string columns:
            - supporting_layers: comma-separated layers with non-NULL scores
            - evidence_gaps: comma-separated layers with NULL scores

    Examples:
        - gnomad, expression and annotation scored, the rest NULL:
            supporting_layers = "gnomad,expression,annotation"
            evidence_gaps = "localization,animal_model,literature"
        - every score NULL:
            supporting_layers = ""
            evidence_gaps = "gnomad,expression,annotation,localization,animal_model,literature"

    Notes:
        - Implemented purely with polars expressions (no pandas round-trip).
        - A gene without any evidence gets an empty supporting_layers string.
        - Row order and all existing columns are preserved.
    """
    score_columns = {name: pl.col(f"{name}_score") for name in EVIDENCE_LAYERS}

    # One expression per layer: the layer name when the condition holds,
    # NULL otherwise, so that NULLs can be dropped after concatenation.
    present_names = [
        pl.when(column.is_not_null()).then(pl.lit(name)).otherwise(pl.lit(None))
        for name, column in score_columns.items()
    ]
    missing_names = [
        pl.when(column.is_null()).then(pl.lit(name)).otherwise(pl.lit(None))
        for name, column in score_columns.items()
    ]

    def _comma_joined(name_exprs: list, column_name: str) -> pl.Expr:
        # Gather per-layer names into a list, discard NULLs, join with commas.
        gathered = pl.concat_list(name_exprs)
        return gathered.list.drop_nulls().list.join(",").alias(column_name)

    return df.with_columns(
        _comma_joined(present_names, "supporting_layers"),
        _comma_joined(missing_names, "evidence_gaps"),
    )
|
||||||
83
src/usher_pipeline/output/tiers.py
Normal file
83
src/usher_pipeline/output/tiers.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
"""Confidence tiering logic for scored candidate genes."""
|
||||||
|
|
||||||
|
import polars as pl
|
||||||
|
|
||||||
|
# Default tier thresholds (from research), keyed by tier name. Each entry
# holds the composite_score and evidence_count cut-offs for that tier.
TIER_THRESHOLDS = {
    tier: {"composite_score": score_cutoff, "evidence_count": count_cutoff}
    for tier, score_cutoff, count_cutoff in (
        ("HIGH", 0.7, 3),
        ("MEDIUM", 0.4, 2),
        ("LOW", 0.2, 1),
    )
}
|
||||||
|
|
||||||
|
|
||||||
|
def assign_tiers(
    scored_df: pl.DataFrame, thresholds: dict | None = None
) -> pl.DataFrame:
    """
    Assign confidence tiers to scored genes and filter out EXCLUDED genes.

    Each gene is classified as HIGH, MEDIUM or LOW when it meets both the
    composite_score and the evidence_count minimum of that tier; tiers are
    tested from HIGH down to LOW and the first match wins. Genes matching
    no tier are labelled EXCLUDED and removed from the result.

    Args:
        scored_df: Polars DataFrame with at least these columns:
            - gene_id (str)
            - composite_score (float, nullable)
            - evidence_count (int)
            Any other columns (gene_symbol, quality_flag, layer scores,
            contributions, ...) are carried through untouched.
        thresholds: Optional dict used instead of TIER_THRESHOLDS.
            Expected format:
                {
                    "HIGH": {"composite_score": float, "evidence_count": int},
                    "MEDIUM": {"composite_score": float, "evidence_count": int},
                    "LOW": {"composite_score": float, "evidence_count": int},
                }
            "evidence_count" may be omitted for LOW, in which case the LOW
            tier places no constraint on evidence_count.

    Returns:
        DataFrame with an added confidence_tier column (str), sorted by
        composite_score DESC, gene_id ASC. EXCLUDED genes are filtered out.

    Raises:
        KeyError: If a required tier or threshold key is missing from
            the thresholds dict.

    Notes:
        - Uses vectorized polars expressions (not row-by-row iteration).
        - Genes with NULL composite_score are always EXCLUDED, because a
          comparison against NULL never satisfies a tier condition.
        - The LOW tier honours its evidence_count threshold like the other
          tiers; it is no longer silently ignored.
        - Deterministic sorting for reproducibility.
    """
    t = thresholds if thresholds is not None else TIER_THRESHOLDS

    high_score = t["HIGH"]["composite_score"]
    high_count = t["HIGH"]["evidence_count"]
    med_score = t["MEDIUM"]["composite_score"]
    med_count = t["MEDIUM"]["evidence_count"]
    low_score = t["LOW"]["composite_score"]
    # Older override dicts may omit the LOW evidence_count, since it used
    # to be ignored; fall back to 0 (no constraint) so they keep working.
    low_count = t["LOW"].get("evidence_count", 0)

    score = pl.col("composite_score")
    count = pl.col("evidence_count")

    # when/then chain is evaluated top-down, so the strictest tier comes first.
    tier_expr = (
        pl.when((score >= high_score) & (count >= high_count))
        .then(pl.lit("HIGH"))
        .when((score >= med_score) & (count >= med_count))
        .then(pl.lit("MEDIUM"))
        .when((score >= low_score) & (count >= low_count))
        .then(pl.lit("LOW"))
        .otherwise(pl.lit("EXCLUDED"))
        .alias("confidence_tier")
    )

    result = scored_df.with_columns(tier_expr)

    # Drop genes that did not qualify for any tier.
    result = result.filter(pl.col("confidence_tier") != "EXCLUDED")

    # Sort deterministically: composite_score DESC, gene_id ASC.
    return result.sort(["composite_score", "gene_id"], descending=[True, False])
|
||||||
Reference in New Issue
Block a user