feat(04-01): implement multi-evidence weighted scoring integration
- Create join_evidence_layers() with LEFT JOIN preserving NULLs from all 6 evidence tables - Implement compute_composite_scores() with NULL-preserving weighted average (weighted_sum / available_weight) - Add quality_flag classification based on evidence_count (sufficient/moderate/sparse/no_evidence) - Include per-layer contribution columns for explainability - Add persist_scored_genes() to save scored_genes table to DuckDB - Log summary stats: coverage, mean/median scores, quality distribution, NULL rates
This commit is contained in:
300
src/usher_pipeline/scoring/integration.py
Normal file
300
src/usher_pipeline/scoring/integration.py
Normal file
@@ -0,0 +1,300 @@
|
|||||||
|
"""Multi-evidence weighted scoring integration with NULL preservation."""
|
||||||
|
|
||||||
|
import duckdb
|
||||||
|
import polars as pl
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from usher_pipeline.config.schema import ScoringWeights
|
||||||
|
from usher_pipeline.persistence.duckdb_store import PipelineStore
|
||||||
|
|
||||||
|
logger = structlog.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def join_evidence_layers(store: PipelineStore) -> pl.DataFrame:
|
||||||
|
"""
|
||||||
|
Join gene_universe with all 6 evidence tables on gene_id.
|
||||||
|
|
||||||
|
Performs LEFT JOIN to preserve all genes from gene_universe, even those
|
||||||
|
without evidence in some or all layers. NULL scores indicate missing
|
||||||
|
evidence, which is semantically distinct from zero scores.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
store: PipelineStore with database connection
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DataFrame with columns:
|
||||||
|
- gene_id (str)
|
||||||
|
- gene_symbol (str)
|
||||||
|
- gnomad_score (float, nullable)
|
||||||
|
- expression_score (float, nullable)
|
||||||
|
- annotation_score (float, nullable)
|
||||||
|
- localization_score (float, nullable)
|
||||||
|
- animal_model_score (float, nullable)
|
||||||
|
- literature_score (float, nullable)
|
||||||
|
- evidence_count (int): count of non-NULL scores
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Uses LEFT JOIN pattern to preserve NULLs
|
||||||
|
- evidence_count = sum of non-NULL layers (0-6)
|
||||||
|
"""
|
||||||
|
query = """
|
||||||
|
SELECT
|
||||||
|
g.gene_id,
|
||||||
|
g.gene_symbol,
|
||||||
|
gnomad.loeuf_normalized AS gnomad_score,
|
||||||
|
expr.expression_score_normalized AS expression_score,
|
||||||
|
annot.annotation_score_normalized AS annotation_score,
|
||||||
|
loc.localization_score_normalized AS localization_score,
|
||||||
|
animal.animal_model_score_normalized AS animal_model_score,
|
||||||
|
lit.literature_score_normalized AS literature_score,
|
||||||
|
(
|
||||||
|
CASE WHEN gnomad.loeuf_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN expr.expression_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN annot.annotation_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END
|
||||||
|
) AS evidence_count
|
||||||
|
FROM gene_universe g
|
||||||
|
LEFT JOIN gnomad_constraint gnomad ON g.gene_id = gnomad.gene_id
|
||||||
|
LEFT JOIN tissue_expression expr ON g.gene_id = expr.gene_id
|
||||||
|
LEFT JOIN annotation_completeness annot ON g.gene_id = annot.gene_id
|
||||||
|
LEFT JOIN subcellular_localization loc ON g.gene_id = loc.gene_id
|
||||||
|
LEFT JOIN animal_model_phenotypes animal ON g.gene_id = animal.gene_id
|
||||||
|
LEFT JOIN literature_evidence lit ON g.gene_id = lit.gene_id
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Execute query and convert to polars
|
||||||
|
result = store.conn.execute(query).pl()
|
||||||
|
|
||||||
|
# Log summary statistics
|
||||||
|
total_genes = result.height
|
||||||
|
mean_evidence = result["evidence_count"].mean()
|
||||||
|
|
||||||
|
# Calculate NULL rates per layer
|
||||||
|
null_rates = {
|
||||||
|
"gnomad": result["gnomad_score"].null_count() / total_genes,
|
||||||
|
"expression": result["expression_score"].null_count() / total_genes,
|
||||||
|
"annotation": result["annotation_score"].null_count() / total_genes,
|
||||||
|
"localization": result["localization_score"].null_count() / total_genes,
|
||||||
|
"animal_model": result["animal_model_score"].null_count() / total_genes,
|
||||||
|
"literature": result["literature_score"].null_count() / total_genes,
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"join_evidence_layers_complete",
|
||||||
|
total_genes=total_genes,
|
||||||
|
mean_evidence_count=f"{mean_evidence:.2f}",
|
||||||
|
null_rates={k: f"{v:.2%}" for k, v in null_rates.items()},
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def compute_composite_scores(store: PipelineStore, weights: ScoringWeights) -> pl.DataFrame:
|
||||||
|
"""
|
||||||
|
Compute weighted composite scores from multiple evidence layers.
|
||||||
|
|
||||||
|
Uses NULL-preserving weighted average: only available evidence contributes
|
||||||
|
to the composite score. Genes without any evidence receive NULL (not zero).
|
||||||
|
|
||||||
|
Formula:
|
||||||
|
composite_score = weighted_sum / available_weight
|
||||||
|
where:
|
||||||
|
weighted_sum = sum(score_i * weight_i) for non-NULL scores
|
||||||
|
available_weight = sum(weight_i) for non-NULL scores
|
||||||
|
|
||||||
|
Args:
|
||||||
|
store: PipelineStore with database connection
|
||||||
|
weights: ScoringWeights instance (must sum to 1.0)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DataFrame with columns:
|
||||||
|
- gene_id, gene_symbol
|
||||||
|
- All 6 individual layer scores (nullable)
|
||||||
|
- composite_score (float, nullable)
|
||||||
|
- quality_flag (str): sufficient/moderate/sparse/no_evidence
|
||||||
|
- Per-layer contribution columns (score * weight, nullable)
|
||||||
|
- evidence_count (int): count of non-NULL scores
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If weights do not sum to 1.0
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Validates weight sum before computing scores
|
||||||
|
- NULL scores do NOT contribute to weighted average
|
||||||
|
- quality_flag based on evidence_count thresholds
|
||||||
|
- Ordered by composite_score DESC NULLS LAST
|
||||||
|
"""
|
||||||
|
# Validate weights first
|
||||||
|
weights.validate_sum()
|
||||||
|
|
||||||
|
query = f"""
|
||||||
|
SELECT
|
||||||
|
g.gene_id,
|
||||||
|
g.gene_symbol,
|
||||||
|
gnomad.loeuf_normalized AS gnomad_score,
|
||||||
|
expr.expression_score_normalized AS expression_score,
|
||||||
|
annot.annotation_score_normalized AS annotation_score,
|
||||||
|
loc.localization_score_normalized AS localization_score,
|
||||||
|
animal.animal_model_score_normalized AS animal_model_score,
|
||||||
|
lit.literature_score_normalized AS literature_score,
|
||||||
|
(
|
||||||
|
CASE WHEN gnomad.loeuf_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN expr.expression_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN annot.annotation_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END
|
||||||
|
) AS evidence_count,
|
||||||
|
(
|
||||||
|
CASE WHEN gnomad.loeuf_normalized IS NOT NULL THEN {weights.gnomad} ELSE 0 END +
|
||||||
|
CASE WHEN expr.expression_score_normalized IS NOT NULL THEN {weights.expression} ELSE 0 END +
|
||||||
|
CASE WHEN annot.annotation_score_normalized IS NOT NULL THEN {weights.annotation} ELSE 0 END +
|
||||||
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN {weights.localization} ELSE 0 END +
|
||||||
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN {weights.animal_model} ELSE 0 END +
|
||||||
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN {weights.literature} ELSE 0 END
|
||||||
|
) AS available_weight,
|
||||||
|
(
|
||||||
|
COALESCE(gnomad.loeuf_normalized * {weights.gnomad}, 0) +
|
||||||
|
COALESCE(expr.expression_score_normalized * {weights.expression}, 0) +
|
||||||
|
COALESCE(annot.annotation_score_normalized * {weights.annotation}, 0) +
|
||||||
|
COALESCE(loc.localization_score_normalized * {weights.localization}, 0) +
|
||||||
|
COALESCE(animal.animal_model_score_normalized * {weights.animal_model}, 0) +
|
||||||
|
COALESCE(lit.literature_score_normalized * {weights.literature}, 0)
|
||||||
|
) AS weighted_sum,
|
||||||
|
CASE
|
||||||
|
WHEN (
|
||||||
|
CASE WHEN gnomad.loeuf_normalized IS NOT NULL THEN {weights.gnomad} ELSE 0 END +
|
||||||
|
CASE WHEN expr.expression_score_normalized IS NOT NULL THEN {weights.expression} ELSE 0 END +
|
||||||
|
CASE WHEN annot.annotation_score_normalized IS NOT NULL THEN {weights.annotation} ELSE 0 END +
|
||||||
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN {weights.localization} ELSE 0 END +
|
||||||
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN {weights.animal_model} ELSE 0 END +
|
||||||
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN {weights.literature} ELSE 0 END
|
||||||
|
) > 0 THEN (
|
||||||
|
COALESCE(gnomad.loeuf_normalized * {weights.gnomad}, 0) +
|
||||||
|
COALESCE(expr.expression_score_normalized * {weights.expression}, 0) +
|
||||||
|
COALESCE(annot.annotation_score_normalized * {weights.annotation}, 0) +
|
||||||
|
COALESCE(loc.localization_score_normalized * {weights.localization}, 0) +
|
||||||
|
COALESCE(animal.animal_model_score_normalized * {weights.animal_model}, 0) +
|
||||||
|
COALESCE(lit.literature_score_normalized * {weights.literature}, 0)
|
||||||
|
) / (
|
||||||
|
CASE WHEN gnomad.loeuf_normalized IS NOT NULL THEN {weights.gnomad} ELSE 0 END +
|
||||||
|
CASE WHEN expr.expression_score_normalized IS NOT NULL THEN {weights.expression} ELSE 0 END +
|
||||||
|
CASE WHEN annot.annotation_score_normalized IS NOT NULL THEN {weights.annotation} ELSE 0 END +
|
||||||
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN {weights.localization} ELSE 0 END +
|
||||||
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN {weights.animal_model} ELSE 0 END +
|
||||||
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN {weights.literature} ELSE 0 END
|
||||||
|
)
|
||||||
|
ELSE NULL
|
||||||
|
END AS composite_score,
|
||||||
|
CASE
|
||||||
|
WHEN (
|
||||||
|
CASE WHEN gnomad.loeuf_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN expr.expression_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN annot.annotation_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END
|
||||||
|
) >= 4 THEN 'sufficient_evidence'
|
||||||
|
WHEN (
|
||||||
|
CASE WHEN gnomad.loeuf_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN expr.expression_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN annot.annotation_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END
|
||||||
|
) >= 2 THEN 'moderate_evidence'
|
||||||
|
WHEN (
|
||||||
|
CASE WHEN gnomad.loeuf_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN expr.expression_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN annot.annotation_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END
|
||||||
|
) >= 1 THEN 'sparse_evidence'
|
||||||
|
ELSE 'no_evidence'
|
||||||
|
END AS quality_flag,
|
||||||
|
-- Per-layer contributions (NULL if score is NULL)
|
||||||
|
CASE WHEN gnomad.loeuf_normalized IS NOT NULL THEN gnomad.loeuf_normalized * {weights.gnomad} ELSE NULL END AS gnomad_contribution,
|
||||||
|
CASE WHEN expr.expression_score_normalized IS NOT NULL THEN expr.expression_score_normalized * {weights.expression} ELSE NULL END AS expression_contribution,
|
||||||
|
CASE WHEN annot.annotation_score_normalized IS NOT NULL THEN annot.annotation_score_normalized * {weights.annotation} ELSE NULL END AS annotation_contribution,
|
||||||
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN loc.localization_score_normalized * {weights.localization} ELSE NULL END AS localization_contribution,
|
||||||
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN animal.animal_model_score_normalized * {weights.animal_model} ELSE NULL END AS animal_model_contribution,
|
||||||
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN lit.literature_score_normalized * {weights.literature} ELSE NULL END AS literature_contribution
|
||||||
|
FROM gene_universe g
|
||||||
|
LEFT JOIN gnomad_constraint gnomad ON g.gene_id = gnomad.gene_id
|
||||||
|
LEFT JOIN tissue_expression expr ON g.gene_id = expr.gene_id
|
||||||
|
LEFT JOIN annotation_completeness annot ON g.gene_id = annot.gene_id
|
||||||
|
LEFT JOIN subcellular_localization loc ON g.gene_id = loc.gene_id
|
||||||
|
LEFT JOIN animal_model_phenotypes animal ON g.gene_id = animal.gene_id
|
||||||
|
LEFT JOIN literature_evidence lit ON g.gene_id = lit.gene_id
|
||||||
|
ORDER BY composite_score DESC NULLS LAST
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Execute query and convert to polars
|
||||||
|
result = store.conn.execute(query).pl()
|
||||||
|
|
||||||
|
# Log summary statistics
|
||||||
|
total_genes = result.height
|
||||||
|
genes_with_score = result.filter(pl.col("composite_score").is_not_null()).height
|
||||||
|
|
||||||
|
# Calculate mean/median for non-NULL scores
|
||||||
|
non_null_scores = result.filter(pl.col("composite_score").is_not_null())["composite_score"]
|
||||||
|
mean_score = non_null_scores.mean() if len(non_null_scores) > 0 else None
|
||||||
|
median_score = non_null_scores.median() if len(non_null_scores) > 0 else None
|
||||||
|
|
||||||
|
# Quality flag distribution
|
||||||
|
quality_dist = result.group_by("quality_flag").agg(pl.count()).sort("quality_flag")
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"compute_composite_scores_complete",
|
||||||
|
total_genes=total_genes,
|
||||||
|
genes_with_score=genes_with_score,
|
||||||
|
coverage_pct=f"{genes_with_score / total_genes * 100:.1f}%",
|
||||||
|
mean_score=f"{mean_score:.4f}" if mean_score is not None else "N/A",
|
||||||
|
median_score=f"{median_score:.4f}" if median_score is not None else "N/A",
|
||||||
|
quality_distribution=quality_dist.to_dicts(),
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def persist_scored_genes(store: PipelineStore, scored_df: pl.DataFrame, weights: ScoringWeights) -> None:
|
||||||
|
"""
|
||||||
|
Persist scored genes DataFrame to DuckDB.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
store: PipelineStore for database access
|
||||||
|
scored_df: DataFrame from compute_composite_scores()
|
||||||
|
weights: ScoringWeights used for scoring (logged in metadata)
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Saves to table: scored_genes
|
||||||
|
- Replaces existing table if present
|
||||||
|
- Logs quality flag distribution and row count
|
||||||
|
"""
|
||||||
|
# Save to DuckDB
|
||||||
|
store.save_dataframe(
|
||||||
|
df=scored_df,
|
||||||
|
table_name="scored_genes",
|
||||||
|
description="Multi-evidence weighted composite scores with per-layer contributions",
|
||||||
|
replace=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Log quality flag distribution
|
||||||
|
quality_dist = scored_df.group_by("quality_flag").agg(pl.count()).sort("quality_flag")
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"persist_scored_genes_complete",
|
||||||
|
row_count=scored_df.height,
|
||||||
|
quality_distribution=quality_dist.to_dicts(),
|
||||||
|
weights={
|
||||||
|
"gnomad": weights.gnomad,
|
||||||
|
"expression": weights.expression,
|
||||||
|
"annotation": weights.annotation,
|
||||||
|
"localization": weights.localization,
|
||||||
|
"animal_model": weights.animal_model,
|
||||||
|
"literature": weights.literature,
|
||||||
|
},
|
||||||
|
)
|
||||||
Reference in New Issue
Block a user