feat(04-02): implement positive control validation
- Create validation.py with known gene ranking validation - validate_known_gene_ranking: PERCENT_RANK window function over all genes - Computes median percentile, top quartile count/fraction for known genes - generate_validation_report: human-readable text output with formatted table - Update __init__.py to export run_qc_checks, validate_known_gene_ranking, generate_validation_report
This commit is contained in:
@@ -11,6 +11,13 @@ from usher_pipeline.scoring.integration import (
|
|||||||
compute_composite_scores,
|
compute_composite_scores,
|
||||||
persist_scored_genes,
|
persist_scored_genes,
|
||||||
)
|
)
|
||||||
|
from usher_pipeline.scoring.quality_control import (
|
||||||
|
run_qc_checks,
|
||||||
|
)
|
||||||
|
from usher_pipeline.scoring.validation import (
|
||||||
|
validate_known_gene_ranking,
|
||||||
|
generate_validation_report,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"OMIM_USHER_GENES",
|
"OMIM_USHER_GENES",
|
||||||
@@ -20,4 +27,7 @@ __all__ = [
|
|||||||
"join_evidence_layers",
|
"join_evidence_layers",
|
||||||
"compute_composite_scores",
|
"compute_composite_scores",
|
||||||
"persist_scored_genes",
|
"persist_scored_genes",
|
||||||
|
"run_qc_checks",
|
||||||
|
"validate_known_gene_ranking",
|
||||||
|
"generate_validation_report",
|
||||||
]
|
]
|
||||||
|
|||||||
227
src/usher_pipeline/scoring/validation.py
Normal file
227
src/usher_pipeline/scoring/validation.py
Normal file
@@ -0,0 +1,227 @@
|
|||||||
|
"""Positive control validation against known gene rankings."""
|
||||||
|
|
||||||
|
import duckdb
|
||||||
|
import polars as pl
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from usher_pipeline.persistence.duckdb_store import PipelineStore
|
||||||
|
from usher_pipeline.scoring.known_genes import compile_known_genes
|
||||||
|
|
||||||
|
logger = structlog.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_known_gene_ranking(
|
||||||
|
store: PipelineStore,
|
||||||
|
percentile_threshold: float = 0.75
|
||||||
|
) -> dict:
|
||||||
|
"""
|
||||||
|
Validate that known cilia/Usher genes rank highly in composite scores.
|
||||||
|
|
||||||
|
Computes percentile ranks for known genes using PERCENT_RANK window function
|
||||||
|
across ALL genes (not just known genes). Validates that known genes rank in
|
||||||
|
top quartile before exclusion filtering.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
store: PipelineStore with scored_genes table
|
||||||
|
percentile_threshold: Minimum median percentile for validation (default 0.75)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with keys:
|
||||||
|
- total_known_expected: int - count of known genes in reference list
|
||||||
|
- total_known_in_dataset: int - count of known genes found in scored_genes
|
||||||
|
- median_percentile: float - median percentile rank of known genes
|
||||||
|
- top_quartile_count: int - count of known genes >= 75th percentile
|
||||||
|
- top_quartile_fraction: float - fraction in top quartile
|
||||||
|
- validation_passed: bool - True if median >= percentile_threshold
|
||||||
|
- known_gene_details: list[dict] - top 20 known genes with scores and ranks
|
||||||
|
- reason: str - explanation if validation failed (optional)
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Uses PERCENT_RANK() which returns 0.0 (lowest) to 1.0 (highest)
|
||||||
|
- Ranks computed BEFORE known gene exclusion (validates scoring system)
|
||||||
|
- Genes without composite_score (NULL) are excluded from ranking
|
||||||
|
- Creates temporary table _known_genes for the join
|
||||||
|
"""
|
||||||
|
logger.info("validate_known_gene_ranking_start", threshold=percentile_threshold)
|
||||||
|
|
||||||
|
# Compile known genes
|
||||||
|
known_df = compile_known_genes()
|
||||||
|
total_known_expected = known_df["gene_symbol"].n_unique()
|
||||||
|
|
||||||
|
# Register known genes as temporary DuckDB table
|
||||||
|
store.conn.execute("DROP TABLE IF EXISTS _known_genes")
|
||||||
|
store.conn.execute("CREATE TEMP TABLE _known_genes AS SELECT * FROM known_df")
|
||||||
|
|
||||||
|
# Query to compute percentile ranks for known genes
|
||||||
|
query = """
|
||||||
|
WITH ranked_genes AS (
|
||||||
|
SELECT
|
||||||
|
gene_symbol,
|
||||||
|
composite_score,
|
||||||
|
PERCENT_RANK() OVER (ORDER BY composite_score) AS percentile_rank
|
||||||
|
FROM scored_genes
|
||||||
|
WHERE composite_score IS NOT NULL
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
rg.gene_symbol,
|
||||||
|
rg.composite_score,
|
||||||
|
rg.percentile_rank,
|
||||||
|
kg.source
|
||||||
|
FROM ranked_genes rg
|
||||||
|
INNER JOIN _known_genes kg ON rg.gene_symbol = kg.gene_symbol
|
||||||
|
ORDER BY rg.percentile_rank DESC
|
||||||
|
"""
|
||||||
|
|
||||||
|
result = store.conn.execute(query).pl()
|
||||||
|
|
||||||
|
# Clean up temp table
|
||||||
|
store.conn.execute("DROP TABLE IF EXISTS _known_genes")
|
||||||
|
|
||||||
|
# If no known genes found, return failure
|
||||||
|
if result.height == 0:
|
||||||
|
logger.error(
|
||||||
|
"validate_known_gene_ranking_failed",
|
||||||
|
reason="no_known_genes_found",
|
||||||
|
expected=total_known_expected,
|
||||||
|
found=0,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"total_known_expected": total_known_expected,
|
||||||
|
"total_known_in_dataset": 0,
|
||||||
|
"median_percentile": None,
|
||||||
|
"top_quartile_count": 0,
|
||||||
|
"top_quartile_fraction": 0.0,
|
||||||
|
"validation_passed": False,
|
||||||
|
"known_gene_details": [],
|
||||||
|
"reason": "no_known_genes_found",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Compute validation metrics
|
||||||
|
total_known_in_dataset = result.height
|
||||||
|
percentiles = result["percentile_rank"].to_numpy()
|
||||||
|
median_percentile = float(result["percentile_rank"].median())
|
||||||
|
|
||||||
|
top_quartile_genes = result.filter(pl.col("percentile_rank") >= 0.75)
|
||||||
|
top_quartile_count = top_quartile_genes.height
|
||||||
|
top_quartile_fraction = top_quartile_count / total_known_in_dataset
|
||||||
|
|
||||||
|
validation_passed = median_percentile >= percentile_threshold
|
||||||
|
|
||||||
|
# Extract top 20 known genes for reporting
|
||||||
|
known_gene_details = result.head(20).select([
|
||||||
|
"gene_symbol",
|
||||||
|
"composite_score",
|
||||||
|
"percentile_rank",
|
||||||
|
"source"
|
||||||
|
]).to_dicts()
|
||||||
|
|
||||||
|
# Log validation results
|
||||||
|
if validation_passed:
|
||||||
|
logger.info(
|
||||||
|
"validate_known_gene_ranking_passed",
|
||||||
|
total_expected=total_known_expected,
|
||||||
|
total_found=total_known_in_dataset,
|
||||||
|
median_percentile=f"{median_percentile:.4f}",
|
||||||
|
top_quartile_count=top_quartile_count,
|
||||||
|
top_quartile_fraction=f"{top_quartile_fraction:.2%}",
|
||||||
|
threshold=percentile_threshold,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"validate_known_gene_ranking_failed",
|
||||||
|
reason="median_below_threshold",
|
||||||
|
median_percentile=f"{median_percentile:.4f}",
|
||||||
|
threshold=percentile_threshold,
|
||||||
|
top_quartile_fraction=f"{top_quartile_fraction:.2%}",
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"total_known_expected": total_known_expected,
|
||||||
|
"total_known_in_dataset": total_known_in_dataset,
|
||||||
|
"median_percentile": median_percentile,
|
||||||
|
"top_quartile_count": top_quartile_count,
|
||||||
|
"top_quartile_fraction": top_quartile_fraction,
|
||||||
|
"validation_passed": validation_passed,
|
||||||
|
"known_gene_details": known_gene_details,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def generate_validation_report(metrics: dict) -> str:
|
||||||
|
"""
|
||||||
|
Generate human-readable validation report.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
metrics: Dict returned from validate_known_gene_ranking()
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Multi-line text report summarizing validation results
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Formats percentiles as percentages (e.g., "87.3%")
|
||||||
|
- Includes table of top-ranked known genes
|
||||||
|
- Shows pass/fail status prominently
|
||||||
|
"""
|
||||||
|
passed = metrics["validation_passed"]
|
||||||
|
status = "PASSED ✓" if passed else "FAILED ✗"
|
||||||
|
|
||||||
|
# Handle case where no known genes found
|
||||||
|
if metrics["total_known_in_dataset"] == 0:
|
||||||
|
return f"""
|
||||||
|
Positive Control Validation: {status}
|
||||||
|
|
||||||
|
Reason: No known genes found in scored dataset
|
||||||
|
Expected: {metrics['total_known_expected']} known genes
|
||||||
|
Found: 0 genes
|
||||||
|
|
||||||
|
This indicates either:
|
||||||
|
1. Known genes were already filtered out
|
||||||
|
2. Gene symbol mismatch between known list and scored_genes
|
||||||
|
3. No genes have composite scores yet
|
||||||
|
"""
|
||||||
|
|
||||||
|
median_pct = metrics["median_percentile"] * 100
|
||||||
|
top_q_frac = metrics["top_quartile_fraction"] * 100
|
||||||
|
|
||||||
|
report = [
|
||||||
|
f"Positive Control Validation: {status}",
|
||||||
|
"",
|
||||||
|
"Summary:",
|
||||||
|
f" Known genes expected: {metrics['total_known_expected']}",
|
||||||
|
f" Known genes found: {metrics['total_known_in_dataset']}",
|
||||||
|
f" Median percentile: {median_pct:.1f}%",
|
||||||
|
f" Top quartile count: {metrics['top_quartile_count']}",
|
||||||
|
f" Top quartile fraction: {top_q_frac:.1f}%",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add interpretation
|
||||||
|
if passed:
|
||||||
|
report.append(
|
||||||
|
f"Known cilia/Usher genes rank highly (median >= 75th percentile), "
|
||||||
|
"validating the scoring system."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
report.append(
|
||||||
|
f"Warning: Known genes rank below expected threshold. "
|
||||||
|
f"Median percentile ({median_pct:.1f}%) < 75.0%."
|
||||||
|
)
|
||||||
|
report.append(
|
||||||
|
"This may indicate issues with evidence layer weights or data quality."
|
||||||
|
)
|
||||||
|
|
||||||
|
report.append("")
|
||||||
|
report.append("Top-Ranked Known Genes:")
|
||||||
|
report.append("-" * 80)
|
||||||
|
report.append(f"{'Gene':<12} {'Score':>8} {'Percentile':>12} {'Source':<20}")
|
||||||
|
report.append("-" * 80)
|
||||||
|
|
||||||
|
for gene in metrics["known_gene_details"]:
|
||||||
|
gene_symbol = gene["gene_symbol"]
|
||||||
|
score = gene["composite_score"]
|
||||||
|
percentile = gene["percentile_rank"] * 100
|
||||||
|
source = gene["source"]
|
||||||
|
report.append(
|
||||||
|
f"{gene_symbol:<12} {score:>8.4f} {percentile:>11.1f}% {source:<20}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return "\n".join(report)
|
||||||
Reference in New Issue
Block a user