feat(04-02): implement positive control validation

- Create validation.py with known gene ranking validation
- validate_known_gene_ranking: PERCENT_RANK window function over all genes
- Computes median percentile, top quartile count/fraction for known genes
- generate_validation_report: human-readable text output with formatted table
- Update __init__.py to export run_qc_checks, validate_known_gene_ranking, generate_validation_report
This commit is contained in:
2026-02-11 20:47:59 +08:00
parent ba2f97ac55
commit 70a5d6eff8
2 changed files with 237 additions and 0 deletions

View File

@@ -11,6 +11,13 @@ from usher_pipeline.scoring.integration import (
compute_composite_scores,
persist_scored_genes,
)
from usher_pipeline.scoring.quality_control import (
run_qc_checks,
)
from usher_pipeline.scoring.validation import (
validate_known_gene_ranking,
generate_validation_report,
)
__all__ = [
"OMIM_USHER_GENES",
@@ -20,4 +27,7 @@ __all__ = [
"join_evidence_layers",
"compute_composite_scores",
"persist_scored_genes",
"run_qc_checks",
"validate_known_gene_ranking",
"generate_validation_report",
]

View File

@@ -0,0 +1,227 @@
"""Positive control validation against known gene rankings."""
import duckdb
import polars as pl
import structlog
from usher_pipeline.persistence.duckdb_store import PipelineStore
from usher_pipeline.scoring.known_genes import compile_known_genes
logger = structlog.get_logger(__name__)
def validate_known_gene_ranking(
store: PipelineStore,
percentile_threshold: float = 0.75
) -> dict:
"""
Validate that known cilia/Usher genes rank highly in composite scores.
Computes percentile ranks for known genes using PERCENT_RANK window function
across ALL genes (not just known genes). Validates that known genes rank in
top quartile before exclusion filtering.
Args:
store: PipelineStore with scored_genes table
percentile_threshold: Minimum median percentile for validation (default 0.75)
Returns:
Dict with keys:
- total_known_expected: int - count of known genes in reference list
- total_known_in_dataset: int - count of known genes found in scored_genes
- median_percentile: float - median percentile rank of known genes
- top_quartile_count: int - count of known genes >= 75th percentile
- top_quartile_fraction: float - fraction in top quartile
- validation_passed: bool - True if median >= percentile_threshold
- known_gene_details: list[dict] - top 20 known genes with scores and ranks
- reason: str - explanation if validation failed (optional)
Notes:
- Uses PERCENT_RANK() which returns 0.0 (lowest) to 1.0 (highest)
- Ranks computed BEFORE known gene exclusion (validates scoring system)
- Genes without composite_score (NULL) are excluded from ranking
- Creates temporary table _known_genes for the join
"""
logger.info("validate_known_gene_ranking_start", threshold=percentile_threshold)
# Compile known genes
known_df = compile_known_genes()
total_known_expected = known_df["gene_symbol"].n_unique()
# Register known genes as temporary DuckDB table
store.conn.execute("DROP TABLE IF EXISTS _known_genes")
store.conn.execute("CREATE TEMP TABLE _known_genes AS SELECT * FROM known_df")
# Query to compute percentile ranks for known genes
query = """
WITH ranked_genes AS (
SELECT
gene_symbol,
composite_score,
PERCENT_RANK() OVER (ORDER BY composite_score) AS percentile_rank
FROM scored_genes
WHERE composite_score IS NOT NULL
)
SELECT
rg.gene_symbol,
rg.composite_score,
rg.percentile_rank,
kg.source
FROM ranked_genes rg
INNER JOIN _known_genes kg ON rg.gene_symbol = kg.gene_symbol
ORDER BY rg.percentile_rank DESC
"""
result = store.conn.execute(query).pl()
# Clean up temp table
store.conn.execute("DROP TABLE IF EXISTS _known_genes")
# If no known genes found, return failure
if result.height == 0:
logger.error(
"validate_known_gene_ranking_failed",
reason="no_known_genes_found",
expected=total_known_expected,
found=0,
)
return {
"total_known_expected": total_known_expected,
"total_known_in_dataset": 0,
"median_percentile": None,
"top_quartile_count": 0,
"top_quartile_fraction": 0.0,
"validation_passed": False,
"known_gene_details": [],
"reason": "no_known_genes_found",
}
# Compute validation metrics
total_known_in_dataset = result.height
percentiles = result["percentile_rank"].to_numpy()
median_percentile = float(result["percentile_rank"].median())
top_quartile_genes = result.filter(pl.col("percentile_rank") >= 0.75)
top_quartile_count = top_quartile_genes.height
top_quartile_fraction = top_quartile_count / total_known_in_dataset
validation_passed = median_percentile >= percentile_threshold
# Extract top 20 known genes for reporting
known_gene_details = result.head(20).select([
"gene_symbol",
"composite_score",
"percentile_rank",
"source"
]).to_dicts()
# Log validation results
if validation_passed:
logger.info(
"validate_known_gene_ranking_passed",
total_expected=total_known_expected,
total_found=total_known_in_dataset,
median_percentile=f"{median_percentile:.4f}",
top_quartile_count=top_quartile_count,
top_quartile_fraction=f"{top_quartile_fraction:.2%}",
threshold=percentile_threshold,
)
else:
logger.warning(
"validate_known_gene_ranking_failed",
reason="median_below_threshold",
median_percentile=f"{median_percentile:.4f}",
threshold=percentile_threshold,
top_quartile_fraction=f"{top_quartile_fraction:.2%}",
)
return {
"total_known_expected": total_known_expected,
"total_known_in_dataset": total_known_in_dataset,
"median_percentile": median_percentile,
"top_quartile_count": top_quartile_count,
"top_quartile_fraction": top_quartile_fraction,
"validation_passed": validation_passed,
"known_gene_details": known_gene_details,
}
def generate_validation_report(metrics: dict) -> str:
"""
Generate human-readable validation report.
Args:
metrics: Dict returned from validate_known_gene_ranking()
Returns:
Multi-line text report summarizing validation results
Notes:
- Formats percentiles as percentages (e.g., "87.3%")
- Includes table of top-ranked known genes
- Shows pass/fail status prominently
"""
passed = metrics["validation_passed"]
status = "PASSED ✓" if passed else "FAILED ✗"
# Handle case where no known genes found
if metrics["total_known_in_dataset"] == 0:
return f"""
Positive Control Validation: {status}
Reason: No known genes found in scored dataset
Expected: {metrics['total_known_expected']} known genes
Found: 0 genes
This indicates either:
1. Known genes were already filtered out
2. Gene symbol mismatch between known list and scored_genes
3. No genes have composite scores yet
"""
median_pct = metrics["median_percentile"] * 100
top_q_frac = metrics["top_quartile_fraction"] * 100
report = [
f"Positive Control Validation: {status}",
"",
"Summary:",
f" Known genes expected: {metrics['total_known_expected']}",
f" Known genes found: {metrics['total_known_in_dataset']}",
f" Median percentile: {median_pct:.1f}%",
f" Top quartile count: {metrics['top_quartile_count']}",
f" Top quartile fraction: {top_q_frac:.1f}%",
"",
]
# Add interpretation
if passed:
report.append(
f"Known cilia/Usher genes rank highly (median >= 75th percentile), "
"validating the scoring system."
)
else:
report.append(
f"Warning: Known genes rank below expected threshold. "
f"Median percentile ({median_pct:.1f}%) < 75.0%."
)
report.append(
"This may indicate issues with evidence layer weights or data quality."
)
report.append("")
report.append("Top-Ranked Known Genes:")
report.append("-" * 80)
report.append(f"{'Gene':<12} {'Score':>8} {'Percentile':>12} {'Source':<20}")
report.append("-" * 80)
for gene in metrics["known_gene_details"]:
gene_symbol = gene["gene_symbol"]
score = gene["composite_score"]
percentile = gene["percentile_rank"] * 100
source = gene["source"]
report.append(
f"{gene_symbol:<12} {score:>8.4f} {percentile:>11.1f}% {source:<20}"
)
return "\n".join(report)