feat(06-01): add negative control validation with housekeeping genes

- Create negative_controls.py with 13 curated housekeeping genes - HOUSEKEEPING_GENES_CORE frozenset with source provenance - compile_housekeeping_genes() returns DataFrame matching known_genes pattern - validate_negative_controls() uses PERCENT_RANK with inverted threshold logic - generate_negative_control_report() provides human-readable output
2026-02-12 04:39:48 +08:00
parent 844295c681
commit e488ff2d7a
1 changed files with 287 additions and 0 deletions
--- a/src/usher_pipeline/scoring/negative_controls.py
+++ b/src/usher_pipeline/scoring/negative_controls.py
@@ -0,0 +1,287 @@
+"""Negative control validation using housekeeping genes."""
+
+import polars as pl
+import structlog
+
+from usher_pipeline.persistence.duckdb_store import PipelineStore
+
+logger = structlog.get_logger(__name__)
+
+# Housekeeping genes: ubiquitously expressed, essential genes that should NOT rank
+# highly in a cilia/Usher-specific scoring system. These serve as negative controls
+# to ensure the scoring system shows specificity.
+#
+# Source: Literature-validated housekeeping genes from Eisenberg & Levanon (2013)
+# "Human housekeeping genes, revisited" Trends in Genetics 29(10):569-574
+# and widely used reference genes in expression normalization studies.
+HOUSEKEEPING_GENES_CORE = frozenset([
+    # Ribosomal proteins (structural components of ribosomes)
+    "RPL13A",   # 60S ribosomal protein L13a
+    "RPL32",    # 60S ribosomal protein L32
+    "RPLP0",    # 60S acidic ribosomal protein P0
+
+    # Metabolic enzymes (glycolysis, oxidative phosphorylation)
+    "GAPDH",    # Glyceraldehyde-3-phosphate dehydrogenase
+    "ACTB",     # Actin beta (cytoskeletal)
+    "PGK1",     # Phosphoglycerate kinase 1
+    "SDHA",     # Succinate dehydrogenase complex subunit A
+
+    # Transcription/reference genes
+    "B2M",      # Beta-2-microglobulin
+    "HPRT1",    # Hypoxanthine phosphoribosyltransferase 1
+    "TBP",      # TATA-box binding protein
+
+    # Protein folding/modification
+    "PPIA",     # Peptidylprolyl isomerase A (cyclophilin A)
+    "UBC",      # Ubiquitin C
+    "YWHAZ",    # Tyrosine 3-monooxygenase/tryptophan 5-monooxygenase activation protein zeta
+])
+
+
+def compile_housekeeping_genes() -> pl.DataFrame:
+    """
+    Compile housekeeping genes into a structured DataFrame.
+
+    Returns:
+        DataFrame with columns:
+        - gene_symbol (str): Gene symbol
+        - source (str): "literature_validated" for all entries
+        - confidence (str): "HIGH" for all entries in this curated set
+
+    Notes:
+        - All genes are literature-validated housekeeping genes
+        - Pattern matches compile_known_genes() from known_genes.py
+        - Total rows = len(HOUSEKEEPING_GENES_CORE)
+    """
+    df = pl.DataFrame({
+        "gene_symbol": list(HOUSEKEEPING_GENES_CORE),
+        "source": ["literature_validated"] * len(HOUSEKEEPING_GENES_CORE),
+        "confidence": ["HIGH"] * len(HOUSEKEEPING_GENES_CORE),
+    })
+
+    return df
+
+
+def validate_negative_controls(
+    store: PipelineStore,
+    percentile_threshold: float = 0.50
+) -> dict:
+    """
+    Validate that housekeeping genes rank LOW in composite scores (negative controls).
+
+    Computes percentile ranks for housekeeping genes using PERCENT_RANK window function
+    across ALL genes. Validates that housekeeping genes rank BELOW median (inverted
+    threshold logic vs positive controls).
+
+    Args:
+        store: PipelineStore with scored_genes table
+        percentile_threshold: Maximum median percentile for validation (default 0.50)
+
+    Returns:
+        Dict with keys:
+        - total_expected: int - count of housekeeping genes in reference list
+        - total_in_dataset: int - count of housekeeping genes found in scored_genes
+        - median_percentile: float - median percentile rank of housekeeping genes
+        - top_quartile_count: int - count of housekeeping genes >= 75th percentile (should be low)
+        - in_high_tier_count: int - count of housekeeping genes >= 70% score (should be near zero)
+        - validation_passed: bool - True if median < percentile_threshold (INVERTED)
+        - housekeeping_gene_details: list[dict] - top 20 housekeeping genes by percentile ASC (lowest-ranking)
+        - reason: str - explanation if validation failed (optional)
+
+    Notes:
+        - INVERTED LOGIC: Low percentile ranks are GOOD for negative controls
+        - Uses PERCENT_RANK() which returns 0.0 (lowest) to 1.0 (highest)
+        - Genes without composite_score (NULL) are excluded from ranking
+        - Creates temporary table _housekeeping_genes for the join
+        - Cleans up temp table after query completion
+    """
+    logger.info("validate_negative_controls_start", threshold=percentile_threshold)
+
+    # Compile housekeeping genes
+    housekeeping_df = compile_housekeeping_genes()
+    total_expected = housekeeping_df["gene_symbol"].n_unique()
+
+    # Register housekeeping genes as temporary DuckDB table
+    store.conn.execute("DROP TABLE IF EXISTS _housekeeping_genes")
+    store.conn.execute("CREATE TEMP TABLE _housekeeping_genes AS SELECT * FROM housekeeping_df")
+
+    # Query to compute percentile ranks for housekeeping genes
+    query = """
+    WITH ranked_genes AS (
+        SELECT
+            gene_symbol,
+            composite_score,
+            PERCENT_RANK() OVER (ORDER BY composite_score) AS percentile_rank
+        FROM scored_genes
+        WHERE composite_score IS NOT NULL
+    )
+    SELECT
+        rg.gene_symbol,
+        rg.composite_score,
+        rg.percentile_rank,
+        hg.source
+    FROM ranked_genes rg
+    INNER JOIN _housekeeping_genes hg ON rg.gene_symbol = hg.gene_symbol
+    ORDER BY rg.percentile_rank ASC
+    """
+
+    result = store.conn.execute(query).pl()
+
+    # Clean up temp table
+    store.conn.execute("DROP TABLE IF EXISTS _housekeeping_genes")
+
+    # If no housekeeping genes found, return failure
+    if result.height == 0:
+        logger.error(
+            "validate_negative_controls_failed",
+            reason="no_housekeeping_genes_found",
+            expected=total_expected,
+            found=0,
+        )
+        return {
+            "total_expected": total_expected,
+            "total_in_dataset": 0,
+            "median_percentile": None,
+            "top_quartile_count": 0,
+            "in_high_tier_count": 0,
+            "validation_passed": False,
+            "housekeeping_gene_details": [],
+            "reason": "no_housekeeping_genes_found",
+        }
+
+    # Compute validation metrics
+    total_in_dataset = result.height
+    median_percentile = float(result["percentile_rank"].median())
+
+    # Count housekeeping genes in top quartile (bad for negative controls)
+    top_quartile_genes = result.filter(pl.col("percentile_rank") >= 0.75)
+    top_quartile_count = top_quartile_genes.height
+
+    # Count housekeeping genes with high composite scores (>= 0.70, near HIGH tier threshold)
+    high_tier_genes = result.filter(pl.col("composite_score") >= 0.70)
+    in_high_tier_count = high_tier_genes.height
+
+    # INVERTED validation logic: median should be BELOW threshold
+    validation_passed = median_percentile < percentile_threshold
+
+    # Extract top 20 housekeeping genes by percentile ASC (lowest-ranking = best for negative controls)
+    housekeeping_gene_details = result.head(20).select([
+        "gene_symbol",
+        "composite_score",
+        "percentile_rank",
+        "source"
+    ]).to_dicts()
+
+    # Log validation results
+    if validation_passed:
+        logger.info(
+            "validate_negative_controls_passed",
+            total_expected=total_expected,
+            total_found=total_in_dataset,
+            median_percentile=f"{median_percentile:.4f}",
+            top_quartile_count=top_quartile_count,
+            in_high_tier_count=in_high_tier_count,
+            threshold=percentile_threshold,
+        )
+    else:
+        logger.warning(
+            "validate_negative_controls_failed",
+            reason="median_above_threshold",
+            median_percentile=f"{median_percentile:.4f}",
+            threshold=percentile_threshold,
+            top_quartile_count=top_quartile_count,
+            in_high_tier_count=in_high_tier_count,
+        )
+
+    return {
+        "total_expected": total_expected,
+        "total_in_dataset": total_in_dataset,
+        "median_percentile": median_percentile,
+        "top_quartile_count": top_quartile_count,
+        "in_high_tier_count": in_high_tier_count,
+        "validation_passed": validation_passed,
+        "housekeeping_gene_details": housekeeping_gene_details,
+    }
+
+
+def generate_negative_control_report(metrics: dict) -> str:
+    """
+    Generate human-readable negative control validation report.
+
+    Args:
+        metrics: Dict returned from validate_negative_controls()
+
+    Returns:
+        Multi-line text report summarizing validation results
+
+    Notes:
+        - Formats percentiles as percentages (e.g., "32.5%")
+        - Includes table of lowest-ranked housekeeping genes (best outcome)
+        - Shows pass/fail status prominently
+        - INVERTED interpretation: LOW percentiles are GOOD for negative controls
+    """
+    passed = metrics["validation_passed"]
+    status = "PASSED ✓" if passed else "FAILED ✗"
+
+    # Handle case where no housekeeping genes found
+    if metrics["total_in_dataset"] == 0:
+        return f"""
+Negative Control Validation: {status}
+
+Reason: No housekeeping genes found in scored dataset
+Expected: {metrics['total_expected']} housekeeping genes
+Found: 0 genes
+
+This indicates either:
+1. Housekeeping genes were filtered out
+2. Gene symbol mismatch between housekeeping list and scored_genes
+3. No genes have composite scores yet
+"""
+
+    median_pct = metrics["median_percentile"] * 100
+    top_q_count = metrics["top_quartile_count"]
+    high_tier_count = metrics["in_high_tier_count"]
+
+    report = [
+        f"Negative Control Validation: {status}",
+        "",
+        "Summary:",
+        f"  Housekeeping genes expected: {metrics['total_expected']}",
+        f"  Housekeeping genes found: {metrics['total_in_dataset']}",
+        f"  Median percentile: {median_pct:.1f}%",
+        f"  Top quartile count: {top_q_count}",
+        f"  High-tier count (score >= 0.70): {high_tier_count}",
+        "",
+    ]
+
+    # Add interpretation (INVERTED: low percentile is good)
+    if passed:
+        report.append(
+            f"Housekeeping genes rank LOW (median < 50th percentile), "
+            "confirming scoring system specificity."
+        )
+    else:
+        report.append(
+            f"Warning: Housekeeping genes rank higher than expected. "
+            f"Median percentile ({median_pct:.1f}%) >= 50.0%."
+        )
+        report.append(
+            "This may indicate lack of specificity in evidence layer weights."
+        )
+
+    report.append("")
+    report.append("Lowest-Ranked Housekeeping Genes (Best Outcome):")
+    report.append("-" * 80)
+    report.append(f"{'Gene':<12} {'Score':>8} {'Percentile':>12} {'Source':<20}")
+    report.append("-" * 80)
+
+    for gene in metrics["housekeeping_gene_details"]:
+        gene_symbol = gene["gene_symbol"]
+        score = gene["composite_score"]
+        percentile = gene["percentile_rank"] * 100
+        source = gene["source"]
+        report.append(
+            f"{gene_symbol:<12} {score:>8.4f} {percentile:>11.1f}% {source:<20}"
+        )
+
+    return "\n".join(report)