feat(06-02): implement sensitivity analysis module with weight perturbation and Spearman correlation

- Add perturb_weight() function with renormalization to maintain sum=1.0
- Add run_sensitivity_analysis() for parameter sweep across all layers and deltas
- Add summarize_sensitivity() for stability classification
- Add generate_sensitivity_report() for human-readable output
- Default perturbations: ±5% and ±10% with stability threshold 0.85
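A minimal end-to-end usage sketch (the PipelineStore constructor argument and the
ScoringWeights defaults below are illustrative assumptions, not part of this commit):

    store = PipelineStore("pipeline.duckdb")  # hypothetical DB path
    weights = ScoringWeights()                # assumes config defaults
    analysis = run_sensitivity_analysis(store, weights, top_n=100)
    summary = summarize_sensitivity(analysis)
    print(generate_sensitivity_report(analysis, summary))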
2026-02-12 04:40:21 +08:00
parent e488ff2d7a
commit a7589d9bf1

@@ -0,0 +1,378 @@
"""Parameter sweep sensitivity analysis for scoring weight validation."""
import polars as pl
import structlog
from scipy.stats import spearmanr
from usher_pipeline.config.schema import ScoringWeights
from usher_pipeline.persistence.duckdb_store import PipelineStore
from usher_pipeline.scoring.integration import compute_composite_scores
logger = structlog.get_logger(__name__)
# Evidence layer names (must match ScoringWeights fields)
EVIDENCE_LAYERS = [
"gnomad",
"expression",
"annotation",
"localization",
"animal_model",
"literature",
]
# Default perturbation deltas (±5% and ±10%)
DEFAULT_DELTAS = [-0.10, -0.05, 0.05, 0.10]
# Spearman correlation threshold for stability classification
STABILITY_THRESHOLD = 0.85
def perturb_weight(baseline: ScoringWeights, layer: str, delta: float) -> ScoringWeights:
"""
Perturb a single weight and renormalize to maintain sum=1.0 constraint.
Args:
baseline: Baseline ScoringWeights instance
layer: Evidence layer name to perturb (must be in EVIDENCE_LAYERS)
delta: Perturbation amount (can be negative)
Returns:
New ScoringWeights instance with perturbed and renormalized weights
Raises:
ValueError: If layer not in EVIDENCE_LAYERS
Notes:
- Clamps perturbed weight to [0.0, 1.0] before renormalization
- Renormalizes ALL weights so they sum to 1.0
- Maintains weights.validate_sum() guarantee
"""
if layer not in EVIDENCE_LAYERS:
raise ValueError(
f"Invalid layer '{layer}'. Must be one of {EVIDENCE_LAYERS}"
)
# Get baseline weights as dict
w_dict = baseline.model_dump()
# Apply perturbation with clamping
w_dict[layer] = max(0.0, min(1.0, w_dict[layer] + delta))
# Renormalize to sum=1.0
total = sum(w_dict[k] for k in EVIDENCE_LAYERS)
if total > 0:
for k in EVIDENCE_LAYERS:
w_dict[k] = w_dict[k] / total
else:
# Edge case: all weights became zero (should not happen in practice)
# Revert to uniform distribution
uniform = 1.0 / len(EVIDENCE_LAYERS)
for k in EVIDENCE_LAYERS:
w_dict[k] = uniform
# Return new ScoringWeights instance
return ScoringWeights(**w_dict)
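# Worked example with hypothetical numbers (not the configured defaults): if the six
# weights sum to 1.0 and gnomad is 0.25, perturb_weight(w, "gnomad", 0.10) first clamps
# 0.25 + 0.10 = 0.35, giving a temporary total of 1.10, then divides every weight by
# 1.10, so gnomad ends at ~0.318, the other five shrink proportionally, and the weights
# again sum to 1.0.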
def run_sensitivity_analysis(
store: PipelineStore,
baseline_weights: ScoringWeights,
deltas: list[float] | None = None,
top_n: int = 100,
) -> dict:
"""
Run sensitivity analysis by perturbing each weight and measuring rank stability.
For each layer and each delta, perturbs the weight, recomputes composite scores,
and measures Spearman rank correlation on the top-N genes compared to baseline.
Args:
store: PipelineStore with evidence layer tables
baseline_weights: Baseline ScoringWeights to perturb
deltas: List of perturbation amounts (default: DEFAULT_DELTAS)
top_n: Number of top-ranked genes to compare (default: 100)
Returns:
Dict with keys:
- baseline_weights: dict - baseline weights as dict
- results: list[dict] - per-perturbation results with:
- layer: str
- delta: float
- perturbed_weights: dict
- spearman_rho: float or None
- spearman_pval: float or None
- overlap_count: int - genes in both top-N lists
- top_n: int - copied into each result record
- top_n: int
- total_perturbations: int
Notes:
- compute_composite_scores re-queries DB each time (by design)
- Spearman correlation computed on composite_score of overlapping genes
- If overlap < 10 genes, records rho=None and logs warning
"""
if deltas is None:
deltas = DEFAULT_DELTAS
logger.info(
"run_sensitivity_analysis_start",
baseline_weights=baseline_weights.model_dump(),
deltas=deltas,
top_n=top_n,
total_perturbations=len(EVIDENCE_LAYERS) * len(deltas),
)
# Compute baseline scores and get top-N genes
baseline_scores = compute_composite_scores(store, baseline_weights)
baseline_top_n = (
baseline_scores
.filter(pl.col("composite_score").is_not_null())
.sort("composite_score", descending=True)
.head(top_n)
.select(["gene_symbol", "composite_score"])
.rename({"composite_score": "baseline_score"})
)
results = []
# For each layer, for each delta, compute perturbation
for layer in EVIDENCE_LAYERS:
for delta in deltas:
# Create perturbed weights
perturbed_weights = perturb_weight(baseline_weights, layer, delta)
# Compute perturbed scores
perturbed_scores = compute_composite_scores(store, perturbed_weights)
perturbed_top_n = (
perturbed_scores
.filter(pl.col("composite_score").is_not_null())
.sort("composite_score", descending=True)
.head(top_n)
.select(["gene_symbol", "composite_score"])
.rename({"composite_score": "perturbed_score"})
)
# Inner join to get overlapping genes
joined = baseline_top_n.join(perturbed_top_n, on="gene_symbol", how="inner")
overlap_count = joined.height
# Compute Spearman correlation if sufficient overlap
if overlap_count < 10:
logger.warning(
"run_sensitivity_analysis_low_overlap",
layer=layer,
delta=delta,
overlap_count=overlap_count,
message="Insufficient overlap for Spearman correlation (need >= 10)",
)
spearman_rho = None
spearman_pval = None
else:
# Extract paired scores
baseline_vals = joined["baseline_score"].to_numpy()
perturbed_vals = joined["perturbed_score"].to_numpy()
# Compute Spearman correlation
rho, pval = spearmanr(baseline_vals, perturbed_vals)
spearman_rho = float(rho)
spearman_pval = float(pval)
# Record result
result = {
"layer": layer,
"delta": delta,
"perturbed_weights": perturbed_weights.model_dump(),
"spearman_rho": spearman_rho,
"spearman_pval": spearman_pval,
"overlap_count": overlap_count,
"top_n": top_n,
}
results.append(result)
# Log each perturbation result
logger.info(
"run_sensitivity_analysis_perturbation",
layer=layer,
delta=f"{delta:+.2f}",
spearman_rho=f"{spearman_rho:.4f}" if spearman_rho is not None else "N/A",
spearman_pval=f"{spearman_pval:.4e}" if spearman_pval is not None else "N/A",
overlap_count=overlap_count,
stable=spearman_rho >= STABILITY_THRESHOLD if spearman_rho is not None else None,
)
logger.info(
"run_sensitivity_analysis_complete",
total_perturbations=len(results),
layers=len(EVIDENCE_LAYERS),
deltas=len(deltas),
)
return {
"baseline_weights": baseline_weights.model_dump(),
"results": results,
"top_n": top_n,
"total_perturbations": len(results),
}
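# Shape of a single entry in the returned "results" list (values are hypothetical):
# {"layer": "gnomad", "delta": 0.05, "perturbed_weights": {...}, "spearman_rho": 0.97,
#  "spearman_pval": 1.2e-60, "overlap_count": 96, "top_n": 100}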
def summarize_sensitivity(analysis_result: dict) -> dict:
"""
Summarize sensitivity analysis results with stability classification.
Args:
analysis_result: Dict returned from run_sensitivity_analysis()
Returns:
Dict with keys:
- min_rho: float - minimum Spearman rho (excluding None)
- max_rho: float - maximum Spearman rho (excluding None)
- mean_rho: float - mean Spearman rho (excluding None)
- stable_count: int - count of perturbations with rho >= STABILITY_THRESHOLD
- unstable_count: int - count of perturbations with rho < STABILITY_THRESHOLD
- total_perturbations: int
- overall_stable: bool - True if all non-None rhos >= STABILITY_THRESHOLD
- most_sensitive_layer: str - layer with lowest mean rho
- most_robust_layer: str - layer with highest mean rho
Notes:
- Excludes None rho values from all statistics
- most_sensitive/robust computed from per-layer mean rho
"""
results = analysis_result["results"]
# Filter out None rho values
valid_results = [r for r in results if r["spearman_rho"] is not None]
if not valid_results:
# Edge case: all perturbations had insufficient overlap
return {
"min_rho": None,
"max_rho": None,
"mean_rho": None,
"stable_count": 0,
"unstable_count": 0,
"total_perturbations": analysis_result["total_perturbations"],
"overall_stable": False,
"most_sensitive_layer": None,
"most_robust_layer": None,
}
# Compute global statistics
rho_values = [r["spearman_rho"] for r in valid_results]
min_rho = min(rho_values)
max_rho = max(rho_values)
mean_rho = sum(rho_values) / len(rho_values)
# Count stable/unstable
stable_count = sum(1 for rho in rho_values if rho >= STABILITY_THRESHOLD)
unstable_count = len(rho_values) - stable_count
# Overall stability: all non-None rhos must be >= threshold
overall_stable = all(rho >= STABILITY_THRESHOLD for rho in rho_values)
# Compute per-layer mean rho
layer_rho_map = {}
for layer in EVIDENCE_LAYERS:
layer_results = [
r["spearman_rho"]
for r in valid_results
if r["layer"] == layer and r["spearman_rho"] is not None
]
if layer_results:
layer_rho_map[layer] = sum(layer_results) / len(layer_results)
# Find most sensitive (lowest mean rho) and most robust (highest mean rho)
if layer_rho_map:
most_sensitive_layer = min(layer_rho_map, key=layer_rho_map.get)
most_robust_layer = max(layer_rho_map, key=layer_rho_map.get)
else:
most_sensitive_layer = None
most_robust_layer = None
return {
"min_rho": min_rho,
"max_rho": max_rho,
"mean_rho": mean_rho,
"stable_count": stable_count,
"unstable_count": unstable_count,
"total_perturbations": analysis_result["total_perturbations"],
"overall_stable": overall_stable,
"most_sensitive_layer": most_sensitive_layer,
"most_robust_layer": most_robust_layer,
}
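# Hypothetical illustration: given rho values [0.99, 0.97, 0.84], the summary reports
# stable_count=2 and unstable_count=1, and overall_stable is False because 0.84 falls
# below STABILITY_THRESHOLD (0.85); the layer whose perturbations average the lowest
# rho is reported as most_sensitive_layer.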
def generate_sensitivity_report(analysis_result: dict, summary: dict) -> str:
"""
Generate human-readable sensitivity analysis report.
Args:
analysis_result: Dict returned from run_sensitivity_analysis()
summary: Dict returned from summarize_sensitivity()
Returns:
Multi-line text report with perturbation table and summary
Notes:
- Follows formatting pattern from generate_validation_report()
- Shows table with Layer | Delta | Spearman rho | p-value | Stable?
- Includes interpretation text
"""
status = "STABLE ✓" if summary["overall_stable"] else "UNSTABLE ✗"
report = [
f"Sensitivity Analysis: {status}",
"",
"Summary:",
f" Total perturbations: {summary['total_perturbations']}",
f" Stable perturbations: {summary['stable_count']} (rho >= {STABILITY_THRESHOLD})",
f" Unstable perturbations: {summary['unstable_count']}",
f" Mean Spearman rho: {summary['mean_rho']:.4f}" if summary['mean_rho'] is not None else " Mean Spearman rho: N/A",
f" Range: [{summary['min_rho']:.4f}, {summary['max_rho']:.4f}]" if summary['min_rho'] is not None else " Range: N/A",
"",
]
# Add interpretation
if summary["overall_stable"]:
report.append(
f"All weight perturbations (±5-10%) produce stable rankings (rho >= {STABILITY_THRESHOLD}), "
"validating result robustness."
)
else:
report.append(
f"Warning: Some perturbations produce unstable rankings (rho < {STABILITY_THRESHOLD}). "
"Results may be sensitive to weight choices."
)
if summary["most_sensitive_layer"] and summary["most_robust_layer"]:
report.append("")
report.append(f" Most sensitive layer: {summary['most_sensitive_layer']}")
report.append(f" Most robust layer: {summary['most_robust_layer']}")
report.append("")
report.append("Perturbation Results:")
report.append("-" * 100)
report.append(f"{'Layer':<15} {'Delta':>8} {'Spearman rho':>14} {'p-value':>12} {'Overlap':>10} {'Stable?':>10}")
report.append("-" * 100)
for result in analysis_result["results"]:
layer = result["layer"]
delta = result["delta"]
rho = result["spearman_rho"]
pval = result["spearman_pval"]
overlap = result["overlap_count"]
if rho is not None:
stable_mark = "" if rho >= STABILITY_THRESHOLD else ""
rho_str = f"{rho:.4f}"
pval_str = f"{pval:.2e}"
else:
stable_mark = "N/A"
rho_str = "N/A"
pval_str = "N/A"
report.append(
f"{layer:<15} {delta:>+8.2f} {rho_str:>14} {pval_str:>12} {overlap:>10} {stable_mark:>10}"
)
return "\n".join(report)