From d57a5f28260db1d7f6098e4d1e9e5ceab659727f Mon Sep 17 00:00:00 2001 From: gbanyan Date: Wed, 11 Feb 2026 20:52:37 +0800 Subject: [PATCH] feat(04-03): add CLI score command with checkpoint-restart - Created score_cmd.py following evidence_cmd.py pattern - Orchestrates full scoring pipeline: known genes -> composite scores -> QC -> validation - Options: --force, --skip-qc, --skip-validation for flexible iteration - Registered score command in main CLI group - Displays comprehensive summary with quality flag distribution --- src/usher_pipeline/cli/main.py | 2 + src/usher_pipeline/cli/score_cmd.py | 341 ++++++++++++++++++++++++++++ 2 files changed, 343 insertions(+) create mode 100644 src/usher_pipeline/cli/score_cmd.py diff --git a/src/usher_pipeline/cli/main.py b/src/usher_pipeline/cli/main.py index 7b4a185..6f989ac 100644 --- a/src/usher_pipeline/cli/main.py +++ b/src/usher_pipeline/cli/main.py @@ -12,6 +12,7 @@ from usher_pipeline import __version__ from usher_pipeline.config.loader import load_config from usher_pipeline.cli.setup_cmd import setup from usher_pipeline.cli.evidence_cmd import evidence +from usher_pipeline.cli.score_cmd import score # Configure logging @@ -99,6 +100,7 @@ def info(ctx): # Register commands cli.add_command(setup) cli.add_command(evidence) +cli.add_command(score) if __name__ == '__main__': diff --git a/src/usher_pipeline/cli/score_cmd.py b/src/usher_pipeline/cli/score_cmd.py new file mode 100644 index 0000000..8fa4e5e --- /dev/null +++ b/src/usher_pipeline/cli/score_cmd.py @@ -0,0 +1,341 @@ +"""Scoring command: Integrate multi-evidence layers and compute composite scores. 
# Commands for:
# - Loading known genes (positive controls)
# - Computing composite scores with weighted averaging
# - Running quality control checks
# - Validating against known gene rankings

import logging
import sys
from pathlib import Path

import click
import structlog

from usher_pipeline.config.loader import load_config
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
from usher_pipeline.scoring import (
    load_known_genes_to_duckdb,
    compute_composite_scores,
    persist_scored_genes,
    run_qc_checks,
    validate_known_gene_ranking,
    generate_validation_report,
)

logger = logging.getLogger(__name__)

# (quality_flag value, label printed in the summary), in display order.
_QUALITY_FLAG_LABELS = (
    ('sufficient_evidence', " Sufficient evidence (>=4 layers)"),
    ('moderate_evidence', " Moderate evidence (2-3 layers)"),
    ('sparse_evidence', " Sparse evidence (1 layer)"),
    ('no_evidence', " No evidence"),
)


def _score_stats(df):
    """Return ``(total_genes, genes_with_score, mean_score)`` for a scored frame.

    ``mean_score`` is whatever the frame's ``mean()`` yields for the non-null
    ``composite_score`` values; callers must tolerate ``None`` (no scored genes).
    """
    scored = df.filter(df['composite_score'].is_not_null())
    return df.height, scored.height, scored['composite_score'].mean()


def _format_score(value):
    """Format a score with four decimals, or ``n/a`` when it is undefined."""
    return "n/a" if value is None else f"{value:.4f}"


def _echo_quality_flags(df):
    """Print the number of genes carrying each ``quality_flag`` value."""
    click.echo("Quality Flag Distribution:")
    for flag, label in _QUALITY_FLAG_LABELS:
        count = df.filter(df['quality_flag'] == flag).height
        click.echo(f"{label}: {count}")


@click.command('score')
@click.option(
    '--force',
    is_flag=True,
    help='Re-run scoring even if scored_genes checkpoint exists'
)
@click.option(
    '--skip-qc',
    is_flag=True,
    help='Skip quality control checks (for faster iteration)'
)
@click.option(
    '--skip-validation',
    is_flag=True,
    help='Skip known gene validation'
)
@click.pass_context
def score(ctx, force, skip_qc, skip_validation):
    """Compute multi-evidence composite scores for all genes.

    Integrates all 6 evidence layers (constraint, expression, annotation,
    localization, animal models, literature) with configurable weights.
    Validates scoring quality via QC checks and known gene rankings.

    Supports checkpoint-restart: skips processing if scored_genes table exists
    (use --force to re-run).

    Pipeline steps:
    1. Load known genes (OMIM Usher + SYSCILIA SCGS) as positive controls
    2. Compute composite scores with NULL-preserving weighted average
    3. Persist scored_genes table with per-layer contributions
    4. Run QC checks (missing data thresholds, outlier detection)
    5. Validate known gene rankings (top quartile threshold)

    Examples:

        # First run: full scoring pipeline
        usher-pipeline score

        # Force re-run
        usher-pipeline score --force

        # Skip QC and validation (faster iteration)
        usher-pipeline score --skip-qc --skip-validation
    """
    # Implementation notes:
    # - Fatal step failures call sys.exit(1). SystemExit is not an Exception
    #   subclass, so it bypasses the outer handler while `finally` still
    #   closes the store.
    # - QC and validation failures are reported and recorded in provenance
    #   but do not abort the command.
    config_path = ctx.obj['config_path']

    click.echo(click.style("=== Multi-Evidence Scoring Pipeline ===", bold=True))
    click.echo()

    store = None
    try:
        # Load config
        click.echo("Loading configuration...")
        config = load_config(config_path)
        click.echo(click.style(f" Config loaded: {config_path}", fg='green'))
        click.echo()

        # Initialize storage and provenance
        click.echo("Initializing storage and provenance tracking...")
        store = PipelineStore.from_config(config)
        provenance = ProvenanceTracker.from_config(config)
        click.echo(click.style(" Storage initialized", fg='green'))
        click.echo()

        # Check checkpoint
        has_checkpoint = store.has_checkpoint('scored_genes')

        if has_checkpoint and not force:
            click.echo(click.style(
                "scored_genes checkpoint exists. Skipping processing (use --force to re-run).",
                fg='yellow'
            ))
            click.echo()

            # Load existing data for summary display
            df = store.load_dataframe('scored_genes')
            if df is not None:
                total_genes, genes_with_score, mean_score = _score_stats(df)

                click.echo(click.style("=== Summary ===", bold=True))
                click.echo(f"Total Genes: {total_genes}")
                click.echo(f"Genes with scores: {genes_with_score}")
                click.echo(f"Mean composite score: {_format_score(mean_score)}")
                click.echo()
                _echo_quality_flags(df)
                click.echo()
                click.echo(f"DuckDB Path: {config.duckdb_path}")
                click.echo()
                click.echo(click.style("Scoring complete (used existing checkpoint)", fg='green'))
                return

            # NOTE(review): a checkpoint that exists but cannot be loaded
            # falls through to a full re-run; confirm this is intended.
            click.echo(click.style(
                "Could not load scored_genes checkpoint; recomputing scores.",
                fg='yellow'
            ))
            click.echo()

        # Validate scoring weights
        click.echo("Validating scoring weights...")
        scoring_weights = config.scoring
        try:
            scoring_weights.validate_sum()
            click.echo(click.style(" Weights validated (sum = 1.0)", fg='green'))
            click.echo(f" gnomAD: {scoring_weights.gnomad:.2f}")
            click.echo(f" Expression: {scoring_weights.expression:.2f}")
            click.echo(f" Annotation: {scoring_weights.annotation:.2f}")
            click.echo(f" Localization: {scoring_weights.localization:.2f}")
            click.echo(f" Animal Model: {scoring_weights.animal_model:.2f}")
            click.echo(f" Literature: {scoring_weights.literature:.2f}")
        except ValueError as e:
            click.echo(click.style(f" Error: {e}", fg='red'), err=True)
            sys.exit(1)

        click.echo()

        # Step 1: Load known genes
        click.echo(click.style("Step 1: Loading known genes (positive controls)...", bold=True))
        try:
            load_known_genes_to_duckdb(store)
            known_genes_df = store.load_dataframe('known_genes')
            # Compare against None explicitly: truth-testing a DataFrame
            # raises instead of answering "is it present?".
            known_gene_count = known_genes_df.height if known_genes_df is not None else 0
            click.echo(click.style(
                f" Loaded {known_gene_count} known genes (OMIM Usher + SYSCILIA SCGS)",
                fg='green'
            ))
        except Exception as e:
            click.echo(click.style(f" Error loading known genes: {e}", fg='red'), err=True)
            logger.exception("Failed to load known genes")
            sys.exit(1)

        click.echo()
        provenance.record_step('load_known_genes', {
            'known_gene_count': known_gene_count,
        })

        # Step 2: Compute composite scores
        click.echo(click.style("Step 2: Computing composite scores...", bold=True))
        click.echo(" Joining all 6 evidence layers...")
        click.echo(" Computing NULL-preserving weighted averages...")

        try:
            scored_df = compute_composite_scores(store, scoring_weights)
            total_genes, genes_with_score, mean_score = _score_stats(scored_df)

            click.echo(click.style(
                f" Scored {genes_with_score}/{total_genes} genes (mean: {_format_score(mean_score)})",
                fg='green'
            ))
        except Exception as e:
            click.echo(click.style(f" Error computing scores: {e}", fg='red'), err=True)
            logger.exception("Failed to compute composite scores")
            sys.exit(1)

        click.echo()
        provenance.record_step('compute_composite_scores', {
            'total_genes': total_genes,
            'genes_with_score': genes_with_score,
            # The mean is undefined when no gene received a score.
            'mean_score': None if mean_score is None else float(mean_score),
            'weights': {
                'gnomad': scoring_weights.gnomad,
                'expression': scoring_weights.expression,
                'annotation': scoring_weights.annotation,
                'localization': scoring_weights.localization,
                'animal_model': scoring_weights.animal_model,
                'literature': scoring_weights.literature,
            },
        })

        # Step 3: Persist scores
        click.echo(click.style("Step 3: Persisting scored genes to DuckDB...", bold=True))

        try:
            persist_scored_genes(store, scored_df, scoring_weights)
            click.echo(click.style(
                " Saved to 'scored_genes' table",
                fg='green'
            ))
        except Exception as e:
            click.echo(click.style(f" Error persisting scores: {e}", fg='red'), err=True)
            logger.exception("Failed to persist scored genes")
            sys.exit(1)

        click.echo()

        # Step 4: QC checks (unless --skip-qc)
        qc_passed = True
        if not skip_qc:
            click.echo(click.style("Step 4: Running quality control checks...", bold=True))

            # Default to an empty result so the provenance record below is
            # well-defined even when run_qc_checks raises.
            qc_result = {}
            try:
                qc_result = run_qc_checks(store)

                # Display warnings
                if qc_result.get('warnings'):
                    click.echo(click.style(" Warnings:", fg='yellow'))
                    for warning in qc_result['warnings']:
                        click.echo(click.style(f" - {warning}", fg='yellow'))

                # Display errors
                if qc_result.get('errors'):
                    click.echo(click.style(" Errors:", fg='red'))
                    for error in qc_result['errors']:
                        click.echo(click.style(f" - {error}", fg='red'))
                    qc_passed = False
                else:
                    click.echo(click.style(" All QC checks passed", fg='green'))

                # Display missing data rates
                if 'missing_data_rates' in qc_result:
                    click.echo()
                    click.echo(" Missing data rates by layer:")
                    for layer, rate in qc_result['missing_data_rates'].items():
                        click.echo(f" {layer}: {rate:.1%}")

            except Exception as e:
                click.echo(click.style(f" Error running QC: {e}", fg='red'), err=True)
                logger.exception("Failed to run QC checks")
                qc_passed = False

            click.echo()
            provenance.record_step('run_qc_checks', {
                'passed': qc_passed,
                'warnings_count': len(qc_result.get('warnings', [])),
                'errors_count': len(qc_result.get('errors', [])),
            })
        else:
            click.echo(click.style("Step 4: Skipping QC checks (--skip-qc)", fg='yellow'))
            click.echo()

        # Step 5: Validation (unless --skip-validation)
        validation_passed = True
        if not skip_validation:
            click.echo(click.style("Step 5: Validating known gene rankings...", bold=True))

            try:
                validation_result = validate_known_gene_ranking(store)
                validation_passed = validation_result.get('validation_passed', False)

                # Display validation report
                report = generate_validation_report(validation_result)
                click.echo(report)

                if validation_passed:
                    click.echo(click.style(" Validation PASSED", fg='green', bold=True))
                else:
                    click.echo(click.style(" Validation FAILED", fg='red', bold=True))

            except Exception as e:
                click.echo(click.style(f" Error running validation: {e}", fg='red'), err=True)
                logger.exception("Failed to validate known gene ranking")
                validation_passed = False

            click.echo()
            provenance.record_step('validate_known_gene_ranking', {
                'passed': validation_passed,
            })
        else:
            click.echo(click.style("Step 5: Skipping validation (--skip-validation)", fg='yellow'))
            click.echo()

        # Save provenance sidecar
        click.echo("Saving provenance metadata...")
        scoring_dir = Path(config.data_dir) / "scoring"
        scoring_dir.mkdir(parents=True, exist_ok=True)
        provenance_path = scoring_dir / "scoring.provenance.json"
        provenance.save_sidecar(provenance_path)
        click.echo(click.style(f" Provenance saved: {provenance_path}", fg='green'))
        click.echo()

        # Display final summary
        # Guard the percentage against an empty gene table.
        scored_pct = genes_with_score / total_genes * 100 if total_genes else 0.0

        click.echo(click.style("=== Final Summary ===", bold=True))
        click.echo(f"Total Genes: {total_genes}")
        click.echo(f"Genes with scores: {genes_with_score} ({scored_pct:.1f}%)")
        click.echo(f"Mean composite score: {_format_score(mean_score)}")
        click.echo()
        _echo_quality_flags(scored_df)
        click.echo()
        click.echo(f"QC Status: {'PASS' if qc_passed else 'FAIL'}")
        # validation_passed stays True when validation is skipped, so a
        # skipped run still reports PASS.
        click.echo(f"Validation Status: {'PASS' if validation_passed else 'FAIL'}")
        click.echo()
        click.echo(f"DuckDB Path: {config.duckdb_path}")
        click.echo(f"Provenance: {provenance_path}")
        click.echo()
        click.echo(click.style("Scoring pipeline complete!", fg='green', bold=True))

    except Exception as e:
        click.echo(click.style(f"Scoring command failed: {e}", fg='red'), err=True)
        logger.exception("Scoring command failed")
        sys.exit(1)
    finally:
        # Clean up resources
        if store is not None:
            store.close()