From f33b04863546740577825c67b1094bb73bc3a446 Mon Sep 17 00:00:00 2001 From: gbanyan Date: Wed, 11 Feb 2026 16:39:50 +0800 Subject: [PATCH] feat(01-04): add CLI entry point with setup and info commands - Create click-based CLI with command group (--config, --verbose options) - Add 'info' command displaying pipeline version, config hash, data source versions - Add 'setup' command orchestrating full infrastructure flow: - Load config -> create store/provenance - Fetch gene universe (with checkpoint-restart) - Map Ensembl IDs to HGNC + UniProt - Validate mapping quality gates - Save to DuckDB with provenance sidecar - Update pyproject.toml entry point to usher_pipeline.cli.main:cli - Add .gitignore for data/, *.duckdb, build artifacts, provenance files --- .gitignore | 40 +++++ pyproject.toml | 2 +- src/usher_pipeline/cli/__init__.py | 5 + src/usher_pipeline/cli/main.py | 103 +++++++++++++ src/usher_pipeline/cli/setup_cmd.py | 229 ++++++++++++++++++++++++++++ 5 files changed, 378 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 src/usher_pipeline/cli/__init__.py create mode 100644 src/usher_pipeline/cli/main.py create mode 100644 src/usher_pipeline/cli/setup_cmd.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c18a644 --- /dev/null +++ b/.gitignore @@ -0,0 +1,40 @@ +# Data files +data/ + +# DuckDB +*.duckdb +*.duckdb.wal + +# Python +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.so +*.egg +*.egg-info/ +dist/ +build/ +.eggs/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Provenance files (not in data/) +/*.provenance.json + +# Virtual environment +.venv/ +venv/ +env/ diff --git a/pyproject.toml b/pyproject.toml index 8055a74..fe5ba4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ dev = [ ] [project.scripts] -usher-pipeline = "usher_pipeline.cli:main" +usher-pipeline = "usher_pipeline.cli.main:cli" [tool.setuptools] packages = 
["usher_pipeline"]
diff --git a/src/usher_pipeline/cli/__init__.py b/src/usher_pipeline/cli/__init__.py
new file mode 100644
index 0000000..03f840f
--- /dev/null
+++ b/src/usher_pipeline/cli/__init__.py
@@ -0,0 +1,5 @@
+"""CLI interface for usher-pipeline."""
+
+from usher_pipeline.cli.main import cli
+
+__all__ = ["cli"]
diff --git a/src/usher_pipeline/cli/main.py b/src/usher_pipeline/cli/main.py
new file mode 100644
index 0000000..9cae931
--- /dev/null
+++ b/src/usher_pipeline/cli/main.py
@@ -0,0 +1,113 @@
+"""Main CLI entry point for usher-pipeline.
+
+Provides command group with global options and subcommands for pipeline operations.
+"""
+
+import logging
+from pathlib import Path
+
+import click
+
+from usher_pipeline import __version__
+from usher_pipeline.config.loader import load_config
+from usher_pipeline.cli.setup_cmd import setup
+
+
+# Root logger defaults to INFO; the --verbose flag raises it to DEBUG in cli().
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+
+
+def _echo_heading(title):
+    """Print a bold section heading."""
+    click.echo(click.style(title, bold=True))
+
+
+# Root command group. The docstring below is shown by click as --help text,
+# so maintainer notes live in comments instead.
+@click.group()
+@click.option(
+    '--config',
+    type=click.Path(exists=True, path_type=Path),
+    default='config/default.yaml',
+    help='Path to pipeline configuration YAML file'
+)
+@click.option(
+    '--verbose',
+    is_flag=True,
+    help='Enable verbose logging (DEBUG level)'
+)
+@click.pass_context
+def cli(ctx, config, verbose):
+    """Usher-pipeline: Reproducible pipeline for discovering under-studied cilia/Usher candidate genes.
+
+    Provides data infrastructure, gene ID mapping, evidence layer aggregation,
+    and scoring for candidate gene prioritization.
+    """
+    # Share the global options with subcommands through the click context.
+    ctx.ensure_object(dict)
+    ctx.obj.update(config_path=config, verbose=verbose)
+
+    if not verbose:
+        return
+
+    # Root logger was configured at INFO on import; raise it for this run.
+    logging.getLogger().setLevel(logging.DEBUG)
+    logging.debug("Verbose logging enabled")
+
+
+# Prints version, config hash, data-source versions, paths and API settings.
+# Any exception while loading or printing the config is reported on stderr
+# and the command exits with status 1.
+@cli.command()
+@click.pass_context
+def info(ctx):
+    """Display pipeline information and configuration summary."""
+    cfg_path = ctx.obj['config_path']
+
+    click.echo(f"Usher Pipeline v{__version__}")
+    click.echo(f"Config: {cfg_path}")
+    click.echo()
+
+    try:
+        cfg = load_config(cfg_path)
+
+        # Only the first 16 characters of the hash are shown.
+        digest = cfg.config_hash()
+        click.echo(f"Config Hash: {digest[:16]}...")
+        click.echo()
+
+        _echo_heading("Data Source Versions:")
+        versions = cfg.versions
+        click.echo(f" Ensembl Release: {versions.ensembl_release}")
+        click.echo(f" gnomAD Version: {versions.gnomad_version}")
+        click.echo(f" GTEx Version: {versions.gtex_version}")
+        click.echo(f" HPA Version: {versions.hpa_version}")
+        click.echo()
+
+        _echo_heading("Paths:")
+        click.echo(f" Data Directory: {cfg.data_dir}")
+        click.echo(f" Cache Directory: {cfg.cache_dir}")
+        click.echo(f" DuckDB Path: {cfg.duckdb_path}")
+        click.echo()
+
+        _echo_heading("API Configuration:")
+        api = cfg.api
+        click.echo(f" Rate Limit: {api.rate_limit_per_second} req/s")
+        click.echo(f" Max Retries: {api.max_retries}")
+        click.echo(f" Cache TTL: {api.cache_ttl_seconds}s")
+        click.echo(f" Timeout: {api.timeout_seconds}s")
+
+    except Exception as exc:
+        click.echo(click.style(f"Error loading config: {exc}", fg='red'), err=True)
+        ctx.exit(1)
+
+
+# Register setup command
+cli.add_command(setup)
+
+
+if __name__ == '__main__':
+    cli()
diff --git a/src/usher_pipeline/cli/setup_cmd.py b/src/usher_pipeline/cli/setup_cmd.py
new file mode 100644
index 0000000..282402e
--- /dev/null
+++ 
b/src/usher_pipeline/cli/setup_cmd.py
@@ -0,0 +1,247 @@
+"""Setup command: Initialize pipeline data infrastructure.
+
+Orchestrates the full setup flow:
+1. Load config
+2. Create PipelineStore and ProvenanceTracker
+3. Check for existing checkpoints
+4. Fetch gene universe from Ensembl/mygene
+5. Map gene IDs (Ensembl -> HGNC + UniProt)
+6. Validate mapping quality
+7. Save to DuckDB with provenance
+"""
+
+import logging
+import sys
+from pathlib import Path
+
+import click
+import polars as pl
+
+from usher_pipeline.config.loader import load_config
+from usher_pipeline.gene_mapping import (
+    fetch_protein_coding_genes,
+    validate_gene_universe,
+    GeneMapper,
+    MappingValidator,
+)
+from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
+
+logger = logging.getLogger(__name__)
+
+
+# Maintainer notes (setup()'s docstring is shown by click as --help text):
+# - Reads ctx.obj['config_path'], which the parent command group must set.
+# - Failure paths call sys.exit(1). SystemExit is not an Exception subclass,
+#   so it is not swallowed by the outer handler, and `finally` still closes
+#   the store.
+# - --force ignores an existing 'gene_universe' checkpoint and rebuilds it.
+@click.command('setup')
+@click.option(
+    '--force',
+    is_flag=True,
+    help='Re-run setup even if checkpoints exist (re-fetches all data)'
+)
+@click.pass_context
+def setup(ctx, force):
+    """Initialize pipeline data infrastructure.
+
+    Fetches gene universe, maps IDs, validates results, and saves to DuckDB.
+    Supports checkpoint-restart: skips expensive operations if data exists.
+    """
+    config_path = ctx.obj['config_path']
+    click.echo(click.style("=== Usher Pipeline Setup ===", bold=True))
+    click.echo()
+
+    # Bound before the try block so the cleanup in `finally` can test it
+    # directly instead of probing locals().
+    store = None
+
+    try:
+        # 1. Load config
+        click.echo("Loading configuration...")
+        config = load_config(config_path)
+        click.echo(click.style(f" Config loaded: {config_path}", fg='green'))
+        click.echo(f" Ensembl Release: {config.versions.ensembl_release}")
+        click.echo(f" DuckDB Path: {config.duckdb_path}")
+        click.echo()
+
+        # 2. Create PipelineStore and ProvenanceTracker
+        click.echo("Initializing storage and provenance tracking...")
+        store = PipelineStore.from_config(config)
+        provenance = ProvenanceTracker.from_config(config)
+        click.echo(click.style(" Storage initialized", fg='green'))
+        click.echo()
+
+        # 3. Check checkpoint
+        has_checkpoint = store.has_checkpoint('gene_universe')
+
+        if has_checkpoint and not force:
+            click.echo(click.style(
+                "Gene universe checkpoint exists. Skipping fetch (use --force to re-fetch).",
+                fg='yellow'
+            ))
+            click.echo()
+
+            # Load existing data for validation display
+            df = store.load_dataframe('gene_universe')
+            if df is not None:
+                gene_count = len(df)
+                click.echo(f"Loaded {gene_count} genes from checkpoint")
+                click.echo()
+
+                # Display summary
+                click.echo(click.style("=== Setup Summary ===", bold=True))
+                click.echo(f"Gene Count: {gene_count}")
+                click.echo(f"DuckDB Path: {config.duckdb_path}")
+                click.echo()
+                click.echo(click.style("Setup complete (used existing checkpoint)", fg='green'))
+                return
+
+            # The checkpoint is flagged but its table could not be loaded, so
+            # there is no gene count to report: rebuild instead of summarising.
+            click.echo(click.style(
+                "Checkpoint data could not be loaded. Re-fetching gene universe.",
+                fg='yellow'
+            ))
+            click.echo()
+
+        # 4. Fetch gene universe
+        click.echo("Fetching protein-coding genes from mygene...")
+        click.echo(f" Ensembl Release: {config.versions.ensembl_release}")
+
+        try:
+            gene_universe = fetch_protein_coding_genes(
+                ensembl_release=config.versions.ensembl_release
+            )
+            click.echo(click.style(
+                f" Fetched {len(gene_universe)} protein-coding genes",
+                fg='green'
+            ))
+        except Exception as e:
+            click.echo(click.style(f" Error fetching genes: {e}", fg='red'), err=True)
+            logger.exception("Failed to fetch gene universe")
+            sys.exit(1)
+
+        click.echo()
+
+        # 5. Validate gene universe
+        click.echo("Validating gene universe...")
+        universe_validation = validate_gene_universe(gene_universe)
+
+        for msg in universe_validation.messages:
+            if 'FAILED' in msg:
+                click.echo(click.style(f" {msg}", fg='red'))
+            else:
+                click.echo(f" {msg}")
+
+        if not universe_validation.passed:
+            click.echo()
+            click.echo(click.style("Gene universe validation failed", fg='red'), err=True)
+            sys.exit(1)
+
+        click.echo(click.style(" Validation passed", fg='green'))
+        click.echo()
+        provenance.record_step('fetch_gene_universe', {
+            'gene_count': len(gene_universe),
+            'ensembl_release': config.versions.ensembl_release
+        })
+
+        # 6. Map gene IDs
+        click.echo("Mapping Ensembl IDs to HGNC symbols and UniProt accessions...")
+        mapper = GeneMapper(batch_size=1000)
+
+        try:
+            mapping_results, mapping_report = mapper.map_ensembl_ids(gene_universe)
+            click.echo(click.style(
+                f" Mapped {mapping_report.mapped_hgnc}/{mapping_report.total_genes} genes",
+                fg='green'
+            ))
+        except Exception as e:
+            click.echo(click.style(f" Error mapping IDs: {e}", fg='red'), err=True)
+            logger.exception("Failed to map gene IDs")
+            sys.exit(1)
+
+        click.echo()
+        provenance.record_step('map_gene_ids', {
+            'total_genes': mapping_report.total_genes,
+            'mapped_hgnc': mapping_report.mapped_hgnc,
+            'mapped_uniprot': mapping_report.mapped_uniprot,
+            'success_rate_hgnc': f"{mapping_report.success_rate_hgnc:.1%}",
+            'success_rate_uniprot': f"{mapping_report.success_rate_uniprot:.1%}",
+        })
+
+        # 7. Validate mapping
+        click.echo("Validating mapping quality...")
+        validator = MappingValidator(min_success_rate=0.90, warn_threshold=0.95)
+        validation_result = validator.validate(mapping_report)
+
+        for msg in validation_result.messages:
+            if 'FAILED' in msg:
+                click.echo(click.style(f" {msg}", fg='red'))
+            elif 'WARNING' in msg:
+                click.echo(click.style(f" {msg}", fg='yellow'))
+            else:
+                click.echo(f" {msg}")
+
+        if not validation_result.passed:
+            # Save unmapped report
+            unmapped_path = Path(config.data_dir) / "unmapped_genes.txt"
+            validator.save_unmapped_report(mapping_report, unmapped_path)
+            click.echo()
+            click.echo(click.style(
+                f"Mapping validation failed. Unmapped genes saved to: {unmapped_path}",
+                fg='red'
+            ), err=True)
+            sys.exit(1)
+
+        click.echo(click.style(" Validation passed", fg='green'))
+        click.echo()
+        provenance.record_step('validate_mapping', {
+            'hgnc_rate': f"{validation_result.hgnc_rate:.1%}",
+            'uniprot_rate': f"{validation_result.uniprot_rate:.1%}",
+            'validation_passed': True
+        })
+
+        # 8. Save to DuckDB
+        click.echo("Saving gene universe to DuckDB...")
+
+        # Create DataFrame with mapping results
+        df = pl.DataFrame({
+            'ensembl_id': [r.ensembl_id for r in mapping_results],
+            'hgnc_symbol': [r.hgnc_symbol for r in mapping_results],
+            'uniprot_accession': [r.uniprot_accession for r in mapping_results],
+        })
+
+        store.save_dataframe(
+            table_name='gene_universe',
+            df=df,
+            description=f"Protein-coding genes from Ensembl {config.versions.ensembl_release} with HGNC/UniProt mapping"
+        )
+        click.echo(click.style(f" Saved {len(df)} genes to 'gene_universe' table", fg='green'))
+        click.echo()
+
+        # 9. Save provenance
+        click.echo("Saving provenance metadata...")
+        provenance_path = Path(config.data_dir) / "setup.provenance.json"
+        provenance.save_sidecar(provenance_path)
+        click.echo(click.style(f" Provenance saved: {provenance_path}", fg='green'))
+        click.echo()
+
+        # 10. Display summary
+        click.echo(click.style("=== Setup Summary ===", bold=True))
+        click.echo(f"Gene Count: {len(gene_universe)}")
+        click.echo(f"HGNC Mapping Rate: {mapping_report.success_rate_hgnc:.1%} ({mapping_report.mapped_hgnc}/{mapping_report.total_genes})")
+        click.echo(f"UniProt Mapping Rate: {mapping_report.success_rate_uniprot:.1%} ({mapping_report.mapped_uniprot}/{mapping_report.total_genes})")
+        click.echo(f"DuckDB Path: {config.duckdb_path}")
+        click.echo(f"Provenance: {provenance_path}")
+        click.echo()
+        click.echo(click.style("Setup complete!", fg='green', bold=True))
+
+    except Exception as e:
+        click.echo(click.style(f"Setup failed: {e}", fg='red'), err=True)
+        logger.exception("Setup command failed")
+        sys.exit(1)
+    finally:
+        # Clean up resources
+        if store is not None:
+            store.close()