feat(01-04): add CLI entry point with setup and info commands
- Create click-based CLI with command group (--config, --verbose options)
- Add 'info' command displaying pipeline version, config hash, data source versions
- Add 'setup' command orchestrating full infrastructure flow:
  - Load config -> create store/provenance
  - Fetch gene universe (with checkpoint-restart)
  - Map Ensembl IDs to HGNC + UniProt
  - Validate mapping quality gates
  - Save to DuckDB with provenance sidecar
- Update pyproject.toml entry point to usher_pipeline.cli.main:cli
- Add .gitignore for data/, *.duckdb, build artifacts, provenance files
This commit is contained in:
40
.gitignore
vendored
Normal file
40
.gitignore
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
# Data files
|
||||
data/
|
||||
|
||||
# DuckDB
|
||||
*.duckdb
|
||||
*.duckdb.wal
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
.Python
|
||||
*.so
|
||||
*.egg
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
.eggs/
|
||||
|
||||
# Testing
|
||||
.pytest_cache/
|
||||
.coverage
|
||||
htmlcov/
|
||||
.tox/
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Provenance files (not in data/)
|
||||
/*.provenance.json
|
||||
|
||||
# Virtual environment
|
||||
.venv/
|
||||
venv/
|
||||
env/
|
||||
@@ -42,7 +42,7 @@ dev = [
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
usher-pipeline = "usher_pipeline.cli:main"
|
||||
usher-pipeline = "usher_pipeline.cli.main:cli"
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["usher_pipeline"]
|
||||
|
||||
5
src/usher_pipeline/cli/__init__.py
Normal file
5
src/usher_pipeline/cli/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""CLI interface for usher-pipeline."""
|
||||
|
||||
from usher_pipeline.cli.main import cli
|
||||
|
||||
__all__ = ["cli"]
|
||||
103
src/usher_pipeline/cli/main.py
Normal file
103
src/usher_pipeline/cli/main.py
Normal file
@@ -0,0 +1,103 @@
|
||||
"""Main CLI entry point for usher-pipeline.
|
||||
|
||||
Provides command group with global options and subcommands for pipeline operations.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
|
||||
from usher_pipeline import __version__
|
||||
from usher_pipeline.config.loader import load_config
|
||||
from usher_pipeline.cli.setup_cmd import setup
|
||||
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
|
||||
@click.group()
@click.option(
    '--config',
    type=click.Path(exists=True, path_type=Path),
    default='config/default.yaml',
    help='Path to pipeline configuration YAML file',
)
@click.option('--verbose', is_flag=True, help='Enable verbose logging (DEBUG level)')
@click.pass_context
def cli(ctx, config, verbose):
    """Usher-pipeline: Reproducible pipeline for discovering under-studied cilia/Usher candidate genes.

    Provides data infrastructure, gene ID mapping, evidence layer aggregation,
    and scoring for candidate gene prioritization.
    """
    # NOTE: the docstring above doubles as the `--help` text, so it is kept
    # user-facing; implementation notes live in comments instead.
    #
    # Publish the global options on the click context object so subcommands
    # can read them back via ctx.obj['config_path'] / ctx.obj['verbose'].
    shared = ctx.ensure_object(dict)
    shared.update(config_path=config, verbose=verbose)

    if not verbose:
        return

    # --verbose lowers the root logger threshold to DEBUG.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    logging.debug("Verbose logging enabled")
|
||||
|
||||
|
||||
@cli.command()
@click.pass_context
def info(ctx):
    """Display pipeline information and configuration summary."""
    # Reads the config path stored on the context by the parent group, prints
    # version/config details to stdout, and exits with status 1 (message on
    # stderr) if anything in the config section raises.
    cfg_path = ctx.obj['config_path']

    click.echo(f"Usher Pipeline v{__version__}")
    click.echo(f"Config: {cfg_path}")
    click.echo()

    def heading(title):
        # Section titles are rendered in bold.
        click.echo(click.style(title, bold=True))

    try:
        cfg = load_config(cfg_path)

        # Only the first 16 characters of the hash are shown.
        digest = cfg.config_hash()
        click.echo(f"Config Hash: {digest[:16]}...")
        click.echo()

        # Attributes are looked up one line at a time so that a failure
        # part-way through leaves the earlier lines already printed.
        heading("Data Source Versions:")
        for label, attr in (
            ("Ensembl Release", "ensembl_release"),
            ("gnomAD Version", "gnomad_version"),
            ("GTEx Version", "gtex_version"),
            ("HPA Version", "hpa_version"),
        ):
            click.echo(f" {label}: {getattr(cfg.versions, attr)}")
        click.echo()

        heading("Paths:")
        for label, attr in (
            ("Data Directory", "data_dir"),
            ("Cache Directory", "cache_dir"),
            ("DuckDB Path", "duckdb_path"),
        ):
            click.echo(f" {label}: {getattr(cfg, attr)}")
        click.echo()

        heading("API Configuration:")
        for label, attr, unit in (
            ("Rate Limit", "rate_limit_per_second", " req/s"),
            ("Max Retries", "max_retries", ""),
            ("Cache TTL", "cache_ttl_seconds", "s"),
            ("Timeout", "timeout_seconds", "s"),
        ):
            click.echo(f" {label}: {getattr(cfg.api, attr)}{unit}")

    except Exception as exc:
        click.echo(click.style(f"Error loading config: {exc}", fg='red'), err=True)
        ctx.exit(1)
|
||||
|
||||
|
||||
# Register setup command
|
||||
cli.add_command(setup)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
cli()
|
||||
229
src/usher_pipeline/cli/setup_cmd.py
Normal file
229
src/usher_pipeline/cli/setup_cmd.py
Normal file
@@ -0,0 +1,229 @@
|
||||
"""Setup command: Initialize pipeline data infrastructure.
|
||||
|
||||
Orchestrates the full setup flow:
1. Load config
2. Create PipelineStore and ProvenanceTracker
3. Check for existing checkpoints
4. Fetch gene universe from Ensembl/mygene
5. Validate the fetched gene universe
6. Map gene IDs (Ensembl -> HGNC + UniProt)
7. Validate mapping quality
8. Save to DuckDB and write the provenance sidecar
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
import polars as pl
|
||||
|
||||
from usher_pipeline.config.loader import load_config
|
||||
from usher_pipeline.gene_mapping import (
|
||||
fetch_protein_coding_genes,
|
||||
validate_gene_universe,
|
||||
GeneMapper,
|
||||
MappingValidator,
|
||||
)
|
||||
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _echo_validation_messages(messages, *, highlight_warnings=False):
    """Echo validator messages: 'FAILED' in red, optionally 'WARNING' in yellow."""
    for msg in messages:
        if 'FAILED' in msg:
            click.echo(click.style(f" {msg}", fg='red'))
        elif highlight_warnings and 'WARNING' in msg:
            click.echo(click.style(f" {msg}", fg='yellow'))
        else:
            click.echo(f" {msg}")


@click.command('setup')
@click.option(
    '--force',
    is_flag=True,
    help='Re-run setup even if checkpoints exist (re-fetches all data)',
)
@click.pass_context
def setup(ctx, force):
    """Initialize pipeline data infrastructure.

    Fetches gene universe, maps IDs, validates results, and saves to DuckDB.
    Supports checkpoint-restart: skips expensive operations if data exists.
    """
    # NOTE: the docstring above is the command's `--help` text; implementation
    # notes are kept in comments.
    #
    # Inputs:  ctx.obj['config_path'] (set by the parent group) and --force.
    # Exits:   status 1 on any fetch/validation/mapping failure or unexpected
    #          exception; returns normally on success or when an existing
    #          checkpoint is reused.
    #
    # Failures use sys.exit() rather than ctx.exit(): SystemExit is not an
    # Exception subclass, so it passes through the outer `except Exception`
    # handler below while the `finally` clause still closes the store.
    config_path = ctx.obj['config_path']
    click.echo(click.style("=== Usher Pipeline Setup ===", bold=True))
    click.echo()

    # Bound before the try block so the finally clause can tell whether a
    # store was actually opened, without inspecting locals().
    store = None

    try:
        # 1. Load config
        click.echo("Loading configuration...")
        config = load_config(config_path)
        click.echo(click.style(f" Config loaded: {config_path}", fg='green'))
        click.echo(f" Ensembl Release: {config.versions.ensembl_release}")
        click.echo(f" DuckDB Path: {config.duckdb_path}")
        click.echo()

        # 2. Create PipelineStore and ProvenanceTracker
        click.echo("Initializing storage and provenance tracking...")
        store = PipelineStore.from_config(config)
        provenance = ProvenanceTracker.from_config(config)
        click.echo(click.style(" Storage initialized", fg='green'))
        click.echo()

        # 3. Check checkpoint
        has_checkpoint = store.has_checkpoint('gene_universe')

        if has_checkpoint and not force:
            click.echo(click.style(
                "Gene universe checkpoint exists. Skipping fetch (use --force to re-fetch).",
                fg='yellow'
            ))
            click.echo()

            # Load existing data for validation display
            df = store.load_dataframe('gene_universe')
            if df is not None:
                gene_count = len(df)
                click.echo(f"Loaded {gene_count} genes from checkpoint")
                click.echo()

                # Display summary
                click.echo(click.style("=== Setup Summary ===", bold=True))
                click.echo(f"Gene Count: {gene_count}")
                click.echo(f"DuckDB Path: {config.duckdb_path}")
                click.echo()
                click.echo(click.style("Setup complete (used existing checkpoint)", fg='green'))
                return

            # The checkpoint was reported but no data came back. Say so
            # explicitly instead of silently contradicting the "Skipping
            # fetch" message printed above.
            click.echo(click.style(
                "Checkpoint data could not be loaded. Re-fetching gene universe.",
                fg='yellow'
            ))
            click.echo()

        # 4. Fetch gene universe
        click.echo("Fetching protein-coding genes from mygene...")
        click.echo(f" Ensembl Release: {config.versions.ensembl_release}")

        try:
            gene_universe = fetch_protein_coding_genes(
                ensembl_release=config.versions.ensembl_release
            )
            click.echo(click.style(
                f" Fetched {len(gene_universe)} protein-coding genes",
                fg='green'
            ))
        except Exception as e:
            click.echo(click.style(f" Error fetching genes: {e}", fg='red'), err=True)
            logger.exception("Failed to fetch gene universe")
            sys.exit(1)

        click.echo()

        # 5. Validate gene universe
        click.echo("Validating gene universe...")
        universe_validation = validate_gene_universe(gene_universe)
        _echo_validation_messages(universe_validation.messages)

        if not universe_validation.passed:
            click.echo()
            click.echo(click.style("Gene universe validation failed", fg='red'), err=True)
            sys.exit(1)

        click.echo(click.style(" Validation passed", fg='green'))
        click.echo()
        provenance.record_step('fetch_gene_universe', {
            'gene_count': len(gene_universe),
            'ensembl_release': config.versions.ensembl_release
        })

        # 6. Map gene IDs
        click.echo("Mapping Ensembl IDs to HGNC symbols and UniProt accessions...")
        mapper = GeneMapper(batch_size=1000)

        try:
            mapping_results, mapping_report = mapper.map_ensembl_ids(gene_universe)
            click.echo(click.style(
                f" Mapped {mapping_report.mapped_hgnc}/{mapping_report.total_genes} genes",
                fg='green'
            ))
        except Exception as e:
            click.echo(click.style(f" Error mapping IDs: {e}", fg='red'), err=True)
            logger.exception("Failed to map gene IDs")
            sys.exit(1)

        click.echo()
        provenance.record_step('map_gene_ids', {
            'total_genes': mapping_report.total_genes,
            'mapped_hgnc': mapping_report.mapped_hgnc,
            'mapped_uniprot': mapping_report.mapped_uniprot,
            'success_rate_hgnc': f"{mapping_report.success_rate_hgnc:.1%}",
            'success_rate_uniprot': f"{mapping_report.success_rate_uniprot:.1%}",
        })

        # 7. Validate mapping
        click.echo("Validating mapping quality...")
        validator = MappingValidator(min_success_rate=0.90, warn_threshold=0.95)
        validation_result = validator.validate(mapping_report)
        _echo_validation_messages(validation_result.messages, highlight_warnings=True)

        if not validation_result.passed:
            # Save unmapped report
            unmapped_path = Path(config.data_dir) / "unmapped_genes.txt"
            validator.save_unmapped_report(mapping_report, unmapped_path)
            click.echo()
            click.echo(click.style(
                f"Mapping validation failed. Unmapped genes saved to: {unmapped_path}",
                fg='red'
            ), err=True)
            sys.exit(1)

        click.echo(click.style(" Validation passed", fg='green'))
        click.echo()
        provenance.record_step('validate_mapping', {
            'hgnc_rate': f"{validation_result.hgnc_rate:.1%}",
            'uniprot_rate': f"{validation_result.uniprot_rate:.1%}",
            'validation_passed': True
        })

        # 8. Save to DuckDB
        click.echo("Saving gene universe to DuckDB...")

        # Create DataFrame with mapping results
        df = pl.DataFrame({
            'ensembl_id': [r.ensembl_id for r in mapping_results],
            'hgnc_symbol': [r.hgnc_symbol for r in mapping_results],
            'uniprot_accession': [r.uniprot_accession for r in mapping_results],
        })

        store.save_dataframe(
            table_name='gene_universe',
            df=df,
            description=f"Protein-coding genes from Ensembl {config.versions.ensembl_release} with HGNC/UniProt mapping"
        )
        click.echo(click.style(f" Saved {len(df)} genes to 'gene_universe' table", fg='green'))
        click.echo()

        # 9. Save provenance
        click.echo("Saving provenance metadata...")
        provenance_path = Path(config.data_dir) / "setup.provenance.json"
        provenance.save_sidecar(provenance_path)
        click.echo(click.style(f" Provenance saved: {provenance_path}", fg='green'))
        click.echo()

        # 10. Display summary
        click.echo(click.style("=== Setup Summary ===", bold=True))
        click.echo(f"Gene Count: {len(gene_universe)}")
        click.echo(f"HGNC Mapping Rate: {mapping_report.success_rate_hgnc:.1%} ({mapping_report.mapped_hgnc}/{mapping_report.total_genes})")
        click.echo(f"UniProt Mapping Rate: {mapping_report.success_rate_uniprot:.1%} ({mapping_report.mapped_uniprot}/{mapping_report.total_genes})")
        click.echo(f"DuckDB Path: {config.duckdb_path}")
        click.echo(f"Provenance: {provenance_path}")
        click.echo()
        click.echo(click.style("Setup complete!", fg='green', bold=True))

    except Exception as e:
        click.echo(click.style(f"Setup failed: {e}", fg='red'), err=True)
        logger.exception("Setup command failed")
        sys.exit(1)
    finally:
        # Clean up resources (runs on success, on failure, and on sys.exit).
        if store is not None:
            store.close()
|
||||
Reference in New Issue
Block a user