feat(01-04): add CLI entry point with setup and info commands
- Create click-based CLI with command group (--config, --verbose options)
- Add 'info' command displaying pipeline version, config hash, data source versions
- Add 'setup' command orchestrating full infrastructure flow:
  - Load config -> create store/provenance
  - Fetch gene universe (with checkpoint-restart)
  - Map Ensembl IDs to HGNC + UniProt
  - Validate mapping quality gates
  - Save to DuckDB with provenance sidecar
- Update pyproject.toml entry point to usher_pipeline.cli.main:cli
- Add .gitignore for data/, *.duckdb, build artifacts, provenance files
This commit is contained in:
40
.gitignore
vendored
Normal file
40
.gitignore
vendored
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Data files
|
||||||
|
data/
|
||||||
|
|
||||||
|
# DuckDB
|
||||||
|
*.duckdb
|
||||||
|
*.duckdb.wal
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.pyd
|
||||||
|
.Python
|
||||||
|
*.so
|
||||||
|
*.egg
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
.eggs/
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
.pytest_cache/
|
||||||
|
.coverage
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# Provenance files (not in data/)
|
||||||
|
/*.provenance.json
|
||||||
|
|
||||||
|
# Virtual environment
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
@@ -42,7 +42,7 @@ dev = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
usher-pipeline = "usher_pipeline.cli:main"
|
usher-pipeline = "usher_pipeline.cli.main:cli"
|
||||||
|
|
||||||
[tool.setuptools]
|
[tool.setuptools]
|
||||||
packages = ["usher_pipeline"]
|
packages = ["usher_pipeline"]
|
||||||
|
|||||||
5
src/usher_pipeline/cli/__init__.py
Normal file
5
src/usher_pipeline/cli/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""CLI interface for usher-pipeline."""
|
||||||
|
|
||||||
|
from usher_pipeline.cli.main import cli
|
||||||
|
|
||||||
|
__all__ = ["cli"]
|
||||||
103
src/usher_pipeline/cli/main.py
Normal file
103
src/usher_pipeline/cli/main.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
"""Main CLI entry point for usher-pipeline.
|
||||||
|
|
||||||
|
Provides command group with global options and subcommands for pipeline operations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
from usher_pipeline import __version__
|
||||||
|
from usher_pipeline.config.loader import load_config
|
||||||
|
from usher_pipeline.cli.setup_cmd import setup
|
||||||
|
|
||||||
|
|
||||||
|
# Configure logging
# Root logger defaults: INFO threshold, records prefixed with timestamp,
# logger name and level. Runs once, when this module is imported.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
|
||||||
|
|
||||||
|
|
||||||
|
@click.group()
@click.option(
    '--config',
    default='config/default.yaml',
    type=click.Path(exists=True, path_type=Path),
    help='Path to pipeline configuration YAML file',
)
@click.option(
    '--verbose',
    is_flag=True,
    help='Enable verbose logging (DEBUG level)',
)
@click.pass_context
def cli(ctx, config, verbose):
    """Usher-pipeline: Reproducible pipeline for discovering under-studied cilia/Usher candidate genes.

    Provides data infrastructure, gene ID mapping, evidence layer aggregation,
    and scoring for candidate gene prioritization.
    """
    # NOTE: click renders the docstring above as the group's --help text,
    # so it is user-facing output rather than internal documentation.

    # Publish the global options on the context object so that subcommands
    # can read them via ctx.obj['config_path'] / ctx.obj['verbose'].
    ctx.ensure_object(dict)
    ctx.obj.update(config_path=config, verbose=verbose)

    if not verbose:
        return

    # --verbose: lower the root logger threshold so DEBUG records are emitted.
    logging.getLogger().setLevel(logging.DEBUG)
    logging.debug("Verbose logging enabled")
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command()
@click.pass_context
def info(ctx):
    """Display pipeline information and configuration summary."""
    # NOTE: click renders the docstring above as this command's --help text.
    echo = click.echo

    def heading(title):
        # Section titles are printed in bold.
        echo(click.style(title, bold=True))

    cfg_file = ctx.obj['config_path']

    # The banner is printed before the config is loaded, so it appears even
    # when loading fails below.
    echo(f"Usher Pipeline v{__version__}")
    echo(f"Config: {cfg_file}")
    echo()

    try:
        cfg = load_config(cfg_file)

        # Only the first 16 characters of the config hash are shown.
        digest = cfg.config_hash()
        echo(f"Config Hash: {digest[:16]}...")
        echo()

        heading("Data Source Versions:")
        versions = cfg.versions
        echo(f" Ensembl Release: {versions.ensembl_release}")
        echo(f" gnomAD Version: {versions.gnomad_version}")
        echo(f" GTEx Version: {versions.gtex_version}")
        echo(f" HPA Version: {versions.hpa_version}")
        echo()

        heading("Paths:")
        echo(f" Data Directory: {cfg.data_dir}")
        echo(f" Cache Directory: {cfg.cache_dir}")
        echo(f" DuckDB Path: {cfg.duckdb_path}")
        echo()

        heading("API Configuration:")
        api = cfg.api
        echo(f" Rate Limit: {api.rate_limit_per_second} req/s")
        echo(f" Max Retries: {api.max_retries}")
        echo(f" Cache TTL: {api.cache_ttl_seconds}s")
        echo(f" Timeout: {api.timeout_seconds}s")

    except Exception as exc:
        # Any failure while loading or reading the config is reported on
        # stderr and turned into exit status 1.
        echo(click.style(f"Error loading config: {exc}", fg='red'), err=True)
        ctx.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# Register setup command
# `setup` is defined in usher_pipeline.cli.setup_cmd and attached to the
# group here rather than through the @cli.command() decorator.
cli.add_command(setup)


# Allow invoking the CLI by running this module directly.
if __name__ == '__main__':
    cli()
|
||||||
229
src/usher_pipeline/cli/setup_cmd.py
Normal file
229
src/usher_pipeline/cli/setup_cmd.py
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
"""Setup command: Initialize pipeline data infrastructure.
|
||||||
|
|
||||||
|
Orchestrates the full setup flow:
|
||||||
|
1. Load config
|
||||||
|
2. Create PipelineStore and ProvenanceTracker
|
||||||
|
3. Check for existing checkpoints
|
||||||
|
4. Fetch gene universe from Ensembl/mygene
|
||||||
|
5. Map gene IDs (Ensembl -> HGNC + UniProt)
|
||||||
|
6. Validate mapping quality
|
||||||
|
7. Save to DuckDB with provenance
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import click
|
||||||
|
import polars as pl
|
||||||
|
|
||||||
|
from usher_pipeline.config.loader import load_config
|
||||||
|
from usher_pipeline.gene_mapping import (
|
||||||
|
fetch_protein_coding_genes,
|
||||||
|
validate_gene_universe,
|
||||||
|
GeneMapper,
|
||||||
|
MappingValidator,
|
||||||
|
)
|
||||||
|
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@click.command('setup')
@click.option(
    '--force',
    is_flag=True,
    help='Re-run setup even if checkpoints exist (re-fetches all data)'
)
@click.pass_context
def setup(ctx, force):
    """Initialize pipeline data infrastructure.

    Fetches gene universe, maps IDs, validates results, and saves to DuckDB.
    Supports checkpoint-restart: skips expensive operations if data exists.
    """
    # NOTE: click renders the docstring above as this command's --help text.
    #
    # Exit behaviour: every failure path terminates with exit status 1.
    # Deliberate aborts use sys.exit(), which raises SystemExit. SystemExit
    # is not an Exception subclass, so the outer `except Exception` handler
    # does not intercept it, while the `finally` clause still closes the store.
    config_path = ctx.obj['config_path']

    click.echo(click.style("=== Usher Pipeline Setup ===", bold=True))
    click.echo()

    # Bound before the try block so the finally clause can tell whether the
    # store was ever opened without inspecting locals().
    store = None

    try:
        # 1. Load config
        click.echo("Loading configuration...")
        config = load_config(config_path)
        click.echo(click.style(f" Config loaded: {config_path}", fg='green'))
        click.echo(f" Ensembl Release: {config.versions.ensembl_release}")
        click.echo(f" DuckDB Path: {config.duckdb_path}")
        click.echo()

        # 2. Create PipelineStore and ProvenanceTracker
        click.echo("Initializing storage and provenance tracking...")
        store = PipelineStore.from_config(config)
        provenance = ProvenanceTracker.from_config(config)
        click.echo(click.style(" Storage initialized", fg='green'))
        click.echo()

        # 3. Check checkpoint
        has_checkpoint = store.has_checkpoint('gene_universe')

        if has_checkpoint and not force:
            click.echo(click.style(
                "Gene universe checkpoint exists. Skipping fetch (use --force to re-fetch).",
                fg='yellow'
            ))
            click.echo()

            # Load existing data for validation display
            df = store.load_dataframe('gene_universe')
            if df is not None:
                gene_count = len(df)
                click.echo(f"Loaded {gene_count} genes from checkpoint")
                click.echo()

                # Display summary
                click.echo(click.style("=== Setup Summary ===", bold=True))
                click.echo(f"Gene Count: {gene_count}")
                click.echo(f"DuckDB Path: {config.duckdb_path}")
                click.echo()
                click.echo(click.style("Setup complete (used existing checkpoint)", fg='green'))
                return

            # The checkpoint is flagged as present but produced no data.
            # Rather than report a gene count that was never computed,
            # fall through and rebuild the table from scratch.
            click.echo(click.style(
                "Checkpoint could not be loaded. Re-fetching gene universe.",
                fg='yellow'
            ))
            click.echo()

        # 4. Fetch gene universe
        click.echo("Fetching protein-coding genes from mygene...")
        click.echo(f" Ensembl Release: {config.versions.ensembl_release}")

        try:
            gene_universe = fetch_protein_coding_genes(
                ensembl_release=config.versions.ensembl_release
            )
            click.echo(click.style(
                f" Fetched {len(gene_universe)} protein-coding genes",
                fg='green'
            ))
        except Exception as e:
            click.echo(click.style(f" Error fetching genes: {e}", fg='red'), err=True)
            logger.exception("Failed to fetch gene universe")
            sys.exit(1)

        click.echo()

        # 5. Validate gene universe
        click.echo("Validating gene universe...")
        universe_validation = validate_gene_universe(gene_universe)

        for msg in universe_validation.messages:
            if 'FAILED' in msg:
                click.echo(click.style(f" {msg}", fg='red'))
            else:
                click.echo(f" {msg}")

        if not universe_validation.passed:
            click.echo()
            click.echo(click.style("Gene universe validation failed", fg='red'), err=True)
            sys.exit(1)

        click.echo(click.style(" Validation passed", fg='green'))
        click.echo()

        # Provenance is recorded only after the universe has passed validation.
        provenance.record_step('fetch_gene_universe', {
            'gene_count': len(gene_universe),
            'ensembl_release': config.versions.ensembl_release
        })

        # 6. Map gene IDs
        click.echo("Mapping Ensembl IDs to HGNC symbols and UniProt accessions...")
        mapper = GeneMapper(batch_size=1000)

        try:
            mapping_results, mapping_report = mapper.map_ensembl_ids(gene_universe)
            click.echo(click.style(
                f" Mapped {mapping_report.mapped_hgnc}/{mapping_report.total_genes} genes",
                fg='green'
            ))
        except Exception as e:
            click.echo(click.style(f" Error mapping IDs: {e}", fg='red'), err=True)
            logger.exception("Failed to map gene IDs")
            sys.exit(1)

        click.echo()

        provenance.record_step('map_gene_ids', {
            'total_genes': mapping_report.total_genes,
            'mapped_hgnc': mapping_report.mapped_hgnc,
            'mapped_uniprot': mapping_report.mapped_uniprot,
            'success_rate_hgnc': f"{mapping_report.success_rate_hgnc:.1%}",
            'success_rate_uniprot': f"{mapping_report.success_rate_uniprot:.1%}",
        })

        # 7. Validate mapping
        click.echo("Validating mapping quality...")
        validator = MappingValidator(min_success_rate=0.90, warn_threshold=0.95)
        validation_result = validator.validate(mapping_report)

        for msg in validation_result.messages:
            if 'FAILED' in msg:
                click.echo(click.style(f" {msg}", fg='red'))
            elif 'WARNING' in msg:
                click.echo(click.style(f" {msg}", fg='yellow'))
            else:
                click.echo(f" {msg}")

        if not validation_result.passed:
            # Save unmapped report
            unmapped_path = Path(config.data_dir) / "unmapped_genes.txt"
            validator.save_unmapped_report(mapping_report, unmapped_path)
            click.echo()
            click.echo(click.style(
                f"Mapping validation failed. Unmapped genes saved to: {unmapped_path}",
                fg='red'
            ), err=True)
            sys.exit(1)

        click.echo(click.style(" Validation passed", fg='green'))
        click.echo()

        provenance.record_step('validate_mapping', {
            'hgnc_rate': f"{validation_result.hgnc_rate:.1%}",
            'uniprot_rate': f"{validation_result.uniprot_rate:.1%}",
            'validation_passed': True
        })

        # 8. Save to DuckDB
        click.echo("Saving gene universe to DuckDB...")

        # Create DataFrame with mapping results
        df = pl.DataFrame({
            'ensembl_id': [r.ensembl_id for r in mapping_results],
            'hgnc_symbol': [r.hgnc_symbol for r in mapping_results],
            'uniprot_accession': [r.uniprot_accession for r in mapping_results],
        })

        store.save_dataframe(
            table_name='gene_universe',
            df=df,
            description=f"Protein-coding genes from Ensembl {config.versions.ensembl_release} with HGNC/UniProt mapping"
        )
        click.echo(click.style(f" Saved {len(df)} genes to 'gene_universe' table", fg='green'))
        click.echo()

        # 9. Save provenance
        click.echo("Saving provenance metadata...")
        provenance_path = Path(config.data_dir) / "setup.provenance.json"
        provenance.save_sidecar(provenance_path)
        click.echo(click.style(f" Provenance saved: {provenance_path}", fg='green'))
        click.echo()

        # 10. Display summary
        click.echo(click.style("=== Setup Summary ===", bold=True))
        click.echo(f"Gene Count: {len(gene_universe)}")
        click.echo(f"HGNC Mapping Rate: {mapping_report.success_rate_hgnc:.1%} ({mapping_report.mapped_hgnc}/{mapping_report.total_genes})")
        click.echo(f"UniProt Mapping Rate: {mapping_report.success_rate_uniprot:.1%} ({mapping_report.mapped_uniprot}/{mapping_report.total_genes})")
        click.echo(f"DuckDB Path: {config.duckdb_path}")
        click.echo(f"Provenance: {provenance_path}")
        click.echo()
        click.echo(click.style("Setup complete!", fg='green', bold=True))

    except Exception as e:
        # Catch-all boundary for anything not handled by a step above.
        click.echo(click.style(f"Setup failed: {e}", fg='red'), err=True)
        logger.exception("Setup command failed")
        sys.exit(1)
    finally:
        # Clean up resources (store stays None if setup failed before opening it)
        if store is not None:
            store.close()
|
||||||
Reference in New Issue
Block a user