feat(02-02): add DuckDB loader and CLI evidence command for gnomAD
- load_to_duckdb: Saves constraint DataFrame to gnomad_constraint table with provenance tracking - query_constrained_genes: Queries constrained genes by LOEUF threshold (validates GCON-03 interpretation) - evidence_cmd.py: CLI command group with gnomad subcommand (fetch->transform->load orchestration) - Checkpoint-restart: Skips processing if gnomad_constraint table exists (--force to override) - Full CLI: usher-pipeline evidence gnomad [--force] [--url URL] [--min-depth N] [--min-cds-pct N]
This commit is contained in:
243
src/usher_pipeline/cli/evidence_cmd.py
Normal file
243
src/usher_pipeline/cli/evidence_cmd.py
Normal file
@@ -0,0 +1,243 @@
|
||||
"""Evidence layer commands: Fetch and process evidence data.
|
||||
|
||||
Commands for downloading and processing various evidence sources:
|
||||
- gnomad: Constraint metrics (pLI, LOEUF)
|
||||
- clingen: Gene-disease associations (future)
|
||||
- gtex: Expression data (future)
|
||||
- etc.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
import structlog
|
||||
|
||||
from usher_pipeline.config.loader import load_config
|
||||
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||
from usher_pipeline.evidence.gnomad import (
|
||||
download_constraint_metrics,
|
||||
process_gnomad_constraint,
|
||||
load_to_duckdb,
|
||||
GNOMAD_CONSTRAINT_URL,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@click.group('evidence')
|
||||
def evidence():
|
||||
"""Fetch and process evidence layer data.
|
||||
|
||||
Evidence sources include constraint metrics (gnomAD), gene-disease
|
||||
associations (ClinGen), expression data (GTEx), and more.
|
||||
|
||||
Each evidence source follows the fetch -> transform -> load pattern
|
||||
with checkpoint-restart and provenance tracking.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@evidence.command('gnomad')
|
||||
@click.option(
|
||||
'--force',
|
||||
is_flag=True,
|
||||
help='Re-download and reprocess data even if checkpoint exists'
|
||||
)
|
||||
@click.option(
|
||||
'--url',
|
||||
default=GNOMAD_CONSTRAINT_URL,
|
||||
help='Override gnomAD constraint file URL'
|
||||
)
|
||||
@click.option(
|
||||
'--min-depth',
|
||||
type=float,
|
||||
default=30.0,
|
||||
help='Minimum mean sequencing depth for quality filtering (default: 30x)'
|
||||
)
|
||||
@click.option(
|
||||
'--min-cds-pct',
|
||||
type=float,
|
||||
default=0.9,
|
||||
help='Minimum CDS coverage percentage for quality filtering (default: 0.9 = 90%%)'
|
||||
)
|
||||
@click.pass_context
|
||||
def gnomad(ctx, force, url, min_depth, min_cds_pct):
|
||||
"""Fetch and load gnomAD constraint metrics (pLI, LOEUF).
|
||||
|
||||
Downloads gnomAD v4.1 constraint metrics, filters by coverage quality,
|
||||
normalizes LOEUF scores (0-1 range, inverted), and loads to DuckDB.
|
||||
|
||||
Supports checkpoint-restart: skips processing if data already exists
|
||||
in DuckDB (use --force to re-run).
|
||||
|
||||
Examples:
|
||||
|
||||
# First run: download, process, and load
|
||||
usher-pipeline evidence gnomad
|
||||
|
||||
# Force re-download and reprocess
|
||||
usher-pipeline evidence gnomad --force
|
||||
|
||||
# Use custom quality thresholds
|
||||
usher-pipeline evidence gnomad --min-depth 20 --min-cds-pct 0.8
|
||||
"""
|
||||
config_path = ctx.obj['config_path']
|
||||
|
||||
click.echo(click.style("=== gnomAD Constraint Evidence ===", bold=True))
|
||||
click.echo()
|
||||
|
||||
store = None
|
||||
try:
|
||||
# Load config
|
||||
click.echo("Loading configuration...")
|
||||
config = load_config(config_path)
|
||||
click.echo(click.style(f" Config loaded: {config_path}", fg='green'))
|
||||
click.echo(f" gnomAD Version: {config.versions.gnomad_version}")
|
||||
click.echo()
|
||||
|
||||
# Initialize storage and provenance
|
||||
click.echo("Initializing storage and provenance tracking...")
|
||||
store = PipelineStore.from_config(config)
|
||||
provenance = ProvenanceTracker.from_config(config)
|
||||
click.echo(click.style(" Storage initialized", fg='green'))
|
||||
click.echo()
|
||||
|
||||
# Check checkpoint
|
||||
has_checkpoint = store.has_checkpoint('gnomad_constraint')
|
||||
|
||||
if has_checkpoint and not force:
|
||||
click.echo(click.style(
|
||||
"gnomAD constraint checkpoint exists. Skipping processing (use --force to re-run).",
|
||||
fg='yellow'
|
||||
))
|
||||
click.echo()
|
||||
|
||||
# Load existing data for summary display
|
||||
df = store.load_dataframe('gnomad_constraint')
|
||||
if df is not None:
|
||||
total_genes = len(df)
|
||||
measured = df.filter(df['quality_flag'] == 'measured').height
|
||||
incomplete = df.filter(df['quality_flag'] == 'incomplete_coverage').height
|
||||
no_data = df.filter(df['quality_flag'] == 'no_data').height
|
||||
|
||||
click.echo(click.style("=== Summary ===", bold=True))
|
||||
click.echo(f"Total Genes: {total_genes}")
|
||||
click.echo(f" Measured (good coverage): {measured}")
|
||||
click.echo(f" Incomplete coverage: {incomplete}")
|
||||
click.echo(f" No data: {no_data}")
|
||||
click.echo(f"DuckDB Path: {config.duckdb_path}")
|
||||
click.echo()
|
||||
click.echo(click.style("Evidence layer ready (used existing checkpoint)", fg='green'))
|
||||
return
|
||||
|
||||
# Download gnomAD constraint metrics
|
||||
click.echo("Downloading gnomAD constraint metrics...")
|
||||
click.echo(f" URL: {url}")
|
||||
click.echo(f" Version: {config.versions.gnomad_version}")
|
||||
|
||||
gnomad_dir = Path(config.data_dir) / "gnomad"
|
||||
gnomad_dir.mkdir(parents=True, exist_ok=True)
|
||||
tsv_path = gnomad_dir / "constraint_metrics.tsv"
|
||||
|
||||
try:
|
||||
tsv_path = download_constraint_metrics(
|
||||
output_path=tsv_path,
|
||||
url=url,
|
||||
force=force
|
||||
)
|
||||
click.echo(click.style(
|
||||
f" Downloaded to: {tsv_path}",
|
||||
fg='green'
|
||||
))
|
||||
except Exception as e:
|
||||
click.echo(click.style(f" Error downloading: {e}", fg='red'), err=True)
|
||||
logger.exception("Failed to download gnomAD constraint metrics")
|
||||
sys.exit(1)
|
||||
|
||||
click.echo()
|
||||
provenance.record_step('download_gnomad_constraint', {
|
||||
'url': url,
|
||||
'version': config.versions.gnomad_version,
|
||||
'output_path': str(tsv_path),
|
||||
})
|
||||
|
||||
# Process constraint data
|
||||
click.echo("Processing constraint metrics...")
|
||||
click.echo(f" Min depth: {min_depth}x")
|
||||
click.echo(f" Min CDS coverage: {min_cds_pct:.0%}")
|
||||
|
||||
try:
|
||||
df = process_gnomad_constraint(
|
||||
tsv_path=tsv_path,
|
||||
min_depth=min_depth,
|
||||
min_cds_pct=min_cds_pct
|
||||
)
|
||||
click.echo(click.style(
|
||||
f" Processed {len(df)} genes",
|
||||
fg='green'
|
||||
))
|
||||
except Exception as e:
|
||||
click.echo(click.style(f" Error processing: {e}", fg='red'), err=True)
|
||||
logger.exception("Failed to process gnomAD constraint metrics")
|
||||
sys.exit(1)
|
||||
|
||||
click.echo()
|
||||
provenance.record_step('process_gnomad_constraint', {
|
||||
'min_depth': min_depth,
|
||||
'min_cds_pct': min_cds_pct,
|
||||
'total_genes': len(df),
|
||||
})
|
||||
|
||||
# Load to DuckDB
|
||||
click.echo("Loading to DuckDB...")
|
||||
|
||||
try:
|
||||
load_to_duckdb(
|
||||
df=df,
|
||||
store=store,
|
||||
provenance=provenance,
|
||||
description=f"gnomAD {config.versions.gnomad_version} constraint metrics"
|
||||
)
|
||||
click.echo(click.style(
|
||||
f" Saved to 'gnomad_constraint' table",
|
||||
fg='green'
|
||||
))
|
||||
except Exception as e:
|
||||
click.echo(click.style(f" Error loading: {e}", fg='red'), err=True)
|
||||
logger.exception("Failed to load gnomAD constraint data to DuckDB")
|
||||
sys.exit(1)
|
||||
|
||||
click.echo()
|
||||
|
||||
# Save provenance sidecar
|
||||
click.echo("Saving provenance metadata...")
|
||||
provenance_path = gnomad_dir / "constraint.provenance.json"
|
||||
provenance.save_sidecar(provenance_path)
|
||||
click.echo(click.style(f" Provenance saved: {provenance_path}", fg='green'))
|
||||
click.echo()
|
||||
|
||||
# Display summary
|
||||
measured = df.filter(df['quality_flag'] == 'measured').height
|
||||
incomplete = df.filter(df['quality_flag'] == 'incomplete_coverage').height
|
||||
no_data = df.filter(df['quality_flag'] == 'no_data').height
|
||||
|
||||
click.echo(click.style("=== Summary ===", bold=True))
|
||||
click.echo(f"Total Genes: {len(df)}")
|
||||
click.echo(f" Measured (good coverage): {measured}")
|
||||
click.echo(f" Incomplete coverage: {incomplete}")
|
||||
click.echo(f" No data: {no_data}")
|
||||
click.echo(f"DuckDB Path: {config.duckdb_path}")
|
||||
click.echo(f"Provenance: {provenance_path}")
|
||||
click.echo()
|
||||
click.echo(click.style("gnomAD evidence layer complete!", fg='green', bold=True))
|
||||
|
||||
except Exception as e:
|
||||
click.echo(click.style(f"Evidence command failed: {e}", fg='red'), err=True)
|
||||
logger.exception("Evidence command failed")
|
||||
sys.exit(1)
|
||||
finally:
|
||||
# Clean up resources
|
||||
if store is not None:
|
||||
store.close()
|
||||
@@ -11,6 +11,7 @@ import click
|
||||
from usher_pipeline import __version__
|
||||
from usher_pipeline.config.loader import load_config
|
||||
from usher_pipeline.cli.setup_cmd import setup
|
||||
from usher_pipeline.cli.evidence_cmd import evidence
|
||||
|
||||
|
||||
# Configure logging
|
||||
@@ -95,8 +96,9 @@ def info(ctx):
|
||||
ctx.exit(1)
|
||||
|
||||
|
||||
# Register setup command
|
||||
# Register commands
|
||||
cli.add_command(setup)
|
||||
cli.add_command(evidence)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -7,6 +7,7 @@ from usher_pipeline.evidence.gnomad.transform import (
|
||||
normalize_scores,
|
||||
process_gnomad_constraint,
|
||||
)
|
||||
from usher_pipeline.evidence.gnomad.load import load_to_duckdb, query_constrained_genes
|
||||
|
||||
__all__ = [
|
||||
"ConstraintRecord",
|
||||
@@ -16,4 +17,6 @@ __all__ = [
|
||||
"filter_by_coverage",
|
||||
"normalize_scores",
|
||||
"process_gnomad_constraint",
|
||||
"load_to_duckdb",
|
||||
"query_constrained_genes",
|
||||
]
|
||||
|
||||
100
src/usher_pipeline/evidence/gnomad/load.py
Normal file
100
src/usher_pipeline/evidence/gnomad/load.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Load gnomAD constraint data to DuckDB with provenance tracking."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import polars as pl
|
||||
import structlog
|
||||
|
||||
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def load_to_duckdb(
|
||||
df: pl.DataFrame,
|
||||
store: PipelineStore,
|
||||
provenance: ProvenanceTracker,
|
||||
description: str = ""
|
||||
) -> None:
|
||||
"""Save gnomAD constraint DataFrame to DuckDB with provenance.
|
||||
|
||||
Creates or replaces the gnomad_constraint table (idempotent).
|
||||
Records provenance step with summary statistics.
|
||||
|
||||
Args:
|
||||
df: Processed gnomAD constraint DataFrame with quality_flag and loeuf_normalized
|
||||
store: PipelineStore instance for DuckDB persistence
|
||||
provenance: ProvenanceTracker instance for metadata recording
|
||||
description: Optional description for checkpoint metadata
|
||||
"""
|
||||
logger.info("gnomad_load_start", row_count=len(df))
|
||||
|
||||
# Calculate summary statistics for provenance
|
||||
measured_count = df.filter(pl.col("quality_flag") == "measured").height
|
||||
incomplete_count = df.filter(pl.col("quality_flag") == "incomplete_coverage").height
|
||||
no_data_count = df.filter(pl.col("quality_flag") == "no_data").height
|
||||
null_loeuf_count = df.filter(pl.col("loeuf").is_null()).height
|
||||
|
||||
# Save to DuckDB with CREATE OR REPLACE (idempotent)
|
||||
store.save_dataframe(
|
||||
df=df,
|
||||
table_name="gnomad_constraint",
|
||||
description=description or "gnomAD v4.1 constraint metrics with quality flags and normalized LOEUF scores",
|
||||
replace=True
|
||||
)
|
||||
|
||||
# Record provenance step with details
|
||||
provenance.record_step("load_gnomad_constraint", {
|
||||
"row_count": len(df),
|
||||
"measured_count": measured_count,
|
||||
"incomplete_count": incomplete_count,
|
||||
"no_data_count": no_data_count,
|
||||
"null_loeuf_count": null_loeuf_count,
|
||||
})
|
||||
|
||||
logger.info(
|
||||
"gnomad_load_complete",
|
||||
row_count=len(df),
|
||||
measured=measured_count,
|
||||
incomplete=incomplete_count,
|
||||
no_data=no_data_count,
|
||||
null_loeuf=null_loeuf_count,
|
||||
)
|
||||
|
||||
|
||||
def query_constrained_genes(
|
||||
store: PipelineStore,
|
||||
loeuf_threshold: float = 0.6
|
||||
) -> pl.DataFrame:
|
||||
"""Query highly constrained genes from DuckDB.
|
||||
|
||||
Demonstrates DuckDB query capability and validates GCON-03 interpretation:
|
||||
constrained genes are "important but under-studied" signals, not direct
|
||||
cilia involvement evidence.
|
||||
|
||||
Args:
|
||||
store: PipelineStore instance
|
||||
loeuf_threshold: LOEUF threshold (lower = more constrained)
|
||||
Default: 0.6 (represents genes in lower 40% of LOEUF distribution)
|
||||
|
||||
Returns:
|
||||
DataFrame with constrained genes sorted by LOEUF (most constrained first)
|
||||
Columns: gene_id, gene_symbol, loeuf, pli, quality_flag, loeuf_normalized
|
||||
"""
|
||||
logger.info("gnomad_query_constrained", loeuf_threshold=loeuf_threshold)
|
||||
|
||||
# Query DuckDB: constrained genes with good coverage only
|
||||
df = store.execute_query(
|
||||
"""
|
||||
SELECT gene_id, gene_symbol, loeuf, pli, quality_flag, loeuf_normalized
|
||||
FROM gnomad_constraint
|
||||
WHERE quality_flag = 'measured'
|
||||
AND loeuf < ?
|
||||
ORDER BY loeuf ASC
|
||||
""",
|
||||
params=[loeuf_threshold]
|
||||
)
|
||||
|
||||
logger.info("gnomad_query_complete", result_count=len(df))
|
||||
|
||||
return df
|
||||
Reference in New Issue
Block a user