Files
usher-exploring/src/usher_pipeline/evidence/gnomad/models.py
gbanyan 174c4af02d feat(02-01): add gnomAD transform pipeline and comprehensive tests
- Implement filter_by_coverage with quality_flag categorization (measured/incomplete_coverage/no_data)
- Add normalize_scores with LOEUF inversion (lower LOEUF = higher score)
- NULL preservation throughout pipeline (unknown != zero constraint)
- process_gnomad_constraint end-to-end pipeline function
- 15 comprehensive unit tests covering edge cases:
  - NULL handling and preservation
  - Coverage filtering without dropping genes
  - Normalization bounds and inversion
  - Mixed type handling for robust parsing
- Fix column mapping to handle gnomAD v4.x loeuf/loeuf_upper duplication
- All existing tests continue to pass
2026-02-11 18:14:41 +08:00

62 lines
2.6 KiB
Python

"""Data models for gnomAD constraint metrics."""
from pydantic import BaseModel
# Download location of the gnomAD v4.1 constraint metrics table (TSV),
# assembled from its path segments on the public gnomAD storage bucket.
GNOMAD_CONSTRAINT_URL = "/".join(
    (
        "https://storage.googleapis.com",
        "gcp-public-data--gnomad",
        "release",
        "4.1",
        "constraint",
        "gnomad.v4.1.constraint_metrics.tsv",
    )
)
# Canonical field name -> candidate source column names, covering the header
# differences between gnomAD releases:
#   v2.1.1: gene, transcript, pLI, oe_lof_upper (LOEUF upper CI),
#           mean_proportion_covered_bases
#   v4.x:   gene, transcript, mane_select, lof.pLI, lof.oe_ci.upper (LOEUF),
#           mean_proportion_covered
# In gnomAD data the column labelled "upper" IS the LOEUF value we want
# (upper bound of the observed/expected ratio).
COLUMN_VARIANTS: dict[str, list[str]] = dict(
    # NOTE(review): "gene" is a candidate for both gene_id and gene_symbol --
    # confirm how the consumer resolves the overlap.
    gene_id=["gene", "gene_id"],
    gene_symbol=["gene_symbol", "gene"],
    transcript=["transcript", "canonical_transcript", "mane_select"],
    pli=["pLI", "lof.pLI", "pli"],
    # LOEUF lives in the "upper" column (oe_lof_upper = observed/expected
    # upper bound).
    loeuf=["lof.oe_ci.upper", "oe_lof_upper", "oe_lof", "loeuf"],
    # loeuf_upper is typically identical to loeuf in gnomAD data (the upper CI
    # is what gets reported), hence the candidate names shared with "loeuf".
    loeuf_upper=["lof.oe_ci.upper", "oe_lof_upper_ci", "oe_lof_upper"],
    mean_depth=["mean_coverage", "mean_depth", "mean_cov"],
    cds_covered_pct=[
        "mean_proportion_covered_bases",
        "mean_proportion_covered",
        "cds_covered_pct",
    ],
)
class ConstraintRecord(BaseModel):
    """One gene's worth of gnomAD constraint metrics.

    The three identifier fields are required. Every metric defaults to
    ``None`` and ``quality_flag`` defaults to ``"no_data"``.

    IMPORTANT: ``None`` stands for missing data and must be kept as ``None``.
    Never replace it with 0.0 -- "unknown" is semantically different from
    "zero constraint".

    Attributes:
        gene_id: Ensembl gene ID (e.g., ENSG00000...).
        gene_symbol: HGNC gene symbol.
        transcript: Canonical transcript ID.
        pli: Probability of being loss-of-function intolerant; ``None`` when
            there is no estimate.
        loeuf: Loss-of-function observed/expected upper bound fraction;
            ``None`` when there is no estimate.
        loeuf_upper: Upper bound of the LOEUF confidence interval.
        mean_depth: Mean exome sequencing depth across the CDS.
        cds_covered_pct: Fraction (0.0-1.0) of CDS bases with adequate
            coverage.
        quality_flag: Data quality indicator, one of "measured",
            "incomplete_coverage", or "no_data".
        loeuf_normalized: LOEUF rescaled to 0-1 and inverted, so a higher
            value means a more constrained gene.
    """

    # Identifiers (required).
    gene_id: str
    gene_symbol: str
    transcript: str

    # Raw constraint and coverage metrics; None = no estimate available.
    pli: float | None = None
    loeuf: float | None = None
    loeuf_upper: float | None = None
    mean_depth: float | None = None
    cds_covered_pct: float | None = None

    # Derived fields; start in the "nothing known yet" state.
    quality_flag: str = "no_data"
    loeuf_normalized: float | None = None