feat(04-01): implement known gene compilation and ScoringWeights validation
- Create scoring module with OMIM_USHER_GENES (10 genes) and SYSCILIA_SCGS_V2_CORE (28 genes)
- Implement compile_known_genes() returning DataFrame with gene_symbol, source, confidence
- Add load_known_genes_to_duckdb() to persist known genes table
- Add ScoringWeights.validate_sum() method enforcing weight sum constraint (1.0 ± 1e-6)
This commit is contained in:
@@ -70,6 +70,29 @@ class ScoringWeights(BaseModel):
|
||||
description="Weight for literature evidence",
|
||||
)
|
||||
|
||||
def validate_sum(self) -> None:
    """
    Validate that all scoring weights sum to 1.0.

    The six weights (gnomad, expression, annotation, localization,
    animal_model, literature) are added together and compared with 1.0.

    Raises:
        ValueError: If the weights do not sum to 1.0 within an absolute
            tolerance of 1e-6, or if the sum is NaN.

    Notes:
        - Tolerance of 1e-6 accounts for floating point precision
        - Should be called before using weights in scoring calculations
    """
    # Imported locally so the module's import block does not need to change.
    import math

    total = (
        self.gnomad
        + self.expression
        + self.annotation
        + self.localization
        + self.animal_model
        + self.literature
    )

    # math.isclose() returns False for NaN, so a NaN weight is rejected.
    # A plain `abs(total - 1.0) > 1e-6` check is also False for NaN and would
    # let such a configuration through without any error.
    if not math.isclose(total, 1.0, rel_tol=0.0, abs_tol=1e-6):
        raise ValueError(f"Scoring weights must sum to 1.0, got {total:.6f}")
|
||||
|
||||
|
||||
class APIConfig(BaseModel):
|
||||
"""Configuration for API clients."""
|
||||
|
||||
23
src/usher_pipeline/scoring/__init__.py
Normal file
23
src/usher_pipeline/scoring/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""Multi-evidence scoring and known gene compilation for cilia/Usher syndrome genes."""
|
||||
|
||||
from usher_pipeline.scoring.known_genes import (
|
||||
OMIM_USHER_GENES,
|
||||
SYSCILIA_SCGS_V2_CORE,
|
||||
compile_known_genes,
|
||||
load_known_genes_to_duckdb,
|
||||
)
|
||||
from usher_pipeline.scoring.integration import (
|
||||
join_evidence_layers,
|
||||
compute_composite_scores,
|
||||
persist_scored_genes,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"OMIM_USHER_GENES",
|
||||
"SYSCILIA_SCGS_V2_CORE",
|
||||
"compile_known_genes",
|
||||
"load_known_genes_to_duckdb",
|
||||
"join_evidence_layers",
|
||||
"compute_composite_scores",
|
||||
"persist_scored_genes",
|
||||
]
|
||||
123
src/usher_pipeline/scoring/known_genes.py
Normal file
123
src/usher_pipeline/scoring/known_genes.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""Known cilia and Usher syndrome gene compilation."""
|
||||
|
||||
import polars as pl
|
||||
from usher_pipeline.persistence.duckdb_store import PipelineStore
|
||||
|
||||
# OMIM Usher syndrome genes (high-confidence disease genes)
|
||||
# Source: OMIM database (omim.org) - Usher syndrome entries
|
||||
# These genes are well-established causes of Usher syndrome
|
||||
OMIM_USHER_GENES = frozenset({
    # Usher syndrome type 1 loci
    "MYO7A",   # USH1B
    "USH1C",   # USH1C (harmonin)
    "CDH23",   # USH1D
    "PCDH15",  # USH1F
    "USH1G",   # USH1G (SANS)
    "CIB2",    # USH1J
    # Usher syndrome type 2 loci
    "USH2A",   # USH2A
    "ADGRV1",  # USH2C (GPR98)
    "WHRN",    # USH2D (whirlin)
    # Usher syndrome type 3 locus
    "CLRN1",   # USH3A
})
|
||||
|
||||
# SYSCILIA Gold Standard (SCGS) v2 - Core ciliary genes subset
|
||||
# Source: van Dam et al. (2021) MBoC - DOI: 10.1091/mbc.E21-05-0226
|
||||
# Full SCGS v2 contains 686 genes; this is a curated ~30 gene subset of
|
||||
# well-characterized ciliary components used as positive controls.
|
||||
# For complete list, see publication supplementary data.
|
||||
# Future enhancement: implement fetch_scgs_v2() to download full gene set.
|
||||
SYSCILIA_SCGS_V2_CORE = frozenset({
    # Intraflagellar transport
    "IFT88",     # IFT-B core
    "IFT140",    # IFT-A core
    "IFT172",    # IFT-B core
    # BBSome
    "BBS1",
    "BBS2",
    "BBS4",
    "BBS5",
    "BBS7",
    "BBS9",
    "BBS10",
    # Transition zone
    "RPGRIP1L",
    "CEP290",
    # Ciliary membrane
    "ARL13B",
    "INPP5E",
    # MKS/JBTS
    "TMEM67",
    "CC2D2A",
    "TMEM216",
    "TMEM231",
    "TMEM138",
    # Nephronophthisis
    "NPHP1",
    "NPHP3",
    "NPHP4",
    # Tectonic complex
    "TCTN1",
    "TCTN2",
    # Other well-characterized ciliary genes
    "RPGR",      # Retinal ciliopathy
    "CEP164",    # Centriole/basal body
    "OFD1",      # OFD syndrome
    "MKS1",      # Meckel syndrome
})
|
||||
|
||||
|
||||
def compile_known_genes() -> pl.DataFrame:
    """
    Compile known cilia/Usher genes into a structured DataFrame.

    Combines OMIM Usher syndrome genes and SYSCILIA SCGS v2 core genes
    into a single reference set for exclusion filtering and positive
    control validation.

    Returns:
        DataFrame with columns:
        - gene_symbol (str): Gene symbol
        - source (str): "omim_usher" or "syscilia_scgs_v2"
        - confidence (str): "HIGH" for all entries in this curated set

    Notes:
        - OMIM rows come first, then SYSCILIA rows; within each source the
          rows are sorted by gene_symbol so the output is reproducible
        - Genes appearing in both lists will have two rows (one per source)
        - De-duplication is NOT performed on gene_symbol to preserve provenance
        - Total rows = len(OMIM_USHER_GENES) + len(SYSCILIA_SCGS_V2_CORE)
    """
    # Iterating a frozenset of strings yields an arbitrary order that can
    # change between interpreter runs (string hash randomization). Sorting
    # each set keeps the row order stable from run to run.
    labelled_sets = (
        ("omim_usher", sorted(OMIM_USHER_GENES)),
        ("syscilia_scgs_v2", sorted(SYSCILIA_SCGS_V2_CORE)),
    )

    # One frame per source, each with the same three string columns.
    frames = [
        pl.DataFrame({
            "gene_symbol": symbols,
            "source": [source] * len(symbols),
            "confidence": ["HIGH"] * len(symbols),
        })
        for source, symbols in labelled_sets
    ]

    # Stack the per-source frames vertically.
    return pl.concat(frames)
|
||||
|
||||
|
||||
def load_known_genes_to_duckdb(store: PipelineStore) -> int:
    """
    Persist the compiled known-gene table through the given store.

    Args:
        store: PipelineStore instance for database access

    Returns:
        Number of distinct values in the gene_symbol column of the compiled
        table (not the number of rows written)

    Notes:
        - Table name: known_cilia_genes
        - The table is saved with replace=True
    """
    known_genes = compile_known_genes()

    store.save_dataframe(
        df=known_genes,
        table_name="known_cilia_genes",
        description="Known cilia and Usher syndrome genes for positive control validation",
        replace=True,
    )

    # A gene listed by more than one source is counted once here.
    return known_genes["gene_symbol"].n_unique()
|
||||
Reference in New Issue
Block a user