feat(04-01): implement known gene compilation and ScoringWeights validation
- Create scoring module with OMIM_USHER_GENES (10 genes) and SYSCILIA_SCGS_V2_CORE (28 genes)
- Implement compile_known_genes() returning a DataFrame with gene_symbol, source, and confidence columns
- Add load_known_genes_to_duckdb() to persist the known-genes table
- Add ScoringWeights.validate_sum() method enforcing the weight-sum constraint (1.0 ± 1e-6)
This commit is contained in:
@@ -70,6 +70,29 @@ class ScoringWeights(BaseModel):
|
|||||||
description="Weight for literature evidence",
|
description="Weight for literature evidence",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def validate_sum(self) -> None:
    """
    Validate that all scoring weights sum to 1.0.

    Raises:
        ValueError: If weights do not sum to 1.0 (within 1e-6 tolerance),
            including when the sum is NaN

    Notes:
        - Tolerance of 1e-6 accounts for floating point precision
        - Should be called before using weights in scoring calculations
    """
    total = (
        self.gnomad
        + self.expression
        + self.annotation
        + self.localization
        + self.animal_model
        + self.literature
    )

    # Phrased as "not (diff <= tol)" instead of "diff > tol": any comparison
    # involving NaN is False, so the negated form also rejects a NaN total
    # rather than letting it pass validation silently.
    if not abs(total - 1.0) <= 1e-6:
        raise ValueError(f"Scoring weights must sum to 1.0, got {total:.6f}")
|
||||||
|
|
||||||
|
|
||||||
class APIConfig(BaseModel):
|
class APIConfig(BaseModel):
|
||||||
"""Configuration for API clients."""
|
"""Configuration for API clients."""
|
||||||
|
|||||||
23
src/usher_pipeline/scoring/__init__.py
Normal file
23
src/usher_pipeline/scoring/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
"""Multi-evidence scoring and known gene compilation for cilia/Usher syndrome genes."""
|
||||||
|
|
||||||
|
from usher_pipeline.scoring.known_genes import (
|
||||||
|
OMIM_USHER_GENES,
|
||||||
|
SYSCILIA_SCGS_V2_CORE,
|
||||||
|
compile_known_genes,
|
||||||
|
load_known_genes_to_duckdb,
|
||||||
|
)
|
||||||
|
from usher_pipeline.scoring.integration import (
|
||||||
|
join_evidence_layers,
|
||||||
|
compute_composite_scores,
|
||||||
|
persist_scored_genes,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"OMIM_USHER_GENES",
|
||||||
|
"SYSCILIA_SCGS_V2_CORE",
|
||||||
|
"compile_known_genes",
|
||||||
|
"load_known_genes_to_duckdb",
|
||||||
|
"join_evidence_layers",
|
||||||
|
"compute_composite_scores",
|
||||||
|
"persist_scored_genes",
|
||||||
|
]
|
||||||
123
src/usher_pipeline/scoring/known_genes.py
Normal file
123
src/usher_pipeline/scoring/known_genes.py
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
"""Known cilia and Usher syndrome gene compilation."""
|
||||||
|
|
||||||
|
import polars as pl
|
||||||
|
from usher_pipeline.persistence.duckdb_store import PipelineStore
|
||||||
|
|
||||||
|
# Usher syndrome disease genes taken from OMIM (omim.org, Usher syndrome
# entries). These are well-established, high-confidence causes of Usher
# syndrome; the trailing comment on each symbol names the Usher subtype.
OMIM_USHER_GENES = frozenset({
    # Usher type 1
    "MYO7A",   # USH1B
    "USH1C",   # USH1C (harmonin)
    "CDH23",   # USH1D
    "PCDH15",  # USH1F
    "USH1G",   # USH1G (SANS)
    "CIB2",    # USH1J
    # Usher type 2
    "USH2A",   # USH2A
    "ADGRV1",  # USH2C (GPR98)
    "WHRN",    # USH2D (whirlin)
    # Usher type 3
    "CLRN1",   # USH3A
})
|
||||||
|
|
||||||
|
# SYSCILIA Gold Standard (SCGS) v2 - Core ciliary genes subset
# Source: van Dam et al. (2021) MBoC - DOI: 10.1091/mbc.E21-05-0226
# Full SCGS v2 contains 686 genes; this is a curated 28-gene subset of
# well-characterized ciliary components used as positive controls.
# For complete list, see publication supplementary data.
# Future enhancement: implement fetch_scgs_v2() to download full gene set.
SYSCILIA_SCGS_V2_CORE = frozenset([
    "IFT88",     # IFT-B core
    "IFT140",    # IFT-A core
    "IFT172",    # IFT-B core
    "BBS1",      # BBSome
    "BBS2",      # BBSome
    "BBS4",      # BBSome
    "BBS5",      # BBSome
    "BBS7",      # BBSome
    "BBS9",      # BBSome
    "BBS10",     # BBS chaperonin-like protein (BBSome assembly, not a BBSome subunit)
    "RPGRIP1L",  # Transition zone
    "CEP290",    # Transition zone
    "ARL13B",    # Ciliary membrane
    "INPP5E",    # Ciliary membrane
    "TMEM67",    # MKS/JBTS
    "CC2D2A",    # MKS/JBTS
    "NPHP1",     # Nephronophthisis
    "NPHP3",     # Nephronophthisis
    "NPHP4",     # Nephronophthisis
    "RPGR",      # Retinal ciliopathy
    "CEP164",    # Centriole/basal body
    "OFD1",      # OFD syndrome
    "MKS1",      # Meckel syndrome
    "TCTN1",     # Tectonic complex
    "TCTN2",     # Tectonic complex
    "TMEM216",   # MKS/JBTS
    "TMEM231",   # MKS/JBTS
    "TMEM138",   # MKS/JBTS
])
|
||||||
|
|
||||||
|
|
||||||
|
def compile_known_genes() -> pl.DataFrame:
    """
    Compile known cilia/Usher genes into a structured DataFrame.

    Combines OMIM Usher syndrome genes and SYSCILIA SCGS v2 core genes
    into a single reference set for exclusion filtering and positive
    control validation.

    Returns:
        DataFrame with columns:
        - gene_symbol (str): Gene symbol
        - source (str): "omim_usher" or "syscilia_scgs_v2"
        - confidence (str): "HIGH" for all entries in this curated set

    Notes:
        - Genes appearing in both lists will have two rows (one per source)
        - De-duplication is NOT performed on gene_symbol to preserve provenance
        - Total rows = len(OMIM_USHER_GENES) + len(SYSCILIA_SCGS_V2_CORE)
        - Rows are ordered by source (OMIM first, then SYSCILIA) and
          alphabetically by gene_symbol within each source
    """
    gene_sets = (
        ("omim_usher", OMIM_USHER_GENES),
        ("syscilia_scgs_v2", SYSCILIA_SCGS_V2_CORE),
    )

    frames = []
    for source, genes in gene_sets:
        # Iteration order of a frozenset of strings follows string hashes,
        # which vary between interpreter runs; sorting keeps the row order
        # of the compiled table reproducible.
        symbols = sorted(genes)
        frames.append(
            pl.DataFrame({
                "gene_symbol": symbols,
                "source": [source] * len(symbols),
                "confidence": ["HIGH"] * len(symbols),
            })
        )

    # Stack the per-source frames vertically; both share the same schema.
    return pl.concat(frames)
|
||||||
|
|
||||||
|
|
||||||
|
def load_known_genes_to_duckdb(store: PipelineStore) -> int:
    """
    Persist the compiled known cilia/Usher gene set to DuckDB.

    Args:
        store: PipelineStore instance used to write the table

    Returns:
        Number of distinct gene symbols in the saved data

    Notes:
        - Written under the table name "known_cilia_genes"
        - Saved with replace=True, so an existing table is replaced
    """
    known_genes = compile_known_genes()

    store.save_dataframe(
        df=known_genes,
        table_name="known_cilia_genes",
        description="Known cilia and Usher syndrome genes for positive control validation",
        replace=True,
    )

    # Report distinct symbols rather than the row count.
    return known_genes["gene_symbol"].n_unique()
|
||||||
Reference in New Issue
Block a user