feat(04-01): implement known gene compilation and ScoringWeights validation

- Create scoring module with OMIM_USHER_GENES (10 genes) and SYSCILIA_SCGS_V2_CORE (28 genes)
- Implement compile_known_genes() returning DataFrame with gene_symbol, source, confidence
- Add load_known_genes_to_duckdb() to persist known genes table
- Add ScoringWeights.validate_sum() method enforcing weight sum constraint (1.0 ± 1e-6)
This commit is contained in:
2026-02-11 20:41:31 +08:00
parent ed21f18a98
commit 0cd2f7c9dd
3 changed files with 169 additions and 0 deletions

View File

@@ -70,6 +70,29 @@ class ScoringWeights(BaseModel):
description="Weight for literature evidence",
)
def validate_sum(self) -> None:
    """
    Validate that all scoring weights sum to 1.0.

    The six evidence-layer weights (gnomad, expression, annotation,
    localization, animal_model, literature) are added together and the
    total is compared against 1.0.

    Raises:
        ValueError: If the weights do not sum to 1.0 (within 1e-6
            tolerance), including the case where the sum is NaN.

    Notes:
        - Tolerance of 1e-6 accounts for floating point precision
        - Should be called before using weights in scoring calculations
    """
    total = (
        self.gnomad
        + self.expression
        + self.annotation
        + self.localization
        + self.animal_model
        + self.literature
    )
    # Expressed as "not (<= tolerance)" instead of "> tolerance": every
    # ordering comparison against NaN is False, so the ">" form would let a
    # NaN total slip through as if it were valid.
    if not abs(total - 1.0) <= 1e-6:
        raise ValueError(f"Scoring weights must sum to 1.0, got {total:.6f}")
class APIConfig(BaseModel):
"""Configuration for API clients."""

View File

@@ -0,0 +1,23 @@
"""Multi-evidence scoring and known gene compilation for cilia/Usher syndrome genes."""
from usher_pipeline.scoring.known_genes import (
OMIM_USHER_GENES,
SYSCILIA_SCGS_V2_CORE,
compile_known_genes,
load_known_genes_to_duckdb,
)
from usher_pipeline.scoring.integration import (
join_evidence_layers,
compute_composite_scores,
persist_scored_genes,
)
# Public names re-exported by this package; mirrors the two import groups above.
__all__ = [
    # Imported from usher_pipeline.scoring.known_genes
    "OMIM_USHER_GENES",
    "SYSCILIA_SCGS_V2_CORE",
    "compile_known_genes",
    "load_known_genes_to_duckdb",
    # Imported from usher_pipeline.scoring.integration
    "join_evidence_layers",
    "compute_composite_scores",
    "persist_scored_genes",
]

View File

@@ -0,0 +1,123 @@
"""Known cilia and Usher syndrome gene compilation."""
import polars as pl
from usher_pipeline.persistence.duckdb_store import PipelineStore
# Usher syndrome disease genes taken from the OMIM database (omim.org).
# Treated as high-confidence, well-established causes of Usher syndrome.
# Each symbol is annotated with the Usher subtype locus it is listed under.
OMIM_USHER_GENES = frozenset({
    # Usher syndrome type 1
    "MYO7A",   # USH1B
    "USH1C",   # USH1C (harmonin)
    "CDH23",   # USH1D
    "PCDH15",  # USH1F
    "USH1G",   # USH1G (SANS)
    "CIB2",    # USH1J
    # Usher syndrome type 2
    "USH2A",   # USH2A
    "ADGRV1",  # USH2C (GPR98)
    "WHRN",    # USH2D (whirlin)
    # Usher syndrome type 3
    "CLRN1",   # USH3A
})
# Core subset of the SYSCILIA Gold Standard (SCGS) v2 ciliary gene list.
# Source: van Dam et al. (2021) MBoC - DOI: 10.1091/mbc.E21-05-0226
# The full SCGS v2 contains 686 genes; the 28 symbols below are a curated
# subset of well-characterized ciliary components used as positive controls.
# The complete list is in the publication's supplementary data.
# Future enhancement: implement fetch_scgs_v2() to download the full gene set.
SYSCILIA_SCGS_V2_CORE = frozenset({
    # Intraflagellar transport
    "IFT88",     # IFT-B core
    "IFT140",    # IFT-A core
    "IFT172",    # IFT-B core
    # BBSome
    "BBS1",
    "BBS2",
    "BBS4",
    "BBS5",
    "BBS7",
    "BBS9",
    "BBS10",
    # Transition zone
    "RPGRIP1L",
    "CEP290",
    # Ciliary membrane
    "ARL13B",
    "INPP5E",
    # MKS/JBTS
    "TMEM67",
    "CC2D2A",
    "TMEM216",
    "TMEM231",
    "TMEM138",
    # Nephronophthisis
    "NPHP1",
    "NPHP3",
    "NPHP4",
    # Tectonic complex
    "TCTN1",
    "TCTN2",
    # Other ciliopathy / basal body genes
    "RPGR",      # Retinal ciliopathy
    "CEP164",    # Centriole/basal body
    "OFD1",      # OFD syndrome
    "MKS1",      # Meckel syndrome
})
def _known_gene_frame(gene_symbols, source: str) -> pl.DataFrame:
    """Build the gene_symbol/source/confidence rows for one curated gene set."""
    # Sort so the row order does not depend on set iteration order, which
    # for strings can differ from one interpreter run to the next.
    symbols = sorted(gene_symbols)
    return pl.DataFrame({
        "gene_symbol": symbols,
        "source": [source] * len(symbols),
        "confidence": ["HIGH"] * len(symbols),
    })


def compile_known_genes() -> pl.DataFrame:
    """
    Compile known cilia/Usher genes into a structured DataFrame.

    Combines OMIM Usher syndrome genes and SYSCILIA SCGS v2 core genes
    into a single reference set, intended for exclusion filtering and
    positive control validation.

    Returns:
        DataFrame with columns:
        - gene_symbol (str): Gene symbol
        - source (str): "omim_usher" or "syscilia_scgs_v2"
        - confidence (str): "HIGH" for all entries in this curated set

    Notes:
        - OMIM rows come first, then SYSCILIA rows; within each source the
          rows are sorted alphabetically by gene_symbol, so the output is
          identical across runs
        - Genes appearing in both lists will have two rows (one per source)
        - De-duplication is NOT performed on gene_symbol to preserve provenance
        - Total rows = len(OMIM_USHER_GENES) + len(SYSCILIA_SCGS_V2_CORE)
    """
    return pl.concat([
        _known_gene_frame(OMIM_USHER_GENES, "omim_usher"),
        _known_gene_frame(SYSCILIA_SCGS_V2_CORE, "syscilia_scgs_v2"),
    ])
def load_known_genes_to_duckdb(store: PipelineStore) -> int:
    """
    Persist the compiled known cilia/Usher gene table through the store.

    Args:
        store: PipelineStore instance used to write the table

    Returns:
        Number of distinct gene symbols in the saved DataFrame (this can be
        smaller than the row count when a gene is listed under both sources)

    Notes:
        - Table name: known_cilia_genes
        - save_dataframe is called with replace=True; presumably this
          overwrites an existing table (confirm against PipelineStore)
    """
    known_genes = compile_known_genes()

    store.save_dataframe(
        df=known_genes,
        table_name="known_cilia_genes",
        description="Known cilia and Usher syndrome genes for positive control validation",
        replace=True,
    )

    # Report distinct symbols rather than rows.
    return known_genes["gene_symbol"].n_unique()