From 0cd2f7c9dd7327bb337fa842b9c29d01299dd1d2 Mon Sep 17 00:00:00 2001
From: gbanyan
Date: Wed, 11 Feb 2026 20:41:31 +0800
Subject: [PATCH] feat(04-01): implement known gene compilation and ScoringWeights validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Create scoring module with OMIM_USHER_GENES (10 genes) and SYSCILIA_SCGS_V2_CORE (28 genes)
- Implement compile_known_genes() returning DataFrame with gene_symbol, source, confidence
- Add load_known_genes_to_duckdb() to persist known genes table
- Add ScoringWeights.validate_sum() method enforcing weight sum constraint (1.0 ± 1e-6)
---
 src/usher_pipeline/config/schema.py       |  23 ++++
 src/usher_pipeline/scoring/__init__.py    |  23 ++++
 src/usher_pipeline/scoring/known_genes.py | 123 ++++++++++++++++++++++
 3 files changed, 169 insertions(+)
 create mode 100644 src/usher_pipeline/scoring/__init__.py
 create mode 100644 src/usher_pipeline/scoring/known_genes.py

diff --git a/src/usher_pipeline/config/schema.py b/src/usher_pipeline/config/schema.py
index 25e6478..843e060 100644
--- a/src/usher_pipeline/config/schema.py
+++ b/src/usher_pipeline/config/schema.py
@@ -70,6 +70,29 @@ class ScoringWeights(BaseModel):
         description="Weight for literature evidence",
     )
 
+    def validate_sum(self) -> None:
+        """
+        Validate that all scoring weights sum to 1.0.
+
+        Raises:
+            ValueError: If weights do not sum to 1.0 (within 1e-6 tolerance)
+
+        Notes:
+            - Tolerance of 1e-6 accounts for floating point precision
+            - Should be called before using weights in scoring calculations
+        """
+        total = (
+            self.gnomad
+            + self.expression
+            + self.annotation
+            + self.localization
+            + self.animal_model
+            + self.literature
+        )
+
+        if abs(total - 1.0) > 1e-6:
+            raise ValueError(f"Scoring weights must sum to 1.0, got {total:.6f}")
+
 
 class APIConfig(BaseModel):
     """Configuration for API clients."""
diff --git a/src/usher_pipeline/scoring/__init__.py b/src/usher_pipeline/scoring/__init__.py
new file mode 100644
index 0000000..be28d1d
--- /dev/null
+++ b/src/usher_pipeline/scoring/__init__.py
@@ -0,0 +1,23 @@
+"""Multi-evidence scoring and known gene compilation for cilia/Usher syndrome genes."""
+
+from usher_pipeline.scoring.known_genes import (
+    OMIM_USHER_GENES,
+    SYSCILIA_SCGS_V2_CORE,
+    compile_known_genes,
+    load_known_genes_to_duckdb,
+)
+from usher_pipeline.scoring.integration import (
+    join_evidence_layers,
+    compute_composite_scores,
+    persist_scored_genes,
+)
+
+__all__ = [
+    "OMIM_USHER_GENES",
+    "SYSCILIA_SCGS_V2_CORE",
+    "compile_known_genes",
+    "load_known_genes_to_duckdb",
+    "join_evidence_layers",
+    "compute_composite_scores",
+    "persist_scored_genes",
+]
diff --git a/src/usher_pipeline/scoring/known_genes.py b/src/usher_pipeline/scoring/known_genes.py
new file mode 100644
index 0000000..d786c32
--- /dev/null
+++ b/src/usher_pipeline/scoring/known_genes.py
@@ -0,0 +1,123 @@
+"""Known cilia and Usher syndrome gene compilation."""
+
+import polars as pl
+from usher_pipeline.persistence.duckdb_store import PipelineStore
+
+# OMIM Usher syndrome genes (high-confidence disease genes)
+# Source: OMIM database (omim.org) - Usher syndrome entries
+# These genes are well-established causes of Usher syndrome
+OMIM_USHER_GENES = frozenset([
+    "MYO7A",  # USH1B
+    "USH1C",  # USH1C (harmonin)
+    "CDH23",  # USH1D
+    "PCDH15",  # USH1F
+    "USH1G",  # USH1G (SANS)
+    "CIB2",  # USH1J
+    "USH2A",  # USH2A
+    "ADGRV1",  # USH2C (GPR98)
+    "WHRN",  # USH2D (whirlin)
+    "CLRN1",  # USH3A
+])
+
+# SYSCILIA Gold Standard (SCGS) v2 - Core ciliary genes subset
+# Source: Vasquez, van Dam & Wheway (2021) MBoC - DOI: 10.1091/mbc.E21-05-0226
+# Full SCGS v2 contains 686 genes; this is a curated ~30 gene subset of
+# well-characterized ciliary components used as positive controls.
+# For complete list, see publication supplementary data.
+# Future enhancement: implement fetch_scgs_v2() to download full gene set.
+SYSCILIA_SCGS_V2_CORE = frozenset([
+    "IFT88",  # IFT-B core
+    "IFT140",  # IFT-A core
+    "IFT172",  # IFT-B core
+    "BBS1",  # BBSome
+    "BBS2",  # BBSome
+    "BBS4",  # BBSome
+    "BBS5",  # BBSome
+    "BBS7",  # BBSome
+    "BBS9",  # BBSome
+    "BBS10",  # BBS chaperonin-like (BBSome assembly)
+    "RPGRIP1L",  # Transition zone
+    "CEP290",  # Transition zone
+    "ARL13B",  # Ciliary membrane
+    "INPP5E",  # Ciliary membrane
+    "TMEM67",  # MKS/JBTS
+    "CC2D2A",  # MKS/JBTS
+    "NPHP1",  # Nephronophthisis
+    "NPHP3",  # Nephronophthisis
+    "NPHP4",  # Nephronophthisis
+    "RPGR",  # Retinal ciliopathy
+    "CEP164",  # Centriole/basal body
+    "OFD1",  # OFD syndrome
+    "MKS1",  # Meckel syndrome
+    "TCTN1",  # Tectonic complex
+    "TCTN2",  # Tectonic complex
+    "TMEM216",  # MKS/JBTS
+    "TMEM231",  # MKS/JBTS
+    "TMEM138",  # MKS/JBTS
+])
+
+
+def compile_known_genes() -> pl.DataFrame:
+    """
+    Compile known cilia/Usher genes into a structured DataFrame.
+
+    Combines OMIM Usher syndrome genes and SYSCILIA SCGS v2 core genes
+    into a single reference set for exclusion filtering and positive
+    control validation.
+
+    Returns:
+        DataFrame with columns:
+        - gene_symbol (str): Gene symbol
+        - source (str): "omim_usher" or "syscilia_scgs_v2"
+        - confidence (str): "HIGH" for all entries in this curated set
+
+    Notes:
+        - Genes appearing in both lists will have two rows (one per source)
+        - De-duplication is NOT performed on gene_symbol to preserve provenance
+        - Total rows = len(OMIM_USHER_GENES) + len(SYSCILIA_SCGS_V2_CORE)
+    """
+    # Create DataFrames for each gene set (sorted: frozenset order varies between runs)
+    omim_df = pl.DataFrame({
+        "gene_symbol": sorted(OMIM_USHER_GENES),
+        "source": ["omim_usher"] * len(OMIM_USHER_GENES),
+        "confidence": ["HIGH"] * len(OMIM_USHER_GENES),
+    })
+
+    syscilia_df = pl.DataFrame({
+        "gene_symbol": sorted(SYSCILIA_SCGS_V2_CORE),
+        "source": ["syscilia_scgs_v2"] * len(SYSCILIA_SCGS_V2_CORE),
+        "confidence": ["HIGH"] * len(SYSCILIA_SCGS_V2_CORE),
+    })
+
+    # Concatenate both gene sets
+    combined = pl.concat([omim_df, syscilia_df])
+
+    return combined
+
+
+def load_known_genes_to_duckdb(store: PipelineStore) -> int:
+    """
+    Load known cilia/Usher genes into DuckDB.
+
+    Args:
+        store: PipelineStore instance for database access
+
+    Returns:
+        Number of unique gene symbols loaded
+
+    Notes:
+        - Table name: known_cilia_genes
+        - Replaces existing table if present (CREATE OR REPLACE pattern)
+    """
+    df = compile_known_genes()
+
+    store.save_dataframe(
+        df=df,
+        table_name="known_cilia_genes",
+        description="Known cilia and Usher syndrome genes for positive control validation",
+        replace=True,
+    )
+
+    # Return count of unique gene symbols
+    unique_count = df["gene_symbol"].n_unique()
+    return unique_count