feat(04-01): implement known gene compilation and ScoringWeights validation

- Create scoring module with OMIM_USHER_GENES (10 genes) and SYSCILIA_SCGS_V2_CORE (28 genes)
- Implement compile_known_genes() returning DataFrame with gene_symbol, source, confidence
- Add load_known_genes_to_duckdb() to persist known genes table
- Add ScoringWeights.validate_sum() method enforcing weight sum constraint (1.0 ± 1e-6)
2026-02-11 20:41:31 +08:00
parent ed21f18a98
commit 0cd2f7c9dd
3 changed files with 169 additions and 0 deletions
--- a/src/usher_pipeline/config/schema.py
+++ b/src/usher_pipeline/config/schema.py
@@ -70,6 +70,29 @@ class ScoringWeights(BaseModel):
        description="Weight for literature evidence",
    )

+    def validate_sum(self) -> None:
+        """
+        Validate that all scoring weights sum to 1.0.
+
+        Raises:
+            ValueError: If the weights do not sum to 1.0 (within 1e-6
+                tolerance), including when the sum is NaN.
+
+        Notes:
+            - Tolerance of 1e-6 accounts for floating point precision
+            - Should be called before using weights in scoring calculations
+        """
+        total = (
+            self.gnomad
+            + self.expression
+            + self.annotation
+            + self.localization
+            + self.animal_model
+            + self.literature
+        )
+
+        # Phrased as "not within tolerance" instead of "outside tolerance":
+        # every ordering comparison against NaN is False, so the form
+        # `abs(total - 1.0) > 1e-6` would let a NaN sum pass silently.
+        if not abs(total - 1.0) <= 1e-6:
+            raise ValueError(f"Scoring weights must sum to 1.0, got {total:.6f}")
+

 class APIConfig(BaseModel):
    """Configuration for API clients."""
--- a/src/usher_pipeline/scoring/__init__.py
+++ b/src/usher_pipeline/scoring/__init__.py
@@ -0,0 +1,23 @@
+"""Multi-evidence scoring and known gene compilation for cilia/Usher syndrome genes."""
+
+# Known-gene reference sets and the helpers that compile and persist them.
+from usher_pipeline.scoring.known_genes import (
+    OMIM_USHER_GENES,
+    SYSCILIA_SCGS_V2_CORE,
+    compile_known_genes,
+    load_known_genes_to_duckdb,
+)
+# Helpers re-exported from the integration submodule.
+# NOTE(review): usher_pipeline.scoring.integration is not added alongside this
+# package file -- confirm the module exists, otherwise importing
+# usher_pipeline.scoring fails with ImportError.
+from usher_pipeline.scoring.integration import (
+    join_evidence_layers,
+    compute_composite_scores,
+    persist_scored_genes,
+)
+
+# Names exposed as the package's public API.
+__all__ = [
+    "OMIM_USHER_GENES",
+    "SYSCILIA_SCGS_V2_CORE",
+    "compile_known_genes",
+    "load_known_genes_to_duckdb",
+    "join_evidence_layers",
+    "compute_composite_scores",
+    "persist_scored_genes",
+]
--- a/src/usher_pipeline/scoring/known_genes.py
+++ b/src/usher_pipeline/scoring/known_genes.py
@@ -0,0 +1,123 @@
+"""Known cilia and Usher syndrome gene compilation."""
+
+import polars as pl
+from usher_pipeline.persistence.duckdb_store import PipelineStore
+
+# High-confidence Usher syndrome disease genes, taken from the Usher syndrome
+# entries of the OMIM database (omim.org). Every gene listed here is a
+# well-established cause of Usher syndrome. Stored as a frozenset so the
+# reference set is immutable and membership tests are O(1).
+OMIM_USHER_GENES = frozenset({
+    # Type 1 -- MYO7A: USH1B, USH1C: USH1C (harmonin), CDH23: USH1D,
+    #           PCDH15: USH1F, USH1G: USH1G (SANS), CIB2: USH1J
+    "MYO7A", "USH1C", "CDH23", "PCDH15", "USH1G", "CIB2",
+    # Type 2 -- USH2A: USH2A, ADGRV1: USH2C (GPR98), WHRN: USH2D (whirlin)
+    "USH2A", "ADGRV1", "WHRN",
+    # Type 3 -- CLRN1: USH3A
+    "CLRN1",
+})
+
+# Core ciliary genes: a curated subset of the SYSCILIA Gold Standard (SCGS) v2.
+# Source: van Dam et al. (2021) MBoC - DOI: 10.1091/mbc.E21-05-0226
+# The full SCGS v2 list holds 686 genes; only ~30 well-characterized ciliary
+# components are kept here, to serve as positive controls. The complete list
+# is in the publication's supplementary data.
+# Future enhancement: implement fetch_scgs_v2() to download full gene set.
+SYSCILIA_SCGS_V2_CORE = frozenset({
+    # Intraflagellar transport -- IFT88, IFT172: IFT-B core; IFT140: IFT-A core
+    "IFT88", "IFT140", "IFT172",
+    # BBSome
+    "BBS1", "BBS2", "BBS4", "BBS5", "BBS7", "BBS9", "BBS10",
+    # Transition zone
+    "RPGRIP1L", "CEP290",
+    # Ciliary membrane
+    "ARL13B", "INPP5E",
+    # MKS/JBTS
+    "TMEM67", "CC2D2A", "TMEM216", "TMEM231", "TMEM138",
+    # Nephronophthisis
+    "NPHP1", "NPHP3", "NPHP4",
+    # Retinal ciliopathy
+    "RPGR",
+    # Centriole/basal body
+    "CEP164",
+    # OFD syndrome
+    "OFD1",
+    # Meckel syndrome
+    "MKS1",
+    # Tectonic complex
+    "TCTN1", "TCTN2",
+})
+
+
+def compile_known_genes() -> pl.DataFrame:
+    """
+    Compile known cilia/Usher genes into a structured DataFrame.
+
+    Combines OMIM Usher syndrome genes and SYSCILIA SCGS v2 core genes
+    into a single reference set for exclusion filtering and positive
+    control validation.
+
+    Returns:
+        DataFrame with columns:
+        - gene_symbol (str): Gene symbol
+        - source (str): "omim_usher" or "syscilia_scgs_v2"
+        - confidence (str): "HIGH" for all entries in this curated set
+
+    Notes:
+        - Rows are ordered by source (OMIM first, then SYSCILIA) and
+          alphabetically by gene_symbol within each source, so the output
+          is identical from run to run
+        - Genes appearing in both lists will have two rows (one per source)
+        - De-duplication is NOT performed on gene_symbol to preserve provenance
+        - Total rows = len(OMIM_USHER_GENES) + len(SYSCILIA_SCGS_V2_CORE)
+    """
+    gene_sets = (
+        ("omim_usher", OMIM_USHER_GENES),
+        ("syscilia_scgs_v2", SYSCILIA_SCGS_V2_CORE),
+    )
+
+    # frozenset iteration order follows string hashing, which is randomized
+    # per interpreter process; sorting pins the row order so the compiled
+    # (and persisted) table is reproducible.
+    frames = [
+        pl.DataFrame({
+            "gene_symbol": sorted(genes),
+            "source": [source] * len(genes),
+            "confidence": ["HIGH"] * len(genes),
+        })
+        for source, genes in gene_sets
+    ]
+
+    return pl.concat(frames)
+
+
+def load_known_genes_to_duckdb(store: PipelineStore) -> int:
+    """
+    Persist the compiled known-gene reference table through the given store.
+
+    Args:
+        store: PipelineStore instance that receives the table
+
+    Returns:
+        Count of distinct gene_symbol values in the saved table
+
+    Notes:
+        - Saved under the table name "known_cilia_genes"
+        - save_dataframe is invoked with replace=True
+    """
+    gene_table = compile_known_genes()
+    store.save_dataframe(
+        df=gene_table,
+        table_name="known_cilia_genes",
+        description="Known cilia and Usher syndrome genes for positive control validation",
+        replace=True,
+    )
+
+    # A symbol present under more than one source spans several rows but is
+    # counted once here.
+    return gene_table["gene_symbol"].n_unique()