From 0cd2f7c9dd7327bb337fa842b9c29d01299dd1d2 Mon Sep 17 00:00:00 2001
From: gbanyan <gbanyan.huang@gmail.com>
Date: Wed, 11 Feb 2026 20:41:31 +0800
Subject: [PATCH] feat(04-01): implement known gene compilation and
 ScoringWeights validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Create scoring module with OMIM_USHER_GENES (10 genes) and SYSCILIA_SCGS_V2_CORE (28 genes)
- Implement compile_known_genes() returning DataFrame with gene_symbol, source, confidence
- Add load_known_genes_to_duckdb() to persist known genes table
- Add ScoringWeights.validate_sum() method enforcing weight sum constraint (1.0 ± 1e-6)
---
 src/usher_pipeline/config/schema.py       |  23 ++++
 src/usher_pipeline/scoring/__init__.py    |  23 ++++
 src/usher_pipeline/scoring/known_genes.py | 123 ++++++++++++++++++++++
 3 files changed, 169 insertions(+)
 create mode 100644 src/usher_pipeline/scoring/__init__.py
 create mode 100644 src/usher_pipeline/scoring/known_genes.py

diff --git a/src/usher_pipeline/config/schema.py b/src/usher_pipeline/config/schema.py
index 25e6478..843e060 100644
--- a/src/usher_pipeline/config/schema.py
+++ b/src/usher_pipeline/config/schema.py
@@ -70,6 +70,29 @@ class ScoringWeights(BaseModel):
         description="Weight for literature evidence",
     )
 
+    def validate_sum(self) -> None:
+        """
+        Validate that all scoring weights sum to 1.0.
+
+        Raises:
+            ValueError: If the sum is not within 1e-6 of 1.0, or is NaN
+
+        Notes:
+            - Tolerance of 1e-6 accounts for floating point precision
+            - Should be called before using weights in scoring calculations
+        """
+        total = (
+            self.gnomad
+            + self.expression
+            + self.annotation
+            + self.localization
+            + self.animal_model
+            + self.literature
+        )
+        # "not <=" instead of ">": every comparison with NaN is False
+        if not abs(total - 1.0) <= 1e-6:
+            raise ValueError(f"Scoring weights must sum to 1.0, got {total:.6f}")
+
 
 class APIConfig(BaseModel):
     """Configuration for API clients."""
diff --git a/src/usher_pipeline/scoring/__init__.py b/src/usher_pipeline/scoring/__init__.py
new file mode 100644
index 0000000..be28d1d
--- /dev/null
+++ b/src/usher_pipeline/scoring/__init__.py
@@ -0,0 +1,23 @@
+"""Multi-evidence scoring and known gene compilation for cilia/Usher syndrome genes."""
+
+from usher_pipeline.scoring.known_genes import (
+    OMIM_USHER_GENES,
+    SYSCILIA_SCGS_V2_CORE,
+    compile_known_genes,
+    load_known_genes_to_duckdb,
+)
+from usher_pipeline.scoring.integration import (  # NOTE(review): integration.py is not created by this patch - confirm it already exists, else importing this package fails
+    join_evidence_layers,
+    compute_composite_scores,
+    persist_scored_genes,
+)
+
+__all__ = [  # names re-exported by the scoring package
+    "OMIM_USHER_GENES",
+    "SYSCILIA_SCGS_V2_CORE",
+    "compile_known_genes",
+    "load_known_genes_to_duckdb",
+    "join_evidence_layers",
+    "compute_composite_scores",
+    "persist_scored_genes",
+]
diff --git a/src/usher_pipeline/scoring/known_genes.py b/src/usher_pipeline/scoring/known_genes.py
new file mode 100644
index 0000000..d786c32
--- /dev/null
+++ b/src/usher_pipeline/scoring/known_genes.py
@@ -0,0 +1,123 @@
+"""Known cilia and Usher syndrome gene compilation."""
+
+import polars as pl
+from usher_pipeline.persistence.duckdb_store import PipelineStore
+
+# OMIM Usher syndrome genes (high-confidence disease genes)
+# Source: OMIM database (omim.org) - Usher syndrome entries
+# These genes are well-established causes of Usher syndrome
+OMIM_USHER_GENES = frozenset({
+    "MYO7A",    # USH1B (myosin VIIA)
+    "USH1C",    # USH1C (harmonin)
+    "CDH23",    # USH1D (cadherin 23)
+    "PCDH15",   # USH1F (protocadherin 15)
+    "USH1G",    # USH1G (SANS)
+    "CIB2",     # USH1J
+    "USH2A",    # USH2A (usherin)
+    "ADGRV1",   # USH2C (formerly GPR98)
+    "WHRN",     # USH2D (whirlin)
+    "CLRN1",    # USH3A (clarin 1)
+})
+
+# SYSCILIA Gold Standard (SCGS) v2 - Core ciliary genes subset
+# Source: Vasquez et al. (2021) MBoC - DOI: 10.1091/mbc.E21-05-0226
+# Full SCGS v2 contains 686 genes; this is a curated ~30 gene subset of
+# well-characterized ciliary components used as positive controls.
+# For complete list, see publication supplementary data.
+# Future enhancement: implement fetch_scgs_v2() to download full gene set.
+SYSCILIA_SCGS_V2_CORE = frozenset({
+    "IFT88",      # IFT-B complex
+    "IFT140",     # IFT-A complex
+    "IFT172",     # IFT-B complex
+    "BBS1",       # BBSome subunit
+    "BBS2",       # BBSome subunit
+    "BBS4",       # BBSome subunit
+    "BBS5",       # BBSome subunit
+    "BBS7",       # BBSome subunit
+    "BBS9",       # BBSome subunit
+    "BBS10",      # Chaperonin-like; BBSome assembly (not a BBSome subunit)
+    "RPGRIP1L",   # Transition zone
+    "CEP290",     # Transition zone
+    "ARL13B",     # Ciliary membrane
+    "INPP5E",     # Ciliary membrane
+    "TMEM67",     # MKS/JBTS
+    "CC2D2A",     # MKS/JBTS
+    "NPHP1",      # Nephronophthisis
+    "NPHP3",      # Nephronophthisis
+    "NPHP4",      # Nephronophthisis
+    "RPGR",       # Retinal ciliopathy
+    "CEP164",     # Centriole/basal body
+    "OFD1",       # OFD syndrome
+    "MKS1",       # Meckel syndrome
+    "TCTN1",      # Tectonic complex
+    "TCTN2",      # Tectonic complex
+    "TMEM216",    # MKS/JBTS
+    "TMEM231",    # MKS/JBTS
+    "TMEM138",    # MKS/JBTS
+})
+
+
+def compile_known_genes() -> pl.DataFrame:
+    """
+    Compile known cilia/Usher genes into a structured DataFrame.
+
+    Combines OMIM Usher syndrome genes and SYSCILIA SCGS v2 core genes
+    into a single reference set for exclusion filtering and positive
+    control validation.
+
+    Returns:
+        DataFrame with columns:
+        - gene_symbol (str): Gene symbol
+        - source (str): "omim_usher" or "syscilia_scgs_v2"
+        - confidence (str): "HIGH" for all entries in this curated set
+
+    Notes:
+        - Genes appearing in both lists will have two rows (one per source)
+        - De-duplication is NOT performed on gene_symbol to preserve provenance
+        - Total rows = len(OMIM_USHER_GENES) + len(SYSCILIA_SCGS_V2_CORE)
+        - Rows are ordered OMIM first, then SYSCILIA, with symbols sorted
+          alphabetically within each source so the output is reproducible
+    """
+    # Sort each set: frozenset iteration order of strings varies between runs
+    omim_df = pl.DataFrame({
+        "gene_symbol": sorted(OMIM_USHER_GENES),
+        "source": ["omim_usher"] * len(OMIM_USHER_GENES),
+        "confidence": ["HIGH"] * len(OMIM_USHER_GENES),
+    })
+
+    syscilia_df = pl.DataFrame({
+        "gene_symbol": sorted(SYSCILIA_SCGS_V2_CORE),
+        "source": ["syscilia_scgs_v2"] * len(SYSCILIA_SCGS_V2_CORE),
+        "confidence": ["HIGH"] * len(SYSCILIA_SCGS_V2_CORE),
+    })
+
+    # Concatenate both gene sets (OMIM rows first)
+    return pl.concat([omim_df, syscilia_df])
+
+
+def load_known_genes_to_duckdb(store: PipelineStore) -> int:
+    """
+    Persist the compiled known-gene reference set to DuckDB.
+
+    Args:
+        store: PipelineStore instance that receives the table
+
+    Returns:
+        Number of distinct gene symbols in the saved DataFrame
+
+    Notes:
+        - Target table: known_cilia_genes
+        - save_dataframe is invoked with replace=True
+    """
+    known_genes = compile_known_genes()
+
+    store.save_dataframe(
+        df=known_genes,
+        table_name="known_cilia_genes",
+        description="Known cilia and Usher syndrome genes for positive control validation",
+        replace=True,
+    )
+
+    # Report distinct symbols rather than rows: compile_known_genes keeps
+    # one row per (gene, source), so a gene in both sets would appear twice
+    return known_genes["gene_symbol"].n_unique()