feat(04-01): implement known gene compilation and ScoringWeights validation

- Create scoring module with OMIM_USHER_GENES (10 genes) and SYSCILIA_SCGS_V2_CORE (28 genes)
- Implement compile_known_genes() returning DataFrame with gene_symbol, source, confidence
- Add load_known_genes_to_duckdb() to persist known genes table
- Add ScoringWeights.validate_sum() method enforcing weight sum constraint (1.0 ± 1e-6)
2026-02-11 20:41:31 +08:00
parent ed21f18a98
commit 0cd2f7c9dd
3 changed files with 169 additions and 0 deletions
--- a/src/usher_pipeline/config/schema.py
+++ b/src/usher_pipeline/config/schema.py
@@ -70,6 +70,29 @@ class ScoringWeights(BaseModel):
        description="Weight for literature evidence",
    )
    def validate_sum(self) -> None:
        """
        Validate that all scoring weights sum to 1.0.

        Raises:
            ValueError: If weights do not sum to 1.0 (within 1e-6 tolerance),
                including the case where the sum is NaN

        Notes:
            - Tolerance of 1e-6 accounts for floating point precision
            - Should be called before using weights in scoring calculations
        """
        total = (
            self.gnomad
            + self.expression
            + self.annotation
            + self.localization
            + self.animal_model
            + self.literature
        )
        # Phrased as "not (deviation <= tolerance)" instead of
        # "deviation > tolerance": every ordered comparison against NaN is
        # False, so the latter form would accept a NaN sum as valid.
        if not abs(total - 1.0) <= 1e-6:
            raise ValueError(f"Scoring weights must sum to 1.0, got {total:.6f}")
 class APIConfig(BaseModel):
    """Configuration for API clients."""
--- a/src/usher_pipeline/scoring/__init__.py
+++ b/src/usher_pipeline/scoring/__init__.py
@@ -0,0 +1,23 @@
 """Multi-evidence scoring and known gene compilation for cilia/Usher syndrome genes."""
 from usher_pipeline.scoring.known_genes import (
    OMIM_USHER_GENES,
    SYSCILIA_SCGS_V2_CORE,
    compile_known_genes,
    load_known_genes_to_duckdb,
 )
 from usher_pipeline.scoring.integration import (
    join_evidence_layers,
    compute_composite_scores,
    persist_scored_genes,
 )
 # Public API of the scoring package: the names re-exported above from the
 # known_genes and integration submodules.
 __all__ = [
    "OMIM_USHER_GENES",
    "SYSCILIA_SCGS_V2_CORE",
    "compile_known_genes",
    "load_known_genes_to_duckdb",
    "join_evidence_layers",
    "compute_composite_scores",
    "persist_scored_genes",
 ]
--- a/src/usher_pipeline/scoring/known_genes.py
+++ b/src/usher_pipeline/scoring/known_genes.py
@@ -0,0 +1,123 @@
 """Known cilia and Usher syndrome gene compilation."""
 import polars as pl
 from usher_pipeline.persistence.duckdb_store import PipelineStore
 # Usher syndrome disease genes (high confidence).
 # Source: OMIM database (omim.org), Usher syndrome entries.
 # Each symbol is annotated with the Usher subtype it is associated with.
 OMIM_USHER_GENES = frozenset((
    # Usher syndrome type 1
    "MYO7A",    # USH1B
    "USH1C",    # USH1C (harmonin)
    "CDH23",    # USH1D
    "PCDH15",   # USH1F
    "USH1G",    # USH1G (SANS)
    "CIB2",     # USH1J
    # Usher syndrome type 2
    "USH2A",    # USH2A
    "ADGRV1",   # USH2C (GPR98)
    "WHRN",     # USH2D (whirlin)
    # Usher syndrome type 3
    "CLRN1",    # USH3A
 ))
 # SYSCILIA Gold Standard (SCGS) v2 - core ciliary gene subset.
 # Source: van Dam et al. (2021) MBoC - DOI: 10.1091/mbc.E21-05-0226
 # The full SCGS v2 lists 686 genes; the 28 symbols below are a curated
 # subset of well-characterized ciliary components used as positive controls.
 # See the publication's supplementary data for the complete list.
 # Future enhancement: implement fetch_scgs_v2() to download full gene set.
 SYSCILIA_SCGS_V2_CORE = frozenset((
    # Intraflagellar transport (IFT88/IFT172: IFT-B core, IFT140: IFT-A core)
    "IFT88",
    "IFT140",
    "IFT172",
    # BBSome
    "BBS1",
    "BBS2",
    "BBS4",
    "BBS5",
    "BBS7",
    "BBS9",
    "BBS10",
    # Transition zone
    "RPGRIP1L",
    "CEP290",
    # Ciliary membrane
    "ARL13B",
    "INPP5E",
    # MKS/JBTS
    "TMEM67",
    "CC2D2A",
    # Nephronophthisis
    "NPHP1",
    "NPHP3",
    "NPHP4",
    # Retinal ciliopathy
    "RPGR",
    # Centriole/basal body
    "CEP164",
    # OFD syndrome
    "OFD1",
    # Meckel syndrome
    "MKS1",
    # Tectonic complex
    "TCTN1",
    "TCTN2",
    # MKS/JBTS
    "TMEM216",
    "TMEM231",
    "TMEM138",
 ))
 def compile_known_genes() -> pl.DataFrame:
    """
    Compile known cilia/Usher genes into a structured DataFrame.

    Combines OMIM Usher syndrome genes and SYSCILIA SCGS v2 core genes
    into a single reference set for exclusion filtering and positive
    control validation.

    Returns:
        DataFrame with columns:
        - gene_symbol (str): Gene symbol
        - source (str): "omim_usher" or "syscilia_scgs_v2"
        - confidence (str): "HIGH" for all entries in this curated set

    Notes:
        - Genes appearing in both lists will have two rows (one per source)
        - De-duplication is NOT performed on gene_symbol to preserve provenance
        - Total rows = len(OMIM_USHER_GENES) + len(SYSCILIA_SCGS_V2_CORE)
        - Row order is deterministic: OMIM rows first, then SYSCILIA rows,
          each group sorted alphabetically by gene_symbol
    """
    # The gene sets are frozensets of strings, whose iteration order follows
    # string hashes and can change from one interpreter run to the next.
    # Sorting keeps the DataFrame (and any table built from it) reproducible.
    omim_symbols = sorted(OMIM_USHER_GENES)
    syscilia_symbols = sorted(SYSCILIA_SCGS_V2_CORE)

    # Create DataFrames for each gene set
    omim_df = pl.DataFrame({
        "gene_symbol": omim_symbols,
        "source": ["omim_usher"] * len(omim_symbols),
        "confidence": ["HIGH"] * len(omim_symbols),
    })
    syscilia_df = pl.DataFrame({
        "gene_symbol": syscilia_symbols,
        "source": ["syscilia_scgs_v2"] * len(syscilia_symbols),
        "confidence": ["HIGH"] * len(syscilia_symbols),
    })

    # Concatenate both gene sets
    return pl.concat([omim_df, syscilia_df])
 def load_known_genes_to_duckdb(store: PipelineStore) -> int:
    """
    Persist the compiled known cilia/Usher gene set through the given store.

    Args:
        store: PipelineStore instance used to write the table

    Returns:
        Number of distinct gene symbols in the saved data

    Notes:
        - Target table name: known_cilia_genes
        - save_dataframe is called with replace=True, so an existing table
          is replaced (CREATE OR REPLACE pattern)
    """
    known_genes = compile_known_genes()

    store.save_dataframe(
        df=known_genes,
        table_name="known_cilia_genes",
        description="Known cilia and Usher syndrome genes for positive control validation",
        replace=True,
    )

    # A gene present in both sources occupies two rows, so report the count
    # of distinct symbols rather than the row count.
    return known_genes["gene_symbol"].n_unique()