fix: resolve JOIN explosion from duplicate gene_ids across evidence tables

- scoring/integration.py: Use CTEs with GROUP BY gene_id to deduplicate
  all evidence tables before the LEFT JOINs, taking MAX of each score
  column (gnomAD had 211K rows for 18K genes due to per-transcript data;
  the annotation and localization tables also had duplicate gene_ids)
- literature/transform.py: Deduplicate gene_symbols before PubMed queries
  to avoid querying the same symbol multiple times
- evidence_cmd.py: Fix Polars .alias() error in literature summary display
  (aggregate with pl.len() instead of df.select("gene_id").count())
- Updated report results with full 6-layer scoring (44 HIGH, 7268 MEDIUM)
- Validation PASSED: known Usher genes median percentile 86.9%

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-16 04:48:13 +08:00
parent 63e3cccd3c
commit b63251a996
11 changed files with 19434 additions and 18147 deletions

View File

@@ -1071,7 +1071,7 @@ def literature(ctx, force, email, api_key, batch_size):
total_genes = len(df)
tier_counts = (
df.group_by("evidence_tier")
.agg(df.select("gene_id").count().alias("count"))
.agg(pl.len().alias("count"))
.sort("count", descending=True)
)
@@ -1205,7 +1205,7 @@ def literature(ctx, force, email, api_key, batch_size):
# Display summary
tier_counts = (
df.group_by("evidence_tier")
.agg(df.select("gene_id").count().alias("count"))
.agg(pl.len().alias("count"))
.sort("count", descending=True)
)

View File

@@ -242,17 +242,18 @@ def process_literature_evidence(
# Step 1: Map gene IDs to symbols
gene_map = gene_symbol_map.filter(pl.col("gene_id").is_in(gene_ids))
gene_symbols = gene_map["gene_symbol"].to_list()
# Deduplicate symbols for PubMed queries (many gene_ids can share a symbol)
unique_symbols = gene_map["gene_symbol"].unique().to_list()
logger.info(
"literature_gene_mapping",
input_ids=len(gene_ids),
mapped_symbols=len(gene_symbols),
mapped_symbols=len(unique_symbols),
)
# Step 2: Fetch literature evidence
# Step 2: Fetch literature evidence (one query per unique symbol)
lit_df = fetch_literature_evidence(
gene_symbols=gene_symbols,
gene_symbols=unique_symbols,
email=email,
api_key=api_key,
batch_size=batch_size,
@@ -266,7 +267,8 @@ def process_literature_evidence(
# Step 4: Compute quality-weighted scores
lit_df = compute_literature_score(lit_df)
# Step 5: Join back to gene IDs
# Step 5: Join back to gene IDs (lit_df has unique symbols, gene_map may have
# multiple gene_ids per symbol — this is correct, each gene_id gets its score)
result_df = gene_map.join(
lit_df,
on="gene_symbol",

View File

@@ -37,7 +37,38 @@ def join_evidence_layers(store: PipelineStore) -> pl.DataFrame:
- Uses LEFT JOIN pattern to preserve NULLs
- evidence_count = sum of non-NULL layers (0-6)
"""
# Use CTEs to deduplicate each evidence table to one row per gene_id
# (some tables have multiple rows per gene, e.g. gnomAD has per-transcript data).
# For score columns, take the MAX to keep the strongest evidence.
query = """
WITH gu AS (
SELECT gene_id, FIRST(gene_symbol) AS gene_symbol
FROM gene_universe GROUP BY gene_id
),
gc AS (
SELECT gene_id, MAX(loeuf_normalized) AS loeuf_normalized
FROM gnomad_constraint GROUP BY gene_id
),
te AS (
SELECT gene_id, MAX(expression_score_normalized) AS expression_score_normalized
FROM tissue_expression GROUP BY gene_id
),
ac AS (
SELECT gene_id, MAX(annotation_score_normalized) AS annotation_score_normalized
FROM annotation_completeness GROUP BY gene_id
),
sl AS (
SELECT gene_id, MAX(localization_score_normalized) AS localization_score_normalized
FROM subcellular_localization GROUP BY gene_id
),
am AS (
SELECT gene_id, MAX(animal_model_score_normalized) AS animal_model_score_normalized
FROM animal_model_phenotypes GROUP BY gene_id
),
le AS (
SELECT gene_id, MAX(literature_score_normalized) AS literature_score_normalized
FROM literature_evidence GROUP BY gene_id
)
SELECT
g.gene_id,
g.gene_symbol,
@@ -55,13 +86,13 @@ def join_evidence_layers(store: PipelineStore) -> pl.DataFrame:
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END
) AS evidence_count
FROM gene_universe g
LEFT JOIN gnomad_constraint gnomad ON g.gene_id = gnomad.gene_id
LEFT JOIN tissue_expression expr ON g.gene_id = expr.gene_id
LEFT JOIN annotation_completeness annot ON g.gene_id = annot.gene_id
LEFT JOIN subcellular_localization loc ON g.gene_id = loc.gene_id
LEFT JOIN animal_model_phenotypes animal ON g.gene_id = animal.gene_id
LEFT JOIN literature_evidence lit ON g.gene_id = lit.gene_id
FROM gu g
LEFT JOIN gc gnomad ON g.gene_id = gnomad.gene_id
LEFT JOIN te expr ON g.gene_id = expr.gene_id
LEFT JOIN ac annot ON g.gene_id = annot.gene_id
LEFT JOIN sl loc ON g.gene_id = loc.gene_id
LEFT JOIN am animal ON g.gene_id = animal.gene_id
LEFT JOIN le lit ON g.gene_id = lit.gene_id
"""
# Execute query and convert to polars
@@ -130,6 +161,34 @@ def compute_composite_scores(store: PipelineStore, weights: ScoringWeights) -> p
weights.validate_sum()
query = f"""
WITH gu AS (
SELECT gene_id, FIRST(gene_symbol) AS gene_symbol
FROM gene_universe GROUP BY gene_id
),
gc AS (
SELECT gene_id, MAX(loeuf_normalized) AS loeuf_normalized
FROM gnomad_constraint GROUP BY gene_id
),
te AS (
SELECT gene_id, MAX(expression_score_normalized) AS expression_score_normalized
FROM tissue_expression GROUP BY gene_id
),
ac AS (
SELECT gene_id, MAX(annotation_score_normalized) AS annotation_score_normalized
FROM annotation_completeness GROUP BY gene_id
),
sl AS (
SELECT gene_id, MAX(localization_score_normalized) AS localization_score_normalized
FROM subcellular_localization GROUP BY gene_id
),
am AS (
SELECT gene_id, MAX(animal_model_score_normalized) AS animal_model_score_normalized
FROM animal_model_phenotypes GROUP BY gene_id
),
le AS (
SELECT gene_id, MAX(literature_score_normalized) AS literature_score_normalized
FROM literature_evidence GROUP BY gene_id
)
SELECT
g.gene_id,
g.gene_symbol,
@@ -222,13 +281,13 @@ def compute_composite_scores(store: PipelineStore, weights: ScoringWeights) -> p
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN loc.localization_score_normalized * {weights.localization} ELSE NULL END AS localization_contribution,
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN animal.animal_model_score_normalized * {weights.animal_model} ELSE NULL END AS animal_model_contribution,
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN lit.literature_score_normalized * {weights.literature} ELSE NULL END AS literature_contribution
FROM gene_universe g
LEFT JOIN gnomad_constraint gnomad ON g.gene_id = gnomad.gene_id
LEFT JOIN tissue_expression expr ON g.gene_id = expr.gene_id
LEFT JOIN annotation_completeness annot ON g.gene_id = annot.gene_id
LEFT JOIN subcellular_localization loc ON g.gene_id = loc.gene_id
LEFT JOIN animal_model_phenotypes animal ON g.gene_id = animal.gene_id
LEFT JOIN literature_evidence lit ON g.gene_id = lit.gene_id
FROM gu g
LEFT JOIN gc gnomad ON g.gene_id = gnomad.gene_id
LEFT JOIN te expr ON g.gene_id = expr.gene_id
LEFT JOIN ac annot ON g.gene_id = annot.gene_id
LEFT JOIN sl loc ON g.gene_id = loc.gene_id
LEFT JOIN am animal ON g.gene_id = animal.gene_id
LEFT JOIN le lit ON g.gene_id = lit.gene_id
ORDER BY composite_score DESC NULLS LAST
"""