fix: resolve JOIN explosion from duplicate gene_ids across evidence tables

- scoring/integration.py: Deduplicate every evidence table to one row per
  gene_id (CTEs with GROUP BY gene_id, keeping the MAX of each score)
  before the LEFT JOINs; gnomAD had 211K rows for 18K genes due to
  per-transcript data, and annotation/localization also had duplicates
- literature/transform.py: Deduplicate gene_symbols before PubMed queries
  to avoid querying the same symbol multiple times
- evidence_cmd.py: Fix Polars error in the literature tier-count summaries by
  aggregating with pl.len().alias("count") instead of
  df.select("gene_id").count().alias("count")
- Update report results with full 6-layer scoring (44 HIGH, 7268 MEDIUM,
  12030 LOW of 19342 candidates)
- Validation PASSED: known Usher genes median percentile 86.9%

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-16 04:48:13 +08:00
parent 63e3cccd3c
commit b63251a996
11 changed files with 19434 additions and 18147 deletions

Binary file not shown.

View File

@@ -1,12 +1,12 @@
generated_at: '2026-02-12T18:42:59.932245+00:00' generated_at: '2026-02-15T20:47:53.707513+00:00'
output_files: output_files:
- candidates.tsv - candidates.tsv
- candidates.parquet - candidates.parquet
statistics: statistics:
total_candidates: 18116 total_candidates: 19342
high_count: 0 high_count: 44
medium_count: 2151 medium_count: 7268
low_count: 15965 low_count: 12030
column_count: 22 column_count: 22
column_names: column_names:
- gene_id - gene_id

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 112 KiB

After

Width:  |  Height:  |  Size: 116 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 80 KiB

After

Width:  |  Height:  |  Size: 87 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 88 KiB

After

Width:  |  Height:  |  Size: 94 KiB

View File

@@ -1,6 +1,6 @@
{ {
"run_id": "5f00f9da-e548-4a58-b1b3-028d05c94d32", "run_id": "0bcdae80-84c2-486b-809a-36040ce4821d",
"timestamp": "2026-02-12T18:43:00.223842+00:00", "timestamp": "2026-02-15T20:47:54.482074+00:00",
"pipeline_version": "0.1.0", "pipeline_version": "0.1.0",
"parameters": { "parameters": {
"gnomad": 0.2, "gnomad": 0.2,
@@ -49,9 +49,9 @@
], ],
"validation_metrics": {}, "validation_metrics": {},
"tier_statistics": { "tier_statistics": {
"total": 18116, "total": 19342,
"high": 0, "high": 44,
"medium": 2151, "medium": 7268,
"low": 15965 "low": 12030
} }
} }

View File

@@ -1,7 +1,7 @@
# Pipeline Reproducibility Report # Pipeline Reproducibility Report
**Run ID:** `5f00f9da-e548-4a58-b1b3-028d05c94d32` **Run ID:** `0bcdae80-84c2-486b-809a-36040ce4821d`
**Timestamp:** 2026-02-12T18:43:00.223842+00:00 **Timestamp:** 2026-02-15T20:47:54.482074+00:00
**Pipeline Version:** 0.1.0 **Pipeline Version:** 0.1.0
## Parameters ## Parameters
@@ -39,7 +39,7 @@
## Tier Statistics ## Tier Statistics
- **Total Candidates:** 18116 - **Total Candidates:** 19342
- **HIGH:** 0 - **HIGH:** 44
- **MEDIUM:** 2151 - **MEDIUM:** 7268
- **LOW:** 15965 - **LOW:** 12030

View File

@@ -1071,7 +1071,7 @@ def literature(ctx, force, email, api_key, batch_size):
total_genes = len(df) total_genes = len(df)
tier_counts = ( tier_counts = (
df.group_by("evidence_tier") df.group_by("evidence_tier")
.agg(df.select("gene_id").count().alias("count")) .agg(pl.len().alias("count"))
.sort("count", descending=True) .sort("count", descending=True)
) )
@@ -1205,7 +1205,7 @@ def literature(ctx, force, email, api_key, batch_size):
# Display summary # Display summary
tier_counts = ( tier_counts = (
df.group_by("evidence_tier") df.group_by("evidence_tier")
.agg(df.select("gene_id").count().alias("count")) .agg(pl.len().alias("count"))
.sort("count", descending=True) .sort("count", descending=True)
) )

View File

@@ -242,17 +242,18 @@ def process_literature_evidence(
# Step 1: Map gene IDs to symbols # Step 1: Map gene IDs to symbols
gene_map = gene_symbol_map.filter(pl.col("gene_id").is_in(gene_ids)) gene_map = gene_symbol_map.filter(pl.col("gene_id").is_in(gene_ids))
gene_symbols = gene_map["gene_symbol"].to_list() # Deduplicate symbols for PubMed queries (many gene_ids can share a symbol)
unique_symbols = gene_map["gene_symbol"].unique().to_list()
logger.info( logger.info(
"literature_gene_mapping", "literature_gene_mapping",
input_ids=len(gene_ids), input_ids=len(gene_ids),
mapped_symbols=len(gene_symbols), mapped_symbols=len(unique_symbols),
) )
# Step 2: Fetch literature evidence # Step 2: Fetch literature evidence (one query per unique symbol)
lit_df = fetch_literature_evidence( lit_df = fetch_literature_evidence(
gene_symbols=gene_symbols, gene_symbols=unique_symbols,
email=email, email=email,
api_key=api_key, api_key=api_key,
batch_size=batch_size, batch_size=batch_size,
@@ -266,7 +267,8 @@ def process_literature_evidence(
# Step 4: Compute quality-weighted scores # Step 4: Compute quality-weighted scores
lit_df = compute_literature_score(lit_df) lit_df = compute_literature_score(lit_df)
# Step 5: Join back to gene IDs # Step 5: Join back to gene IDs (lit_df has unique symbols, gene_map may have
# multiple gene_ids per symbol — this is correct, each gene_id gets its score)
result_df = gene_map.join( result_df = gene_map.join(
lit_df, lit_df,
on="gene_symbol", on="gene_symbol",

View File

@@ -37,7 +37,38 @@ def join_evidence_layers(store: PipelineStore) -> pl.DataFrame:
- Uses LEFT JOIN pattern to preserve NULLs - Uses LEFT JOIN pattern to preserve NULLs
- evidence_count = sum of non-NULL layers (0-6) - evidence_count = sum of non-NULL layers (0-6)
""" """
# Use CTEs to deduplicate each evidence table to one row per gene_id
# (some tables have multiple rows per gene, e.g. gnomAD has per-transcript data).
# For score columns, take the MAX to keep the strongest evidence.
query = """ query = """
WITH gu AS (
SELECT gene_id, FIRST(gene_symbol) AS gene_symbol
FROM gene_universe GROUP BY gene_id
),
gc AS (
SELECT gene_id, MAX(loeuf_normalized) AS loeuf_normalized
FROM gnomad_constraint GROUP BY gene_id
),
te AS (
SELECT gene_id, MAX(expression_score_normalized) AS expression_score_normalized
FROM tissue_expression GROUP BY gene_id
),
ac AS (
SELECT gene_id, MAX(annotation_score_normalized) AS annotation_score_normalized
FROM annotation_completeness GROUP BY gene_id
),
sl AS (
SELECT gene_id, MAX(localization_score_normalized) AS localization_score_normalized
FROM subcellular_localization GROUP BY gene_id
),
am AS (
SELECT gene_id, MAX(animal_model_score_normalized) AS animal_model_score_normalized
FROM animal_model_phenotypes GROUP BY gene_id
),
le AS (
SELECT gene_id, MAX(literature_score_normalized) AS literature_score_normalized
FROM literature_evidence GROUP BY gene_id
)
SELECT SELECT
g.gene_id, g.gene_id,
g.gene_symbol, g.gene_symbol,
@@ -55,13 +86,13 @@ def join_evidence_layers(store: PipelineStore) -> pl.DataFrame:
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END + CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END
) AS evidence_count ) AS evidence_count
FROM gene_universe g FROM gu g
LEFT JOIN gnomad_constraint gnomad ON g.gene_id = gnomad.gene_id LEFT JOIN gc gnomad ON g.gene_id = gnomad.gene_id
LEFT JOIN tissue_expression expr ON g.gene_id = expr.gene_id LEFT JOIN te expr ON g.gene_id = expr.gene_id
LEFT JOIN annotation_completeness annot ON g.gene_id = annot.gene_id LEFT JOIN ac annot ON g.gene_id = annot.gene_id
LEFT JOIN subcellular_localization loc ON g.gene_id = loc.gene_id LEFT JOIN sl loc ON g.gene_id = loc.gene_id
LEFT JOIN animal_model_phenotypes animal ON g.gene_id = animal.gene_id LEFT JOIN am animal ON g.gene_id = animal.gene_id
LEFT JOIN literature_evidence lit ON g.gene_id = lit.gene_id LEFT JOIN le lit ON g.gene_id = lit.gene_id
""" """
# Execute query and convert to polars # Execute query and convert to polars
@@ -130,6 +161,34 @@ def compute_composite_scores(store: PipelineStore, weights: ScoringWeights) -> p
weights.validate_sum() weights.validate_sum()
query = f""" query = f"""
WITH gu AS (
SELECT gene_id, FIRST(gene_symbol) AS gene_symbol
FROM gene_universe GROUP BY gene_id
),
gc AS (
SELECT gene_id, MAX(loeuf_normalized) AS loeuf_normalized
FROM gnomad_constraint GROUP BY gene_id
),
te AS (
SELECT gene_id, MAX(expression_score_normalized) AS expression_score_normalized
FROM tissue_expression GROUP BY gene_id
),
ac AS (
SELECT gene_id, MAX(annotation_score_normalized) AS annotation_score_normalized
FROM annotation_completeness GROUP BY gene_id
),
sl AS (
SELECT gene_id, MAX(localization_score_normalized) AS localization_score_normalized
FROM subcellular_localization GROUP BY gene_id
),
am AS (
SELECT gene_id, MAX(animal_model_score_normalized) AS animal_model_score_normalized
FROM animal_model_phenotypes GROUP BY gene_id
),
le AS (
SELECT gene_id, MAX(literature_score_normalized) AS literature_score_normalized
FROM literature_evidence GROUP BY gene_id
)
SELECT SELECT
g.gene_id, g.gene_id,
g.gene_symbol, g.gene_symbol,
@@ -222,13 +281,13 @@ def compute_composite_scores(store: PipelineStore, weights: ScoringWeights) -> p
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN loc.localization_score_normalized * {weights.localization} ELSE NULL END AS localization_contribution, CASE WHEN loc.localization_score_normalized IS NOT NULL THEN loc.localization_score_normalized * {weights.localization} ELSE NULL END AS localization_contribution,
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN animal.animal_model_score_normalized * {weights.animal_model} ELSE NULL END AS animal_model_contribution, CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN animal.animal_model_score_normalized * {weights.animal_model} ELSE NULL END AS animal_model_contribution,
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN lit.literature_score_normalized * {weights.literature} ELSE NULL END AS literature_contribution CASE WHEN lit.literature_score_normalized IS NOT NULL THEN lit.literature_score_normalized * {weights.literature} ELSE NULL END AS literature_contribution
FROM gene_universe g FROM gu g
LEFT JOIN gnomad_constraint gnomad ON g.gene_id = gnomad.gene_id LEFT JOIN gc gnomad ON g.gene_id = gnomad.gene_id
LEFT JOIN tissue_expression expr ON g.gene_id = expr.gene_id LEFT JOIN te expr ON g.gene_id = expr.gene_id
LEFT JOIN annotation_completeness annot ON g.gene_id = annot.gene_id LEFT JOIN ac annot ON g.gene_id = annot.gene_id
LEFT JOIN subcellular_localization loc ON g.gene_id = loc.gene_id LEFT JOIN sl loc ON g.gene_id = loc.gene_id
LEFT JOIN animal_model_phenotypes animal ON g.gene_id = animal.gene_id LEFT JOIN am animal ON g.gene_id = animal.gene_id
LEFT JOIN literature_evidence lit ON g.gene_id = lit.gene_id LEFT JOIN le lit ON g.gene_id = lit.gene_id
ORDER BY composite_score DESC NULLS LAST ORDER BY composite_score DESC NULLS LAST
""" """