fix: resolve JOIN explosion from duplicate gene_ids across evidence tables
- scoring/integration.py: Use CTEs with GROUP BY gene_id to deduplicate all evidence tables before LEFT JOIN (gnomAD had 211K rows for 18K genes due to per-transcript data; annotation/localization also had duplicates)
- literature/transform.py: Deduplicate gene_symbols before PubMed queries to avoid querying the same symbol multiple times
- evidence_cmd.py: Fix Polars .alias() error in literature summary display
- Updated report results with full 6-layer scoring (44 HIGH, 7268 MEDIUM)
- Validation PASSED: known Usher genes median percentile 86.9%

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Binary file not shown.
@@ -1,12 +1,12 @@
|
|||||||
generated_at: '2026-02-12T18:42:59.932245+00:00'
|
generated_at: '2026-02-15T20:47:53.707513+00:00'
|
||||||
output_files:
|
output_files:
|
||||||
- candidates.tsv
|
- candidates.tsv
|
||||||
- candidates.parquet
|
- candidates.parquet
|
||||||
statistics:
|
statistics:
|
||||||
total_candidates: 18116
|
total_candidates: 19342
|
||||||
high_count: 0
|
high_count: 44
|
||||||
medium_count: 2151
|
medium_count: 7268
|
||||||
low_count: 15965
|
low_count: 12030
|
||||||
column_count: 22
|
column_count: 22
|
||||||
column_names:
|
column_names:
|
||||||
- gene_id
|
- gene_id
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
Before Width: | Height: | Size: 112 KiB After Width: | Height: | Size: 116 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 80 KiB After Width: | Height: | Size: 87 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 88 KiB After Width: | Height: | Size: 94 KiB |
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"run_id": "5f00f9da-e548-4a58-b1b3-028d05c94d32",
|
"run_id": "0bcdae80-84c2-486b-809a-36040ce4821d",
|
||||||
"timestamp": "2026-02-12T18:43:00.223842+00:00",
|
"timestamp": "2026-02-15T20:47:54.482074+00:00",
|
||||||
"pipeline_version": "0.1.0",
|
"pipeline_version": "0.1.0",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"gnomad": 0.2,
|
"gnomad": 0.2,
|
||||||
@@ -49,9 +49,9 @@
|
|||||||
],
|
],
|
||||||
"validation_metrics": {},
|
"validation_metrics": {},
|
||||||
"tier_statistics": {
|
"tier_statistics": {
|
||||||
"total": 18116,
|
"total": 19342,
|
||||||
"high": 0,
|
"high": 44,
|
||||||
"medium": 2151,
|
"medium": 7268,
|
||||||
"low": 15965
|
"low": 12030
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
# Pipeline Reproducibility Report
|
# Pipeline Reproducibility Report
|
||||||
|
|
||||||
**Run ID:** `5f00f9da-e548-4a58-b1b3-028d05c94d32`
|
**Run ID:** `0bcdae80-84c2-486b-809a-36040ce4821d`
|
||||||
**Timestamp:** 2026-02-12T18:43:00.223842+00:00
|
**Timestamp:** 2026-02-15T20:47:54.482074+00:00
|
||||||
**Pipeline Version:** 0.1.0
|
**Pipeline Version:** 0.1.0
|
||||||
|
|
||||||
## Parameters
|
## Parameters
|
||||||
@@ -39,7 +39,7 @@
|
|||||||
|
|
||||||
## Tier Statistics
|
## Tier Statistics
|
||||||
|
|
||||||
- **Total Candidates:** 18116
|
- **Total Candidates:** 19342
|
||||||
- **HIGH:** 0
|
- **HIGH:** 44
|
||||||
- **MEDIUM:** 2151
|
- **MEDIUM:** 7268
|
||||||
- **LOW:** 15965
|
- **LOW:** 12030
|
||||||
|
|||||||
@@ -1071,7 +1071,7 @@ def literature(ctx, force, email, api_key, batch_size):
|
|||||||
total_genes = len(df)
|
total_genes = len(df)
|
||||||
tier_counts = (
|
tier_counts = (
|
||||||
df.group_by("evidence_tier")
|
df.group_by("evidence_tier")
|
||||||
.agg(df.select("gene_id").count().alias("count"))
|
.agg(pl.len().alias("count"))
|
||||||
.sort("count", descending=True)
|
.sort("count", descending=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1205,7 +1205,7 @@ def literature(ctx, force, email, api_key, batch_size):
|
|||||||
# Display summary
|
# Display summary
|
||||||
tier_counts = (
|
tier_counts = (
|
||||||
df.group_by("evidence_tier")
|
df.group_by("evidence_tier")
|
||||||
.agg(df.select("gene_id").count().alias("count"))
|
.agg(pl.len().alias("count"))
|
||||||
.sort("count", descending=True)
|
.sort("count", descending=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -242,17 +242,18 @@ def process_literature_evidence(
|
|||||||
|
|
||||||
# Step 1: Map gene IDs to symbols
|
# Step 1: Map gene IDs to symbols
|
||||||
gene_map = gene_symbol_map.filter(pl.col("gene_id").is_in(gene_ids))
|
gene_map = gene_symbol_map.filter(pl.col("gene_id").is_in(gene_ids))
|
||||||
gene_symbols = gene_map["gene_symbol"].to_list()
|
# Deduplicate symbols for PubMed queries (many gene_ids can share a symbol)
|
||||||
|
unique_symbols = gene_map["gene_symbol"].unique().to_list()
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"literature_gene_mapping",
|
"literature_gene_mapping",
|
||||||
input_ids=len(gene_ids),
|
input_ids=len(gene_ids),
|
||||||
mapped_symbols=len(gene_symbols),
|
mapped_symbols=len(unique_symbols),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Step 2: Fetch literature evidence
|
# Step 2: Fetch literature evidence (one query per unique symbol)
|
||||||
lit_df = fetch_literature_evidence(
|
lit_df = fetch_literature_evidence(
|
||||||
gene_symbols=gene_symbols,
|
gene_symbols=unique_symbols,
|
||||||
email=email,
|
email=email,
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
@@ -266,7 +267,8 @@ def process_literature_evidence(
|
|||||||
# Step 4: Compute quality-weighted scores
|
# Step 4: Compute quality-weighted scores
|
||||||
lit_df = compute_literature_score(lit_df)
|
lit_df = compute_literature_score(lit_df)
|
||||||
|
|
||||||
# Step 5: Join back to gene IDs
|
# Step 5: Join back to gene IDs (lit_df has unique symbols, gene_map may have
|
||||||
|
# multiple gene_ids per symbol — this is correct, each gene_id gets its score)
|
||||||
result_df = gene_map.join(
|
result_df = gene_map.join(
|
||||||
lit_df,
|
lit_df,
|
||||||
on="gene_symbol",
|
on="gene_symbol",
|
||||||
|
|||||||
@@ -37,7 +37,38 @@ def join_evidence_layers(store: PipelineStore) -> pl.DataFrame:
|
|||||||
- Uses LEFT JOIN pattern to preserve NULLs
|
- Uses LEFT JOIN pattern to preserve NULLs
|
||||||
- evidence_count = sum of non-NULL layers (0-6)
|
- evidence_count = sum of non-NULL layers (0-6)
|
||||||
"""
|
"""
|
||||||
|
# Use CTEs to deduplicate each evidence table to one row per gene_id
|
||||||
|
# (some tables have multiple rows per gene, e.g. gnomAD has per-transcript data).
|
||||||
|
# For score columns, take the MAX to keep the strongest evidence.
|
||||||
query = """
|
query = """
|
||||||
|
WITH gu AS (
|
||||||
|
SELECT gene_id, FIRST(gene_symbol) AS gene_symbol
|
||||||
|
FROM gene_universe GROUP BY gene_id
|
||||||
|
),
|
||||||
|
gc AS (
|
||||||
|
SELECT gene_id, MAX(loeuf_normalized) AS loeuf_normalized
|
||||||
|
FROM gnomad_constraint GROUP BY gene_id
|
||||||
|
),
|
||||||
|
te AS (
|
||||||
|
SELECT gene_id, MAX(expression_score_normalized) AS expression_score_normalized
|
||||||
|
FROM tissue_expression GROUP BY gene_id
|
||||||
|
),
|
||||||
|
ac AS (
|
||||||
|
SELECT gene_id, MAX(annotation_score_normalized) AS annotation_score_normalized
|
||||||
|
FROM annotation_completeness GROUP BY gene_id
|
||||||
|
),
|
||||||
|
sl AS (
|
||||||
|
SELECT gene_id, MAX(localization_score_normalized) AS localization_score_normalized
|
||||||
|
FROM subcellular_localization GROUP BY gene_id
|
||||||
|
),
|
||||||
|
am AS (
|
||||||
|
SELECT gene_id, MAX(animal_model_score_normalized) AS animal_model_score_normalized
|
||||||
|
FROM animal_model_phenotypes GROUP BY gene_id
|
||||||
|
),
|
||||||
|
le AS (
|
||||||
|
SELECT gene_id, MAX(literature_score_normalized) AS literature_score_normalized
|
||||||
|
FROM literature_evidence GROUP BY gene_id
|
||||||
|
)
|
||||||
SELECT
|
SELECT
|
||||||
g.gene_id,
|
g.gene_id,
|
||||||
g.gene_symbol,
|
g.gene_symbol,
|
||||||
@@ -55,13 +86,13 @@ def join_evidence_layers(store: PipelineStore) -> pl.DataFrame:
|
|||||||
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN 1 ELSE 0 END +
|
||||||
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN 1 ELSE 0 END
|
||||||
) AS evidence_count
|
) AS evidence_count
|
||||||
FROM gene_universe g
|
FROM gu g
|
||||||
LEFT JOIN gnomad_constraint gnomad ON g.gene_id = gnomad.gene_id
|
LEFT JOIN gc gnomad ON g.gene_id = gnomad.gene_id
|
||||||
LEFT JOIN tissue_expression expr ON g.gene_id = expr.gene_id
|
LEFT JOIN te expr ON g.gene_id = expr.gene_id
|
||||||
LEFT JOIN annotation_completeness annot ON g.gene_id = annot.gene_id
|
LEFT JOIN ac annot ON g.gene_id = annot.gene_id
|
||||||
LEFT JOIN subcellular_localization loc ON g.gene_id = loc.gene_id
|
LEFT JOIN sl loc ON g.gene_id = loc.gene_id
|
||||||
LEFT JOIN animal_model_phenotypes animal ON g.gene_id = animal.gene_id
|
LEFT JOIN am animal ON g.gene_id = animal.gene_id
|
||||||
LEFT JOIN literature_evidence lit ON g.gene_id = lit.gene_id
|
LEFT JOIN le lit ON g.gene_id = lit.gene_id
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Execute query and convert to polars
|
# Execute query and convert to polars
|
||||||
@@ -130,6 +161,34 @@ def compute_composite_scores(store: PipelineStore, weights: ScoringWeights) -> p
|
|||||||
weights.validate_sum()
|
weights.validate_sum()
|
||||||
|
|
||||||
query = f"""
|
query = f"""
|
||||||
|
WITH gu AS (
|
||||||
|
SELECT gene_id, FIRST(gene_symbol) AS gene_symbol
|
||||||
|
FROM gene_universe GROUP BY gene_id
|
||||||
|
),
|
||||||
|
gc AS (
|
||||||
|
SELECT gene_id, MAX(loeuf_normalized) AS loeuf_normalized
|
||||||
|
FROM gnomad_constraint GROUP BY gene_id
|
||||||
|
),
|
||||||
|
te AS (
|
||||||
|
SELECT gene_id, MAX(expression_score_normalized) AS expression_score_normalized
|
||||||
|
FROM tissue_expression GROUP BY gene_id
|
||||||
|
),
|
||||||
|
ac AS (
|
||||||
|
SELECT gene_id, MAX(annotation_score_normalized) AS annotation_score_normalized
|
||||||
|
FROM annotation_completeness GROUP BY gene_id
|
||||||
|
),
|
||||||
|
sl AS (
|
||||||
|
SELECT gene_id, MAX(localization_score_normalized) AS localization_score_normalized
|
||||||
|
FROM subcellular_localization GROUP BY gene_id
|
||||||
|
),
|
||||||
|
am AS (
|
||||||
|
SELECT gene_id, MAX(animal_model_score_normalized) AS animal_model_score_normalized
|
||||||
|
FROM animal_model_phenotypes GROUP BY gene_id
|
||||||
|
),
|
||||||
|
le AS (
|
||||||
|
SELECT gene_id, MAX(literature_score_normalized) AS literature_score_normalized
|
||||||
|
FROM literature_evidence GROUP BY gene_id
|
||||||
|
)
|
||||||
SELECT
|
SELECT
|
||||||
g.gene_id,
|
g.gene_id,
|
||||||
g.gene_symbol,
|
g.gene_symbol,
|
||||||
@@ -222,13 +281,13 @@ def compute_composite_scores(store: PipelineStore, weights: ScoringWeights) -> p
|
|||||||
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN loc.localization_score_normalized * {weights.localization} ELSE NULL END AS localization_contribution,
|
CASE WHEN loc.localization_score_normalized IS NOT NULL THEN loc.localization_score_normalized * {weights.localization} ELSE NULL END AS localization_contribution,
|
||||||
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN animal.animal_model_score_normalized * {weights.animal_model} ELSE NULL END AS animal_model_contribution,
|
CASE WHEN animal.animal_model_score_normalized IS NOT NULL THEN animal.animal_model_score_normalized * {weights.animal_model} ELSE NULL END AS animal_model_contribution,
|
||||||
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN lit.literature_score_normalized * {weights.literature} ELSE NULL END AS literature_contribution
|
CASE WHEN lit.literature_score_normalized IS NOT NULL THEN lit.literature_score_normalized * {weights.literature} ELSE NULL END AS literature_contribution
|
||||||
FROM gene_universe g
|
FROM gu g
|
||||||
LEFT JOIN gnomad_constraint gnomad ON g.gene_id = gnomad.gene_id
|
LEFT JOIN gc gnomad ON g.gene_id = gnomad.gene_id
|
||||||
LEFT JOIN tissue_expression expr ON g.gene_id = expr.gene_id
|
LEFT JOIN te expr ON g.gene_id = expr.gene_id
|
||||||
LEFT JOIN annotation_completeness annot ON g.gene_id = annot.gene_id
|
LEFT JOIN ac annot ON g.gene_id = annot.gene_id
|
||||||
LEFT JOIN subcellular_localization loc ON g.gene_id = loc.gene_id
|
LEFT JOIN sl loc ON g.gene_id = loc.gene_id
|
||||||
LEFT JOIN animal_model_phenotypes animal ON g.gene_id = animal.gene_id
|
LEFT JOIN am animal ON g.gene_id = animal.gene_id
|
||||||
LEFT JOIN literature_evidence lit ON g.gene_id = lit.gene_id
|
LEFT JOIN le lit ON g.gene_id = lit.gene_id
|
||||||
ORDER BY composite_score DESC NULLS LAST
|
ORDER BY composite_score DESC NULLS LAST
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user