"""Unit tests for output generation module: tiering, evidence summary, and writers.""" from pathlib import Path import polars as pl import pytest import yaml from usher_pipeline.output import ( EVIDENCE_LAYERS, TIER_THRESHOLDS, add_evidence_summary, assign_tiers, write_candidate_output, ) @pytest.fixture def synthetic_scored_genes() -> pl.DataFrame: """ Create synthetic scored genes DataFrame spanning all tiers. Returns DataFrame with ~20 rows: - 3 genes HIGH tier (score >= 0.7, evidence_count >= 3) - 5 genes MEDIUM tier (score 0.4-0.69, evidence_count >= 2) - 5 genes LOW tier (score 0.2-0.39) - 3 genes EXCLUDED (score < 0.2) - 4 genes with NULL composite_score (no evidence) """ data = { "gene_id": [ # HIGH tier (3 genes) "ENSG001", "ENSG002", "ENSG003", # MEDIUM tier (5 genes) "ENSG004", "ENSG005", "ENSG006", "ENSG007", "ENSG008", # LOW tier (5 genes) "ENSG009", "ENSG010", "ENSG011", "ENSG012", "ENSG013", # EXCLUDED tier (3 genes - score < 0.2) "ENSG014", "ENSG015", "ENSG016", # NULL composite_score (4 genes - no evidence) "ENSG017", "ENSG018", "ENSG019", "ENSG020", ], "gene_symbol": [ "HIGH1", "HIGH2", "HIGH3", "MED1", "MED2", "MED3", "MED4", "MED5", "LOW1", "LOW2", "LOW3", "LOW4", "LOW5", "EX1", "EX2", "EX3", "NULL1", "NULL2", "NULL3", "NULL4", ], "composite_score": [ # HIGH: >= 0.7 0.85, 0.78, 0.72, # MEDIUM: 0.4-0.69 0.65, 0.58, 0.52, 0.48, 0.42, # LOW: 0.2-0.39 0.38, 0.32, 0.28, 0.24, 0.21, # EXCLUDED: < 0.2 0.18, 0.12, 0.05, # NULL (no evidence) None, None, None, None, ], "evidence_count": [ # HIGH: >= 3 5, 4, 3, # MEDIUM: >= 2 4, 3, 3, 2, 2, # LOW: >= 1 2, 2, 1, 1, 1, # EXCLUDED: any count 1, 1, 0, # NULL 0, 0, 0, 0, ], "quality_flag": [ "sufficient_evidence", "sufficient_evidence", "moderate_evidence", "sufficient_evidence", "moderate_evidence", "moderate_evidence", "moderate_evidence", "moderate_evidence", "moderate_evidence", "moderate_evidence", "sparse_evidence", "sparse_evidence", "sparse_evidence", "sparse_evidence", "sparse_evidence", "no_evidence", "no_evidence", "no_evidence", "no_evidence", "no_evidence", ], # Layer scores (nullable) "gnomad_score": [ 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.4, 0.3, None, None, 0.2, None, 0.1, None, None, None, None, None, ], "expression_score": [ 0.85, 0.75, 0.65, 0.7, 0.6, 0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, None, 0.15, None, None, None, None, None, None, ], "annotation_score": [ 0.8, 0.7, 0.6, 0.65, 0.55, 0.45, None, None, None, None, 0.3, 0.25, 0.2, None, None, None, None, None, None, None, ], "localization_score": [ 0.75, None, None, 0.6, 0.5, None, 0.4, 0.35, 0.3, 0.25, None, None, None, None, None, None, None, None, None, None, ], "animal_model_score": [ 0.9, 0.8, 0.75, None, None, 0.55, 0.5, 0.45, None, None, None, None, None, None, None, None, None, None, None, None, ], "literature_score": [ None, 0.85, 0.7, 0.68, 0.62, 0.58, None, None, None, None, None, None, 0.22, 0.18, 0.15, 0.05, None, None, None, None, ], # Contribution columns (score * weight) - simplified for testing "gnomad_contribution": [ 0.18, 0.16, 0.14, 0.12, 0.1, 0.08, 0.06, 0.04, 0.08, 0.06, None, None, 0.04, None, 0.02, None, None, None, None, None, ], "expression_contribution": [ 0.17, 0.15, 0.13, 0.14, 0.12, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, None, 0.03, None, None, None, None, None, None, ], "annotation_contribution": [ 0.12, 0.105, 0.09, 0.098, 0.083, 0.068, None, None, None, None, 0.045, 0.038, 0.03, None, None, None, None, None, None, None, ], "localization_contribution": [ 0.113, None, None, 0.09, 0.075, None, 0.06, 0.053, 0.045, 0.038, None, None, None, None, None, None, None, None, None, None, ], "animal_model_contribution": [ 0.135, 0.12, 0.113, None, None, 0.083, 0.075, 0.068, None, None, None, None, None, None, None, None, None, None, None, None, ], "literature_contribution": [ None, 0.128, 0.105, 0.102, 0.093, 0.087, None, None, None, None, None, None, 0.033, 0.027, 0.023, 0.008, None, None, None, None, ], } return pl.DataFrame(data) def test_assign_tiers_default_thresholds(synthetic_scored_genes): """Test tier assignment with default thresholds.""" result = assign_tiers(synthetic_scored_genes) # Check that EXCLUDED genes are filtered out (should have 13 genes remaining) # 3 HIGH + 5 MEDIUM + 5 LOW = 13 (7 excluded: 3 below threshold + 4 NULL) assert result.height == 13, f"Expected 13 genes, got {result.height}" # Verify tier counts tier_dist = result.group_by("confidence_tier").agg(pl.len()).sort("confidence_tier") tier_counts = {row["confidence_tier"]: row["len"] for row in tier_dist.to_dicts()} assert tier_counts.get("HIGH", 0) == 3, "Expected 3 HIGH tier genes" assert tier_counts.get("MEDIUM", 0) == 5, "Expected 5 MEDIUM tier genes" assert tier_counts.get("LOW", 0) == 5, "Expected 5 LOW tier genes" assert "EXCLUDED" not in tier_counts, "EXCLUDED genes should be filtered out" def test_assign_tiers_custom_thresholds(synthetic_scored_genes): """Test tier assignment with custom thresholds.""" custom_thresholds = { "HIGH": {"composite_score": 0.8, "evidence_count": 4}, # Stricter "MEDIUM": {"composite_score": 0.5, "evidence_count": 3}, # Stricter "LOW": {"composite_score": 0.3, "evidence_count": 1}, # More relaxed } result = assign_tiers(synthetic_scored_genes, thresholds=custom_thresholds) # With stricter HIGH threshold (0.8), only 1 gene qualifies (ENSG001 with 0.85) tier_dist = result.group_by("confidence_tier").agg(pl.len()).sort("confidence_tier") tier_counts = {row["confidence_tier"]: row["len"] for row in tier_dist.to_dicts()} assert tier_counts.get("HIGH", 0) == 1, "Expected 1 HIGH tier gene with stricter threshold" def test_assign_tiers_sorting(synthetic_scored_genes): """Test that output is sorted by composite_score DESC, gene_id ASC.""" result = assign_tiers(synthetic_scored_genes) # Extract composite scores (should be descending) scores = result["composite_score"].to_list() # Check descending order for i in range(len(scores) - 1): assert scores[i] >= scores[i + 1], f"Scores not descending at index {i}" # Check first gene is the highest scorer assert result[0, "gene_id"] == "ENSG001", "Highest scorer should be ENSG001" def test_add_evidence_summary_supporting_layers(synthetic_scored_genes): """Test that supporting_layers correctly lists layers with non-NULL scores.""" result = add_evidence_summary(synthetic_scored_genes) # ENSG001 (HIGH1) has scores in: gnomad, expression, annotation, localization, animal_model # (literature is NULL) high1_row = result.filter(pl.col("gene_id") == "ENSG001") supporting = high1_row["supporting_layers"][0] # Extract string value from Series # Check that the 5 layers are listed assert "gnomad" in supporting assert "expression" in supporting assert "annotation" in supporting assert "localization" in supporting assert "animal_model" in supporting assert "literature" not in supporting # NULL score def test_add_evidence_summary_gaps(synthetic_scored_genes): """Test that evidence_gaps correctly lists layers with NULL scores.""" result = add_evidence_summary(synthetic_scored_genes) # ENSG020 (NULL4) has all NULL scores null4_row = result.filter(pl.col("gene_id") == "ENSG020") gaps = null4_row["evidence_gaps"][0] # Extract string value from Series supporting = null4_row["supporting_layers"][0] # Extract string value from Series # All 6 layers should be in gaps for layer in EVIDENCE_LAYERS: assert layer in gaps, f"Layer {layer} should be in evidence_gaps" # supporting_layers should be empty assert supporting == "", "Gene with all NULL scores should have empty supporting_layers" def test_write_candidate_output_creates_files(tmp_path, synthetic_scored_genes): """Test that write_candidate_output creates TSV, Parquet, and provenance files.""" # Add tier and evidence summary columns tiered = assign_tiers(synthetic_scored_genes) full_df = add_evidence_summary(tiered) # Write output paths = write_candidate_output(full_df, tmp_path, filename_base="test_candidates") # Check all files exist assert paths["tsv"].exists(), "TSV file should exist" assert paths["parquet"].exists(), "Parquet file should exist" assert paths["provenance"].exists(), "Provenance YAML should exist" # Check filenames assert paths["tsv"].name == "test_candidates.tsv" assert paths["parquet"].name == "test_candidates.parquet" assert paths["provenance"].name == "test_candidates.provenance.yaml" def test_write_candidate_output_tsv_readable(tmp_path, synthetic_scored_genes): """Test that TSV output can be read back and has correct schema.""" tiered = assign_tiers(synthetic_scored_genes) full_df = add_evidence_summary(tiered) paths = write_candidate_output(full_df, tmp_path) # Read back TSV tsv_df = pl.read_csv(paths["tsv"], separator="\t") # Check row count matches assert tsv_df.height == full_df.height, "TSV should have same row count as input" # Check column count matches assert len(tsv_df.columns) == len(full_df.columns), "TSV should have same column count as input" # Check key columns exist assert "gene_id" in tsv_df.columns assert "confidence_tier" in tsv_df.columns assert "supporting_layers" in tsv_df.columns assert "evidence_gaps" in tsv_df.columns def test_write_candidate_output_parquet_readable(tmp_path, synthetic_scored_genes): """Test that Parquet output can be read back and schema matches.""" tiered = assign_tiers(synthetic_scored_genes) full_df = add_evidence_summary(tiered) paths = write_candidate_output(full_df, tmp_path) # Read back Parquet parquet_df = pl.read_parquet(paths["parquet"]) # Check row count matches assert parquet_df.height == full_df.height, "Parquet should have same row count as input" # Check column count matches assert len(parquet_df.columns) == len(full_df.columns), "Parquet should have same column count as input" # Check schema matches (column names and order) assert parquet_df.columns == full_df.columns, "Parquet should have identical schema to input" def test_write_candidate_output_provenance_yaml(tmp_path, synthetic_scored_genes): """Test that provenance YAML contains accurate statistics.""" tiered = assign_tiers(synthetic_scored_genes) full_df = add_evidence_summary(tiered) paths = write_candidate_output(full_df, tmp_path) # Read provenance YAML with open(paths["provenance"]) as f: prov = yaml.safe_load(f) # Check structure assert "generated_at" in prov, "Provenance should have generated_at timestamp" assert "output_files" in prov, "Provenance should list output files" assert "statistics" in prov, "Provenance should have statistics" assert "column_count" in prov, "Provenance should have column_count" assert "column_names" in prov, "Provenance should have column_names" # Check statistics match stats = prov["statistics"] assert stats["total_candidates"] == full_df.height, "Total candidates should match row count" assert stats["high_count"] == 3, "Should have 3 HIGH tier genes" assert stats["medium_count"] == 5, "Should have 5 MEDIUM tier genes" assert stats["low_count"] == 5, "Should have 5 LOW tier genes" # Check column info assert prov["column_count"] == len(full_df.columns), "Column count should match DataFrame" assert len(prov["column_names"]) == len(full_df.columns), "Column names list should match DataFrame"