feat(01-02): create mapping validation gates with tests
- Add MappingValidator with configurable success rate thresholds (min_success_rate, warn_threshold) - Add validate_gene_universe for gene count, format, and duplicate checks - Add save_unmapped_report for manual review output - Implement 15 comprehensive tests with mocked mygene responses (no real API calls) - Tests cover: successful mapping, notfound handling, uniprot list parsing, batching, validation gates, universe validation
This commit is contained in:
@@ -13,6 +13,11 @@ from usher_pipeline.gene_mapping.universe import (
|
||||
fetch_protein_coding_genes,
|
||||
GeneUniverse,
|
||||
)
|
||||
from usher_pipeline.gene_mapping.validator import (
|
||||
MappingValidator,
|
||||
ValidationResult,
|
||||
validate_gene_universe,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"GeneMapper",
|
||||
@@ -20,4 +25,7 @@ __all__ = [
|
||||
"MappingReport",
|
||||
"fetch_protein_coding_genes",
|
||||
"GeneUniverse",
|
||||
"MappingValidator",
|
||||
"ValidationResult",
|
||||
"validate_gene_universe",
|
||||
]
|
||||
|
||||
224
src/usher_pipeline/gene_mapping/validator.py
Normal file
224
src/usher_pipeline/gene_mapping/validator.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""Validation gates for gene mapping quality control.
|
||||
|
||||
Provides validation for mapping results and gene universe data quality.
|
||||
Enforces configurable success rate thresholds and produces actionable reports.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from usher_pipeline.gene_mapping.mapper import MappingReport
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ValidationResult:
|
||||
"""Result of a validation check.
|
||||
|
||||
Attributes:
|
||||
passed: Whether validation passed
|
||||
messages: List of validation messages (warnings, errors)
|
||||
hgnc_rate: HGNC mapping success rate (0-1)
|
||||
uniprot_rate: UniProt mapping success rate (0-1)
|
||||
"""
|
||||
passed: bool
|
||||
messages: list[str] = field(default_factory=list)
|
||||
hgnc_rate: float = 0.0
|
||||
uniprot_rate: float = 0.0
|
||||
|
||||
|
||||
class MappingValidator:
|
||||
"""Validator for gene ID mapping results.
|
||||
|
||||
Enforces configurable success rate thresholds and produces validation reports.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
min_success_rate: float = 0.90,
|
||||
warn_threshold: float = 0.95
|
||||
):
|
||||
"""Initialize mapping validator.
|
||||
|
||||
Args:
|
||||
min_success_rate: Minimum HGNC mapping success rate to pass (default: 0.90)
|
||||
warn_threshold: Success rate below this triggers warning (default: 0.95)
|
||||
"""
|
||||
self.min_success_rate = min_success_rate
|
||||
self.warn_threshold = warn_threshold
|
||||
logger.info(
|
||||
f"Initialized MappingValidator: min_rate={min_success_rate}, "
|
||||
f"warn_threshold={warn_threshold}"
|
||||
)
|
||||
|
||||
def validate(self, report: MappingReport) -> ValidationResult:
|
||||
"""Validate gene mapping results.
|
||||
|
||||
Checks if HGNC mapping success rate meets minimum threshold.
|
||||
Issues warning if rate is below warn_threshold but above min_success_rate.
|
||||
|
||||
Args:
|
||||
report: MappingReport from batch mapping operation
|
||||
|
||||
Returns:
|
||||
ValidationResult with pass/fail status and messages
|
||||
"""
|
||||
messages: list[str] = []
|
||||
hgnc_rate = report.success_rate_hgnc
|
||||
uniprot_rate = report.success_rate_uniprot
|
||||
|
||||
# Check HGNC success rate
|
||||
if hgnc_rate < self.min_success_rate:
|
||||
messages.append(
|
||||
f"FAILED: HGNC mapping success rate {hgnc_rate:.1%} is below "
|
||||
f"minimum threshold {self.min_success_rate:.1%}"
|
||||
)
|
||||
messages.append(
|
||||
f"Mapped {report.mapped_hgnc}/{report.total_genes} genes to HGNC symbols"
|
||||
)
|
||||
messages.append(
|
||||
f"Unmapped genes: {len(report.unmapped_ids)} "
|
||||
f"(first 10: {report.unmapped_ids[:10]})"
|
||||
)
|
||||
passed = False
|
||||
elif hgnc_rate < self.warn_threshold:
|
||||
messages.append(
|
||||
f"WARNING: HGNC mapping success rate {hgnc_rate:.1%} is below "
|
||||
f"warning threshold {self.warn_threshold:.1%}"
|
||||
)
|
||||
messages.append(
|
||||
f"Mapped {report.mapped_hgnc}/{report.total_genes} genes to HGNC symbols"
|
||||
)
|
||||
messages.append(
|
||||
f"Consider reviewing {len(report.unmapped_ids)} unmapped genes"
|
||||
)
|
||||
passed = True
|
||||
else:
|
||||
messages.append(
|
||||
f"PASSED: HGNC mapping success rate {hgnc_rate:.1%} "
|
||||
f"({report.mapped_hgnc}/{report.total_genes} genes)"
|
||||
)
|
||||
passed = True
|
||||
|
||||
# Report UniProt stats (informational only, not used for pass/fail)
|
||||
messages.append(
|
||||
f"UniProt mapping: {uniprot_rate:.1%} "
|
||||
f"({report.mapped_uniprot}/{report.total_genes} genes)"
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Validation result: {'PASSED' if passed else 'FAILED'} "
|
||||
f"(HGNC: {hgnc_rate:.1%}, UniProt: {uniprot_rate:.1%})"
|
||||
)
|
||||
|
||||
return ValidationResult(
|
||||
passed=passed,
|
||||
messages=messages,
|
||||
hgnc_rate=hgnc_rate,
|
||||
uniprot_rate=uniprot_rate,
|
||||
)
|
||||
|
||||
def save_unmapped_report(
|
||||
self,
|
||||
report: MappingReport,
|
||||
output_path: Path
|
||||
) -> None:
|
||||
"""Save list of unmapped genes to file for manual review.
|
||||
|
||||
Args:
|
||||
report: MappingReport containing unmapped gene IDs
|
||||
output_path: Path to output file
|
||||
"""
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
unmapped_count = len(report.unmapped_ids)
|
||||
|
||||
with output_path.open('w') as f:
|
||||
f.write(f"# Unmapped Gene IDs\n")
|
||||
f.write(f"# Generated: {timestamp}\n")
|
||||
f.write(f"# Total unmapped: {unmapped_count}\n")
|
||||
f.write(f"# Success rate: {report.success_rate_hgnc:.1%}\n")
|
||||
f.write(f"#\n")
|
||||
for gene_id in report.unmapped_ids:
|
||||
f.write(f"{gene_id}\n")
|
||||
|
||||
logger.info(
|
||||
f"Saved {unmapped_count} unmapped gene IDs to {output_path}"
|
||||
)
|
||||
|
||||
|
||||
def validate_gene_universe(genes: list[str]) -> ValidationResult:
|
||||
"""Validate gene universe data quality.
|
||||
|
||||
Checks:
|
||||
- Gene count is in expected range (19,000-22,000)
|
||||
- All gene IDs start with ENSG (Ensembl format)
|
||||
- No duplicate gene IDs
|
||||
|
||||
Args:
|
||||
genes: List of gene IDs to validate
|
||||
|
||||
Returns:
|
||||
ValidationResult with validation status and messages
|
||||
"""
|
||||
messages: list[str] = []
|
||||
passed = True
|
||||
|
||||
gene_count = len(genes)
|
||||
MIN_GENES = 19000
|
||||
MAX_GENES = 22000
|
||||
|
||||
# Check gene count
|
||||
if gene_count < MIN_GENES:
|
||||
messages.append(
|
||||
f"FAILED: Gene count {gene_count} is below minimum {MIN_GENES}. "
|
||||
"This may indicate missing data or incomplete query."
|
||||
)
|
||||
passed = False
|
||||
elif gene_count > MAX_GENES:
|
||||
messages.append(
|
||||
f"FAILED: Gene count {gene_count} exceeds maximum {MAX_GENES}. "
|
||||
"This may indicate pseudogene contamination or inclusion of non-coding genes."
|
||||
)
|
||||
passed = False
|
||||
else:
|
||||
messages.append(
|
||||
f"Gene count {gene_count} is within expected range ({MIN_GENES}-{MAX_GENES})"
|
||||
)
|
||||
|
||||
# Check all IDs start with ENSG
|
||||
non_ensg = [g for g in genes if not g.startswith('ENSG')]
|
||||
if non_ensg:
|
||||
messages.append(
|
||||
f"FAILED: Found {len(non_ensg)} gene IDs not in ENSG format "
|
||||
f"(examples: {non_ensg[:5]})"
|
||||
)
|
||||
passed = False
|
||||
else:
|
||||
messages.append("All gene IDs are in ENSG format")
|
||||
|
||||
# Check for duplicates
|
||||
unique_genes = set(genes)
|
||||
if len(unique_genes) < gene_count:
|
||||
duplicates = gene_count - len(unique_genes)
|
||||
messages.append(
|
||||
f"FAILED: Found {duplicates} duplicate gene IDs"
|
||||
)
|
||||
passed = False
|
||||
else:
|
||||
messages.append("No duplicate gene IDs found")
|
||||
|
||||
logger.info(
|
||||
f"Gene universe validation: {'PASSED' if passed else 'FAILED'} "
|
||||
f"({gene_count} genes)"
|
||||
)
|
||||
|
||||
return ValidationResult(
|
||||
passed=passed,
|
||||
messages=messages,
|
||||
)
|
||||
328
tests/test_gene_mapping.py
Normal file
328
tests/test_gene_mapping.py
Normal file
@@ -0,0 +1,328 @@
|
||||
"""Tests for gene ID mapping module.
|
||||
|
||||
Tests gene universe retrieval, batch mapping, and validation gates.
|
||||
Uses mocked mygene responses to avoid real API calls.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from usher_pipeline.gene_mapping import (
|
||||
GeneMapper,
|
||||
MappingReport,
|
||||
MappingResult,
|
||||
MappingValidator,
|
||||
ValidationResult,
|
||||
validate_gene_universe,
|
||||
)
|
||||
|
||||
|
||||
# Mock mygene response fixtures
|
||||
|
||||
MOCK_SUCCESSFUL_RESPONSE = {
|
||||
'out': [
|
||||
{
|
||||
'query': 'ENSG00000139618',
|
||||
'symbol': 'BRCA2',
|
||||
'uniprot': {'Swiss-Prot': 'P51587'}
|
||||
},
|
||||
{
|
||||
'query': 'ENSG00000141510',
|
||||
'symbol': 'TP53',
|
||||
'uniprot': {'Swiss-Prot': 'P04637'}
|
||||
},
|
||||
{
|
||||
'query': 'ENSG00000012048',
|
||||
'symbol': 'BRCA1',
|
||||
'uniprot': {'Swiss-Prot': 'P38398'}
|
||||
},
|
||||
],
|
||||
'missing': []
|
||||
}
|
||||
|
||||
MOCK_RESPONSE_WITH_NOTFOUND = {
|
||||
'out': [
|
||||
{
|
||||
'query': 'ENSG00000139618',
|
||||
'symbol': 'BRCA2',
|
||||
'uniprot': {'Swiss-Prot': 'P51587'}
|
||||
},
|
||||
{
|
||||
'query': 'ENSG00000141510',
|
||||
'symbol': 'TP53',
|
||||
'uniprot': {'Swiss-Prot': 'P04637'}
|
||||
},
|
||||
{
|
||||
'query': 'ENSG00000000000',
|
||||
'notfound': True,
|
||||
},
|
||||
],
|
||||
'missing': ['ENSG00000000000']
|
||||
}
|
||||
|
||||
MOCK_RESPONSE_WITH_UNIPROT_LIST = {
|
||||
'out': [
|
||||
{
|
||||
'query': 'ENSG00000139618',
|
||||
'symbol': 'BRCA2',
|
||||
'uniprot': {'Swiss-Prot': ['P51587', 'Q9UBX7']} # List of accessions
|
||||
},
|
||||
],
|
||||
'missing': []
|
||||
}
|
||||
|
||||
|
||||
# Test MappingResult creation
|
||||
|
||||
def test_mapping_result_creation():
|
||||
"""Test creating MappingResult with all fields."""
|
||||
result = MappingResult(
|
||||
ensembl_id='ENSG00000139618',
|
||||
hgnc_symbol='BRCA2',
|
||||
uniprot_accession='P51587',
|
||||
mapping_source='mygene'
|
||||
)
|
||||
|
||||
assert result.ensembl_id == 'ENSG00000139618'
|
||||
assert result.hgnc_symbol == 'BRCA2'
|
||||
assert result.uniprot_accession == 'P51587'
|
||||
assert result.mapping_source == 'mygene'
|
||||
|
||||
|
||||
def test_mapping_result_with_none_values():
|
||||
"""Test MappingResult handles missing data."""
|
||||
result = MappingResult(
|
||||
ensembl_id='ENSG00000000000',
|
||||
)
|
||||
|
||||
assert result.ensembl_id == 'ENSG00000000000'
|
||||
assert result.hgnc_symbol is None
|
||||
assert result.uniprot_accession is None
|
||||
assert result.mapping_source == 'mygene'
|
||||
|
||||
|
||||
# Test GeneMapper with mocked mygene
|
||||
|
||||
def test_mapper_handles_successful_mapping():
|
||||
"""Test mapper with all genes successfully mapped."""
|
||||
with patch('mygene.MyGeneInfo') as mock_mygene:
|
||||
mock_mg = MagicMock()
|
||||
mock_mg.querymany.return_value = MOCK_SUCCESSFUL_RESPONSE
|
||||
mock_mygene.return_value = mock_mg
|
||||
|
||||
mapper = GeneMapper(batch_size=1000)
|
||||
results, report = mapper.map_ensembl_ids([
|
||||
'ENSG00000139618',
|
||||
'ENSG00000141510',
|
||||
'ENSG00000012048',
|
||||
])
|
||||
|
||||
# Check results
|
||||
assert len(results) == 3
|
||||
assert results[0].ensembl_id == 'ENSG00000139618'
|
||||
assert results[0].hgnc_symbol == 'BRCA2'
|
||||
assert results[0].uniprot_accession == 'P51587'
|
||||
|
||||
# Check report
|
||||
assert report.total_genes == 3
|
||||
assert report.mapped_hgnc == 3
|
||||
assert report.mapped_uniprot == 3
|
||||
assert report.success_rate_hgnc == 1.0
|
||||
assert report.success_rate_uniprot == 1.0
|
||||
assert len(report.unmapped_ids) == 0
|
||||
|
||||
|
||||
def test_mapper_handles_unmapped_genes():
|
||||
"""Test mapper with one gene not found."""
|
||||
with patch('mygene.MyGeneInfo') as mock_mygene:
|
||||
mock_mg = MagicMock()
|
||||
mock_mg.querymany.return_value = MOCK_RESPONSE_WITH_NOTFOUND
|
||||
mock_mygene.return_value = mock_mg
|
||||
|
||||
mapper = GeneMapper()
|
||||
results, report = mapper.map_ensembl_ids([
|
||||
'ENSG00000139618',
|
||||
'ENSG00000141510',
|
||||
'ENSG00000000000',
|
||||
])
|
||||
|
||||
# Check results
|
||||
assert len(results) == 3
|
||||
assert results[2].ensembl_id == 'ENSG00000000000'
|
||||
assert results[2].hgnc_symbol is None
|
||||
assert results[2].uniprot_accession is None
|
||||
|
||||
# Check report
|
||||
assert report.total_genes == 3
|
||||
assert report.mapped_hgnc == 2
|
||||
assert report.mapped_uniprot == 2
|
||||
assert abs(report.success_rate_hgnc - 0.667) < 0.01
|
||||
assert abs(report.success_rate_uniprot - 0.667) < 0.01
|
||||
assert 'ENSG00000000000' in report.unmapped_ids
|
||||
assert len(report.unmapped_ids) == 1
|
||||
|
||||
|
||||
def test_mapper_handles_uniprot_list():
|
||||
"""Test mapper handles UniProt Swiss-Prot as list (takes first)."""
|
||||
with patch('mygene.MyGeneInfo') as mock_mygene:
|
||||
mock_mg = MagicMock()
|
||||
mock_mg.querymany.return_value = MOCK_RESPONSE_WITH_UNIPROT_LIST
|
||||
mock_mygene.return_value = mock_mg
|
||||
|
||||
mapper = GeneMapper()
|
||||
results, report = mapper.map_ensembl_ids(['ENSG00000139618'])
|
||||
|
||||
# Should take first UniProt accession from list
|
||||
assert results[0].uniprot_accession == 'P51587'
|
||||
assert report.mapped_uniprot == 1
|
||||
|
||||
|
||||
def test_mapper_batching():
|
||||
"""Test mapper processes genes in batches."""
|
||||
with patch('mygene.MyGeneInfo') as mock_mygene:
|
||||
mock_mg = MagicMock()
|
||||
# Return empty response for each batch
|
||||
mock_mg.querymany.return_value = {'out': [], 'missing': []}
|
||||
mock_mygene.return_value = mock_mg
|
||||
|
||||
mapper = GeneMapper(batch_size=2)
|
||||
# 5 genes should result in 3 batches (2+2+1)
|
||||
gene_ids = [f'ENSG{i:011d}' for i in range(5)]
|
||||
results, report = mapper.map_ensembl_ids(gene_ids)
|
||||
|
||||
# Check querymany was called 3 times (3 batches)
|
||||
assert mock_mg.querymany.call_count == 3
|
||||
|
||||
|
||||
# Test MappingValidator
|
||||
|
||||
def test_validator_passes_high_rate():
|
||||
"""Test validator passes with success rate above minimum."""
|
||||
report = MappingReport(
|
||||
total_genes=100,
|
||||
mapped_hgnc=95,
|
||||
mapped_uniprot=90,
|
||||
unmapped_ids=[f'ENSG{i:011d}' for i in range(5)],
|
||||
)
|
||||
|
||||
validator = MappingValidator(min_success_rate=0.90)
|
||||
result = validator.validate(report)
|
||||
|
||||
assert result.passed is True
|
||||
assert result.hgnc_rate == 0.95
|
||||
assert result.uniprot_rate == 0.90
|
||||
assert any('PASSED' in msg for msg in result.messages)
|
||||
|
||||
|
||||
def test_validator_fails_low_rate():
|
||||
"""Test validator fails with success rate below minimum."""
|
||||
report = MappingReport(
|
||||
total_genes=100,
|
||||
mapped_hgnc=80,
|
||||
mapped_uniprot=75,
|
||||
unmapped_ids=[f'ENSG{i:011d}' for i in range(20)],
|
||||
)
|
||||
|
||||
validator = MappingValidator(min_success_rate=0.90)
|
||||
result = validator.validate(report)
|
||||
|
||||
assert result.passed is False
|
||||
assert result.hgnc_rate == 0.80
|
||||
assert any('FAILED' in msg for msg in result.messages)
|
||||
|
||||
|
||||
def test_validator_warns_medium_rate():
|
||||
"""Test validator passes with warning for medium success rate."""
|
||||
report = MappingReport(
|
||||
total_genes=100,
|
||||
mapped_hgnc=92,
|
||||
mapped_uniprot=88,
|
||||
unmapped_ids=[f'ENSG{i:011d}' for i in range(8)],
|
||||
)
|
||||
|
||||
validator = MappingValidator(min_success_rate=0.90, warn_threshold=0.95)
|
||||
result = validator.validate(report)
|
||||
|
||||
# Should pass but with warning
|
||||
assert result.passed is True
|
||||
assert result.hgnc_rate == 0.92
|
||||
assert any('WARNING' in msg for msg in result.messages)
|
||||
|
||||
|
||||
def test_save_unmapped_report(tmp_path):
|
||||
"""Test saving unmapped gene IDs to file."""
|
||||
report = MappingReport(
|
||||
total_genes=100,
|
||||
mapped_hgnc=95,
|
||||
mapped_uniprot=90,
|
||||
unmapped_ids=['ENSG00000000001', 'ENSG00000000002', 'ENSG00000000003'],
|
||||
)
|
||||
|
||||
validator = MappingValidator()
|
||||
output_path = tmp_path / "unmapped_genes.txt"
|
||||
validator.save_unmapped_report(report, output_path)
|
||||
|
||||
# Check file was created and contains expected content
|
||||
assert output_path.exists()
|
||||
content = output_path.read_text()
|
||||
assert '# Unmapped Gene IDs' in content
|
||||
assert '# Total unmapped: 3' in content
|
||||
assert 'ENSG00000000001' in content
|
||||
assert 'ENSG00000000002' in content
|
||||
assert 'ENSG00000000003' in content
|
||||
|
||||
|
||||
# Test validate_gene_universe
|
||||
|
||||
def test_validate_gene_universe_valid():
|
||||
"""Test gene universe validation with valid data."""
|
||||
genes = [f'ENSG{i:011d}' for i in range(20000)] # 20k genes
|
||||
result = validate_gene_universe(genes)
|
||||
|
||||
assert result.passed is True
|
||||
assert any('within expected range' in msg for msg in result.messages)
|
||||
assert any('ENSG format' in msg for msg in result.messages)
|
||||
assert any('No duplicate' in msg for msg in result.messages)
|
||||
|
||||
|
||||
def test_validate_gene_universe_invalid_count():
|
||||
"""Test gene universe validation fails with too many genes."""
|
||||
genes = [f'ENSG{i:011d}' for i in range(50000)] # 50k genes (too many)
|
||||
result = validate_gene_universe(genes)
|
||||
|
||||
assert result.passed is False
|
||||
assert any('exceeds maximum' in msg for msg in result.messages)
|
||||
|
||||
|
||||
def test_validate_gene_universe_invalid_format():
|
||||
"""Test gene universe validation fails with non-ENSG IDs."""
|
||||
genes = [f'ENSG{i:011d}' for i in range(19500)]
|
||||
genes.extend(['INVALID001', 'INVALID002']) # Add invalid IDs
|
||||
|
||||
result = validate_gene_universe(genes)
|
||||
|
||||
assert result.passed is False
|
||||
assert any('not in ENSG format' in msg for msg in result.messages)
|
||||
|
||||
|
||||
def test_validate_gene_universe_duplicates():
|
||||
"""Test gene universe validation fails with duplicates."""
|
||||
genes = [f'ENSG{i:011d}' for i in range(19500)]
|
||||
genes.extend(['ENSG00000000001', 'ENSG00000000002']) # Add duplicates
|
||||
|
||||
result = validate_gene_universe(genes)
|
||||
|
||||
assert result.passed is False
|
||||
assert any('duplicate' in msg for msg in result.messages)
|
||||
|
||||
|
||||
def test_validate_gene_universe_too_few():
|
||||
"""Test gene universe validation fails with too few genes."""
|
||||
genes = [f'ENSG{i:011d}' for i in range(1000)] # Only 1k genes
|
||||
|
||||
result = validate_gene_universe(genes)
|
||||
|
||||
assert result.passed is False
|
||||
assert any('below minimum' in msg for msg in result.messages)
|
||||
Reference in New Issue
Block a user