feat(01-02): create mapping validation gates with tests

- Add MappingValidator with configurable success rate thresholds (min_success_rate, warn_threshold)
- Add validate_gene_universe for gene count, format, and duplicate checks
- Add save_unmapped_report for manual review output
- Implement 15 comprehensive tests with mocked mygene responses (no real API calls)
- Tests cover: successful mapping, notfound handling, uniprot list parsing, batching, validation gates, universe validation
This commit is contained in:
2026-02-11 16:33:36 +08:00
parent 98a1a750dd
commit 0200395d9e
3 changed files with 560 additions and 0 deletions

View File

@@ -13,6 +13,11 @@ from usher_pipeline.gene_mapping.universe import (
fetch_protein_coding_genes, fetch_protein_coding_genes,
GeneUniverse, GeneUniverse,
) )
from usher_pipeline.gene_mapping.validator import (
MappingValidator,
ValidationResult,
validate_gene_universe,
)
__all__ = [ __all__ = [
"GeneMapper", "GeneMapper",
@@ -20,4 +25,7 @@ __all__ = [
"MappingReport", "MappingReport",
"fetch_protein_coding_genes", "fetch_protein_coding_genes",
"GeneUniverse", "GeneUniverse",
"MappingValidator",
"ValidationResult",
"validate_gene_universe",
] ]

View File

@@ -0,0 +1,224 @@
"""Validation gates for gene mapping quality control.
Provides validation for mapping results and gene universe data quality.
Enforces configurable success rate thresholds and produces actionable reports.
"""
import logging
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from usher_pipeline.gene_mapping.mapper import MappingReport
logger = logging.getLogger(__name__)
@dataclass
class ValidationResult:
    """Outcome of a single validation gate.

    Attributes:
        passed: True when the gate was cleared, False otherwise
        messages: Human-readable lines (pass/warning/failure details)
        hgnc_rate: Fraction of genes mapped to HGNC symbols (0-1)
        uniprot_rate: Fraction of genes mapped to UniProt accessions (0-1)
    """

    # The only required field; everything else has a neutral default.
    passed: bool
    # default_factory gives every instance its own list.
    messages: list[str] = field(default_factory=list)
    hgnc_rate: float = 0.0
    uniprot_rate: float = 0.0
class MappingValidator:
    """Validator for gene ID mapping results.

    Compares the HGNC success rate of a mapping report against a hard
    minimum (below it validation fails) and a softer warning threshold
    (below it validation still passes, with a warning). The UniProt rate
    is reported in the messages but never affects the outcome.
    """

    def __init__(
        self,
        min_success_rate: float = 0.90,
        warn_threshold: float = 0.95,
    ):
        """Initialize mapping validator.

        Args:
            min_success_rate: Minimum HGNC mapping success rate to pass,
                as a fraction between 0 and 1 (default: 0.90)
            warn_threshold: Success rate below this triggers warning,
                as a fraction between 0 and 1 (default: 0.95)

        Raises:
            ValueError: If either threshold lies outside the range 0-1.
        """
        for name, value in (
            ("min_success_rate", min_success_rate),
            ("warn_threshold", warn_threshold),
        ):
            # Thresholds are rendered with ``:.1%`` and compared to report
            # rates, so a value like 90 is almost certainly a percentage
            # passed by mistake; surface it instead of gating on nonsense.
            if not 0.0 <= value <= 1.0:
                raise ValueError(
                    f"{name} must be between 0 and 1, got {value!r}"
                )

        if warn_threshold < min_success_rate:
            # Not an error: the elif branch in validate() simply never fires.
            logger.warning(
                "warn_threshold=%s is below min_success_rate=%s; "
                "no report can land in the warning band",
                warn_threshold,
                min_success_rate,
            )

        self.min_success_rate = min_success_rate
        self.warn_threshold = warn_threshold
        logger.info(
            "Initialized MappingValidator: min_rate=%s, warn_threshold=%s",
            min_success_rate,
            warn_threshold,
        )

    def validate(self, report: MappingReport) -> ValidationResult:
        """Validate gene mapping results.

        Checks if HGNC mapping success rate meets minimum threshold.
        Issues warning if rate is below warn_threshold but at or above
        min_success_rate.

        Args:
            report: MappingReport from batch mapping operation

        Returns:
            ValidationResult with pass/fail status, messages, and both
            success rates copied from the report
        """
        hgnc_rate = report.success_rate_hgnc
        uniprot_rate = report.success_rate_uniprot
        # Shared by the FAILED and WARNING branches.
        mapped_summary = (
            f"Mapped {report.mapped_hgnc}/{report.total_genes} genes to HGNC symbols"
        )

        if hgnc_rate < self.min_success_rate:
            passed = False
            messages = [
                f"FAILED: HGNC mapping success rate {hgnc_rate:.1%} is below "
                f"minimum threshold {self.min_success_rate:.1%}",
                mapped_summary,
                f"Unmapped genes: {len(report.unmapped_ids)} "
                f"(first 10: {report.unmapped_ids[:10]})",
            ]
        elif hgnc_rate < self.warn_threshold:
            passed = True
            messages = [
                f"WARNING: HGNC mapping success rate {hgnc_rate:.1%} is below "
                f"warning threshold {self.warn_threshold:.1%}",
                mapped_summary,
                f"Consider reviewing {len(report.unmapped_ids)} unmapped genes",
            ]
        else:
            passed = True
            messages = [
                f"PASSED: HGNC mapping success rate {hgnc_rate:.1%} "
                f"({report.mapped_hgnc}/{report.total_genes} genes)"
            ]

        # Report UniProt stats (informational only, not used for pass/fail)
        messages.append(
            f"UniProt mapping: {uniprot_rate:.1%} "
            f"({report.mapped_uniprot}/{report.total_genes} genes)"
        )

        logger.info(
            "Validation result: %s (HGNC: %s, UniProt: %s)",
            "PASSED" if passed else "FAILED",
            format(hgnc_rate, ".1%"),
            format(uniprot_rate, ".1%"),
        )
        return ValidationResult(
            passed=passed,
            messages=messages,
            hgnc_rate=hgnc_rate,
            uniprot_rate=uniprot_rate,
        )

    def save_unmapped_report(
        self,
        report: MappingReport,
        output_path: Path,
    ) -> None:
        """Save list of unmapped genes to file for manual review.

        Writes a '#'-prefixed header (timestamp, unmapped count, HGNC
        success rate) followed by one gene ID per line. Missing parent
        directories are created; an existing file is overwritten.

        Args:
            report: MappingReport containing unmapped gene IDs
            output_path: Path to output file
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Naive local time, kept as in the original report format.
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        unmapped_count = len(report.unmapped_ids)
        header = [
            "# Unmapped Gene IDs\n",
            f"# Generated: {timestamp}\n",
            f"# Total unmapped: {unmapped_count}\n",
            f"# Success rate: {report.success_rate_hgnc:.1%}\n",
            "#\n",
        ]

        # Explicit encoding so the file does not depend on the platform locale.
        with output_path.open("w", encoding="utf-8") as f:
            f.writelines(header)
            f.writelines(f"{gene_id}\n" for gene_id in report.unmapped_ids)

        logger.info(
            "Saved %s unmapped gene IDs to %s", unmapped_count, output_path
        )
def validate_gene_universe(
    genes: list[str],
    *,
    min_genes: int = 19000,
    max_genes: int = 22000,
) -> ValidationResult:
    """Validate gene universe data quality.

    Checks:
        - Gene count is in expected range (min_genes-max_genes,
          19,000-22,000 by default)
        - All gene IDs start with ENSG (Ensembl format)
        - No duplicate gene IDs

    All three checks always run, so the returned messages contain one
    line per check even when an earlier check has already failed.

    Args:
        genes: List of gene IDs to validate
        min_genes: Smallest acceptable gene count (inclusive)
        max_genes: Largest acceptable gene count (inclusive)

    Returns:
        ValidationResult with validation status and messages; the rate
        fields are left at their defaults
    """
    messages: list[str] = []
    passed = True
    gene_count = len(genes)

    # Check gene count
    if gene_count < min_genes:
        messages.append(
            f"FAILED: Gene count {gene_count} is below minimum {min_genes}. "
            "This may indicate missing data or incomplete query."
        )
        passed = False
    elif gene_count > max_genes:
        messages.append(
            f"FAILED: Gene count {gene_count} exceeds maximum {max_genes}. "
            "This may indicate pseudogene contamination or inclusion of non-coding genes."
        )
        passed = False
    else:
        messages.append(
            f"Gene count {gene_count} is within expected range ({min_genes}-{max_genes})"
        )

    # Check all IDs start with ENSG. Non-string entries (e.g. None) are
    # reported as malformed IDs instead of raising AttributeError.
    non_ensg = [
        g for g in genes
        if not (isinstance(g, str) and g.startswith('ENSG'))
    ]
    if non_ensg:
        messages.append(
            f"FAILED: Found {len(non_ensg)} gene IDs not in ENSG format "
            f"(examples: {non_ensg[:5]})"
        )
        passed = False
    else:
        messages.append("All gene IDs are in ENSG format")

    # Check for duplicates: every occurrence beyond the first counts once.
    duplicates = gene_count - len(set(genes))
    if duplicates > 0:
        messages.append(
            f"FAILED: Found {duplicates} duplicate gene IDs"
        )
        passed = False
    else:
        messages.append("No duplicate gene IDs found")

    logger.info(
        f"Gene universe validation: {'PASSED' if passed else 'FAILED'} "
        f"({gene_count} genes)"
    )
    return ValidationResult(
        passed=passed,
        messages=messages,
    )

328
tests/test_gene_mapping.py Normal file
View File

@@ -0,0 +1,328 @@
"""Tests for gene ID mapping module.
Tests gene universe retrieval, batch mapping, and validation gates.
Uses mocked mygene responses to avoid real API calls.
"""
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from usher_pipeline.gene_mapping import (
GeneMapper,
MappingReport,
MappingResult,
MappingValidator,
ValidationResult,
validate_gene_universe,
)
# Mock mygene response fixtures
def _mock_hit(query, symbol, swiss_prot):
    """Build one mocked hit carrying a symbol and a Swiss-Prot value."""
    return {
        'query': query,
        'symbol': symbol,
        'uniprot': {'Swiss-Prot': swiss_prot},
    }


# Every queried ID resolves to a symbol and a single Swiss-Prot accession.
MOCK_SUCCESSFUL_RESPONSE = {
    'out': [
        _mock_hit('ENSG00000139618', 'BRCA2', 'P51587'),
        _mock_hit('ENSG00000141510', 'TP53', 'P04637'),
        _mock_hit('ENSG00000012048', 'BRCA1', 'P38398'),
    ],
    'missing': [],
}

# Two hits plus one entry flagged 'notfound', which is also listed as missing.
MOCK_RESPONSE_WITH_NOTFOUND = {
    'out': [
        _mock_hit('ENSG00000139618', 'BRCA2', 'P51587'),
        _mock_hit('ENSG00000141510', 'TP53', 'P04637'),
        {'query': 'ENSG00000000000', 'notfound': True},
    ],
    'missing': ['ENSG00000000000'],
}

# Single hit whose Swiss-Prot value is a list of accessions, not a string.
MOCK_RESPONSE_WITH_UNIPROT_LIST = {
    'out': [
        _mock_hit('ENSG00000139618', 'BRCA2', ['P51587', 'Q9UBX7']),
    ],
    'missing': [],
}
# Test MappingResult creation
def test_mapping_result_creation():
    """MappingResult stores every field it is constructed with."""
    expected = {
        'ensembl_id': 'ENSG00000139618',
        'hgnc_symbol': 'BRCA2',
        'uniprot_accession': 'P51587',
        'mapping_source': 'mygene',
    }
    result = MappingResult(**expected)
    # Same four equality checks, driven by the constructor arguments.
    for attr_name, attr_value in expected.items():
        assert getattr(result, attr_name) == attr_value
def test_mapping_result_with_none_values():
    """MappingResult built from only an Ensembl ID falls back to its defaults."""
    bare_id = 'ENSG00000000000'
    result = MappingResult(ensembl_id=bare_id)

    assert result.ensembl_id == 'ENSG00000000000'
    # Optional identifiers are absent; the source keeps its default label.
    assert result.hgnc_symbol is None
    assert result.uniprot_accession is None
    assert result.mapping_source == 'mygene'
# Test GeneMapper with mocked mygene
def test_mapper_handles_successful_mapping():
    """Mapper reports full coverage when the mocked client returns a hit per ID."""
    queried_ids = [
        'ENSG00000139618',
        'ENSG00000141510',
        'ENSG00000012048',
    ]
    with patch('mygene.MyGeneInfo') as mygene_cls:
        # The client instance is whatever the patched class returns.
        mygene_cls.return_value.querymany.return_value = MOCK_SUCCESSFUL_RESPONSE
        mapper = GeneMapper(batch_size=1000)
        results, report = mapper.map_ensembl_ids(queried_ids)

    # Check results
    assert len(results) == 3
    first = results[0]
    assert first.ensembl_id == 'ENSG00000139618'
    assert first.hgnc_symbol == 'BRCA2'
    assert first.uniprot_accession == 'P51587'

    # Check report
    assert report.total_genes == 3
    assert report.mapped_hgnc == 3
    assert report.mapped_uniprot == 3
    assert report.success_rate_hgnc == 1.0
    assert report.success_rate_uniprot == 1.0
    assert len(report.unmapped_ids) == 0
def test_mapper_handles_unmapped_genes():
    """Mapper keeps a placeholder result and records the ID when one gene is not found."""
    missing_id = 'ENSG00000000000'
    with patch('mygene.MyGeneInfo') as mygene_cls:
        mygene_cls.return_value.querymany.return_value = MOCK_RESPONSE_WITH_NOTFOUND
        mapper = GeneMapper()
        results, report = mapper.map_ensembl_ids([
            'ENSG00000139618',
            'ENSG00000141510',
            missing_id,
        ])

    # Check results: the unmapped gene is still present, with empty mappings.
    assert len(results) == 3
    unmapped = results[2]
    assert unmapped.ensembl_id == 'ENSG00000000000'
    assert unmapped.hgnc_symbol is None
    assert unmapped.uniprot_accession is None

    # Check report: two of three mapped, so both rates are roughly 2/3.
    assert report.total_genes == 3
    assert report.mapped_hgnc == 2
    assert report.mapped_uniprot == 2
    assert abs(report.success_rate_hgnc - 0.667) < 0.01
    assert abs(report.success_rate_uniprot - 0.667) < 0.01
    assert 'ENSG00000000000' in report.unmapped_ids
    assert len(report.unmapped_ids) == 1
def test_mapper_handles_uniprot_list():
    """Mapper picks the first accession when Swiss-Prot comes back as a list."""
    with patch('mygene.MyGeneInfo') as mygene_cls:
        mygene_cls.return_value.querymany.return_value = MOCK_RESPONSE_WITH_UNIPROT_LIST
        mapper = GeneMapper()
        results, report = mapper.map_ensembl_ids(['ENSG00000139618'])

    # Should take first UniProt accession from list
    only_result = results[0]
    assert only_result.uniprot_accession == 'P51587'
    assert report.mapped_uniprot == 1
def test_mapper_batching():
    """Mapper issues one querymany call per batch of IDs."""
    # 5 genes with batch_size=2 should result in 3 batches (2+2+1)
    gene_ids = [f'ENSG{index:011d}' for index in range(5)]
    empty_response = {'out': [], 'missing': []}

    with patch('mygene.MyGeneInfo') as mygene_cls:
        client = mygene_cls.return_value
        # Every batch gets the same empty response.
        client.querymany.return_value = empty_response
        mapper = GeneMapper(batch_size=2)
        _results, _report = mapper.map_ensembl_ids(gene_ids)

    # Check querymany was called 3 times (3 batches)
    assert client.querymany.call_count == 3
# Test MappingValidator
def test_validator_passes_high_rate():
    """Validator passes a report whose HGNC rate clears the minimum."""
    unmapped = [f'ENSG{index:011d}' for index in range(5)]
    report = MappingReport(
        total_genes=100,
        mapped_hgnc=95,
        mapped_uniprot=90,
        unmapped_ids=unmapped,
    )

    outcome = MappingValidator(min_success_rate=0.90).validate(report)

    assert outcome.passed is True
    assert outcome.hgnc_rate == 0.95
    assert outcome.uniprot_rate == 0.90
    assert any('PASSED' in line for line in outcome.messages)
def test_validator_fails_low_rate():
    """Validator rejects a report whose HGNC rate is under the minimum."""
    unmapped = [f'ENSG{index:011d}' for index in range(20)]
    report = MappingReport(
        total_genes=100,
        mapped_hgnc=80,
        mapped_uniprot=75,
        unmapped_ids=unmapped,
    )

    outcome = MappingValidator(min_success_rate=0.90).validate(report)

    assert outcome.passed is False
    assert outcome.hgnc_rate == 0.80
    assert any('FAILED' in line for line in outcome.messages)
def test_validator_warns_medium_rate():
    """Validator passes with a warning when the rate sits between both thresholds."""
    unmapped = [f'ENSG{index:011d}' for index in range(8)]
    report = MappingReport(
        total_genes=100,
        mapped_hgnc=92,
        mapped_uniprot=88,
        unmapped_ids=unmapped,
    )
    gate = MappingValidator(min_success_rate=0.90, warn_threshold=0.95)

    outcome = gate.validate(report)

    # Should pass but with warning
    assert outcome.passed is True
    assert outcome.hgnc_rate == 0.92
    assert any('WARNING' in line for line in outcome.messages)
def test_save_unmapped_report(tmp_path):
    """save_unmapped_report writes the header and every unmapped ID to disk."""
    unmapped = ['ENSG00000000001', 'ENSG00000000002', 'ENSG00000000003']
    report = MappingReport(
        total_genes=100,
        mapped_hgnc=95,
        mapped_uniprot=90,
        unmapped_ids=unmapped,
    )
    target = tmp_path / "unmapped_genes.txt"

    MappingValidator().save_unmapped_report(report, target)

    # Check file was created and contains expected content
    assert target.exists()
    written = target.read_text()
    expected_fragments = [
        '# Unmapped Gene IDs',
        '# Total unmapped: 3',
        'ENSG00000000001',
        'ENSG00000000002',
        'ENSG00000000003',
    ]
    for fragment in expected_fragments:
        assert fragment in written
# Test validate_gene_universe
def test_validate_gene_universe_valid():
    """Test gene universe validation with valid data."""
    genes = [f'ENSG{i:011d}' for i in range(20000)]  # 20k genes
    result = validate_gene_universe(genes)

    assert result.passed is True
    assert any('within expected range' in msg for msg in result.messages)
    # Match the full success line: the bare substring 'ENSG format' also
    # occurs in the failure text "... gene IDs not in ENSG format".
    assert any('All gene IDs are in ENSG format' in msg for msg in result.messages)
    assert any('No duplicate' in msg for msg in result.messages)
    # A clean universe must not produce any failure line.
    assert not any(msg.startswith('FAILED') for msg in result.messages)
def test_validate_gene_universe_invalid_count():
    """An oversized gene universe is rejected with an 'exceeds maximum' message."""
    # 50k genes (too many)
    oversized = [f'ENSG{index:011d}' for index in range(50000)]

    outcome = validate_gene_universe(oversized)

    assert outcome.passed is False
    assert any('exceeds maximum' in line for line in outcome.messages)
def test_validate_gene_universe_invalid_format():
    """IDs that do not start with ENSG make the universe validation fail."""
    well_formed = [f'ENSG{index:011d}' for index in range(19500)]
    # Add invalid IDs
    genes = well_formed + ['INVALID001', 'INVALID002']

    outcome = validate_gene_universe(genes)

    assert outcome.passed is False
    assert any('not in ENSG format' in line for line in outcome.messages)
def test_validate_gene_universe_duplicates():
    """Test gene universe validation fails with duplicates."""
    genes = [f'ENSG{i:011d}' for i in range(19500)]
    genes.extend(['ENSG00000000001', 'ENSG00000000002'])  # Repeat two existing IDs
    result = validate_gene_universe(genes)

    assert result.passed is False
    # Match the failure line itself: a bare 'duplicate' substring is also
    # present in the success message "No duplicate gene IDs found", so it
    # would hold even if duplicate detection were broken.
    assert any('Found 2 duplicate gene IDs' in msg for msg in result.messages)
    assert not any('No duplicate' in msg for msg in result.messages)
def test_validate_gene_universe_too_few():
    """An undersized gene universe is rejected with a 'below minimum' message."""
    # Only 1k genes
    undersized = [f'ENSG{index:011d}' for index in range(1000)]

    outcome = validate_gene_universe(undersized)

    assert outcome.passed is False
    assert any('below minimum' in line for line in outcome.messages)