diff --git a/src/usher_pipeline/gene_mapping/__init__.py b/src/usher_pipeline/gene_mapping/__init__.py index 3fa5a13..d9960d7 100644 --- a/src/usher_pipeline/gene_mapping/__init__.py +++ b/src/usher_pipeline/gene_mapping/__init__.py @@ -13,6 +13,11 @@ from usher_pipeline.gene_mapping.universe import ( fetch_protein_coding_genes, GeneUniverse, ) +from usher_pipeline.gene_mapping.validator import ( + MappingValidator, + ValidationResult, + validate_gene_universe, +) __all__ = [ "GeneMapper", @@ -20,4 +25,7 @@ __all__ = [ "MappingReport", "fetch_protein_coding_genes", "GeneUniverse", + "MappingValidator", + "ValidationResult", + "validate_gene_universe", ] diff --git a/src/usher_pipeline/gene_mapping/validator.py b/src/usher_pipeline/gene_mapping/validator.py new file mode 100644 index 0000000..644f3e2 --- /dev/null +++ b/src/usher_pipeline/gene_mapping/validator.py @@ -0,0 +1,224 @@ +"""Validation gates for gene mapping quality control. + +Provides validation for mapping results and gene universe data quality. +Enforces configurable success rate thresholds and produces actionable reports. +""" + +import logging +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path + +from usher_pipeline.gene_mapping.mapper import MappingReport + +logger = logging.getLogger(__name__) + + +@dataclass +class ValidationResult: + """Result of a validation check. + + Attributes: + passed: Whether validation passed + messages: List of validation messages (warnings, errors) + hgnc_rate: HGNC mapping success rate (0-1) + uniprot_rate: UniProt mapping success rate (0-1) + """ + passed: bool + messages: list[str] = field(default_factory=list) + hgnc_rate: float = 0.0 + uniprot_rate: float = 0.0 + + +class MappingValidator: + """Validator for gene ID mapping results. + + Enforces configurable success rate thresholds and produces validation reports. + """ + + def __init__( + self, + min_success_rate: float = 0.90, + warn_threshold: float = 0.95 + ): + """Initialize mapping validator. + + Args: + min_success_rate: Minimum HGNC mapping success rate to pass (default: 0.90) + warn_threshold: Success rate below this triggers warning (default: 0.95) + """ + self.min_success_rate = min_success_rate + self.warn_threshold = warn_threshold + logger.info( + f"Initialized MappingValidator: min_rate={min_success_rate}, " + f"warn_threshold={warn_threshold}" + ) + + def validate(self, report: MappingReport) -> ValidationResult: + """Validate gene mapping results. + + Checks if HGNC mapping success rate meets minimum threshold. + Issues warning if rate is below warn_threshold but above min_success_rate. + + Args: + report: MappingReport from batch mapping operation + + Returns: + ValidationResult with pass/fail status and messages + """ + messages: list[str] = [] + hgnc_rate = report.success_rate_hgnc + uniprot_rate = report.success_rate_uniprot + + # Check HGNC success rate + if hgnc_rate < self.min_success_rate: + messages.append( + f"FAILED: HGNC mapping success rate {hgnc_rate:.1%} is below " + f"minimum threshold {self.min_success_rate:.1%}" + ) + messages.append( + f"Mapped {report.mapped_hgnc}/{report.total_genes} genes to HGNC symbols" + ) + messages.append( + f"Unmapped genes: {len(report.unmapped_ids)} " + f"(first 10: {report.unmapped_ids[:10]})" + ) + passed = False + elif hgnc_rate < self.warn_threshold: + messages.append( + f"WARNING: HGNC mapping success rate {hgnc_rate:.1%} is below " + f"warning threshold {self.warn_threshold:.1%}" + ) + messages.append( + f"Mapped {report.mapped_hgnc}/{report.total_genes} genes to HGNC symbols" + ) + messages.append( + f"Consider reviewing {len(report.unmapped_ids)} unmapped genes" + ) + passed = True + else: + messages.append( + f"PASSED: HGNC mapping success rate {hgnc_rate:.1%} " + f"({report.mapped_hgnc}/{report.total_genes} genes)" + ) + passed = True + + # Report UniProt stats (informational only, not used for pass/fail) + messages.append( + f"UniProt mapping: {uniprot_rate:.1%} " + f"({report.mapped_uniprot}/{report.total_genes} genes)" + ) + + logger.info( + f"Validation result: {'PASSED' if passed else 'FAILED'} " + f"(HGNC: {hgnc_rate:.1%}, UniProt: {uniprot_rate:.1%})" + ) + + return ValidationResult( + passed=passed, + messages=messages, + hgnc_rate=hgnc_rate, + uniprot_rate=uniprot_rate, + ) + + def save_unmapped_report( + self, + report: MappingReport, + output_path: Path + ) -> None: + """Save list of unmapped genes to file for manual review. + + Args: + report: MappingReport containing unmapped gene IDs + output_path: Path to output file + """ + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + unmapped_count = len(report.unmapped_ids) + + with output_path.open('w') as f: + f.write(f"# Unmapped Gene IDs\n") + f.write(f"# Generated: {timestamp}\n") + f.write(f"# Total unmapped: {unmapped_count}\n") + f.write(f"# Success rate: {report.success_rate_hgnc:.1%}\n") + f.write(f"#\n") + for gene_id in report.unmapped_ids: + f.write(f"{gene_id}\n") + + logger.info( + f"Saved {unmapped_count} unmapped gene IDs to {output_path}" + ) + + +def validate_gene_universe(genes: list[str]) -> ValidationResult: + """Validate gene universe data quality. + + Checks: + - Gene count is in expected range (19,000-22,000) + - All gene IDs start with ENSG (Ensembl format) + - No duplicate gene IDs + + Args: + genes: List of gene IDs to validate + + Returns: + ValidationResult with validation status and messages + """ + messages: list[str] = [] + passed = True + + gene_count = len(genes) + MIN_GENES = 19000 + MAX_GENES = 22000 + + # Check gene count + if gene_count < MIN_GENES: + messages.append( + f"FAILED: Gene count {gene_count} is below minimum {MIN_GENES}. " + "This may indicate missing data or incomplete query." + ) + passed = False + elif gene_count > MAX_GENES: + messages.append( + f"FAILED: Gene count {gene_count} exceeds maximum {MAX_GENES}. " + "This may indicate pseudogene contamination or inclusion of non-coding genes." + ) + passed = False + else: + messages.append( + f"Gene count {gene_count} is within expected range ({MIN_GENES}-{MAX_GENES})" + ) + + # Check all IDs start with ENSG + non_ensg = [g for g in genes if not g.startswith('ENSG')] + if non_ensg: + messages.append( + f"FAILED: Found {len(non_ensg)} gene IDs not in ENSG format " + f"(examples: {non_ensg[:5]})" + ) + passed = False + else: + messages.append("All gene IDs are in ENSG format") + + # Check for duplicates + unique_genes = set(genes) + if len(unique_genes) < gene_count: + duplicates = gene_count - len(unique_genes) + messages.append( + f"FAILED: Found {duplicates} duplicate gene IDs" + ) + passed = False + else: + messages.append("No duplicate gene IDs found") + + logger.info( + f"Gene universe validation: {'PASSED' if passed else 'FAILED'} " + f"({gene_count} genes)" + ) + + return ValidationResult( + passed=passed, + messages=messages, + ) diff --git a/tests/test_gene_mapping.py b/tests/test_gene_mapping.py new file mode 100644 index 0000000..ab7e63b --- /dev/null +++ b/tests/test_gene_mapping.py @@ -0,0 +1,328 @@ +"""Tests for gene ID mapping module. + +Tests gene universe retrieval, batch mapping, and validation gates. +Uses mocked mygene responses to avoid real API calls. +""" + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from usher_pipeline.gene_mapping import ( + GeneMapper, + MappingReport, + MappingResult, + MappingValidator, + ValidationResult, + validate_gene_universe, +) + + +# Mock mygene response fixtures + +MOCK_SUCCESSFUL_RESPONSE = { + 'out': [ + { + 'query': 'ENSG00000139618', + 'symbol': 'BRCA2', + 'uniprot': {'Swiss-Prot': 'P51587'} + }, + { + 'query': 'ENSG00000141510', + 'symbol': 'TP53', + 'uniprot': {'Swiss-Prot': 'P04637'} + }, + { + 'query': 'ENSG00000012048', + 'symbol': 'BRCA1', + 'uniprot': {'Swiss-Prot': 'P38398'} + }, + ], + 'missing': [] +} + +MOCK_RESPONSE_WITH_NOTFOUND = { + 'out': [ + { + 'query': 'ENSG00000139618', + 'symbol': 'BRCA2', + 'uniprot': {'Swiss-Prot': 'P51587'} + }, + { + 'query': 'ENSG00000141510', + 'symbol': 'TP53', + 'uniprot': {'Swiss-Prot': 'P04637'} + }, + { + 'query': 'ENSG00000000000', + 'notfound': True, + }, + ], + 'missing': ['ENSG00000000000'] +} + +MOCK_RESPONSE_WITH_UNIPROT_LIST = { + 'out': [ + { + 'query': 'ENSG00000139618', + 'symbol': 'BRCA2', + 'uniprot': {'Swiss-Prot': ['P51587', 'Q9UBX7']} # List of accessions + }, + ], + 'missing': [] +} + + +# Test MappingResult creation + +def test_mapping_result_creation(): + """Test creating MappingResult with all fields.""" + result = MappingResult( + ensembl_id='ENSG00000139618', + hgnc_symbol='BRCA2', + uniprot_accession='P51587', + mapping_source='mygene' + ) + + assert result.ensembl_id == 'ENSG00000139618' + assert result.hgnc_symbol == 'BRCA2' + assert result.uniprot_accession == 'P51587' + assert result.mapping_source == 'mygene' + + +def test_mapping_result_with_none_values(): + """Test MappingResult handles missing data.""" + result = MappingResult( + ensembl_id='ENSG00000000000', + ) + + assert result.ensembl_id == 'ENSG00000000000' + assert result.hgnc_symbol is None + assert result.uniprot_accession is None + assert result.mapping_source == 'mygene' + + +# Test GeneMapper with mocked mygene + +def test_mapper_handles_successful_mapping(): + """Test mapper with all genes successfully mapped.""" + with patch('mygene.MyGeneInfo') as mock_mygene: + mock_mg = MagicMock() + mock_mg.querymany.return_value = MOCK_SUCCESSFUL_RESPONSE + mock_mygene.return_value = mock_mg + + mapper = GeneMapper(batch_size=1000) + results, report = mapper.map_ensembl_ids([ + 'ENSG00000139618', + 'ENSG00000141510', + 'ENSG00000012048', + ]) + + # Check results + assert len(results) == 3 + assert results[0].ensembl_id == 'ENSG00000139618' + assert results[0].hgnc_symbol == 'BRCA2' + assert results[0].uniprot_accession == 'P51587' + + # Check report + assert report.total_genes == 3 + assert report.mapped_hgnc == 3 + assert report.mapped_uniprot == 3 + assert report.success_rate_hgnc == 1.0 + assert report.success_rate_uniprot == 1.0 + assert len(report.unmapped_ids) == 0 + + +def test_mapper_handles_unmapped_genes(): + """Test mapper with one gene not found.""" + with patch('mygene.MyGeneInfo') as mock_mygene: + mock_mg = MagicMock() + mock_mg.querymany.return_value = MOCK_RESPONSE_WITH_NOTFOUND + mock_mygene.return_value = mock_mg + + mapper = GeneMapper() + results, report = mapper.map_ensembl_ids([ + 'ENSG00000139618', + 'ENSG00000141510', + 'ENSG00000000000', + ]) + + # Check results + assert len(results) == 3 + assert results[2].ensembl_id == 'ENSG00000000000' + assert results[2].hgnc_symbol is None + assert results[2].uniprot_accession is None + + # Check report + assert report.total_genes == 3 + assert report.mapped_hgnc == 2 + assert report.mapped_uniprot == 2 + assert abs(report.success_rate_hgnc - 0.667) < 0.01 + assert abs(report.success_rate_uniprot - 0.667) < 0.01 + assert 'ENSG00000000000' in report.unmapped_ids + assert len(report.unmapped_ids) == 1 + + +def test_mapper_handles_uniprot_list(): + """Test mapper handles UniProt Swiss-Prot as list (takes first).""" + with patch('mygene.MyGeneInfo') as mock_mygene: + mock_mg = MagicMock() + mock_mg.querymany.return_value = MOCK_RESPONSE_WITH_UNIPROT_LIST + mock_mygene.return_value = mock_mg + + mapper = GeneMapper() + results, report = mapper.map_ensembl_ids(['ENSG00000139618']) + + # Should take first UniProt accession from list + assert results[0].uniprot_accession == 'P51587' + assert report.mapped_uniprot == 1 + + +def test_mapper_batching(): + """Test mapper processes genes in batches.""" + with patch('mygene.MyGeneInfo') as mock_mygene: + mock_mg = MagicMock() + # Return empty response for each batch + mock_mg.querymany.return_value = {'out': [], 'missing': []} + mock_mygene.return_value = mock_mg + + mapper = GeneMapper(batch_size=2) + # 5 genes should result in 3 batches (2+2+1) + gene_ids = [f'ENSG{i:011d}' for i in range(5)] + results, report = mapper.map_ensembl_ids(gene_ids) + + # Check querymany was called 3 times (3 batches) + assert mock_mg.querymany.call_count == 3 + + +# Test MappingValidator + +def test_validator_passes_high_rate(): + """Test validator passes with success rate above minimum.""" + report = MappingReport( + total_genes=100, + mapped_hgnc=95, + mapped_uniprot=90, + unmapped_ids=[f'ENSG{i:011d}' for i in range(5)], + ) + + validator = MappingValidator(min_success_rate=0.90) + result = validator.validate(report) + + assert result.passed is True + assert result.hgnc_rate == 0.95 + assert result.uniprot_rate == 0.90 + assert any('PASSED' in msg for msg in result.messages) + + +def test_validator_fails_low_rate(): + """Test validator fails with success rate below minimum.""" + report = MappingReport( + total_genes=100, + mapped_hgnc=80, + mapped_uniprot=75, + unmapped_ids=[f'ENSG{i:011d}' for i in range(20)], + ) + + validator = MappingValidator(min_success_rate=0.90) + result = validator.validate(report) + + assert result.passed is False + assert result.hgnc_rate == 0.80 + assert any('FAILED' in msg for msg in result.messages) + + +def test_validator_warns_medium_rate(): + """Test validator passes with warning for medium success rate.""" + report = MappingReport( + total_genes=100, + mapped_hgnc=92, + mapped_uniprot=88, + unmapped_ids=[f'ENSG{i:011d}' for i in range(8)], + ) + + validator = MappingValidator(min_success_rate=0.90, warn_threshold=0.95) + result = validator.validate(report) + + # Should pass but with warning + assert result.passed is True + assert result.hgnc_rate == 0.92 + assert any('WARNING' in msg for msg in result.messages) + + +def test_save_unmapped_report(tmp_path): + """Test saving unmapped gene IDs to file.""" + report = MappingReport( + total_genes=100, + mapped_hgnc=95, + mapped_uniprot=90, + unmapped_ids=['ENSG00000000001', 'ENSG00000000002', 'ENSG00000000003'], + ) + + validator = MappingValidator() + output_path = tmp_path / "unmapped_genes.txt" + validator.save_unmapped_report(report, output_path) + + # Check file was created and contains expected content + assert output_path.exists() + content = output_path.read_text() + assert '# Unmapped Gene IDs' in content + assert '# Total unmapped: 3' in content + assert 'ENSG00000000001' in content + assert 'ENSG00000000002' in content + assert 'ENSG00000000003' in content + + +# Test validate_gene_universe + +def test_validate_gene_universe_valid(): + """Test gene universe validation with valid data.""" + genes = [f'ENSG{i:011d}' for i in range(20000)] # 20k genes + result = validate_gene_universe(genes) + + assert result.passed is True + assert any('within expected range' in msg for msg in result.messages) + assert any('ENSG format' in msg for msg in result.messages) + assert any('No duplicate' in msg for msg in result.messages) + + +def test_validate_gene_universe_invalid_count(): + """Test gene universe validation fails with too many genes.""" + genes = [f'ENSG{i:011d}' for i in range(50000)] # 50k genes (too many) + result = validate_gene_universe(genes) + + assert result.passed is False + assert any('exceeds maximum' in msg for msg in result.messages) + + +def test_validate_gene_universe_invalid_format(): + """Test gene universe validation fails with non-ENSG IDs.""" + genes = [f'ENSG{i:011d}' for i in range(19500)] + genes.extend(['INVALID001', 'INVALID002']) # Add invalid IDs + + result = validate_gene_universe(genes) + + assert result.passed is False + assert any('not in ENSG format' in msg for msg in result.messages) + + +def test_validate_gene_universe_duplicates(): + """Test gene universe validation fails with duplicates.""" + genes = [f'ENSG{i:011d}' for i in range(19500)] + genes.extend(['ENSG00000000001', 'ENSG00000000002']) # Add duplicates + + result = validate_gene_universe(genes) + + assert result.passed is False + assert any('duplicate' in msg for msg in result.messages) + + +def test_validate_gene_universe_too_few(): + """Test gene universe validation fails with too few genes.""" + genes = [f'ENSG{i:011d}' for i in range(1000)] # Only 1k genes + + result = validate_gene_universe(genes) + + assert result.passed is False + assert any('below minimum' in msg for msg in result.messages)