feat(01-02): create mapping validation gates with tests
- Add MappingValidator with configurable success rate thresholds (min_success_rate, warn_threshold) - Add validate_gene_universe for gene count, format, and duplicate checks - Add save_unmapped_report for manual review output - Implement 15 comprehensive tests with mocked mygene responses (no real API calls) - Tests cover: successful mapping, notfound handling, uniprot list parsing, batching, validation gates, universe validation
This commit is contained in:
@@ -13,6 +13,11 @@ from usher_pipeline.gene_mapping.universe import (
|
|||||||
fetch_protein_coding_genes,
|
fetch_protein_coding_genes,
|
||||||
GeneUniverse,
|
GeneUniverse,
|
||||||
)
|
)
|
||||||
|
from usher_pipeline.gene_mapping.validator import (
|
||||||
|
MappingValidator,
|
||||||
|
ValidationResult,
|
||||||
|
validate_gene_universe,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"GeneMapper",
|
"GeneMapper",
|
||||||
@@ -20,4 +25,7 @@ __all__ = [
|
|||||||
"MappingReport",
|
"MappingReport",
|
||||||
"fetch_protein_coding_genes",
|
"fetch_protein_coding_genes",
|
||||||
"GeneUniverse",
|
"GeneUniverse",
|
||||||
|
"MappingValidator",
|
||||||
|
"ValidationResult",
|
||||||
|
"validate_gene_universe",
|
||||||
]
|
]
|
||||||
|
|||||||
224
src/usher_pipeline/gene_mapping/validator.py
Normal file
224
src/usher_pipeline/gene_mapping/validator.py
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
"""Validation gates for gene mapping quality control.
|
||||||
|
|
||||||
|
Provides validation for mapping results and gene universe data quality.
|
||||||
|
Enforces configurable success rate thresholds and produces actionable reports.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from usher_pipeline.gene_mapping.mapper import MappingReport
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ValidationResult:
|
||||||
|
"""Result of a validation check.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
passed: Whether validation passed
|
||||||
|
messages: List of validation messages (warnings, errors)
|
||||||
|
hgnc_rate: HGNC mapping success rate (0-1)
|
||||||
|
uniprot_rate: UniProt mapping success rate (0-1)
|
||||||
|
"""
|
||||||
|
passed: bool
|
||||||
|
messages: list[str] = field(default_factory=list)
|
||||||
|
hgnc_rate: float = 0.0
|
||||||
|
uniprot_rate: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class MappingValidator:
|
||||||
|
"""Validator for gene ID mapping results.
|
||||||
|
|
||||||
|
Enforces configurable success rate thresholds and produces validation reports.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
min_success_rate: float = 0.90,
|
||||||
|
warn_threshold: float = 0.95
|
||||||
|
):
|
||||||
|
"""Initialize mapping validator.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
min_success_rate: Minimum HGNC mapping success rate to pass (default: 0.90)
|
||||||
|
warn_threshold: Success rate below this triggers warning (default: 0.95)
|
||||||
|
"""
|
||||||
|
self.min_success_rate = min_success_rate
|
||||||
|
self.warn_threshold = warn_threshold
|
||||||
|
logger.info(
|
||||||
|
f"Initialized MappingValidator: min_rate={min_success_rate}, "
|
||||||
|
f"warn_threshold={warn_threshold}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def validate(self, report: MappingReport) -> ValidationResult:
|
||||||
|
"""Validate gene mapping results.
|
||||||
|
|
||||||
|
Checks if HGNC mapping success rate meets minimum threshold.
|
||||||
|
Issues warning if rate is below warn_threshold but above min_success_rate.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
report: MappingReport from batch mapping operation
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ValidationResult with pass/fail status and messages
|
||||||
|
"""
|
||||||
|
messages: list[str] = []
|
||||||
|
hgnc_rate = report.success_rate_hgnc
|
||||||
|
uniprot_rate = report.success_rate_uniprot
|
||||||
|
|
||||||
|
# Check HGNC success rate
|
||||||
|
if hgnc_rate < self.min_success_rate:
|
||||||
|
messages.append(
|
||||||
|
f"FAILED: HGNC mapping success rate {hgnc_rate:.1%} is below "
|
||||||
|
f"minimum threshold {self.min_success_rate:.1%}"
|
||||||
|
)
|
||||||
|
messages.append(
|
||||||
|
f"Mapped {report.mapped_hgnc}/{report.total_genes} genes to HGNC symbols"
|
||||||
|
)
|
||||||
|
messages.append(
|
||||||
|
f"Unmapped genes: {len(report.unmapped_ids)} "
|
||||||
|
f"(first 10: {report.unmapped_ids[:10]})"
|
||||||
|
)
|
||||||
|
passed = False
|
||||||
|
elif hgnc_rate < self.warn_threshold:
|
||||||
|
messages.append(
|
||||||
|
f"WARNING: HGNC mapping success rate {hgnc_rate:.1%} is below "
|
||||||
|
f"warning threshold {self.warn_threshold:.1%}"
|
||||||
|
)
|
||||||
|
messages.append(
|
||||||
|
f"Mapped {report.mapped_hgnc}/{report.total_genes} genes to HGNC symbols"
|
||||||
|
)
|
||||||
|
messages.append(
|
||||||
|
f"Consider reviewing {len(report.unmapped_ids)} unmapped genes"
|
||||||
|
)
|
||||||
|
passed = True
|
||||||
|
else:
|
||||||
|
messages.append(
|
||||||
|
f"PASSED: HGNC mapping success rate {hgnc_rate:.1%} "
|
||||||
|
f"({report.mapped_hgnc}/{report.total_genes} genes)"
|
||||||
|
)
|
||||||
|
passed = True
|
||||||
|
|
||||||
|
# Report UniProt stats (informational only, not used for pass/fail)
|
||||||
|
messages.append(
|
||||||
|
f"UniProt mapping: {uniprot_rate:.1%} "
|
||||||
|
f"({report.mapped_uniprot}/{report.total_genes} genes)"
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Validation result: {'PASSED' if passed else 'FAILED'} "
|
||||||
|
f"(HGNC: {hgnc_rate:.1%}, UniProt: {uniprot_rate:.1%})"
|
||||||
|
)
|
||||||
|
|
||||||
|
return ValidationResult(
|
||||||
|
passed=passed,
|
||||||
|
messages=messages,
|
||||||
|
hgnc_rate=hgnc_rate,
|
||||||
|
uniprot_rate=uniprot_rate,
|
||||||
|
)
|
||||||
|
|
||||||
|
def save_unmapped_report(
|
||||||
|
self,
|
||||||
|
report: MappingReport,
|
||||||
|
output_path: Path
|
||||||
|
) -> None:
|
||||||
|
"""Save list of unmapped genes to file for manual review.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
report: MappingReport containing unmapped gene IDs
|
||||||
|
output_path: Path to output file
|
||||||
|
"""
|
||||||
|
output_path = Path(output_path)
|
||||||
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
unmapped_count = len(report.unmapped_ids)
|
||||||
|
|
||||||
|
with output_path.open('w') as f:
|
||||||
|
f.write(f"# Unmapped Gene IDs\n")
|
||||||
|
f.write(f"# Generated: {timestamp}\n")
|
||||||
|
f.write(f"# Total unmapped: {unmapped_count}\n")
|
||||||
|
f.write(f"# Success rate: {report.success_rate_hgnc:.1%}\n")
|
||||||
|
f.write(f"#\n")
|
||||||
|
for gene_id in report.unmapped_ids:
|
||||||
|
f.write(f"{gene_id}\n")
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Saved {unmapped_count} unmapped gene IDs to {output_path}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_gene_universe(genes: list[str]) -> ValidationResult:
|
||||||
|
"""Validate gene universe data quality.
|
||||||
|
|
||||||
|
Checks:
|
||||||
|
- Gene count is in expected range (19,000-22,000)
|
||||||
|
- All gene IDs start with ENSG (Ensembl format)
|
||||||
|
- No duplicate gene IDs
|
||||||
|
|
||||||
|
Args:
|
||||||
|
genes: List of gene IDs to validate
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ValidationResult with validation status and messages
|
||||||
|
"""
|
||||||
|
messages: list[str] = []
|
||||||
|
passed = True
|
||||||
|
|
||||||
|
gene_count = len(genes)
|
||||||
|
MIN_GENES = 19000
|
||||||
|
MAX_GENES = 22000
|
||||||
|
|
||||||
|
# Check gene count
|
||||||
|
if gene_count < MIN_GENES:
|
||||||
|
messages.append(
|
||||||
|
f"FAILED: Gene count {gene_count} is below minimum {MIN_GENES}. "
|
||||||
|
"This may indicate missing data or incomplete query."
|
||||||
|
)
|
||||||
|
passed = False
|
||||||
|
elif gene_count > MAX_GENES:
|
||||||
|
messages.append(
|
||||||
|
f"FAILED: Gene count {gene_count} exceeds maximum {MAX_GENES}. "
|
||||||
|
"This may indicate pseudogene contamination or inclusion of non-coding genes."
|
||||||
|
)
|
||||||
|
passed = False
|
||||||
|
else:
|
||||||
|
messages.append(
|
||||||
|
f"Gene count {gene_count} is within expected range ({MIN_GENES}-{MAX_GENES})"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check all IDs start with ENSG
|
||||||
|
non_ensg = [g for g in genes if not g.startswith('ENSG')]
|
||||||
|
if non_ensg:
|
||||||
|
messages.append(
|
||||||
|
f"FAILED: Found {len(non_ensg)} gene IDs not in ENSG format "
|
||||||
|
f"(examples: {non_ensg[:5]})"
|
||||||
|
)
|
||||||
|
passed = False
|
||||||
|
else:
|
||||||
|
messages.append("All gene IDs are in ENSG format")
|
||||||
|
|
||||||
|
# Check for duplicates
|
||||||
|
unique_genes = set(genes)
|
||||||
|
if len(unique_genes) < gene_count:
|
||||||
|
duplicates = gene_count - len(unique_genes)
|
||||||
|
messages.append(
|
||||||
|
f"FAILED: Found {duplicates} duplicate gene IDs"
|
||||||
|
)
|
||||||
|
passed = False
|
||||||
|
else:
|
||||||
|
messages.append("No duplicate gene IDs found")
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Gene universe validation: {'PASSED' if passed else 'FAILED'} "
|
||||||
|
f"({gene_count} genes)"
|
||||||
|
)
|
||||||
|
|
||||||
|
return ValidationResult(
|
||||||
|
passed=passed,
|
||||||
|
messages=messages,
|
||||||
|
)
|
||||||
328
tests/test_gene_mapping.py
Normal file
328
tests/test_gene_mapping.py
Normal file
@@ -0,0 +1,328 @@
|
|||||||
|
"""Tests for gene ID mapping module.
|
||||||
|
|
||||||
|
Tests gene universe retrieval, batch mapping, and validation gates.
|
||||||
|
Uses mocked mygene responses to avoid real API calls.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from usher_pipeline.gene_mapping import (
|
||||||
|
GeneMapper,
|
||||||
|
MappingReport,
|
||||||
|
MappingResult,
|
||||||
|
MappingValidator,
|
||||||
|
ValidationResult,
|
||||||
|
validate_gene_universe,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Mock mygene response fixtures
|
||||||
|
|
||||||
|
MOCK_SUCCESSFUL_RESPONSE = {
|
||||||
|
'out': [
|
||||||
|
{
|
||||||
|
'query': 'ENSG00000139618',
|
||||||
|
'symbol': 'BRCA2',
|
||||||
|
'uniprot': {'Swiss-Prot': 'P51587'}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'query': 'ENSG00000141510',
|
||||||
|
'symbol': 'TP53',
|
||||||
|
'uniprot': {'Swiss-Prot': 'P04637'}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'query': 'ENSG00000012048',
|
||||||
|
'symbol': 'BRCA1',
|
||||||
|
'uniprot': {'Swiss-Prot': 'P38398'}
|
||||||
|
},
|
||||||
|
],
|
||||||
|
'missing': []
|
||||||
|
}
|
||||||
|
|
||||||
|
MOCK_RESPONSE_WITH_NOTFOUND = {
|
||||||
|
'out': [
|
||||||
|
{
|
||||||
|
'query': 'ENSG00000139618',
|
||||||
|
'symbol': 'BRCA2',
|
||||||
|
'uniprot': {'Swiss-Prot': 'P51587'}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'query': 'ENSG00000141510',
|
||||||
|
'symbol': 'TP53',
|
||||||
|
'uniprot': {'Swiss-Prot': 'P04637'}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'query': 'ENSG00000000000',
|
||||||
|
'notfound': True,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
'missing': ['ENSG00000000000']
|
||||||
|
}
|
||||||
|
|
||||||
|
MOCK_RESPONSE_WITH_UNIPROT_LIST = {
|
||||||
|
'out': [
|
||||||
|
{
|
||||||
|
'query': 'ENSG00000139618',
|
||||||
|
'symbol': 'BRCA2',
|
||||||
|
'uniprot': {'Swiss-Prot': ['P51587', 'Q9UBX7']} # List of accessions
|
||||||
|
},
|
||||||
|
],
|
||||||
|
'missing': []
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Test MappingResult creation
|
||||||
|
|
||||||
|
def test_mapping_result_creation():
|
||||||
|
"""Test creating MappingResult with all fields."""
|
||||||
|
result = MappingResult(
|
||||||
|
ensembl_id='ENSG00000139618',
|
||||||
|
hgnc_symbol='BRCA2',
|
||||||
|
uniprot_accession='P51587',
|
||||||
|
mapping_source='mygene'
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.ensembl_id == 'ENSG00000139618'
|
||||||
|
assert result.hgnc_symbol == 'BRCA2'
|
||||||
|
assert result.uniprot_accession == 'P51587'
|
||||||
|
assert result.mapping_source == 'mygene'
|
||||||
|
|
||||||
|
|
||||||
|
def test_mapping_result_with_none_values():
|
||||||
|
"""Test MappingResult handles missing data."""
|
||||||
|
result = MappingResult(
|
||||||
|
ensembl_id='ENSG00000000000',
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.ensembl_id == 'ENSG00000000000'
|
||||||
|
assert result.hgnc_symbol is None
|
||||||
|
assert result.uniprot_accession is None
|
||||||
|
assert result.mapping_source == 'mygene'
|
||||||
|
|
||||||
|
|
||||||
|
# Test GeneMapper with mocked mygene
|
||||||
|
|
||||||
|
def test_mapper_handles_successful_mapping():
|
||||||
|
"""Test mapper with all genes successfully mapped."""
|
||||||
|
with patch('mygene.MyGeneInfo') as mock_mygene:
|
||||||
|
mock_mg = MagicMock()
|
||||||
|
mock_mg.querymany.return_value = MOCK_SUCCESSFUL_RESPONSE
|
||||||
|
mock_mygene.return_value = mock_mg
|
||||||
|
|
||||||
|
mapper = GeneMapper(batch_size=1000)
|
||||||
|
results, report = mapper.map_ensembl_ids([
|
||||||
|
'ENSG00000139618',
|
||||||
|
'ENSG00000141510',
|
||||||
|
'ENSG00000012048',
|
||||||
|
])
|
||||||
|
|
||||||
|
# Check results
|
||||||
|
assert len(results) == 3
|
||||||
|
assert results[0].ensembl_id == 'ENSG00000139618'
|
||||||
|
assert results[0].hgnc_symbol == 'BRCA2'
|
||||||
|
assert results[0].uniprot_accession == 'P51587'
|
||||||
|
|
||||||
|
# Check report
|
||||||
|
assert report.total_genes == 3
|
||||||
|
assert report.mapped_hgnc == 3
|
||||||
|
assert report.mapped_uniprot == 3
|
||||||
|
assert report.success_rate_hgnc == 1.0
|
||||||
|
assert report.success_rate_uniprot == 1.0
|
||||||
|
assert len(report.unmapped_ids) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_mapper_handles_unmapped_genes():
|
||||||
|
"""Test mapper with one gene not found."""
|
||||||
|
with patch('mygene.MyGeneInfo') as mock_mygene:
|
||||||
|
mock_mg = MagicMock()
|
||||||
|
mock_mg.querymany.return_value = MOCK_RESPONSE_WITH_NOTFOUND
|
||||||
|
mock_mygene.return_value = mock_mg
|
||||||
|
|
||||||
|
mapper = GeneMapper()
|
||||||
|
results, report = mapper.map_ensembl_ids([
|
||||||
|
'ENSG00000139618',
|
||||||
|
'ENSG00000141510',
|
||||||
|
'ENSG00000000000',
|
||||||
|
])
|
||||||
|
|
||||||
|
# Check results
|
||||||
|
assert len(results) == 3
|
||||||
|
assert results[2].ensembl_id == 'ENSG00000000000'
|
||||||
|
assert results[2].hgnc_symbol is None
|
||||||
|
assert results[2].uniprot_accession is None
|
||||||
|
|
||||||
|
# Check report
|
||||||
|
assert report.total_genes == 3
|
||||||
|
assert report.mapped_hgnc == 2
|
||||||
|
assert report.mapped_uniprot == 2
|
||||||
|
assert abs(report.success_rate_hgnc - 0.667) < 0.01
|
||||||
|
assert abs(report.success_rate_uniprot - 0.667) < 0.01
|
||||||
|
assert 'ENSG00000000000' in report.unmapped_ids
|
||||||
|
assert len(report.unmapped_ids) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_mapper_handles_uniprot_list():
|
||||||
|
"""Test mapper handles UniProt Swiss-Prot as list (takes first)."""
|
||||||
|
with patch('mygene.MyGeneInfo') as mock_mygene:
|
||||||
|
mock_mg = MagicMock()
|
||||||
|
mock_mg.querymany.return_value = MOCK_RESPONSE_WITH_UNIPROT_LIST
|
||||||
|
mock_mygene.return_value = mock_mg
|
||||||
|
|
||||||
|
mapper = GeneMapper()
|
||||||
|
results, report = mapper.map_ensembl_ids(['ENSG00000139618'])
|
||||||
|
|
||||||
|
# Should take first UniProt accession from list
|
||||||
|
assert results[0].uniprot_accession == 'P51587'
|
||||||
|
assert report.mapped_uniprot == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_mapper_batching():
|
||||||
|
"""Test mapper processes genes in batches."""
|
||||||
|
with patch('mygene.MyGeneInfo') as mock_mygene:
|
||||||
|
mock_mg = MagicMock()
|
||||||
|
# Return empty response for each batch
|
||||||
|
mock_mg.querymany.return_value = {'out': [], 'missing': []}
|
||||||
|
mock_mygene.return_value = mock_mg
|
||||||
|
|
||||||
|
mapper = GeneMapper(batch_size=2)
|
||||||
|
# 5 genes should result in 3 batches (2+2+1)
|
||||||
|
gene_ids = [f'ENSG{i:011d}' for i in range(5)]
|
||||||
|
results, report = mapper.map_ensembl_ids(gene_ids)
|
||||||
|
|
||||||
|
# Check querymany was called 3 times (3 batches)
|
||||||
|
assert mock_mg.querymany.call_count == 3
|
||||||
|
|
||||||
|
|
||||||
|
# Test MappingValidator
|
||||||
|
|
||||||
|
def test_validator_passes_high_rate():
|
||||||
|
"""Test validator passes with success rate above minimum."""
|
||||||
|
report = MappingReport(
|
||||||
|
total_genes=100,
|
||||||
|
mapped_hgnc=95,
|
||||||
|
mapped_uniprot=90,
|
||||||
|
unmapped_ids=[f'ENSG{i:011d}' for i in range(5)],
|
||||||
|
)
|
||||||
|
|
||||||
|
validator = MappingValidator(min_success_rate=0.90)
|
||||||
|
result = validator.validate(report)
|
||||||
|
|
||||||
|
assert result.passed is True
|
||||||
|
assert result.hgnc_rate == 0.95
|
||||||
|
assert result.uniprot_rate == 0.90
|
||||||
|
assert any('PASSED' in msg for msg in result.messages)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validator_fails_low_rate():
|
||||||
|
"""Test validator fails with success rate below minimum."""
|
||||||
|
report = MappingReport(
|
||||||
|
total_genes=100,
|
||||||
|
mapped_hgnc=80,
|
||||||
|
mapped_uniprot=75,
|
||||||
|
unmapped_ids=[f'ENSG{i:011d}' for i in range(20)],
|
||||||
|
)
|
||||||
|
|
||||||
|
validator = MappingValidator(min_success_rate=0.90)
|
||||||
|
result = validator.validate(report)
|
||||||
|
|
||||||
|
assert result.passed is False
|
||||||
|
assert result.hgnc_rate == 0.80
|
||||||
|
assert any('FAILED' in msg for msg in result.messages)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validator_warns_medium_rate():
|
||||||
|
"""Test validator passes with warning for medium success rate."""
|
||||||
|
report = MappingReport(
|
||||||
|
total_genes=100,
|
||||||
|
mapped_hgnc=92,
|
||||||
|
mapped_uniprot=88,
|
||||||
|
unmapped_ids=[f'ENSG{i:011d}' for i in range(8)],
|
||||||
|
)
|
||||||
|
|
||||||
|
validator = MappingValidator(min_success_rate=0.90, warn_threshold=0.95)
|
||||||
|
result = validator.validate(report)
|
||||||
|
|
||||||
|
# Should pass but with warning
|
||||||
|
assert result.passed is True
|
||||||
|
assert result.hgnc_rate == 0.92
|
||||||
|
assert any('WARNING' in msg for msg in result.messages)
|
||||||
|
|
||||||
|
|
||||||
|
def test_save_unmapped_report(tmp_path):
|
||||||
|
"""Test saving unmapped gene IDs to file."""
|
||||||
|
report = MappingReport(
|
||||||
|
total_genes=100,
|
||||||
|
mapped_hgnc=95,
|
||||||
|
mapped_uniprot=90,
|
||||||
|
unmapped_ids=['ENSG00000000001', 'ENSG00000000002', 'ENSG00000000003'],
|
||||||
|
)
|
||||||
|
|
||||||
|
validator = MappingValidator()
|
||||||
|
output_path = tmp_path / "unmapped_genes.txt"
|
||||||
|
validator.save_unmapped_report(report, output_path)
|
||||||
|
|
||||||
|
# Check file was created and contains expected content
|
||||||
|
assert output_path.exists()
|
||||||
|
content = output_path.read_text()
|
||||||
|
assert '# Unmapped Gene IDs' in content
|
||||||
|
assert '# Total unmapped: 3' in content
|
||||||
|
assert 'ENSG00000000001' in content
|
||||||
|
assert 'ENSG00000000002' in content
|
||||||
|
assert 'ENSG00000000003' in content
|
||||||
|
|
||||||
|
|
||||||
|
# Test validate_gene_universe
|
||||||
|
|
||||||
|
def test_validate_gene_universe_valid():
|
||||||
|
"""Test gene universe validation with valid data."""
|
||||||
|
genes = [f'ENSG{i:011d}' for i in range(20000)] # 20k genes
|
||||||
|
result = validate_gene_universe(genes)
|
||||||
|
|
||||||
|
assert result.passed is True
|
||||||
|
assert any('within expected range' in msg for msg in result.messages)
|
||||||
|
assert any('ENSG format' in msg for msg in result.messages)
|
||||||
|
assert any('No duplicate' in msg for msg in result.messages)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_gene_universe_invalid_count():
|
||||||
|
"""Test gene universe validation fails with too many genes."""
|
||||||
|
genes = [f'ENSG{i:011d}' for i in range(50000)] # 50k genes (too many)
|
||||||
|
result = validate_gene_universe(genes)
|
||||||
|
|
||||||
|
assert result.passed is False
|
||||||
|
assert any('exceeds maximum' in msg for msg in result.messages)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_gene_universe_invalid_format():
|
||||||
|
"""Test gene universe validation fails with non-ENSG IDs."""
|
||||||
|
genes = [f'ENSG{i:011d}' for i in range(19500)]
|
||||||
|
genes.extend(['INVALID001', 'INVALID002']) # Add invalid IDs
|
||||||
|
|
||||||
|
result = validate_gene_universe(genes)
|
||||||
|
|
||||||
|
assert result.passed is False
|
||||||
|
assert any('not in ENSG format' in msg for msg in result.messages)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_gene_universe_duplicates():
|
||||||
|
"""Test gene universe validation fails with duplicates."""
|
||||||
|
genes = [f'ENSG{i:011d}' for i in range(19500)]
|
||||||
|
genes.extend(['ENSG00000000001', 'ENSG00000000002']) # Add duplicates
|
||||||
|
|
||||||
|
result = validate_gene_universe(genes)
|
||||||
|
|
||||||
|
assert result.passed is False
|
||||||
|
assert any('duplicate' in msg for msg in result.messages)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_gene_universe_too_few():
|
||||||
|
"""Test gene universe validation fails with too few genes."""
|
||||||
|
genes = [f'ENSG{i:011d}' for i in range(1000)] # Only 1k genes
|
||||||
|
|
||||||
|
result = validate_gene_universe(genes)
|
||||||
|
|
||||||
|
assert result.passed is False
|
||||||
|
assert any('below minimum' in msg for msg in result.messages)
|
||||||
Reference in New Issue
Block a user