feat(03-06): implement literature evidence models, PubMed fetch, and scoring

- Create LiteratureRecord pydantic model with context-specific counts
- Implement PubMed query via Biopython Entrez with rate limiting (3 requests/sec by default, 10 requests/sec with an API key)
- Define SEARCH_CONTEXTS for cilia, sensory, cytoskeleton, cell_polarity queries
- Implement evidence tier classification: direct_experimental > functional_mention > hts_hit > incidental > none
- Implement quality-weighted scoring with bias mitigation via log2(total_pubmed_count) normalization
- Add biopython>=1.84 dependency and an optional `expression` extra (cellxgene-census>=1.19) to pyproject.toml
- Support checkpoint-restart for long-running PubMed queries (estimated 3-11 hours for 20K genes)
This commit is contained in:
2026-02-11 19:00:20 +08:00
parent 6645c59b0b
commit 8aa66987f8
11 changed files with 1806 additions and 0 deletions

View File

@@ -35,6 +35,7 @@ dependencies = [
"pyyaml>=6.0",
"httpx>=0.28",
"structlog>=25.0",
"biopython>=1.84",
]
[project.optional-dependencies]
@@ -42,6 +43,9 @@ dev = [
"pytest>=7.4.0",
"pytest-cov>=4.1.0",
]
expression = [
"cellxgene-census>=1.19",
]
[project.scripts]
usher-pipeline = "usher_pipeline.cli.main:cli"