feat(03-06): implement literature evidence models, PubMed fetch, and scoring

- Create LiteratureRecord Pydantic model with context-specific counts
- Implement PubMed query via Biopython Entrez with rate limiting (3/sec default, 10/sec with API key)
- Define SEARCH_CONTEXTS for cilia, sensory, cytoskeleton, cell_polarity queries
- Implement evidence tier classification: direct_experimental > functional_mention > hts_hit > incidental > none
- Implement quality-weighted scoring with bias mitigation via log2(total_pubmed_count) normalization
- Add biopython>=1.84 dependency to pyproject.toml
- Support checkpoint-restart for long-running PubMed queries (estimated 3-11 hours for 20K genes)
2026-02-11 19:00:20 +08:00
parent 6645c59b0b
commit 8aa66987f8
11 changed files with 1806 additions and 0 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
    "pyyaml>=6.0",
    "httpx>=0.28",
    "structlog>=25.0",
+    "biopython>=1.84",
 ]

 [project.optional-dependencies]
@@ -42,6 +43,9 @@ dev = [
    "pytest>=7.4.0",
    "pytest-cov>=4.1.0",
 ]
+expression = [
+    "cellxgene-census>=1.19",
+]

 [project.scripts]
 usher-pipeline = "usher_pipeline.cli.main:cli"