feat(01-01): create Python package scaffold with config system

- pyproject.toml: installable package with bioinformatics dependencies
- Pydantic config schema with validation (ensembl_release >= 100, directory creation)
- YAML config loader with override support
- Default config with Ensembl 113, gnomAD v4.1
- 5 passing tests for config validation and hashing
2026-02-11 16:24:35 +08:00
parent cab2f5fc66
commit 4a80a0398e
8 changed files with 459 additions and 0 deletions
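
In practice, the scaffold added below is driven like this (a minimal sketch using only what this commit provides):

from usher_pipeline.config import load_config

config = load_config("config/default.yaml")   # parsed and validated PipelineConfig
print(config.versions.ensembl_release)        # 113, from config/default.yaml
print(config.config_hash()[:12])              # short fingerprint for change tracking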

config/default.yaml

@@ -0,0 +1,25 @@
# Default pipeline configuration

data_dir: data
cache_dir: data/cache
duckdb_path: data/pipeline.duckdb

versions:
  ensembl_release: 113
  gnomad_version: v4.1
  gtex_version: v8
  hpa_version: "23.0"

api:
  rate_limit_per_second: 5
  max_retries: 5
  cache_ttl_seconds: 86400
  timeout_seconds: 30

scoring:
  gnomad: 0.20
  expression: 0.20
  annotation: 0.15
  localization: 0.15
  animal_model: 0.15
  literature: 0.15
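
Note the six scoring weights sum to 1.0 (2 × 0.20 + 4 × 0.15 = 1.00). The ScoringWeights schema in this commit does not enforce that invariant; a hypothetical Pydantic v2 guard (not part of this change) could be added inside ScoringWeights:

from pydantic import model_validator  # hypothetical addition, not in this commit

@model_validator(mode="after")
def _weights_sum_to_one(self) -> "ScoringWeights":
    # Reject weight sets that do not total 1.0 (within float tolerance).
    total = (self.gnomad + self.expression + self.annotation
             + self.localization + self.animal_model + self.literature)
    if abs(total - 1.0) > 1e-9:
        raise ValueError(f"scoring weights must sum to 1.0, got {total}")
    return self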

pyproject.toml

@@ -0,0 +1,55 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "usher-pipeline"
version = "0.1.0"
description = "Reproducible pipeline for discovering under-studied cilia/Usher candidate genes"
requires-python = ">=3.11"
authors = [
    {name = "Research Team"}
]
readme = "README.md"
license = {text = "MIT"}
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
]
dependencies = [
    "mygene>=3.2.0",
    "requests>=2.31.0",
    "requests-cache>=1.1.0",
    "tenacity>=8.2.0",
    "pydantic>=2.0",
    "pydantic-yaml>=1.2.0",
    "duckdb>=0.9.0",
    "click>=8.1.0",
    "polars>=0.19.0",
    "pyarrow>=14.0.0",
    "pyyaml>=6.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.4.0",
    "pytest-cov>=4.1.0",
]

[project.scripts]
usher-pipeline = "usher_pipeline.cli:main"

[tool.setuptools]
packages = ["usher_pipeline"]
package-dir = {"" = "src"}

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_functions = ["test_*"]
addopts = "-v --strict-markers"
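
A quick usage note: `pip install -e ".[dev]"` installs the package in editable mode with the pytest/pytest-cov extras, after which a bare `pytest` picks up tests/ via the [tool.pytest.ini_options] table above. The usher-pipeline console script points at usher_pipeline.cli:main, a module that does not appear among this commit's eight files, so the entry point presumably lands in a later change.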

src/usher_pipeline/__init__.py

@@ -0,0 +1 @@
__version__ = "0.1.0"

src/usher_pipeline/config/__init__.py

@@ -0,0 +1,11 @@
from .loader import load_config, load_config_with_overrides
from .schema import PipelineConfig, DataSourceVersions, ScoringWeights, APIConfig

__all__ = [
    "load_config",
    "load_config_with_overrides",
    "PipelineConfig",
    "DataSourceVersions",
    "ScoringWeights",
    "APIConfig",
]

src/usher_pipeline/config/loader.py

@@ -0,0 +1,81 @@
"""Configuration loading with YAML parsing and validation."""
from pathlib import Path
from typing import Any
import pydantic_yaml
from .schema import PipelineConfig
def load_config(config_path: Path | str) -> PipelineConfig:
"""
Load and validate pipeline configuration from YAML file.
Args:
config_path: Path to YAML configuration file
Returns:
Validated PipelineConfig instance
Raises:
FileNotFoundError: If config file doesn't exist
pydantic.ValidationError: If config is invalid
"""
config_path = Path(config_path)
if not config_path.exists():
raise FileNotFoundError(f"Config file not found: {config_path}")
# Read YAML file
with open(config_path, "r") as f:
yaml_content = f.read()
# Parse and validate with Pydantic
config = pydantic_yaml.parse_yaml_raw_as(PipelineConfig, yaml_content)
return config
def load_config_with_overrides(
config_path: Path | str,
overrides: dict[str, Any],
) -> PipelineConfig:
"""
Load config from YAML and apply dictionary overrides.
Useful for CLI flags that override config file values.
Args:
config_path: Path to YAML configuration file
overrides: Dictionary of values to override (nested keys supported)
Returns:
Validated PipelineConfig with overrides applied
Raises:
FileNotFoundError: If config file doesn't exist
pydantic.ValidationError: If final config is invalid
"""
# Load base config
config = load_config(config_path)
# Convert to dict, apply overrides, re-validate
config_dict = config.model_dump()
# Apply overrides (simple flat merge for now)
for key, value in overrides.items():
if "." in key:
# Handle nested keys like "api.rate_limit_per_second"
parts = key.split(".")
target = config_dict
for part in parts[:-1]:
target = target[part]
target[parts[-1]] = value
else:
config_dict[key] = value
# Re-validate with overrides applied
config = PipelineConfig.model_validate(config_dict)
return config
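
For orientation, the dotted-key override path looks like this in use (mirroring test_config_hash_deterministic below):

from usher_pipeline.config import load_config_with_overrides

# "api.rate_limit_per_second" walks into the nested api section before re-validation
config = load_config_with_overrides(
    "config/default.yaml",
    {"api.rate_limit_per_second": 10},
)
assert config.api.rate_limit_per_second == 10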

src/usher_pipeline/config/schema.py

@@ -0,0 +1,150 @@
"""Pydantic models for pipeline configuration."""
import hashlib
import json
from pathlib import Path
from typing import Any
from pydantic import BaseModel, Field, field_validator
class DataSourceVersions(BaseModel):
"""Version information for external data sources."""
ensembl_release: int = Field(
...,
ge=100,
description="Ensembl release number (must be >= 100)",
)
gnomad_version: str = Field(
default="v4.1",
description="gnomAD version",
)
gtex_version: str = Field(
default="v8",
description="GTEx version",
)
hpa_version: str = Field(
default="23.0",
description="Human Protein Atlas version",
)
class ScoringWeights(BaseModel):
"""Weights for multi-evidence scoring layers."""
gnomad: float = Field(
default=0.20,
ge=0.0,
le=1.0,
description="Weight for genetic constraint evidence",
)
expression: float = Field(
default=0.20,
ge=0.0,
le=1.0,
description="Weight for tissue expression evidence",
)
annotation: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for annotation completeness",
)
localization: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for subcellular localization evidence",
)
animal_model: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for animal model phenotype evidence",
)
literature: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for literature evidence",
)
class APIConfig(BaseModel):
"""Configuration for API clients."""
rate_limit_per_second: int = Field(
default=5,
ge=1,
description="Maximum API requests per second",
)
max_retries: int = Field(
default=5,
ge=1,
le=20,
description="Maximum retry attempts for failed requests",
)
cache_ttl_seconds: int = Field(
default=86400,
ge=0,
description="Cache time-to-live in seconds (0 = infinite)",
)
timeout_seconds: int = Field(
default=30,
ge=1,
description="Request timeout in seconds",
)
class PipelineConfig(BaseModel):
"""Main pipeline configuration."""
data_dir: Path = Field(
...,
description="Directory for storing downloaded data",
)
cache_dir: Path = Field(
...,
description="Directory for API response caching",
)
duckdb_path: Path = Field(
...,
description="Path to DuckDB database file",
)
versions: DataSourceVersions = Field(
...,
description="Data source version information",
)
api: APIConfig = Field(
...,
description="API client configuration",
)
scoring: ScoringWeights = Field(
...,
description="Scoring weights for evidence layers",
)
@field_validator("data_dir", "cache_dir")
@classmethod
def create_directory(cls, v: Path) -> Path:
"""Create directory if it doesn't exist."""
v.mkdir(parents=True, exist_ok=True)
return v
def config_hash(self) -> str:
"""
Compute SHA-256 hash of the configuration.
Returns a deterministic hash based on all config values,
useful for tracking config changes and cache invalidation.
"""
# Convert config to dict and serialize deterministically
config_dict = self.model_dump(mode="python")
# Convert Path objects to strings for JSON serialization
config_json = json.dumps(
config_dict,
sort_keys=True,
default=str,
)
return hashlib.sha256(config_json.encode()).hexdigest()
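
A sketch of how config_hash() supports cache invalidation; the per-hash output directory here is illustrative, not something this commit sets up:

from usher_pipeline.config import load_config

config = load_config("config/default.yaml")
run_key = config.config_hash()                 # deterministic 64-char hex digest

# Keying outputs by config hash means any config change (say, a new
# ensembl_release) produces a fresh directory instead of reusing stale results.
output_dir = config.cache_dir / run_key
output_dir.mkdir(parents=True, exist_ok=True)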

tests/__init__.py

@@ -0,0 +1 @@
# Tests package

tests/test_config.py

@@ -0,0 +1,135 @@
"""Tests for configuration loading and validation."""
from pathlib import Path
import pytest
from pydantic import ValidationError
from usher_pipeline.config import load_config, load_config_with_overrides
from usher_pipeline.config.schema import PipelineConfig
def test_load_valid_config():
"""Test loading valid default configuration."""
config = load_config("config/default.yaml")
assert isinstance(config, PipelineConfig)
assert config.versions.ensembl_release == 113
assert config.versions.gnomad_version == "v4.1"
assert config.api.rate_limit_per_second == 5
assert config.api.max_retries == 5
assert config.scoring.gnomad == 0.20
def test_invalid_config_missing_field(tmp_path):
"""Test that missing required field raises ValidationError."""
invalid_config = tmp_path / "invalid.yaml"
invalid_config.write_text("""
versions:
ensembl_release: 113
api:
rate_limit_per_second: 5
scoring:
gnomad: 0.20
""")
with pytest.raises(ValidationError) as exc_info:
load_config(invalid_config)
# Check that error mentions missing field
assert "data_dir" in str(exc_info.value)
def test_invalid_ensembl_release(tmp_path):
"""Test that ensembl_release < 100 raises ValidationError."""
invalid_config = tmp_path / "invalid_ensembl.yaml"
invalid_config.write_text("""
data_dir: data
cache_dir: data/cache
duckdb_path: data/pipeline.duckdb
versions:
ensembl_release: 99
gnomad_version: v4.1
api:
rate_limit_per_second: 5
max_retries: 5
cache_ttl_seconds: 86400
timeout_seconds: 30
scoring:
gnomad: 0.20
expression: 0.20
annotation: 0.15
localization: 0.15
animal_model: 0.15
literature: 0.15
""")
with pytest.raises(ValidationError) as exc_info:
load_config(invalid_config)
# Check that error mentions ensembl_release constraint
error_str = str(exc_info.value)
assert "ensembl_release" in error_str
assert "greater than or equal to 100" in error_str.lower() or "100" in error_str
def test_config_hash_deterministic():
"""Test that config hash is deterministic and changes with config."""
config1 = load_config("config/default.yaml")
config2 = load_config("config/default.yaml")
# Same config should produce same hash
assert config1.config_hash() == config2.config_hash()
# Hash should be SHA-256 (64 hex chars)
assert len(config1.config_hash()) == 64
# Different config should produce different hash
config3 = load_config_with_overrides(
"config/default.yaml",
{"api.rate_limit_per_second": 10},
)
assert config3.config_hash() != config1.config_hash()
def test_config_creates_directories(tmp_path):
"""Test that loading config creates data and cache directories."""
config_file = tmp_path / "test_config.yaml"
# Use non-existent directories
data_dir = tmp_path / "test_data"
cache_dir = tmp_path / "test_cache"
config_file.write_text(f"""
data_dir: {data_dir}
cache_dir: {cache_dir}
duckdb_path: {tmp_path / "test.duckdb"}
versions:
ensembl_release: 113
gnomad_version: v4.1
api:
rate_limit_per_second: 5
max_retries: 5
cache_ttl_seconds: 86400
timeout_seconds: 30
scoring:
gnomad: 0.20
expression: 0.20
annotation: 0.15
localization: 0.15
animal_model: 0.15
literature: 0.15
""")
# Directories should not exist before loading
assert not data_dir.exists()
assert not cache_dir.exists()
# Load config
config = load_config(config_file)
# Directories should be created
assert data_dir.exists()
assert cache_dir.exists()
assert data_dir.is_dir()
assert cache_dir.is_dir()