feat(01-01): create Python package scaffold with config system

- pyproject.toml: installable package with bioinformatics dependencies
- Pydantic config schema with validation (ensembl_release >= 100, directory creation)
- YAML config loader with override support
- Default config with Ensembl 113, gnomAD v4.1
- 5 passing tests for config validation and hashing
2026-02-11 16:24:35 +08:00
parent cab2f5fc66
commit 4a80a0398e
8 changed files with 459 additions and 0 deletions
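
The default config file itself isn't reproduced on this page, so as a rough sketch, a minimal YAML accepted by the schema below might look like the following. Only Ensembl 113 and gnomAD v4.1 come from the commit message; the import path and all paths in the YAML are assumptions for illustration:

# Hypothetical sketch, not the committed defaults file: a minimal YAML
# document the PipelineConfig schema should accept. ensembl_release=113 and
# gnomad_version="v4.1" are from the commit message; paths are illustrative.
import pydantic_yaml

from pipeline.config import PipelineConfig  # assumed import path

CONFIG_YAML = """
data_dir: ./data
cache_dir: ./cache
duckdb_path: ./data/pipeline.duckdb
versions:
  ensembl_release: 113
  gnomad_version: v4.1
api: {}        # every APIConfig field has a default
scoring: {}    # every ScoringWeights field has a default
"""

config = pydantic_yaml.parse_yaml_raw_as(PipelineConfig, CONFIG_YAML)
print(config.versions.ensembl_release)  # -> 113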

View File

@@ -0,0 +1 @@
__version__ = "0.1.0"

View File

@@ -0,0 +1,11 @@
from .loader import load_config, load_config_with_overrides
from .schema import PipelineConfig, DataSourceVersions, ScoringWeights, APIConfig

__all__ = [
    "load_config",
    "load_config_with_overrides",
    "PipelineConfig",
    "DataSourceVersions",
    "ScoringWeights",
    "APIConfig",
]

View File

@@ -0,0 +1,81 @@
"""Configuration loading with YAML parsing and validation."""
from pathlib import Path
from typing import Any
import pydantic_yaml
from .schema import PipelineConfig
def load_config(config_path: Path | str) -> PipelineConfig:
"""
Load and validate pipeline configuration from YAML file.
Args:
config_path: Path to YAML configuration file
Returns:
Validated PipelineConfig instance
Raises:
FileNotFoundError: If config file doesn't exist
pydantic.ValidationError: If config is invalid
"""
config_path = Path(config_path)
if not config_path.exists():
raise FileNotFoundError(f"Config file not found: {config_path}")
# Read YAML file
with open(config_path, "r") as f:
yaml_content = f.read()
# Parse and validate with Pydantic
config = pydantic_yaml.parse_yaml_raw_as(PipelineConfig, yaml_content)
return config
def load_config_with_overrides(
config_path: Path | str,
overrides: dict[str, Any],
) -> PipelineConfig:
"""
Load config from YAML and apply dictionary overrides.
Useful for CLI flags that override config file values.
Args:
config_path: Path to YAML configuration file
overrides: Dictionary of values to override (nested keys supported)
Returns:
Validated PipelineConfig with overrides applied
Raises:
FileNotFoundError: If config file doesn't exist
pydantic.ValidationError: If final config is invalid
"""
# Load base config
config = load_config(config_path)
# Convert to dict, apply overrides, re-validate
config_dict = config.model_dump()
# Apply overrides (simple flat merge for now)
for key, value in overrides.items():
if "." in key:
# Handle nested keys like "api.rate_limit_per_second"
parts = key.split(".")
target = config_dict
for part in parts[:-1]:
target = target[part]
target[parts[-1]] = value
else:
config_dict[key] = value
# Re-validate with overrides applied
config = PipelineConfig.model_validate(config_dict)
return config
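
As a usage sketch (the config filename here is illustrative), dotted keys map flat CLI-style overrides onto nested fields, matching the "api.rate_limit_per_second" form noted in the merge loop:

# Hypothetical usage; "config.yaml" is an illustrative path.
config = load_config_with_overrides(
    "config.yaml",
    {
        "api.rate_limit_per_second": 10,  # nested field via dotted key
        "versions.ensembl_release": 113,  # nested field via dotted key
    },
)
assert config.api.rate_limit_per_second == 10

Note that the merge only walks keys that already exist in the dumped config, so a typo in a parent segment of a dotted key raises KeyError before re-validation runs.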

View File

@@ -0,0 +1,150 @@
"""Pydantic models for pipeline configuration."""
import hashlib
import json
from pathlib import Path
from typing import Any
from pydantic import BaseModel, Field, field_validator
class DataSourceVersions(BaseModel):
"""Version information for external data sources."""
ensembl_release: int = Field(
...,
ge=100,
description="Ensembl release number (must be >= 100)",
)
gnomad_version: str = Field(
default="v4.1",
description="gnomAD version",
)
gtex_version: str = Field(
default="v8",
description="GTEx version",
)
hpa_version: str = Field(
default="23.0",
description="Human Protein Atlas version",
)
class ScoringWeights(BaseModel):
"""Weights for multi-evidence scoring layers."""
gnomad: float = Field(
default=0.20,
ge=0.0,
le=1.0,
description="Weight for genetic constraint evidence",
)
expression: float = Field(
default=0.20,
ge=0.0,
le=1.0,
description="Weight for tissue expression evidence",
)
annotation: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for annotation completeness",
)
localization: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for subcellular localization evidence",
)
animal_model: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for animal model phenotype evidence",
)
literature: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for literature evidence",
)
class APIConfig(BaseModel):
"""Configuration for API clients."""
rate_limit_per_second: int = Field(
default=5,
ge=1,
description="Maximum API requests per second",
)
max_retries: int = Field(
default=5,
ge=1,
le=20,
description="Maximum retry attempts for failed requests",
)
cache_ttl_seconds: int = Field(
default=86400,
ge=0,
description="Cache time-to-live in seconds (0 = infinite)",
)
timeout_seconds: int = Field(
default=30,
ge=1,
description="Request timeout in seconds",
)
class PipelineConfig(BaseModel):
"""Main pipeline configuration."""
data_dir: Path = Field(
...,
description="Directory for storing downloaded data",
)
cache_dir: Path = Field(
...,
description="Directory for API response caching",
)
duckdb_path: Path = Field(
...,
description="Path to DuckDB database file",
)
versions: DataSourceVersions = Field(
...,
description="Data source version information",
)
api: APIConfig = Field(
...,
description="API client configuration",
)
scoring: ScoringWeights = Field(
...,
description="Scoring weights for evidence layers",
)
@field_validator("data_dir", "cache_dir")
@classmethod
def create_directory(cls, v: Path) -> Path:
"""Create directory if it doesn't exist."""
v.mkdir(parents=True, exist_ok=True)
return v
def config_hash(self) -> str:
"""
Compute SHA-256 hash of the configuration.
Returns a deterministic hash based on all config values,
useful for tracking config changes and cache invalidation.
"""
# Convert config to dict and serialize deterministically
config_dict = self.model_dump(mode="python")
# Convert Path objects to strings for JSON serialization
config_json = json.dumps(
config_dict,
sort_keys=True,
default=str,
)
return hashlib.sha256(config_json.encode()).hexdigest()
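
A quick sketch of the properties the hashing and validation tests presumably exercise: equal configs hash identically (sorted-key JSON makes the serialization deterministic), and the ge=100 bound rejects old Ensembl releases. The field values and temp paths below are illustrative, not taken from the committed tests:

# Hypothetical check; values and paths are illustrative.
import pydantic

base = dict(
    data_dir="/tmp/demo-data",
    cache_dir="/tmp/demo-cache",
    duckdb_path="/tmp/demo-data/pipeline.duckdb",
    versions={"ensembl_release": 113},
    api={},
    scoring={},
)

a = PipelineConfig.model_validate(base)
b = PipelineConfig.model_validate(base)
assert a.config_hash() == b.config_hash()  # deterministic serialization

try:
    PipelineConfig.model_validate({**base, "versions": {"ensembl_release": 99}})
except pydantic.ValidationError:
    pass  # ensembl_release must be >= 100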