feat(01-01): create Python package scaffold with config system

- pyproject.toml: installable package with bioinformatics dependencies
- Pydantic config schema with validation (ensembl_release >= 100, directory creation)
- YAML config loader with override support
- Default config with Ensembl 113, gnomAD v4.1
- 5 passing tests for config validation and hashing
2026-02-11 16:24:35 +08:00
parent cab2f5fc66
commit 4a80a0398e
8 changed files with 459 additions and 0 deletions
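
The default config file itself isn't reproduced on this page, so as a rough sketch, a minimal YAML accepted by the schema below might look like the following. Only Ensembl 113 and gnomAD v4.1 come from the commit message; the import path and all paths in the YAML are assumptions for illustration:

# Hypothetical sketch, not the committed defaults file: a minimal YAML
# document the PipelineConfig schema should accept. ensembl_release=113 and
# gnomad_version="v4.1" are from the commit message; paths are illustrative.
import pydantic_yaml

from pipeline.config import PipelineConfig  # assumed import path

CONFIG_YAML = """
data_dir: ./data
cache_dir: ./cache
duckdb_path: ./data/pipeline.duckdb
versions:
  ensembl_release: 113
  gnomad_version: v4.1
api: {}        # every APIConfig field has a default
scoring: {}    # every ScoringWeights field has a default
"""

config = pydantic_yaml.parse_yaml_raw_as(PipelineConfig, CONFIG_YAML)
print(config.versions.ensembl_release)  # -> 113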

View File

@@ -0,0 +1 @@
__version__ = "0.1.0"

View File

@@ -0,0 +1,11 @@
from .loader import load_config, load_config_with_overrides
from .schema import PipelineConfig, DataSourceVersions, ScoringWeights, APIConfig

__all__ = [
    "load_config",
    "load_config_with_overrides",
    "PipelineConfig",
    "DataSourceVersions",
    "ScoringWeights",
    "APIConfig",
]

View File

@@ -0,0 +1,81 @@
"""Configuration loading with YAML parsing and validation."""
from pathlib import Path
from typing import Any
import pydantic_yaml
from .schema import PipelineConfig
def load_config(config_path: Path | str) -> PipelineConfig:
"""
Load and validate pipeline configuration from YAML file.
Args:
config_path: Path to YAML configuration file
Returns:
Validated PipelineConfig instance
Raises:
FileNotFoundError: If config file doesn't exist
pydantic.ValidationError: If config is invalid
"""
config_path = Path(config_path)
if not config_path.exists():
raise FileNotFoundError(f"Config file not found: {config_path}")
# Read YAML file
with open(config_path, "r") as f:
yaml_content = f.read()
# Parse and validate with Pydantic
config = pydantic_yaml.parse_yaml_raw_as(PipelineConfig, yaml_content)
return config
def load_config_with_overrides(
config_path: Path | str,
overrides: dict[str, Any],
) -> PipelineConfig:
"""
Load config from YAML and apply dictionary overrides.
Useful for CLI flags that override config file values.
Args:
config_path: Path to YAML configuration file
overrides: Dictionary of values to override (nested keys supported)
Returns:
Validated PipelineConfig with overrides applied
Raises:
FileNotFoundError: If config file doesn't exist
pydantic.ValidationError: If final config is invalid
"""
# Load base config
config = load_config(config_path)
# Convert to dict, apply overrides, re-validate
config_dict = config.model_dump()
# Apply overrides (simple flat merge for now)
for key, value in overrides.items():
if "." in key:
# Handle nested keys like "api.rate_limit_per_second"
parts = key.split(".")
target = config_dict
for part in parts[:-1]:
target = target[part]
target[parts[-1]] = value
else:
config_dict[key] = value
# Re-validate with overrides applied
config = PipelineConfig.model_validate(config_dict)
return config
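
As a usage sketch (the config filename here is illustrative), dotted keys map flat CLI-style overrides onto nested fields, matching the "api.rate_limit_per_second" form noted in the merge loop:

# Hypothetical usage; "config.yaml" is an illustrative path.
config = load_config_with_overrides(
    "config.yaml",
    {
        "api.rate_limit_per_second": 10,  # nested field via dotted key
        "versions.ensembl_release": 113,  # nested field via dotted key
    },
)
assert config.api.rate_limit_per_second == 10

Note that the merge only walks keys that already exist in the dumped config, so a typo in a parent segment of a dotted key raises KeyError before re-validation runs.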

View File

@@ -0,0 +1,150 @@
"""Pydantic models for pipeline configuration."""
import hashlib
import json
from pathlib import Path
from typing import Any
from pydantic import BaseModel, Field, field_validator
class DataSourceVersions(BaseModel):
"""Version information for external data sources."""
ensembl_release: int = Field(
...,
ge=100,
description="Ensembl release number (must be >= 100)",
)
gnomad_version: str = Field(
default="v4.1",
description="gnomAD version",
)
gtex_version: str = Field(
default="v8",
description="GTEx version",
)
hpa_version: str = Field(
default="23.0",
description="Human Protein Atlas version",
)
class ScoringWeights(BaseModel):
"""Weights for multi-evidence scoring layers."""
gnomad: float = Field(
default=0.20,
ge=0.0,
le=1.0,
description="Weight for genetic constraint evidence",
)
expression: float = Field(
default=0.20,
ge=0.0,
le=1.0,
description="Weight for tissue expression evidence",
)
annotation: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for annotation completeness",
)
localization: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for subcellular localization evidence",
)
animal_model: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for animal model phenotype evidence",
)
literature: float = Field(
default=0.15,
ge=0.0,
le=1.0,
description="Weight for literature evidence",
)
class APIConfig(BaseModel):
"""Configuration for API clients."""
rate_limit_per_second: int = Field(
default=5,
ge=1,
description="Maximum API requests per second",
)
max_retries: int = Field(
default=5,
ge=1,
le=20,
description="Maximum retry attempts for failed requests",
)
cache_ttl_seconds: int = Field(
default=86400,
ge=0,
description="Cache time-to-live in seconds (0 = infinite)",
)
timeout_seconds: int = Field(
default=30,
ge=1,
description="Request timeout in seconds",
)
class PipelineConfig(BaseModel):
"""Main pipeline configuration."""
data_dir: Path = Field(
...,
description="Directory for storing downloaded data",
)
cache_dir: Path = Field(
...,
description="Directory for API response caching",
)
duckdb_path: Path = Field(
...,
description="Path to DuckDB database file",
)
versions: DataSourceVersions = Field(
...,
description="Data source version information",
)
api: APIConfig = Field(
...,
description="API client configuration",
)
scoring: ScoringWeights = Field(
...,
description="Scoring weights for evidence layers",
)
@field_validator("data_dir", "cache_dir")
@classmethod
def create_directory(cls, v: Path) -> Path:
"""Create directory if it doesn't exist."""
v.mkdir(parents=True, exist_ok=True)
return v
def config_hash(self) -> str:
"""
Compute SHA-256 hash of the configuration.
Returns a deterministic hash based on all config values,
useful for tracking config changes and cache invalidation.
"""
# Convert config to dict and serialize deterministically
config_dict = self.model_dump(mode="python")
# Convert Path objects to strings for JSON serialization
config_json = json.dumps(
config_dict,
sort_keys=True,
default=str,
)
return hashlib.sha256(config_json.encode()).hexdigest()
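
A quick sketch of the properties the hashing and validation tests presumably exercise: equal configs hash identically (sorted-key JSON makes the serialization deterministic), and the ge=100 bound rejects old Ensembl releases. The field values and temp paths below are illustrative, not taken from the committed tests:

# Hypothetical check; values and paths are illustrative.
import pydantic

base = dict(
    data_dir="/tmp/demo-data",
    cache_dir="/tmp/demo-cache",
    duckdb_path="/tmp/demo-data/pipeline.duckdb",
    versions={"ensembl_release": 113},
    api={},
    scoring={},
)

a = PipelineConfig.model_validate(base)
b = PipelineConfig.model_validate(base)
assert a.config_hash() == b.config_hash()  # deterministic serialization

try:
    PipelineConfig.model_validate({**base, "versions": {"ensembl_release": 99}})
except pydantic.ValidationError:
    pass  # ensembl_release must be >= 100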