feat(01-01): create Python package scaffold with config system
- pyproject.toml: installable package with bioinformatics dependencies
- Pydantic config schema with validation (ensembl_release >= 100, directory creation)
- YAML config loader with override support
- Default config with Ensembl 113, gnomAD v4.1
- 5 passing tests for config validation and hashing
This commit is contained in:
25
config/default.yaml
Normal file
25
config/default.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Default pipeline configuration
# Validated against usher_pipeline.config.schema.PipelineConfig on load.

# Filesystem layout. data_dir and cache_dir are created automatically
# if they do not exist (see the create_directory validator in the schema).
data_dir: data
cache_dir: data/cache
duckdb_path: data/pipeline.duckdb

# Pinned external data source versions for reproducibility.
versions:
  ensembl_release: 113   # schema rejects releases < 100
  gnomad_version: v4.1
  gtex_version: v8
  hpa_version: "23.0"    # quoted so YAML keeps it a string, not a float

# API client behaviour (rate limiting, retries, response caching).
api:
  rate_limit_per_second: 5
  max_retries: 5
  cache_ttl_seconds: 86400   # 24 hours; 0 means cache forever
  timeout_seconds: 30

# Evidence-layer scoring weights; these defaults sum to 1.0.
scoring:
  gnomad: 0.20
  expression: 0.20
  annotation: 0.15
  localization: 0.15
  animal_model: 0.15
  literature: 0.15
|
||||
55
pyproject.toml
Normal file
55
pyproject.toml
Normal file
@@ -0,0 +1,55 @@
|
||||
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "usher-pipeline"
version = "0.1.0"
description = "Reproducible pipeline for discovering under-studied cilia/Usher candidate genes"
requires-python = ">=3.11"
authors = [
    {name = "Research Team"}
]
readme = "README.md"
license = {text = "MIT"}
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
]

dependencies = [
    "mygene>=3.2.0",
    "requests>=2.31.0",
    "requests-cache>=1.1.0",
    "tenacity>=8.2.0",
    "pydantic>=2.0",
    "pydantic-yaml>=1.2.0",
    "duckdb>=0.9.0",
    "click>=8.1.0",
    "polars>=0.19.0",
    "pyarrow>=14.0.0",
    "pyyaml>=6.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.4.0",
    "pytest-cov>=4.1.0",
]

[project.scripts]
usher-pipeline = "usher_pipeline.cli:main"

# src-layout package discovery.
# FIX: the explicit list `packages = ["usher_pipeline"]` omitted the
# usher_pipeline.config subpackage (src/usher_pipeline/config/), so installed
# distributions would be missing the config package entirely. Automatic
# discovery under src/ picks up every subpackage, now and in the future.
[tool.setuptools.packages.find]
where = ["src"]

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_functions = ["test_*"]
addopts = "-v --strict-markers"
|
||||
1
src/usher_pipeline/__init__.py
Normal file
1
src/usher_pipeline/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Top-level package for usher-pipeline."""

# Keep in sync with [project] version in pyproject.toml.
__version__ = "0.1.0"
|
||||
11
src/usher_pipeline/config/__init__.py
Normal file
11
src/usher_pipeline/config/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
"""Public API of the configuration subpackage.

Re-exports the YAML loaders and the Pydantic config models so callers can
write ``from usher_pipeline.config import load_config, PipelineConfig``.
"""

from .loader import load_config, load_config_with_overrides
from .schema import PipelineConfig, DataSourceVersions, ScoringWeights, APIConfig

__all__ = [
    "load_config",
    "load_config_with_overrides",
    "PipelineConfig",
    "DataSourceVersions",
    "ScoringWeights",
    "APIConfig",
]
|
||||
81
src/usher_pipeline/config/loader.py
Normal file
81
src/usher_pipeline/config/loader.py
Normal file
@@ -0,0 +1,81 @@
|
||||
"""Configuration loading with YAML parsing and validation."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pydantic_yaml
|
||||
|
||||
from .schema import PipelineConfig
|
||||
|
||||
|
||||
def load_config(config_path: Path | str) -> PipelineConfig:
    """Read a YAML file and validate it into a :class:`PipelineConfig`.

    Args:
        config_path: Location of the YAML configuration file.

    Returns:
        A fully validated PipelineConfig instance.

    Raises:
        FileNotFoundError: When no file exists at ``config_path``.
        pydantic.ValidationError: When the YAML content fails schema validation.
    """
    path = Path(config_path)

    # Fail fast with a clear message instead of letting the read raise.
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    # Hand the raw YAML text to pydantic-yaml, which parses and validates
    # against the schema in a single step.
    raw_yaml = path.read_text()
    return pydantic_yaml.parse_yaml_raw_as(PipelineConfig, raw_yaml)
|
||||
|
||||
|
||||
def load_config_with_overrides(
    config_path: Path | str,
    overrides: dict[str, Any],
) -> PipelineConfig:
    """
    Load config from YAML and apply dictionary overrides.

    Useful for CLI flags that override config file values. Nested values are
    addressed with dotted keys, e.g. ``"api.rate_limit_per_second"``.

    Args:
        config_path: Path to YAML configuration file
        overrides: Dictionary of values to override (nested keys supported)

    Returns:
        Validated PipelineConfig with overrides applied

    Raises:
        FileNotFoundError: If config file doesn't exist
        KeyError: If a dotted override path names a section that does not
            exist or does not address a mapping
        pydantic.ValidationError: If final config is invalid
    """
    # Load and validate the base config first.
    config = load_config(config_path)

    # Work on a plain dict so overrides can be merged before re-validation.
    config_dict = config.model_dump()

    for key, value in overrides.items():
        if "." in key:
            # Walk the dotted path down to the parent mapping. FIX: the
            # previous implementation raised a bare KeyError(part) on a
            # missing section (or an opaque TypeError when an intermediate
            # value was not a mapping); fail with an informative error.
            *section_parts, leaf = key.split(".")
            target = config_dict
            for part in section_parts:
                if not isinstance(target, dict) or part not in target:
                    raise KeyError(
                        f"Unknown config section {part!r} in override {key!r}"
                    )
                target = target[part]
            if not isinstance(target, dict):
                raise KeyError(
                    f"Override {key!r} does not address a config section"
                )
            target[leaf] = value
        else:
            # Top-level key: plain assignment.
            config_dict[key] = value

    # Re-validate so overridden values must still satisfy the schema.
    return PipelineConfig.model_validate(config_dict)
|
||||
150
src/usher_pipeline/config/schema.py
Normal file
150
src/usher_pipeline/config/schema.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""Pydantic models for pipeline configuration."""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
|
||||
class DataSourceVersions(BaseModel):
    """Version information for external data sources."""

    # Required; constrained to modern releases by ge=100.
    ensembl_release: int = Field(..., ge=100, description="Ensembl release number (must be >= 100)")
    gnomad_version: str = Field(default="v4.1", description="gnomAD version")
    gtex_version: str = Field(default="v8", description="GTEx version")
    hpa_version: str = Field(default="23.0", description="Human Protein Atlas version")
|
||||
|
||||
|
||||
class ScoringWeights(BaseModel):
    """Weights for multi-evidence scoring layers."""

    # Each weight is a fraction constrained to [0.0, 1.0]; defaults sum to 1.0.
    gnomad: float = Field(default=0.20, ge=0.0, le=1.0, description="Weight for genetic constraint evidence")
    expression: float = Field(default=0.20, ge=0.0, le=1.0, description="Weight for tissue expression evidence")
    annotation: float = Field(default=0.15, ge=0.0, le=1.0, description="Weight for annotation completeness")
    localization: float = Field(default=0.15, ge=0.0, le=1.0, description="Weight for subcellular localization evidence")
    animal_model: float = Field(default=0.15, ge=0.0, le=1.0, description="Weight for animal model phenotype evidence")
    literature: float = Field(default=0.15, ge=0.0, le=1.0, description="Weight for literature evidence")
|
||||
|
||||
|
||||
class APIConfig(BaseModel):
    """Configuration for API clients."""

    rate_limit_per_second: int = Field(default=5, ge=1, description="Maximum API requests per second")
    max_retries: int = Field(default=5, ge=1, le=20, description="Maximum retry attempts for failed requests")
    # A TTL of 0 is treated as "never expire" per the field description.
    cache_ttl_seconds: int = Field(default=86400, ge=0, description="Cache time-to-live in seconds (0 = infinite)")
    timeout_seconds: int = Field(default=30, ge=1, description="Request timeout in seconds")
|
||||
|
||||
|
||||
class PipelineConfig(BaseModel):
    """Main pipeline configuration."""

    data_dir: Path = Field(..., description="Directory for storing downloaded data")
    cache_dir: Path = Field(..., description="Directory for API response caching")
    duckdb_path: Path = Field(..., description="Path to DuckDB database file")
    versions: DataSourceVersions = Field(..., description="Data source version information")
    api: APIConfig = Field(..., description="API client configuration")
    scoring: ScoringWeights = Field(..., description="Scoring weights for evidence layers")

    @field_validator("data_dir", "cache_dir")
    @classmethod
    def create_directory(cls, v: Path) -> Path:
        """Ensure the directory exists (created as a validation side effect)."""
        # Idempotent: parents/exist_ok make repeated loads safe.
        v.mkdir(parents=True, exist_ok=True)
        return v

    def config_hash(self) -> str:
        """
        Compute SHA-256 hash of the configuration.

        Returns a deterministic hash based on all config values,
        useful for tracking config changes and cache invalidation.
        """
        # sort_keys makes the serialization order-independent; default=str
        # handles Path values, which json cannot encode natively.
        serialized = json.dumps(
            self.model_dump(mode="python"),
            sort_keys=True,
            default=str,
        )
        return hashlib.sha256(serialized.encode()).hexdigest()
|
||||
1
tests/__init__.py
Normal file
1
tests/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Tests package
|
||||
135
tests/test_config.py
Normal file
135
tests/test_config.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""Tests for configuration loading and validation."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from usher_pipeline.config import load_config, load_config_with_overrides
|
||||
from usher_pipeline.config.schema import PipelineConfig
|
||||
|
||||
|
||||
def test_load_valid_config():
    """The shipped default config parses into a valid PipelineConfig."""
    cfg = load_config("config/default.yaml")

    assert isinstance(cfg, PipelineConfig)

    # Spot-check a value from each section of the default config.
    assert cfg.versions.ensembl_release == 113
    assert cfg.versions.gnomad_version == "v4.1"
    assert cfg.api.rate_limit_per_second == 5
    assert cfg.api.max_retries == 5
    assert cfg.scoring.gnomad == 0.20
|
||||
|
||||
|
||||
def test_invalid_config_missing_field(tmp_path):
    """A config missing required fields must fail validation loudly."""
    cfg_file = tmp_path / "invalid.yaml"
    # Deliberately omit data_dir / cache_dir / duckdb_path.
    cfg_file.write_text("""
versions:
  ensembl_release: 113
api:
  rate_limit_per_second: 5
scoring:
  gnomad: 0.20
""")

    with pytest.raises(ValidationError) as exc_info:
        load_config(cfg_file)

    # The error message should name the missing field.
    assert "data_dir" in str(exc_info.value)
|
||||
|
||||
|
||||
def test_invalid_ensembl_release(tmp_path):
    """An ensembl_release below 100 must be rejected by the schema."""
    cfg_file = tmp_path / "invalid_ensembl.yaml"
    cfg_file.write_text("""
data_dir: data
cache_dir: data/cache
duckdb_path: data/pipeline.duckdb
versions:
  ensembl_release: 99
  gnomad_version: v4.1
api:
  rate_limit_per_second: 5
  max_retries: 5
  cache_ttl_seconds: 86400
  timeout_seconds: 30
scoring:
  gnomad: 0.20
  expression: 0.20
  annotation: 0.15
  localization: 0.15
  animal_model: 0.15
  literature: 0.15
""")

    with pytest.raises(ValidationError) as exc_info:
        load_config(cfg_file)

    # The error should point at the ensembl_release ge=100 constraint.
    msg = str(exc_info.value)
    assert "ensembl_release" in msg
    assert "greater than or equal to 100" in msg.lower() or "100" in msg
|
||||
|
||||
|
||||
def test_config_hash_deterministic():
    """Equal configs hash equally; different configs hash differently."""
    base_a = load_config("config/default.yaml")
    base_b = load_config("config/default.yaml")

    # Loading the same file twice must yield the same digest.
    assert base_a.config_hash() == base_b.config_hash()

    # SHA-256 hex digests are always 64 characters.
    assert len(base_a.config_hash()) == 64

    # Overriding any value must change the digest.
    tweaked = load_config_with_overrides(
        "config/default.yaml",
        {"api.rate_limit_per_second": 10},
    )
    assert tweaked.config_hash() != base_a.config_hash()
|
||||
|
||||
|
||||
def test_config_creates_directories(tmp_path):
    """Test that loading config creates data and cache directories."""
    config_file = tmp_path / "test_config.yaml"

    # Use non-existent directories so we can observe them being created
    # by the create_directory validator on PipelineConfig.
    data_dir = tmp_path / "test_data"
    cache_dir = tmp_path / "test_cache"

    config_file.write_text(f"""
data_dir: {data_dir}
cache_dir: {cache_dir}
duckdb_path: {tmp_path / "test.duckdb"}
versions:
  ensembl_release: 113
  gnomad_version: v4.1
api:
  rate_limit_per_second: 5
  max_retries: 5
  cache_ttl_seconds: 86400
  timeout_seconds: 30
scoring:
  gnomad: 0.20
  expression: 0.20
  annotation: 0.15
  localization: 0.15
  animal_model: 0.15
  literature: 0.15
""")

    # Directories should not exist before loading
    assert not data_dir.exists()
    assert not cache_dir.exists()

    # Load config
    config = load_config(config_file)

    # Directories should be created as a side effect of validation
    assert data_dir.exists()
    assert cache_dir.exists()
    assert data_dir.is_dir()
    assert cache_dir.is_dir()
|
||||
Reference in New Issue
Block a user