diff --git a/config/default.yaml b/config/default.yaml
new file mode 100644
index 0000000..f776ac9
--- /dev/null
+++ b/config/default.yaml
@@ -0,0 +1,25 @@
+# Default pipeline configuration
+
+data_dir: data
+cache_dir: data/cache
+duckdb_path: data/pipeline.duckdb
+
+versions:
+  ensembl_release: 113
+  gnomad_version: v4.1
+  gtex_version: v8
+  hpa_version: "23.0"
+
+api:
+  rate_limit_per_second: 5
+  max_retries: 5
+  cache_ttl_seconds: 86400
+  timeout_seconds: 30
+
+scoring:
+  gnomad: 0.20
+  expression: 0.20
+  annotation: 0.15
+  localization: 0.15
+  animal_model: 0.15
+  literature: 0.15
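The six scoring weights sum to 1.00 (0.20 + 0.20 + 4 × 0.15), so the evidence layers combine as a weighted average. A quick sanity check against this file, using only pyyaml (already a declared dependency) — a sketch, not part of the change:

    import yaml

    # Load the default config and confirm the scoring weights form a convex combination
    with open("config/default.yaml") as f:
        cfg = yaml.safe_load(f)

    assert abs(sum(cfg["scoring"].values()) - 1.0) < 1e-9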
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..8055a74
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,55 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "usher-pipeline"
+version = "0.1.0"
+description = "Reproducible pipeline for discovering under-studied cilia/Usher candidate genes"
+requires-python = ">=3.11"
+authors = [
+    {name = "Research Team"}
+]
+readme = "README.md"
+license = {text = "MIT"}
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+
+dependencies = [
+    "mygene>=3.2.0",
+    "requests>=2.31.0",
+    "requests-cache>=1.1.0",
+    "tenacity>=8.2.0",
+    "pydantic>=2.0",
+    "pydantic-yaml>=1.2.0",
+    "duckdb>=0.9.0",
+    "click>=8.1.0",
+    "polars>=0.19.0",
+    "pyarrow>=14.0.0",
+    "pyyaml>=6.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.4.0",
+    "pytest-cov>=4.1.0",
+]
+
+[project.scripts]
+usher-pipeline = "usher_pipeline.cli:main"
+
+[tool.setuptools]
+packages = ["usher_pipeline", "usher_pipeline.config"]
+package-dir = {"" = "src"}
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+python_functions = ["test_*"]
+addopts = "-v --strict-markers"
diff --git a/src/usher_pipeline/__init__.py b/src/usher_pipeline/__init__.py
new file mode 100644
index 0000000..3dc1f76
--- /dev/null
+++ b/src/usher_pipeline/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.1.0"
diff --git a/src/usher_pipeline/config/__init__.py b/src/usher_pipeline/config/__init__.py
new file mode 100644
index 0000000..381139c
--- /dev/null
+++ b/src/usher_pipeline/config/__init__.py
@@ -0,0 +1,11 @@
+from .loader import load_config, load_config_with_overrides
+from .schema import PipelineConfig, DataSourceVersions, ScoringWeights, APIConfig
+
+__all__ = [
+    "load_config",
+    "load_config_with_overrides",
+    "PipelineConfig",
+    "DataSourceVersions",
+    "ScoringWeights",
+    "APIConfig",
+]
diff --git a/src/usher_pipeline/config/loader.py b/src/usher_pipeline/config/loader.py
new file mode 100644
index 0000000..e2a5e49
--- /dev/null
+++ b/src/usher_pipeline/config/loader.py
@@ -0,0 +1,81 @@
+"""Configuration loading with YAML parsing and validation."""
+
+from pathlib import Path
+from typing import Any
+
+import pydantic_yaml
+
+from .schema import PipelineConfig
+
+
+def load_config(config_path: Path | str) -> PipelineConfig:
+    """
+    Load and validate pipeline configuration from a YAML file.
+
+    Args:
+        config_path: Path to YAML configuration file
+
+    Returns:
+        Validated PipelineConfig instance
+
+    Raises:
+        FileNotFoundError: If config file doesn't exist
+        pydantic.ValidationError: If config is invalid
+    """
+    config_path = Path(config_path)
+
+    if not config_path.exists():
+        raise FileNotFoundError(f"Config file not found: {config_path}")
+
+    # Read YAML file
+    with open(config_path, "r") as f:
+        yaml_content = f.read()
+
+    # Parse and validate with Pydantic
+    config = pydantic_yaml.parse_yaml_raw_as(PipelineConfig, yaml_content)
+
+    return config
+
+
+def load_config_with_overrides(
+    config_path: Path | str,
+    overrides: dict[str, Any],
+) -> PipelineConfig:
+    """
+    Load config from YAML and apply dictionary overrides.
+
+    Useful for CLI flags that override config file values.
+
+    Args:
+        config_path: Path to YAML configuration file
+        overrides: Dictionary of values to override (dotted keys address nested fields)
+
+    Returns:
+        Validated PipelineConfig with overrides applied
+
+    Raises:
+        FileNotFoundError: If config file doesn't exist
+        pydantic.ValidationError: If final config is invalid
+    """
+    # Load base config
+    config = load_config(config_path)
+
+    # Convert to dict, apply overrides, re-validate
+    config_dict = config.model_dump()
+
+    # Apply overrides; dotted keys walk into nested sections
+    for key, value in overrides.items():
+        if "." in key:
+            # Handle nested keys like "api.rate_limit_per_second"
+            parts = key.split(".")
+            target = config_dict
+            for part in parts[:-1]:
+                target = target[part]
+            target[parts[-1]] = value
+        else:
+            config_dict[key] = value
+
+    # Re-validate with overrides applied
+    config = PipelineConfig.model_validate(config_dict)
+
+    return config
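Usage sketch for the dotted-key override path — e.g. a CLI flag mapped onto a nested field (the override values here are illustrative):

    from usher_pipeline.config import load_config_with_overrides

    # Override a nested API setting and a versions field via dotted keys
    config = load_config_with_overrides(
        "config/default.yaml",
        {"api.rate_limit_per_second": 2, "versions.ensembl_release": 112},
    )
    assert config.api.rate_limit_per_second == 2

One caveat of the simple merge: a dotted key whose section prefix does not exist in the dumped config raises KeyError before Pydantic validation runs, so a misspelled section name surfaces as KeyError rather than ValidationError.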
description="Maximum API requests per second", + ) + max_retries: int = Field( + default=5, + ge=1, + le=20, + description="Maximum retry attempts for failed requests", + ) + cache_ttl_seconds: int = Field( + default=86400, + ge=0, + description="Cache time-to-live in seconds (0 = infinite)", + ) + timeout_seconds: int = Field( + default=30, + ge=1, + description="Request timeout in seconds", + ) + + +class PipelineConfig(BaseModel): + """Main pipeline configuration.""" + + data_dir: Path = Field( + ..., + description="Directory for storing downloaded data", + ) + cache_dir: Path = Field( + ..., + description="Directory for API response caching", + ) + duckdb_path: Path = Field( + ..., + description="Path to DuckDB database file", + ) + versions: DataSourceVersions = Field( + ..., + description="Data source version information", + ) + api: APIConfig = Field( + ..., + description="API client configuration", + ) + scoring: ScoringWeights = Field( + ..., + description="Scoring weights for evidence layers", + ) + + @field_validator("data_dir", "cache_dir") + @classmethod + def create_directory(cls, v: Path) -> Path: + """Create directory if it doesn't exist.""" + v.mkdir(parents=True, exist_ok=True) + return v + + def config_hash(self) -> str: + """ + Compute SHA-256 hash of the configuration. + + Returns a deterministic hash based on all config values, + useful for tracking config changes and cache invalidation. + """ + # Convert config to dict and serialize deterministically + config_dict = self.model_dump(mode="python") + # Convert Path objects to strings for JSON serialization + config_json = json.dumps( + config_dict, + sort_keys=True, + default=str, + ) + return hashlib.sha256(config_json.encode()).hexdigest() diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..d4839a6 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Tests package diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..74962ba --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,135 @@ +"""Tests for configuration loading and validation.""" + +from pathlib import Path + +import pytest +from pydantic import ValidationError + +from usher_pipeline.config import load_config, load_config_with_overrides +from usher_pipeline.config.schema import PipelineConfig + + +def test_load_valid_config(): + """Test loading valid default configuration.""" + config = load_config("config/default.yaml") + + assert isinstance(config, PipelineConfig) + assert config.versions.ensembl_release == 113 + assert config.versions.gnomad_version == "v4.1" + assert config.api.rate_limit_per_second == 5 + assert config.api.max_retries == 5 + assert config.scoring.gnomad == 0.20 + + +def test_invalid_config_missing_field(tmp_path): + """Test that missing required field raises ValidationError.""" + invalid_config = tmp_path / "invalid.yaml" + invalid_config.write_text(""" +versions: + ensembl_release: 113 +api: + rate_limit_per_second: 5 +scoring: + gnomad: 0.20 +""") + + with pytest.raises(ValidationError) as exc_info: + load_config(invalid_config) + + # Check that error mentions missing field + assert "data_dir" in str(exc_info.value) + + +def test_invalid_ensembl_release(tmp_path): + """Test that ensembl_release < 100 raises ValidationError.""" + invalid_config = tmp_path / "invalid_ensembl.yaml" + invalid_config.write_text(""" +data_dir: data +cache_dir: data/cache +duckdb_path: data/pipeline.duckdb +versions: + ensembl_release: 99 + gnomad_version: v4.1 +api: + 
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..d4839a6
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+# Tests package
diff --git a/tests/test_config.py b/tests/test_config.py
new file mode 100644
index 0000000..74962ba
--- /dev/null
+++ b/tests/test_config.py
@@ -0,0 +1,133 @@
+"""Tests for configuration loading and validation."""
+
+import pytest
+from pydantic import ValidationError
+
+from usher_pipeline.config import load_config, load_config_with_overrides
+from usher_pipeline.config.schema import PipelineConfig
+
+
+def test_load_valid_config():
+    """Test loading valid default configuration."""
+    config = load_config("config/default.yaml")
+
+    assert isinstance(config, PipelineConfig)
+    assert config.versions.ensembl_release == 113
+    assert config.versions.gnomad_version == "v4.1"
+    assert config.api.rate_limit_per_second == 5
+    assert config.api.max_retries == 5
+    assert config.scoring.gnomad == 0.20
+
+
+def test_invalid_config_missing_field(tmp_path):
+    """Test that missing required field raises ValidationError."""
+    invalid_config = tmp_path / "invalid.yaml"
+    invalid_config.write_text("""
+versions:
+  ensembl_release: 113
+api:
+  rate_limit_per_second: 5
+scoring:
+  gnomad: 0.20
+""")
+
+    with pytest.raises(ValidationError) as exc_info:
+        load_config(invalid_config)
+
+    # Check that error mentions missing field
+    assert "data_dir" in str(exc_info.value)
+
+
+def test_invalid_ensembl_release(tmp_path):
+    """Test that ensembl_release < 100 raises ValidationError."""
+    invalid_config = tmp_path / "invalid_ensembl.yaml"
+    invalid_config.write_text("""
+data_dir: data
+cache_dir: data/cache
+duckdb_path: data/pipeline.duckdb
+versions:
+  ensembl_release: 99
+  gnomad_version: v4.1
+api:
+  rate_limit_per_second: 5
+  max_retries: 5
+  cache_ttl_seconds: 86400
+  timeout_seconds: 30
+scoring:
+  gnomad: 0.20
+  expression: 0.20
+  annotation: 0.15
+  localization: 0.15
+  animal_model: 0.15
+  literature: 0.15
+""")
+
+    with pytest.raises(ValidationError) as exc_info:
+        load_config(invalid_config)
+
+    # Check that error mentions ensembl_release constraint
+    error_str = str(exc_info.value)
+    assert "ensembl_release" in error_str
+    assert "greater than or equal to 100" in error_str.lower() or "100" in error_str
+
+
+def test_config_hash_deterministic():
+    """Test that config hash is deterministic and changes with config."""
+    config1 = load_config("config/default.yaml")
+    config2 = load_config("config/default.yaml")
+
+    # Same config should produce same hash
+    assert config1.config_hash() == config2.config_hash()
+
+    # Hash should be SHA-256 (64 hex chars)
+    assert len(config1.config_hash()) == 64
+
+    # Different config should produce different hash
+    config3 = load_config_with_overrides(
+        "config/default.yaml",
+        {"api.rate_limit_per_second": 10},
+    )
+    assert config3.config_hash() != config1.config_hash()
+
+
+def test_config_creates_directories(tmp_path):
+    """Test that loading config creates data and cache directories."""
+    config_file = tmp_path / "test_config.yaml"
+
+    # Use non-existent directories
+    data_dir = tmp_path / "test_data"
+    cache_dir = tmp_path / "test_cache"
+
+    config_file.write_text(f"""
+data_dir: {data_dir}
+cache_dir: {cache_dir}
+duckdb_path: {tmp_path / "test.duckdb"}
+versions:
+  ensembl_release: 113
+  gnomad_version: v4.1
+api:
+  rate_limit_per_second: 5
+  max_retries: 5
+  cache_ttl_seconds: 86400
+  timeout_seconds: 30
+scoring:
+  gnomad: 0.20
+  expression: 0.20
+  annotation: 0.15
+  localization: 0.15
+  animal_model: 0.15
+  literature: 0.15
+""")
+
+    # Directories should not exist before loading
+    assert not data_dir.exists()
+    assert not cache_dir.exists()
+
+    # Load config
+    config = load_config(config_file)
+
+    # Directories should be created
+    assert data_dir.exists()
+    assert cache_dir.exists()
+    assert data_dir.is_dir()
+    assert cache_dir.is_dir()
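To run the suite locally (standard pip/pytest invocations, not commands added by this diff):

    pip install -e ".[dev]"
    pytest

Note that test_load_valid_config and test_config_hash_deterministic resolve config/default.yaml relative to the working directory, so invoke pytest from the repository root; loading the default config also creates data/ and data/cache/ as a side effect of the create_directory validator.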