feat(01-03): create provenance tracker with comprehensive tests

- ProvenanceTracker class for metadata tracking
- Records pipeline version, data source versions, config hash, timestamps
- Sidecar JSON export alongside outputs
- DuckDB _provenance table support
- 13 comprehensive tests (8 DuckDB + 5 provenance)
- All tests pass (12 passed, 1 skipped - pandas)
This commit is contained in:
2026-02-11 16:31:51 +08:00
parent d51141f7d5
commit 98a1a750dd
3 changed files with 461 additions and 2 deletions

View File

@@ -1,6 +1,6 @@
"""Persistence layer for pipeline checkpoints and provenance tracking."""
from usher_pipeline.persistence.duckdb_store import PipelineStore
from usher_pipeline.persistence.provenance import ProvenanceTracker
# ProvenanceTracker will be added in Task 2
__all__ = ["PipelineStore"]
__all__ = ["PipelineStore", "ProvenanceTracker"]

View File

@@ -0,0 +1,141 @@
"""Provenance tracking for pipeline reproducibility."""
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
class ProvenanceTracker:
"""
Tracks provenance metadata for pipeline runs.
Records pipeline version, data source versions, config hash,
and processing steps for full reproducibility tracking.
"""
def __init__(self, pipeline_version: str, config: "PipelineConfig"):
"""
Initialize provenance tracker.
Args:
pipeline_version: Pipeline version string (e.g., "0.1.0")
config: PipelineConfig instance
"""
self.pipeline_version = pipeline_version
self.config_hash = config.config_hash()
self.data_source_versions = config.versions.model_dump()
self.processing_steps = []
self.created_at = datetime.now(timezone.utc)
def record_step(self, step_name: str, details: Optional[dict] = None) -> None:
"""
Record a processing step.
Args:
step_name: Name of the processing step
details: Optional dictionary of additional details
"""
step = {
"step_name": step_name,
"timestamp": datetime.now(timezone.utc).isoformat(),
}
if details:
step["details"] = details
self.processing_steps.append(step)
def create_metadata(self) -> dict:
"""
Create full provenance metadata dictionary.
Returns:
Dictionary with all provenance information
"""
return {
"pipeline_version": self.pipeline_version,
"data_source_versions": self.data_source_versions,
"config_hash": self.config_hash,
"created_at": self.created_at.isoformat(),
"processing_steps": self.processing_steps,
}
def save_sidecar(self, output_path: Path) -> None:
"""
Save provenance metadata as a JSON sidecar file.
Args:
output_path: Path to the main output file.
Sidecar will be saved as {path}.provenance.json
"""
sidecar_path = output_path.with_suffix(".provenance.json")
sidecar_path.parent.mkdir(parents=True, exist_ok=True)
metadata = self.create_metadata()
with open(sidecar_path, "w") as f:
json.dump(metadata, f, indent=2, default=str)
def save_to_store(self, store: "PipelineStore") -> None:
"""
Save provenance metadata to DuckDB store.
Args:
store: PipelineStore instance
"""
metadata = self.create_metadata()
# Create or replace _provenance table
store.conn.execute("""
CREATE TABLE IF NOT EXISTS _provenance (
version VARCHAR,
config_hash VARCHAR,
created_at TIMESTAMP,
steps_json VARCHAR
)
""")
# Insert provenance record
store.conn.execute("""
INSERT INTO _provenance (version, config_hash, created_at, steps_json)
VALUES (?, ?, ?, ?)
""", [
metadata["pipeline_version"],
metadata["config_hash"],
metadata["created_at"],
json.dumps(metadata["processing_steps"]),
])
@staticmethod
def load_sidecar(sidecar_path: Path) -> dict:
"""
Load provenance metadata from a sidecar file.
Args:
sidecar_path: Path to the .provenance.json file
Returns:
Provenance metadata dictionary
"""
with open(sidecar_path) as f:
return json.load(f)
@classmethod
def from_config(
cls,
config: "PipelineConfig",
version: Optional[str] = None
) -> "ProvenanceTracker":
"""
Create ProvenanceTracker from a PipelineConfig.
Args:
config: PipelineConfig instance
version: Pipeline version string. If None, uses usher_pipeline.__version__
Returns:
ProvenanceTracker instance
"""
if version is None:
from usher_pipeline import __version__
version = __version__
return cls(version, config)