feat(05-02): implement reproducibility report module with JSON and Markdown output

- Create ReproducibilityReport dataclass with all metadata fields
- Implement generate_reproducibility_report function
- Extract parameters from PipelineConfig (scoring weights, data versions)
- Capture software environment (Python, polars, duckdb versions)
- Build filtering steps from ProvenanceTracker
- Compute tier statistics from tiered DataFrame
- Support optional validation metrics
- to_json: write indented JSON for machine-readable output
- to_markdown: write Markdown with headers and tables for human-readable output
- 7 tests covering all report fields, formats, and edge cases
2026-02-12 04:00:21 +08:00
parent 4e46b488f1
commit 5af63eab46
2 changed files with 565 additions and 0 deletions
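
A minimal usage sketch of the new module, assuming a PipelineConfig, a scored/tiered Polars DataFrame, and a ProvenanceTracker are already available from earlier pipeline stages (variable names and the output directory below are illustrative):

```python
from pathlib import Path

from usher_pipeline.output.reproducibility import generate_reproducibility_report

# config: PipelineConfig, tiered_df: pl.DataFrame, provenance: ProvenanceTracker
# are assumed to exist from earlier pipeline stages.
report = generate_reproducibility_report(
    config=config,
    tiered_df=tiered_df,
    provenance=provenance,
    validation_result=None,  # optional dict of validation metrics
)

# Machine-readable output for downstream tooling.
report.to_json(Path("reports") / f"report_{report.run_id}.json")

# Human-readable output with headers and tables.
report.to_markdown(Path("reports") / f"report_{report.run_id}.md")
```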


@@ -0,0 +1,320 @@
"""Reproducibility report generation for pipeline runs."""
import json
import sys
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
import duckdb
import polars as pl
from usher_pipeline.config.schema import PipelineConfig
from usher_pipeline.persistence.provenance import ProvenanceTracker
@dataclass
class FilteringStep:
"""Record of a data filtering/processing step."""
step_name: str
input_count: int
output_count: int
criteria: str
@dataclass
class ReproducibilityReport:
"""
Comprehensive reproducibility report for a pipeline run.
Contains all information needed to reproduce the analysis:
- Pipeline version and parameters
- Data source versions
- Software environment
- Filtering steps with gene counts
- Validation metrics
- Tier statistics
"""
run_id: str
timestamp: str
pipeline_version: str
parameters: dict
data_versions: dict
software_environment: dict
filtering_steps: list[FilteringStep] = field(default_factory=list)
validation_metrics: dict = field(default_factory=dict)
tier_statistics: dict = field(default_factory=dict)
def to_dict(self) -> dict:
"""
Convert report to dictionary.
Returns:
Dictionary representation of the report
"""
return {
"run_id": self.run_id,
"timestamp": self.timestamp,
"pipeline_version": self.pipeline_version,
"parameters": self.parameters,
"data_versions": self.data_versions,
"software_environment": self.software_environment,
"filtering_steps": [
{
"step_name": step.step_name,
"input_count": step.input_count,
"output_count": step.output_count,
"criteria": step.criteria,
}
for step in self.filtering_steps
],
"validation_metrics": self.validation_metrics,
"tier_statistics": self.tier_statistics,
}
def to_json(self, path: Path) -> Path:
"""
Write report as JSON file.
Args:
path: Output path for JSON file
Returns:
Path to the written file
"""
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w") as f:
json.dump(self.to_dict(), f, indent=2, default=str)
return path
def to_markdown(self, path: Path) -> Path:
"""
Write report as human-readable Markdown file.
Args:
path: Output path for Markdown file
Returns:
Path to the written file
"""
path.parent.mkdir(parents=True, exist_ok=True)
lines = [
"# Pipeline Reproducibility Report",
"",
f"**Run ID:** `{self.run_id}`",
f"**Timestamp:** {self.timestamp}",
f"**Pipeline Version:** {self.pipeline_version}",
"",
"## Parameters",
"",
"**Scoring Weights:**",
"",
]
# Add scoring weights if available
if "gnomad" in self.parameters:
lines.extend([
f"- gnomAD: {self.parameters['gnomad']:.2f}",
f"- Expression: {self.parameters['expression']:.2f}",
f"- Annotation: {self.parameters['annotation']:.2f}",
f"- Localization: {self.parameters['localization']:.2f}",
f"- Animal Model: {self.parameters['animal_model']:.2f}",
f"- Literature: {self.parameters['literature']:.2f}",
"",
])
# Add data versions
lines.extend([
"## Data Versions",
"",
])
for key, value in self.data_versions.items():
lines.append(f"- **{key}:** {value}")
lines.append("")
# Add software environment
lines.extend([
"## Software Environment",
"",
])
for key, value in self.software_environment.items():
lines.append(f"- **{key}:** {value}")
lines.append("")
# Add filtering steps if available
if self.filtering_steps:
lines.extend([
"## Filtering Steps",
"",
"| Step | Input Count | Output Count | Criteria |",
"|------|-------------|--------------|----------|",
])
for step in self.filtering_steps:
lines.append(
f"| {step.step_name} | {step.input_count} | "
f"{step.output_count} | {step.criteria} |"
)
lines.append("")
# Add tier statistics
lines.extend([
"## Tier Statistics",
"",
f"- **Total Candidates:** {self.tier_statistics.get('total', 0)}",
f"- **HIGH:** {self.tier_statistics.get('high', 0)}",
f"- **MEDIUM:** {self.tier_statistics.get('medium', 0)}",
f"- **LOW:** {self.tier_statistics.get('low', 0)}",
"",
])
# Add validation metrics if available
if self.validation_metrics:
lines.extend([
"## Validation Metrics",
"",
])
for key, value in self.validation_metrics.items():
if isinstance(value, float):
lines.append(f"- **{key}:** {value:.3f}")
else:
lines.append(f"- **{key}:** {value}")
lines.append("")
# Write to file
with open(path, "w") as f:
f.write("\n".join(lines))
return path
def generate_reproducibility_report(
config: PipelineConfig,
tiered_df: pl.DataFrame,
provenance: ProvenanceTracker,
validation_result: dict | None = None,
) -> ReproducibilityReport:
"""
Generate comprehensive reproducibility report.
Args:
config: Pipeline configuration
tiered_df: Scored and tiered DataFrame
provenance: Provenance tracker with processing steps
validation_result: Optional validation results dictionary
Returns:
ReproducibilityReport instance
Notes:
- Extracts parameters from config (scoring weights, data versions)
- Computes tier statistics from tiered_df
- Builds filtering steps from provenance steps
- Captures software versions (Python, polars, duckdb)
- Generates unique run ID
"""
# Generate run ID
run_id = str(uuid.uuid4())
# Get current timestamp
timestamp = datetime.now(timezone.utc).isoformat()
# Extract pipeline version from provenance
pipeline_version = provenance.pipeline_version
# Extract parameters from config
parameters = config.scoring.model_dump()
# Extract data versions from config
data_versions = config.versions.model_dump()
# Build software environment
software_environment = {
"python": sys.version.split()[0],
"polars": pl.__version__,
"duckdb": duckdb.__version__,
}
# Build filtering steps from provenance
filtering_steps = []
for step in provenance.get_steps():
details = step.get("details", {})
# Extract counts if available
input_count = details.get("input_count", 0)
output_count = details.get("output_count", 0)
criteria = details.get("criteria", "")
filtering_steps.append(
FilteringStep(
step_name=step["step_name"],
input_count=input_count,
output_count=output_count,
criteria=criteria,
)
)
# Compute tier statistics
total = tiered_df.height
high = 0
medium = 0
low = 0
if "confidence_tier" in tiered_df.columns:
tier_counts = tiered_df.group_by("confidence_tier").agg(
pl.len().alias("count")
)
for row in tier_counts.to_dicts():
tier = row["confidence_tier"]
count = row["count"]
if tier == "HIGH":
high = count
elif tier == "MEDIUM":
medium = count
elif tier == "LOW":
low = count
tier_statistics = {
"total": total,
"high": high,
"medium": medium,
"low": low,
}
# Extract validation metrics if provided
validation_metrics = {}
if validation_result:
validation_metrics = {
"median_percentile": validation_result.get("median_percentile", 0.0),
"top_quartile_fraction": validation_result.get(
"top_quartile_fraction", 0.0
),
"validation_passed": validation_result.get("validation_passed", False),
}
return ReproducibilityReport(
run_id=run_id,
timestamp=timestamp,
pipeline_version=pipeline_version,
parameters=parameters,
data_versions=data_versions,
software_environment=software_environment,
filtering_steps=filtering_steps,
validation_metrics=validation_metrics,
tier_statistics=tier_statistics,
)
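
Because to_json writes plain indented JSON with stable top-level keys (see to_dict above), reports from two runs can be compared programmatically. A small sketch of that idea, not part of this commit; the helper name and file paths are hypothetical:

```python
import json
from pathlib import Path


def diff_report_parameters(path_a: Path, path_b: Path) -> dict:
    """Return parameter keys whose values differ between two report JSON files."""
    a = json.loads(path_a.read_text())
    b = json.loads(path_b.read_text())
    keys = set(a["parameters"]) | set(b["parameters"])
    return {
        key: (a["parameters"].get(key), b["parameters"].get(key))
        for key in keys
        if a["parameters"].get(key) != b["parameters"].get(key)
    }


# Example: compare scoring weights between two pipeline runs.
# changed = diff_report_parameters(Path("run_a.json"), Path("run_b.json"))
```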


@@ -0,0 +1,245 @@
"""Tests for reproducibility report generation."""
import json
from pathlib import Path
import polars as pl
import pytest
from usher_pipeline.config.schema import (
APIConfig,
DataSourceVersions,
PipelineConfig,
ScoringWeights,
)
from usher_pipeline.output.reproducibility import generate_reproducibility_report
from usher_pipeline.persistence.provenance import ProvenanceTracker
@pytest.fixture
def mock_config(tmp_path):
"""Create mock pipeline configuration."""
return PipelineConfig(
data_dir=tmp_path / "data",
cache_dir=tmp_path / "cache",
duckdb_path=tmp_path / "pipeline.db",
versions=DataSourceVersions(
ensembl_release=113,
gnomad_version="v4.1",
gtex_version="v8",
hpa_version="23.0",
),
api=APIConfig(),
scoring=ScoringWeights(
gnomad=0.20,
expression=0.20,
annotation=0.15,
localization=0.15,
animal_model=0.15,
literature=0.15,
),
)
@pytest.fixture
def mock_provenance(mock_config):
"""Create mock provenance tracker."""
provenance = ProvenanceTracker(
pipeline_version="0.1.0",
config=mock_config,
)
# Record some processing steps
provenance.record_step(
"gene_universe_fetch",
details={
"input_count": 0,
"output_count": 20000,
"criteria": "Human protein-coding genes from Ensembl",
},
)
provenance.record_step(
"gnomad_filtering",
details={
"input_count": 20000,
"output_count": 19500,
"criteria": "Remove genes with quality flags",
},
)
return provenance
@pytest.fixture
def synthetic_tiered_df():
"""Create synthetic tiered DataFrame."""
return pl.DataFrame({
"gene_id": [f"ENSG{i:011d}" for i in range(100)],
"gene_symbol": [f"GENE{i}" for i in range(100)],
"composite_score": [0.1 + i * 0.008 for i in range(100)],
"confidence_tier": (
["HIGH"] * 30 + ["MEDIUM"] * 40 + ["LOW"] * 30
),
})
def test_generate_report_has_all_fields(
mock_config, mock_provenance, synthetic_tiered_df
):
"""Test that report contains all required fields."""
report = generate_reproducibility_report(
config=mock_config,
tiered_df=synthetic_tiered_df,
provenance=mock_provenance,
validation_result=None,
)
# Check all required fields exist
assert report.run_id is not None
assert report.timestamp is not None
assert report.pipeline_version == "0.1.0"
assert report.parameters is not None
assert report.data_versions is not None
assert report.software_environment is not None
assert report.tier_statistics is not None
def test_report_to_json_parseable(
mock_config, mock_provenance, synthetic_tiered_df, tmp_path
):
"""Test that JSON output is valid and parseable."""
report = generate_reproducibility_report(
config=mock_config,
tiered_df=synthetic_tiered_df,
provenance=mock_provenance,
)
json_path = tmp_path / "report.json"
report.to_json(json_path)
# Read back and verify it's valid JSON
with open(json_path) as f:
data = json.load(f)
# Verify expected keys
assert "run_id" in data
assert "timestamp" in data
assert "pipeline_version" in data
assert "parameters" in data
assert "data_versions" in data
assert "software_environment" in data
assert "filtering_steps" in data
assert "tier_statistics" in data
def test_report_to_markdown_has_headers(
mock_config, mock_provenance, synthetic_tiered_df, tmp_path
):
"""Test that Markdown output contains required sections."""
report = generate_reproducibility_report(
config=mock_config,
tiered_df=synthetic_tiered_df,
provenance=mock_provenance,
)
md_path = tmp_path / "report.md"
report.to_markdown(md_path)
# Read content
content = md_path.read_text()
# Verify headers
assert "# Pipeline Reproducibility Report" in content
assert "## Parameters" in content
assert "## Data Versions" in content
assert "## Filtering Steps" in content
assert "## Tier Statistics" in content
assert "## Software Environment" in content
def test_report_tier_statistics_match(
mock_config, mock_provenance, synthetic_tiered_df
):
"""Test that tier statistics match DataFrame counts."""
report = generate_reproducibility_report(
config=mock_config,
tiered_df=synthetic_tiered_df,
provenance=mock_provenance,
)
# Verify total matches
assert report.tier_statistics["total"] == synthetic_tiered_df.height
# Verify tier counts
assert report.tier_statistics["high"] == 30
assert report.tier_statistics["medium"] == 40
assert report.tier_statistics["low"] == 30
# Verify sum
tier_sum = (
report.tier_statistics["high"]
+ report.tier_statistics["medium"]
+ report.tier_statistics["low"]
)
assert tier_sum == report.tier_statistics["total"]
def test_report_includes_validation_when_provided(
mock_config, mock_provenance, synthetic_tiered_df
):
"""Test that validation metrics are included when provided."""
validation_result = {
"median_percentile": 0.85,
"top_quartile_fraction": 0.92,
"validation_passed": True,
}
report = generate_reproducibility_report(
config=mock_config,
tiered_df=synthetic_tiered_df,
provenance=mock_provenance,
validation_result=validation_result,
)
# Verify validation metrics are present
assert "median_percentile" in report.validation_metrics
assert report.validation_metrics["median_percentile"] == 0.85
assert report.validation_metrics["top_quartile_fraction"] == 0.92
assert report.validation_metrics["validation_passed"] is True
def test_report_without_validation(
mock_config, mock_provenance, synthetic_tiered_df
):
"""Test that report generates without error when validation_result is None."""
report = generate_reproducibility_report(
config=mock_config,
tiered_df=synthetic_tiered_df,
provenance=mock_provenance,
validation_result=None,
)
# Should have empty validation metrics
assert report.validation_metrics == {}
def test_report_software_versions(
mock_config, mock_provenance, synthetic_tiered_df
):
"""Test that software environment contains expected keys."""
report = generate_reproducibility_report(
config=mock_config,
tiered_df=synthetic_tiered_df,
provenance=mock_provenance,
)
# Verify software versions are captured
assert "python" in report.software_environment
assert "polars" in report.software_environment
assert "duckdb" in report.software_environment
# Verify they're not empty
assert report.software_environment["python"] != ""
assert report.software_environment["polars"] != ""
assert report.software_environment["duckdb"] != ""