feat(05-02): implement visualization module with matplotlib/seaborn plots

- Add matplotlib>=3.8.0 and seaborn>=0.13.0 to dependencies - Create visualizations.py with 3 plot functions and orchestrator - plot_score_distribution: histogram colored by confidence tier - plot_layer_contributions: bar chart of evidence layer coverage - plot_tier_breakdown: pie chart of tier distribution - Use Agg backend for headless/CLI safety - All plots saved at 300 DPI with proper figure cleanup - 6 tests covering file creation, edge cases, and return values
2026-02-12 03:57:50 +08:00
parent d2ef3a2b84
commit 150417ffcc
3 changed files with 358 additions and 0 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,8 @@ dependencies = [
    "structlog>=25.0",
    "biopython>=1.84",
    "scipy>=1.14",
    "matplotlib>=3.8.0",
    "seaborn>=0.13.0",
 ]
 [project.optional-dependencies]
--- a/src/usher_pipeline/output/visualizations.py
+++ b/src/usher_pipeline/output/visualizations.py
@@ -0,0 +1,244 @@
 """Visualization generation for pipeline outputs."""
 import logging
 from pathlib import Path
 import matplotlib
 import polars as pl
 # Use Agg backend (non-interactive, safe for headless/CLI use)
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt  # noqa: E402
 import seaborn as sns  # noqa: E402
 logger = logging.getLogger(__name__)
 def plot_score_distribution(df: pl.DataFrame, output_path: Path) -> Path:
    """
    Create histogram of composite scores colored by confidence tier.
    Args:
        df: DataFrame with composite_score and confidence_tier columns
        output_path: Path where PNG will be saved
    Returns:
        Path to the saved PNG file
    Notes:
        - Converts to pandas for seaborn compatibility
        - Uses tier-specific color coding (HIGH=green, MEDIUM=orange, LOW=red)
        - Saves at 300 DPI for publication quality
    """
    # Convert to pandas for seaborn
    pdf = df.to_pandas()
    # Set seaborn theme
    sns.set_theme(style="whitegrid", context="paper")
    # Create figure
    fig, ax = plt.subplots(figsize=(10, 6))
    # Create stacked histogram
    sns.histplot(
        data=pdf,
        x="composite_score",
        hue="confidence_tier",
        hue_order=["HIGH", "MEDIUM", "LOW"],
        palette={
            "HIGH": "#2ecc71",
            "MEDIUM": "#f39c12",
            "LOW": "#e74c3c",
        },
        bins=30,
        multiple="stack",
        ax=ax,
    )
    # Add labels
    ax.set_xlabel("Composite Score")
    ax.set_ylabel("Candidate Count")
    ax.set_title("Score Distribution by Confidence Tier")
    # Save figure
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path, dpi=300, bbox_inches="tight")
    # CRITICAL: Close figure to prevent memory leak
    plt.close(fig)
    logger.info(f"Saved score distribution plot to {output_path}")
    return output_path
 def plot_layer_contributions(df: pl.DataFrame, output_path: Path) -> Path:
    """
    Create bar chart showing evidence layer coverage.
    Args:
        df: DataFrame with layer score columns
        output_path: Path where PNG will be saved
    Returns:
        Path to the saved PNG file
    Notes:
        - Counts non-null values per layer
        - Shows which layers have the most/least coverage
    """
    # Define layer columns
    layer_columns = [
        "gnomad_score",
        "expression_score",
        "annotation_score",
        "localization_score",
        "animal_model_score",
        "literature_score",
    ]
    # Count non-null values per layer
    layer_counts = {}
    for col in layer_columns:
        if col in df.columns:
            count = df.filter(pl.col(col).is_not_null()).height
            # Clean label (remove "_score" suffix)
            label = col.replace("_score", "").replace("_", " ").title()
            layer_counts[label] = count
    # Create figure
    fig, ax = plt.subplots(figsize=(10, 6))
    # Create bar chart
    labels = list(layer_counts.keys())
    values = list(layer_counts.values())
    sns.barplot(x=labels, y=values, hue=labels, palette="viridis", ax=ax, legend=False)
    # Rotate labels
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    # Add labels
    ax.set_xlabel("Evidence Layer")
    ax.set_ylabel("Candidates with Evidence")
    ax.set_title("Evidence Layer Coverage")
    # Save figure
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path, dpi=300, bbox_inches="tight")
    # Close figure
    plt.close(fig)
    logger.info(f"Saved layer contributions plot to {output_path}")
    return output_path
 def plot_tier_breakdown(df: pl.DataFrame, output_path: Path) -> Path:
    """
    Create pie chart showing tier distribution.
    Args:
        df: DataFrame with confidence_tier column
        output_path: Path where PNG will be saved
    Returns:
        Path to the saved PNG file
    Notes:
        - Shows percentage breakdown of HIGH/MEDIUM/LOW tiers
        - Uses same color scheme as score distribution plot
    """
    # Count genes per tier
    if "confidence_tier" in df.columns:
        tier_counts = df.group_by("confidence_tier").agg(
            pl.len().alias("count")
        )
        tier_dict = {
            row["confidence_tier"]: row["count"]
            for row in tier_counts.to_dicts()
        }
    else:
        tier_dict = {}
    # Create figure
    fig, ax = plt.subplots(figsize=(8, 8))
    # Define tier order and colors
    tiers = ["HIGH", "MEDIUM", "LOW"]
    colors = ["#2ecc71", "#f39c12", "#e74c3c"]
    # Get counts in order (0 if tier not present)
    counts = [tier_dict.get(tier, 0) for tier in tiers]
    # Create pie chart
    ax.pie(
        counts,
        labels=tiers,
        colors=colors,
        autopct="%1.1f%%",
        startangle=90,
    )
    ax.set_title("Candidate Tier Breakdown")
    # Save figure
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path, dpi=300, bbox_inches="tight")
    # Close figure
    plt.close(fig)
    logger.info(f"Saved tier breakdown plot to {output_path}")
    return output_path
 def generate_all_plots(df: pl.DataFrame, output_dir: Path) -> dict[str, Path]:
    """
    Generate all visualization plots.
    Args:
        df: DataFrame with scoring results
        output_dir: Directory where plots will be saved
    Returns:
        Dictionary mapping plot name to file path
    Notes:
        - Creates output directory if needed
        - Wraps each plot in try/except to continue on individual failures
        - Uses standard filenames for each plot type
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    plots = {}
    # Plot 1: Score distribution
    try:
        plots["score_distribution"] = plot_score_distribution(
            df,
            output_dir / "score_distribution.png",
        )
    except Exception as e:
        logger.warning(f"Failed to create score distribution plot: {e}")
    # Plot 2: Layer contributions
    try:
        plots["layer_contributions"] = plot_layer_contributions(
            df,
            output_dir / "layer_contributions.png",
        )
    except Exception as e:
        logger.warning(f"Failed to create layer contributions plot: {e}")
    # Plot 3: Tier breakdown
    try:
        plots["tier_breakdown"] = plot_tier_breakdown(
            df,
            output_dir / "tier_breakdown.png",
        )
    except Exception as e:
        logger.warning(f"Failed to create tier breakdown plot: {e}")
    logger.info(f"Generated {len(plots)} plots in {output_dir}")
    return plots
--- a/tests/test_visualizations.py
+++ b/tests/test_visualizations.py
@@ -0,0 +1,112 @@
 """Tests for visualization generation."""
 from pathlib import Path
 import polars as pl
 import pytest
 from usher_pipeline.output.visualizations import (
    generate_all_plots,
    plot_layer_contributions,
    plot_score_distribution,
    plot_tier_breakdown,
 )
@pytest.fixture
 def synthetic_results_df():
    """Create synthetic scored results DataFrame."""
    return pl.DataFrame({
        "gene_symbol": [f"GENE{i}" for i in range(30)],
        "composite_score": [0.1 + i * 0.03 for i in range(30)],
        "confidence_tier": (
            ["HIGH"] * 10 + ["MEDIUM"] * 10 + ["LOW"] * 10
        ),
        "gnomad_score": [0.5 if i % 2 == 0 else None for i in range(30)],
        "expression_score": [0.6 if i % 3 == 0 else None for i in range(30)],
        "annotation_score": [0.7 if i % 4 == 0 else None for i in range(30)],
        "localization_score": [0.8 if i % 5 == 0 else None for i in range(30)],
        "animal_model_score": [0.9 if i % 6 == 0 else None for i in range(30)],
        "literature_score": [0.85 if i % 7 == 0 else None for i in range(30)],
    })
 def test_plot_score_distribution_creates_file(synthetic_results_df, tmp_path):
    """Test that score distribution plot creates a PNG file."""
    output_path = tmp_path / "score_dist.png"
    result = plot_score_distribution(synthetic_results_df, output_path)
    assert result == output_path
    assert output_path.exists()
    assert output_path.stat().st_size > 0
 def test_plot_layer_contributions_creates_file(synthetic_results_df, tmp_path):
    """Test that layer contributions plot creates a PNG file."""
    output_path = tmp_path / "layer_contrib.png"
    result = plot_layer_contributions(synthetic_results_df, output_path)
    assert result == output_path
    assert output_path.exists()
    assert output_path.stat().st_size > 0
 def test_plot_tier_breakdown_creates_file(synthetic_results_df, tmp_path):
    """Test that tier breakdown plot creates a PNG file."""
    output_path = tmp_path / "tier_breakdown.png"
    result = plot_tier_breakdown(synthetic_results_df, output_path)
    assert result == output_path
    assert output_path.exists()
    assert output_path.stat().st_size > 0
 def test_generate_all_plots_creates_all_files(synthetic_results_df, tmp_path):
    """Test that generate_all_plots creates all 3 PNG files."""
    output_dir = tmp_path / "plots"
    plots = generate_all_plots(synthetic_results_df, output_dir)
    # Check all files exist
    assert (output_dir / "score_distribution.png").exists()
    assert (output_dir / "layer_contributions.png").exists()
    assert (output_dir / "tier_breakdown.png").exists()
 def test_generate_all_plots_returns_paths(synthetic_results_df, tmp_path):
    """Test that generate_all_plots returns dict with 3 entries."""
    output_dir = tmp_path / "plots"
    plots = generate_all_plots(synthetic_results_df, output_dir)
    assert len(plots) == 3
    assert "score_distribution" in plots
    assert "layer_contributions" in plots
    assert "tier_breakdown" in plots
 def test_plots_handle_empty_dataframe(tmp_path):
    """Test that plots handle empty DataFrames without crashing."""
    empty_df = pl.DataFrame({
        "gene_symbol": [],
        "composite_score": [],
        "confidence_tier": [],
        "gnomad_score": [],
        "expression_score": [],
        "annotation_score": [],
        "localization_score": [],
        "animal_model_score": [],
        "literature_score": [],
    })
    output_dir = tmp_path / "empty_plots"
    # Should not crash
    plots = generate_all_plots(empty_df, output_dir)
    # At minimum, the function should return without error
    # Some plots may succeed (empty plot) or fail gracefully
    assert isinstance(plots, dict)