Complete OpenCV Method 3 implementation with 86.5% handwriting retention

- Implemented comprehensive feature analysis based on size, stroke length, and regularity (sketched after this list)
- Size-based scoring: height >50px indicates handwriting
- Stroke-length ratio: >0.4 indicates handwriting
- Irregularity metrics: low compactness/solidity indicate handwriting
- Successfully tested on sample PDF with 2 signatures (楊智惠, 張志銘)
- Created detailed documentation: CURRENT_STATUS.md and NEW_SESSION_HANDOFF.md
- Stable PaddleOCR 2.7.3 configuration documented (numpy 1.26.4, opencv 4.6.0.66); install pins sketched below
- Prepared research plan for PP-OCRv5 upgrade investigation
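
Only one of the six changed files is reproduced below, so the Method 3 scorer itself is not shown here. As a rough illustration of the thresholds listed above, a scorer could look like the following sketch. The function name, the equal weighting, and the compactness/solidity cutoffs are illustrative assumptions; only the height >50px and stroke-ratio >0.4 thresholds come from this commit message.

```python
import cv2
import numpy as np

def score_handwriting_features(region_gray: np.ndarray) -> float:
    """Illustrative Method 3 scorer (NOT the committed implementation).

    Height > 50px and stroke ratio > 0.4 follow the commit message; the
    0.3/0.7 irregularity cutoffs and the equal weighting are assumptions
    for this sketch. Expects a grayscale uint8 region.
    """
    score = 0.0
    h, w = region_gray.shape

    # Size: printed body text is short; tall regions suggest handwriting
    if h > 50:
        score += 1.0

    # Binarize (ink = foreground) and collect contours
    _, binary = cv2.threshold(region_gray, 0, 255,
                              cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_NONE)
    if contours:
        # Stroke-length ratio, here approximated as total contour
        # perimeter over region width (the exact definition is assumed)
        total_perimeter = sum(cv2.arcLength(c, True) for c in contours)
        if total_perimeter / max(w, 1) > 0.4:
            score += 1.0

        # Irregularity: handwriting tends to have low compactness
        # (4*pi*A / P^2) and low solidity (A / convex-hull area)
        largest = max(contours, key=cv2.contourArea)
        area = cv2.contourArea(largest)
        perimeter = cv2.arcLength(largest, True)
        compactness = 4 * np.pi * area / (perimeter ** 2) if perimeter else 0
        hull_area = cv2.contourArea(cv2.convexHull(largest))
        solidity = area / hull_area if hull_area else 0
        if compactness < 0.3 and solidity < 0.7:  # assumed cutoffs
            score += 1.0

    return score / 3.0  # normalized: 1.0 = all three cues fired
```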
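The pinned environment from the last documentation bullet can be restored with explicit version pins; the PyPI package name `opencv-python` is assumed here (the bullet names only "opencv 4.6.0.66"):

```
pip install paddleocr==2.7.3 numpy==1.26.4 opencv-python==4.6.0.66
```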

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 10:35:46 +08:00
parent 479d4e0019
commit 8f231da3bc
6 changed files with 1718 additions and 0 deletions

#!/usr/bin/env python3
"""
PaddleOCR Signature Extraction - Improved Pipeline
Implements:
- Method B: Region Merging (merge nearby regions to avoid splits)
- Method E: Two-Stage Approach (second OCR pass on regions)
Pipeline:
1. PaddleOCR detects printed text on full page
2. Mask printed text with padding
3. Detect candidate regions
4. Merge nearby regions (METHOD B)
5. For each region: Run OCR again to remove remaining printed text (METHOD E)
6. VLM verification (optional)
7. Save cleaned handwriting regions
"""
import fitz # PyMuPDF
import numpy as np
import cv2
from pathlib import Path
from paddleocr_client import create_ocr_client
from typing import List, Dict, Tuple
import base64
import requests
# Configuration
TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf"
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_improved"
DPI = 300
# PaddleOCR Settings
MASKING_PADDING = 25 # Pixels to expand text boxes when masking
# Region Detection Parameters
MIN_REGION_AREA = 3000
MAX_REGION_AREA = 300000
MIN_ASPECT_RATIO = 0.3
MAX_ASPECT_RATIO = 15.0
# Region Merging Parameters (METHOD B)
MERGE_DISTANCE_HORIZONTAL = 100 # pixels
MERGE_DISTANCE_VERTICAL = 50 # pixels
# VLM Settings (optional)
USE_VLM_VERIFICATION = False # Set to True to enable VLM filtering
OLLAMA_URL = "http://192.168.30.36:11434"
OLLAMA_MODEL = "qwen2.5vl:32b"
def merge_nearby_regions(regions: List[Dict],
h_distance: int = 100,
v_distance: int = 50) -> List[Dict]:
"""
Merge regions that are close to each other (METHOD B).
Args:
regions: List of region dicts with 'box': (x, y, w, h)
h_distance: Maximum horizontal distance between regions to merge
v_distance: Maximum vertical distance between regions to merge
Returns:
List of merged regions
"""
if not regions:
return []
# Sort regions by y-coordinate (top to bottom)
regions = sorted(regions, key=lambda r: r['box'][1])
merged = []
skip_indices = set()
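    # Greedy single pass: once a region joins a merge group it is skipped,
    # and the grown bounding box is only compared against later regions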
for i, region1 in enumerate(regions):
if i in skip_indices:
continue
x1, y1, w1, h1 = region1['box']
# Find all regions that should merge with this one
merge_group = [region1]
for j, region2 in enumerate(regions[i+1:], start=i+1):
if j in skip_indices:
continue
x2, y2, w2, h2 = region2['box']
# Calculate distances
# Horizontal distance: gap between boxes horizontally
h_dist = max(0, max(x1, x2) - min(x1 + w1, x2 + w2))
# Vertical distance: gap between boxes vertically
v_dist = max(0, max(y1, y2) - min(y1 + h1, y2 + h2))
# Check if regions are close enough to merge
if h_dist <= h_distance and v_dist <= v_distance:
merge_group.append(region2)
skip_indices.add(j)
                # Expand the running bounding box to cover the new region.
                # Compute the right/bottom edges BEFORE moving the origin,
                # otherwise the old width/height is measured from the new x1/y1
                right = max(x1 + w1, x2 + w2)
                bottom = max(y1 + h1, y2 + h2)
                x1 = min(x1, x2)
                y1 = min(y1, y2)
                w1 = right - x1
                h1 = bottom - y1
# Create merged region
merged_box = (x1, y1, w1, h1)
merged_area = w1 * h1
merged_aspect = w1 / h1 if h1 > 0 else 0
merged.append({
'box': merged_box,
'area': merged_area,
'aspect_ratio': merged_aspect,
'merged_count': len(merge_group)
})
return merged
def clean_region_with_ocr(region_image: np.ndarray,
ocr_client,
padding: int = 10) -> np.ndarray:
"""
Remove printed text from a region using second OCR pass (METHOD E).
Args:
region_image: The region image to clean
ocr_client: PaddleOCR client
padding: Padding around detected text boxes
Returns:
Cleaned region with printed text masked
"""
try:
# Run OCR on this specific region
text_boxes = ocr_client.get_text_boxes(region_image)
if not text_boxes:
return region_image # No text found, return as-is
# Mask detected printed text
cleaned = region_image.copy()
for (x, y, w, h) in text_boxes:
# Add padding
x_pad = max(0, x - padding)
y_pad = max(0, y - padding)
w_pad = min(cleaned.shape[1] - x_pad, w + 2*padding)
h_pad = min(cleaned.shape[0] - y_pad, h + 2*padding)
cv2.rectangle(cleaned, (x_pad, y_pad),
(x_pad + w_pad, y_pad + h_pad),
(255, 255, 255), -1) # Fill with white
return cleaned
except Exception as e:
print(f" Warning: OCR cleaning failed: {e}")
return region_image
def verify_handwriting_with_vlm(image: np.ndarray) -> Tuple[bool, float]:
"""
Use VLM to verify if image contains handwriting.
Args:
image: Region image (RGB numpy array)
Returns:
(is_handwriting: bool, confidence: float)
"""
try:
# Convert image to base64
from PIL import Image
from io import BytesIO
pil_image = Image.fromarray(image.astype(np.uint8))
buffered = BytesIO()
pil_image.save(buffered, format="PNG")
image_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
# Ask VLM
prompt = """Does this image contain handwritten text or a handwritten signature?
Answer only 'yes' or 'no', followed by a confidence score 0-100.
Format: yes 95 OR no 80"""
payload = {
"model": OLLAMA_MODEL,
"prompt": prompt,
"images": [image_base64],
"stream": False
}
response = requests.post(f"{OLLAMA_URL}/api/generate",
json=payload, timeout=30)
response.raise_for_status()
answer = response.json()['response'].strip().lower()
# Parse answer
is_handwriting = 'yes' in answer
# Try to extract confidence
confidence = 0.5
parts = answer.split()
for part in parts:
try:
conf = float(part)
if 0 <= conf <= 100:
confidence = conf / 100
break
            except ValueError:
                # Token was not a number (e.g. the 'yes'/'no' word); keep scanning
                continue
return is_handwriting, confidence
except Exception as e:
print(f" Warning: VLM verification failed: {e}")
return True, 0.5 # Default to accepting the region
print("="*80)
print("PaddleOCR Improved Pipeline - Region Merging + Two-Stage Cleaning")
print("="*80)
# Create output directory
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
# Step 1: Connect to PaddleOCR
print("\n1. Connecting to PaddleOCR server...")
try:
ocr_client = create_ocr_client()
print(f" ✅ Connected: {ocr_client.server_url}")
except Exception as e:
print(f" ❌ Error: {e}")
exit(1)
# Step 2: Render PDF
print("\n2. Rendering PDF...")
try:
doc = fitz.open(TEST_PDF)
page = doc[0]
mat = fitz.Matrix(DPI/72, DPI/72)
pix = page.get_pixmap(matrix=mat)
original_image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.height, pix.width, pix.n)
if pix.n == 4:
original_image = cv2.cvtColor(original_image, cv2.COLOR_RGBA2RGB)
print(f" ✅ Rendered: {original_image.shape[1]}x{original_image.shape[0]}")
doc.close()
except Exception as e:
print(f" ❌ Error: {e}")
exit(1)
# Step 3: Detect printed text (Stage 1)
print("\n3. Detecting printed text (Stage 1 OCR)...")
try:
text_boxes = ocr_client.get_text_boxes(original_image)
print(f" ✅ Detected {len(text_boxes)} text regions")
except Exception as e:
print(f" ❌ Error: {e}")
exit(1)
# Step 4: Mask printed text with padding
print(f"\n4. Masking printed text (padding={MASKING_PADDING}px)...")
try:
masked_image = original_image.copy()
for (x, y, w, h) in text_boxes:
# Add padding
x_pad = max(0, x - MASKING_PADDING)
y_pad = max(0, y - MASKING_PADDING)
w_pad = min(masked_image.shape[1] - x_pad, w + 2*MASKING_PADDING)
h_pad = min(masked_image.shape[0] - y_pad, h + 2*MASKING_PADDING)
        # Fill with white so masked text reads as background when
        # thresholding for candidate regions in Step 5 (a black fill
        # would turn every masked text box into a foreground blob)
        cv2.rectangle(masked_image, (x_pad, y_pad),
                      (x_pad + w_pad, y_pad + h_pad), (255, 255, 255), -1)
print(f" ✅ Masked {len(text_boxes)} regions")
except Exception as e:
print(f" ❌ Error: {e}")
exit(1)
# Step 5: Detect candidate regions
print("\n5. Detecting candidate regions...")
try:
gray = cv2.cvtColor(masked_image, cv2.COLOR_RGB2GRAY)
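    # The page background is near-white; thresholding at 250 keeps any
    # remaining (darker) ink as foreground for contour detection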
_, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY_INV)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=2)
contours, _ = cv2.findContours(morphed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
candidate_regions = []
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
area = w * h
aspect_ratio = w / h if h > 0 else 0
if (MIN_REGION_AREA <= area <= MAX_REGION_AREA and
MIN_ASPECT_RATIO <= aspect_ratio <= MAX_ASPECT_RATIO):
candidate_regions.append({
'box': (x, y, w, h),
'area': area,
'aspect_ratio': aspect_ratio
})
print(f" ✅ Found {len(candidate_regions)} candidate regions")
except Exception as e:
print(f" ❌ Error: {e}")
exit(1)
# Step 6: Merge nearby regions (METHOD B)
print(f"\n6. Merging nearby regions (h_dist<={MERGE_DISTANCE_HORIZONTAL}, v_dist<={MERGE_DISTANCE_VERTICAL})...")
try:
merged_regions = merge_nearby_regions(
candidate_regions,
h_distance=MERGE_DISTANCE_HORIZONTAL,
v_distance=MERGE_DISTANCE_VERTICAL
)
print(f" ✅ Merged {len(candidate_regions)}{len(merged_regions)} regions")
for i, region in enumerate(merged_regions):
if region['merged_count'] > 1:
print(f" Region {i+1}: Merged {region['merged_count']} sub-regions")
except Exception as e:
print(f" ❌ Error: {e}")
import traceback
traceback.print_exc()
exit(1)
# Step 7: Extract and clean each region (METHOD E)
print("\n7. Extracting and cleaning regions (Stage 2 OCR)...")
final_signatures = []
for i, region in enumerate(merged_regions):
x, y, w, h = region['box']
print(f"\n Region {i+1}/{len(merged_regions)}: ({x}, {y}, {w}, {h})")
# Extract region from ORIGINAL image (not masked)
padding = 10
x_pad = max(0, x - padding)
y_pad = max(0, y - padding)
w_pad = min(original_image.shape[1] - x_pad, w + 2*padding)
h_pad = min(original_image.shape[0] - y_pad, h + 2*padding)
region_img = original_image[y_pad:y_pad+h_pad, x_pad:x_pad+w_pad].copy()
print(f" - Extracted: {region_img.shape[1]}x{region_img.shape[0]}px")
# Clean with second OCR pass
print(f" - Running Stage 2 OCR to remove printed text...")
cleaned_region = clean_region_with_ocr(region_img, ocr_client, padding=5)
# VLM verification (optional)
if USE_VLM_VERIFICATION:
print(f" - VLM verification...")
is_handwriting, confidence = verify_handwriting_with_vlm(cleaned_region)
print(f" - VLM says: {'✅ Handwriting' if is_handwriting else '❌ Not handwriting'} (confidence: {confidence:.2f})")
if not is_handwriting:
print(f" - Skipping (not handwriting)")
continue
# Save
final_signatures.append({
'image': cleaned_region,
'box': region['box'],
'original_image': region_img
})
print(f" ✅ Kept as signature candidate")
print(f"\n ✅ Final signatures: {len(final_signatures)}")
# Step 8: Save results
print("\n8. Saving results...")
for i, sig in enumerate(final_signatures):
# Save cleaned signature
sig_path = Path(OUTPUT_DIR) / f"signature_{i+1:02d}_cleaned.png"
cv2.imwrite(str(sig_path), cv2.cvtColor(sig['image'], cv2.COLOR_RGB2BGR))
# Save original region for comparison
orig_path = Path(OUTPUT_DIR) / f"signature_{i+1:02d}_original.png"
cv2.imwrite(str(orig_path), cv2.cvtColor(sig['original_image'], cv2.COLOR_RGB2BGR))
print(f" 📁 Signature {i+1}: {sig_path.name}")
# Save visualizations
vis_merged = original_image.copy()
kept_boxes = {s['box'] for s in final_signatures}  # boxes kept after filtering
for region in merged_regions:
    x, y, w, h = region['box']
    # Red outline for kept signatures, grey for discarded candidates
    # (the original dict-literal membership test never matched)
    color = (255, 0, 0) if region['box'] in kept_boxes else (128, 128, 128)
    cv2.rectangle(vis_merged, (x, y), (x + w, y + h), color, 3)
vis_path = Path(OUTPUT_DIR) / "visualization_merged_regions.png"
cv2.imwrite(str(vis_path), cv2.cvtColor(vis_merged, cv2.COLOR_RGB2BGR))
print(f" 📁 Visualization: {vis_path.name}")
print("\n" + "="*80)
print("Pipeline completed!")
print(f"Results: {OUTPUT_DIR}")
print("="*80)
print(f"\nSummary:")
print(f" - Stage 1 OCR: {len(text_boxes)} text regions masked")
print(f" - Initial candidates: {len(candidate_regions)}")
print(f" - After merging: {len(merged_regions)}")
print(f" - Final signatures: {len(final_signatures)}")
print(f" - Expected signatures: 2 (楊智惠, 張志銘)")
print("="*80)