From 479d4e00199a12fce35a412a51b48801121e6400 Mon Sep 17 00:00:00 2001 From: gbanyan Date: Tue, 28 Oct 2025 22:28:18 +0800 Subject: [PATCH] Add PaddleOCR masking and region detection pipeline - Created PaddleOCR client for remote server communication - Implemented text masking + region detection pipeline - Test results: 100% recall on sample PDF (found both signatures) - Identified issues: split regions, printed text not fully masked - Documented 5 solution options in PADDLEOCR_STATUS.md - Next: Implement region merging and two-stage cleaning --- PADDLEOCR_STATUS.md | 475 ++++++++++++++++++++++++++++++++++ check_rejected_for_missing.py | 75 ++++++ paddleocr_client.py | 169 ++++++++++++ test_mask_and_detect.py | 216 ++++++++++++++++ test_paddleocr.py | 102 ++++++++ test_paddleocr_client.py | 81 ++++++ 6 files changed, 1118 insertions(+) create mode 100644 PADDLEOCR_STATUS.md create mode 100644 check_rejected_for_missing.py create mode 100644 paddleocr_client.py create mode 100644 test_mask_and_detect.py create mode 100644 test_paddleocr.py create mode 100644 test_paddleocr_client.py diff --git a/PADDLEOCR_STATUS.md b/PADDLEOCR_STATUS.md new file mode 100644 index 0000000..3dad0b8 --- /dev/null +++ b/PADDLEOCR_STATUS.md @@ -0,0 +1,475 @@ +# PaddleOCR Signature Extraction - Status & Options + +**Date**: October 28, 2025 +**Branch**: `PaddleOCR-Cover` +**Current Stage**: Masking + Region Detection Working, Refinement Needed + +--- + +## Current Approach Overview + +**Strategy**: PaddleOCR masks printed text → Detect remaining regions → VLM verification + +### Pipeline Steps + +``` +1. PaddleOCR (Linux server 192.168.30.36:5555) + └─> Detect printed text bounding boxes + +2. OpenCV Masking (Local) + └─> Black out all printed text areas + +3. Region Detection (Local) + └─> Find non-white areas (potential handwriting) + +4. 
VLM Verification (TODO) + └─> Confirm which regions are handwritten signatures +``` + +--- + +## Test Results (File: 201301_1324_AI1_page3.pdf) + +### Performance + +| Metric | Value | +|--------|-------| +| Printed text regions masked | 26 | +| Candidate regions detected | 12 | +| Actual signatures found | 2 ✅ | +| False positives (printed text) | 9 | +| Split signatures | 1 (Region 5 might be part of Region 4) | + +### Success + +✅ **PaddleOCR detected most printed text** (26 regions) +✅ **Masking works correctly** (black rectangles) +✅ **Region detection found both signatures** (regions 2, 4) +✅ **No false negatives** (didn't miss any signatures) + +### Issues Identified + +❌ **Problem 1: Handwriting Split Into Multiple Regions** +- Some signatures may be split into 2+ separate regions +- Example: Region 4 and Region 5 might be parts of same signature area +- Caused by gaps between handwritten strokes after masking + +❌ **Problem 2: Printed Name + Handwritten Signature Mixed** +- Region 2: Contains "張 志 銘" (printed) + handwritten signature +- Region 4: Contains "楊 智 惠" (printed) + handwritten signature +- PaddleOCR missed these printed names, so they weren't masked +- Final output includes both printed and handwritten parts + +❌ **Problem 3: Printed Text Not Masked by PaddleOCR** +- 9 regions contain printed text that PaddleOCR didn't detect +- These became false positive candidates +- Examples: dates, company names, paragraph text +- Shows PaddleOCR's detection isn't 100% complete + +--- + +## Proposed Solutions + +### Problem 1: Split Signatures + +#### Option A: More Aggressive Morphology ⭐ EASY +**Approach**: Increase kernel size and iterations to connect nearby strokes + +```python +# Current settings: +kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) +morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=2) + +# Proposed settings: +kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15)) # 3x larger +morphed = 
cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=5) # More iterations +``` + +**Pros**: +- Simple one-line change +- Connects nearby strokes automatically +- Fast execution + +**Cons**: +- May merge unrelated regions if too aggressive +- Need to tune parameters carefully +- Could lose fine details + +**Recommendation**: ⭐ Try first - easiest to implement and test + +--- + +#### Option B: Region Merging After Detection ⭐⭐ MEDIUM (RECOMMENDED) +**Approach**: After detecting all regions, merge those that are close together + +```python +def merge_nearby_regions(regions, distance_threshold=50): + """ + Merge regions that are within distance_threshold pixels of each other. + + Args: + regions: List of region dicts with 'box' (x, y, w, h) + distance_threshold: Maximum pixels between regions to merge + + Returns: + List of merged regions + """ + # Algorithm: + # 1. Calculate distance between all region pairs + # 2. If distance < threshold, merge their bounding boxes + # 3. Repeat until no more merges possible + + merged = [] + # Implementation here... 
+ return merged +``` + +**Pros**: +- Keeps signatures together intelligently +- Won't merge distant unrelated regions +- Preserves original stroke details +- Can use vertical/horizontal distance separately + +**Cons**: +- Need to tune distance threshold +- More complex than Option A +- May need multiple merge passes + +**Recommendation**: ⭐⭐ **Best balance** - implement this first + +--- + +#### Option C: Don't Split - Extract Larger Context ⭐ EASY +**Approach**: When extracting regions, add significant padding to capture full context + +```python +# Current: padding = 10 pixels +padding = 50 # Much larger padding + +# Or: Merge all regions in the bottom 20% of page +# (signatures are usually at the bottom) +``` + +**Pros**: +- Guaranteed to capture complete signatures +- Very simple to implement +- No risk of losing parts + +**Cons**: +- May include extra unwanted content +- Larger image files +- Makes VLM verification more complex + +**Recommendation**: ⭐ Use as fallback if B doesn't work + +--- + +### Problem 2: Printed + Handwritten in Same Region + +#### Option A: Expand PaddleOCR Masking Boxes ⭐ EASY +**Approach**: Add padding when masking text boxes to catch edges + +```python +padding = 20 # pixels + +for (x, y, w, h) in text_boxes: + # Expand box in all directions + x_pad = max(0, x - padding) + y_pad = max(0, y - padding) + w_pad = min(image.shape[1] - x_pad, w + 2*padding) + h_pad = min(image.shape[0] - y_pad, h + 2*padding) + + cv2.rectangle(masked_image, (x_pad, y_pad), + (x_pad + w_pad, y_pad + h_pad), (0, 0, 0), -1) +``` + +**Pros**: +- Very simple - one parameter change +- Catches text edges and nearby text +- Fast execution + +**Cons**: +- If padding too large, may mask handwriting +- If padding too small, still misses text +- Hard to find perfect padding value + +**Recommendation**: ⭐ Quick test - try with padding=20-30 + +--- + +#### Option B: Run PaddleOCR Again on Each Region ⭐⭐ MEDIUM +**Approach**: Second-pass OCR on extracted regions to find 
remaining printed text + +```python +def clean_region(region_image, ocr_client): + """ + Remove any remaining printed text from a region. + + Args: + region_image: Extracted candidate region + ocr_client: PaddleOCR client + + Returns: + Cleaned image with only handwriting + """ + # Run OCR on this specific region + text_boxes = ocr_client.get_text_boxes(region_image) + + # Mask any detected printed text + cleaned = region_image.copy() + for (x, y, w, h) in text_boxes: + cv2.rectangle(cleaned, (x, y), (x+w, y+h), (0, 0, 0), -1) + + return cleaned +``` + +**Pros**: +- Very accurate - catches printed text PaddleOCR missed initially +- Clean separation of printed vs handwritten +- No manual tuning needed + +**Cons**: +- 2x slower (OCR call per region) +- May occasionally mask handwritten text if it looks printed +- More complex pipeline + +**Recommendation**: ⭐⭐ Good option if masking padding isn't enough + +--- + +#### Option C: Computer Vision Stroke Analysis ⭐⭐⭐ HARD +**Approach**: Analyze stroke characteristics to distinguish printed vs handwritten + +```python +def separate_printed_handwritten(region_image): + """ + Use CV techniques to separate printed from handwritten. + + Techniques: + - Stroke width analysis (printed = uniform, handwritten = variable) + - Edge detection + smoothness (printed = sharp, handwritten = organic) + - Connected component analysis + - Hough line detection (printed = straight, handwritten = curved) + """ + # Complex implementation... 
+ pass +``` + +**Pros**: +- No API calls needed (fast) +- Can work when OCR fails +- Learns patterns in data + +**Cons**: +- Very complex to implement +- May not be reliable across different documents +- Requires significant tuning +- Hard to maintain + +**Recommendation**: ❌ Skip for now - too complex, uncertain results + +--- + +#### Option D: VLM Crop Guidance ⚠️ RISKY +**Approach**: Ask VLM to provide coordinates of handwriting location + +```python +prompt = """ +This image contains both printed and handwritten text. +Where is the handwritten signature located? +Provide coordinates as: x_start, y_start, x_end, y_end +""" + +# VLM returns coordinates +# Crop to that region only +``` + +**Pros**: +- VLM understands visual context +- Can distinguish printed vs handwritten + +**Cons**: +- **VLM coordinates are unreliable** (32% offset discovered in previous tests!) +- This was the original problem that led to PaddleOCR approach +- May extract wrong region + +**Recommendation**: ❌ **DO NOT USE** - VLM coordinates proven unreliable + +--- + +#### Option E: Two-Stage Hybrid Approach ⭐⭐⭐ BEST (RECOMMENDED) +**Approach**: Combine detection with targeted cleaning + +```python +def extract_signatures_twostage(pdf_path): + """ + Stage 1: Detect candidate regions (current pipeline) + Stage 2: Clean each region + """ + # Stage 1: Full page processing + image = render_pdf(pdf_path) + text_boxes = ocr_client.get_text_boxes(image) + masked_image = mask_text_regions(image, text_boxes, padding=20) + candidate_regions = detect_regions(masked_image) + + # Stage 2: Per-region cleaning + signatures = [] + for region_box in candidate_regions: + # Extract region from ORIGINAL image (not masked) + region_img = extract_region(image, region_box) + + # Option 1: Run OCR again to find remaining printed text + region_text_boxes = ocr_client.get_text_boxes(region_img) + cleaned_region = mask_text_regions(region_img, region_text_boxes) + + # Option 2: Ask VLM if it contains handwriting (no 
coordinates!) + is_handwriting = vlm_verify(cleaned_region) + + if is_handwriting: + signatures.append(cleaned_region) + + return signatures +``` + +**Pros**: +- Best accuracy - two passes of OCR +- Combines strengths of both approaches +- VLM only for yes/no, not coordinates +- Clean final output with only handwriting + +**Cons**: +- Slower (one full-page OCR call plus one OCR call per candidate region) +- More complex code +- Higher computational cost + +**Recommendation**: ⭐⭐⭐ **BEST OVERALL** - implement this for production + +--- + +## Implementation Priority + +### Phase 1: Quick Wins (Test Immediately) +1. **Expand masking padding** (Problem 2, Option A) - 5 minutes +2. **More aggressive morphology** (Problem 1, Option A) - 5 minutes +3. **Test and measure improvement** + +### Phase 2: Region Merging (If Phase 1 insufficient) +4. **Implement region merging algorithm** (Problem 1, Option B) - 30 minutes +5. **Test on multiple PDFs** +6. **Tune distance threshold** + +### Phase 3: Two-Stage Approach (Best quality) +7. **Implement second-pass OCR on regions** (Problem 2, Option E) - 1 hour +8. **Add VLM verification** (Step 4 of pipeline) - 30 minutes +9. 
**Full pipeline testing** + +--- + +## Code Files Status + +### Existing Files ✅ +- **`paddleocr_client.py`** - REST API client for PaddleOCR server +- **`test_paddleocr_client.py`** - Connection and OCR test +- **`test_mask_and_detect.py`** - Current masking + detection pipeline + +### To Be Created 📝 +- **`extract_signatures_paddleocr.py`** - Production pipeline with all improvements +- **`region_merger.py`** - Region merging utilities +- **`vlm_verifier.py`** - VLM handwriting verification + +--- + +## Server Configuration + +**PaddleOCR Server**: +- Host: `192.168.30.36:5555` +- Running: ✅ Yes (PID: 210417) +- Version: 3.3.0 +- GPU: Enabled +- Language: Chinese (lang='ch') + +**VLM Server**: +- Host: `192.168.30.36:11434` (Ollama) +- Model: `qwen2.5vl:32b` +- Status: Not tested yet in this pipeline + +--- + +## Test Plan + +### Test File +- **File**: `201301_1324_AI1_page3.pdf` +- **Expected signatures**: 2 (楊智惠, 張志銘) +- **Current recall**: 100% (found both) +- **Current precision**: 16.7% (2 correct out of 12 regions) + +### Success Metrics After Improvements + +| Metric | Current | Target | +|--------|---------|--------| +| Signatures found | 2/2 (100%) | 2/2 (100%) | +| False positives | 10 | < 2 | +| Precision | 16.7% | > 80% | +| Signatures split | Unknown | 0 | +| Printed text in regions | Yes | No | + +--- + +## Git Branch Strategy + +**Current branch**: `PaddleOCR-Cover` +**Status**: Masking + Region Detection working, needs refinement + +**Recommended next steps**: +1. Commit current state with tag: `paddleocr-v1-basic` +2. Create feature branches: + - `paddleocr-region-merging` - For Problem 1 solutions + - `paddleocr-two-stage` - For Problem 2 solutions +3. 
Merge best solution back to `PaddleOCR-Cover` + +--- + +## Next Actions + +### Immediate (Today) +- [ ] Commit current working state +- [ ] Test Phase 1 quick wins (padding + morphology) +- [ ] Measure improvement + +### Short-term (This week) +- [ ] Implement Region Merging (Option B) +- [ ] Implement Two-Stage OCR (Option E) +- [ ] Add VLM verification +- [ ] Test on 10 PDFs + +### Long-term (Production) +- [ ] Optimize performance (parallel processing) +- [ ] Error handling and logging +- [ ] Process full 86K dataset +- [ ] Compare with previous hybrid approach (70% recall) + +--- + +## Comparison: PaddleOCR vs Previous Hybrid Approach + +### Previous Approach (VLM-Cover branch) +- **Method**: VLM names + CV detection + VLM verification +- **Results**: 70% recall, 100% precision +- **Problem**: Missed 30% of signatures (CV parameters too conservative) + +### PaddleOCR Approach (Current) +- **Method**: PaddleOCR masking + CV detection + VLM verification +- **Results**: 100% recall (found both signatures) +- **Problem**: Low precision (many false positives), printed text not fully removed + +### Winner: TBD +- PaddleOCR shows **better recall potential** +- After implementing refinements (Phase 2-3), should achieve **high recall + high precision** +- Need to test on larger dataset to confirm + +--- + +**Document version**: 1.0 +**Last updated**: October 28, 2025 +**Author**: Claude Code +**Status**: Ready for implementation diff --git a/check_rejected_for_missing.py b/check_rejected_for_missing.py new file mode 100644 index 0000000..6aaaffa --- /dev/null +++ b/check_rejected_for_missing.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +"""Check if rejected regions contain the missing signatures.""" + +import base64 +import requests +from pathlib import Path + +OLLAMA_URL = "http://192.168.30.36:11434" +OLLAMA_MODEL = "qwen2.5vl:32b" +REJECTED_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures/rejected" + +# Missing signatures based on test results 
+MISSING = { + "201301_2061_AI1_page5": "林姿妤", + "201301_2458_AI1_page4": "魏興海", + "201301_2923_AI1_page3": "陈丽琦" +} + +def encode_image_to_base64(image_path): + """Encode image file to base64.""" + with open(image_path, 'rb') as f: + return base64.b64encode(f.read()).decode('utf-8') + +def ask_vlm_about_signature(image_base64, expected_name): + """Ask VLM if the image contains the expected signature.""" + prompt = f"""Does this image contain a handwritten signature with the Chinese name: "{expected_name}"? + +Look carefully for handwritten Chinese characters matching this name. + +Answer only 'yes' or 'no'.""" + + payload = { + "model": OLLAMA_MODEL, + "prompt": prompt, + "images": [image_base64], + "stream": False + } + + try: + response = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=60) + response.raise_for_status() + answer = response.json()['response'].strip().lower() + return answer + except Exception as e: + return f"error: {str(e)}" + +# Check each missing signature +for pdf_stem, missing_name in MISSING.items(): + print(f"\n{'='*80}") + print(f"Checking rejected regions from: {pdf_stem}") + print(f"Looking for missing signature: {missing_name}") + print('='*80) + + # Find all rejected regions from this PDF + rejected_regions = sorted(Path(REJECTED_PATH).glob(f"{pdf_stem}_region_*.png")) + + print(f"Found {len(rejected_regions)} rejected regions to check") + + for region_path in rejected_regions: + region_name = region_path.name + print(f"\nChecking: {region_name}...", end='', flush=True) + + # Encode and ask VLM + image_base64 = encode_image_to_base64(region_path) + answer = ask_vlm_about_signature(image_base64, missing_name) + + if 'yes' in answer: + print(f" ✅ FOUND! 
This region contains {missing_name}") + print(f" → The signature was detected by CV but rejected by verification!") + else: + print(f" ❌ No (VLM says: {answer})") + +print(f"\n{'='*80}") +print("Analysis complete!") +print('='*80) diff --git a/paddleocr_client.py b/paddleocr_client.py new file mode 100644 index 0000000..f70119b --- /dev/null +++ b/paddleocr_client.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +PaddleOCR Client +Connects to remote PaddleOCR server for OCR inference +""" + +import requests +import base64 +import numpy as np +from typing import List, Dict, Tuple, Optional +from PIL import Image +from io import BytesIO + +class PaddleOCRClient: + """Client for remote PaddleOCR server.""" + + def __init__(self, server_url: str = "http://192.168.30.36:5555"): + """ + Initialize PaddleOCR client. + + Args: + server_url: URL of the PaddleOCR server + """ + self.server_url = server_url.rstrip('/') + self.timeout = 30 # seconds + + def health_check(self) -> bool: + """ + Check if server is healthy. + + Returns: + True if server is healthy, False otherwise + """ + try: + response = requests.get( + f"{self.server_url}/health", + timeout=5 + ) + return response.status_code == 200 and response.json().get('status') == 'ok' + except Exception as e: + print(f"Health check failed: {e}") + return False + + def ocr(self, image: np.ndarray) -> List[Dict]: + """ + Perform OCR on an image. 
+ + Args: + image: numpy array of the image (RGB format) + + Returns: + List of detection results, each containing: + - box: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] + - text: detected text string + - confidence: confidence score (0-1) + + Raises: + Exception if OCR fails + """ + # Convert numpy array to PIL Image + if len(image.shape) == 2: # Grayscale + pil_image = Image.fromarray(image) + else: # RGB or RGBA + pil_image = Image.fromarray(image.astype(np.uint8)) + + # Encode to base64 + buffered = BytesIO() + pil_image.save(buffered, format="PNG") + image_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8') + + # Send request + try: + response = requests.post( + f"{self.server_url}/ocr", + json={"image": image_base64}, + timeout=self.timeout + ) + response.raise_for_status() + + result = response.json() + + if not result.get('success'): + error_msg = result.get('error', 'Unknown error') + raise Exception(f"OCR failed: {error_msg}") + + return result.get('results', []) + + except requests.exceptions.Timeout: + raise Exception(f"OCR request timed out after {self.timeout} seconds") + except requests.exceptions.ConnectionError: + raise Exception(f"Could not connect to server at {self.server_url}") + except Exception as e: + raise Exception(f"OCR request failed: {str(e)}") + + def get_text_boxes(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]: + """ + Get bounding boxes of all detected text. 
+ + Args: + image: numpy array of the image + + Returns: + List of bounding boxes as (x, y, w, h) tuples + """ + results = self.ocr(image) + boxes = [] + + for result in results: + box = result['box'] # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] + + # Convert polygon to bounding box + xs = [point[0] for point in box] + ys = [point[1] for point in box] + + x = int(min(xs)) + y = int(min(ys)) + w = int(max(xs) - min(xs)) + h = int(max(ys) - min(ys)) + + boxes.append((x, y, w, h)) + + return boxes + + def __repr__(self): + return f"PaddleOCRClient(server_url='{self.server_url}')" + + +# Convenience function +def create_ocr_client(server_url: str = "http://192.168.30.36:5555") -> PaddleOCRClient: + """ + Create and test PaddleOCR client. + + Args: + server_url: URL of the PaddleOCR server + + Returns: + PaddleOCRClient instance + + Raises: + Exception if server is not reachable + """ + client = PaddleOCRClient(server_url) + + if not client.health_check(): + raise Exception( + f"PaddleOCR server at {server_url} is not responding. " + "Make sure the server is running on the Linux machine." + ) + + return client + + +if __name__ == "__main__": + # Test the client + print("Testing PaddleOCR client...") + + try: + client = create_ocr_client() + print(f"✅ Connected to server: {client.server_url}") + + # Create a test image + test_image = np.ones((100, 100, 3), dtype=np.uint8) * 255 + + print("Running test OCR...") + results = client.ocr(test_image) + print(f"✅ OCR test successful! Found {len(results)} text regions") + + except Exception as e: + print(f"❌ Error: {e}") diff --git a/test_mask_and_detect.py b/test_mask_and_detect.py new file mode 100644 index 0000000..e672a31 --- /dev/null +++ b/test_mask_and_detect.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +""" +Test PaddleOCR Masking + Region Detection Pipeline + +This script demonstrates: +1. PaddleOCR detects printed text bounding boxes +2. Mask out all printed text areas (fill with black) +3. 
Detect remaining non-white regions (potential handwriting) +4. Visualize the results +""" + +import fitz # PyMuPDF +import numpy as np +import cv2 +from pathlib import Path +from paddleocr_client import create_ocr_client + +# Configuration +TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf" +OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/mask_test" +DPI = 300 + +# Region detection parameters +MIN_REGION_AREA = 3000 # Minimum pixels for a region +MAX_REGION_AREA = 300000 # Maximum pixels for a region +MIN_ASPECT_RATIO = 0.3 # Minimum width/height ratio +MAX_ASPECT_RATIO = 15.0 # Maximum width/height ratio + +print("="*80) +print("PaddleOCR Masking + Region Detection Test") +print("="*80) + +# Create output directory +Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True) + +# Step 1: Connect to PaddleOCR server +print("\n1. Connecting to PaddleOCR server...") +try: + ocr_client = create_ocr_client() + print(f" ✅ Connected: {ocr_client.server_url}") +except Exception as e: + print(f" ❌ Error: {e}") + exit(1) + +# Step 2: Render PDF to image +print("\n2. Rendering PDF to image...") +try: + doc = fitz.open(TEST_PDF) + page = doc[0] + mat = fitz.Matrix(DPI/72, DPI/72) + pix = page.get_pixmap(matrix=mat) + original_image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n) + + if pix.n == 4: # RGBA + original_image = cv2.cvtColor(original_image, cv2.COLOR_RGBA2RGB) + + print(f" ✅ Rendered: {original_image.shape[1]}x{original_image.shape[0]} pixels") + doc.close() +except Exception as e: + print(f" ❌ Error: {e}") + exit(1) + +# Step 3: Detect printed text with PaddleOCR +print("\n3. Detecting printed text with PaddleOCR...") +try: + text_boxes = ocr_client.get_text_boxes(original_image) + print(f" ✅ Detected {len(text_boxes)} text regions") + + # Show some sample boxes + if text_boxes: + print(" Sample text boxes (x, y, w, h):") + for i, box in enumerate(text_boxes[:3]): + print(f" {i+1}. 
{box}") +except Exception as e: + print(f" ❌ Error: {e}") + exit(1) + +# Step 4: Mask out printed text areas +print("\n4. Masking printed text areas...") +try: + masked_image = original_image.copy() + + # Fill each text box with white: Step 5 treats every pixel darker than 250 as content, so a black fill would itself be detected as a region + for (x, y, w, h) in text_boxes: + cv2.rectangle(masked_image, (x, y), (x + w, y + h), (255, 255, 255), -1) + + print(f" ✅ Masked {len(text_boxes)} text regions") + + # Save masked image + masked_path = Path(OUTPUT_DIR) / "01_masked_image.png" + cv2.imwrite(str(masked_path), cv2.cvtColor(masked_image, cv2.COLOR_RGB2BGR)) + print(f" 📁 Saved: {masked_path}") + +except Exception as e: + print(f" ❌ Error: {e}") + exit(1) + +# Step 5: Detect remaining non-white regions +print("\n5. Detecting remaining non-white regions...") +try: + # Convert to grayscale + gray = cv2.cvtColor(masked_image, cv2.COLOR_RGB2GRAY) + + # Threshold to find non-white areas + # Anything darker than 250 is considered "content" + _, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY_INV) + + # Apply morphological operations to connect nearby regions + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=2) + + # Find contours + contours, _ = cv2.findContours(morphed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + print(f" ✅ Found {len(contours)} contours") + + # Filter contours by size and aspect ratio + potential_regions = [] + + for contour in contours: + x, y, w, h = cv2.boundingRect(contour) + area = w * h + aspect_ratio = w / h if h > 0 else 0 + + # Check constraints + if (MIN_REGION_AREA <= area <= MAX_REGION_AREA and + MIN_ASPECT_RATIO <= aspect_ratio <= MAX_ASPECT_RATIO): + potential_regions.append({ + 'box': (x, y, w, h), + 'area': area, + 'aspect_ratio': aspect_ratio + }) + + print(f" ✅ Filtered to {len(potential_regions)} potential handwriting regions") + + # Show region details + if potential_regions: + print("\n Detected regions:") + for i, region in 
enumerate(potential_regions[:5]): + x, y, w, h = region['box'] + print(f" {i+1}. Box: ({x}, {y}, {w}, {h}), " + f"Area: {region['area']}, " + f"Aspect: {region['aspect_ratio']:.2f}") + +except Exception as e: + print(f" ❌ Error: {e}") + import traceback + traceback.print_exc() + exit(1) + +# Step 6: Visualize results +print("\n6. Creating visualizations...") +try: + # Visualization 1: Original with text boxes + vis_original = original_image.copy() + for (x, y, w, h) in text_boxes: + cv2.rectangle(vis_original, (x, y), (x + w, y + h), (0, 255, 0), 3) + + vis_original_path = Path(OUTPUT_DIR) / "02_original_with_text_boxes.png" + cv2.imwrite(str(vis_original_path), cv2.cvtColor(vis_original, cv2.COLOR_RGB2BGR)) + print(f" 📁 Original + text boxes: {vis_original_path}") + + # Visualization 2: Masked image with detected regions + vis_masked = masked_image.copy() + for region in potential_regions: + x, y, w, h = region['box'] + cv2.rectangle(vis_masked, (x, y), (x + w, y + h), (255, 0, 0), 3) + + vis_masked_path = Path(OUTPUT_DIR) / "03_masked_with_regions.png" + cv2.imwrite(str(vis_masked_path), cv2.cvtColor(vis_masked, cv2.COLOR_RGB2BGR)) + print(f" 📁 Masked + regions: {vis_masked_path}") + + # Visualization 3: Binary threshold result + binary_path = Path(OUTPUT_DIR) / "04_binary_threshold.png" + cv2.imwrite(str(binary_path), binary) + print(f" 📁 Binary threshold: {binary_path}") + + # Visualization 4: Morphed result + morphed_path = Path(OUTPUT_DIR) / "05_morphed.png" + cv2.imwrite(str(morphed_path), morphed) + print(f" 📁 Morphed: {morphed_path}") + + # Extract and save each detected region + print("\n7. 
Extracting detected regions...") + for i, region in enumerate(potential_regions): + x, y, w, h = region['box'] + + # Add padding + padding = 10 + x_pad = max(0, x - padding) + y_pad = max(0, y - padding) + w_pad = min(original_image.shape[1] - x_pad, w + 2*padding) + h_pad = min(original_image.shape[0] - y_pad, h + 2*padding) + + # Extract region from original image + region_img = original_image[y_pad:y_pad+h_pad, x_pad:x_pad+w_pad] + + # Save region + region_path = Path(OUTPUT_DIR) / f"region_{i+1:02d}.png" + cv2.imwrite(str(region_path), cv2.cvtColor(region_img, cv2.COLOR_RGB2BGR)) + print(f" 📁 Region {i+1}: {region_path}") + +except Exception as e: + print(f" ❌ Error: {e}") + import traceback + traceback.print_exc() + +print("\n" + "="*80) +print("Test completed!") +print(f"Results saved to: {OUTPUT_DIR}") +print("="*80) +print("\nSummary:") +print(f" - Printed text regions detected: {len(text_boxes)}") +print(f" - Potential handwriting regions: {len(potential_regions)}") +print(f" - Expected signatures: 2 (楊智惠, 張志銘)") +print("="*80) diff --git a/test_paddleocr.py b/test_paddleocr.py new file mode 100644 index 0000000..a21cb2a --- /dev/null +++ b/test_paddleocr.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +"""Test PaddleOCR on a sample PDF page.""" + +import fitz # PyMuPDF +from paddleocr import PaddleOCR +import numpy as np +from PIL import Image +import cv2 +from pathlib import Path + +# Configuration +TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf" +DPI = 300 + +print("="*80) +print("Testing PaddleOCR on macOS Apple Silicon") +print("="*80) + +# Step 1: Render PDF to image +print("\n1. 
Rendering PDF to image...") +try: + doc = fitz.open(TEST_PDF) + page = doc[0] + mat = fitz.Matrix(DPI/72, DPI/72) + pix = page.get_pixmap(matrix=mat) + image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n) + + if pix.n == 4: # RGBA + image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB) + + print(f" ✅ Rendered: {image.shape[1]}x{image.shape[0]} pixels") + doc.close() +except Exception as e: + print(f" ❌ Error: {e}") + exit(1) + +# Step 2: Initialize PaddleOCR +print("\n2. Initializing PaddleOCR...") +print(" (First run will download models, may take a few minutes...)") +try: + # Use the correct syntax from official docs + ocr = PaddleOCR( + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_textline_orientation=False, + lang='ch' # Chinese language + ) + print(" ✅ PaddleOCR initialized successfully") +except Exception as e: + print(f" ❌ Error: {e}") + import traceback + traceback.print_exc() + print("\n Note: PaddleOCR requires PaddlePaddle backend.") + print(" If this is a module import error, PaddlePaddle may not support this platform.") + exit(1) + +# Step 3: Run OCR +print("\n3. Running OCR to detect printed text...") +try: + result = ocr.ocr(image, cls=False) + + if result and result[0]: + print(f" ✅ Detected {len(result[0])} text regions") + + # Show first few detections + print("\n Sample detections:") + for i, item in enumerate(result[0][:5]): + box = item[0] # Bounding box coordinates + text = item[1][0] # Detected text + confidence = item[1][1] # Confidence score + print(f" {i+1}. Text: '{text}' (confidence: {confidence:.2f})") + print(f" Box: {box}") + else: + print(" ⚠️ No text detected") + +except Exception as e: + print(f" ❌ Error during OCR: {e}") + import traceback + traceback.print_exc() + exit(1) + +# Step 4: Visualize detection +print("\n4. 
Creating visualization...") +try: + vis_image = image.copy() + + if result and result[0]: + for item in result[0]: + box = np.array(item[0], dtype=np.int32) + cv2.polylines(vis_image, [box], True, (0, 255, 0), 2) + + # Save visualization + output_path = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_test_detection.png" + cv2.imwrite(output_path, cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR)) + print(f" ✅ Saved visualization: {output_path}") + +except Exception as e: + print(f" ❌ Error during visualization: {e}") + +print("\n" + "="*80) +print("PaddleOCR test completed!") +print("="*80) diff --git a/test_paddleocr_client.py b/test_paddleocr_client.py new file mode 100644 index 0000000..a3c8392 --- /dev/null +++ b/test_paddleocr_client.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +"""Test PaddleOCR client with a real PDF page.""" + +import fitz # PyMuPDF +import numpy as np +import cv2 +from paddleocr_client import create_ocr_client + +# Test PDF +TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf" +DPI = 300 + +print("="*80) +print("Testing PaddleOCR Client with Real PDF") +print("="*80) + +# Step 1: Connect to server +print("\n1. Connecting to PaddleOCR server...") +try: + client = create_ocr_client() + print(f" ✅ Connected: {client.server_url}") +except Exception as e: + print(f" ❌ Connection failed: {e}") + exit(1) + +# Step 2: Render PDF +print("\n2. Rendering PDF to image...") +try: + doc = fitz.open(TEST_PDF) + page = doc[0] + mat = fitz.Matrix(DPI/72, DPI/72) + pix = page.get_pixmap(matrix=mat) + image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n) + + if pix.n == 4: # RGBA + image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB) + + print(f" ✅ Rendered: {image.shape[1]}x{image.shape[0]} pixels") + doc.close() +except Exception as e: + print(f" ❌ Error: {e}") + exit(1) + +# Step 3: Run OCR +print("\n3. 
Running OCR on image...") +try: + results = client.ocr(image) + print(f" ✅ OCR successful!") + print(f" Found {len(results)} text regions") + + # Show first few results + if results: + print("\n Sample detections:") + for i, result in enumerate(results[:5]): + text = result['text'] + confidence = result['confidence'] + print(f" {i+1}. '{text}' (confidence: {confidence:.2f})") + +except Exception as e: + print(f" ❌ OCR failed: {e}") + import traceback + traceback.print_exc() + exit(1) + +# Step 4: Get bounding boxes +print("\n4. Getting text bounding boxes...") +try: + boxes = client.get_text_boxes(image) + print(f" ✅ Got {len(boxes)} bounding boxes") + + if boxes: + print(" Sample boxes (x, y, w, h):") + for i, box in enumerate(boxes[:3]): + print(f" {i+1}. {box}") + +except Exception as e: + print(f" ❌ Error: {e}") + +print("\n" + "="*80) +print("Test completed successfully!") +print("="*80)