Add PaddleOCR masking and region detection pipeline

- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
2025-10-28 22:28:18 +08:00
parent 52612e14ba
commit 479d4e0019
6 changed files with 1118 additions and 0 deletions
--- a/check_rejected_for_missing.py
+++ b/check_rejected_for_missing.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""Check if rejected regions contain the missing signatures."""
+
+import base64
+import re
+from pathlib import Path
+
+import requests
+
+# Base URL of the Ollama server; "/api/generate" is appended to it per request.
+OLLAMA_URL = "http://192.168.30.36:11434"
+# Value sent in the "model" field of every generate request.
+OLLAMA_MODEL = "qwen2.5vl:32b"
+# Directory searched for "<pdf_stem>_region_*.png" files.
+# NOTE(review): presumably holds crops rejected by an earlier verification
+# step of the pipeline — confirm against the code that writes them.
+REJECTED_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures/rejected"
+
+# Missing signatures based on test results.
+# Key: PDF page stem, used as the filename prefix when globbing region images.
+# Value: the name inserted into the VLM prompt for that page's regions.
+MISSING = {
+    "201301_2061_AI1_page5": "林姿妤",
+    "201301_2458_AI1_page4": "魏興海",
+    "201301_2923_AI1_page3": "陈丽琦"
+}
+
+def encode_image_to_base64(image_path):
+    """Read the file at ``image_path`` and return its bytes as base64 text."""
+    with open(image_path, 'rb') as image_file:
+        raw_bytes = image_file.read()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode('utf-8')
+
+def ask_vlm_about_signature(image_base64, expected_name):
+    """Query the VLM about whether the image shows the expected signature.
+
+    Posts a non-streaming generate request carrying the base64 image and a
+    yes/no question about ``expected_name``.
+
+    Returns the model's reply, stripped and lower-cased. Any exception raised
+    while sending the request or reading the reply is returned as a string of
+    the form ``"error: <message>"`` instead of being propagated.
+    """
+    question = (
+        f'Does this image contain a handwritten signature with the Chinese name: "{expected_name}"?\n'
+        "\n"
+        "Look carefully for handwritten Chinese characters matching this name.\n"
+        "\n"
+        "Answer only 'yes' or 'no'."
+    )
+    request_body = {
+        "model": OLLAMA_MODEL,
+        "prompt": question,
+        "images": [image_base64],
+        "stream": False,
+    }
+
+    try:
+        reply = requests.post(f"{OLLAMA_URL}/api/generate", json=request_body, timeout=60)
+        reply.raise_for_status()
+        verdict = reply.json()['response'].strip().lower()
+    except Exception as exc:
+        # Best-effort: hand the failure back as text so the caller keeps going.
+        return f"error: {exc!s}"
+    return verdict
+
+# Check each missing signature against every rejected region of its PDF page.
+#
+# ask_vlm_about_signature() reports a failed request as a string starting
+# with "error:", so that case is reported separately instead of being shown
+# as a "No" verdict.
+#
+# "yes" is matched only as a standalone word: a plain substring test would
+# also fire on words that merely contain it (e.g. "eyes"). The answer is
+# already lower-cased, so only a-z neighbours need to be excluded.
+_YES_WORD = re.compile(r'(?<![a-z])yes(?![a-z])')
+_SEPARATOR = '=' * 80
+
+for pdf_stem, missing_name in MISSING.items():
+    print(f"\n{_SEPARATOR}")
+    print(f"Checking rejected regions from: {pdf_stem}")
+    print(f"Looking for missing signature: {missing_name}")
+    print(_SEPARATOR)
+
+    # Find all rejected regions from this PDF
+    rejected_regions = sorted(Path(REJECTED_PATH).glob(f"{pdf_stem}_region_*.png"))
+
+    print(f"Found {len(rejected_regions)} rejected regions to check")
+
+    for region_path in rejected_regions:
+        print(f"\nChecking: {region_path.name}...", end='', flush=True)
+
+        # Encode and ask VLM
+        image_base64 = encode_image_to_base64(region_path)
+        answer = ask_vlm_about_signature(image_base64, missing_name)
+
+        if answer.startswith('error:'):
+            # The request itself failed; this says nothing about the region.
+            print(f" ⚠️ VLM request failed ({answer})")
+        elif _YES_WORD.search(answer):
+            print(f" ✅ FOUND! This region contains {missing_name}")
+            print("   → The signature was detected by CV but rejected by verification!")
+        else:
+            print(f" ❌ No (VLM says: {answer})")
+
+print(f"\n{_SEPARATOR}")
+print("Analysis complete!")
+print(_SEPARATOR)