Files
pdf_signature_extraction/check_rejected_for_missing.py
gbanyan 479d4e0019 Add PaddleOCR masking and region detection pipeline
- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
2025-10-28 22:28:18 +08:00

76 lines
2.4 KiB
Python

#!/usr/bin/env python3
"""Check if rejected regions contain the missing signatures."""
import base64
import requests
from pathlib import Path
# Ollama server and vision-language model queried for every rejected region.
OLLAMA_URL = "http://192.168.30.36:11434"
OLLAMA_MODEL = "qwen2.5vl:32b"

# Directory that is searched for "<pdf_stem>_region_*.png" images.
REJECTED_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures/rejected"

# Signatures that the test run failed to find, keyed by the file-name stem of
# the PDF page and mapped to the signer name expected on that page.
MISSING = {
    "201301_2061_AI1_page5": "林姿妤",
    "201301_2458_AI1_page4": "魏興海",
    "201301_2923_AI1_page3": "陈丽琦",
}


def encode_image_to_base64(image_path) -> str:
    """Read a file and return its contents as base64 text.

    Args:
        image_path: Path of the file to read; anything ``open()`` accepts
            (the script passes ``pathlib.Path`` objects).

    Returns:
        The file's raw bytes, base64-encoded and decoded to a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the bytes must reach the encoder untouched.
    with open(image_path, 'rb') as f:
        raw_bytes = f.read()
    return base64.b64encode(raw_bytes).decode('utf-8')


def ask_vlm_about_signature(image_base64, expected_name, timeout=60):
    """Ask the VLM whether an image contains the expected handwritten signature.

    Sends a single non-streaming request to ``{OLLAMA_URL}/api/generate``
    using ``OLLAMA_MODEL``, with the image attached and a yes/no prompt that
    names the expected signer.

    Args:
        image_base64: Base64-encoded image data to attach to the request.
        expected_name: Chinese name the signature is expected to match.
        timeout: Seconds passed to ``requests.post`` as its timeout
            (default 60, the value previously hard-coded here).

    Returns:
        The model's reply, stripped and lower-cased, or a string of the form
        ``"error: <message>"`` if the request or response handling failed.
        This function does not raise for such failures.
    """
    prompt = (
        f'Does this image contain a handwritten signature with the Chinese name: "{expected_name}"?\n'
        "Look carefully for handwritten Chinese characters matching this name.\n"
        "Answer only 'yes' or 'no'."
    )
    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "images": [image_base64],
        "stream": False,
    }
    try:
        response = requests.post(
            f"{OLLAMA_URL}/api/generate", json=payload, timeout=timeout
        )
        response.raise_for_status()
        return response.json()['response'].strip().lower()
    except Exception as e:
        # Deliberately best-effort: one failed request must not abort the
        # whole scan, so the failure is reported through the return value.
        return f"error: {e}"


# For every signature that went missing, ask the VLM about each rejected
# region image saved for that PDF page and report whether the name is there.
for pdf_stem, missing_name in MISSING.items():
    print(f"\n{'='*80}")
    print(f"Checking rejected regions from: {pdf_stem}")
    print(f"Looking for missing signature: {missing_name}")
    print('='*80)

    # Sorted so the regions are checked in a stable, reproducible order.
    rejected_regions = sorted(Path(REJECTED_PATH).glob(f"{pdf_stem}_region_*.png"))
    print(f"Found {len(rejected_regions)} rejected regions to check")

    for region_path in rejected_regions:
        # No newline: the verdict is appended to this same output line.
        print(f"\nChecking: {region_path.name}...", end='', flush=True)

        image_base64 = encode_image_to_base64(region_path)
        answer = ask_vlm_about_signature(image_base64, missing_name)

        if answer.startswith('error:'):
            # ask_vlm_about_signature reports failures as "error: ..." text.
            # Handle that first so an error message that happens to contain
            # "yes" is never counted as a match.
            print(f" ⚠️ Could not check ({answer})")
        elif 'yes' in answer:
            print(f" ✅ FOUND! This region contains {missing_name}")
            print(" → The signature was detected by CV but rejected by verification!")
        else:
            print(f" ❌ No (VLM says: {answer})")

print(f"\n{'='*80}")
print("Analysis complete!")
print('='*80)