Complete PP-OCRv5 research and v4 vs v5 comparison

## 研究成果

### PP-OCRv5 API 測試
- 成功升級到 PaddleOCR 3.3.2 (PP-OCRv5)
- 理解新 API 結構和調用方式
- 驗證基礎檢測功能

### 關鍵發現
- PP-OCRv5 **沒有內建手寫分類功能**
- text_type 字段是語言類型,不是手寫/印刷分類
- 仍需要 OpenCV Method 3 來分離手寫和印刷文字

### 完整 Pipeline 對比測試
- v4 (2.7.3): 檢測 14 個文字 → 4 個候選區域
- v5 (3.3.2): 檢測 50 個文字 → 7 個候選區域
- 主簽名區域:兩個版本幾乎相同 (1150x511 vs 1144x511)

### 性能分析
優點:
- v5 手寫識別準確率 +13.7% (文檔承諾)
- 可能減少漏檢

缺點:
- 過度檢測(印章小字等)
- API 完全重寫,不兼容
- 仍無法替代 OpenCV Method 3

### 文件
- PP_OCRV5_RESEARCH_FINDINGS.md: 完整研究報告
- signature-comparison/: v4 vs v5 對比結果
- test_results/: v5 測試輸出
- test_*_pipeline.py: 完整測試腳本

### 建議
當前方案(v2.7.3 + OpenCV Method 3)已足夠穩定,
除非遇到大量漏檢,否則暫不升級到 v5。

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-11-27 11:21:55 +08:00
parent 8f231da3bc
commit 21df0ff387
10 changed files with 3726 additions and 0 deletions

290
test_v4_full_pipeline.py Normal file
View File

@@ -0,0 +1,290 @@
#!/usr/bin/env python3
"""
Run the complete signature-extraction pipeline with PaddleOCR v2.7.3 (v4),
for comparison against v5.
"""
import sys
import json
import cv2
import numpy as np
import requests
from pathlib import Path
# Configuration
# Base URL of the remote OCR service; call_ocr_server() posts to "<OCR_SERVER>/ocr".
OCR_SERVER = "http://192.168.30.36:5555"
# Directory that receives every intermediate image and the summary report.
OUTPUT_DIR = Path("/Volumes/NV2/pdf_recognize/signature-comparison/v4-current")
# Extra pixels added on each side of a text box when mask_printed_text() blacks it out.
MASKING_PADDING = 0
def setup_output_dir():
    """Create the output directory (and any missing parents) and print its path."""
    # exist_ok keeps repeated runs from failing when the directory is already there.
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
    location_message = f"輸出目錄: {OUTPUT_DIR}"
    print(location_message)
def get_page_image():
    """Load the fixed test page image from disk.

    Returns:
        Whatever ``cv2.imread`` yields for the test image, or ``None`` (after
        printing a message) when the file does not exist.
    """
    page_path = "/Volumes/NV2/pdf_recognize/full_page_original.png"
    # Guard clause: bail out early when the fixture is missing.
    if not Path(page_path).exists():
        print(f"❌ 測試圖片不存在: {page_path}")
        return None
    return cv2.imread(page_path)
def call_ocr_server(image):
    """Send an image to the remote PaddleOCR v2.7.3 server and return its detections.

    The image is PNG-encoded, base64-wrapped and POSTed as JSON
    (``{'image': <base64>}``) to ``{OCR_SERVER}/ocr`` with a 30 second timeout.

    Args:
        image: Image accepted by ``cv2.imencode`` (presumably the BGR array
            loaded by ``cv2.imread`` — confirm against the caller).

    Returns:
        The ``results`` entry of the JSON reply (an empty list when the key is
        absent), or ``None`` when encoding fails, the server answers with a
        non-200 status, or any exception is raised along the way.
    """
    print("\n調用 PaddleOCR v2.7.3 服務器...")
    try:
        import base64

        encoded_ok, buffer = cv2.imencode('.png', image)
        # imencode reports failure through its first return value; do not
        # post an empty or garbage payload to the server in that case.
        if not encoded_ok:
            print("❌ OCR 調用失敗: 圖片編碼失敗")
            return None
        img_base64 = base64.b64encode(buffer).decode('utf-8')

        response = requests.post(
            f"{OCR_SERVER}/ocr",
            json={'image': img_base64},
            timeout=30,
        )
        if response.status_code != 200:
            print(f"❌ 服務器錯誤: {response.status_code}")
            return None

        # Look the key up once so the printed count and the returned list agree.
        results = response.json().get('results', [])
        print(f"✅ OCR 完成,檢測到 {len(results)} 個文字區域")
        return results
    except Exception as e:
        # Best-effort boundary for a test script: report the failure and let
        # the caller decide how to stop.
        print(f"❌ OCR 調用失敗: {e}")
        import traceback
        traceback.print_exc()
        return None
def mask_printed_text(image, ocr_results):
    """Black out every OCR-detected text box on a copy of the image.

    Args:
        image: Source image; it is copied, not modified.
        ocr_results: Iterable of dicts; entries without a ``'box'`` value are
            skipped.

    Returns:
        The masked copy, which is also written to ``01_masked.png`` in the
        output directory.
    """
    print("\n遮罩印刷文字...")
    masked = image.copy()
    pad = MASKING_PADDING
    for detection in ocr_results:
        polygon = detection.get('box')
        if polygon is None:
            continue
        # v2.7.3 returns polygons as [[x1,y1], [x2,y2], [x3,y3], [x4,y4]];
        # collapse each one to its axis-aligned bounding rectangle.
        points = np.array(polygon)
        xs = points[:, 0]
        ys = points[:, 1]
        top_left = (int(xs.min()) - pad, int(ys.min()) - pad)
        bottom_right = (int(xs.max()) + pad, int(ys.max()) + pad)
        # Thickness -1 fills the rectangle.
        cv2.rectangle(masked, top_left, bottom_right, (0, 0, 0), -1)
    out_path = OUTPUT_DIR / "01_masked.png"
    cv2.imwrite(str(out_path), masked)
    print(f"✅ 遮罩完成: {out_path}")
    return masked
def detect_regions(masked_image):
    """Find candidate regions in the masked image via thresholding and contours.

    The image is converted to grayscale, inverse-thresholded at 200, closed
    with a 5x5 rectangular kernel (2 iterations), and its external contours
    are filtered by area. The binary and morphed images are written to the
    output directory as ``02_binary.png`` and ``03_morphed.png``.

    Args:
        masked_image: Image passed to ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``.

    Returns:
        List of dicts with ``'box'`` (x, y, w, h), ``'area'`` and
        ``'aspect_ratio'``, sorted by area, largest first.
    """
    print("\n檢測候選區域...")
    grayscale = cv2.cvtColor(masked_image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(grayscale, 200, 255, cv2.THRESH_BINARY_INV)
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, closing_kernel, iterations=2)
    cv2.imwrite(str(OUTPUT_DIR / "02_binary.png"), binary)
    cv2.imwrite(str(OUTPUT_DIR / "03_morphed.png"), morphed)
    contours, _ = cv2.findContours(morphed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Contour-area window (in pixels) for a region to count as a candidate.
    MIN_AREA = 3000
    MAX_AREA = 300000
    found = []
    for outline in contours:
        outline_area = cv2.contourArea(outline)
        if not (MIN_AREA <= outline_area <= MAX_AREA):
            continue
        x, y, w, h = cv2.boundingRect(outline)
        # Guard against a zero-height bounding box.
        ratio = w / h if h > 0 else 0
        found.append({
            'box': (x, y, w, h),
            'area': outline_area,
            'aspect_ratio': ratio,
        })
    candidate_regions = sorted(found, key=lambda region: region['area'], reverse=True)
    print(f"✅ 找到 {len(candidate_regions)} 個候選區域")
    return candidate_regions
def merge_nearby_regions(regions, h_distance=100, v_distance=50):
    """Greedily merge regions that overlap or lie close to each other.

    Each not-yet-used region becomes a seed. Every later region is merged into
    the seed's group when the two boxes overlap on one axis and the empty gap
    between them on the other axis is within the matching threshold. Boxes
    that intersect or nest have a gap of 0 on both axes and are therefore
    always merged.

    Proximity is measured against the seed's own box, not the growing merged
    box, so merging is not transitive.

    Args:
        regions: Sequence of dicts, each with a ``'box'`` of ``(x, y, w, h)``.
        h_distance: Largest horizontal gap that still merges two boxes whose
            vertical extents overlap.
        v_distance: Largest vertical gap that still merges two boxes whose
            horizontal extents overlap.

    Returns:
        List of dicts with ``'box'`` (x, y, w, h of the merged bounding box),
        ``'area'`` (w * h of that box) and ``'merged_count'`` (number of input
        regions in the group). Empty input yields an empty list.
    """
    print("\n合併鄰近區域...")
    if not regions:
        return []

    merged = []
    used = set()
    for i, seed in enumerate(regions):
        if i in used:
            continue
        x1, y1, w1, h1 = seed['box']
        merged_box = [x1, y1, x1 + w1, y1 + h1]
        member_count = 1

        # Only later regions can join: earlier ones are either seeds
        # themselves or already absorbed into another group.
        for j, other in enumerate(regions[i + 1:], start=i + 1):
            if j in used:
                continue
            x2, y2, w2, h2 = other['box']
            h_gap = _axis_gap(x1, x1 + w1, x2, x2 + w2)
            v_gap = _axis_gap(y1, y1 + h1, y2, y2 + h2)
            # A gap of 0 means the extents touch or overlap on that axis.
            stacked = h_gap == 0 and v_gap <= v_distance
            side_by_side = v_gap == 0 and h_gap <= h_distance
            if stacked or side_by_side:
                merged_box[0] = min(merged_box[0], x2)
                merged_box[1] = min(merged_box[1], y2)
                merged_box[2] = max(merged_box[2], x2 + w2)
                merged_box[3] = max(merged_box[3], y2 + h2)
                member_count += 1
                used.add(j)

        x, y = merged_box[0], merged_box[1]
        w, h = merged_box[2] - merged_box[0], merged_box[3] - merged_box[1]
        merged.append({
            'box': (x, y, w, h),
            'area': w * h,
            'merged_count': member_count,
        })

    print(f"✅ 合併後剩餘 {len(merged)} 個區域")
    return merged


def _axis_gap(start_a, end_a, start_b, end_b):
    """Return the empty distance between two 1-D intervals (0 if they touch or overlap)."""
    return max(0, max(start_a, start_b) - min(end_a, end_b))
def extract_signatures(image, regions):
    """Crop each region out of the image and save an annotated overview.

    Every region is cropped from the untouched ``image`` and written as
    ``signature_<n>.png`` (1-based) in the output directory. A copy of the
    image with green rectangles and "Region <n>" labels is written as
    ``04_detected_regions.png``.

    Args:
        image: Source image; crops come from it and it is not drawn on.
        regions: Iterable of dicts with ``'box'`` (x, y, w, h) and ``'area'``.

    Returns:
        The annotated copy of the image.
    """
    print("\n提取簽名區域...")
    green = (0, 255, 0)
    annotated = image.copy()
    for number, region in enumerate(regions, start=1):
        x, y, w, h = region['box']
        cv2.rectangle(annotated, (x, y), (x + w, y + h), green, 3)
        # The label sits 10 px above the box's top-left corner.
        cv2.putText(annotated, f"Region {number}", (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, green, 2)
        crop = image[y:y + h, x:x + w]
        crop_path = OUTPUT_DIR / f"signature_{number}.png"
        cv2.imwrite(str(crop_path), crop)
        print(f" Region {number}: {w}x{h} 像素, 面積={region['area']}")
    overview_path = OUTPUT_DIR / "04_detected_regions.png"
    cv2.imwrite(str(overview_path), annotated)
    print(f"\n✅ 標註圖已保存: {overview_path}")
    return annotated
def generate_summary(ocr_count, regions):
    """Build the plain-text report for one pipeline run.

    Args:
        ocr_count: Number of text regions reported by OCR.
        regions: Sequence of dicts with ``'box'`` (x, y, w, h) and ``'area'``;
            its length is reported for both the candidate and extraction steps.

    Returns:
        The report as a single string: a header, one line per region, and a
        trailing line naming the output directory.
    """
    header = f"""
PaddleOCR v2.7.3 (v4) 完整 Pipeline 測試結果
{'=' * 60}
1. OCR 檢測: {ocr_count} 個文字區域
2. 遮罩印刷文字: 完成
3. 檢測候選區域: {len(regions)}
4. 提取簽名: {len(regions)}
候選區域詳情:
{'-' * 60}
"""
    detail_lines = []
    for number, region in enumerate(regions, start=1):
        x, y, w, h = region['box']
        area = region['area']
        detail_lines.append(
            f"Region {number}: 位置({x}, {y}), 大小{w}x{h}, 面積={area}\n"
        )
    footer = f"\n所有結果保存在: {OUTPUT_DIR}\n"
    return header + "".join(detail_lines) + footer
def main():
    """Run the v4 pipeline end to end and write all artefacts to the output directory.

    Steps: load the test page, run OCR on the server, mask the detected text,
    detect and merge candidate regions, crop the signatures, then print and
    save the summary. Returns early (without a summary) when the image cannot
    be loaded or OCR fails.
    """
    rule = "=" * 60
    print(rule)
    print("PaddleOCR v2.7.3 (v4) 完整 Pipeline 測試")
    print(rule)
    setup_output_dir()

    print("\n1. 讀取測試圖片...")
    page = get_page_image()
    if page is None:
        return
    print(f" 圖片大小: {page.shape}")
    cv2.imwrite(str(OUTPUT_DIR / "00_original.png"), page)

    print("\n2. PaddleOCR v2.7.3 檢測文字...")
    detections = call_ocr_server(page)
    if detections is None:
        print("❌ OCR 失敗,終止測試")
        return

    print("\n3. 遮罩印刷文字...")
    masked = mask_printed_text(page, detections)

    print("\n4. 檢測候選區域...")
    candidates = detect_regions(masked)

    print("\n5. 合併鄰近區域...")
    merged = merge_nearby_regions(candidates)

    print("\n6. 提取簽名...")
    # Called for its side effects (crops and overview image); the returned
    # annotated image is not needed here.
    extract_signatures(page, merged)

    print("\n7. 生成摘要報告...")
    report = generate_summary(len(detections), merged)
    print(report)
    (OUTPUT_DIR / "SUMMARY.txt").write_text(report, encoding='utf-8')

    print(rule)
    print("✅ v4 測試完成!")
    print(f"結果目錄: {OUTPUT_DIR}")
    print(rule)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()