Files
pdf_signature_extraction/paper/renumber_citations.py
T
gbanyan 939a348da4 Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00

196 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Renumber all in-text citations to sequential order by first appearance.
Also rewrites references.md with the final numbering.
"""
import re
from pathlib import Path
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
# === FINAL NUMBERING (by order of first appearance in paper) ===
# Maps the final citation number to a (short_key, full_citation) pair.
# The citation text is written verbatim into the generated reference list,
# so page ranges use an en dash (U+2013) as in IEEE reference style.
FINAL_REFS = {
    1: ("cpa_act", 'Taiwan Certified Public Accountant Act (會計師法), Art. 4; FSC Attestation Regulations (查核簽證核准準則), Art. 6. Available: https://law.moj.gov.tw/ENG/LawClass/LawAll.aspx?pcode=G0400067'),
    2: ("yen2013", 'S.-H. Yen, Y.-S. Chang, and H.-L. Chen, "Does the signature of a CPA matter? Evidence from Taiwan," *Res. Account. Regul.*, vol. 25, no. 2, pp. 230–235, 2013.'),
    3: ("bromley1993", 'J. Bromley et al., "Signature verification using a Siamese time delay neural network," in *Proc. NeurIPS*, 1993.'),
    4: ("dey2017", 'S. Dey et al., "SigNet: Convolutional Siamese network for writer independent offline signature verification," arXiv:1707.02131, 2017.'),
    5: ("hadjadj2020", 'I. Hadjadj et al., "An offline signature verification method based on a single known sample and an explainable deep learning approach," *Appl. Sci.*, vol. 10, no. 11, p. 3716, 2020.'),
    6: ("li2024", 'H. Li et al., "TransOSV: Offline signature verification with transformers," *Pattern Recognit.*, vol. 145, p. 109882, 2024.'),
    7: ("tehsin2024", 'S. Tehsin et al., "Enhancing signature verification using triplet Siamese similarity networks in digital documents," *Mathematics*, vol. 12, no. 17, p. 2757, 2024.'),
    8: ("brimoh2024", 'P. Brimoh and C. C. Olisah, "Consensus-threshold criterion for offline signature verification using CNN learned representations," arXiv:2401.03085, 2024.'),
    9: ("woodruff2021", 'N. Woodruff et al., "Fully-automatic pipeline for document signature analysis to detect money laundering activities," arXiv:2107.14091, 2021.'),
    10: ("abramova2016", 'S. Abramova and R. Bohme, "Detecting copy-move forgeries in scanned text documents," in *Proc. Electronic Imaging*, 2016.'),
    11: ("cmfd_survey", 'Y. Li et al., "Copy-move forgery detection in digital image forensics: A survey," *Multimedia Tools Appl.*, 2024.'),
    12: ("jakhar2025", 'Y. Jakhar and M. D. Borah, "Effective near-duplicate image detection using perceptual hashing and deep learning," *Inf. Process. Manage.*, p. 104086, 2025.'),
    13: ("pizzi2022", 'E. Pizzi et al., "A self-supervised descriptor for image copy detection," in *Proc. CVPR*, 2022.'),
    14: ("hafemann2017", 'L. G. Hafemann, R. Sabourin, and L. S. Oliveira, "Learning features for offline handwritten signature verification using deep convolutional neural networks," *Pattern Recognit.*, vol. 70, pp. 163–176, 2017.'),
    15: ("zois2024", 'E. N. Zois, D. Tsourounis, and D. Kalivas, "Similarity distance learning on SPD manifold for writer independent offline signature verification," *IEEE Trans. Inf. Forensics Security*, vol. 19, pp. 1342–1356, 2024.'),
    16: ("hafemann2019", 'L. G. Hafemann, R. Sabourin, and L. S. Oliveira, "Meta-learning for fast classifier adaptation to new users of signature verification systems," *IEEE Trans. Inf. Forensics Security*, vol. 15, pp. 1735–1745, 2019.'),
    17: ("farid2009", 'H. Farid, "Image forgery detection," *IEEE Signal Process. Mag.*, vol. 26, no. 2, pp. 16–25, 2009.'),
    18: ("mehrjardi2023", 'F. Z. Mehrjardi, A. M. Latif, M. S. Zarchi, and R. Sheikhpour, "A survey on deep learning-based image forgery detection," *Pattern Recognit.*, vol. 144, art. no. 109778, 2023.'),
    19: ("phash_survey", 'J. Luo et al., "A survey of perceptual hashing for multimedia," *ACM Trans. Multimedia Comput. Commun. Appl.*, vol. 21, no. 7, 2025.'),
    20: ("engin2020", 'D. Engin et al., "Offline signature verification on real-world documents," in *Proc. CVPRW*, 2020.'),
    21: ("tsourounis2022", 'D. Tsourounis et al., "From text to signatures: Knowledge transfer for efficient deep feature learning in offline signature verification," *Expert Syst. Appl.*, 2022.'),
    22: ("chamakh2025", 'B. Chamakh and O. Bounouh, "A unified ResNet18-based approach for offline signature classification and verification," *Procedia Comput. Sci.*, vol. 270, 2025.'),
    23: ("babenko2014", 'A. Babenko, A. Slesarev, A. Chigorin, and V. Lempitsky, "Neural codes for image retrieval," in *Proc. ECCV*, 2014, pp. 584–599.'),
    24: ("qwen2025", 'Qwen2.5-VL Technical Report, Alibaba Group, 2025.'),
    25: ("yolov11", 'Ultralytics, "YOLOv11 documentation," 2024. [Online]. Available: https://docs.ultralytics.com/'),
    26: ("he2016", 'K. He, X. Zhang, S. Ren, and J. Sun, "Deep residual learning for image recognition," in *Proc. CVPR*, 2016.'),
    27: ("krawetz2013", 'N. Krawetz, "Kind of like that," The Hacker Factor Blog, 2013. [Online]. Available: https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html'),
    28: ("silverman1986", 'B. W. Silverman, *Density Estimation for Statistics and Data Analysis*. London: Chapman & Hall, 1986.'),
    29: ("cohen1988", 'J. Cohen, *Statistical Power Analysis for the Behavioral Sciences*, 2nd ed. Hillsdale, NJ: Lawrence Erlbaum, 1988.'),
    30: ("wang2004", 'Z. Wang, A. C. Bovik, H. R. Sheikh, and E. P. Simoncelli, "Image quality assessment: From error visibility to structural similarity," *IEEE Trans. Image Process.*, vol. 13, no. 4, pp. 600–612, 2004.'),
    31: ("carcello2013", 'J. V. Carcello and C. Li, "Costs and benefits of requiring an engagement partner signature: Recent experience in the United Kingdom," *The Accounting Review*, vol. 88, no. 5, pp. 1511–1546, 2013.'),
    32: ("blay2014", 'A. D. Blay, M. Notbohm, C. Schelleman, and A. Valencia, "Audit quality effects of an individual audit engagement partner signature mandate," *Int. J. Auditing*, vol. 18, no. 3, pp. 172–192, 2014.'),
    33: ("chi2009", 'W. Chi, H. Huang, Y. Liao, and H. Xie, "Mandatory audit partner rotation, audit quality, and market perception: Evidence from Taiwan," *Contemp. Account. Res.*, vol. 26, no. 2, pp. 359–391, 2009.'),
    34: ("redmon2016", 'J. Redmon, S. Divvala, R. Girshick, and A. Farhadi, "You only look once: Unified, real-time object detection," in *Proc. CVPR*, 2016, pp. 779–788.'),
    35: ("vlm_survey", 'J. Zhang, J. Huang, S. Jin, and S. Lu, "Vision-language models for vision tasks: A survey," *IEEE Trans. Pattern Anal. Mach. Intell.*, vol. 46, no. 8, pp. 5625–5644, 2024.'),
    36: ("mann1947", 'H. B. Mann and D. R. Whitney, "On a test of whether one of two random variables is stochastically larger than the other," *Ann. Math. Statist.*, vol. 18, no. 1, pp. 50–60, 1947.'),
}
# === LINE-SPECIFIC REPLACEMENTS PER FILE ===
# Each entry: (unique_context_string, old_text, new_text)
# Replacements for the introduction section.
# Tuple layout: (context that must be present, text to replace, replacement).
INTRO_FIXES = [
    # Intro line 16 -- signature-verification range shifts up by one because
    # [2] is now Yen et al.
    (
        "offline signature verification [2]--[7]",
        "offline signature verification [2]--[7]",
        "offline signature verification [3]--[8]",
    ),
    # Intro line 23 -- Woodruff et al.
    (
        "Woodruff et al. [8]",
        "Woodruff et al. [8]",
        "Woodruff et al. [9]",
    ),
    # Intro line 24 -- copy-move forgery detection references.
    (
        "Copy-move forgery detection methods [9], [10]",
        "methods [9], [10]",
        "methods [10], [11]",
    ),
    # Intro line 25 -- perceptual hashing + deep learning references.
    (
        "perceptual hashing combined with deep learning [11], [12]",
        "deep learning [11], [12]",
        "deep learning [12], [13]",
    ),
    # Intro line 28 -- wording fix (pHash -> dHash) in the pipeline description;
    # not a citation number change.
    (
        "perceptual hash (pHash) distance",
        "perceptual hash (pHash) distance",
        "difference hash (dHash) distance",
    ),
]
# Replacements for the related-work section.
# Tuple layout: (context that must be present, text to replace, replacement).
# The "line N" notes refer to line numbers in the related-work markdown file.
RW_FIXES = [
    ("Hafemann et al. [24]", "et al. [24]", "et al. [14]"),  # line 7: Hafemann 2017
    ("Zois et al. [26]", "et al. [26]", "et al. [15]"),  # line 12: Zois
    ("Hafemann et al. [25]", "et al. [25]", "et al. [16]"),  # line 13: Hafemann 2019
    ("Brimoh and Olisah [7]", "Olisah [7]", "Olisah [8]"),  # line 18: Brimoh, wrongly [7]
    ("manipulated visual content [27]", "content [27]", "content [17]"),  # line 23: Farid
    ("forgery detection [28]", "detection [28]", "detection [18]"),  # line 23: Mehrjardi
    ("manipulated photographs [10]", "photographs [10]", "photographs [11]"),  # line 24: CMFD survey
    ("Abramova and Bohme [11]", "Bohme [11]", "Bohme [10]"),  # line 25: Abramova, [11] -> [10]
    ("Woodruff et al. [8]", "et al. [8]", "et al. [9]"),  # line 27: Woodruff, [8] -> [9]
    ("Pizzi et al. [12]", "et al. [12]", "et al. [13]"),  # line 31: Pizzi, [12] -> [13]
    ("substantive content changes [13]", "changes [13]", "changes [19]"),  # line 36: pHash survey, [13] -> [19]
    ("Jakhar and Borah [11]", "Borah [11]", "Borah [12]"),  # line 39: Jakhar, [11] -> [12]
    ("Engin et al. [14]", "et al. [14]", "et al. [20]"),  # line 47: Engin, [14] -> [20]
    ("Tsourounis et al. [15]", "et al. [15]", "et al. [21]"),  # line 48: Tsourounis, [15] -> [21]
    ("Chamakh and Bounouh [16]", "Bounouh [16]", "Bounouh [22]"),  # line 49: Chamakh, [16] -> [22]
    ("Babenko et al. [29]", "et al. [29]", "et al. [23]"),  # line 51: Babenko, [29] -> [23]
]
# Replacements for the methodology section.
# Tuple layout: (context that must be present, text to replace, replacement).
# The "line N" notes refer to line numbers in the methodology markdown file.
METH_FIXES = [
    ("parameters) [17]", ") [17]", ") [24]"),  # line 40: Qwen, [17] -> [24]
    ("(nano variant) [18]", "variant) [18]", "variant) [25]"),  # line 53: YOLO, [18] -> [25]
    ("neural network [19]", "network [19]", "network [26]"),  # line 75: ResNet, [19] -> [26]
    # line 81: Engin and Tsourounis, [14], [15] -> [20], [21]
    ("document analysis tasks [14], [15]", "tasks [14], [15]", "tasks [20], [21]"),
    ("(dHash) [36]", ") [36]", ") [27]"),  # line 98: Krawetz dHash, [36] -> [27]
    # line 101: pHash survey, [14] -> [19]
    ("scan-induced variations [14]", "variations [14]", "variations [19]"),
    ("(KDE) [33]", ") [33]", ") [28]"),  # line 122: Silverman KDE, [33] -> [28]
]
# Replacements for the results section.
# Tuple layout: (context that must be present, text to replace, replacement).
RESULTS_FIXES = [
    (
        "effect size [34]",
        "size [34]",
        "size [29]",
    ),  # Cohen's d citation, [34] -> [29]
]
# Replacements for the discussion section.
# Tuple layout: (context that must be present, text to replace, replacement).
DISCUSSION_FIXES = [
    # Engin / Tsourounis / Chamakh range, [14]--[16] -> [20]--[22]
    ("prior literature [14]--[16]", "literature [14]--[16]", "literature [20]--[22]"),
]
def apply_fixes(filepath, fixes):
    """Apply context-anchored replacements to one file, rewriting it in place.

    Each fix is a ``(context, old, new)`` tuple. ``context`` pins down where
    the edit happens; the first occurrence of ``old`` *inside that context*
    is replaced by ``new``.

    Every context is located in the text exactly as it was read from disk,
    and all replacements are then applied in one pass. This matters for
    renumbering: a fix that turns ``[24]`` into ``[14]`` must not be picked
    up by a later fix that turns ``[14]`` into ``[20]``, and an ``old``
    snippet such as ``"et al. [14]"`` must not match an unrelated earlier
    occurrence outside its context.

    Args:
        filepath: ``pathlib.Path`` of the UTF-8 text file to edit.
        fixes: iterable of ``(context, old, new)`` string tuples.

    Returns:
        The number of fixes actually applied. Fixes whose context is missing,
        whose ``old`` text does not occur inside the context, or whose target
        span overlaps an earlier one are skipped with a printed warning.
    """
    source = filepath.read_text(encoding='utf-8')

    # Resolve every fix to an absolute (start, end, replacement) span against
    # the unmodified text, so fixes cannot see each other's output.
    spans = []
    for context, old, new in fixes:
        anchor = source.find(context)
        if anchor == -1:
            print(f" WARNING: context not found in {filepath.name}: {context[:60]}...")
            continue
        offset = context.find(old)
        if offset == -1:
            print(f" WARNING: old text not inside context in {filepath.name}: {context[:60]}...")
            continue
        start = anchor + offset
        spans.append((start, start + len(old), new))

    # Stitch the result together left to right; the sort is stable, so fixes
    # starting at the same position keep their list order.
    spans.sort(key=lambda span: span[0])
    pieces = []
    cursor = 0
    changes = 0
    for start, end, new in spans:
        if start < cursor:
            print(f" WARNING: overlapping fix skipped in {filepath.name}: {source[start:end][:60]}...")
            continue
        pieces.append(source[cursor:start])
        pieces.append(new)
        cursor = end
        changes += 1
    pieces.append(source[cursor:])

    filepath.write_text("".join(pieces), encoding='utf-8')
    print(f" {filepath.name}: {changes} fixes applied")
    return changes
def rewrite_references():
    """Regenerate paper_a_references.md from FINAL_REFS.

    Writes a markdown heading and a style comment, then one ``[n] citation``
    paragraph per entry in ascending number order, then a trailing comment
    with the reference count. Any existing file at that path under PAPER_DIR
    is overwritten.
    """
    header = (
        "# References\n\n"
        "<!-- IEEE numbered style, sequential by first appearance in text -->\n\n"
    )
    # Only the citation text (second tuple element) is emitted; the short key
    # is not written to the file.
    entries = "".join(
        f"[{number}] {FINAL_REFS[number][1]}\n\n" for number in sorted(FINAL_REFS)
    )
    footer = f"<!-- Total: {len(FINAL_REFS)} references -->\n"

    target = PAPER_DIR / "paper_a_references.md"
    target.write_text(header + entries + footer, encoding='utf-8')
    print(f" paper_a_references.md: rewritten with {len(FINAL_REFS)} references")
def main():
    """Run the renumbering: patch each section file, then rebuild the reference list."""
    print("Renumbering citations...\n")

    # Section files paired with their fix lists, processed in paper order.
    plan = (
        ("paper_a_introduction.md", INTRO_FIXES),
        ("paper_a_related_work.md", RW_FIXES),
        ("paper_a_methodology.md", METH_FIXES),
        ("paper_a_results.md", RESULTS_FIXES),
        ("paper_a_discussion.md", DISCUSSION_FIXES),
    )
    total = 0
    for filename, fixes in plan:
        total += apply_fixes(PAPER_DIR / filename, fixes)
    print(f"\nTotal fixes: {total}")

    print("\nRewriting references.md...")
    rewrite_references()

    print("\nDone! Verify with: grep -n '\\[.*\\]' paper/paper_a_*.md")


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()