# pdf_signature_extraction/paper/renumber_citations.py

#!/usr/bin/env python3
"""
Renumber all in-text citations to sequential order by first appearance.
Also rewrites references.md with the final numbering.
"""
import re
from pathlib import Path

# Directory containing the paper_a_*.md section files that this script edits
# in place and the paper_a_references.md file that it regenerates.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the script anywhere else.
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")

# === FINAL NUMBERING (by order of first appearance in paper) ===
# Format: new_number: (short_key, full_citation)
#   new_number    - the bracketed number emitted in the references file.
#   short_key     - a human-readable label for the entry; rewrite_references()
#                   unpacks it but does not write it anywhere.
#   full_citation - the citation text, written verbatim after "[new_number] ".
#                   Asterisks are part of the text (presumably Markdown
#                   emphasis for journal/book titles).
FINAL_REFS = {
    1:  ("cpa_act", 'Taiwan Certified Public Accountant Act (會計師法), Art. 4; FSC Attestation Regulations (查核簽證核准準則), Art. 6. Available: https://law.moj.gov.tw/ENG/LawClass/LawAll.aspx?pcode=G0400067'),
    2:  ("yen2013", 'S.-H. Yen, Y.-S. Chang, and H.-L. Chen, "Does the signature of a CPA matter? Evidence from Taiwan," *Res. Account. Regul.*, vol. 25, no. 2, pp. 230–235, 2013.'),
    3:  ("bromley1993", 'J. Bromley et al., "Signature verification using a Siamese time delay neural network," in *Proc. NeurIPS*, 1993.'),
    4:  ("dey2017", 'S. Dey et al., "SigNet: Convolutional Siamese network for writer independent offline signature verification," arXiv:1707.02131, 2017.'),
    5:  ("hadjadj2020", 'I. Hadjadj et al., "An offline signature verification method based on a single known sample and an explainable deep learning approach," *Appl. Sci.*, vol. 10, no. 11, p. 3716, 2020.'),
    6:  ("li2024", 'H. Li et al., "TransOSV: Offline signature verification with transformers," *Pattern Recognit.*, vol. 145, p. 109882, 2024.'),
    7:  ("tehsin2024", 'S. Tehsin et al., "Enhancing signature verification using triplet Siamese similarity networks in digital documents," *Mathematics*, vol. 12, no. 17, p. 2757, 2024.'),
    8:  ("brimoh2024", 'P. Brimoh and C. C. Olisah, "Consensus-threshold criterion for offline signature verification using CNN learned representations," arXiv:2401.03085, 2024.'),
    9:  ("woodruff2021", 'N. Woodruff et al., "Fully-automatic pipeline for document signature analysis to detect money laundering activities," arXiv:2107.14091, 2021.'),
    10: ("abramova2016", 'S. Abramova and R. Bohme, "Detecting copy-move forgeries in scanned text documents," in *Proc. Electronic Imaging*, 2016.'),
    11: ("cmfd_survey", 'Y. Li et al., "Copy-move forgery detection in digital image forensics: A survey," *Multimedia Tools Appl.*, 2024.'),
    12: ("jakhar2025", 'Y. Jakhar and M. D. Borah, "Effective near-duplicate image detection using perceptual hashing and deep learning," *Inf. Process. Manage.*, p. 104086, 2025.'),
    13: ("pizzi2022", 'E. Pizzi et al., "A self-supervised descriptor for image copy detection," in *Proc. CVPR*, 2022.'),
    14: ("hafemann2017", 'L. G. Hafemann, R. Sabourin, and L. S. Oliveira, "Learning features for offline handwritten signature verification using deep convolutional neural networks," *Pattern Recognit.*, vol. 70, pp. 163–176, 2017.'),
    15: ("zois2024", 'E. N. Zois, D. Tsourounis, and D. Kalivas, "Similarity distance learning on SPD manifold for writer independent offline signature verification," *IEEE Trans. Inf. Forensics Security*, vol. 19, pp. 1342–1356, 2024.'),
    16: ("hafemann2019", 'L. G. Hafemann, R. Sabourin, and L. S. Oliveira, "Meta-learning for fast classifier adaptation to new users of signature verification systems," *IEEE Trans. Inf. Forensics Security*, vol. 15, pp. 1735–1745, 2019.'),
    17: ("farid2009", 'H. Farid, "Image forgery detection," *IEEE Signal Process. Mag.*, vol. 26, no. 2, pp. 16–25, 2009.'),
    18: ("mehrjardi2023", 'F. Z. Mehrjardi, A. M. Latif, M. S. Zarchi, and R. Sheikhpour, "A survey on deep learning-based image forgery detection," *Pattern Recognit.*, vol. 144, art. no. 109778, 2023.'),
    19: ("phash_survey", 'J. Luo et al., "A survey of perceptual hashing for multimedia," *ACM Trans. Multimedia Comput. Commun. Appl.*, vol. 21, no. 7, 2025.'),
    20: ("engin2020", 'D. Engin et al., "Offline signature verification on real-world documents," in *Proc. CVPRW*, 2020.'),
    21: ("tsourounis2022", 'D. Tsourounis et al., "From text to signatures: Knowledge transfer for efficient deep feature learning in offline signature verification," *Expert Syst. Appl.*, 2022.'),
    22: ("chamakh2025", 'B. Chamakh and O. Bounouh, "A unified ResNet18-based approach for offline signature classification and verification," *Procedia Comput. Sci.*, vol. 270, 2025.'),
    23: ("babenko2014", 'A. Babenko, A. Slesarev, A. Chigorin, and V. Lempitsky, "Neural codes for image retrieval," in *Proc. ECCV*, 2014, pp. 584–599.'),
    24: ("qwen2025", 'Qwen2.5-VL Technical Report, Alibaba Group, 2025.'),
    25: ("yolov11", 'Ultralytics, "YOLOv11 documentation," 2024. [Online]. Available: https://docs.ultralytics.com/'),
    26: ("he2016", 'K. He, X. Zhang, S. Ren, and J. Sun, "Deep residual learning for image recognition," in *Proc. CVPR*, 2016.'),
    27: ("krawetz2013", 'N. Krawetz, "Kind of like that," The Hacker Factor Blog, 2013. [Online]. Available: https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html'),
    28: ("silverman1986", 'B. W. Silverman, *Density Estimation for Statistics and Data Analysis*. London: Chapman & Hall, 1986.'),
    29: ("cohen1988", 'J. Cohen, *Statistical Power Analysis for the Behavioral Sciences*, 2nd ed. Hillsdale, NJ: Lawrence Erlbaum, 1988.'),
    30: ("wang2004", 'Z. Wang, A. C. Bovik, H. R. Sheikh, and E. P. Simoncelli, "Image quality assessment: From error visibility to structural similarity," *IEEE Trans. Image Process.*, vol. 13, no. 4, pp. 600–612, 2004.'),
    31: ("carcello2013", 'J. V. Carcello and C. Li, "Costs and benefits of requiring an engagement partner signature: Recent experience in the United Kingdom," *The Accounting Review*, vol. 88, no. 5, pp. 1511–1546, 2013.'),
    32: ("blay2014", 'A. D. Blay, M. Notbohm, C. Schelleman, and A. Valencia, "Audit quality effects of an individual audit engagement partner signature mandate," *Int. J. Auditing*, vol. 18, no. 3, pp. 172–192, 2014.'),
    33: ("chi2009", 'W. Chi, H. Huang, Y. Liao, and H. Xie, "Mandatory audit partner rotation, audit quality, and market perception: Evidence from Taiwan," *Contemp. Account. Res.*, vol. 26, no. 2, pp. 359–391, 2009.'),
    34: ("redmon2016", 'J. Redmon, S. Divvala, R. Girshick, and A. Farhadi, "You only look once: Unified, real-time object detection," in *Proc. CVPR*, 2016, pp. 779–788.'),
    35: ("vlm_survey", 'J. Zhang, J. Huang, S. Jin, and S. Lu, "Vision-language models for vision tasks: A survey," *IEEE Trans. Pattern Anal. Mach. Intell.*, vol. 46, no. 8, pp. 5625–5644, 2024.'),
    36: ("mann1947", 'H. B. Mann and D. R. Whitney, "On a test of whether one of two random variables is stochastically larger than the other," *Ann. Math. Statist.*, vol. 18, no. 1, pp. 50–60, 1947.'),
}

# === LINE-SPECIFIC REPLACEMENTS PER FILE ===
# Each entry: (unique_context_string, old_text, new_text)
#   unique_context_string - text that must be present in the section file for
#                           the fix to be attempted at all.
#   old_text              - the fragment to be replaced; in every entry below
#                           it is a substring of the context string.
#   new_text              - the replacement for old_text.
# The "Line N" notes presumably refer to line numbers in the target markdown
# file at the time the fixes were written — TODO confirm.

# Fixes for paper_a_introduction.md (see main()).
INTRO_FIXES = [
    # Line 16: SV range should start at [3] not [2] (since [2] is Yen)
    ("offline signature verification [2]--[7]",
     "offline signature verification [2]--[7]",
     "offline signature verification [3]--[8]"),
    # Line 23: Woodruff
    ("Woodruff et al. [8]",
     "Woodruff et al. [8]",
     "Woodruff et al. [9]"),
    # Line 24: CMFD refs
    ("Copy-move forgery detection methods [9], [10]",
     "methods [9], [10]",
     "methods [10], [11]"),
    # Line 25: pHash+DL refs
    ("perceptual hashing combined with deep learning [11], [12]",
     "deep learning [11], [12]",
     "deep learning [12], [13]"),
    # Line 28: pHash -> dHash in pipeline description
    # (a wording correction, not a citation renumbering)
    ("perceptual hash (pHash) distance",
     "perceptual hash (pHash) distance",
     "difference hash (dHash) distance"),
]

# Fixes for paper_a_related_work.md (see main()).
# Each entry is (unique_context_string, old_text, new_text). Several new
# numbers here equal old numbers used by later entries (e.g. [14], [15]), so
# the author-name prefix in the context string is what tells the entries apart.
RW_FIXES = [
    # Line 7: Hafemann 2017
    ("Hafemann et al. [24]", "et al. [24]", "et al. [14]"),
    # Line 12: Zois
    ("Zois et al. [26]", "et al. [26]", "et al. [15]"),
    # Line 13: Hafemann 2019
    ("Hafemann et al. [25]", "et al. [25]", "et al. [16]"),
    # Line 18: Brimoh (wrongly [7], should be [8])
    ("Brimoh and Olisah [7]", "Olisah [7]", "Olisah [8]"),
    # Line 23: Farid
    ("manipulated visual content [27]", "content [27]", "content [17]"),
    # Line 23: Mehrjardi
    ("forgery detection [28]", "detection [28]", "detection [18]"),
    # Line 24: CMFD survey
    ("manipulated photographs [10]", "photographs [10]", "photographs [11]"),
    # Line 25: Abramova (was [11], should be [10])
    ("Abramova and Bohme [11]", "Bohme [11]", "Bohme [10]"),
    # Line 27: Woodruff (was [8], should be [9])
    ("Woodruff et al. [8]", "et al. [8]", "et al. [9]"),
    # Line 31: Pizzi (was [12], should be [13])
    ("Pizzi et al. [12]", "et al. [12]", "et al. [13]"),
    # Line 36: pHash survey (was [13], should be [19])
    ("substantive content changes [13]", "changes [13]", "changes [19]"),
    # Line 39: Jakhar (was [11], should be [12])
    ("Jakhar and Borah [11]", "Borah [11]", "Borah [12]"),
    # Line 47: Engin (was [14], should be [20])
    ("Engin et al. [14]", "et al. [14]", "et al. [20]"),
    # Line 48: Tsourounis (was [15], should be [21])
    ("Tsourounis et al. [15]", "et al. [15]", "et al. [21]"),
    # Line 49: Chamakh (was [16], should be [22])
    ("Chamakh and Bounouh [16]", "Bounouh [16]", "Bounouh [22]"),
    # Line 51: Babenko (was [29], should be [23])
    ("Babenko et al. [29]", "et al. [29]", "et al. [23]"),
]

# Fixes for paper_a_methodology.md (see main()).
# Each entry is (unique_context_string, old_text, new_text).
METH_FIXES = [
    # Line 40: Qwen (was [17], should be [24])
    ("parameters) [17]", ") [17]", ") [24]"),
    # Line 53: YOLO (was [18], should be [25])
    ("(nano variant) [18]", "variant) [18]", "variant) [25]"),
    # Line 75: ResNet (was [19], should be [26])
    ("neural network [19]", "network [19]", "network [26]"),
    # Line 81: Engin, Tsourounis (was [14], [15], should be [20], [21])
    ("document analysis tasks [14], [15]",
     "tasks [14], [15]",
     "tasks [20], [21]"),
    # Line 98: Krawetz dHash (was [36], should be [27])
    ("(dHash) [36]", ") [36]", ") [27]"),
    # Line 101: pHash survey ref (was [14], should be [19])
    ("scan-induced variations [14]",
     "variations [14]",
     "variations [19]"),
    # Line 122: Silverman KDE (was [33], should be [28])
    ("(KDE) [33]", ") [33]", ") [28]"),
]

# Fixes for paper_a_results.md (see main()).
# Each entry is (unique_context_string, old_text, new_text).
RESULTS_FIXES = [
    # Cohen's d citation (was [34], should be [29])
    ("effect size [34]", "size [34]", "size [29]"),
]

# Fixes for paper_a_discussion.md (see main()).
# Each entry is (unique_context_string, old_text, new_text).
DISCUSSION_FIXES = [
    # Engin/Tsourounis/Chamakh range (was [14]--[16], should be [20]--[22])
    ("prior literature [14]--[16]",
     "literature [14]--[16]",
     "literature [20]--[22]"),
]


def apply_fixes(filepath, fixes):
    """Apply context-anchored text replacements to one file, in place.

    Every fix is located against the file's ORIGINAL contents and the
    replacement is confined to the occurrence of ``old`` that lies inside
    the first occurrence of ``context``. This matters for citation
    renumbering, where one fix's new number is often another fix's old
    number: replacing "the first ``old`` anywhere in the file" (or applying
    fixes one after another to already-modified text) would rewrite the
    wrong citation.

    Args:
        filepath: ``pathlib.Path``-like object providing ``read_text``,
            ``write_text`` and ``name``.
        fixes: iterable of ``(context, old, new)`` string triples. ``old``
            is expected to be a substring of ``context``.

    Returns:
        The number of fixes actually applied. Fixes whose context is absent,
        whose ``old`` text is not part of the context, or whose target
        overlaps an earlier fix are skipped with a printed warning.
    """
    original = filepath.read_text(encoding='utf-8')

    # (start, end, replacement) spans, all expressed in offsets of `original`.
    edits = []
    for context, old, new in fixes:
        anchor = original.find(context)
        if anchor == -1:
            print(f"  WARNING: context not found in {filepath.name}: {context[:60]}...")
            continue

        offset = context.find(old)
        if offset == -1:
            # Without an anchor inside the context there is no safe way to
            # decide which occurrence of `old` was meant.
            print(f"  WARNING: old text not inside context in {filepath.name}: {old[:60]}...")
            continue

        start = anchor + offset
        end = start + len(old)
        if any(start < prev_end and prev_start < end
               for prev_start, prev_end, _ in edits):
            print(f"  WARNING: fix overlaps an earlier fix in {filepath.name}: {context[:60]}...")
            continue
        edits.append((start, end, new))

    # Stitch the result together left to right; offsets stay valid because
    # they all refer to the untouched original text.
    pieces = []
    cursor = 0
    for start, end, new in sorted(edits):
        pieces.append(original[cursor:start])
        pieces.append(new)
        cursor = end
    pieces.append(original[cursor:])

    filepath.write_text("".join(pieces), encoding='utf-8')
    changes = len(edits)
    print(f"  {filepath.name}: {changes} fixes applied")
    return changes


def rewrite_references():
    """Regenerate paper_a_references.md from FINAL_REFS.

    The file is overwritten with a "# References" heading, a style note,
    one "[n] citation" paragraph per entry in ascending numeric order, and
    a trailing comment giving the total count.
    """
    ref_count = len(FINAL_REFS)

    parts = [
        "# References\n\n",
        "<!-- IEEE numbered style, sequential by first appearance in text -->\n\n",
    ]
    for number in sorted(FINAL_REFS):
        # The short key is only a label for maintainers; it is not emitted.
        _short_key, citation_text = FINAL_REFS[number]
        parts.append(f"[{number}] {citation_text}\n\n")
    parts.append(f"<!-- Total: {ref_count} references -->\n")

    target = PAPER_DIR / "paper_a_references.md"
    target.write_text("".join(parts), encoding='utf-8')
    print(f"  paper_a_references.md: rewritten with {ref_count} references")

def main():
    """Run every per-section fix list, then regenerate the reference list.

    Sections are processed in paper order (introduction, related work,
    methodology, results, discussion); the combined number of applied fixes
    is printed before the references file is rewritten.
    """
    print("Renumbering citations...\n")

    # (section file name, fix list) pairs, in the order they are processed.
    section_fixes = (
        ("paper_a_introduction.md", INTRO_FIXES),
        ("paper_a_related_work.md", RW_FIXES),
        ("paper_a_methodology.md", METH_FIXES),
        ("paper_a_results.md", RESULTS_FIXES),
        ("paper_a_discussion.md", DISCUSSION_FIXES),
    )
    total = 0
    for section_name, section_fix_list in section_fixes:
        total += apply_fixes(PAPER_DIR / section_name, section_fix_list)

    print(f"\nTotal fixes: {total}")

    print("\nRewriting references.md...")
    rewrite_references()

    print("\nDone! Verify with: grep -n '\\[.*\\]' paper/paper_a_*.md")


# Run the renumbering only when executed as a script, not when imported.
if __name__ == "__main__":
    main()