From 6d5a9c70c53c44a0dfb71d635adfe3063c6549fe Mon Sep 17 00:00:00 2001
From: Nagi-ovo <13264500190@163.com>
Date: Thu, 15 Aug 2024 13:39:10 +0800
Subject: [PATCH] refactor: integrate core functionality into CHSIConverter
 class

---
 add_float_picture.py | 103 -----------------
 app.py               | 113 +-----------------
 extract_img.py       |  28 -----
 extract_info.py      | 111 ------------------
 utils.py             | 267 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 272 insertions(+), 350 deletions(-)
 delete mode 100644 add_float_picture.py
 delete mode 100644 extract_img.py
 delete mode 100644 extract_info.py
 create mode 100644 utils.py

diff --git a/add_float_picture.py b/add_float_picture.py
deleted file mode 100644
index 8f3367b..0000000
--- a/add_float_picture.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# -*- coding: utf-8 -*-
- 
-# filename: add_float_picture.py
- 
-'''
-Implement floating image based on python-docx.
-- Text wrapping style: BEHIND TEXT <wp:anchor behindDoc="1">
-- Picture position: top-left corner of PAGE `<wp:positionH relativeFrom="page">`.
-Create a docx sample (Layout | Positions | More Layout Options) and explore the 
-source xml (Open as a zip | word | document.xml) to implement other text wrapping
-styles and position modes per `CT_Anchor._anchor_xml()`.
-'''
- 
-from docx.oxml import parse_xml, register_element_cls
-from docx.oxml.ns import nsdecls
-from docx.oxml.shape import CT_Picture
-from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne
-from flask import make_response
- 
-# refer to docx.oxml.shape.CT_Inline
-class CT_Anchor(BaseOxmlElement):
-    """
-    ``<w:anchor>`` element, container for a floating image.
-    """
-    extent = OneAndOnlyOne('wp:extent')
-    docPr = OneAndOnlyOne('wp:docPr')
-    graphic = OneAndOnlyOne('a:graphic')
- 
-    @classmethod
-    def new(cls, cx, cy, shape_id, pic, pos_x, pos_y):
-        """
-        Return a new ``<wp:anchor>`` element populated with the values passed
-        as parameters.
-        """
-        anchor = parse_xml(cls._anchor_xml(pos_x, pos_y))
-        anchor.extent.cx = cx
-        anchor.extent.cy = cy
-        anchor.docPr.id = shape_id
-        anchor.docPr.name = 'Picture %d' % shape_id
-        anchor.graphic.graphicData.uri = (
-            'http://schemas.openxmlformats.org/drawingml/2006/picture'
-        )
-        anchor.graphic.graphicData._insert_pic(pic)
-        return anchor
- 
-    @classmethod
-    def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y):
-        """
-        Return a new `wp:anchor` element containing the `pic:pic` element
-        specified by the argument values.
-        """
-        pic_id = 0  # Word doesn't seem to use this, but does not omit it
-        pic = CT_Picture.new(pic_id, filename, rId, cx, cy)
-        anchor = cls.new(cx, cy, shape_id, pic, pos_x, pos_y)
-        anchor.graphic.graphicData._insert_pic(pic)
-        return anchor
-    @classmethod
-    def _anchor_xml(cls, pos_x, pos_y):
-        return (
-            '<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="0" \n'
-            '           behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1" \n'
-            '           %s>\n'
-            '  <wp:simplePos x="0" y="0"/>\n'
-            '  <wp:positionH relativeFrom="page">\n'
-            '    <wp:posOffset>%d</wp:posOffset>\n'
-            '  </wp:positionH>\n'
-            '  <wp:positionV relativeFrom="page">\n'
-            '    <wp:posOffset>%d</wp:posOffset>\n'
-            '  </wp:positionV>\n'                    
-            '  <wp:extent cx="914400" cy="914400"/>\n'
-            '  <wp:wrapNone/>\n'
-            '  <wp:docPr id="666" name="unnamed"/>\n'
-            '  <wp:cNvGraphicFramePr>\n'
-            '    <a:graphicFrameLocks noChangeAspect="1"/>\n'
-            '  </wp:cNvGraphicFramePr>\n'
-            '  <a:graphic>\n'
-            '    <a:graphicData uri="URI not set"/>\n'
-            '  </a:graphic>\n'
-            '</wp:anchor>' % ( nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y) )
-        )
-# refer to docx.parts.story.BaseStoryPart.new_pic_inline
-def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y):
-    """Return a newly-created `w:anchor` element.
-    The element contains the image specified by *image_descriptor* and is scaled
-    based on the values of *width* and *height*.
-    """
-    rId, image = part.get_or_add_image(image_descriptor)
-    cx, cy = image.scaled_dimensions(width, height)
-    shape_id, filename = part.next_id, image.filename    
-    return CT_Anchor.new_pic_anchor(shape_id, rId, filename, cx, cy, pos_x, pos_y)
-# refer to docx.text.run.add_picture
-def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0):
-    try:
-        """Add float picture at fixed position `pos_x` and `pos_y` to the top-left point of page.
-        """
-        run = p.add_run()
-        anchor = new_pic_anchor(run.part, image_path_or_stream, width, height, pos_x, pos_y)
-        run._r.add_drawing(anchor)
-    except Exception as e:
-        return make_response("<script>alert('浮动图片添加时发生错误'); window.location.href = document.referrer;</script>")
-
-# refer to docx.oxml.__init__.py
-register_element_cls('wp:anchor', CT_Anchor)
\ No newline at end of file
diff --git a/app.py b/app.py
index 493735b..cfe5912 100644
--- a/app.py
+++ b/app.py
@@ -1,122 +1,19 @@
-from flask import Flask, request, render_template, send_from_directory, make_response
-from werkzeug.utils import secure_filename
+from flask import Flask, render_template
+from utils import CHSIConverter
 import os
-from add_float_picture import add_float_picture
-from extract_img import extract_image_from_pdf
-from extract_info import extract_info_from_pdf
-from docx import Document
-from docx.shared import Inches, Pt
-from docx.enum.table import WD_ALIGN_VERTICAL
-from docx.oxml import parse_xml
-import uuid
-import shutil
 
 app = Flask(__name__)
 
-def convert_to_docx(path):
-    try:
-        extracted_info = extract_info_from_pdf(path)
-        doc = Document("static/template.docx")
-
-        paragraph = doc.add_paragraph()
-        doc.element.body.insert(1, paragraph._element)
-        paragraph.alignment = 1
-        paragraph.add_run('Update date:' + extracted_info['Update Date'])
-
-        del extracted_info['Update Date']
-
-        table = doc.add_table(rows=1, cols=2)
-        table.autofit = False
-
-        for cell in table.columns[0].cells:
-            cell.width = Inches(0.5)
-        for cell in table.columns[1].cells:
-            cell.width = Inches(5.0)
-
-        border_xml = '<w:tcBorders xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">' \
-                    '<w:top w:val="nil"/>' \
-                    '<w:left w:val="nil"/>' \
-                    '<w:bottom w:val="nil"/>' \
-                    '<w:right w:val="nil"/>' \
-                    '</w:tcBorders>'
-
-        for key, value in extracted_info.items():
-            cells = table.add_row().cells
-            for cell in cells:
-                cell._element.get_or_add_tcPr().append(parse_xml(border_xml))
-                cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER
-
-            is_last = key == list(extracted_info.keys())[-1]
-            cells[0].text = key + ("" if is_last else "\n") 
-            cells[1].text = value + ("" if is_last else "\n")
-
-        cropped_image_1 = extract_image_from_pdf(path, 1, 1898, 583, 2230, 1026)
-        add_float_picture(doc.add_paragraph(), cropped_image_1, width=Inches(1.2), pos_x=Pt(430), pos_y=Pt(140))
-
-        cropped_image_2 = extract_image_from_pdf(path, 1, 300, 2690, 630, 2985)
-        add_float_picture(doc.add_paragraph(), cropped_image_2, width=Inches(1.2), pos_x=Pt(78), pos_y=Pt(643))
-
-        output_path = path.replace(".pdf", ".docx")
-        doc.save(output_path)
-
-        return output_path
-    
-    except Exception as e:
-        return make_response(f"<script>alert('Error during DOCX conversion: {e}'); window.location.href = document.referrer;</script>")
-
 @app.route('/')
 def home():
     return render_template('index.html')
 
 @app.route('/convert', methods=['POST'])
-def convert_file():
-    if 'file' not in request.files:
-        return make_response("<script>alert('缺少文件部分'); window.location.href = document.referrer;</script>")
-
-    file = request.files['file']
-    if file.filename == '':
-        return make_response("<script>alert('没有选中的文件'); window.location.href = document.referrer;</script>")
-
-    if not file.filename.lower().endswith('.pdf'):
-        return make_response("<script>alert('只接受 PDF 文件'); window.location.href = document.referrer;</script>")
-
-    if not file.filename.startswith('教育部学籍在线验证报告_'):
-        return make_response("<script>alert('请不要传入无关文件'); window.location.href = document.referrer;</script>")
-    
-    try:
-        filename = secure_filename(file.filename)
-        filepath = os.path.join(os.getcwd(), 'upload', filename)
-        file.save(filepath)
-
-        output_path = convert_to_docx(filepath)
-
-        directory = os.path.dirname(output_path)
-        filename = os.path.basename(output_path)
-        output_filename = str(uuid.uuid4()) + '.docx'
-
-        response = make_response(send_from_directory(directory, filename, as_attachment=True))
-        response.headers["Content-Disposition"] = f"attachment; filename={output_filename}"
-        
-        # 隐私处理
-        upload_folder = os.path.join(os.getcwd(), 'upload')
-        for filename in os.listdir(upload_folder):
-            if filename != '.gitkeep': 
-                file_path = os.path.join(upload_folder, filename)
-                try:
-                    if os.path.isfile(file_path) or os.path.islink(file_path):
-                        os.unlink(file_path)
-                    elif os.path.isdir(file_path):
-                        shutil.rmtree(file_path)
-                except Exception as e:
-                    print(f"Failed to delete {file_path}. Reason: {e}")
-        
-        return response
-    except Exception as e:
-        return make_response(f"<script>alert('处理文件时发生错误: {e}'); window.location.href = document.referrer;</script>")
+def handle_convert():
+    return CHSIConverter.convert_file()
 
 if __name__ == '__main__':
-    # debug_mode = os.getenv('FLASK_DEBUG', 'false').lower() == 'true'
     port = int(os.getenv('FLASK_PORT', 5001))
     app.run(debug=True, port=port, host='0.0.0.0')
 else:
-    application=app
+    application = app
\ No newline at end of file
diff --git a/extract_img.py b/extract_img.py
deleted file mode 100644
index cb9742d..0000000
--- a/extract_img.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from pdf2image import convert_from_path
-from flask import make_response
-import os
-
-def extract_image_from_pdf(path, page_number, left, top, right, bottom):
-    '''
-    pdf_path = " "
-    page_number = 1
-    left = 1898  # 左边界坐标
-    top = 583  # 上边界坐标
-    right = 2230  # 右边界坐标
-    bottom = 1026  # 下边界坐标
-    '''
-    try:
-        images = convert_from_path(path, dpi=300, first_page=page_number, last_page=page_number)
-        image = images[0]
-
-        cropped_image = image.crop((left, top, right, bottom))
-        file_name = os.path.splitext(path)[0]  # 获取文件的基本名称（不包括扩展名）
-        image_path = f"{file_name}_image.png"  # 拼接新的文件路径，包括正确的扩展名
-        cropped_image.save(image_path)  # 将截取的图片保存到同一路径下
-        return image_path
-    except Exception as e:
-        return make_response(f"<script>alert('从PDF提取图片错误: {e}'); window.location.href = document.referrer;</script>")
-
-
-
-
diff --git a/extract_info.py b/extract_info.py
deleted file mode 100644
index d21ae5d..0000000
--- a/extract_info.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import re
-from pypdf import PdfReader
-from pypinyin import lazy_pinyin
-from flask import make_response
-
-def extract_text_from_pdf(pdf_path):
-    try:
-        pdf_file_obj = open(pdf_path, 'rb')
-        pdf_reader = PdfReader(pdf_file_obj)
-
-        text = ""
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-
-        pdf_file_obj.close()
-        return text
-    except Exception as e:
-        return make_response(f"<script>alert('从PDF提取文本错误: {e}'); window.location.href = document.referrer;</script>")
-
-def extract_info(patterns_dict, text):
-    results = {}
-    for prop, pattern in patterns_dict.items():
-        match = re.search(pattern, text)
-        if match:
-            if prop == 'Name':
-                pinyin = lazy_pinyin(match.group(1))
-                first_char = pinyin[0].capitalize()
-                remaining_chars = ''.join(pinyin[1:]).capitalize()
-                results[prop] =  remaining_chars + ' ' + first_char
-            elif prop == 'Gender':
-                if match.group(1) == '男':
-                    results[prop] = 'Male'
-                elif match.group(1) == '女':
-                    results[prop] = 'Female'
-                else:
-                    results[prop] = None
-            elif prop == 'Ethnic':
-                value = match.group(1)  
-                pinyin_list = lazy_pinyin(value[:-1])  
-                results[prop] = ''.join(pinyin_list).title()
-            elif prop == 'Date of Birth' or prop == 'Date of Enrollment':
-                value = match.group(1)  
-                parts = value.split('年')
-                year = parts[0]
-                month_day = parts[1].split('月')
-                formatted_date = month_day[0] + '/' + month_day[1].replace('日', '') + '/' + year
-                results[prop] = formatted_date
-            elif prop == 'Levels' :
-                results[prop] = 'Undergraduate'
-            elif prop == 'Form' :
-                results[prop] = 'General full-time remote study'
-            elif prop == 'Educational System':
-                results[prop] = match.group(1)+' years'
-            elif prop == 'Type':
-                results[prop] = 'General higher education'
-            elif prop == 'School Status':
-                value = match.group(1)  
-                date_part = value.split("：")[1]  
-                parts = date_part.split('年')
-                year = parts[0]
-                month_day = parts[1].split('月')
-                day = month_day[1].replace('日', '').replace('）', '')  # Remove trailing bracket from the day
-                formatted_date = month_day[0] + '/' + day + '/' + year
-                results[prop] = 'Student registration (Expected graduation date: ' + formatted_date + ')'
-            elif prop == 'Update Date':
-                value = match.group(1)  
-                parts = value.split('年')
-                year = parts[0]
-                month_day = parts[1].split('月')
-                formatted_date = month_day[0] + '/' + month_day[1].replace('日', '') + '/' + year
-                results[prop] = formatted_date
-            else:
-                results[prop] = match.group(1)
-        else:
-            results[prop] = None
-    return results
-
-def extract_info_from_pdf(path):
-    text = extract_text_from_pdf(path)
-
-    def rc(pattern):
-        return re.compile(r'{}\s*([^\s]*)'.format(pattern))
-
-    patterns_dict = {
-        'Update Date': rc('更新日期：'),
-        'Name': rc('姓名'),
-        'Gender': rc('性别'),
-        'Id Number': rc('证件号码'),
-        'Ethnic': rc('民族'),
-        'Date of Birth': rc('出生日期 '),
-        'Institution': rc('院校'),
-        'Levels': rc('层次'),
-        'Faculties': rc('院系'),
-        'Class': rc('班级'),
-        'Major': rc('专业'),
-        'Student Number': rc('学号'),
-        'Form': rc('形式'),
-        'Date of Enrollment': rc('入学日期'),
-        'Educational System': rc('学制'),
-        'Type': rc('类型'),
-        'School Status': rc('学籍状态'),
-    }
-    
-    # 获取匹配的信息
-    results = extract_info(patterns_dict, text)
-    
-    # for prop, value in results.items():
-    #   print(f'{prop}: {value}')
-    
-    return results
-
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..996bd0b
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,267 @@
+from flask import request, make_response, send_from_directory
+from werkzeug.utils import secure_filename
+from docx import Document
+from docx.shared import Inches, Pt
+from docx.enum.table import WD_ALIGN_VERTICAL
+from docx.oxml import parse_xml, register_element_cls
+from docx.oxml.ns import nsdecls
+from docx.oxml.shape import CT_Picture
+from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne
+from pypdf import PdfReader
+from pypinyin import lazy_pinyin
+from pdf2image import convert_from_path
+import re
+import os
+import uuid
+import shutil
+
+class CT_Anchor(BaseOxmlElement):  # ``<wp:anchor>`` element, container for a floating image
+    extent = OneAndOnlyOne('wp:extent')
+    docPr = OneAndOnlyOne('wp:docPr')
+    graphic = OneAndOnlyOne('a:graphic')
+
+    @classmethod
+    def new(cls, cx, cy, shape_id, pic, pos_x, pos_y):  # return a new <wp:anchor> populated with the given values
+        anchor = parse_xml(cls._anchor_xml(pos_x, pos_y))
+        anchor.extent.cx = cx
+        anchor.extent.cy = cy
+        anchor.docPr.id = shape_id
+        anchor.docPr.name = f'Picture {shape_id}'
+        anchor.graphic.graphicData.uri = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
+        anchor.graphic.graphicData._insert_pic(pic)
+        return anchor
+
+    @classmethod
+    def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y):  # return a new <wp:anchor> holding a new pic:pic
+        pic_id = 0  # Word doesn't seem to use this, but does not omit it
+        pic = CT_Picture.new(pic_id, filename, rId, cx, cy)
+        anchor = cls.new(cx, cy, shape_id, pic, pos_x, pos_y)
+        anchor.graphic.graphicData._insert_pic(pic)  # NOTE(review): cls.new() already inserted pic; confirm this second call is needed
+        return anchor
+
+    @classmethod
+    def _anchor_xml(cls, pos_x, pos_y):  # XML template: behindDoc="1", wrapNone, offsets relative to the page
+        return (
+            '<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="0" \n'
+            '           behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1" \n'
+            '           %s>\n'
+            '  <wp:simplePos x="0" y="0"/>\n'
+            '  <wp:positionH relativeFrom="page">\n'
+            '    <wp:posOffset>%d</wp:posOffset>\n'
+            '  </wp:positionH>\n'
+            '  <wp:positionV relativeFrom="page">\n'
+            '    <wp:posOffset>%d</wp:posOffset>\n'
+            '  </wp:positionV>\n'
+            '  <wp:extent cx="914400" cy="914400"/>\n'
+            '  <wp:wrapNone/>\n'
+            '  <wp:docPr id="666" name="unnamed"/>\n'
+            '  <wp:cNvGraphicFramePr>\n'
+            '    <a:graphicFrameLocks noChangeAspect="1"/>\n'
+            '  </wp:cNvGraphicFramePr>\n'
+            '  <a:graphic>\n'
+            '    <a:graphicData uri="URI not set"/>\n'
+            '  </a:graphic>\n'
+            '</wp:anchor>' % (nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y))
+        )
+
+def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y):
+    """Return a new `wp:anchor` element holding the image *image_descriptor*, scaled per *width*/*height*."""
+    rId, image = part.get_or_add_image(image_descriptor)
+    cx, cy = image.scaled_dimensions(width, height)
+    return CT_Anchor.new_pic_anchor(part.next_id, rId, image.filename, cx, cy, pos_x, pos_y)
+
+class CHSIConverter:
+    @staticmethod
+    def extract_text_from_pdf(pdf_path):
+        """Return the text of every page of the PDF joined together, or an alert response on error."""
+        try:
+            with open(pdf_path, 'rb') as stream:
+                reader = PdfReader(stream)
+                return "".join(page.extract_text() for page in reader.pages)
+        except Exception as e:
+            return make_response(f"<script>alert('从PDF提取文本错误: {e}'); window.location.href = document.referrer;</script>")
+
+    @staticmethod
+    def extract_info(patterns_dict, text):
+        """Search *text* with each pattern and map the property to its English-formatted value (None if unmatched)."""
+        results = {}
+        for prop, pattern in patterns_dict.items():
+            match = re.search(pattern, text)
+            if match:
+                value = match.group(1)
+                if prop == 'Name':
+                    pinyin = lazy_pinyin(value)
+                    results[prop] = f"{''.join(pinyin[1:]).capitalize()} {pinyin[0].capitalize()}"
+                elif prop == 'Gender':
+                    results[prop] = 'Male' if value == '男' else 'Female' if value == '女' else None
+                elif prop == 'Ethnic':
+                    results[prop] = ''.join(lazy_pinyin(value[:-1])).title()
+                elif prop in ['Date of Birth', 'Date of Enrollment', 'Update Date']:
+                    year, rest = value.split('年')
+                    month, day = rest.split('月')
+                    results[prop] = f"{month}/{day.replace('日', '')}/{year}"
+                elif prop == 'Levels':
+                    results[prop] = 'Undergraduate'
+                elif prop == 'Form':
+                    results[prop] = 'General full-time remote study'
+                elif prop == 'Educational System':
+                    results[prop] = f'{value} years'
+                elif prop == 'Type':
+                    results[prop] = 'General higher education'
+                elif prop == 'School Status':
+                    date_part = value.split("：")[1]
+                    year, rest = date_part.split('年')
+                    month, day = rest.split('月')
+                    day = day.replace('日', '').replace('）', '')  # drop the trailing full-width bracket after the date
+                    results[prop] = f'Student registration (Expected graduation date: {month}/{day}/{year})'
+                else:
+                    results[prop] = value
+            else:
+                results[prop] = None
+        return results
+
+    @classmethod
+    def extract_info_from_pdf(cls, path):
+        """Read the text of the PDF at *path* and parse every known report field from it."""
+        patterns_dict = {prop: re.compile(r'{}\s*([^\s]*)'.format(label)) for prop, label in {
+            'Update Date': '更新日期：',
+            'Name': '姓名',
+            'Gender': '性别',
+            'Id Number': '证件号码',
+            'Ethnic': '民族',
+            'Date of Birth': '出生日期 ',
+            'Institution': '院校',
+            'Levels': '层次',
+            'Faculties': '院系',
+            'Class': '班级',
+            'Major': '专业',
+            'Student Number': '学号',
+            'Form': '形式',
+            'Date of Enrollment': '入学日期',
+            'Educational System': '学制',
+            'Type': '类型',
+            'School Status': '学籍状态',
+        }.items()}
+        return cls.extract_info(patterns_dict, cls.extract_text_from_pdf(path))
+
+    @staticmethod
+    def extract_image_from_pdf(path, page_number, left, top, right, bottom):
+        """Render page *page_number* of the PDF at 300 dpi, crop the (left, top, right, bottom) box and save it
+        as ``<path without extension>_image.png``; return that file path, or an alert response on error."""
+        try:
+            rendered = convert_from_path(path, dpi=300, first_page=page_number, last_page=page_number)
+            region = rendered[0].crop((left, top, right, bottom))
+            image_path = f"{os.path.splitext(path)[0]}_image.png"
+            region.save(image_path)
+            return image_path
+        except Exception as e:
+            return make_response(f"<script>alert('从PDF提取图片错误: {e}'); window.location.href = document.referrer;</script>")
+
+    @staticmethod
+    def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0):
+        """Add a floating picture to paragraph *p* at (`pos_x`, `pos_y`) from the page's top-left; alert response on error."""
+        try:
+            run = p.add_run()
+            run._r.add_drawing(new_pic_anchor(run.part, image_path_or_stream, width, height, pos_x, pos_y))
+        except Exception:
+            return make_response("<script>alert('浮动图片添加时发生错误'); window.location.href = document.referrer;</script>")
+
+    @classmethod
+    def convert_to_docx(cls, path):
+        """Build the DOCX from the template and the PDF at *path*; return the DOCX path, or an alert response on error."""
+        try:
+            extracted_info = cls.extract_info_from_pdf(path)
+            doc = Document("static/template.docx")
+
+            paragraph = doc.add_paragraph()
+            doc.element.body.insert(1, paragraph._element)
+            paragraph.alignment = 1
+            paragraph.add_run('Update date:' + extracted_info['Update Date'])
+            del extracted_info['Update Date']
+
+            table = doc.add_table(rows=1, cols=2)
+            table.autofit = False
+
+            for cell in table.columns[0].cells:
+                cell.width = Inches(0.5)
+            for cell in table.columns[1].cells:
+                cell.width = Inches(5.0)
+
+            border_xml = '<w:tcBorders xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">' \
+                        '<w:top w:val="nil"/>' \
+                        '<w:left w:val="nil"/>' \
+                        '<w:bottom w:val="nil"/>' \
+                        '<w:right w:val="nil"/>' \
+                        '</w:tcBorders>'
+
+            last_index = len(extracted_info) - 1  # computed once instead of rebuilding the key list for every row
+            for index, (key, value) in enumerate(extracted_info.items()):
+                cells = table.add_row().cells
+                for cell in cells:
+                    cell._element.get_or_add_tcPr().append(parse_xml(border_xml))
+                    cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER
+
+                spacer = "" if index == last_index else "\n"  # trailing newline on every row except the last
+                cells[0].text = key + spacer
+                cells[1].text = value + spacer
+
+            cropped_image_1 = cls.extract_image_from_pdf(path, 1, 1898, 583, 2230, 1026)
+            cls.add_float_picture(doc.add_paragraph(), cropped_image_1, width=Inches(1.2), pos_x=Pt(430), pos_y=Pt(140))
+
+            cropped_image_2 = cls.extract_image_from_pdf(path, 1, 300, 2690, 630, 2985)
+            cls.add_float_picture(doc.add_paragraph(), cropped_image_2, width=Inches(1.2), pos_x=Pt(78), pos_y=Pt(643))
+
+            # Swap only the extension: str.replace would rewrite ".pdf" anywhere in the path and miss ".PDF".
+            output_path = os.path.splitext(path)[0] + ".docx"
+            doc.save(output_path)
+            return output_path
+        except Exception as e:
+            return make_response(f"<script>alert('Error during DOCX conversion: {e}'); window.location.href = document.referrer;</script>")
+
+    @classmethod
+    def convert_file(cls):
+        """Validate the uploaded PDF, convert it and return the DOCX download (or an alert page on error)."""
+        if 'file' not in request.files:
+            return make_response("<script>alert('缺少文件部分'); window.location.href = document.referrer;</script>")
+
+        file = request.files['file']
+        if file.filename == '':
+            return make_response("<script>alert('没有选中的文件'); window.location.href = document.referrer;</script>")
+
+        if not file.filename.lower().endswith('.pdf'):
+            return make_response("<script>alert('只接受 PDF 文件'); window.location.href = document.referrer;</script>")
+
+        if not file.filename.startswith('教育部学籍在线验证报告_'):
+            return make_response("<script>alert('请不要传入无关文件'); window.location.href = document.referrer;</script>")
+
+        upload_folder = os.path.join(os.getcwd(), 'upload')
+        try:
+            filepath = os.path.join(upload_folder, secure_filename(file.filename))
+            file.save(filepath)
+            output_path = cls.convert_to_docx(filepath)
+            if not isinstance(output_path, str):
+                return output_path  # conversion failed: forward its alert response instead of using it as a path
+            directory, filename = os.path.split(output_path)
+            response = make_response(send_from_directory(directory, filename, as_attachment=True))
+            response.headers["Content-Disposition"] = f"attachment; filename={uuid.uuid4()}.docx"
+            return response
+        except Exception as e:
+            return make_response(f"<script>alert('处理文件时发生错误: {e}'); window.location.href = document.referrer;</script>")
+        finally:
+            # Privacy: purge uploads and generated files even when the conversion failed.
+            try:
+                leftovers = os.listdir(upload_folder)
+            except OSError:
+                leftovers = []
+            for entry in leftovers:
+                if entry != '.gitkeep':
+                    file_path = os.path.join(upload_folder, entry)
+                    try:
+                        if os.path.isfile(file_path) or os.path.islink(file_path):
+                            os.unlink(file_path)
+                        elif os.path.isdir(file_path):
+                            shutil.rmtree(file_path)
+                    except Exception as e:
+                        print(f"Failed to delete {file_path}. Reason: {e}")
+
+register_element_cls('wp:anchor', CT_Anchor)  # register CT_Anchor as the element class for `wp:anchor` (refer to docx.oxml.__init__.py)
\ No newline at end of file