From 6d5a9c70c53c44a0dfb71d635adfe3063c6549fe Mon Sep 17 00:00:00 2001 From: Nagi-ovo <13264500190@163.com> Date: Thu, 15 Aug 2024 13:39:10 +0800 Subject: [PATCH] refactor: integrate core functionality into CHSIConverter class --- add_float_picture.py | 103 ----------------- app.py | 113 +----------------- extract_img.py | 28 ----- extract_info.py | 111 ------------------ utils.py | 267 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 272 insertions(+), 350 deletions(-) delete mode 100644 add_float_picture.py delete mode 100644 extract_img.py delete mode 100644 extract_info.py create mode 100644 utils.py diff --git a/add_float_picture.py b/add_float_picture.py deleted file mode 100644 index 8f3367b..0000000 --- a/add_float_picture.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# filename: add_float_picture.py - -''' -Implement floating image based on python-docx. -- Text wrapping style: BEHIND TEXT -- Picture position: top-left corner of PAGE ``. -Create a docx sample (Layout | Positions | More Layout Options) and explore the -source xml (Open as a zip | word | document.xml) to implement other text wrapping -styles and position modes per `CT_Anchor._anchor_xml()`. -''' - -from docx.oxml import parse_xml, register_element_cls -from docx.oxml.ns import nsdecls -from docx.oxml.shape import CT_Picture -from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne -from flask import make_response - -# refer to docx.oxml.shape.CT_Inline -class CT_Anchor(BaseOxmlElement): - """ - ```` element, container for a floating image. - """ - extent = OneAndOnlyOne('wp:extent') - docPr = OneAndOnlyOne('wp:docPr') - graphic = OneAndOnlyOne('a:graphic') - - @classmethod - def new(cls, cx, cy, shape_id, pic, pos_x, pos_y): - """ - Return a new ```` element populated with the values passed - as parameters. - """ - anchor = parse_xml(cls._anchor_xml(pos_x, pos_y)) - anchor.extent.cx = cx - anchor.extent.cy = cy - anchor.docPr.id = shape_id - anchor.docPr.name = 'Picture %d' % shape_id - anchor.graphic.graphicData.uri = ( - 'http://schemas.openxmlformats.org/drawingml/2006/picture' - ) - anchor.graphic.graphicData._insert_pic(pic) - return anchor - - @classmethod - def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y): - """ - Return a new `wp:anchor` element containing the `pic:pic` element - specified by the argument values. - """ - pic_id = 0 # Word doesn't seem to use this, but does not omit it - pic = CT_Picture.new(pic_id, filename, rId, cx, cy) - anchor = cls.new(cx, cy, shape_id, pic, pos_x, pos_y) - anchor.graphic.graphicData._insert_pic(pic) - return anchor - @classmethod - def _anchor_xml(cls, pos_x, pos_y): - return ( - '\n' - ' \n' - ' \n' - ' %d\n' - ' \n' - ' \n' - ' %d\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '' % ( nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y) ) - ) -# refer to docx.parts.story.BaseStoryPart.new_pic_inline -def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y): - """Return a newly-created `w:anchor` element. - The element contains the image specified by *image_descriptor* and is scaled - based on the values of *width* and *height*. - """ - rId, image = part.get_or_add_image(image_descriptor) - cx, cy = image.scaled_dimensions(width, height) - shape_id, filename = part.next_id, image.filename - return CT_Anchor.new_pic_anchor(shape_id, rId, filename, cx, cy, pos_x, pos_y) -# refer to docx.text.run.add_picture -def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0): - try: - """Add float picture at fixed position `pos_x` and `pos_y` to the top-left point of page. - """ - run = p.add_run() - anchor = new_pic_anchor(run.part, image_path_or_stream, width, height, pos_x, pos_y) - run._r.add_drawing(anchor) - except Exception as e: - return make_response("") - -# refer to docx.oxml.__init__.py -register_element_cls('wp:anchor', CT_Anchor) \ No newline at end of file diff --git a/app.py b/app.py index 493735b..cfe5912 100644 --- a/app.py +++ b/app.py @@ -1,122 +1,19 @@ -from flask import Flask, request, render_template, send_from_directory, make_response -from werkzeug.utils import secure_filename +from flask import Flask, render_template +from utils import CHSIConverter import os -from add_float_picture import add_float_picture -from extract_img import extract_image_from_pdf -from extract_info import extract_info_from_pdf -from docx import Document -from docx.shared import Inches, Pt -from docx.enum.table import WD_ALIGN_VERTICAL -from docx.oxml import parse_xml -import uuid -import shutil app = Flask(__name__) -def convert_to_docx(path): - try: - extracted_info = extract_info_from_pdf(path) - doc = Document("static/template.docx") - - paragraph = doc.add_paragraph() - doc.element.body.insert(1, paragraph._element) - paragraph.alignment = 1 - paragraph.add_run('Update date:' + extracted_info['Update Date']) - - del extracted_info['Update Date'] - - table = doc.add_table(rows=1, cols=2) - table.autofit = False - - for cell in table.columns[0].cells: - cell.width = Inches(0.5) - for cell in table.columns[1].cells: - cell.width = Inches(5.0) - - border_xml = '' \ - '' \ - '' \ - '' \ - '' \ - '' - - for key, value in extracted_info.items(): - cells = table.add_row().cells - for cell in cells: - cell._element.get_or_add_tcPr().append(parse_xml(border_xml)) - cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER - - is_last = key == list(extracted_info.keys())[-1] - cells[0].text = key + ("" if is_last else "\n") - cells[1].text = value + ("" if is_last else "\n") - - cropped_image_1 = extract_image_from_pdf(path, 1, 1898, 583, 2230, 1026) - add_float_picture(doc.add_paragraph(), cropped_image_1, width=Inches(1.2), pos_x=Pt(430), pos_y=Pt(140)) - - cropped_image_2 = extract_image_from_pdf(path, 1, 300, 2690, 630, 2985) - add_float_picture(doc.add_paragraph(), cropped_image_2, width=Inches(1.2), pos_x=Pt(78), pos_y=Pt(643)) - - output_path = path.replace(".pdf", ".docx") - doc.save(output_path) - - return output_path - - except Exception as e: - return make_response(f"") - @app.route('/') def home(): return render_template('index.html') @app.route('/convert', methods=['POST']) -def convert_file(): - if 'file' not in request.files: - return make_response("") - - file = request.files['file'] - if file.filename == '': - return make_response("") - - if not file.filename.lower().endswith('.pdf'): - return make_response("") - - if not file.filename.startswith('教育部学籍在线验证报告_'): - return make_response("") - - try: - filename = secure_filename(file.filename) - filepath = os.path.join(os.getcwd(), 'upload', filename) - file.save(filepath) - - output_path = convert_to_docx(filepath) - - directory = os.path.dirname(output_path) - filename = os.path.basename(output_path) - output_filename = str(uuid.uuid4()) + '.docx' - - response = make_response(send_from_directory(directory, filename, as_attachment=True)) - response.headers["Content-Disposition"] = f"attachment; filename={output_filename}" - - # 隐私处理 - upload_folder = os.path.join(os.getcwd(), 'upload') - for filename in os.listdir(upload_folder): - if filename != '.gitkeep': - file_path = os.path.join(upload_folder, filename) - try: - if os.path.isfile(file_path) or os.path.islink(file_path): - os.unlink(file_path) - elif os.path.isdir(file_path): - shutil.rmtree(file_path) - except Exception as e: - print(f"Failed to delete {file_path}. Reason: {e}") - - return response - except Exception as e: - return make_response(f"") +def handle_convert(): + return CHSIConverter.convert_file() if __name__ == '__main__': - # debug_mode = os.getenv('FLASK_DEBUG', 'false').lower() == 'true' port = int(os.getenv('FLASK_PORT', 5001)) app.run(debug=True, port=port, host='0.0.0.0') else: - application=app + application = app \ No newline at end of file diff --git a/extract_img.py b/extract_img.py deleted file mode 100644 index cb9742d..0000000 --- a/extract_img.py +++ /dev/null @@ -1,28 +0,0 @@ -from pdf2image import convert_from_path -from flask import make_response -import os - -def extract_image_from_pdf(path, page_number, left, top, right, bottom): - ''' - pdf_path = " " - page_number = 1 - left = 1898 # 左边界坐标 - top = 583 # 上边界坐标 - right = 2230 # 右边界坐标 - bottom = 1026 # 下边界坐标 - ''' - try: - images = convert_from_path(path, dpi=300, first_page=page_number, last_page=page_number) - image = images[0] - - cropped_image = image.crop((left, top, right, bottom)) - file_name = os.path.splitext(path)[0] # 获取文件的基本名称(不包括扩展名) - image_path = f"{file_name}_image.png" # 拼接新的文件路径,包括正确的扩展名 - cropped_image.save(image_path) # 将截取的图片保存到同一路径下 - return image_path - except Exception as e: - return make_response(f"") - - - - diff --git a/extract_info.py b/extract_info.py deleted file mode 100644 index d21ae5d..0000000 --- a/extract_info.py +++ /dev/null @@ -1,111 +0,0 @@ -import re -from pypdf import PdfReader -from pypinyin import lazy_pinyin -from flask import make_response - -def extract_text_from_pdf(pdf_path): - try: - pdf_file_obj = open(pdf_path, 'rb') - pdf_reader = PdfReader(pdf_file_obj) - - text = "" - for page in pdf_reader.pages: - text += page.extract_text() - - pdf_file_obj.close() - return text - except Exception as e: - return make_response(f"") - -def extract_info(patterns_dict, text): - results = {} - for prop, pattern in patterns_dict.items(): - match = re.search(pattern, text) - if match: - if prop == 'Name': - pinyin = lazy_pinyin(match.group(1)) - first_char = pinyin[0].capitalize() - remaining_chars = ''.join(pinyin[1:]).capitalize() - results[prop] = remaining_chars + ' ' + first_char - elif prop == 'Gender': - if match.group(1) == '男': - results[prop] = 'Male' - elif match.group(1) == '女': - results[prop] = 'Female' - else: - results[prop] = None - elif prop == 'Ethnic': - value = match.group(1) - pinyin_list = lazy_pinyin(value[:-1]) - results[prop] = ''.join(pinyin_list).title() - elif prop == 'Date of Birth' or prop == 'Date of Enrollment': - value = match.group(1) - parts = value.split('年') - year = parts[0] - month_day = parts[1].split('月') - formatted_date = month_day[0] + '/' + month_day[1].replace('日', '') + '/' + year - results[prop] = formatted_date - elif prop == 'Levels' : - results[prop] = 'Undergraduate' - elif prop == 'Form' : - results[prop] = 'General full-time remote study' - elif prop == 'Educational System': - results[prop] = match.group(1)+' years' - elif prop == 'Type': - results[prop] = 'General higher education' - elif prop == 'School Status': - value = match.group(1) - date_part = value.split(":")[1] - parts = date_part.split('年') - year = parts[0] - month_day = parts[1].split('月') - day = month_day[1].replace('日', '').replace(')', '') # Remove trailing bracket from the day - formatted_date = month_day[0] + '/' + day + '/' + year - results[prop] = 'Student registration (Expected graduation date: ' + formatted_date + ')' - elif prop == 'Update Date': - value = match.group(1) - parts = value.split('年') - year = parts[0] - month_day = parts[1].split('月') - formatted_date = month_day[0] + '/' + month_day[1].replace('日', '') + '/' + year - results[prop] = formatted_date - else: - results[prop] = match.group(1) - else: - results[prop] = None - return results - -def extract_info_from_pdf(path): - text = extract_text_from_pdf(path) - - def rc(pattern): - return re.compile(r'{}\s*([^\s]*)'.format(pattern)) - - patterns_dict = { - 'Update Date': rc('更新日期:'), - 'Name': rc('姓名'), - 'Gender': rc('性别'), - 'Id Number': rc('证件号码'), - 'Ethnic': rc('民族'), - 'Date of Birth': rc('出生日期 '), - 'Institution': rc('院校'), - 'Levels': rc('层次'), - 'Faculties': rc('院系'), - 'Class': rc('班级'), - 'Major': rc('专业'), - 'Student Number': rc('学号'), - 'Form': rc('形式'), - 'Date of Enrollment': rc('入学日期'), - 'Educational System': rc('学制'), - 'Type': rc('类型'), - 'School Status': rc('学籍状态'), - } - - # 获取匹配的信息 - results = extract_info(patterns_dict, text) - - # for prop, value in results.items(): - # print(f'{prop}: {value}') - - return results - diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..996bd0b --- /dev/null +++ b/utils.py @@ -0,0 +1,267 @@ +from flask import request, make_response, send_from_directory +from werkzeug.utils import secure_filename +from docx import Document +from docx.shared import Inches, Pt +from docx.enum.table import WD_ALIGN_VERTICAL +from docx.oxml import parse_xml, register_element_cls +from docx.oxml.ns import nsdecls +from docx.oxml.shape import CT_Picture +from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne +from pypdf import PdfReader +from pypinyin import lazy_pinyin +from pdf2image import convert_from_path +import re +import os +import uuid +import shutil + +class CT_Anchor(BaseOxmlElement): + extent = OneAndOnlyOne('wp:extent') + docPr = OneAndOnlyOne('wp:docPr') + graphic = OneAndOnlyOne('a:graphic') + + @classmethod + def new(cls, cx, cy, shape_id, pic, pos_x, pos_y): + anchor = parse_xml(cls._anchor_xml(pos_x, pos_y)) + anchor.extent.cx = cx + anchor.extent.cy = cy + anchor.docPr.id = shape_id + anchor.docPr.name = f'Picture {shape_id}' + anchor.graphic.graphicData.uri = 'http://schemas.openxmlformats.org/drawingml/2006/picture' + anchor.graphic.graphicData._insert_pic(pic) + return anchor + + @classmethod + def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y): + pic_id = 0 + pic = CT_Picture.new(pic_id, filename, rId, cx, cy) + anchor = cls.new(cx, cy, shape_id, pic, pos_x, pos_y) + anchor.graphic.graphicData._insert_pic(pic) + return anchor + + @classmethod + def _anchor_xml(cls, pos_x, pos_y): + return ( + '\n' + ' \n' + ' \n' + ' %d\n' + ' \n' + ' \n' + ' %d\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '' % (nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y)) + ) + +def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y): + rId, image = part.get_or_add_image(image_descriptor) + cx, cy = image.scaled_dimensions(width, height) + shape_id, filename = part.next_id, image.filename + return CT_Anchor.new_pic_anchor(shape_id, rId, filename, cx, cy, pos_x, pos_y) + +class CHSIConverter: + @staticmethod + def extract_text_from_pdf(pdf_path): + try: + with open(pdf_path, 'rb') as pdf_file_obj: + pdf_reader = PdfReader(pdf_file_obj) + text = "".join(page.extract_text() for page in pdf_reader.pages) + return text + except Exception as e: + return make_response(f"") + + @staticmethod + def extract_info(patterns_dict, text): + results = {} + for prop, pattern in patterns_dict.items(): + match = re.search(pattern, text) + if match: + value = match.group(1) + if prop == 'Name': + pinyin = lazy_pinyin(value) + results[prop] = f"{''.join(pinyin[1:]).capitalize()} {pinyin[0].capitalize()}" + elif prop == 'Gender': + results[prop] = 'Male' if value == '男' else 'Female' if value == '女' else None + elif prop == 'Ethnic': + results[prop] = ''.join(lazy_pinyin(value[:-1])).title() + elif prop in ['Date of Birth', 'Date of Enrollment', 'Update Date']: + year, rest = value.split('年') + month, day = rest.split('月') + results[prop] = f"{month}/{day.replace('日', '')}/{year}" + elif prop == 'Levels': + results[prop] = 'Undergraduate' + elif prop == 'Form': + results[prop] = 'General full-time remote study' + elif prop == 'Educational System': + results[prop] = f'{value} years' + elif prop == 'Type': + results[prop] = 'General higher education' + elif prop == 'School Status': + date_part = value.split(":")[1] + year, rest = date_part.split('年') + month, day = rest.split('月') + day = day.replace('日', '').replace('', '') + formatted_date = f"{month}/{day}/{year}" + results[prop] = f'Student registration (Expected graduation date: {formatted_date})' + else: + results[prop] = value + else: + results[prop] = None + return results + + @classmethod + def extract_info_from_pdf(cls, path): + text = cls.extract_text_from_pdf(path) + patterns_dict = {prop: re.compile(r'{}\s*([^\s]*)'.format(pattern)) for prop, pattern in { + 'Update Date': '更新日期:', + 'Name': '姓名', + 'Gender': '性别', + 'Id Number': '证件号码', + 'Ethnic': '民族', + 'Date of Birth': '出生日期 ', + 'Institution': '院校', + 'Levels': '层次', + 'Faculties': '院系', + 'Class': '班级', + 'Major': '专业', + 'Student Number': '学号', + 'Form': '形式', + 'Date of Enrollment': '入学日期', + 'Educational System': '学制', + 'Type': '类型', + 'School Status': '学籍状态', + }.items()} + return cls.extract_info(patterns_dict, text) + + @staticmethod + def extract_image_from_pdf(path, page_number, left, top, right, bottom): + try: + images = convert_from_path(path, dpi=300, first_page=page_number, last_page=page_number) + image = images[0] + cropped_image = image.crop((left, top, right, bottom)) + file_name = os.path.splitext(path)[0] + image_path = f"{file_name}_image.png" + cropped_image.save(image_path) + return image_path + except Exception as e: + return make_response(f"") + + @staticmethod + def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0): + try: + run = p.add_run() + anchor = new_pic_anchor(run.part, image_path_or_stream, width, height, pos_x, pos_y) + run._r.add_drawing(anchor) + except Exception: + return make_response("") + + @classmethod + def convert_to_docx(cls, path): + try: + extracted_info = cls.extract_info_from_pdf(path) + doc = Document("static/template.docx") + + paragraph = doc.add_paragraph() + doc.element.body.insert(1, paragraph._element) + paragraph.alignment = 1 + paragraph.add_run('Update date:' + extracted_info['Update Date']) + + del extracted_info['Update Date'] + + table = doc.add_table(rows=1, cols=2) + table.autofit = False + + for cell in table.columns[0].cells: + cell.width = Inches(0.5) + for cell in table.columns[1].cells: + cell.width = Inches(5.0) + + border_xml = '' \ + '' \ + '' \ + '' \ + '' \ + '' + + for key, value in extracted_info.items(): + cells = table.add_row().cells + for cell in cells: + cell._element.get_or_add_tcPr().append(parse_xml(border_xml)) + cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER + + is_last = key == list(extracted_info.keys())[-1] + cells[0].text = key + ("" if is_last else "\n") + cells[1].text = value + ("" if is_last else "\n") + + cropped_image_1 = cls.extract_image_from_pdf(path, 1, 1898, 583, 2230, 1026) + cls.add_float_picture(doc.add_paragraph(), cropped_image_1, width=Inches(1.2), pos_x=Pt(430), pos_y=Pt(140)) + + cropped_image_2 = cls.extract_image_from_pdf(path, 1, 300, 2690, 630, 2985) + cls.add_float_picture(doc.add_paragraph(), cropped_image_2, width=Inches(1.2), pos_x=Pt(78), pos_y=Pt(643)) + + output_path = path.replace(".pdf", ".docx") + doc.save(output_path) + + return output_path + + except Exception as e: + return make_response(f"") + + @classmethod + def convert_file(cls): + if 'file' not in request.files: + return make_response("") + + file = request.files['file'] + if file.filename == '': + return make_response("") + + if not file.filename.lower().endswith('.pdf'): + return make_response("") + + if not file.filename.startswith('教育部学籍在线验证报告_'): + return make_response("") + + try: + filename = secure_filename(file.filename) + filepath = os.path.join(os.getcwd(), 'upload', filename) + file.save(filepath) + + output_path = cls.convert_to_docx(filepath) + + directory = os.path.dirname(output_path) + filename = os.path.basename(output_path) + output_filename = str(uuid.uuid4()) + '.docx' + + response = make_response(send_from_directory(directory, filename, as_attachment=True)) + response.headers["Content-Disposition"] = f"attachment; filename={output_filename}" + + # 隐私处理 + upload_folder = os.path.join(os.getcwd(), 'upload') + for filename in os.listdir(upload_folder): + if filename != '.gitkeep': + file_path = os.path.join(upload_folder, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print(f"Failed to delete {file_path}. Reason: {e}") + + return response + except Exception as e: + return make_response(f"") + +register_element_cls('wp:anchor', CT_Anchor) \ No newline at end of file