Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Извлечение изображений из docx/pptx и добавление в БД #605

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions app/db/db_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,23 @@
# Capped collection for application logs; 5242880 bytes == 5 MiB.
# NOTE(review): the branch relies on the truthiness of db['logs'] — confirm
# this behaves as intended with the installed PyMongo version.
logs_collection = db.create_collection(
'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
celery_check_collection = db['celery_check'] # collection for mapping celery_task to check
images_collection = db['images'] # collection for storing extracted images


def get_client():
    """Return the module-level database client object."""
    return client

def save_image_to_db(check_id, image_data, caption):
    """Store one extracted image in the ``images`` collection.

    Args:
        check_id: identifier of the check the image belongs to.
        image_data: image content, stored as-is in the document.
        caption: caption text associated with the image.

    Returns:
        The ``_id`` of the inserted document.
    """
    import logging

    # Imported at call time, as in the original code (presumably to avoid a
    # circular import between db_methods and db_types — confirm).
    from app.db.db_types import Image

    image = Image({
        'check_id': check_id,
        'image_data': image_data,
        'caption': caption,
    })
    result = images_collection.insert_one(image.pack())

    # Report through the application logger instead of printing to stdout.
    logging.getLogger('root_logger').info(
        "Saved image for check %s with caption: %s", check_id, caption)
    return result.inserted_id


# Returns user if user was created and None if already exists
def add_user(username, password_hash='', is_LTI=False):
Expand Down
15 changes: 15 additions & 0 deletions app/db/db_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,18 @@ def none_to_false(x):
is_ended = none_to_true(self.is_ended) # None for old checks => True, True->True, False->False
is_failed = none_to_false(self.is_failed) # None for old checks => False, True->True, False->False
return {'is_ended': is_ended, 'is_failed': is_failed}

class Image(PackableWithId):
    """Database record for an image extracted from a checked document.

    Attributes are filled from ``dictionary``:
        check_id: identifier of the check the image is linked to.
        caption: caption text of the image (empty string when absent).
        image_data: the image content as binary data.
    """

    def __init__(self, dictionary=None):
        # The base class receives the argument exactly as passed (maybe None).
        super().__init__(dictionary)
        fields = dictionary if dictionary else {}
        self.check_id = fields.get('check_id')
        self.caption = fields.get('caption', '')
        self.image_data = fields.get('image_data')

    def pack(self):
        """Return the base-class package extended with the image fields.

        ``check_id`` is written in its string form.
        """
        packed = super().pack()
        packed.update(
            check_id=str(self.check_id),
            caption=self.caption,
            image_data=self.image_data,
        )
        return packed
39 changes: 36 additions & 3 deletions app/main/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,39 @@
from main.reports.md_uploader import MdUploader
from utils import convert_to

logger = logging.getLogger('root_logger')
from os.path import basename
from app.db.db_methods import add_check
from app.db.db_types import Check

logger = logging.getLogger('root_logger')

def parse(filepath, pdf_filepath):
from app.db.db_methods import files_info_collection

tmp_filepath = filepath.lower()
try:
if tmp_filepath.endswith(('.odp', '.ppt', '.pptx')):
new_filepath = filepath
if tmp_filepath.endswith(('.odp', '.ppt')):
logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
new_filepath = convert_to(filepath, target_format='pptx')
file_object = PresentationPPTX(new_filepath)

presentation = PresentationPPTX(new_filepath)

check = Check({
'filename': basename(new_filepath),
})

file_id = 0
file = files_info_collection.find_one({'name': basename(new_filepath)})
if file:
file_id = file['_id']

check_id = add_check(file_id, check)
presentation.extract_images_with_captions(check_id)
file_object = presentation


elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
new_filepath = filepath
if tmp_filepath.endswith(('.doc', '.odt')):
Expand All @@ -28,7 +49,19 @@ def parse(filepath, pdf_filepath):

docx = DocxUploader()
docx.upload(new_filepath, pdf_filepath)

check = Check({
'filename': basename(new_filepath),
})

file_id = 0
file = files_info_collection.find_one({'name': basename(new_filepath)})
if file:
file_id = file['_id']

check_id = add_check(file_id, check)
docx.parse()
docx.extract_images_with_captions(check_id)
file_object = docx

elif tmp_filepath.endswith('.md' ):
Expand All @@ -54,4 +87,4 @@ def save_to_temp_file(file):
temp_file.write(file.read())
temp_file.close()
file.seek(0)
return temp_file.name
return temp_file.name
39 changes: 39 additions & 0 deletions app/main/presentations/pptx/presentation_pptx.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from io import BytesIO

from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE

from .slide_pptx import SlidePPTX
from ..presentation_basic import PresentationBasic
Expand All @@ -17,3 +20,39 @@ def add_slides(self):

def __str__(self):
return super().__str__()

def extract_images_with_captions(self, check_id):
    """Save every picture of the presentation to the database with a caption.

    For each slide, the caption is the first non-empty text found among the
    slide's shapes that have a text frame (a heuristic: presumably the text
    belonging to the picture). When the slide has no such text, the picture
    is stored with the caption "picture without caption", matching the
    fallback used for docx documents.

    Args:
        check_id: identifier of the check the images are linked to.
    """
    from app.db.db_methods import save_image_to_db

    for slide in self.slides:
        # slide.slide is the underlying slide object that exposes the shapes.
        shapes = list(slide.slide.shapes)

        # The caption does not depend on the individual picture, so it is
        # looked up once per slide rather than once per picture.
        caption_text = None
        for text_shape in shapes:
            if not text_shape.has_text_frame:
                continue
            text = text_shape.text.strip()
            if text:
                caption_text = text
                break

        for shape in shapes:
            if shape.shape_type != MSO_SHAPE_TYPE.PICTURE:
                continue
            try:
                image_data = shape.image.blob
            except ValueError:
                # python-pptx raises ValueError when the picture is linked
                # rather than embedded; there are no bytes to store.
                continue
            save_image_to_db(
                check_id, image_data, caption_text or "picture without caption")
52 changes: 52 additions & 0 deletions app/main/reports/docx_uploader/docx_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,58 @@ def show_chapters(self, work_type):
chapters_str += "&nbsp;&nbsp;&nbsp;&nbsp;" + header["text"] + "<br>"
return chapters_str

def extract_images_with_captions(self, check_id):
    """Save every embedded image of the document to the database with a caption.

    Every image found in a paragraph's runs is stored. The caption is the
    text of the next non-empty paragraph after the image paragraph; blank
    paragraphs in between are skipped. If a paragraph containing a graphic
    is met first, or the document ends, the image is stored with the caption
    "picture without caption". All images of one paragraph share the caption.

    Args:
        check_id: identifier of the check the images are linked to.
    """
    from app.db.db_methods import save_image_to_db

    namespaces = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'
    no_caption = "picture without caption"

    # Read once: the document is not modified here, and (assuming self.file
    # is a python-docx Document) each access to .paragraphs builds a new list.
    paragraphs = self.file.paragraphs
    related_parts = self.file.part.related_parts

    for index, paragraph in enumerate(paragraphs):
        # Collect the bytes of every embedded image in this paragraph.
        # Searching for a:blip directly avoids serialising the run XML.
        images = []
        for run in paragraph.runs:
            for blip in run._element.findall('.//a:blip', namespaces=namespaces):
                embed_id = blip.get(embed_attr)
                if embed_id:
                    images.append(related_parts[embed_id].blob)
        if not images:
            continue

        caption = no_caption
        for next_index in range(index + 1, len(paragraphs)):
            candidate = paragraphs[next_index]
            # A following paragraph with its own graphic ends the search:
            # its text, if any, is not treated as this image's caption.
            if any("graphic" in run._element.xml for run in candidate.runs):
                break
            candidate_text = candidate.text.strip()
            if candidate_text:
                caption = candidate_text
                break

        for image_data in images:
            save_image_to_db(check_id, image_data, caption)


def main(args):
file = args.file
Expand Down
Loading