The position of the extracted image is incorrect #189

edisonwd · 2024-11-19T11:36:55Z

The code to reproduce the issue is as follows:

import hashlib
import os
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional, Dict
from urllib.parse import urlparse

import pymupdf4llm
import requests


def is_valid_url(url: str) -> bool:
    """Return True when *url* parses with both a scheme and a network location."""
    parts = urlparse(url)
    # A bare host ("//example.com") or a bare scheme ("mailto:x") is rejected;
    # both components must be non-empty.
    if not parts.netloc:
        return False
    return bool(parts.scheme)


def handle_remote_file(file_path: str, temp_dir: TemporaryDirectory, headers: Optional[Dict] = None) -> str:
    """Return a local path for *file_path*, downloading it first if it is a URL.

    Args:
        file_path: Path of an existing local file, or a URL to download.
        temp_dir: Temporary directory object; a downloaded file is written
            inside ``temp_dir.name``. Not used when *file_path* is a local file.
        headers: Optional HTTP headers passed to ``requests.get``.

    Returns:
        *file_path* unchanged when it is an existing local file, otherwise the
        path of the downloaded copy inside the temporary directory.

    Raises:
        ValueError: If *file_path* is neither an existing file nor a valid URL,
            or if the download does not return HTTP status 200.
        requests.exceptions.RequestException: If the request itself fails
            (including a timeout).
    """
    # An existing local file always wins, even if the string also parses as a URL.
    if os.path.isfile(file_path):
        return file_path
    if not is_valid_url(file_path):
        raise ValueError("File path %s is not a valid file or url" % file_path)

    # Without a timeout, requests may block forever on an unresponsive server.
    r = requests.get(file_path, headers=headers, timeout=60)
    if r.status_code != 200:
        raise ValueError(
            "Check the url of your file; returned status code %s"
            % r.status_code
        )

    # Take the extension from the URL *path* only. Splitting the whole URL
    # would pick up dots or slashes in the query string or fragment
    # (e.g. "a.pdf?v=1.2" would yield ".2").
    suffix = os.path.splitext(urlparse(file_path).path)[1]

    # The MD5 digest is only used to derive a filesystem-safe file name from
    # the URL; it is not a security measure.
    md5_hash = hashlib.md5(file_path.encode('utf-8')).hexdigest()
    temp_pdf = os.path.join(temp_dir.name, f"tmp_{md5_hash}{suffix}")
    with open(temp_pdf, mode="wb") as f:
        f.write(r.content)
    return str(temp_pdf)


def pdf_to_markdown(file_path: str,
                    temp_dir: TemporaryDirectory,
                    md_file_path: str = None,
                    image_path: str = None,
                    write_images: bool = True,
                    image_format: str = "png",
                    dpi: int = 200):
    """Convert a PDF to a Markdown file and return the Markdown file's path.

    Args:
        file_path: Local PDF path or URL; resolved via ``handle_remote_file``.
        temp_dir: Temporary directory object used for downloads and, when
            *md_file_path* is not given, for the Markdown output.
        md_file_path: Where to write the Markdown. Defaults to
            ``<temp_dir>/<pdf name without extension>.md``.
        image_path: Image directory handed to ``pymupdf4llm.to_markdown``.
            Defaults to an ``images`` folder beside the Markdown file.
        write_images: Forwarded to ``pymupdf4llm.to_markdown``.
        image_format: Forwarded to ``pymupdf4llm.to_markdown``.
        dpi: Forwarded to ``pymupdf4llm.to_markdown``.

    Returns:
        The path of the written Markdown file.
    """
    local_pdf = handle_remote_file(file_path, temp_dir=temp_dir)

    # Derive the output location from the PDF's base name when none was given.
    if md_file_path is None:
        stem = os.path.splitext(os.path.basename(local_pdf))[0]
        md_file_path = os.path.join(temp_dir.name, stem + ".md")

    # By default images are placed next to the Markdown file.
    if image_path is None:
        md_dir = os.path.dirname(md_file_path)
        image_path = os.path.join(md_dir, "images")

    conversion_options = {
        "write_images": write_images,
        "image_path": image_path,
        "image_format": image_format,
        "dpi": dpi,
    }
    markdown = pymupdf4llm.to_markdown(doc=local_pdf, **conversion_options)

    # Write the Markdown file
    encoded = markdown.encode()
    Path(md_file_path).write_bytes(encoded)
    print(f"Markdown file path: {md_file_path}")

    return md_file_path


if __name__ == '__main__':
    file_path = "https://arxiv.org/pdf/2411.07264"
    # Hold the temporary directory in a context manager so it is removed
    # deterministically once the conversion finishes, instead of relying on
    # the object's finalizer. The TemporaryDirectory object itself (not the
    # name yielded by ``with ... as``) is passed on, because the helpers read
    # ``temp_dir.name``.
    temp_dir = TemporaryDirectory()
    with temp_dir:
        md_file_path = pdf_to_markdown(file_path=file_path,
                                       temp_dir=temp_dir,
                                       md_file_path="test2.md",
                                       )
    print(md_file_path)

When I used the above code to convert the PDF to markdown format, I found that the position of the extracted image was incorrect.

The image position in the converted markdown file is:

The expected image position is:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

The position of the extracted image is incorrect #189

The position of the extracted image is incorrect #189

edisonwd commented Nov 19, 2024

The position of the extracted image is incorrect #189

The position of the extracted image is incorrect #189

Comments

edisonwd commented Nov 19, 2024