Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

The position of the extracted image is incorrect #189

Open
edisonwd opened this issue Nov 19, 2024 · 0 comments
Open

The position of the extracted image is incorrect #189

edisonwd opened this issue Nov 19, 2024 · 0 comments

Comments

@edisonwd
Copy link

The code to reproduce the issue is as follows:

import hashlib
import os
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional, Dict
from urllib.parse import urlparse

import pymupdf4llm
import requests


def is_valid_url(url: str) -> bool:
    """Return True when *url* has both a scheme and a network location.

    Strings that ``urlparse`` rejects outright (for example an unbalanced
    IPv6 bracket such as ``"http://["``) are reported as invalid rather than
    letting the ``ValueError`` escape from a boolean predicate.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)


def handle_remote_file(file_path: str, temp_dir: TemporaryDirectory, headers: Optional[Dict] = None,
                       timeout: Optional[float] = 30.0):
    """Resolve *file_path* to a local file, downloading it first if it is a URL.

    Args:
        file_path: Path of an existing local file, or a URL to download.
        temp_dir: Temporary directory object; a downloaded file is stored
            under ``temp_dir.name``. Not used when *file_path* is already a
            local file.
        headers: Optional HTTP headers passed to ``requests.get``.
        timeout: Timeout in seconds passed to ``requests.get`` so that an
            unresponsive server cannot block the call forever. ``None``
            disables the timeout.

    Returns:
        The local file path: *file_path* itself when it is an existing file,
        otherwise the path of the downloaded copy.

    Raises:
        ValueError: If *file_path* is neither an existing file nor a valid
            URL, or if the download does not return HTTP status 200.
    """
    # An existing local file is used as-is.
    if os.path.isfile(file_path):
        return file_path

    if not is_valid_url(file_path):
        raise ValueError("File path %s is not a valid file or url" % file_path)

    # The file is a web path: download it into the temporary directory.
    r = requests.get(file_path, headers=headers, timeout=timeout)
    if r.status_code != 200:
        raise ValueError(
            "Check the url of your file; returned status code %s"
            % r.status_code
        )

    # Take the extension from the URL *path* only, so that dots or slashes
    # inside the query string or fragment cannot leak into the file name.
    _, suffix = os.path.splitext(urlparse(file_path).path)

    # The hash of the full URL gives a stable, filesystem-safe file name.
    md5_hash = hashlib.md5(file_path.encode('utf-8')).hexdigest()
    temp_pdf = os.path.join(temp_dir.name, f"tmp_{md5_hash}{suffix}")
    with open(temp_pdf, mode="wb") as f:
        f.write(r.content)

    return temp_pdf


def pdf_to_markdown(file_path: str,
                    temp_dir: TemporaryDirectory,
                    md_file_path: str = None,
                    image_path: str = None,
                    write_images: bool = True,
                    image_format: str = "png",
                    dpi: int = 200):
    """Convert a PDF (local path or URL) to a Markdown file.

    Args:
        file_path: Local path or URL of the PDF; resolved through
            ``handle_remote_file``.
        temp_dir: Temporary directory object used for downloads and, when
            *md_file_path* is not given, for the Markdown output.
        md_file_path: Destination of the Markdown file. Defaults to
            ``<pdf base name>.md`` inside ``temp_dir.name``.
        image_path: Directory passed to ``pymupdf4llm.to_markdown`` as
            ``image_path``. Defaults to an ``images`` directory next to the
            Markdown file.
        write_images: Forwarded to ``pymupdf4llm.to_markdown``.
        image_format: Forwarded to ``pymupdf4llm.to_markdown``.
        dpi: Forwarded to ``pymupdf4llm.to_markdown``.

    Returns:
        The path of the written Markdown file.
    """
    local_pdf = handle_remote_file(file_path, temp_dir=temp_dir)

    # Fill in the output locations that the caller left unspecified.
    if md_file_path is None:
        stem = os.path.splitext(os.path.basename(local_pdf))[0]
        md_file_path = os.path.join(temp_dir.name, f"{stem}.md")
    if image_path is None:
        md_dir = os.path.dirname(md_file_path)
        image_path = os.path.join(md_dir, "images")

    conversion_options = {
        "write_images": write_images,
        "image_path": image_path,
        "image_format": image_format,
        "dpi": dpi,
    }
    markdown = pymupdf4llm.to_markdown(doc=local_pdf, **conversion_options)

    # Write the Markdown file.
    Path(md_file_path).write_bytes(markdown.encode())
    print(f"Markdown file path: {md_file_path}")

    return md_file_path


if __name__ == '__main__':
    file_path = "https://arxiv.org/pdf/2411.07264"
    # pdf_to_markdown needs the TemporaryDirectory object itself (it reads
    # ``.name``), so keep a handle on it and use it as a context manager to
    # remove the directory deterministically instead of relying on implicit
    # cleanup when the object is garbage-collected.
    temp_dir = TemporaryDirectory()
    with temp_dir:
        md_file_path = pdf_to_markdown(file_path=file_path,
                                       temp_dir=temp_dir,
                                       md_file_path="test2.md",
                                       )
    print(md_file_path)

When I used the code above to convert the PDF to Markdown, I found that the extracted image was placed at the wrong position in the output.

The image position in the converted markdown file is:
image

The expected image position is:
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant