We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
The code to reproduce the issue is as follows:
import hashlib
import os
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Dict, Optional
from urllib.parse import urlparse

import pymupdf4llm
import requests


def is_valid_url(url: str) -> bool:
    """Return True when *url* has both a scheme and a network location."""
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def handle_remote_file(
    file_path: str,
    temp_dir: TemporaryDirectory,
    headers: Optional[Dict] = None,
    timeout: float = 60.0,
) -> str:
    """Resolve *file_path* to a local file, downloading it first if it is a URL.

    Args:
        file_path: Path of an existing local file, or a URL to download.
        temp_dir: Temporary directory that receives the downloaded file.
        headers: Optional HTTP headers sent with the download request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the caller forever.

    Returns:
        ``file_path`` unchanged when it is an existing local file, otherwise
        the path of the downloaded copy inside ``temp_dir``.

    Raises:
        ValueError: If the download does not return status 200, or if
            ``file_path`` is neither an existing file nor a valid URL.
    """
    if os.path.isfile(file_path):
        return file_path
    if not is_valid_url(file_path):
        raise ValueError("File path %s is not a valid file or url" % file_path)

    response = requests.get(file_path, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(
            "Check the url of your file; returned status code %s"
            % response.status_code
        )

    # Derive the extension from the URL *path* only; running splitext over
    # the whole URL lets dots in the query string leak into the suffix
    # (e.g. "a.pdf?v=1.2" would yield ".2").
    _, suffix = os.path.splitext(urlparse(file_path).path)
    # Extension-less URLs such as https://arxiv.org/pdf/2411.07264 would
    # otherwise end up with a bogus suffix like ".07264"; trust the PDF
    # magic bytes of the payload instead.
    if response.content.startswith(b"%PDF-"):
        suffix = ".pdf"

    # The hash only provides a stable, unique file name (not security).
    md5_hash = hashlib.md5(file_path.encode("utf-8")).hexdigest()
    temp_pdf = os.path.join(temp_dir.name, f"tmp_{md5_hash}{suffix}")
    with open(temp_pdf, mode="wb") as f:
        f.write(response.content)
    return temp_pdf


def pdf_to_markdown(
    file_path: str,
    temp_dir: TemporaryDirectory,
    md_file_path: Optional[str] = None,
    image_path: Optional[str] = None,
    write_images: bool = True,
    image_format: str = "png",
    dpi: int = 200,
) -> str:
    """Convert a local or remote PDF to a Markdown file via pymupdf4llm.

    Args:
        file_path: Local path or URL of the PDF.
        temp_dir: Temporary directory used for downloads and, when
            ``md_file_path`` is not given, for the Markdown output.
        md_file_path: Destination of the Markdown file. Defaults to
            ``<pdf name>.md`` inside ``temp_dir``.
        image_path: Directory for extracted images. Defaults to an
            ``images`` directory next to the Markdown file.
        write_images: Passed through to ``pymupdf4llm.to_markdown``.
        image_format: Passed through to ``pymupdf4llm.to_markdown``.
        dpi: Passed through to ``pymupdf4llm.to_markdown``.

    Returns:
        The path of the written Markdown file.
    """
    file_path = handle_remote_file(file_path, temp_dir=temp_dir)

    if md_file_path is None:
        filename, _ = os.path.splitext(os.path.basename(file_path))
        md_file_path = os.path.join(temp_dir.name, f"{filename}.md")
    if image_path is None:
        image_path = os.path.join(os.path.dirname(md_file_path), "images")

    md_text = pymupdf4llm.to_markdown(
        doc=file_path,
        write_images=write_images,
        image_path=image_path,
        image_format=image_format,
        dpi=dpi,
    )

    # Write the Markdown file as UTF-8 bytes (no newline translation).
    Path(md_file_path).write_bytes(md_text.encode("utf-8"))
    print(f"Markdown file path: {md_file_path}")
    return md_file_path


if __name__ == '__main__':
    file_path = "https://arxiv.org/pdf/2411.07264"
    # The functions need the TemporaryDirectory object (they read ``.name``),
    # so create it first and use it as a context manager to guarantee the
    # downloaded PDF is removed when the conversion is done.
    temp_dir = TemporaryDirectory()
    with temp_dir:
        md_file_path = pdf_to_markdown(
            file_path=file_path,
            temp_dir=temp_dir,
            md_file_path="test2.md",
        )
    print(md_file_path)
When I used the above code to convert the PDF to markdown format, I found that the position of the extracted image was incorrect.
The image position in the converted markdown file is:
The expected image position is:
The text was updated successfully, but these errors were encountered:
No branches or pull requests
The code to reproduce the issue is as follows:
When I used the above code to convert the PDF to markdown format, I found that the position of the extracted image was incorrect.
The image position in the converted markdown file is:
The expected image position is:
The text was updated successfully, but these errors were encountered: