Skip to content

Commit

Permalink
Merge pull request #580 from marabuuu/simplify-pdf-image-conversion
Browse files Browse the repository at this point in the history
Exchange library pdf2image for pypdfium2 in `highlights-download-statistics.py`
  • Loading branch information
haesleinhuepf authored Jan 17, 2025
2 parents c4d878b + 3e1696c commit e89ff61
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 21 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ numpy
pandas
pygithub
toolz
pdf2image
pypdfium2
37 changes: 17 additions & 20 deletions scripts/highlights-download-statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from datetime import datetime
import requests
from pdf2image import convert_from_bytes
import pypdfium2
from io import BytesIO
from PIL import Image

Expand Down Expand Up @@ -85,32 +85,29 @@ def download_first_pdf_file_from_zenodo(folder, record_id):

# Get the first PDF file's download link and file name
for file_info in data['files']:
if file_info['key'].endswith('pdf'):
if file_info['key'].endswith('.pdf'):
file_url = file_info['links']['self']
file_name = file_info['key']

# Download the file content
# Download the file content into memory
response = requests.get(file_url)
response.raise_for_status()
file_content = BytesIO(response.content)

# Check file extension and convert based on that
file_extension = os.path.splitext(file_name)[1].lower()
date = datetime.now().strftime("%Y%m%d")

os.makedirs(path_to_png, exist_ok=True)
# Convert first page of PDF to PNG using pdfium
pdf = pypdfium2.PdfDocument(file_content)
page = pdf[0]
pil_image = page.render(
scale=2.0,
rotation=0
).to_pil()

if file_extension == '.pdf':
# Convert first page of PDF to PNG
pages = convert_from_bytes(file_content.getvalue())
img = pages[0]
img = resize_image(img, height=300)
img.save(path_to_png + f'{date}_first_page_{record_id}.png', 'PNG')
print("First page of PDF saved as PNG.")
break

else:
print(f"Unsupported file type: {file_extension}")
# Save the image
date = datetime.now().strftime("%Y%m%d")
os.makedirs(folder, exist_ok=True)
png_path = os.path.join(folder, f'{date}_first_page_{record_id}.png')
pil_image.save(png_path, 'PNG')
print(f"First page of PDF saved as PNG at {png_path}.")
break

return license_info

Expand Down

0 comments on commit e89ff61

Please sign in to comment.