Skip to content

Commit

Permalink
exchange library pdf2image for pypdfium2
Browse files Browse the repository at this point in the history
  • Loading branch information
marabuuu committed Jan 7, 2025
1 parent d18ec67 commit 22e29bf
Showing 1 changed file with 17 additions and 20 deletions.
37 changes: 17 additions & 20 deletions scripts/highlights-download-statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from datetime import datetime
import requests
from pdf2image import convert_from_bytes
import pypdfium2
from io import BytesIO
from PIL import Image

Expand Down Expand Up @@ -85,32 +85,29 @@ def download_first_pdf_file_from_zenodo(folder, record_id):

# Get the first PDF file's download link and file name
for file_info in data['files']:
if file_info['key'].endswith('pdf'):
if file_info['key'].endswith('.pdf'):
file_url = file_info['links']['self']
file_name = file_info['key']

# Download the file content
# Download the file content into memory
response = requests.get(file_url)
response.raise_for_status()
file_content = BytesIO(response.content)

# Check file extension and convert based on that
file_extension = os.path.splitext(file_name)[1].lower()
date = datetime.now().strftime("%Y%m%d")

os.makedirs(path_to_png, exist_ok=True)
# Convert first page of PDF to PNG using pdfium
pdf = pypdfium2.PdfDocument(file_content)
page = pdf[0]
pil_image = page.render(
scale=2.0,
rotation=0
).to_pil()

if file_extension == '.pdf':
# Convert first page of PDF to PNG
pages = convert_from_bytes(file_content.getvalue())
img = pages[0]
img = resize_image(img, height=300)
img.save(path_to_png + f'{date}_first_page_{record_id}.png', 'PNG')
print("First page of PDF saved as PNG.")
break

else:
print(f"Unsupported file type: {file_extension}")
# Save the image
date = datetime.now().strftime("%Y%m%d")
os.makedirs(folder, exist_ok=True)
png_path = os.path.join(folder, f'{date}_first_page_{record_id}.png')
pil_image.save(png_path, 'PNG')
print(f"First page of PDF saved as PNG at {png_path}.")
break

return license_info

Expand Down

0 comments on commit 22e29bf

Please sign in to comment.