-
Notifications
You must be signed in to change notification settings - Fork 0
/
rasterize.py
67 lines (61 loc) · 2.17 KB
/
rasterize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""
Copyright (c) Meta Platforms, Inc. and affiliates.
This source code is licensed under the MIT license found in the
LICENSE file in the root directory of this source tree.
"""
import argparse
import logging
import pypdfium2
from pathlib import Path
from tqdm import tqdm
import io
from typing import Optional, List, Union
# Raise the "pypdfium2" logger's threshold to WARNING so that its DEBUG/INFO
# records are not emitted when this module is imported and used.
logging.getLogger("pypdfium2").setLevel(logging.WARNING)
def rasterize_paper(
    pdf: Union[Path, bytes],
    outpath: Optional[Path] = None,
    dpi: int = 96,
    return_pil=False,
    pages=None,
) -> Optional[List[io.BytesIO]]:
    """
    Rasterize the pages of a PDF document.

    Pages are either written to disk as PNG files or returned as in-memory
    BMP-encoded buffers. Any error raised while opening or rendering is
    logged and swallowed (best effort), so a partial result may be returned.

    Args:
        pdf (Union[str, Path, bytes]): Path to the PDF file or its raw bytes;
            both are opened with `pypdfium2.PdfDocument`. Any other object is
            used as-is and must support `len()` and `.render(...)`.
        outpath (Optional[Path], optional): The output directory. Pages are saved
            as `<outpath>/<pdf file stem>/<NN>.png` (1-based page number). If the
            input has no file name (not a path), pages are saved directly in
            `outpath`. If None, the pages are returned instead. Defaults to None.
        dpi (int, optional): The output DPI (render scale is `dpi / 72`). Defaults to 96.
        return_pil (bool, optional): Whether to return the rendered pages instead
            of writing them to disk. Forced to True when `outpath` is None.
            Defaults to False.
        pages (Optional[List[int]], optional): Zero-based indices of the pages to
            rasterize. If None, all pages will be rasterized. Defaults to None.

    Returns:
        Optional[List[io.BytesIO]]: One buffer of BMP data per rendered page if
        pages are returned (possibly fewer than requested if an error occurred),
        otherwise None.
    """
    pils = []
    if outpath is None:
        return_pil = True
    else:
        # Accept plain string paths too, mirroring what is accepted for `pdf`.
        outpath = Path(outpath)
        outpath.mkdir(parents=True, exist_ok=True)
    try:
        # Directory the PNG files go to; only path inputs provide a sub-directory name.
        page_dir = outpath
        if isinstance(pdf, (str, Path)):
            if outpath is not None:
                # Path(...) so that plain strings work as well (str has no `.stem`).
                page_dir = outpath / Path(pdf).stem
            pdf = pypdfium2.PdfDocument(pdf)
        elif isinstance(pdf, bytes):
            # Raw PDF data must be opened too; bytes has no `.render`.
            pdf = pypdfium2.PdfDocument(pdf)
        if pages is None:
            pages = range(len(pdf))
        renderer = pdf.render(
            pypdfium2.PdfBitmap.to_pil,
            page_indices=pages,
            scale=dpi / 72,
        )
        if not return_pil:
            # Create the target directory once instead of once per page.
            page_dir.mkdir(parents=True, exist_ok=True)
        for i, image in zip(pages, renderer):
            if return_pil:
                page_bytes = io.BytesIO()
                image.save(page_bytes, "bmp")
                pils.append(page_bytes)
            else:
                image_file_name = "%02d.png" % (i + 1)
                image.save((page_dir / image_file_name), "png")
    except Exception as e:
        # Deliberate best effort: report the failure and keep whatever was rendered.
        logging.error(e)
    if return_pil:
        return pils
    return None