Skip to content

Commit

Permalink
BUG: Title sometimes is bytes and not str.
Browse files Browse the repository at this point in the history
  • Loading branch information
reformy committed Nov 1, 2024
1 parent 98aa974 commit e837839
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 0 deletions.
5 changes: 5 additions & 0 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@ def _get_text(self, key: str) -> Optional[str]:
retval = self.get(key, None)
if isinstance(retval, TextStringObject):
return retval
if isinstance(retval, ByteStringObject):
try:
return retval.decode(encoding='utf-8')
except UnicodeDecodeError:
raise PyPdfError("Unable to decode text in metadata using UTF-8.")
return None

@property
Expand Down
Binary file added resources/bytes.pdf
Binary file not shown.
10 changes: 10 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,19 @@ def test_read_metadata(pdf_path, expected):
docinfo.modification_date
docinfo.modification_date_raw
if "/Title" in metadict:
assert isinstance(docinfo.title, str)
assert metadict["/Title"] == docinfo.title


def test_read_metadata_title_is_bytes():
    """Check that the title of ``bytes.pdf`` is exposed as a decoded ``str``.

    Regression test for metadata whose ``/Title`` entry is not already a
    text string: ``docinfo.title`` must still come back as ``str`` and
    match the expected (non-ASCII) document title.
    """
    with open(RESOURCE_ROOT / "bytes.pdf", "rb") as inputfile:
        reader = PdfReader(inputfile)
        docinfo = reader.metadata
        # Fail with a clear message if the file has no metadata at all,
        # instead of an AttributeError on the next line.
        assert docinfo is not None
        assert isinstance(docinfo.title, str)
        assert docinfo.title == 'Microsoft Word - トランスバース社買収電話会議英語Final.docx'


def test_iss1943():
with PdfReader(RESOURCE_ROOT / "crazyones.pdf") as reader:
docinfo = reader.metadata
Expand Down

0 comments on commit e837839

Please sign in to comment.