Skip to content

Commit

Permalink
Allow user to specify encoding
Browse files Browse the repository at this point in the history
The code always hardcoded utf-8 as the encoding, which
produced wrong results for Shift-JIS (Japanese) file names. This commit adds an optional encoding argument: it defaults to utf-8 (utf-16_be for Joliet paths) and can be set to any other value when the file names on the ISO are not encoded in utf-8.
  • Loading branch information
Darkhood148 committed Mar 24, 2024
1 parent 2732b6b commit 08f3e03
Showing 1 changed file with 48 additions and 36 deletions.
84 changes: 48 additions & 36 deletions pycdlib/pycdlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,6 @@ def _find_dr_record_by_name(vd, path, encoding):
return root_dir_record

splitpath = utils.split_path(path)

currpath = splitpath.pop(0).decode('utf-8').encode(encoding)

entry = root_dir_record
Expand All @@ -505,7 +504,6 @@ def _find_dr_record_by_name(vd, path, encoding):
index = lo
if index != len(thelist) and thelist[index].file_ident == currpath:
child = thelist[index]

if child is None:
# We failed to find this component of the path, so break out of the
# loop and fail.
Expand All @@ -520,7 +518,6 @@ def _find_dr_record_by_name(vd, path, encoding):
# We found the last child we are looking for; return it.
if not splitpath:
return child

if not child.is_dir():
break
entry = child
Expand Down Expand Up @@ -705,8 +702,8 @@ def _seek_to_extent(self, extent):
self._cdfp.seek(extent * self.logical_block_size)

@functools.lru_cache(maxsize=256)
def _find_iso_record(self, iso_path):
# type: (bytes) -> dr.DirectoryRecord
def _find_iso_record(self, iso_path, encoding='utf-8'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
An internal method to find a directory record on the ISO given an ISO
path. If the entry is found, it returns the directory record object
Expand All @@ -718,11 +715,11 @@ def _find_iso_record(self, iso_path):
Returns:
The directory record entry representing the entry on the ISO.
"""
return _find_dr_record_by_name(self.pvd, iso_path, 'utf-8')
return _find_dr_record_by_name(self.pvd, iso_path, encoding)

@functools.lru_cache(maxsize=256)
def _find_rr_record(self, rr_path):
# type: (bytes) -> dr.DirectoryRecord
def _find_rr_record(self, rr_path, encoding='utf-8'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
An internal method to find a directory record on the ISO given a Rock
Ridge path. If the entry is found, it returns the directory record
Expand All @@ -742,7 +739,7 @@ def _find_rr_record(self, rr_path):

splitpath = utils.split_path(rr_path)

currpath = splitpath.pop(0).decode('utf-8').encode('utf-8')
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)

entry = root_dir_record

Expand Down Expand Up @@ -793,13 +790,13 @@ def _find_rr_record(self, rr_path):
if not child.is_dir():
break
entry = child
currpath = splitpath.pop(0).decode('utf-8').encode('utf-8')
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)

raise pycdlibexception.PyCdlibInvalidInput('Could not find path')

@functools.lru_cache(maxsize=256)
def _find_joliet_record(self, joliet_path):
# type: (bytes) -> dr.DirectoryRecord
def _find_joliet_record(self, joliet_path, encoding='utf-16_be'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
An internal method to find a directory record on the ISO given a Joliet
path. If the entry is found, it returns the directory record object
Expand All @@ -813,7 +810,7 @@ def _find_joliet_record(self, joliet_path):
"""
if self.joliet_vd is None:
raise pycdlibexception.PyCdlibInternalError('Joliet path requested on non-Joliet ISO')
return _find_dr_record_by_name(self.joliet_vd, joliet_path, 'utf-16_be')
return _find_dr_record_by_name(self.joliet_vd, joliet_path, encoding)

@functools.lru_cache(maxsize=256)
def _find_udf_record(self, udf_path):
Expand Down Expand Up @@ -2412,8 +2409,8 @@ def _udf_get_file_from_iso_fp(self, outfp, blocksize, udf_path):
utils.copy_data(data_len, blocksize, data_fp, outfp)

def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path,
joliet_path):
# type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes]) -> None
joliet_path, encoding=None):
# type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes], str) -> None
"""
An internal method to fetch a single file from the ISO and write it out
to the file object.
Expand All @@ -2433,13 +2430,16 @@ def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path,
if joliet_path is not None:
if self.joliet_vd is None:
raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a joliet_path from a non-Joliet ISO')
found_record = self._find_joliet_record(joliet_path)
encoding = encoding or 'utf-16_be'
found_record = self._find_joliet_record(joliet_path, encoding)
elif rr_path is not None:
if not self.rock_ridge:
raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a rr_path from a non-Rock Ridge ISO')
found_record = self._find_rr_record(rr_path)
encoding = encoding or 'utf-8'
found_record = self._find_rr_record(rr_path, encoding)
elif iso_path is not None:
found_record = self._find_iso_record(iso_path)
encoding = encoding or 'utf-8'
found_record = self._find_iso_record(iso_path, encoding)
else:
raise pycdlibexception.PyCdlibInternalError('Invalid path passed to get_file_from_iso_fp')

Expand Down Expand Up @@ -3471,8 +3471,8 @@ def _rm_joliet_dir(self, joliet_path):

return num_bytes_to_remove

def _get_iso_entry(self, iso_path):
# type: (bytes) -> dr.DirectoryRecord
def _get_iso_entry(self, iso_path, encoding='utf-8'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
Internal method to get the directory record for an ISO path.
Expand All @@ -3484,10 +3484,10 @@ def _get_iso_entry(self, iso_path):
if self._needs_reshuffle:
self._reshuffle_extents()

return self._find_iso_record(iso_path)
return self._find_iso_record(iso_path, encoding)

def _get_rr_entry(self, rr_path):
# type: (bytes) -> dr.DirectoryRecord
def _get_rr_entry(self, rr_path, encoding='utf-8'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
Internal method to get the directory record for a Rock Ridge path.
Expand All @@ -3500,10 +3500,10 @@ def _get_rr_entry(self, rr_path):
if self._needs_reshuffle:
self._reshuffle_extents()

return self._find_rr_record(rr_path)
return self._find_rr_record(rr_path, encoding)

def _get_joliet_entry(self, joliet_path):
# type: (bytes) -> dr.DirectoryRecord
def _get_joliet_entry(self, joliet_path, encoding='utf-16_be'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
Internal method to get the directory record for a Joliet path.
Expand All @@ -3516,7 +3516,7 @@ def _get_joliet_entry(self, joliet_path):
if self._needs_reshuffle:
self._reshuffle_extents()

return self._find_joliet_record(joliet_path)
return self._find_joliet_record(joliet_path, encoding)

def _get_udf_entry(self, udf_path):
# type: (str) -> udfmod.UDFFileEntry
Expand Down Expand Up @@ -4183,6 +4183,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
iso_path = None
rr_path = None
udf_path = None
encoding = None
num_paths = 0
for key, value in kwargs.items():
if key == 'blocksize':
Expand Down Expand Up @@ -4213,6 +4214,8 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
num_paths += 1
elif value is not None:
raise pycdlibexception.PyCdlibInvalidInput('udf_path must be a string')
elif key == 'encoding':
encoding = value
else:
raise pycdlibexception.PyCdlibInvalidInput('Unknown keyword %s' % (key))

Expand All @@ -4223,7 +4226,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
self._udf_get_file_from_iso_fp(outfp, blocksize, udf_path)
else:
self._get_file_from_iso_fp(outfp, blocksize, iso_path, rr_path,
joliet_path)
joliet_path, encoding)

def get_and_write(self, iso_path, local_path, blocksize=8192):
# type: (str, str, int) -> None
Expand Down Expand Up @@ -5459,6 +5462,8 @@ def list_children(self, **kwargs):
if key in ('joliet_path', 'rr_path', 'iso_path', 'udf_path'):
if value is not None:
num_paths += 1
elif key in ('encoding'):
continue
else:
raise pycdlibexception.PyCdlibInvalidInput("Invalid keyword, must be one of 'iso_path', 'rr_path', 'joliet_path', or 'udf_path'")

Expand All @@ -5476,12 +5481,15 @@ def list_children(self, **kwargs):
else:
use_rr = False
if 'joliet_path' in kwargs:
rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path']))
kwargs['encoding'] = kwargs.get('encoding', None) or 'utf-16_be'
rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path']), kwargs['encoding'])
elif 'rr_path' in kwargs:
rec = self._get_rr_entry(utils.normpath(kwargs['rr_path']))
kwargs['encoding'] = kwargs.get('encoding', None) or 'utf-8'
rec = self._get_rr_entry(utils.normpath(kwargs['rr_path']), kwargs['encoding'])
use_rr = True
else:
rec = self._get_iso_entry(utils.normpath(kwargs['iso_path']))
kwargs['encoding'] = kwargs.get('encoding', None) or 'utf-8'
rec = self._get_iso_entry(utils.normpath(kwargs['iso_path']), kwargs['encoding'])

for c in _yield_children(rec, use_rr):
yield c
Expand Down Expand Up @@ -5626,8 +5634,8 @@ def rm_isohybrid(self):

self.isohybrid_mbr = None

def full_path_from_dirrecord(self, rec, rockridge=False):
# type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool) -> str
def full_path_from_dirrecord(self, rec, rockridge=False, user_encoding=None):
# type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool, str) -> str
"""
Get the absolute path of a directory record.
Expand All @@ -5646,6 +5654,8 @@ def full_path_from_dirrecord(self, rec, rockridge=False):
if self.joliet_vd is not None and id(rec.vd) == id(self.joliet_vd):
encoding = 'utf-16_be'

if user_encoding:
encoding = user_encoding
# A root entry has no Rock Ridge entry, even on a Rock Ridge ISO.
# Always return / here.
if rec.is_root:
Expand Down Expand Up @@ -5685,6 +5695,8 @@ def full_path_from_dirrecord(self, rec, rockridge=False):
encoding = rec.file_ident.encoding
else:
encoding = 'utf-8'
if user_encoding:
encoding = user_encoding
udf_rec = rec # type: Optional[udfmod.UDFFileEntry]
while udf_rec is not None:
ident = udf_rec.file_identifier()
Expand Down Expand Up @@ -5893,13 +5905,13 @@ def walk(self, **kwargs):
while dirs:
dir_record = dirs.popleft()

relpath = self.full_path_from_dirrecord(dir_record,
rockridge=path_type == 'rr_path')
relpath = self.full_path_from_dirrecord(dir_record, rockridge=path_type == 'rr_path',
user_encoding=user_encoding)
dirlist = []
filelist = []
dirdict = {}

for child in reversed(list(self.list_children(**{path_type: relpath}))):
for child in reversed(list(self.list_children(**{path_type: relpath, 'encoding': kwargs.get('encoding', None)}))):
if child is None or child.is_dot() or child.is_dotdot():
continue

Expand Down

0 comments on commit 08f3e03

Please sign in to comment.