Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make safe filename safe to use on POSIX #44

Merged
merged 4 commits into from
Aug 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
Release notes
=============

Version (next)
Version (next)
------------------------------

TBD.
- Add a ``posix_only`` option to ``commoncode.paths.portable_filename`` and
``commoncode.paths.safe_path``. When this option is enabled, these two
functions no longer replace file names and punctuation characters that are
valid on POSIX operating systems but not on Windows.

Version 31.0.0 - (2022-05-16)
------------------------------
Expand Down Expand Up @@ -50,7 +54,7 @@ This is a major version with API-breaking changes in the resource module.
otherwise missing from files path list.
In particular this behaviour changed when you create a VirtualCodebase from
a previous Codebase created with a "full_root" argument. Previously, the
missing paths of a "full_root" Codebase were kept unchanged.
missing paths of a "full_root" Codebase were kept unchanged.
Note that the VirtualCodebase has always ignored the "full_root" argument.

- The Codebase and VirtualCodebase are now iterable. Iterating on a codebase
Expand Down Expand Up @@ -80,7 +84,7 @@ Other changes:

- Remove Python upper version limit.
- Merge latest skeleton
- fileutils.parent_directory() now accepts a "with_trail" argument.
- fileutils.parent_directory() now accepts a "with_trail" argument.
The returned directory has a trailing path separator unless with_trail is False.
The default is True and the default behaviour is unchanged.

Expand Down
63 changes: 43 additions & 20 deletions src/commoncode/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# Build OS-portable and safer paths


def safe_path(path, posix=False, preserve_spaces=False):
def safe_path(path, posix=False, preserve_spaces=False, posix_only=False):
"""
Convert `path` to a safe and portable POSIX path usable on multiple OSes.
The returned path is an ASCII-only byte string, resolved for relative
Expand All @@ -52,7 +52,13 @@ def safe_path(path, posix=False, preserve_spaces=False):
_pathmod, path_sep = path_handlers(path, posix)

segments = [s.strip() for s in path.split(path_sep) if s.strip()]
segments = [portable_filename(s, preserve_spaces=preserve_spaces) for s in segments]
segments = [
portable_filename(
s,
preserve_spaces=preserve_spaces,
posix_only=posix_only
) for s in segments
]

if not segments:
return '_'
Expand Down Expand Up @@ -134,17 +140,34 @@ def resolve(path, posix=True):
return path


legal_punctuation = r"!\#$%&\(\)\+,\-\.;\=@\[\]_\{\}\~"
legal_spaces = r" "
legal_chars = r'A-Za-z0-9' + legal_punctuation
# Fragments of a regex character class listing the characters kept as-is in
# a portable file name. They are written pre-escaped so they can be spliced
# directly between "[" and "]".
legal_alphanumeric = r'A-Za-z0-9'
legal_punctuation = r'!\#$%&\(\)\+,\-\.;\=@\[\]_\{\}\~'
legal_spaces = r' '
legal_chars = f'{legal_alphanumeric}{legal_punctuation}'
legal_chars_inc_spaces = f'{legal_chars}{legal_spaces}'

# Negated classes: match any single character that is NOT in the legal set.
illegal_chars_re = f'[^{legal_chars}]'
illegal_chars_exc_spaces_re = f'[^{legal_chars_inc_spaces}]'

# Bound ``sub`` methods of the compiled patterns, called as
# ``replace_xxx(replacement, text)``. The "exc_spaces" variant leaves spaces
# untouched.
replace_illegal_chars = re.compile(illegal_chars_re).sub
replace_illegal_chars_exc_spaces = re.compile(illegal_chars_exc_spaces_re).sub


def portable_filename(filename, preserve_spaces=False):
# POSIX variant of the legal character set: the regular legal punctuation
# plus extra punctuation characters that are additionally kept as-is when
# only POSIX compatibility is requested.
# NOTE(review): "/" is part of this class, so a "/" inside a single file name
# is left in place by the posix replacers below -- confirm this is intended.
posix_legal_punctuation = r'<:"/>\|\*\^\\\'`\?' + legal_punctuation
posix_legal_chars = f'{legal_alphanumeric}{posix_legal_punctuation}'
posix_legal_chars_inc_spaces = f'{posix_legal_chars}{legal_spaces}'

# Negated classes: match any single character outside the POSIX legal set.
posix_illegal_chars_re = f'[^{posix_legal_chars}]'
posix_illegal_chars_exc_spaces_re = f'[^{posix_legal_chars_inc_spaces}]'

# Bound ``sub`` methods of the compiled patterns, called as
# ``replace_xxx(replacement, text)``. The "exc_spaces" variant leaves spaces
# untouched.
replace_illegal_posix_chars = re.compile(posix_illegal_chars_re).sub
replace_illegal_posix_chars_exc_spaces = re.compile(posix_illegal_chars_exc_spaces_re).sub


# Lowercase base names that get special treatment as illegal Windows file
# names: serial ports, parallel ports and a few other device names.
ILLEGAL_WINDOWS_NAMES = {
    # serial ports
    'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9',
    # parallel ports
    'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
    # other devices
    'aux', 'con', 'nul', 'prn',
}


def portable_filename(filename, preserve_spaces=False, posix_only=False):
"""
Return a new name for `filename` that is portable across operating systems.

Expand All @@ -170,22 +193,21 @@ def portable_filename(filename, preserve_spaces=False):
if not filename:
return '_'

if preserve_spaces:
filename = replace_illegal_chars_exc_spaces('_', filename)
if posix_only:
if preserve_spaces:
filename = replace_illegal_posix_chars_exc_spaces('_', filename)
else:
filename = replace_illegal_posix_chars('_', filename)
else:
filename = replace_illegal_chars('_', filename)

# these are illegal both upper and lowercase and with or without an extension
# we insert an underscore after the base name.
windows_illegal_names = set([
'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9',
'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
'aux', 'con', 'nul', 'prn'
])
if preserve_spaces:
filename = replace_illegal_chars_exc_spaces('_', filename)
else:
filename = replace_illegal_chars('_', filename)

basename, dot, extension = filename.partition('.')
if basename.lower() in windows_illegal_names:
filename = ''.join([basename, '_', dot, extension])
if not posix_only:
basename, dot, extension = filename.partition('.')
if basename.lower() in ILLEGAL_WINDOWS_NAMES:
filename = ''.join([basename, '_', dot, extension])

# no name made only of dots.
if set(filename) == set(['.']):
Expand All @@ -198,6 +220,7 @@ def portable_filename(filename, preserve_spaces=False):

return filename


#
# paths comparisons, common prefix and suffix extraction
#
Expand Down
26 changes: 26 additions & 0 deletions tests/test_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,14 @@ def test_safe_path_posix_style_many_dots(self):
expected = 'dotdot/dotdot/dotdot/webform.components.inc'
assert test == expected

def test_safe_path_posix_only(self):
    # The last segment of this path contains a colon.
    posix_path = 'var/lib/dpkg/info/libgsm1:amd64.list'

    # By default the colon is replaced with an underscore.
    portable = paths.safe_path(posix_path)
    assert portable == 'var/lib/dpkg/info/libgsm1_amd64.list'

    # With posix_only=True the path comes back unchanged.
    posix_safe = paths.safe_path(posix_path, posix_only=True)
    assert posix_safe == posix_path

def test_resolve_mixed_slash(self):
test = paths.resolve('C:\\..\\./drupal.js')
expected = 'C/drupal.js'
Expand Down Expand Up @@ -140,6 +148,24 @@ def test_portable_filename(self):
expected = 'This_contain_UMLAUT_umlauts.txt'
assert paths.portable_filename(u'This contain UMLAUT \xfcml\xe4uts.txt') == expected

# Check to see if illegal Windows filenames are properly handled
for illegal_window_name in paths.ILLEGAL_WINDOWS_NAMES:
# Rename files with names that are illegal on Windows
expected = f'{illegal_window_name}_'
assert paths.portable_filename(illegal_window_name) == expected

# Allow files with names that are illegal on Windows
assert paths.portable_filename(illegal_window_name, posix_only=True) == illegal_window_name

# Check that punctuation characters that are illegal in Windows filenames
# are kept when posix_only is True and replaced when it is not
for valid_posix_path_char in paths.posix_legal_punctuation:
test_name = f'test{valid_posix_path_char}'
assert paths.portable_filename(test_name, posix_only=True) == test_name
if valid_posix_path_char not in paths.legal_punctuation:
expected = f'test_'
assert paths.portable_filename(test_name) == expected


class TestCommonPath(TestCase):

Expand Down