From 24b6a43c215a73f5b826dcd604902915d3433a69 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 10 Jun 2022 17:44:13 -0700 Subject: [PATCH 1/4] Create POSIX safe filename function Signed-off-by: Jono Yang --- src/commoncode/paths.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/commoncode/paths.py b/src/commoncode/paths.py index 402ef14..f3c6f29 100644 --- a/src/commoncode/paths.py +++ b/src/commoncode/paths.py @@ -198,6 +198,40 @@ def portable_filename(filename, preserve_spaces=False): return filename + +posix_legal_punctuation = r"!@#$%^&\*\(\)-_=\+\[\{\]\}\\\|;:'\",<.>\/\?`~\ " +posix_legal_characters = r"A-Za-z0-9" + posix_legal_punctuation +posix_illegal_characters_re = r"[^" + posix_legal_characters + r"]" +replace_illegal_posix_chars = re.compile(posix_illegal_characters_re).sub + + +def posix_safe_filename(filename): + """ + Return a new name for `filename` that is portable across POSIX systems. + + Filenames returned by `posix_safe_filename` are not guarenteed to be valid + on Windows systems as they may contain characters not allowed in Windows + filenames. + """ + filename = toascii(filename, translit=True) + + if not filename: + return '_' + + filename = replace_illegal_posix_chars('_', filename) + + # no name made only of dots. + if set(filename) == set(['.']): + filename = 'dot' * len(filename) + + # replaced any leading dotdot + if filename != '..' 
and filename.startswith('..'): + while filename.startswith('..'): + filename = filename.replace('..', '__', 1) + + return filename + + # # paths comparisons, common prefix and suffix extraction # From f37a07bddd36ac5d57e90f243e718ae2c7799370 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 10 Jun 2022 17:59:29 -0700 Subject: [PATCH 2/4] Update portable_filename with new posix option Signed-off-by: Jono Yang --- src/commoncode/paths.py | 81 ++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 49 deletions(-) diff --git a/src/commoncode/paths.py b/src/commoncode/paths.py index f3c6f29..6f1602c 100644 --- a/src/commoncode/paths.py +++ b/src/commoncode/paths.py @@ -144,7 +144,16 @@ def resolve(path, posix=True): replace_illegal_chars_exc_spaces = re.compile(illegal_chars_exc_spaces_re).sub -def portable_filename(filename, preserve_spaces=False): +posix_legal_punctuation = r"!@#$%^&\*\(\)-_=\+\[\{\]\}\\\|;:'\",<.>\/\?`~" +posix_legal_chars = r"A-Za-z0-9" + posix_legal_punctuation +posix_legal_chars_inc_spaces = posix_legal_chars + legal_spaces +posix_illegal_chars_re = r"[^" + posix_legal_chars + r"]" +posix_illegal_chars_exc_spaces_re = r"[^" + posix_legal_chars_inc_spaces + r"]" +replace_illegal_posix_chars = re.compile(posix_illegal_chars_re).sub +replace_illegal_posix_chars_exc_spaces = re.compile(posix_illegal_chars_exc_spaces_re).sub + + +def portable_filename(filename, preserve_spaces=False, posix_only=False): """ Return a new name for `filename` that is portable across operating systems. 
@@ -170,55 +179,29 @@ def portable_filename(filename, preserve_spaces=False): if not filename: return '_' - if preserve_spaces: - filename = replace_illegal_chars_exc_spaces('_', filename) + if posix_only: + if preserve_spaces: + filename = replace_illegal_posix_chars_exc_spaces('_', filename) + else: + filename = replace_illegal_posix_chars('_', filename) else: - filename = replace_illegal_chars('_', filename) - - # these are illegal both upper and lowercase and with or without an extension - # we insert an underscore after the base name. - windows_illegal_names = set([ - 'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', - 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9', - 'aux', 'con', 'nul', 'prn' - ]) - - basename, dot, extension = filename.partition('.') - if basename.lower() in windows_illegal_names: - filename = ''.join([basename, '_', dot, extension]) - - # no name made only of dots. - if set(filename) == set(['.']): - filename = 'dot' * len(filename) - - # replaced any leading dotdot - if filename != '..' and filename.startswith('..'): - while filename.startswith('..'): - filename = filename.replace('..', '__', 1) - - return filename - - -posix_legal_punctuation = r"!@#$%^&\*\(\)-_=\+\[\{\]\}\\\|;:'\",<.>\/\?`~\ " -posix_legal_characters = r"A-Za-z0-9" + posix_legal_punctuation -posix_illegal_characters_re = r"[^" + posix_legal_characters + r"]" -replace_illegal_posix_chars = re.compile(posix_illegal_characters_re).sub - - -def posix_safe_filename(filename): - """ - Return a new name for `filename` that is portable across POSIX systems. - - Filenames returned by `posix_safe_filename` are not guarenteed to be valid - on Windows systems as they may contain characters not allowed in Windows - filenames. 
- """ - filename = toascii(filename, translit=True) - - if not filename: - return '_' - - filename = replace_illegal_posix_chars('_', filename) + if preserve_spaces: + filename = replace_illegal_chars_exc_spaces('_', filename) + else: + filename = replace_illegal_chars('_', filename) + + if not posix_only: + # these are illegal both upper and lowercase and with or without an extension + # we insert an underscore after the base name. + windows_illegal_names = set([ + 'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', + 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9', + 'aux', 'con', 'nul', 'prn' + ]) + + basename, dot, extension = filename.partition('.') + if basename.lower() in windows_illegal_names: + filename = ''.join([basename, '_', dot, extension]) # no name made only of dots. if set(filename) == set(['.']): From 6434fdf2f74a41371f52514b6cf0b5b587553411 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 10 Jun 2022 18:05:15 -0700 Subject: [PATCH 3/4] Update safe_path function signature Signed-off-by: Jono Yang --- src/commoncode/paths.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/commoncode/paths.py b/src/commoncode/paths.py index 6f1602c..ef1c667 100644 --- a/src/commoncode/paths.py +++ b/src/commoncode/paths.py @@ -26,7 +26,7 @@ # Build OS-portable and safer paths -def safe_path(path, posix=False, preserve_spaces=False): +def safe_path(path, posix=False, preserve_spaces=False, posix_only=False): """ Convert `path` to a safe and portable POSIX path usable on multiple OSes. 
The returned path is an ASCII-only byte string, resolved for relative @@ -52,7 +52,13 @@ def safe_path(path, posix=False, preserve_spaces=False): _pathmod, path_sep = path_handlers(path, posix) segments = [s.strip() for s in path.split(path_sep) if s.strip()] - segments = [portable_filename(s, preserve_spaces=preserve_spaces) for s in segments] + segments = [ + portable_filename( + s, + preserve_spaces=preserve_spaces, + posix_only=posix_only + ) for s in segments + ] if not segments: return '_' From b88d65ad9d8640660814c6dfc8b9fc1cf187ecb3 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 2 Aug 2022 16:34:07 -0700 Subject: [PATCH 4/4] Test posix_only argument for portable_filename * Test posix_only argument for safe_path * Update CHANGELOG.rst Signed-off-by: Jono Yang --- CHANGELOG.rst | 12 ++++++++---- src/commoncode/paths.py | 32 ++++++++++++++++---------------- tests/test_paths.py | 26 ++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b65d657..66ca009 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,10 +1,14 @@ Release notes ============= -Version (next) +Version (next) ------------------------------ -TBD. +- Add ``posix_only`` option to ``commoncode.paths.portable_filename`` and + ``commoncode.paths.safe_path``. When ``posix_only`` is true, these functions + no longer rename reserved Windows file names (such as ``con`` or ``nul``) or + replace punctuation characters that are valid on POSIX operating systems but + not on Windows. Version 31.0.0 - (2022-05-16) ------------------------------ @@ -50,7 +54,7 @@ This is a major version with API-breaking changes in the resource module. otherwise missing from files path list. In particular this behaviour changed when you create a VirtualCodebase from a previous Codebase created with a "full_root" argument. Previously, the - missing paths of a "full_root" Codebase were kept unchanged.
+ missing paths of a "full_root" Codebase were kept unchanged. Note that the VirtualCodebase has always ignored the "full_root" argument. - The Codebase and VirtualCodebase are now iterable. Iterating on a codebase @@ -80,7 +84,7 @@ Other changes: - Remove Python upper version limit. - Merge latest skeleton -- fileutils.parent_directory() now accepts a "with_trail" argument. +- fileutils.parent_directory() now accepts a "with_trail" argument. The returned directory has a trailing path separator unless with_trail is False. The default is True and the default behaviour is unchanged. diff --git a/src/commoncode/paths.py b/src/commoncode/paths.py index ef1c667..e9e7141 100644 --- a/src/commoncode/paths.py +++ b/src/commoncode/paths.py @@ -140,9 +140,10 @@ def resolve(path, posix=True): return path -legal_punctuation = r"!\#$%&\(\)\+,\-\.;\=@\[\]_\{\}\~" -legal_spaces = r" " -legal_chars = r'A-Za-z0-9' + legal_punctuation +legal_punctuation = r'!\#$%&\(\)\+,\-\.;\=@\[\]_\{\}\~' +legal_spaces = r' ' +legal_alphanumeric = r'A-Za-z0-9' +legal_chars = legal_alphanumeric + legal_punctuation legal_chars_inc_spaces = legal_chars + legal_spaces illegal_chars_re = r'[^' + legal_chars + r']' illegal_chars_exc_spaces_re = r'[^' + legal_chars_inc_spaces + r']' @@ -150,15 +151,22 @@ def resolve(path, posix=True): replace_illegal_chars_exc_spaces = re.compile(illegal_chars_exc_spaces_re).sub -posix_legal_punctuation = r"!@#$%^&\*\(\)-_=\+\[\{\]\}\\\|;:'\",<.>\/\?`~" -posix_legal_chars = r"A-Za-z0-9" + posix_legal_punctuation +posix_legal_punctuation = r'<:"/>\|\*\^\\\'`\?' 
+ legal_punctuation +posix_legal_chars = legal_alphanumeric + posix_legal_punctuation posix_legal_chars_inc_spaces = posix_legal_chars + legal_spaces -posix_illegal_chars_re = r"[^" + posix_legal_chars + r"]" -posix_illegal_chars_exc_spaces_re = r"[^" + posix_legal_chars_inc_spaces + r"]" +posix_illegal_chars_re = r'[^' + posix_legal_chars + r']' +posix_illegal_chars_exc_spaces_re = r'[^' + posix_legal_chars_inc_spaces + r']' replace_illegal_posix_chars = re.compile(posix_illegal_chars_re).sub replace_illegal_posix_chars_exc_spaces = re.compile(posix_illegal_chars_exc_spaces_re).sub +ILLEGAL_WINDOWS_NAMES = set([ + 'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', + 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9', + 'aux', 'con', 'nul', 'prn' +]) + + def portable_filename(filename, preserve_spaces=False, posix_only=False): """ Return a new name for `filename` that is portable across operating systems. @@ -197,16 +205,8 @@ def portable_filename(filename, preserve_spaces=False, posix_only=False): filename = replace_illegal_chars('_', filename) if not posix_only: - # these are illegal both upper and lowercase and with or without an extension - # we insert an underscore after the base name. - windows_illegal_names = set([ - 'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', - 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9', - 'aux', 'con', 'nul', 'prn' - ]) - basename, dot, extension = filename.partition('.') - if basename.lower() in windows_illegal_names: + if basename.lower() in ILLEGAL_WINDOWS_NAMES: filename = ''.join([basename, '_', dot, extension]) # no name made only of dots. 
diff --git a/tests/test_paths.py b/tests/test_paths.py index 7357182..be855c2 100644 --- a/tests/test_paths.py +++ b/tests/test_paths.py @@ -93,6 +93,14 @@ def test_safe_path_posix_style_many_dots(self): expected = 'dotdot/dotdot/dotdot/webform.components.inc' assert test == expected + def test_safe_path_posix_only(self): + test_path = 'var/lib/dpkg/info/libgsm1:amd64.list' + test = paths.safe_path(test_path) + expected = 'var/lib/dpkg/info/libgsm1_amd64.list' + assert test == expected + test = paths.safe_path(test_path, posix_only=True) + assert test == test_path + def test_resolve_mixed_slash(self): test = paths.resolve('C:\\..\\./drupal.js') expected = 'C/drupal.js' @@ -140,6 +148,24 @@ def test_portable_filename(self): expected = 'This_contain_UMLAUT_umlauts.txt' assert paths.portable_filename(u'This contain UMLAUT \xfcml\xe4uts.txt') == expected + # Check how file names that are reserved on Windows are handled + for illegal_window_name in paths.ILLEGAL_WINDOWS_NAMES: + # By default, names that are illegal on Windows get a trailing underscore + expected = f'{illegal_window_name}_' + assert paths.portable_filename(illegal_window_name) == expected + + # With posix_only=True, names that are illegal on Windows are kept as-is + assert paths.portable_filename(illegal_window_name, posix_only=True) == illegal_window_name + + # Check that posix_only=True keeps punctuation that is valid on POSIX, and + # that the default mode replaces punctuation that is illegal on Windows + for valid_posix_path_char in paths.posix_legal_punctuation: + test_name = f'test{valid_posix_path_char}' + assert paths.portable_filename(test_name, posix_only=True) == test_name + if valid_posix_path_char not in paths.legal_punctuation: + expected = f'test_' + assert paths.portable_filename(test_name) == expected + class TestCommonPath(TestCase):