From 0c1b379ac1df511b01987732ed221e425833f948 Mon Sep 17 00:00:00 2001 From: Taus Date: Tue, 29 Apr 2025 15:12:38 +0000 Subject: [PATCH 01/10] Python: Extract files in hidden dirs by default Changes the default behaviour of the Python extractor so files inside hidden directories are extracted by default. Also adds an extractor option, `skip_hidden_directories`, which can be set to `true` in order to revert to the old behaviour. Finally, I made the logic surrounding what is logged in various cases a bit more obvious. Technically this changes the behaviour of the extractor (in that hidden excluded files will now be logged as `(excluded)`, but I think this makes more sense anyway. --- python/codeql-extractor.yml | 7 +++++++ python/extractor/semmle/traverser.py | 16 ++++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/python/codeql-extractor.yml b/python/codeql-extractor.yml index 97a9e1f2cf2f..2bd1a9c0aa76 100644 --- a/python/codeql-extractor.yml +++ b/python/codeql-extractor.yml @@ -44,3 +44,10 @@ options: Use this setting with caution, the Python extractor requires Python 3 to run. type: string pattern: "^(py|python|python3)$" + skip_hidden_directories: + title: Controls whether hidden directories are skipped during extraction. + description: > + By default, CodeQL will extract all Python files, including ones located in hidden directories. By setting this option to true, these hidden directories will be skipped instead. + Accepted values are true and false. + type: string + pattern: "^(true|false)$" diff --git a/python/extractor/semmle/traverser.py b/python/extractor/semmle/traverser.py index ad8bd38ae735..0945d8ace4bf 100644 --- a/python/extractor/semmle/traverser.py +++ b/python/extractor/semmle/traverser.py @@ -83,11 +83,10 @@ def _treewalk(self, path): self.logger.debug("Ignoring %s (symlink)", fullpath) continue if isdir(fullpath): - if fullpath in self.exclude_paths or is_hidden(fullpath): - if is_hidden(fullpath): - self.logger.debug("Ignoring %s (hidden)", fullpath) - else: - self.logger.debug("Ignoring %s (excluded)", fullpath) + if fullpath in self.exclude_paths: + self.logger.debug("Ignoring %s (excluded)", fullpath) + elif is_hidden(fullpath): + self.logger.debug("Ignoring %s (hidden)", fullpath) else: empty = True for item in self._treewalk(fullpath): @@ -101,7 +100,12 @@ def _treewalk(self, path): self.logger.debug("Ignoring %s (filter)", fullpath) -if os.name== 'nt': +if os.environ.get("CODEQL_EXTRACTOR_PYTHON_OPTION_SKIP_HIDDEN_DIRECTORIES", "false") == "false": + + def is_hidden(path): + return False + +elif os.name== 'nt': import ctypes def is_hidden(path): From 605f2bff9ccf53b35751a371436d2ee62329b56e Mon Sep 17 00:00:00 2001 From: Taus Date: Fri, 2 May 2025 12:42:23 +0000 Subject: [PATCH 02/10] Python: Add integration test --- .../hidden-files/query-default.expected | 5 ++++ .../hidden-files/query-skipped.expected | 4 ++++ .../hidden-files/query.ql | 3 +++ .../.hidden_dir/visible_file_in_hidden_dir.py | 0 .../hidden-files/repo_dir/.hidden_file.py | 0 .../hidden-files/repo_dir/foo.py | 1 + .../cli-integration-test/hidden-files/test.sh | 24 +++++++++++++++++++ 7 files changed, 37 insertions(+) create mode 100644 python/extractor/cli-integration-test/hidden-files/query-default.expected create mode 100644 python/extractor/cli-integration-test/hidden-files/query-skipped.expected create mode 100644 python/extractor/cli-integration-test/hidden-files/query.ql create mode 100644 python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/visible_file_in_hidden_dir.py create mode 100644 python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_file.py create mode 100644 python/extractor/cli-integration-test/hidden-files/repo_dir/foo.py create mode 100755 python/extractor/cli-integration-test/hidden-files/test.sh diff --git a/python/extractor/cli-integration-test/hidden-files/query-default.expected b/python/extractor/cli-integration-test/hidden-files/query-default.expected new file mode 100644 index 000000000000..cc92af624b37 --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/query-default.expected @@ -0,0 +1,5 @@ +| name | ++-------------------------------+ +| .hidden_file.py | +| foo.py | +| visible_file_in_hidden_dir.py | diff --git a/python/extractor/cli-integration-test/hidden-files/query-skipped.expected b/python/extractor/cli-integration-test/hidden-files/query-skipped.expected new file mode 100644 index 000000000000..688dbe00d570 --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/query-skipped.expected @@ -0,0 +1,4 @@ +| name | ++-----------------+ +| .hidden_file.py | +| foo.py | diff --git a/python/extractor/cli-integration-test/hidden-files/query.ql b/python/extractor/cli-integration-test/hidden-files/query.ql new file mode 100644 index 000000000000..3b1b3c03849b --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/query.ql @@ -0,0 +1,3 @@ +import python + +select any(File f).getShortName() as name order by name diff --git a/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/visible_file_in_hidden_dir.py b/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/visible_file_in_hidden_dir.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_file.py b/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_file.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/extractor/cli-integration-test/hidden-files/repo_dir/foo.py b/python/extractor/cli-integration-test/hidden-files/repo_dir/foo.py new file mode 100644 index 000000000000..517b47df53c2 --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/repo_dir/foo.py @@ -0,0 +1 @@ +print(42) diff --git a/python/extractor/cli-integration-test/hidden-files/test.sh b/python/extractor/cli-integration-test/hidden-files/test.sh new file mode 100755 index 000000000000..77cb12664af6 --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/test.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -Eeuo pipefail # see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/ + +set -x + +CODEQL=${CODEQL:-codeql} + +SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd "$SCRIPTDIR" + +rm -rf db db-skipped + +# Test 1: Default behavior should be to extract files in hidden directories +$CODEQL database create db --language python --source-root repo_dir/ +$CODEQL query run --database db query.ql > query-default.actual +diff query-default.expected query-default.actual + +# Test 2: Setting the relevant extractor option to true skips files in hidden directories +$CODEQL database create db-skipped --language python --source-root repo_dir/ --extractor-option python.skip_hidden_directories=true +$CODEQL query run --database db-skipped query.ql > query-skipped.actual +diff query-skipped.expected query-skipped.actual + +rm -rf db db-skipped From 67d04d5477065a59f2bc0f706b5af4366b08293b Mon Sep 17 00:00:00 2001 From: Taus Date: Wed, 30 Apr 2025 12:38:31 +0000 Subject: [PATCH 03/10] Python: Add change note --- .../2025-04-30-extract-hidden-files-by-default.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md diff --git a/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md b/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md new file mode 100644 index 000000000000..96372513499f --- /dev/null +++ b/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md @@ -0,0 +1,5 @@ +--- +category: minorAnalysis +--- + +- The Python extractor now extracts files in hidden directories by default. A new extractor option, `skip_hidden_directories` has been added as well. Setting it to `true` will make the extractor revert to the old behavior. From 2ded42c285151dfceb62597e5e767bfae0586f1c Mon Sep 17 00:00:00 2001 From: Taus Date: Fri, 2 May 2025 13:29:52 +0000 Subject: [PATCH 04/10] Python: Update extractor tests --- python/ql/test/2/extractor-tests/hidden/test.expected | 2 ++ python/ql/test/extractor-tests/filter-option/Test.expected | 1 + 2 files changed, 3 insertions(+) diff --git a/python/ql/test/2/extractor-tests/hidden/test.expected b/python/ql/test/2/extractor-tests/hidden/test.expected index ca72363d8f02..21bd0dfb2dd9 100644 --- a/python/ql/test/2/extractor-tests/hidden/test.expected +++ b/python/ql/test/2/extractor-tests/hidden/test.expected @@ -1,3 +1,5 @@ +| .hidden/inner/test.py | +| .hidden/module.py | | folder/module.py | | package | | package/__init__.py | diff --git a/python/ql/test/extractor-tests/filter-option/Test.expected b/python/ql/test/extractor-tests/filter-option/Test.expected index 7ade39a5998c..56b1e36c2a93 100644 --- a/python/ql/test/extractor-tests/filter-option/Test.expected +++ b/python/ql/test/extractor-tests/filter-option/Test.expected @@ -3,3 +3,4 @@ | Module foo.bar | | Module foo.include_test | | Package foo | +| Script hidden_foo.py | From 61719cf448963fbd3baad8bebb4da33bb9e3a354 Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 15 May 2025 14:48:06 +0000 Subject: [PATCH 05/10] Python: Fix a bug in glob conversion If you have a filter like `**/foo/**` set in the `paths-ignore` bit of your config file, then currently the following happens: - First, the CodeQL CLI observes that this string ends in `/**` and strips off the `**` leaving `**/foo/` - Then the Python extractor strips off leading and trailing `/` characters and proceeds to convert `**/foo` into a regex that is matched against files to (potentially) extract. The trouble with this is that it leaves us unable to distinguish between, say, a file `foo.py` and a file `foo/bar.py`. In other words, we have lost the ability to exclude only the _folder_ `foo` and not any files that happen to start with `foo`. To fix this, we instead make a note of whether the glob ends in a forward slash or not, and adjust the regex correspondingly. --- python/extractor/semmle/path_filters.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/extractor/semmle/path_filters.py b/python/extractor/semmle/path_filters.py index cb1a4d9b8bca..1684b0b8fe20 100644 --- a/python/extractor/semmle/path_filters.py +++ b/python/extractor/semmle/path_filters.py @@ -41,6 +41,9 @@ def glob_part_to_regex(glob, add_sep): def glob_to_regex(glob, prefix=""): '''Convert entire glob to a compiled regex''' + # When the glob ends in `/`, we need to remember this so that we don't accidentally add an + # extra separator to the final regex. + end_sep = "" if glob.endswith("/") else SEP glob = glob.strip().strip("/") parts = glob.split("/") #Trailing '**' is redundant, so strip it off. @@ -53,7 +56,7 @@ def glob_to_regex(glob, prefix=""): # something like `C:\\folder\\subfolder\\` and without escaping the # backslash-path-separators will get interpreted as regex escapes (which might be # invalid sequences, causing the extractor to crash) - full_pattern = escape(prefix) + ''.join(parts) + "(?:" + SEP + ".*|$)" + full_pattern = escape(prefix) + ''.join(parts) + "(?:" + end_sep + ".*|$)" return re.compile(full_pattern) def filter_from_pattern(pattern, prev_filter, prefix): From 98388be25c9203e51c1635de2abf91a577fd7c18 Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 15 May 2025 14:49:17 +0000 Subject: [PATCH 06/10] Python: Remove special casing of hidden files If it is necessary to exclude hidden files, then adding ``` paths-ignore: ['**/.*/**'] ``` to the relevant config file is recommended instead. --- python/extractor/semmle/traverser.py | 45 +++++----------------------- 1 file changed, 8 insertions(+), 37 deletions(-) diff --git a/python/extractor/semmle/traverser.py b/python/extractor/semmle/traverser.py index 0945d8ace4bf..4e316a075f75 100644 --- a/python/extractor/semmle/traverser.py +++ b/python/extractor/semmle/traverser.py @@ -85,48 +85,19 @@ def _treewalk(self, path): if isdir(fullpath): if fullpath in self.exclude_paths: self.logger.debug("Ignoring %s (excluded)", fullpath) - elif is_hidden(fullpath): - self.logger.debug("Ignoring %s (hidden)", fullpath) - else: - empty = True - for item in self._treewalk(fullpath): - yield item - empty = False - if not empty: - yield fullpath + continue + + empty = True + for item in self._treewalk(fullpath): + yield item + empty = False + if not empty: + yield fullpath elif self.filter(fullpath): yield fullpath else: self.logger.debug("Ignoring %s (filter)", fullpath) - -if os.environ.get("CODEQL_EXTRACTOR_PYTHON_OPTION_SKIP_HIDDEN_DIRECTORIES", "false") == "false": - - def is_hidden(path): - return False - -elif os.name== 'nt': - import ctypes - - def is_hidden(path): - #Magical windows code - try: - attrs = ctypes.windll.kernel32.GetFileAttributesW(str(path)) - if attrs == -1: - return False - if attrs&2: - return True - except Exception: - #Not sure what to log here, probably best to carry on. - pass - return os.path.basename(path).startswith(".") - -else: - - def is_hidden(path): - return os.path.basename(path).startswith(".") - - def exclude_filter_from_options(options): if options.exclude_package: choices = '|'.join(mod.replace('.', r'\.') for mod in options.exclude_package) From 96558b53b89fb8a72d087db5d55b6ce64848953d Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 15 May 2025 14:53:15 +0000 Subject: [PATCH 07/10] Python: Update test The second test case now sets the `paths-ignore` setting in the config file in order to skip files in hidden directories. --- python/extractor/cli-integration-test/hidden-files/config.yml | 3 +++ .../cli-integration-test/hidden-files/query-default.expected | 1 + .../.hidden_dir/internal_non_hidden/another_non_hidden.py | 0 python/extractor/cli-integration-test/hidden-files/test.sh | 4 ++-- 4 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 python/extractor/cli-integration-test/hidden-files/config.yml create mode 100644 python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/internal_non_hidden/another_non_hidden.py diff --git a/python/extractor/cli-integration-test/hidden-files/config.yml b/python/extractor/cli-integration-test/hidden-files/config.yml new file mode 100644 index 000000000000..69d94597d950 --- /dev/null +++ b/python/extractor/cli-integration-test/hidden-files/config.yml @@ -0,0 +1,3 @@ +name: Test Config +paths-ignore: + - "**/.*/**" diff --git a/python/extractor/cli-integration-test/hidden-files/query-default.expected b/python/extractor/cli-integration-test/hidden-files/query-default.expected index cc92af624b37..72d34a1ab0b0 100644 --- a/python/extractor/cli-integration-test/hidden-files/query-default.expected +++ b/python/extractor/cli-integration-test/hidden-files/query-default.expected @@ -1,5 +1,6 @@ | name | +-------------------------------+ | .hidden_file.py | +| another_non_hidden.py | | foo.py | | visible_file_in_hidden_dir.py | diff --git a/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/internal_non_hidden/another_non_hidden.py b/python/extractor/cli-integration-test/hidden-files/repo_dir/.hidden_dir/internal_non_hidden/another_non_hidden.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/extractor/cli-integration-test/hidden-files/test.sh b/python/extractor/cli-integration-test/hidden-files/test.sh index 77cb12664af6..45485985adbb 100755 --- a/python/extractor/cli-integration-test/hidden-files/test.sh +++ b/python/extractor/cli-integration-test/hidden-files/test.sh @@ -16,8 +16,8 @@ $CODEQL database create db --language python --source-root repo_dir/ $CODEQL query run --database db query.ql > query-default.actual diff query-default.expected query-default.actual -# Test 2: Setting the relevant extractor option to true skips files in hidden directories -$CODEQL database create db-skipped --language python --source-root repo_dir/ --extractor-option python.skip_hidden_directories=true +# Test 2: The default behavior can be overridden by setting `paths-ignore` in the config file +$CODEQL database create db-skipped --language python --source-root repo_dir/ --codescanning-config=config.yml $CODEQL query run --database db-skipped query.ql > query-skipped.actual diff query-skipped.expected query-skipped.actual From 72ae633a64e8b62da1981c44fcd0bcb8501b84e6 Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 15 May 2025 14:58:32 +0000 Subject: [PATCH 08/10] Python: Update change note and extractor config Removes the previously added extractor option and updates the change note to explain how to use `paths-ignore` to exclude files in hidden directories. --- python/codeql-extractor.yml | 7 ------- .../2025-04-30-extract-hidden-files-by-default.md | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/python/codeql-extractor.yml b/python/codeql-extractor.yml index 2bd1a9c0aa76..97a9e1f2cf2f 100644 --- a/python/codeql-extractor.yml +++ b/python/codeql-extractor.yml @@ -44,10 +44,3 @@ options: Use this setting with caution, the Python extractor requires Python 3 to run. type: string pattern: "^(py|python|python3)$" - skip_hidden_directories: - title: Controls whether hidden directories are skipped during extraction. - description: > - By default, CodeQL will extract all Python files, including ones located in hidden directories. By setting this option to true, these hidden directories will be skipped instead. - Accepted values are true and false. - type: string - pattern: "^(true|false)$" diff --git a/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md b/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md index 96372513499f..32b272215af7 100644 --- a/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md +++ b/python/ql/lib/change-notes/2025-04-30-extract-hidden-files-by-default.md @@ -2,4 +2,4 @@ category: minorAnalysis --- -- The Python extractor now extracts files in hidden directories by default. A new extractor option, `skip_hidden_directories` has been added as well. Setting it to `true` will make the extractor revert to the old behavior. +- The Python extractor now extracts files in hidden directories by default. If you would like to skip hidden files, add `paths-ignore: ["**/.*/**"]` to your [Code Scanning config](https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning#specifying-directories-to-scan). When using the CodeQL CLI for extraction, specify the configuration (creating the configuration file if necessary) using the `--codescanning-config` option. From c8cca126a18cc9b7573f650118a0d033f136c84e Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 15 May 2025 14:59:33 +0000 Subject: [PATCH 09/10] Python: Bump extractor version --- python/extractor/semmle/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/extractor/semmle/util.py b/python/extractor/semmle/util.py index e0720a86312b..56f7889ae231 100644 --- a/python/extractor/semmle/util.py +++ b/python/extractor/semmle/util.py @@ -10,7 +10,7 @@ #Semantic version of extractor. #Update this if any changes are made -VERSION = "7.1.2" +VERSION = "7.1.3" PY_EXTENSIONS = ".py", ".pyw" From 2158eaa34c568f148672ca4dbf11dbb39599227b Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 15 May 2025 15:34:11 +0000 Subject: [PATCH 10/10] Python: Fix a bug in glob regex creation The previous version was tested on a version of the code where we had temporarily removed the `glob.strip("/")` bit, and so the bug didn't trigger then. We now correctly remember if the glob ends in `/`, and add an extra part in that case. This way, if the path ends with multiple slashes, they effectively get consolidated into a single one, which results in the correct semantics. --- python/extractor/semmle/path_filters.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/extractor/semmle/path_filters.py b/python/extractor/semmle/path_filters.py index 1684b0b8fe20..908ec4c0ee0a 100644 --- a/python/extractor/semmle/path_filters.py +++ b/python/extractor/semmle/path_filters.py @@ -51,6 +51,11 @@ def glob_to_regex(glob, prefix=""): parts = parts[:-1] if not parts: return ".*" + # The `glob.strip("/")` call above will have removed all trailing slashes, but if there was at + # least one trailing slash, we want there to be an extra part, so we add it explicitly here in + # that case, using the emptyness of `end_sep` as a proxy. + if end_sep == "": + parts += [""] parts = [ glob_part_to_regex(escape(p), True) for p in parts[:-1] ] + [ glob_part_to_regex(escape(parts[-1]), False) ] # we need to escape the prefix, specifically because on windows the prefix will be # something like `C:\\folder\\subfolder\\` and without escaping the