From 2e78d77d7b1b71072197410f037ac9a494d3e78c Mon Sep 17 00:00:00 2001 From: Julien Doutre Date: Thu, 30 Jan 2025 11:31:03 +0100 Subject: [PATCH 1/5] Support scanning github actions --- guarddog/scanners/npm_package_scanner.py | 50 ++++++++++++++++-------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/guarddog/scanners/npm_package_scanner.py b/guarddog/scanners/npm_package_scanner.py index cc4df17c..87036ac7 100644 --- a/guarddog/scanners/npm_package_scanner.py +++ b/guarddog/scanners/npm_package_scanner.py @@ -18,18 +18,41 @@ def __init__(self) -> None: super().__init__(Analyzer(ECOSYSTEM.NPM)) def download_and_get_package_info(self, directory: str, package_name: str, version=None) -> typing.Tuple[dict, str]: - git_target = None - if urlparse(package_name).hostname is not None and package_name.endswith('.git'): - git_target = package_name + git_package_name = self._parse_git_url(package_name) - if not package_name.startswith("@") and package_name.count("/") == 1: - git_target = f"https://github.com/{package_name}.git" + if git_package_name != "": + data, tarball_url = self._get_git_data_and_tarball_url(git_package_name, version) + else: + data, tarball_url = self._get_npm_data_and_tarball_url(package_name, version) - if git_target is not None: - raise Exception("Git targets are not yet supported for npm") + log.debug(f"Downloading NPM package from {tarball_url}") + file_extension = pathlib.Path(tarball_url).suffix + zippath = os.path.join(directory, package_name.replace("/", "-") + file_extension) + unzippedpath = zippath.removesuffix(file_extension) + self.download_compressed(tarball_url, zippath, unzippedpath) + + return data, unzippedpath + + def _parse_git_url(self, package_name: str) -> str: + parsed_url = urlparse(package_name) + + path = parsed_url.path.strip("/") # trim leading and trailing slashes + + # TODO: support other git providers? 
+ if parsed_url.hostname == "github.com" and path.endswith('.git') and not path.startswith("@") and path.count("/") == 1: + return path.removesuffix(".git") + + return "" + def _get_git_data_and_tarball_url(self, package_name: str, version=None) -> typing.Tuple[dict, str]: + if version is None: + return {}, f"https://github.com/{package_name}/archive/refs/heads/main.zip" + else: + return {}, f"https://github.com/{package_name}/archive/refs/tags/{version}.zip" + + def _get_npm_data_and_tarball_url(self, package_name: str, version=None) -> typing.Tuple[dict, str]: url = f"https://registry.npmjs.org/{package_name}" - log.debug(f"Downloading NPM package from {url}") + log.debug(f"Downloading NPM package metadata from {url}") response = requests.get(url) if response.status_code != 200: @@ -37,16 +60,9 @@ def download_and_get_package_info(self, directory: str, package_name: str, versi data = response.json() if "name" not in data: raise Exception(f"Error retrieving package: {package_name}") + # if version is none, we only scan the last package # TODO: figure logs and log it when we do that version = data["dist-tags"]["latest"] if version is None else version - details = data["versions"][version] - - tarball_url = details["dist"]["tarball"] - file_extension = pathlib.Path(tarball_url).suffix - zippath = os.path.join(directory, package_name.replace("/", "-") + file_extension) - unzippedpath = zippath.removesuffix(file_extension) - self.download_compressed(tarball_url, zippath, unzippedpath) - - return data, unzippedpath + return data, data["versions"][version]["dist"]["tarball"] From 23ed0edbadd4001f63b816fff3e12c54f60c0326 Mon Sep 17 00:00:00 2001 From: Julien Doutre Date: Thu, 30 Jan 2025 11:40:26 +0100 Subject: [PATCH 2/5] lint --- guarddog/scanners/npm_package_scanner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/guarddog/scanners/npm_package_scanner.py b/guarddog/scanners/npm_package_scanner.py index 87036ac7..14c558f2 100644 --- 
a/guarddog/scanners/npm_package_scanner.py +++ b/guarddog/scanners/npm_package_scanner.py @@ -36,10 +36,13 @@ def download_and_get_package_info(self, directory: str, package_name: str, versi def _parse_git_url(self, package_name: str) -> str: parsed_url = urlparse(package_name) - path = parsed_url.path.strip("/") # trim leading and trailing slashes + path = parsed_url.path.strip("/") # trim leading and trailing slashes # TODO: support other git providers? - if parsed_url.hostname == "github.com" and path.endswith('.git') and not path.startswith("@") and path.count("/") == 1: + if parsed_url.hostname == "github.com" and \ + path.endswith('.git') and \ + not path.startswith("@") and \ + path.count("/") == 1: return path.removesuffix(".git") return "" From a6853a543be0ba3899f471e465639a78b36ddd76 Mon Sep 17 00:00:00 2001 From: Julien Doutre Date: Thu, 30 Jan 2025 11:58:33 +0100 Subject: [PATCH 3/5] Support all default branch names --- guarddog/scanners/npm_package_scanner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guarddog/scanners/npm_package_scanner.py b/guarddog/scanners/npm_package_scanner.py index 14c558f2..faeaa1bc 100644 --- a/guarddog/scanners/npm_package_scanner.py +++ b/guarddog/scanners/npm_package_scanner.py @@ -49,7 +49,7 @@ def _parse_git_url(self, package_name: str) -> str: def _get_git_data_and_tarball_url(self, package_name: str, version=None) -> typing.Tuple[dict, str]: if version is None: - return {}, f"https://github.com/{package_name}/archive/refs/heads/main.zip" + return {}, f"https://api.github.com/repos/{package_name}/zipball" else: return {}, f"https://github.com/{package_name}/archive/refs/tags/{version}.zip" From cc257ecf5e1adab598a62f3138521b19ce470bcc Mon Sep 17 00:00:00 2001 From: Julien Doutre Date: Thu, 30 Jan 2025 12:04:51 +0100 Subject: [PATCH 4/5] Fix missing zip file extension error --- guarddog/scanners/npm_package_scanner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git
a/guarddog/scanners/npm_package_scanner.py b/guarddog/scanners/npm_package_scanner.py index faeaa1bc..d8c5eca4 100644 --- a/guarddog/scanners/npm_package_scanner.py +++ b/guarddog/scanners/npm_package_scanner.py @@ -27,6 +27,8 @@ def download_and_get_package_info(self, directory: str, package_name: str, versi log.debug(f"Downloading NPM package from {tarball_url}") file_extension = pathlib.Path(tarball_url).suffix + if file_extension == "": + file_extension = ".zip" zippath = os.path.join(directory, package_name.replace("/", "-") + file_extension) unzippedpath = zippath.removesuffix(file_extension) self.download_compressed(tarball_url, zippath, unzippedpath) From b20a248863206cae29c426a692ba90741ebfb9d9 Mon Sep 17 00:00:00 2001 From: Julien Doutre Date: Thu, 30 Jan 2025 13:37:39 +0100 Subject: [PATCH 5/5] Add test --- tests/core/test_npm_package_scanner.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/core/test_npm_package_scanner.py b/tests/core/test_npm_package_scanner.py index 95bf5b83..26fffdc7 100644 --- a/tests/core/test_npm_package_scanner.py +++ b/tests/core/test_npm_package_scanner.py @@ -25,14 +25,12 @@ def test_download_and_get_package_info_npm_namespaced(): assert os.path.exists(os.path.join(tmpdirname, "@datadog-browser-logs")) -@pytest.mark.parametrize("identifier", ["expressjs/express", "https://github.com/expressjs/express.git"]) -@pytest.mark.skip("Git targets are not yet supported for npm") -def test_download_and_get_package_info_from_github(identifier): +def test_download_and_get_package_info_from_github(): scanner = NPMPackageScanner() with tempfile.TemporaryDirectory() as tmpdirname: - data, path = scanner.download_and_get_package_info(tmpdirname, "identifier") - assert os.path.exists(os.path.join(tmpdirname, "express", "package", "package.json")) - assert "1.0.0" in data["versions"] + data, path = scanner.download_and_get_package_info(tmpdirname, "https://github.com/expressjs/express.git", "v5.0.0") + 
assert not data + assert os.path.exists(os.path.join(tmpdirname, "https:--github.com-expressjs-express.git", "express-5.0.0", "package.json")) def test_download_and_get_package_info_non_existing_packages():