From 2144921389a94315cd911f2f69a2ff05ed39e739 Mon Sep 17 00:00:00 2001
From: ddelange <14880945+ddelange@users.noreply.github.com>
Date: Mon, 8 Feb 2021 16:13:11 +0100
Subject: [PATCH] :sparkles: Add extract_licences_from_wheel

---
 src/pipgrip/cli.py            | 17 ++++++++
 src/pipgrip/licenses.py       | 81 +++++++++++++++++++++++++++++++++++
 src/pipgrip/package_source.py |  2 +-
 3 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 src/pipgrip/licenses.py

diff --git a/src/pipgrip/cli.py b/src/pipgrip/cli.py
index 529ef5f..a86a11f 100755
--- a/src/pipgrip/cli.py
+++ b/src/pipgrip/cli.py
@@ -18,6 +18,7 @@
 from pipgrip.libs.mixology.failure import SolverFailure
 from pipgrip.libs.mixology.package import Package
 from pipgrip.libs.mixology.version_solver import VersionSolver
+from pipgrip.licenses import get_licenses
 from pipgrip.package_source import PackageSource
 from pipgrip.pipper import install_packages, read_requirements
 
@@ -312,6 +313,11 @@ def render_lock(packages, include_dot=True, sort=False):
     is_flag=True,
     help="Include pre-release and development versions. By default, pip implicitly excludes pre-releases (unless specified otherwise by PEP 440).",
 )
+@click.option(
+    "--detect-licenses",
+    is_flag=True,
+    help="Detect and extract license info for each dependency. Completeness can not be guaranteed.",
+)
 @click.option(
     "-v",
     "--verbose",
@@ -339,6 +345,7 @@ def main(
     index_url,
     extra_index_url,
     pre,
+    detect_licenses,
     verbose,
 ):
     if verbose == 0:
@@ -437,6 +444,16 @@ def main(
             source, decision_packages
         )
 
+        if detect_licenses:
+            licenses = {
+                package_key: get_licenses(
+                    **source._packages_metadata[package_key][package_version]
+                )
+                for package_key, package_version in packages_flat.items()
+            }
+            # TODO add to the various output formats
+            logger.info("Licenses:\n%s", json.dumps(licenses))
+
         if lock:
             with io.open(
                 os.path.join(os.getcwd(), "pipgrip.lock"), mode="w", encoding="utf-8"
diff --git a/src/pipgrip/licenses.py b/src/pipgrip/licenses.py
new file mode 100644
--- /dev/null
+++ b/src/pipgrip/licenses.py
@@ -0,0 +1,81 @@
+from collections import OrderedDict
+from zipfile import ZipFile
+
+# both spellings occur in the wild, e.g. LICENSE.txt and LICENCE
+LICENSE_MARKERS = ("license", "licence")
+
+
+def extract_licences_from_wheel(wheel_fname):
+    """Extract contents of files with 'license' or 'licence' in their filename.
+
+    Only the basename of each archive member is inspected, so a directory
+    called e.g. ``licensing/`` does not drag in all of its files.
+
+    :param wheel_fname: path to the wheel (zip archive) to scan
+    :return: OrderedDict mapping archive member name to its decoded text
+    """
+    licenses = OrderedDict()
+
+    # context manager closes the archive handle, also when reading fails
+    with ZipFile(wheel_fname, "r") as zfp:
+        # missing AUTHORS, COPYING and other legal files
+        for name in zfp.namelist():
+            # split on both separators: some archives contain backslashes
+            basename = name.replace("\\", "/").rsplit("/", 1)[-1].lower()
+            if any(marker in basename for marker in LICENSE_MARKERS):
+                # decode, as bytes can not be serialised by json.dumps
+                licenses[name] = zfp.read(name).decode("utf-8", "replace")
+
+    return licenses
+
+
+def get_licenses(wheel_fname, wheel_metadata, **kwargs):
+    """Extract copyright related info using a wheel as input.
+
+    :param wheel_fname: path to the wheel to scan for license files
+    :param wheel_metadata: mapping with (optional) home_page, project_urls,
+        author and author_email entries
+    :param kwargs: ignored, allows passing a full metadata dict as kwargs
+    :return: OrderedDict with author, author_email, project_urls and licenses
+    """
+    # parse all urls mentioned in wheel_metadata
+    home_page = wheel_metadata.get("home_page", "")
+    project_urls = (
+        OrderedDict((("home_page", home_page),)) if home_page else OrderedDict()
+    )
+    # entries look like "Label, https://url": split once, skip malformed ones
+    project_urls.update(
+        x.split(", ", 1)
+        for x in wheel_metadata.get("project_urls") or []
+        if ", " in x
+    )
+
+    # first attempt at getting licenses based on filename
+    # e.g. for pip this is incomplete as the pip wheel doesn't contain vendored licenses
+    # https://github.com/pypa/pip/tree/21.0.1/src/pip/_vendor
+    # try `pip download pip --no-deps --no-binary :all:` and it will start crashing hard
+    # https://github.com/pypa/pip/issues/1884
+    # e.g. for matplotlib, wheels do not reproduce matplotlib's LICENSE
+    # https://github.com/matplotlib/matplotlib/tree/v3.3.4/LICENSE
+    licenses = extract_licences_from_wheel(wheel_fname)
+
+    # potential fallbacks (already incorrect as it's not found in the bdist_wheel used for installation):
+    # - use sdist instead (additional downloads):
+    #   - scan for sdist on project_urls or [warehouse json api](https://warehouse.readthedocs.io/api-reference/json.html)
+    #   - download, unarchive and run scancode-toolkit
+    # - existing databases:
+    #   - https://libraries.io/pypi (detection method unverified)
+    #   - https://clearlydefined.io/?type=pypi (uses scancode-toolkit)
+    # - machine readable spdx classifiers [ref](https://softwareengineering.stackexchange.com/a/381907/346730)
+    # - other license headers
+
+    wheel_info = OrderedDict(
+        (
+            ("author", wheel_metadata.get("author", "")),
+            ("author_email", wheel_metadata.get("author_email", "")),
+            ("project_urls", project_urls),
+            ("licenses", licenses),
+        )
+    )
+
+    return wheel_info
diff --git a/src/pipgrip/package_source.py b/src/pipgrip/package_source.py
index 6ba0f42..cf3f73d 100755
--- a/src/pipgrip/package_source.py
+++ b/src/pipgrip/package_source.py
@@ -138,7 +138,7 @@ def discover_and_add(self, package):  # type: (str, str) -> None
             deps=to_create["requires"],
         )
 
-        # currently unused
+        # currently only used for licenses
         if req.key not in self._packages_metadata:
             self._packages_metadata[req.key] = {}
         to_create["pip_string"] = req.__str__()