
Commit b05bb50

[KED-2897] Fix pipeline pull with unparseable requirements (kedro-org#1280)

Signed-off-by: Laurens Vijnck <[email protected]>
antonymilne authored and lvijnck committed Apr 7, 2022
1 parent 811728b commit b05bb50
Showing 3 changed files with 123 additions and 43 deletions.
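The bug in a nutshell: `pkg_resources.parse_requirements` raises a `ValueError` as soon as it reaches a line it cannot parse, so a single `-r` or `--extra-index-url` entry in the project requirements broke `kedro pipeline pull`. A minimal repro sketch (the requirement strings are illustrative, not taken from the commit):

    import pkg_resources

    # parse_requirements is a generator: it yields the Requirement for
    # `pandas>=1.3`, then raises RequirementParseError (a ValueError
    # subclass) on the `-r` line, so consuming it blows up partway through.
    list(pkg_resources.parse_requirements("pandas>=1.3\n-r other_requirements.txt"))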
1 change: 1 addition & 0 deletions RELEASE.md
@@ -19,6 +19,7 @@
* Upgraded `pip-tools`, which is used by `kedro build-reqs`, to 6.4. This `pip-tools` version requires `pip>=21.2` while [adding support for `pip>=21.3`](https://github.com/jazzband/pip-tools/pull/1501). To upgrade `pip`, please refer to [their documentation](https://pip.pypa.io/en/stable/installing/#upgrading-pip).
* Relaxed the bounds on the `plotly` requirement for `plotly.PlotlyDataSet`.
* `kedro pipeline package <pipeline>` now raises an error if the `<pipeline>` argument doesn't look like a valid Python module path (e.g. has `/` instead of `.`).
+* `kedro pipeline pull` now works when the project requirements contain entries such as `-r`, `--extra-index-url` and local wheel files ([Issue #913](https://github.com/quantumblacklabs/kedro/issues/913)).
* Fixed slow startup because of catalog processing by reducing the exponential growth of extra processing during `_FrozenDatasets` creations.
* Removed `.coveragerc` from the Kedro project template. `coverage` settings are now given in `pyproject.toml`.
* Fixed a bug where packaging or pulling a modular pipeline with the same name as the project's package name would throw an error (or silently pass without including the pipeline source code in the wheel file).
32 changes: 25 additions & 7 deletions kedro/framework/cli/pipeline.py
@@ -9,7 +9,7 @@
from importlib import import_module
from pathlib import Path
from textwrap import indent
-from typing import Any, List, NamedTuple, Optional, Tuple, Union
+from typing import Any, Iterable, List, NamedTuple, Optional, Set, Tuple, Union
from zipfile import ZipFile

import click
@@ -320,9 +320,6 @@ def _pull_package(
    package_name = dist_info_file[0].stem.split("-")[0]
    package_metadata = dist_info_file[0] / "METADATA"

-    _clean_pycache(temp_dir_path)
-    _install_files(metadata, package_name, temp_dir_path, env, alias)

    req_pattern = r"Requires-Dist: (.*?)\n"
    package_reqs = re.findall(req_pattern, package_metadata.read_text())
    if package_reqs:
@@ -331,6 +328,9 @@
        )
        _append_package_reqs(requirements_in, package_reqs, package_name)

+    _clean_pycache(temp_dir_path)
+    _install_files(metadata, package_name, temp_dir_path, env, alias)


def _pull_packages_from_manifest(metadata: ProjectMetadata) -> None:
    # pylint: disable=import-outside-toplevel
@@ -1103,9 +1103,9 @@ def _append_package_reqs(
    requirements_in: Path, package_reqs: List[str], pipeline_name: str
) -> None:
    """Appends modular pipeline requirements to project level requirements.in"""
-    existing_reqs = pkg_resources.parse_requirements(requirements_in.read_text())
-    new_reqs = pkg_resources.parse_requirements(package_reqs)
-    reqs_to_add = set(new_reqs) - set(existing_reqs)
+    existing_reqs = _safe_parse_requirements(requirements_in.read_text())
+    incoming_reqs = _safe_parse_requirements(package_reqs)
+    reqs_to_add = set(incoming_reqs) - set(existing_reqs)
    if not reqs_to_add:
        return

@@ -1124,3 +1124,21 @@
"Use `kedro install --build-reqs` to compile and install the updated list of "
"requirements."
)


+def _safe_parse_requirements(
+    requirements: Union[str, Iterable[str]]
+) -> Set[pkg_resources.Requirement]:
+    """Safely parse a requirement or set of requirements. This effectively replaces
+    pkg_resources.parse_requirements, which blows up with a ValueError as soon as it
+    encounters a requirement it cannot parse (e.g. `-r requirements.txt`). This way
+    we can still extract all the parseable requirements out of a set containing some
+    unparseable requirements.
+    """
+    parseable_requirements = set()
+    for requirement in pkg_resources.yield_lines(requirements):
+        try:
+            parseable_requirements.add(pkg_resources.Requirement.parse(requirement))
+        except ValueError:
+            continue
+    return parseable_requirements
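
For illustration, a quick sketch of how the new helper behaves on mixed input (the requirement strings are hypothetical; `_safe_parse_requirements` is the function added above, importable from the kedro source tree):

    from kedro.framework.cli.pipeline import _safe_parse_requirements

    mixed = "pandas>=1.3\n-r other_requirements.txt\n./path/to/package.whl\nscikit-learn~=1.0"

    # Unparseable entries are silently skipped instead of aborting the parse:
    reqs = _safe_parse_requirements(mixed)
    assert {str(r) for r in reqs} == {"pandas>=1.3", "scikit-learn~=1.0"}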
133 changes: 97 additions & 36 deletions tests/framework/cli/pipeline/test_pipeline_requirements.py
@@ -1,15 +1,14 @@
-import pkg_resources
import pytest
from click.testing import CliRunner

-from kedro.framework.cli.pipeline import _get_wheel_name
+from kedro.framework.cli.pipeline import _get_wheel_name, _safe_parse_requirements

PIPELINE_NAME = "my_pipeline"

# Inspired by test cases given in https://www.python.org/dev/peps/pep-0508/.
# These are all valid requirement specifications that can be used in both
# requirements.txt and in METADATA Requires-Dist.
-VALID_REQUIREMENTS = """A
+SIMPLE_REQUIREMENTS = """A
A.B-C_D
aa
name
@@ -27,6 +26,14 @@
pip @ https://github.com/pypa/pip/archive/1.3.1.zip#sha1=da9234ees
"""

+# These requirements can be used in requirements.txt but not in METADATA Requires-Dist.
+# They cannot be parsed by pkg_resources.
+COMPLEX_REQUIREMENTS = """--extra-index-url https://this.wont.work
+-r other_requirements.txt
+./path/to/package.whl
+http://some.website.com/package.whl
+"""


@pytest.mark.usefixtures("chdir_to_dummy_project", "cleanup_dist")
class TestPipelineRequirements:
@@ -73,30 +80,94 @@ def call_pipeline_pull(self, cli, metadata, repo_path):
        )
        assert result.exit_code == 0

+    def test_existing_complex_project_requirements_txt(
+        self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path
+    ):
+        """Pipeline requirements.txt and project requirements.txt, but no project
+        requirements.in."""
+        project_requirements_txt = fake_repo_path / "src" / "requirements.txt"
+        with open(project_requirements_txt, "a", encoding="utf-8") as file:
+            file.write(COMPLEX_REQUIREMENTS)
+        existing_requirements = _safe_parse_requirements(
+            project_requirements_txt.read_text()
+        )
+
+        self.call_pipeline_create(fake_project_cli, fake_metadata)
+        pipeline_requirements_txt = (
+            fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt"
+        )
+        pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS)
+
+        self.call_pipeline_package(fake_project_cli, fake_metadata)
+        self.call_pipeline_delete(fake_project_cli, fake_metadata)
+        self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path)
+
+        packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS)
+        project_requirements_in = fake_repo_path / "src" / "requirements.in"
+        pulled_requirements = _safe_parse_requirements(
+            project_requirements_in.read_text()
+        )
+        # requirements.in afterwards should be the requirements that already existed in
+        # project requirements.txt + those pulled in from pipeline requirements.txt.
+        # Unparseable COMPLEX_REQUIREMENTS should still be there.
+        assert pulled_requirements == existing_requirements | packaged_requirements
+        assert COMPLEX_REQUIREMENTS in project_requirements_in.read_text()

    def test_existing_project_requirements_txt(
        self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path
    ):
        """Pipeline requirements.txt and project requirements.txt, but no project
        requirements.in."""
+        project_requirements_txt = fake_repo_path / "src" / "requirements.txt"
+        existing_requirements = _safe_parse_requirements(
+            project_requirements_txt.read_text()
+        )
+
        self.call_pipeline_create(fake_project_cli, fake_metadata)
        pipeline_requirements_txt = (
            fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt"
        )
-        pipeline_requirements_txt.write_text(VALID_REQUIREMENTS)
+        pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS)

        self.call_pipeline_package(fake_project_cli, fake_metadata)
        self.call_pipeline_delete(fake_project_cli, fake_metadata)
        self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path)

-        packaged_requirements = pkg_resources.parse_requirements(VALID_REQUIREMENTS)
+        packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS)
        project_requirements_in = fake_repo_path / "src" / "requirements.in"
-        pulled_requirements = pkg_resources.parse_requirements(
+        pulled_requirements = _safe_parse_requirements(
            project_requirements_in.read_text()
        )
-        # Packaged requirements expected to be a subset of pulled requirements due to
-        # default project level requirements.txt (e.g. black, flake8), which should be
-        # preserved
-        assert set(packaged_requirements) <= set(pulled_requirements)
+        # requirements.in afterwards should be the requirements that already existed in
+        # project requirements.txt + those pulled in from pipeline requirements.txt.
+        assert pulled_requirements == existing_requirements | packaged_requirements

+    def test_existing_complex_project_requirements_in(
+        self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path
+    ):
+        """Pipeline requirements.txt and a pre-existing project requirements.in."""
+        project_requirements_in = fake_repo_path / "src" / "requirements.in"
+        project_requirements_in.write_text(COMPLEX_REQUIREMENTS)
+        self.call_pipeline_create(fake_project_cli, fake_metadata)
+        pipeline_requirements_txt = (
+            fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt"
+        )
+        pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS)
+
+        self.call_pipeline_package(fake_project_cli, fake_metadata)
+        self.call_pipeline_delete(fake_project_cli, fake_metadata)
+        self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path)
+
+        packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS)
+        existing_requirements = _safe_parse_requirements(COMPLEX_REQUIREMENTS)
+        pulled_requirements = _safe_parse_requirements(
+            project_requirements_in.read_text()
+        )
+        # requirements.in afterwards should be the requirements that already existed in
+        # project requirements.txt + those pulled in from pipeline requirements.txt.
+        # Unparseable COMPLEX_REQUIREMENTS should still be there.
+        assert pulled_requirements == existing_requirements | packaged_requirements
+        assert COMPLEX_REQUIREMENTS in project_requirements_in.read_text()

    def test_existing_project_requirements_in(
        self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path
@@ -109,22 +180,20 @@ def test_existing_project_requirements_in(
        pipeline_requirements_txt = (
            fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt"
        )
-        pipeline_requirements_txt.write_text(VALID_REQUIREMENTS)
+        pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS)

        self.call_pipeline_package(fake_project_cli, fake_metadata)
        self.call_pipeline_delete(fake_project_cli, fake_metadata)
        self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path)

-        packaged_requirements = pkg_resources.parse_requirements(VALID_REQUIREMENTS)
-        existing_requirements = pkg_resources.parse_requirements(initial_dependency)
-        pulled_requirements = pkg_resources.parse_requirements(
+        packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS)
+        existing_requirements = _safe_parse_requirements(initial_dependency)
+        pulled_requirements = _safe_parse_requirements(
            project_requirements_in.read_text()
        )
-        # Requirements after pulling a pipeline expected to be the union of
-        # requirements packaged and requirements already existing at project level
-        assert set(pulled_requirements) == set(packaged_requirements) | set(
-            existing_requirements
-        )
+        # requirements.in afterwards should be the requirements that already existed in
+        # project requirements.txt + those pulled in from pipeline requirements.txt.
+        assert pulled_requirements == existing_requirements | packaged_requirements

    def test_missing_project_requirements_in_and_txt(
        self,
@@ -144,8 +213,8 @@ def test_missing_project_requirements_in_and_txt(
            fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt"
        )

-        pipeline_requirements_txt.write_text(VALID_REQUIREMENTS)
-        packaged_requirements = pkg_resources.parse_requirements(VALID_REQUIREMENTS)
+        pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS)
+        packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS)

        self.call_pipeline_package(fake_project_cli, fake_metadata)
        self.call_pipeline_delete(fake_project_cli, fake_metadata)
@@ -155,10 +224,10 @@

        assert not project_requirements_txt.exists()
        assert project_requirements_in.exists()
-        pulled_requirements = pkg_resources.parse_requirements(
+        pulled_requirements = _safe_parse_requirements(
            project_requirements_in.read_text()
        )
-        assert set(packaged_requirements) == set(pulled_requirements)
+        assert packaged_requirements == pulled_requirements

    def test_no_requirements(
        self,
@@ -192,8 +261,8 @@ def test_all_requirements_already_covered(
            fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt"
        )
        project_requirements_txt = fake_repo_path / "src" / "requirements.txt"
-        pipeline_requirements_txt.write_text(VALID_REQUIREMENTS)
-        project_requirements_txt.write_text(VALID_REQUIREMENTS)
+        pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS)
+        project_requirements_txt.write_text(SIMPLE_REQUIREMENTS)

        self.call_pipeline_package(fake_project_cli, fake_metadata)
        self.call_pipeline_delete(fake_project_cli, fake_metadata)
@@ -203,7 +272,7 @@
        # addition
        project_requirements_in = fake_repo_path / "src" / "requirements.in"
        assert project_requirements_in.exists()
-        assert project_requirements_in.read_text() == VALID_REQUIREMENTS
+        assert project_requirements_in.read_text() == SIMPLE_REQUIREMENTS

    def test_no_pipeline_requirements_txt(
        self, fake_project_cli, fake_metadata, fake_repo_path
@@ -235,16 +304,8 @@ def test_empty_pipeline_requirements_txt(
        project_requirements_in = fake_repo_path / "src" / "requirements.in"
        assert not project_requirements_in.exists()

-    @pytest.mark.parametrize(
-        "requirement",
-        [
-            "--extra-index-url https://this.wont.work",
-            "-r other_requirements.txt",
-            "./path/to/package.whl",
-            "http://some.website.com/package.whl",
-        ],
-    )
-    def test_invalid_requirements(
+    @pytest.mark.parametrize("requirement", COMPLEX_REQUIREMENTS.splitlines())
+    def test_complex_requirements(
        self, requirement, fake_project_cli, fake_metadata, fake_package_path
    ):
        """Options that are valid in requirements.txt but cannot be packaged using
