Skip to content

Commit

Permalink
Fix performance regression when searching for REUSE.toml
Browse files Browse the repository at this point in the history
Previously, the glob **/REUSE.toml would search _all_ directories,
including big directories containing build artefacts that are otherwise
ignored by VCS. This commit uses the same logic to find REUSE.toml as
any other file is found. It's not super rapid, but it does a heap more
filtering.

Signed-off-by: Carmen Bianca BAKKER <[email protected]>
  • Loading branch information
carmenbianca committed Jul 8, 2024
1 parent e340e5c commit 60e8f4f
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 14 deletions.
41 changes: 34 additions & 7 deletions src/reuse/global_licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@

from . import ReuseInfo, SourceType
from ._util import _LICENSING, StrPath
from .covered_files import all_files
from .vcs import VCSStrategy

_LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -232,7 +234,7 @@ class GlobalLicensing(ABC):

@classmethod
@abstractmethod
def from_file(cls, path: StrPath) -> "GlobalLicensing":
def from_file(cls, path: StrPath, **kwargs: Any) -> "GlobalLicensing":
"""Parse the file and create a :class:`GlobalLicensing` object from its
contents.
Expand Down Expand Up @@ -262,7 +264,7 @@ class ReuseDep5(GlobalLicensing):
dep5_copyright: Copyright

@classmethod
def from_file(cls, path: StrPath) -> "ReuseDep5":
def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseDep5":
path = Path(path)
try:
with path.open(encoding="utf-8") as fp:
Expand Down Expand Up @@ -438,7 +440,7 @@ def from_toml(cls, toml: str, source: str) -> "ReuseTOML":
return cls.from_dict(tomldict, source)

@classmethod
def from_file(cls, path: StrPath) -> "ReuseTOML":
def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseTOML":
with Path(path).open(encoding="utf-8") as fp:
return cls.from_toml(fp.read(), str(path))

Expand Down Expand Up @@ -484,11 +486,21 @@ class NestedReuseTOML(GlobalLicensing):
reuse_tomls: List[ReuseTOML] = attrs.field()

@classmethod
def from_file(cls, path: StrPath) -> "GlobalLicensing":
def from_file(cls, path: StrPath, **kwargs: Any) -> "GlobalLicensing":
"""TODO: *path* is a directory instead of a file."""
include_submodules: bool = kwargs.get("include_submodules", False)
include_meson_subprojects: bool = kwargs.get(
"include_meson_subprojects", False
)
vcs_strategy: Optional[VCSStrategy] = kwargs.get("vcs_strategy")
tomls = [
ReuseTOML.from_file(toml_path)
for toml_path in cls.find_reuse_tomls(path)
for toml_path in cls.find_reuse_tomls(
path,
include_submodules=include_submodules,
include_meson_subprojects=include_meson_subprojects,
vcs_strategy=vcs_strategy,
)
]
return cls(reuse_tomls=tomls, source=str(path))

Expand Down Expand Up @@ -547,9 +559,24 @@ def reuse_info_of(
return dict(result)

@classmethod
def find_reuse_tomls(cls, path: StrPath) -> Generator[Path, None, None]:
def find_reuse_tomls(
cls,
path: StrPath,
include_submodules: bool = False,
include_meson_subprojects: bool = False,
vcs_strategy: Optional[VCSStrategy] = None,
) -> Generator[Path, None, None]:
"""Find all REUSE.toml files in *path*."""
return Path(path).rglob("**/REUSE.toml")
return (
item
for item in all_files(
path,
include_submodules=include_submodules,
include_meson_subprojects=include_meson_subprojects,
vcs_strategy=vcs_strategy,
)
if item.name == "REUSE.toml"
)

def _find_relevant_tomls(self, path: StrPath) -> List[ReuseTOML]:
found = []
Expand Down
36 changes: 32 additions & 4 deletions src/reuse/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ class GlobalLicensingFound(NamedTuple):
cls: Type[GlobalLicensing]


# TODO: The information (root, include_submodules, include_meson_subprojects,
# vcs_strategy) is passed to SO MANY PLACES. Maybe Project should be simplified
# to contain exclusively those values, or maybe these values should be extracted
# out of Project to simplify passing this information around.
class Project:
"""Simple object that holds the project's root, which is necessary for many
interactions.
Expand Down Expand Up @@ -140,9 +144,19 @@ def from_directory(
vcs_strategy = cls._detect_vcs_strategy(root)

global_licensing: Optional[GlobalLicensing] = None
found = cls.find_global_licensing(root)
found = cls.find_global_licensing(
root,
include_submodules=include_submodules,
include_meson_subprojects=include_meson_subprojects,
vcs_strategy=vcs_strategy,
)
if found:
global_licensing = found.cls.from_file(found.path)
global_licensing = found.cls.from_file(
found.path,
include_submodules=include_submodules,
include_meson_subprojects=include_meson_subprojects,
vcs_strategy=vcs_strategy,
)

project = cls(
root,
Expand Down Expand Up @@ -279,7 +293,11 @@ def relative_from_root(self, path: StrPath) -> Path:

@classmethod
def find_global_licensing(
cls, root: Path
cls,
root: Path,
include_submodules: bool = False,
include_meson_subprojects: bool = False,
vcs_strategy: Optional[VCSStrategy] = None,
) -> Optional[GlobalLicensingFound]:
"""Find the path and corresponding class of a project directory's
:class:`GlobalLicensing`.
Expand All @@ -302,9 +320,18 @@ def find_global_licensing(
PendingDeprecationWarning,
)
candidate = GlobalLicensingFound(dep5_path, ReuseDep5)

# TODO: the performance of this isn't great.
toml_path = None
with contextlib.suppress(StopIteration):
toml_path = next(root.rglob("**/REUSE.toml"))
toml_path = next(
NestedReuseTOML.find_reuse_tomls(
root,
include_submodules=include_submodules,
include_meson_subprojects=include_meson_subprojects,
vcs_strategy=vcs_strategy,
)
)
if toml_path is not None:
if candidate is not None:
raise GlobalLicensingConflict(
Expand All @@ -315,6 +342,7 @@ def find_global_licensing(
).format(new_path=toml_path, old_path=dep5_path)
)
candidate = GlobalLicensingFound(root, NestedReuseTOML)

return candidate

def _identifier_of_license(self, path: Path) -> str:
Expand Down
69 changes: 67 additions & 2 deletions tests/test_global_licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
ReuseDep5,
ReuseTOML,
)
from reuse.vcs import VCSStrategyGit

# REUSE-IgnoreStart

Expand Down Expand Up @@ -699,19 +700,83 @@ def test_one_deep(self, empty_directory):
"""Find a single REUSE.toml deeper in the directory tree."""
(empty_directory / "src").mkdir()
path = empty_directory / "src/REUSE.toml"
path.touch()
path.write_text("version = 1")
result = NestedReuseTOML.find_reuse_tomls(empty_directory)
assert list(result) == [path]

def test_multiple(self, fake_repository_reuse_toml):
"""Find multiple REUSE.tomls."""
(fake_repository_reuse_toml / "src/REUSE.toml").touch()
(fake_repository_reuse_toml / "src/REUSE.toml").write_text(
"version = 1"
)
result = NestedReuseTOML.find_reuse_tomls(fake_repository_reuse_toml)
assert set(result) == {
fake_repository_reuse_toml / "REUSE.toml",
fake_repository_reuse_toml / "src/REUSE.toml",
}

def test_with_vcs_strategy(self, git_repository):
"""Ignore the correct files ignored by the repository."""
(git_repository / "REUSE.toml").write_text("version = 1")
(git_repository / "build/REUSE.toml").write_text("version =1")
(git_repository / "src/REUSE.toml").write_text("version = 1")

result = NestedReuseTOML.find_reuse_tomls(
git_repository, vcs_strategy=VCSStrategyGit(git_repository)
)
assert set(result) == {
git_repository / "REUSE.toml",
git_repository / "src/REUSE.toml",
}

def test_includes_submodule(self, submodule_repository):
"""include_submodules is correctly implemented."""
(submodule_repository / "REUSE.toml").write_text("version = 1")
(submodule_repository / "submodule/REUSE.toml").write_text(
"version = 1"
)

result_without = NestedReuseTOML.find_reuse_tomls(
submodule_repository,
vcs_strategy=VCSStrategyGit(submodule_repository),
)
assert set(result_without) == {submodule_repository / "REUSE.toml"}

result_with = NestedReuseTOML.find_reuse_tomls(
submodule_repository,
include_submodules=True,
vcs_strategy=VCSStrategyGit(submodule_repository),
)
assert set(result_with) == {
submodule_repository / "REUSE.toml",
submodule_repository / "submodule/REUSE.toml",
}

def test_includes_meson_subprojects(self, subproject_repository):
"""include_meson_subprojects is correctly implemented."""
(subproject_repository / "REUSE.toml").write_text("version = 1")
(subproject_repository / "subprojects/REUSE.toml").write_text(
"version = 1"
)
(subproject_repository / "subprojects/libfoo/REUSE.toml").write_text(
"version = 1"
)

result_without = NestedReuseTOML.find_reuse_tomls(subproject_repository)
assert set(result_without) == {
subproject_repository / "REUSE.toml",
subproject_repository / "subprojects/REUSE.toml",
}

result_with = NestedReuseTOML.find_reuse_tomls(
subproject_repository, include_meson_subprojects=True
)
assert set(result_with) == {
subproject_repository / "REUSE.toml",
subproject_repository / "subprojects/REUSE.toml",
subproject_repository / "subprojects/libfoo/REUSE.toml",
}


class TestNestedReuseTOMLReuseInfoOf:
"""Tests for NestedReuseTOML.reuse_info_of."""
Expand Down
2 changes: 1 addition & 1 deletion tests/test_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,7 @@ def test_find_global_licensing_none(empty_directory):

def test_find_global_licensing_conflict(fake_repository_dep5):
"""Expect an error on a conflict"""
(fake_repository_dep5 / "REUSE.toml").touch()
(fake_repository_dep5 / "REUSE.toml").write_text("version = 1")
with pytest.raises(GlobalLicensingConflict):
Project.find_global_licensing(fake_repository_dep5)

Expand Down

0 comments on commit 60e8f4f

Please sign in to comment.