diff --git a/src/bandersnatch/mirror.py b/src/bandersnatch/mirror.py index d4eb83bf1..130898161 100644 --- a/src/bandersnatch/mirror.py +++ b/src/bandersnatch/mirror.py @@ -6,11 +6,11 @@ import os import sys import time -from collections.abc import Awaitable +from collections.abc import Awaitable, Callable from json import dump from pathlib import Path, WindowsPath from threading import RLock -from typing import Any +from typing import Any, TypeVar from urllib.parse import unquote, urlparse from filelock import Timeout @@ -22,7 +22,7 @@ from .master import Master from .package import Package from .simple import SimpleAPI, SimpleFormat, SimpleFormats -from .storage import storage_backend_plugins +from .storage import Storage, storage_backend_plugins LOG_PLUGINS = True logger = logging.getLogger(__name__) @@ -188,7 +188,6 @@ def __init__( digest_name: str | None = None, root_uri: str | None = None, keep_index_versions: int = 0, - diff_file: Path | str | None = None, diff_append_epoch: bool = False, diff_full_path: Path | str | None = None, flock_timeout: int = 1, @@ -230,7 +229,6 @@ def __init__( # PyPI mirror, which requires serving packages from # https://files.pythonhosted.org self.root_uri = root_uri or "" - self.diff_file = diff_file self.diff_append_epoch = diff_append_epoch self.diff_full_path = diff_full_path self.keep_index_versions = keep_index_versions @@ -913,6 +911,44 @@ async def download_file( return path +_T = TypeVar("_T") + + +async def _run_in_storage_executor( + storage_plugin: Storage, func: Callable[..., _T], *args: Any +) -> _T: + return await storage_plugin.loop.run_in_executor( + storage_plugin.executor, func, *args + ) + + +async def _setup_diff_file( + storage_plugin: Storage, configured_path: str, append_epoch: bool +) -> Path: + # use the storage backend to convert an abstract path to a concrete one + concrete_path = storage_plugin.PATH_BACKEND(configured_path) + + # create parent directories if needed + await _run_in_storage_executor( + storage_plugin, lambda: concrete_path.parent.mkdir(exist_ok=True, parents=True) + ) + + # adjust the file/directory name if timestamps are enabled + if append_epoch: + epoch = int(time.time()) + concrete_path = concrete_path.with_name(f"{concrete_path.name}-{epoch}") + + # if the path is directory, adjust to write to a file inside it + if await _run_in_storage_executor(storage_plugin, concrete_path.is_dir): + concrete_path = concrete_path / "mirrored-files" + + # if there is an existing file there then delete it + if await _run_in_storage_executor(storage_plugin, concrete_path.is_file): + concrete_path.unlink() + + return concrete_path + + async def mirror( config: configparser.ConfigParser, specific_packages: list[str] | None = None, @@ -928,28 +964,13 @@ async def mirror( ) ) - diff_file = storage_plugin.PATH_BACKEND(config_values.diff_file_path) - diff_full_path: Path | str - if diff_file: - diff_file.parent.mkdir(exist_ok=True, parents=True) - if config_values.diff_append_epoch: - diff_full_path = diff_file.with_name(f"{diff_file.name}-{int(time.time())}") - else: - diff_full_path = diff_file - else: - diff_full_path = "" - - if diff_full_path: - if isinstance(diff_full_path, str): - diff_full_path = storage_plugin.PATH_BACKEND(diff_full_path) - if await storage_plugin.loop.run_in_executor( - storage_plugin.executor, diff_full_path.is_file - ): - diff_full_path.unlink() - elif await storage_plugin.loop.run_in_executor( - storage_plugin.executor, diff_full_path.is_dir - ): - diff_full_path = diff_full_path / "mirrored-files" + diff_full_path: Path | None = None + if config_values.diff_file_path: + diff_full_path = await _setup_diff_file( + storage_plugin, + config_values.diff_file_path, + config_values.diff_append_epoch, + ) mirror_url = config.get("mirror", "master") timeout = config.getfloat("mirror", "timeout") @@ -975,9 +996,8 @@ async def mirror( keep_index_versions=config.getint( "mirror", "keep_index_versions", fallback=0 ), - diff_file=diff_file, diff_append_epoch=config_values.diff_append_epoch, - diff_full_path=diff_full_path if diff_full_path else None, + diff_full_path=diff_full_path, cleanup=config_values.cleanup, release_files_save=config_values.release_files_save, download_mirror=config_values.download_mirror, @@ -997,14 +1017,13 @@ async def mirror( loggable_changes = [str(chg) for chg in package_changes] logger.debug(f"{package_name} added: {loggable_changes}") - if mirror.diff_full_path: - logger.info(f"Writing diff file to {mirror.diff_full_path}") + if diff_full_path: + logger.info(f"Writing diff file to '{diff_full_path}'") diff_text = f"{os.linesep}".join( [str(chg.absolute()) for chg in mirror.diff_file_list] ) - diff_file = mirror.storage_backend.PATH_BACKEND(mirror.diff_full_path) - await storage_plugin.loop.run_in_executor( - storage_plugin.executor, diff_file.write_text, diff_text + await _run_in_storage_executor( + storage_plugin, diff_full_path.write_text, diff_text ) return 0 diff --git a/src/bandersnatch/tests/test_main.py b/src/bandersnatch/tests/test_main.py index e42eef061..5fc34b340 100644 --- a/src/bandersnatch/tests/test_main.py +++ b/src/bandersnatch/tests/test_main.py @@ -90,7 +90,6 @@ def test_main_reads_config_values(mirror_mock: mock.MagicMock, tmpdir: Path) -> "keep_index_versions": 0, "release_files_save": True, "storage_backend": "filesystem", - "diff_file": diff_file, "diff_append_epoch": False, "diff_full_path": diff_file, "cleanup": False, diff --git a/src/bandersnatch/tests/test_mirror.py b/src/bandersnatch/tests/test_mirror.py index a55c86e8c..8099aa1c1 100644 --- a/src/bandersnatch/tests/test_mirror.py +++ b/src/bandersnatch/tests/test_mirror.py @@ -2,6 +2,7 @@ import sys import unittest.mock as mock from collections.abc import Iterator +from configparser import ConfigParser from os import sep from pathlib import Path from tempfile import TemporaryDirectory @@ -14,6 +15,7 @@ from bandersnatch.configuration import BandersnatchConfig, Singleton from bandersnatch.master import Master from bandersnatch.mirror import BandersnatchMirror +from bandersnatch.mirror import mirror as mirror_cmd from bandersnatch.package import Package from bandersnatch.simple import SimpleFormats from bandersnatch.tests.test_simple_fixtures import SIXTYNINE_METADATA @@ -206,6 +208,70 @@ def test_mirror_removes_old_status_and_todo_inits_generation(tmpdir: Path) -> No assert open(str(tmpdir / "generation")).read().strip() == "5" +# This is a test to check that a diff file is NOT created when 'diff-file' isn't set in +# configuration. Obviously we can't exhaustively assert a negative - the regression +# being checked for here is a diff file being created at a 'default' location +# ($directory/mirrored-files) when diff-file isn't configured. +@pytest.mark.asyncio +async def test_mirror_subcommand_only_creates_diff_file_if_configured( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + # Setup 1 - create a configuration for the 'mirror' subcommand + # (Configuration is passed as a parameter, so we don't need to fiddle with Singleton) + config = ConfigParser() + config.read_dict( + { + "mirror": { + "master": "https://pypi.org", + "directory": (tmp_path / "mirror").as_posix(), + "timeout": 15, + "stop-on-error": False, + "workers": 1, + "hash-index": False, + }, + "plugins": {"enabled": ["allowlist_project"]}, + "allowlist": {"packages": ["black", "ruff", "autopep8"]}, + } + ) + + # Setup 2 - patch 'Mirror.synchronize' with a stub returning fixed data + # Normally it returns 'self.altered_packages', and those are what get written into + # the diff file. Non-empty fixed content here means a diff-file should exist and + # have content IF it is configured. + async def mock_synchronize( + self: Any, + specific_packages: list[str] | None = None, + sync_simple_index: bool = True, + ) -> dict[str, set[str]]: + return { + "ruff": {"fake/file/path/1", "fake/file/path/2"}, + "black": {"fake/file/path/3"}, + } + + monkeypatch.setattr(BandersnatchMirror, "synchronize", mock_synchronize) + + # Execute the 'mirror' subcommand. Because 'synchronize' is patched, this is pretty + # much running the body of the 'mirror' function, including creating Master and + # Mirror instances, but doesn't perform any actual package syncing at all. + await mirror_cmd(config=config) + + # An 'info'-level message is always logged after 'synchronize' completes. It should + # be consistent with the static data returned from our mock method. + expected_message = "2 packages had changes" + assert any( + m == expected_message for m in caplog.messages + ), f"Logged '{expected_message}'" + + # Because BandersnatchMirror's 'bootstrap' runs, this directory should exist; + # this is just sanity checking that some disk IO happened... + web_folder = tmp_path / "mirror" / "web" + assert web_folder.is_dir(), f"The folder '{web_folder}' was created" + + # But there should NOT be a diff file: + absent_diff_file = tmp_path / "mirror" / "mirrored-files" + assert not absent_diff_file.exists(), f"there is no diff file at {absent_diff_file}" + + def test_mirror_with_same_homedir_needs_lock( mirror: BandersnatchMirror, tmpdir: Path ) -> None: