From c64b7f64bf9fc120039c416d230b3e0fae928074 Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Fri, 5 May 2023 11:18:35 +0000 Subject: [PATCH] Added configuration file parameters for repository metadata lock timings --- CHANGELOG.md | 7 +++--- doc/config/configuration_reference.rst | 30 +++++++++++++++++++++++++- etc/sarus.schema.json | 13 +++++++++++ src/common/Lockfile.cpp | 4 ++-- src/common/Lockfile.hpp | 2 +- src/common/test/json/valid.json | 7 +++++- src/common/test/test_JSON.cpp | 18 ++++++++++++++++ src/image_manager/ImageStore.cpp | 26 +++++++++++++++++----- src/image_manager/ImageStore.hpp | 3 ++- 9 files changed, 96 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 730ec24d..ac888b6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,9 +11,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Added the `sarus hooks` command to list the hooks configured for the engine - Added the `--annotation` option to `sarus run` for setting custom annotations in the OCI bundle. More details [here](https://sarus.readthedocs.io/en/stable/user/user_guide.html#setting-oci-annotations) - Added the `--mpi-type` option to `sarus run` for selecting an MPI hook among those configured by the system administrator +- Added a warning message when acquisition of a lock file on the local repository metadata file is taking an unusually long time. + The message is displayed at a [configurable interval](https://sarus.readthedocs.io/en/stable/config/configuration_reference.html#repositorymetadatalocktimings-object-optional) (default 10 seconds), until the lock acquisition timeout is reached. - Added support for the optional `defaultMPIType` parameter in the `sarus.json` configuration file. More details [here](https://sarus.readthedocs.io/en/stable/config/configuration_reference.html#defaultmpitype-string-optional). -- Added a warning message when acquisition of a lockfile on the local repository metadata file is taking an unusually long time. - The message is displayed every 1 second, until the lock acquisition timeout is reached. +- Added support for the optional `repositoryMetadataLockTimings` parameter in the `sarus.json` configuration file. More details [here](https://sarus.readthedocs.io/en/stable/config/configuration_reference.html#repositorymetadatalocktimings-object-optional). - Added a new OCI hook to perform arbitrary sequences of bind mounts and device mounts into containers. The hook is meant to streamline the implementation and usage of advanced features which can be enabled through sets of related mounts. More details [here](https://sarus.readthedocs.io/en/stable/config/mount-hook.html). @@ -22,7 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed -- Sarus will now exit with an error if an operation requiring a lockfile on the local repository metadata cannot acquire a lock within 10 seconds. +- Sarus will now exit with an error if an operation requiring a lock file on the local repository metadata cannot acquire a lock within the [configured timeout duration](https://sarus.readthedocs.io/en/stable/config/configuration_reference.html#repositorymetadatalocktimings-object-optional) (default 60 seconds). Previously, Sarus would keep attempting to acquire a lock indefinitely. - When printing error traces, entries related to standard C++ exceptions now provide clearer information - Updated recommended runc version to 1.1.6 diff --git a/doc/config/configuration_reference.rst b/doc/config/configuration_reference.rst index 5273faf7..1c1f0719 100644 --- a/doc/config/configuration_reference.rst +++ b/doc/config/configuration_reference.rst @@ -402,6 +402,30 @@ This could fit the case of a system where only a single MPI hook is configured. Refer to :ref:`this section ` for more details about the interaction between command line options, annotations, and hook selection. +.. _config-reference-repositoryMetadataLockTimings: + +repositoryMetadataLockTimings (object, OPTIONAL) +------------------------------------------------ +To prevent race conditions when accessing metadata files of local image +repositories, Sarus implements a mechanism based on a lock file. + +The ``repositoryMetadataLockTimings`` JSON object allows to control the +behavior of Sarus while it is waiting to acquire the lock on the metadata file. +This object can have the following optional fields: + +* ``timeoutMs`` (integer): Maximum amount of time (in milliseconds) for which + Sarus attempts to acquire the lock. If Sarus is still waiting after this time + has elapsed, the program times out and exits with an error. + The value must be a positive integer. If the parameter is not defined, + defaults to 60000 ms (60 seconds). +* ``warningMs``: While waiting to acquire the lock, Sarus prints regular + warning messages to inform of its status (i.e. access to the metadata file + is taking longer but the program is not stalled for another reason) until + the timeout is reached or acquisition of the lock is successful. + This parameter determines the time interval (in milliseconds) between such + warning messages. The value must be a positive integer. If the parameter is not + defined, defaults to 10000 ms (10 seconds). + .. _config-reference-PMIxv3: enablePMIxv3Support (bool, OPTIONAL) (experimental) @@ -482,5 +506,9 @@ Example configuration file "enforce": false }, "containersRegistries.dPath": "/opt/sarus/1.5.2/etc/registries.d" - "defaultMPIType": "mpich" + "defaultMPIType": "mpich", + "repositoryMetadataLockTimings": { + "timeoutMs": 120000, + "warningMs": 15000 + } } diff --git a/etc/sarus.schema.json b/etc/sarus.schema.json index bbeb13dd..1df0ef95 100644 --- a/etc/sarus.schema.json +++ b/etc/sarus.schema.json @@ -122,6 +122,19 @@ }, "enablePMIxv3Support": { "type": "boolean" + }, + "repositoryMetadataLockTimings": { + "type": "object", + "properties": { + "timeoutMs": { + "type": "integer", + "minimum": 1 + }, + "warningMs": { + "type": "integer", + "minimum": 1 + } + } } }, "required": [ diff --git a/src/common/Lockfile.cpp b/src/common/Lockfile.cpp index c5d9febc..056dfc5b 100644 --- a/src/common/Lockfile.cpp +++ b/src/common/Lockfile.cpp @@ -29,7 +29,7 @@ Lockfile::Lockfile() : logger(&common::Logger::getInstance()) {} -Lockfile::Lockfile(const boost::filesystem::path& file, unsigned int timeoutMs) +Lockfile::Lockfile(const boost::filesystem::path& file, unsigned int timeoutMs, unsigned int warningMs) : logger(&common::Logger::getInstance()) , lockfile{convertToLockfile(file)} { @@ -45,7 +45,7 @@ Lockfile::Lockfile(const boost::filesystem::path& file, unsigned int timeoutMs) const int backoffTimeMs = 100; std::this_thread::sleep_for(std::chrono::milliseconds(backoffTimeMs)); elapsedTimeMs += backoffTimeMs; - if(elapsedTimeMs % 1000 == 0) { + if(elapsedTimeMs % warningMs == 0) { message = boost::format("Still attempting to acquire lock on file %s after %d ms (will timeout after %d milliseconds)...") % *lockfile % elapsedTimeMs % timeoutMs; logger->log(message.str(), loggerSubsystemName, common::LogLevel::WARN); diff --git a/src/common/Lockfile.hpp b/src/common/Lockfile.hpp index b89fc335..cc90987f 100644 --- a/src/common/Lockfile.hpp +++ b/src/common/Lockfile.hpp @@ -37,7 +37,7 @@ class Lockfile { public: Lockfile(); - Lockfile(const boost::filesystem::path& file, unsigned int timeoutMs=noTimeout); + Lockfile(const boost::filesystem::path& file, unsigned int timeoutMs=noTimeout, unsigned int warningMs=1000); Lockfile(const Lockfile&) = delete; Lockfile(Lockfile&&); ~Lockfile(); diff --git a/src/common/test/json/valid.json b/src/common/test/json/valid.json index eaca964f..ee19854a 100644 --- a/src/common/test/json/valid.json +++ b/src/common/test/json/valid.json @@ -52,5 +52,10 @@ "path": "/opt/sarus/etc/policy.json", "enforce": false }, - "containersRegistries.dPath": "/opt/sarus/etc/registries.d" + "containersRegistries.dPath": "/opt/sarus/etc/registries.d", + "defaultMPIType": "mpich", + "repositoryMetadataLockTimings": { + "timeoutMs": 120000, + "warningMs": 15000 + } } diff --git a/src/common/test/test_JSON.cpp b/src/common/test/test_JSON.cpp index 7f4fca19..80d0c023 100644 --- a/src/common/test/test_JSON.cpp +++ b/src/common/test/test_JSON.cpp @@ -70,6 +70,18 @@ TEST(JSONTestGroup, validFile) { CHECK_EQUAL(config->json["seccompProfile"].GetString(), std::string("/opt/sarus/etc/seccomp/default.json")); CHECK_EQUAL(config->json["apparmorProfile"].GetString(), std::string("sarus-default")); CHECK_EQUAL(config->json["selinuxLabel"].GetString(), std::string("system_u:system_r:svirt_sarus_t:s0:c124,c675")); + CHECK_EQUAL(config->json["selinuxMountLabel"].GetString(), std::string("system_u:object_r:svirt_sarus_file_t:s0:c715,c811")); + + const rapidjson::Value& containersPolicy = config->json["containersPolicy"]; + CHECK_EQUAL(containersPolicy["path"].GetString(), std::string("/opt/sarus/etc/policy.json")); + CHECK_EQUAL(containersPolicy["enforce"].GetBool(), false); + + CHECK_EQUAL(config->json["containersRegistries.dPath"].GetString(), std::string("/opt/sarus/etc/registries.d")); + CHECK_EQUAL(config->json["defaultMPIType"].GetString(), std::string("mpich")); + + const rapidjson::Value& lockTimings = config->json["repositoryMetadataLockTimings"]; + CHECK_EQUAL(lockTimings["timeoutMs"].GetInt(), 120000); + CHECK_EQUAL(lockTimings["warningMs"].GetInt(), 15000); } TEST(JSONTestGroup, minimumRequirementsFile) { @@ -96,4 +108,10 @@ TEST(JSONTestGroup, siteMountWithoutType) { CHECK_THROWS(sarus::common::Error, sarus::common::Config(jsonFile, jsonSchemaFile)); } +TEST(JSONTestGroup, invalidLockTiming) { + boost::filesystem::path jsonFile(testSourceDir / "json/invlid_lock_timing.json"); + boost::filesystem::path jsonSchemaFile(projectRootDir / "etc/sarus.schema.json"); + CHECK_THROWS(sarus::common::Error, sarus::common::Config(jsonFile, jsonSchemaFile)); +} + SARUS_UNITTEST_MAIN_FUNCTION(); diff --git a/src/image_manager/ImageStore.cpp b/src/image_manager/ImageStore.cpp index 6796e8d8..e730135c 100644 --- a/src/image_manager/ImageStore.cpp +++ b/src/image_manager/ImageStore.cpp @@ -35,7 +35,23 @@ namespace image_manager { ImageStore::ImageStore(std::shared_ptr config) : imagesDirectory{config->directories.images} , metadataFile{config->directories.repository / "metadata.json"} - {} + { + auto rjPtr = rj::Pointer("/repositoryMetadataLockTimings/timeoutMs"); + if (const rj::Value* configLockTimeoutMs = rjPtr.Get(config->json)) { + lockTimeoutMs = configLockTimeoutMs->GetInt(); + } + else { + lockTimeoutMs = 60000; + } + + rjPtr = rj::Pointer("/repositoryMetadataLockTimings/warningMs"); + if (const rj::Value* configLockWarningMs = rjPtr.Get(config->json)) { + lockWarningMs = configLockWarningMs->GetInt(); + } + else { + lockWarningMs = 10000; + } + } /** * Add the container image into repository (or update existing object) @@ -45,7 +61,7 @@ namespace image_manager { common::LogLevel::INFO); try { - common::Lockfile lock{metadataFile, lockTimeoutMs}; + common::Lockfile lock{metadataFile, lockTimeoutMs, lockWarningMs}; auto metadata = readRepositoryMetadata(); // remove previous entries with the same image reference (if any) @@ -82,7 +98,7 @@ namespace image_manager { common::LogLevel::INFO); try { - common::Lockfile lock{metadataFile, lockTimeoutMs}; + common::Lockfile lock{metadataFile, lockTimeoutMs, lockWarningMs}; auto repositoryMetadata = readRepositoryMetadata(); auto imageMetadata = findImageMetadata(imageReference, repositoryMetadata); @@ -114,7 +130,7 @@ namespace image_manager { auto images = std::vector{}; try { - common::Lockfile lock{metadataFile, lockTimeoutMs}; + common::Lockfile lock{metadataFile, lockTimeoutMs, lockWarningMs}; auto repositoryMetadata = readRepositoryMetadata(); for (const auto& imageMetadata : repositoryMetadata["images"].GetArray()) { // If backing files are present, all image data is available: add the image to list to be visualized. @@ -143,7 +159,7 @@ namespace image_manager { boost::optional image; try { - common::Lockfile lock{metadataFile, lockTimeoutMs}; + common::Lockfile lock{metadataFile, lockTimeoutMs, lockWarningMs}; auto repositoryMetadata = readRepositoryMetadata(); auto imageMetadata = findImageMetadata(reference, repositoryMetadata); if (imageMetadata) { diff --git a/src/image_manager/ImageStore.hpp b/src/image_manager/ImageStore.hpp index fa2e62e3..b78621cd 100644 --- a/src/image_manager/ImageStore.hpp +++ b/src/image_manager/ImageStore.hpp @@ -58,7 +58,8 @@ class ImageStore { const std::string sysname = "ImageStore"; // system name for logger boost::filesystem::path imagesDirectory; boost::filesystem::path metadataFile; - unsigned int lockTimeoutMs = 10000; + unsigned int lockWarningMs; + unsigned int lockTimeoutMs; }; } // namespace