Handle filesystem latency when creating RUN_COMPLETE file (#803)

* Catch unhandled TimeoutError and log status * Increase RUN_COMPLETE timeout to 30 seconds File system quirks mean that the RUN_COMPLETE file may not yet exist on the file system after it is created. Since the file is used by the same process relatively soon after creation, this can occassionally cause a FileNotFoundError. It was therefore necessary to wait for the file to be synced to the file system, and to impose a timeout. The previous timeout value was too short and resulted in TimeoutErrors relatively frequently. This commit increases the timeout value from 2 seconds to 30 seconds, per the suggestion of Paul Ferrell. * Add fallback if RUN_COMPLETE is not present The idiosyncrocies of NFS means that when the RUN_COMPLETE file is written, it may not appear on the local file system, which results in a FileNotFoundError when that file is accessed. This commit adds a fallback when the file does not exist on the local file system, in the process reverting the previous solution which involved waiting and eventually raising a timeout. The fallback consists of the following steps: 1. Create the file as normal, and check whether it exists. 2. If it does not exist, attempt to force an NFS cache reset by listing the contents of the parent directory. 3. If the file still does not exist, create a symlink to the location where the file is expected eventually to be. * Add unit test for missing RUN_COMPLETE file fallback * Fix failing style tests * Fix error
hpc · Feb 5, 2025 · 00aae1a · 00aae1a
1 parent 8dad702
commit 00aae1a
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 20 deletions.
diff --git a/lib/pavilion/test_run/test_run.py b/lib/pavilion/test_run/test_run.py
@@ -38,7 +38,7 @@
 from pavilion.test_config.file_format import NO_WORKING_DIR
 from pavilion.test_config.utils import parse_timeout
 from pavilion.types import ID_Pair
-from pavilion.micro import get_nested
+from pavilion.micro import get_nested, consume
 from pavilion.timing import wait
 from .test_attrs import TestAttributes
 
@@ -832,24 +832,14 @@ def run(self):
 
         return ret
 
-    def set_run_complete(self):
-        """Write a file in the test directory that indicates that the test
-    has completed a run, one way or another. This should only be called
-    when we're sure their won't be any more status changes."""
-
-        if self.complete:
-            return
-
-        if not self.saved:
-            raise RuntimeError("You must call the .save() method before run {} "
-                               "can be marked complete.".format(self.full_id))
+    @staticmethod
+    def _create_complete_file(complete_tmp_path: Path) -> None:
+        """Create the RUN_COMPLETE file for the test, ensuring that it is written
+        to NFS."""
 
         # Write the current time to the file. We don't actually use the contents
         # of the file, but it's nice to have another record of when this was
         # run.
-        complete_path = self.path/self.COMPLETE_FN
-        complete_tmp_path = complete_path.with_suffix('.tmp')
-
         with complete_tmp_path.open('w') as run_complete:
             json.dump(
                 {'complete': time.time()},
@@ -859,15 +849,54 @@ def set_run_complete(self):
             run_complete.flush()
             os.fsync(run_complete.fileno())
 
-        # Wait for the file to be written to disk before proceeding
-        wait(complete_tmp_path.exists, interval=0.2, timeout=2,
-                msg="Temporary complete file was not created.")
+    def _finalize_complete_file(self, complete_path: Path, complete_tmp_path: Path,
+                                status: TestStatusFile) -> None:
+        """Ensure that the temporary RUN_COMPLETE file has been successfully created and that it
+        exists on the current system. If it exists, rename it to reflect that it is finalized.
+        If it does not exist, try to force a cache refresh, and if that doesn't work, symlink to
+        the expected location."""
+
+        if complete_tmp_path.exists():
+            # Finalize the written file
+            complete_tmp_path.rename(complete_path)
+        else:
+            status.set(STATES.INFO,
+                            f"File {self.COMPLETE_FN}.tmp does not yet exist on local file system. "
+                            "Attempting to force NFS cache refresh.")
+
+            # Force an NFS cache update
+            consume(complete_tmp_path.parent.iterdir())
 
-        # Finalize the written file
-        complete_tmp_path.rename(complete_path)
+            if complete_tmp_path.exists():
+                complete_tmp_path.rename(complete_path)
+            else:
+                status.set(STATES.WARNING,
+                                f"Forced cache refresh failed for{self.COMPLETE_FN}.tmp. Falling "
+                                "back on symlink to temporary file location.")
+
+                # Symlink with the expectation that the temp file will be there eventually
+                complete_path.symlink_to(complete_tmp_path)
 
         self._complete = True
 
+    def set_run_complete(self) -> None:
+        """Write a file in the test directory that indicates that the test
+        has completed a run, one way or another. This should only be called
+        when we're sure their won't be any more status changes."""
+
+        if self.complete:
+            return
+
+        if not self.saved:
+            raise RuntimeError("You must call the .save() method before run {} "
+                               "can be marked complete.".format(self.full_id))
+
+        complete_path = self.path/self.COMPLETE_FN
+        complete_tmp_path = complete_path.with_suffix('.tmp')
+
+        self._create_complete_file(complete_tmp_path)
+        self._finalize_complete_file(complete_path, complete_tmp_path, self.status)
+
     def cancel(self, reason: str):
         """Create the cancellation file for the test, and denote in its status that it was
         cancelled, but do nothing beyond that."""

diff --git a/test/tests/test_run_tests.py b/test/tests/test_run_tests.py
@@ -1,11 +1,13 @@
 """Test the 'TestRun' object'"""
 
 import io
+import tempfile as tf
 
 from pavilion.errors import TestRunError
 from pavilion.test_run import TestRun
 from pavilion.unittest import PavTestCase
 from pavilion.variables import VariableSetManager
+from pavilion.status_file import TestStatusFile
 
 
 class TestRunTests(PavTestCase):
@@ -214,3 +216,24 @@ def test_skip_build(self):
         self.assertTrue(test.status.has_state('BUILD_SKIPPED'))
         test.finalize(VariableSetManager())
         self.assertEqual(test.run(), 0)
+
+    def test_complete_file_fallback(self):
+        """Test that the the fallback mechanism is correctly executed when the RUN_COMPLETE file
+        is missing on the local filesystem after it is created."""
+
+        # 1. Create a temporary directory, and pass that to the test as its working directory
+        cfg = self._quick_test_cfg()
+        test = TestRun(self.pav_cfg, cfg)
+
+        # 2. Let the test run create the complete file, then delete it.
+        complete_path = test.path / test.COMPLETE_FN
+        tmp_complete_path = complete_path.with_suffix(".tmp")
+        test._create_complete_file(tmp_complete_path)
+        tmp_complete_path.unlink()
+
+        # 3. Run _validate_complete_file to ensure it behaves correctly
+        status = TestStatusFile(test.path / test.STATUS_FN)
+        test._finalize_complete_file(complete_path, tmp_complete_path, status)
+
+        self.assertTrue(complete_path.is_symlink())
+        self.assertTrue(status.has_state("WARNING"))