hpc · hwikle-lanl · Feb 5, 2025 · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/lib/pavilion/test_run/test_run.py b/lib/pavilion/test_run/test_run.py
@@ -38,7 +38,7 @@
 from pavilion.test_config.file_format import NO_WORKING_DIR
 from pavilion.test_config.utils import parse_timeout
 from pavilion.types import ID_Pair
-from pavilion.micro import get_nested
+from pavilion.micro import get_nested, consume
 from pavilion.timing import wait
 from .test_attrs import TestAttributes
 
@@ -832,24 +832,14 @@ def run(self):
 
         return ret
 
-    def set_run_complete(self):
-        """Write a file in the test directory that indicates that the test
-    has completed a run, one way or another. This should only be called
-    when we're sure their won't be any more status changes."""
-
-        if self.complete:
-            return
-
-        if not self.saved:
-            raise RuntimeError("You must call the .save() method before run {} "
-                               "can be marked complete.".format(self.full_id))
+    @staticmethod
+    def _create_complete_file(complete_tmp_path: Path) -> None:
+        """Create the RUN_COMPLETE file for the test, ensuring that it is written
+        to NFS."""
 
         # Write the current time to the file. We don't actually use the contents
         # of the file, but it's nice to have another record of when this was
         # run.
-        complete_path = self.path/self.COMPLETE_FN
-        complete_tmp_path = complete_path.with_suffix('.tmp')
-
         with complete_tmp_path.open('w') as run_complete:
             json.dump(
                 {'complete': time.time()},
@@ -859,15 +849,54 @@ def set_run_complete(self):
             run_complete.flush()
             os.fsync(run_complete.fileno())
 
-        # Wait for the file to be written to disk before proceeding
-        wait(complete_tmp_path.exists, interval=0.2, timeout=2,
-                msg="Temporary complete file was not created.")
+    def _finalize_complete_file(self, complete_path: Path, complete_tmp_path: Path,
+                                status: TestStatusFile) -> None:
+        """Ensure that the temporary RUN_COMPLETE file has been successfully created and that it
+        exists on the current system. If it exists, rename it to reflect that it is finalized.
+        If it does not exist, try to force a cache refresh, and if that doesn't work, symlink to
+        the expected location."""
+
+        if complete_tmp_path.exists():
+            # Finalize the written file
+            complete_tmp_path.rename(complete_path)
+        else:
+            status.set(STATES.INFO,
+                            f"File {self.COMPLETE_FN}.tmp does not yet exist on local file system. "
+                            "Attempting to force NFS cache refresh.")
+
+            # Force an NFS cache update
+            consume(complete_tmp_path.parent.iterdir())
 
-        # Finalize the written file
-        complete_tmp_path.rename(complete_path)
+            if complete_tmp_path.exists():
+                complete_tmp_path.rename(complete_path)
+            else:
+                status.set(STATES.WARNING,
+                                f"Forced cache refresh failed for{self.COMPLETE_FN}.tmp. Falling "
+                                "back on symlink to temporary file location.")
+
+                # Symlink with the expectation that the temp file will be there eventually
+                complete_path.symlink_to(complete_tmp_path)
 
         self._complete = True
 
+    def set_run_complete(self) -> None:
+        """Write a file in the test directory that indicates that the test
+        has completed a run, one way or another. This should only be called
+        when we're sure their won't be any more status changes."""
+
+        if self.complete:
+            return
+
+        if not self.saved:
+            raise RuntimeError("You must call the .save() method before run {} "
+                               "can be marked complete.".format(self.full_id))
+
+        complete_path = self.path/self.COMPLETE_FN
+        complete_tmp_path = complete_path.with_suffix('.tmp')
+
+        self._create_complete_file(complete_tmp_path)
+        self._finalize_complete_file(complete_path, complete_tmp_path, self.status)
+
     def cancel(self, reason: str):
         """Create the cancellation file for the test, and denote in its status that it was
         cancelled, but do nothing beyond that."""

diff --git a/test/tests/test_run_tests.py b/test/tests/test_run_tests.py
@@ -1,11 +1,13 @@
 """Test the 'TestRun' object'"""
 
 import io
+import tempfile as tf
 
 from pavilion.errors import TestRunError
 from pavilion.test_run import TestRun
 from pavilion.unittest import PavTestCase
 from pavilion.variables import VariableSetManager
+from pavilion.status_file import TestStatusFile
 
 
 class TestRunTests(PavTestCase):
@@ -214,3 +216,24 @@ def test_skip_build(self):
         self.assertTrue(test.status.has_state('BUILD_SKIPPED'))
         test.finalize(VariableSetManager())
         self.assertEqual(test.run(), 0)
+
+    def test_complete_file_fallback(self):
+        """Test that the the fallback mechanism is correctly executed when the RUN_COMPLETE file
+        is missing on the local filesystem after it is created."""
+
+        # 1. Create a temporary directory, and pass that to the test as its working directory
+        cfg = self._quick_test_cfg()
+        test = TestRun(self.pav_cfg, cfg)
+
+        # 2. Let the test run create the complete file, then delete it.
+        complete_path = test.path / test.COMPLETE_FN
+        tmp_complete_path = complete_path.with_suffix(".tmp")
+        test._create_complete_file(tmp_complete_path)
+        tmp_complete_path.unlink()
+
+        # 3. Run _validate_complete_file to ensure it behaves correctly
+        status = TestStatusFile(test.path / test.STATUS_FN)
+        test._finalize_complete_file(complete_path, tmp_complete_path, status)
+
+        self.assertTrue(complete_path.is_symlink())
+        self.assertTrue(status.has_state("WARNING"))