Skip to content

Commit

Permalink
Handle filesystem latency when creating RUN_COMPLETE file (#803)
Browse files Browse the repository at this point in the history
* Catch unhandled TimeoutError and log status

* Increase RUN_COMPLETE timeout to 30 seconds

File system quirks mean that the RUN_COMPLETE file may not yet exist
on the file system after it is created. Since the file is used by
the same process relatively soon after creation, this can occassionally
cause a FileNotFoundError. It was therefore necessary to wait for
the file to be synced to the file system, and to impose a timeout.
The previous timeout value was too short and resulted in TimeoutErrors
relatively frequently. This commit increases the timeout value from
2 seconds to 30 seconds, per the suggestion of Paul Ferrell.

* Add fallback if RUN_COMPLETE is not present

The idiosyncrocies of NFS means that when the RUN_COMPLETE file is
written, it may not appear on the local file system, which results
in a FileNotFoundError when that file is accessed. This commit adds
a fallback when the file does not exist on the local file system, in
the process reverting the previous solution which involved waiting
and eventually raising a timeout.

The fallback consists of the following steps:

1. Create the file as normal, and check whether it exists.

2. If it does not exist, attempt to force an NFS cache reset by listing
the contents of the parent directory.

3. If the file still does not exist, create a symlink to the location
where the file is expected eventually to be.

* Add unit test for missing RUN_COMPLETE file fallback

* Fix failing style tests

* Fix error
  • Loading branch information
hwikle-lanl authored Feb 5, 2025
1 parent 8dad702 commit 00aae1a
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 20 deletions.
69 changes: 49 additions & 20 deletions lib/pavilion/test_run/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from pavilion.test_config.file_format import NO_WORKING_DIR
from pavilion.test_config.utils import parse_timeout
from pavilion.types import ID_Pair
from pavilion.micro import get_nested
from pavilion.micro import get_nested, consume
from pavilion.timing import wait
from .test_attrs import TestAttributes

Expand Down Expand Up @@ -832,24 +832,14 @@ def run(self):

return ret

def set_run_complete(self):
"""Write a file in the test directory that indicates that the test
has completed a run, one way or another. This should only be called
when we're sure their won't be any more status changes."""

if self.complete:
return

if not self.saved:
raise RuntimeError("You must call the .save() method before run {} "
"can be marked complete.".format(self.full_id))
@staticmethod
def _create_complete_file(complete_tmp_path: Path) -> None:
"""Create the RUN_COMPLETE file for the test, ensuring that it is written
to NFS."""

# Write the current time to the file. We don't actually use the contents
# of the file, but it's nice to have another record of when this was
# run.
complete_path = self.path/self.COMPLETE_FN
complete_tmp_path = complete_path.with_suffix('.tmp')

with complete_tmp_path.open('w') as run_complete:
json.dump(
{'complete': time.time()},
Expand All @@ -859,15 +849,54 @@ def set_run_complete(self):
run_complete.flush()
os.fsync(run_complete.fileno())

# Wait for the file to be written to disk before proceeding
wait(complete_tmp_path.exists, interval=0.2, timeout=2,
msg="Temporary complete file was not created.")
def _finalize_complete_file(self, complete_path: Path, complete_tmp_path: Path,
status: TestStatusFile) -> None:
"""Ensure that the temporary RUN_COMPLETE file has been successfully created and that it
exists on the current system. If it exists, rename it to reflect that it is finalized.
If it does not exist, try to force a cache refresh, and if that doesn't work, symlink to
the expected location."""

if complete_tmp_path.exists():
# Finalize the written file
complete_tmp_path.rename(complete_path)
else:
status.set(STATES.INFO,
f"File {self.COMPLETE_FN}.tmp does not yet exist on local file system. "
"Attempting to force NFS cache refresh.")

# Force an NFS cache update
consume(complete_tmp_path.parent.iterdir())

# Finalize the written file
complete_tmp_path.rename(complete_path)
if complete_tmp_path.exists():
complete_tmp_path.rename(complete_path)
else:
status.set(STATES.WARNING,
f"Forced cache refresh failed for{self.COMPLETE_FN}.tmp. Falling "
"back on symlink to temporary file location.")

# Symlink with the expectation that the temp file will be there eventually
complete_path.symlink_to(complete_tmp_path)

self._complete = True

def set_run_complete(self) -> None:
"""Write a file in the test directory that indicates that the test
has completed a run, one way or another. This should only be called
when we're sure their won't be any more status changes."""

if self.complete:
return

if not self.saved:
raise RuntimeError("You must call the .save() method before run {} "
"can be marked complete.".format(self.full_id))

complete_path = self.path/self.COMPLETE_FN
complete_tmp_path = complete_path.with_suffix('.tmp')

self._create_complete_file(complete_tmp_path)
self._finalize_complete_file(complete_path, complete_tmp_path, self.status)

def cancel(self, reason: str):
"""Create the cancellation file for the test, and denote in its status that it was
cancelled, but do nothing beyond that."""
Expand Down
23 changes: 23 additions & 0 deletions test/tests/test_run_tests.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Test the 'TestRun' object'"""

import io
import tempfile as tf

from pavilion.errors import TestRunError
from pavilion.test_run import TestRun
from pavilion.unittest import PavTestCase
from pavilion.variables import VariableSetManager
from pavilion.status_file import TestStatusFile


class TestRunTests(PavTestCase):
Expand Down Expand Up @@ -214,3 +216,24 @@ def test_skip_build(self):
self.assertTrue(test.status.has_state('BUILD_SKIPPED'))
test.finalize(VariableSetManager())
self.assertEqual(test.run(), 0)

def test_complete_file_fallback(self):
"""Test that the the fallback mechanism is correctly executed when the RUN_COMPLETE file
is missing on the local filesystem after it is created."""

# 1. Create a temporary directory, and pass that to the test as its working directory
cfg = self._quick_test_cfg()
test = TestRun(self.pav_cfg, cfg)

# 2. Let the test run create the complete file, then delete it.
complete_path = test.path / test.COMPLETE_FN
tmp_complete_path = complete_path.with_suffix(".tmp")
test._create_complete_file(tmp_complete_path)
tmp_complete_path.unlink()

# 3. Run _validate_complete_file to ensure it behaves correctly
status = TestStatusFile(test.path / test.STATUS_FN)
test._finalize_complete_file(complete_path, tmp_complete_path, status)

self.assertTrue(complete_path.is_symlink())
self.assertTrue(status.has_state("WARNING"))

0 comments on commit 00aae1a

Please sign in to comment.