Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle filesystem latency when creating RUN_COMPLETE file #803

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 49 additions & 20 deletions lib/pavilion/test_run/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from pavilion.test_config.file_format import NO_WORKING_DIR
from pavilion.test_config.utils import parse_timeout
from pavilion.types import ID_Pair
from pavilion.micro import get_nested
from pavilion.micro import get_nested, consume
from pavilion.timing import wait
from .test_attrs import TestAttributes

Expand Down Expand Up @@ -832,24 +832,14 @@ def run(self):

return ret

def set_run_complete(self):
"""Write a file in the test directory that indicates that the test
has completed a run, one way or another. This should only be called
when we're sure their won't be any more status changes."""

if self.complete:
return

if not self.saved:
raise RuntimeError("You must call the .save() method before run {} "
"can be marked complete.".format(self.full_id))
@staticmethod
def _create_complete_file(complete_tmp_path: Path) -> None:
"""Create the RUN_COMPLETE file for the test, ensuring that it is written
to NFS."""

# Write the current time to the file. We don't actually use the contents
# of the file, but it's nice to have another record of when this was
# run.
complete_path = self.path/self.COMPLETE_FN
complete_tmp_path = complete_path.with_suffix('.tmp')

with complete_tmp_path.open('w') as run_complete:
json.dump(
{'complete': time.time()},
Expand All @@ -859,15 +849,54 @@ def set_run_complete(self):
run_complete.flush()
os.fsync(run_complete.fileno())

# Wait for the file to be written to disk before proceeding
wait(complete_tmp_path.exists, interval=0.2, timeout=2,
msg="Temporary complete file was not created.")
def _finalize_complete_file(self, complete_path: Path, complete_tmp_path: Path,
status: TestStatusFile) -> None:
"""Ensure that the temporary RUN_COMPLETE file has been successfully created and that it
exists on the current system. If it exists, rename it to reflect that it is finalized.
If it does not exist, try to force a cache refresh, and if that doesn't work, symlink to
the expected location."""

if complete_tmp_path.exists():
# Finalize the written file
complete_tmp_path.rename(complete_path)
else:
status.set(STATES.INFO,
f"File {self.COMPLETE_FN}.tmp does not yet exist on local file system. "
"Attempting to force NFS cache refresh.")

# Force an NFS cache update
consume(complete_tmp_path.parent.iterdir())

# Finalize the written file
complete_tmp_path.rename(complete_path)
if complete_tmp_path.exists():
complete_tmp_path.rename(complete_path)
else:
status.set(STATES.WARNING,
f"Forced cache refresh failed for{self.COMPLETE_FN}.tmp. Falling "
"back on symlink to temporary file location.")

# Symlink with the expectation that the temp file will be there eventually
complete_path.symlink_to(complete_tmp_path)

self._complete = True

def set_run_complete(self) -> None:
"""Write a file in the test directory that indicates that the test
has completed a run, one way or another. This should only be called
when we're sure their won't be any more status changes."""

if self.complete:
return

if not self.saved:
raise RuntimeError("You must call the .save() method before run {} "
"can be marked complete.".format(self.full_id))

complete_path = self.path/self.COMPLETE_FN
complete_tmp_path = complete_path.with_suffix('.tmp')

self._create_complete_file(complete_tmp_path)
self._finalize_complete_file(complete_path, complete_tmp_path, self.status)

def cancel(self, reason: str):
"""Create the cancellation file for the test, and denote in its status that it was
cancelled, but do nothing beyond that."""
Expand Down
23 changes: 23 additions & 0 deletions test/tests/test_run_tests.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Test the 'TestRun' object'"""

import io
import tempfile as tf

from pavilion.errors import TestRunError
from pavilion.test_run import TestRun
from pavilion.unittest import PavTestCase
from pavilion.variables import VariableSetManager
from pavilion.status_file import TestStatusFile


class TestRunTests(PavTestCase):
Expand Down Expand Up @@ -214,3 +216,24 @@ def test_skip_build(self):
self.assertTrue(test.status.has_state('BUILD_SKIPPED'))
test.finalize(VariableSetManager())
self.assertEqual(test.run(), 0)

def test_complete_file_fallback(self):
"""Test that the the fallback mechanism is correctly executed when the RUN_COMPLETE file
is missing on the local filesystem after it is created."""

# 1. Create a temporary directory, and pass that to the test as its working directory
cfg = self._quick_test_cfg()
test = TestRun(self.pav_cfg, cfg)

# 2. Let the test run create the complete file, then delete it.
complete_path = test.path / test.COMPLETE_FN
tmp_complete_path = complete_path.with_suffix(".tmp")
test._create_complete_file(tmp_complete_path)
tmp_complete_path.unlink()

# 3. Run _validate_complete_file to ensure it behaves correctly
status = TestStatusFile(test.path / test.STATUS_FN)
test._finalize_complete_file(complete_path, tmp_complete_path, status)

self.assertTrue(complete_path.is_symlink())
self.assertTrue(status.has_state("WARNING"))