Skip to content

Commit

Permalink
Expand CLI, method argument, fix recursive bug, expand tests also for…
Browse files Browse the repository at this point in the history
… nested directory
  • Loading branch information
GeigerJ2 committed Dec 11, 2024
1 parent 35f5245 commit 0e8e1cb
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 36 deletions.
31 changes: 27 additions & 4 deletions src/aiida/cmdline/commands/cmd_data/cmd_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""`verdi data core.remote` command."""

import stat
from pathlib import Path

import click

Expand Down Expand Up @@ -89,9 +90,31 @@ def remote_show(datum):
echo.echo(f'- Remote folder full path: {datum.get_remote_path()}')


@remote.command('size')
@remote.command("size")
@arguments.NODE()
def remote_size(node):
@click.option(
"-m",
"--method",
type=click.STRING,
default="du",
help="The method that should be used to evaluate the size (either ``du`` or ``lstat``.)",
)
@click.option(
"-p",
"--path",
type=click.Path(),
default=None,
help="Relative path of the object of the ``RemoteData`` node for which the size should be evaluated.",
)
def remote_size(node, method, path):
"""Print the total size of a RemoteData object."""
print(node)
total_size = node.get_size_on_disk()
try:
total_size, method = node.get_size_on_disk(relpath=path, method=method)
remote_path = Path(node.get_remote_path())
full_path = remote_path / path if path is not None else remote_path
echo.echo(
f"Estimated total size of directory `{full_path}` on the Computer "
f"`{node.computer.label}` obtained via `{method}`: {total_size}"
)
except FileNotFoundError as exc:
echo.echo_critical(str(exc))
94 changes: 64 additions & 30 deletions src/aiida/orm/nodes/data/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,14 @@ def _validate(self):
def get_authinfo(self):
return AuthInfo.get_collection(self.backend).get(dbcomputer=self.computer, aiidauser=self.user)

def get_size_on_disk(self, relpath: Path | None = None) -> str:
def get_size_on_disk(self, relpath: Path | None = None, method: str = "du") -> str:
"""
Connects to the remote folder and returns the total size of all files in the directory recursively in a
human-readable format.
:param relpath: File or directory path for which the total size should be returned, relative to
``self.get_remote_path()``.
:param method: Method to be used to evaluate the directory/file size (either ``du`` or ``lstat``).
:return: Total size of file or directory in human-readable format.
:raises: ``FileNotFoundError``, if file or directory does not exist anymore on the remote ``Computer``.
Expand All @@ -210,39 +211,57 @@ def get_size_on_disk(self, relpath: Path | None = None) -> str:
from aiida.common.utils import format_directory_size

if relpath is None:
relpath = Path('.')
relpath = Path(".")

authinfo = self.get_authinfo()
full_path = Path(self.get_remote_path()) / relpath
computer_label = self.computer.label if self.computer is not None else ''
computer_label = self.computer.label if self.computer is not None else ""

with authinfo.get_transport() as transport:
if not transport.path_exists(str(full_path)):
exc_message = (
f'The required remote folder {full_path} on Computer <{computer_label}>'
'does not exist, is not a directory or has been deleted.'
f"The required remote folder {full_path} on Computer <{computer_label}> "
"does not exist, is not a directory or has been deleted."
)
raise FileNotFoundError(exc_message)

try:
total_size: int = self._get_size_on_disk_du(full_path, transport)

except (RuntimeError, NotImplementedError):
lstat_warn = (
'Problem executing `du` command. Will return total file size based on `lstat`. '
'Take the result with a grain of salt, as `lstat` does not consider the file system block size, '
'but instead returns the true size of the files in bytes, which differs from the actual space'
'requirements on disk.'
if method not in ("du", "lstat"):
raise NotImplementedError(
f"Specified method `{method}` for evaluating the size on disk not implemented."
)
_logger.warning(lstat_warn)

if method == "du":
try:
total_size: int = self._get_size_on_disk_du(full_path, transport)

except (RuntimeError, NotImplementedError):
lstat_warn = (
"Problem executing `du` command. Will return total file size based on `lstat`. "
"Take the result with a grain of salt, as `lstat` does not consider the file system block size,"
" but instead returns the true size of the files in bytes, which differs from the actual space"
"requirements on disk."
)

_logger.warning(lstat_warn)
method = "lstat"

else:
_logger.report("Obtained size on the remote using `du`.")

# No elif here, but another if, to allow that the method is internally changed to `lstat`, if `du` fails
if method == "lstat":
try:
total_size: int = self._get_size_on_disk_lstat(full_path, transport)
print(f"TOTAL_SIZE: {total_size}")

except OSError:
_logger.critical('Could not evaluate directory size using either `du` or `lstat`.')
_logger.critical(
"Could not evaluate directory size using either `du` or `lstat`."
)
else:
_logger.report("Obtained size on the remote using `lstat`.")

return format_directory_size(size_in_bytes=total_size)
return format_directory_size(size_in_bytes=total_size), method

def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int:
"""Connects to the remote folder and returns the total size of all files in the directory recursively in bytes
Expand All @@ -256,16 +275,19 @@ def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int:
"""

try:
retval, stdout, stderr = transport.exec_command_wait(f'du --bytes {full_path}')
retval, stdout, stderr = transport.exec_command_wait(
f"du -s --bytes {full_path}"
)
if not stderr and retval == 0:
total_size: int = int(stdout.split('\t')[0])
total_size: int = int(stdout.split("\t")[0])
return total_size
else:
raise RuntimeError(f'Error executing `du` command: {stderr}')
raise RuntimeError(f"Error executing `du` command: {stderr}")

except NotImplementedError as exc:
raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc
# _logger.critical('`exec_command_wait` not implemented for the current transport plugin.')
raise NotImplementedError(
"`exec_command_wait` not implemented for the current transport plugin."
) from exc

def _get_size_on_disk_lstat(self, full_path: Path, transport: Transport) -> int:
"""
Expand All @@ -280,27 +302,39 @@ def _get_size_on_disk_lstat(self, full_path: Path, transport: Transport) -> int:
:raises OSError: When directory given by ``full_path`` not existing or not a directory.
:return: Total size of directory in bytes (including all its contents).
"""
try:
total_size = 0
def _get_size_on_disk_lstat_recursive(full_path, transport):

current_size = 0

contents = self.listdir_withattributes(full_path)

for item in contents:
item_path = full_path / item['name']
item_path = full_path / item["name"]
# Add size of current item (file or directory metadata)
total_size += item['attributes']['st_size']
# breakpoint()
# print(f'ITEM: {item["name"]}, {item["attributes"]["st_size"]}({item_path})')
current_size += item["attributes"]["st_size"]

# If it's a directory, recursively get size of contents
if item['isdir']:
total_size += self._get_size_on_disk_lstat(item_path, transport)
if item["isdir"]:
# print(item["name"])
current_size += _get_size_on_disk_lstat_recursive(item_path, transport)

# print(f"CURRENT_SIZE: {current_size}")

return current_size

try:
total_size = _get_size_on_disk_lstat_recursive(full_path, transport)
return total_size

except OSError as exception:
print(exception.errno, exception)
if exception.errno in (2, 20):
# directory not existing or not a directory
exc = OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
f"The required remote folder {full_path} on {self.computer.label} does not exist, is not a "
"directory or has been deleted."
)
exc.errno = exception.errno
raise exc from exception
Expand Down
57 changes: 55 additions & 2 deletions tests/orm/nodes/data/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,66 @@ def test_get_size_on_disk(request, fixture):

# Check here for human-readable output string, as integer byte values are checked in
# `test_get_size_on_disk_[du|lstat]`
size_on_disk = remote_data.get_size_on_disk()
size_on_disk, method = remote_data.get_size_on_disk(method='du')
assert size_on_disk == '4.01 KB'
assert method == 'du'

size_on_disk, method = remote_data.get_size_on_disk(method='lstat')
assert size_on_disk == '12.00 B'
assert method == 'lstat'

# Path/file non-existent
with pytest.raises(FileNotFoundError, match='.*does not exist, is not a directory.*'):
remote_data.get_size_on_disk(relpath=Path('non-existent'))

@pytest.mark.parametrize(
    'num_char, relpath, sizes',
    (
        (1, '.', {'du': 12291, 'lstat': 8195, 'human': '12.00 KB'}),
        (100, '.', {'du': 12588, 'lstat': 8492, 'human': '12.29 KB'}),
        (int(1e6), '.', {'du': 3012288, 'lstat': 3008192, 'human': '2.87 MB'}),
        (1, 'subdir1', {'du': 8194, 'lstat': 4098, 'human': '8.00 KB'}),
        (100, 'subdir1', {'du': 8392, 'lstat': 4296, 'human': '8.20 KB'}),
        (int(1e6), 'subdir1', {'du': 2008192, 'lstat': 2004096, 'human': '1.92 MB'}),
    ),
)
def test_get_size_on_disk_nested(aiida_localhost, tmp_path, num_char, relpath, sizes):
    """Check the size evaluation of a nested directory tree, for the root and for a sub-directory.

    The following layout is created below ``tmp_path``, each file holding ``num_char`` times the character ``a``::

        tmp_path/
            file3.txt
            subdir1/
                file1.txt
                subdir2/
                    file2.bin

    The byte counts obtained via ``_get_size_on_disk_du`` and ``_get_size_on_disk_lstat`` as well as the
    human-readable string returned by ``get_size_on_disk`` are compared against the hard-coded ``sizes``.

    :param num_char: Number of characters written to each of the three files.
    :param relpath: Path relative to the remote path of the ``RemoteData`` for which the size is evaluated.
    :param sizes: Expected values with the keys ``du``, ``lstat`` and ``human``.
    """
    # NOTE(review): the expected values are larger than the plain sum of the file contents, so they presumably
    # include the size of the directory entries and may depend on the file system -- confirm on CI.
    sub_dir1 = tmp_path / 'subdir1'
    sub_dir1.mkdir()

    sub_dir2 = sub_dir1 / 'subdir2'
    sub_dir2.mkdir()

    # Create some files with known sizes
    (sub_dir1 / 'file1.txt').write_text('a' * num_char)
    (sub_dir2 / 'file2.bin').write_bytes(b'a' * num_char)
    (tmp_path / 'file3.txt').write_text('a' * num_char)

    remote_data = RemoteData(computer=aiida_localhost, remote_path=tmp_path)

    authinfo = remote_data.get_authinfo()
    full_path = Path(remote_data.get_remote_path()) / relpath

    # The private helpers require an open transport and return integer byte counts
    with authinfo.get_transport() as transport:
        size_on_disk_du = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path)
        size_on_disk_lstat = remote_data._get_size_on_disk_lstat(transport=transport, full_path=full_path)

    # The public method returns the formatted size together with the method that was used; only the former is checked
    size_on_disk_human, _ = remote_data.get_size_on_disk(relpath=relpath)

    assert size_on_disk_du == sizes['du']
    assert size_on_disk_lstat == sizes['lstat']
    assert size_on_disk_human == sizes['human']

# Do the same possibly for subtrees


@pytest.mark.parametrize(
'num_char, sizes',
Expand Down Expand Up @@ -104,7 +157,7 @@ def test_get_size_on_disk_sizes(tmp_path, num_char, sizes, request, fixture):
with authinfo.get_transport() as transport:
size_on_disk_du = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path)
size_on_disk_lstat = remote_data._get_size_on_disk_lstat(transport=transport, full_path=full_path)
size_on_disk_human = remote_data.get_size_on_disk()
size_on_disk_human, _ = remote_data.get_size_on_disk()

assert size_on_disk_du == sizes['du']
assert size_on_disk_lstat == sizes['lstat']
Expand Down

0 comments on commit 0e8e1cb

Please sign in to comment.