Skip to content

Commit

Permalink
MAINT: PR 839 revisions
Browse files Browse the repository at this point in the history
* `cffi_backend` module changes requested from PR review
  - remove a spurious `darshan_free` from `_log_get_heatmap_record()`
  - fix the scoping of the `darshan_free` of `buf` object used with
    `darshan_accumulator_inject` in `log_get_derived_metrics`
  - adding a missing `log_close()` to `log_get_derived_metrics` (maybe
    we can wrap in Python contexts in the future though)
  - use a separate buffer for `darshan_accumulator_emit()` inside
    `log_get_derived_metrics`

* note that making the above CFFI/free-related changes caused
a segfault in the testuite, so in the end I adjusted the location
of the memory freeing as I saw fit to avoid segfaults--I'd say at this
point please provide concrete evidence with a memory leak plot or
failing test for additional adjustments there, or just push the change
in

* in the end, there is a slightly more concise usage of `darshan_free()`
but no meaningful change in the free operations

* I also reverted the suggested changed to `darshan_accumulator_emit()`
usage--there was no testable evidence of an issue, and it was also
causing segfaults..

* address many of the discussion points that came up in gh-868:
  - `log_get_derived_metrics()` now uses an LRU cache, which effectively
    means that we use memoization to return derived metrics data
    rather than doing another pass over the log file if the same
    log path and module name have already been accumulated from; we
    still need to pass over a given log twice in most cases--once at
    initial read-in and once for using `log_get_derived_metrics`; how
    we decide to add filtering of records prior to accumulation
    interface in Python is probably a deeper discussion/for later
  - `log_get_bytes_bandwidth()` and its associated testing have been
     migrated to modules not named after "CFFI", like the in the above
     PR, because I think we should only use the "CFFI" named modules
     for direct CFFI interaction/testing, and for other analyses we
     should probably use more distinct names. Also, to some extent
     everything depends on the CFFI layer, so trying to restrict
     "CFFI" modules to direct rather than direct interaction will
     help keep them manageably sized, especially given the proclivity
     for surprising memory issues/segfaults in those parts of the code.
  - add a proper docstring with examples for `log_get_bytes_bandwidth()`
  • Loading branch information
tylerjereddy committed Dec 16, 2022
1 parent f1bac18 commit 96806fa
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 106 deletions.
1 change: 0 additions & 1 deletion darshan-util/pydarshan/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
Expand Down
18 changes: 3 additions & 15 deletions darshan-util/pydarshan/darshan/backend/cffi_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,6 @@ def _log_get_heatmap_record(log):
buf = ffi.new("void **")
r = libdutil.darshan_log_get_record(log['handle'], modules[mod_name]['idx'], buf)
if r < 1:
libdutil.darshan_free(buf[0])
return None

filerec = ffi.cast(mod_type, buf)
Expand All @@ -660,6 +659,7 @@ def _log_get_heatmap_record(log):
return rec


@functools.lru_cache()
def log_get_derived_metrics(log_path: str, mod_name: str):
"""
Returns the darshan_derived_metrics struct from CFFI/C accumulator code.
Expand Down Expand Up @@ -716,24 +716,12 @@ def log_get_derived_metrics(log_path: str, mod_name: str):
r = libdutil.darshan_accumulator_emit(darshan_accumulator[0],
darshan_derived_metrics,
rbuf[0])
libdutil.darshan_free(buf[0])
if r != 0:
libdutil.darshan_free(buf[0])
raise RuntimeError("A nonzero exit code was received from "
"darshan_accumulator_emit() at the C level. "
"It may be possible "
"to retrieve additional information from the stderr "
"stream.")
libdutil.darshan_free(buf[0])
log_close(log_handle)
return darshan_derived_metrics


def log_get_bytes_bandwidth(log_path: str, mod_name: str) -> str:
# get total bytes (in MiB) and bandwidth (in MiB/s) for
# a given module -- this information was commonly reported
# in the old perl-based summary reports
darshan_derived_metrics = log_get_derived_metrics(log_path=log_path,
mod_name=mod_name)
total_mib = darshan_derived_metrics.total_bytes / 2 ** 20
total_bw = darshan_derived_metrics.agg_perf_by_slowest
ret_str = f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s"
return ret_str
2 changes: 1 addition & 1 deletion darshan-util/pydarshan/darshan/cli/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import darshan
import darshan.cli
from darshan.backend.cffi_backend import log_get_bytes_bandwidth
from darshan.lib.accum import log_get_bytes_bandwidth
from darshan.experimental.plots import (
plot_dxt_heatmap,
plot_io_cost,
Expand Down
53 changes: 53 additions & 0 deletions darshan-util/pydarshan/darshan/lib/accum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from darshan.backend.cffi_backend import log_get_derived_metrics


def log_get_bytes_bandwidth(log_path: str, mod_name: str) -> str:
"""
Summarize I/O performance for a given darshan module.
Parameters
----------
log_path : str
Path to the darshan binary log file.
mod_name : str
Name of the darshan module to summarize the I/O
performance for.
Returns
-------
out: str
A short string summarizing the performance of the given module
in the provided log file, including bandwidth and total data
transferred.
Raises
------
RuntimeError
When a provided module name is not supported for the accumulator
interface for provision of the summary data, or for any other
error that occurs in the C/CFFI interface.
ValueError
When a provided module name does not exist in the log file.
Examples
--------
>>> from darshan.log_utils import get_log_path
>>> from darshan.lib.accum import log_get_bytes_bandwidth
>>> log_path = get_log_path("imbalanced-io.darshan")
>>> log_get_bytes_bandwidth(log_path, "POSIX")
I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s
>>> log_get_bytes_bandwidth(log_path, "MPI-IO")
I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s
"""
# get total bytes (in MiB) and bandwidth (in MiB/s) for
# a given module -- this information was commonly reported
# in the old perl-based summary reports
darshan_derived_metrics = log_get_derived_metrics(log_path=log_path,
mod_name=mod_name)
total_mib = darshan_derived_metrics.total_bytes / 2 ** 20
total_bw = darshan_derived_metrics.agg_perf_by_slowest
ret_str = f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s"
return ret_str
89 changes: 0 additions & 89 deletions darshan-util/pydarshan/darshan/tests/test_cffi_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,92 +159,3 @@ def test_log_get_generic_record(dtype):
# make sure the returned key/column names agree
assert actual_counter_names == expected_counter_names
assert actual_fcounter_names == expected_fcounter_names


@pytest.mark.parametrize("log_path, mod_name, expected_str", [
# the expected bytes/bandwidth strings are pasted
# directly from the old perl summary reports;
# exceptions noted below
# in some cases we defer to darshan-parser for the expected
# values; see discussion in gh-839
("imbalanced-io.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
("imbalanced-io.darshan",
"MPI-IO",
"I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"),
# imbalanced-io.darshan does have LUSTRE data,
# but it doesn't support derived metrics at time
# of writing
("imbalanced-io.darshan",
"LUSTRE",
"RuntimeError"),
("imbalanced-io.darshan",
"POSIX",
"I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"),
("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
"POSIX",
"I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"),
("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"),
("e3sm_io_heatmap_only.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"),
("e3sm_io_heatmap_only.darshan",
"MPI-IO",
"I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"),
("partial_data_stdio.darshan",
"MPI-IO",
"I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"),
("partial_data_stdio.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"),
# the C derived metrics code can't distinguish
# between different kinds of errors at this time,
# but we can still intercept in some cases...
("partial_data_stdio.darshan",
"GARBAGE",
"ValueError"),
# TODO: determine if the lack of APMPI and
# any other "add-ons" in _structdefs is a bug
# in the control flow for `log_get_derived_metrics()`?
("e3sm_io_heatmap_only.darshan",
"APMPI",
"KeyError"),
("skew-app.darshan",
"POSIX",
"I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"),
("skew-app.darshan",
"MPI-IO",
"I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"),
])
def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
# test the basic scenario of retrieving
# the total data transferred and bandwidth
# for all records in a given module; the situation
# of accumulating drived metrics with filtering
# (i.e., for a single filename) is not tested here

log_path = get_log_path(log_path)
if expected_str == "RuntimeError":
with pytest.raises(RuntimeError,
match=f"{mod_name} module does not support derived"):
backend.log_get_bytes_bandwidth(log_path=log_path,
mod_name=mod_name)
elif expected_str == "ValueError":
with pytest.raises(ValueError,
match=f"{mod_name} is not in the available log"):
backend.log_get_bytes_bandwidth(log_path=log_path,
mod_name=mod_name)
elif expected_str == "KeyError":
with pytest.raises(KeyError, match=f"{mod_name}"):
backend.log_get_bytes_bandwidth(log_path=log_path,
mod_name=mod_name)
else:
actual_str = backend.log_get_bytes_bandwidth(log_path=log_path,
mod_name=mod_name)
assert actual_str == expected_str
93 changes: 93 additions & 0 deletions darshan-util/pydarshan/darshan/tests/test_lib_accum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from darshan.lib.accum import log_get_bytes_bandwidth
from darshan.log_utils import get_log_path

import pytest


@pytest.mark.parametrize("log_path, mod_name, expected_str", [
# the expected bytes/bandwidth strings are pasted
# directly from the old perl summary reports;
# exceptions noted below
# in some cases we defer to darshan-parser for the expected
# values; see discussion in gh-839
("imbalanced-io.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
("imbalanced-io.darshan",
"MPI-IO",
"I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"),
# imbalanced-io.darshan does have LUSTRE data,
# but it doesn't support derived metrics at time
# of writing
("imbalanced-io.darshan",
"LUSTRE",
"RuntimeError"),
("imbalanced-io.darshan",
"POSIX",
"I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"),
("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
"POSIX",
"I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"),
("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"),
("e3sm_io_heatmap_only.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"),
("e3sm_io_heatmap_only.darshan",
"MPI-IO",
"I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"),
("partial_data_stdio.darshan",
"MPI-IO",
"I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"),
("partial_data_stdio.darshan",
"STDIO",
"I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"),
# the C derived metrics code can't distinguish
# between different kinds of errors at this time,
# but we can still intercept in some cases...
("partial_data_stdio.darshan",
"GARBAGE",
"ValueError"),
# TODO: determine if the lack of APMPI and
# any other "add-ons" in _structdefs is a bug
# in the control flow for `log_get_derived_metrics()`?
("e3sm_io_heatmap_only.darshan",
"APMPI",
"KeyError"),
("skew-app.darshan",
"POSIX",
"I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"),
("skew-app.darshan",
"MPI-IO",
"I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"),
])
def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
# test the basic scenario of retrieving
# the total data transferred and bandwidth
# for all records in a given module; the situation
# of accumulating drived metrics with filtering
# (i.e., for a single filename) is not tested here

log_path = get_log_path(log_path)
if expected_str == "RuntimeError":
with pytest.raises(RuntimeError,
match=f"{mod_name} module does not support derived"):
log_get_bytes_bandwidth(log_path=log_path,
mod_name=mod_name)
elif expected_str == "ValueError":
with pytest.raises(ValueError,
match=f"{mod_name} is not in the available log"):
log_get_bytes_bandwidth(log_path=log_path,
mod_name=mod_name)
elif expected_str == "KeyError":
with pytest.raises(KeyError, match=f"{mod_name}"):
log_get_bytes_bandwidth(log_path=log_path,
mod_name=mod_name)
else:
actual_str = log_get_bytes_bandwidth(log_path=log_path,
mod_name=mod_name)
assert actual_str == expected_str

0 comments on commit 96806fa

Please sign in to comment.