Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Python derived/accum interface #839

Merged
merged 20 commits into from
Feb 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions darshan-util/pydarshan/darshan/backend/cffi_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,3 +708,72 @@ def _df_to_rec(rec_dict, mod_name, rec_index_of_interest=None):
rec_arr.rank = counters_df.iloc[rec_index_of_interest, 1]
buf = rec_arr.tobytes()
return buf


@functools.lru_cache()
def log_get_derived_metrics(log_path: str, mod_name: str):
    """
    Returns the darshan_derived_metrics struct from CFFI/C accumulator code.

    Every record of ``mod_name`` found in the log is injected into a
    C-level accumulator, which is then asked to emit the derived metrics.
    The log handle, the record buffer and the accumulator are released on
    every exit path, including the error paths.

    Because of the ``lru_cache`` decorator, repeated calls with the same
    arguments return the same cached cdata object.

    Parameters:
        log_path: Path to the darshan log file
        mod_name: The name of the module to retrieve derived metrics for

    Returns:
        darshan_derived_metrics struct (cdata object)

    Raises:
        ValueError: if ``mod_name`` is not among the modules of the log.
        KeyError: if ``mod_name`` has no entry in ``_structdefs``.
        RuntimeError: if creating the accumulator, injecting a record or
            emitting the metrics returns a nonzero code at the C level,
            or if no record could be read for the module.
    """
    # TODO: eventually add support for e.g., a regex filter on the records
    # the user wants to get derived metrics for--like filtering to records
    # with a single filename involved before accumulating the data?
    log_handle = log_open(log_path)
    # allocated up front so the ``finally`` clause can always inspect them;
    # ffi.new() zero-initializes, so buf[0] stays NULL until the C library
    # hands back a record buffer
    buf = ffi.new("void **")
    darshan_accumulator = ffi.new("darshan_accumulator *")
    accumulator_created = False
    try:
        jobrec = ffi.new("struct darshan_job *")
        libdutil.darshan_log_get_job(log_handle['handle'], jobrec)
        modules = log_get_modules(log_handle)

        if mod_name not in modules:
            raise ValueError(f"{mod_name} is not in the available log file "
                             f"modules: {modules.keys()}")

        # NOTE: deliberately raises KeyError for modules that are present
        # in the log but have no struct definition (callers rely on this)
        mod_type = _structdefs[mod_name]
        mod_idx = modules[mod_name]['idx']
        r = libdutil.darshan_accumulator_create(mod_idx,
                                                jobrec[0].nprocs,
                                                darshan_accumulator)
        if r != 0:
            raise RuntimeError("A nonzero exit code was received from "
                               "darshan_accumulator_create() at the C level. "
                               f"This could mean that the {mod_name} module does not "
                               "support derived metric calculation, or that "
                               "another kind of error occurred. It may be possible "
                               "to retrieve additional information from the stderr "
                               "stream.")
        accumulator_created = True

        rbuf = None
        while True:
            r = libdutil.darshan_log_get_record(log_handle['handle'], mod_idx, buf)
            # a return value below 1 ends the iteration over the records
            if r < 1:
                break
            rbuf = ffi.cast(mod_type, buf)
            r_i = libdutil.darshan_accumulator_inject(darshan_accumulator[0], rbuf[0], 1)
            if r_i != 0:
                raise RuntimeError("A nonzero exit code was received from "
                                   "darshan_accumulator_inject() at the C level. "
                                   "It may be possible "
                                   "to retrieve additional information from the stderr "
                                   "stream.")

        if rbuf is None:
            # without at least one record there is no buffer to hand to
            # darshan_accumulator_emit() below
            raise RuntimeError(f"No records could be read for the {mod_name} "
                               "module, so derived metrics cannot be "
                               "calculated.")

        darshan_derived_metrics = ffi.new("struct darshan_derived_metrics *")
        r = libdutil.darshan_accumulator_emit(darshan_accumulator[0],
                                              darshan_derived_metrics,
                                              rbuf[0])
    finally:
        # release C-level resources no matter how the block above exited
        if buf[0] != ffi.NULL:
            libdutil.darshan_free(buf[0])
        if accumulator_created:
            libdutil.darshan_accumulator_destroy(darshan_accumulator[0])
        log_close(log_handle)

    if r != 0:
        raise RuntimeError("A nonzero exit code was received from "
                           "darshan_accumulator_emit() at the C level. "
                           "It may be possible "
                           "to retrieve additional information from the stderr "
                           "stream.")
    return darshan_derived_metrics
3 changes: 2 additions & 1 deletion darshan-util/pydarshan/darshan/cli/base.html
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ <h3>${fig_title}</h3>
<figcaption>${fig.fig_description}</figcaption>
% else:
<!-- temporary handling for DXT-disabled cases -->
<figcaption style="font-weight: bold; color: red; width: 400px;">
<!-- now also handles the bandwidth text... -->
<figcaption style="font-weight: bold; color: ${fig.text_only_color}; width: 400px;">
${fig.fig_description}
</figcaption>
% endif
Expand Down
34 changes: 34 additions & 0 deletions darshan-util/pydarshan/darshan/cli/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import darshan
import darshan.cli
from darshan.lib.accum import log_get_bytes_bandwidth
from darshan.experimental.plots import (
plot_dxt_heatmap,
plot_io_cost,
Expand Down Expand Up @@ -53,6 +54,11 @@ def __init__(
fig_args: dict,
fig_description: str = "",
fig_width: int = 500,
# when no HTML data is generated for the figure
# (i.e., there is no image/plot), the caption is
# rendered as standalone text; this sets the color
# of that text (e.g., for a warning or important note)
text_only_color: str = "red",
):
self.section_title = section_title
if not fig_title:
Expand All @@ -65,7 +71,11 @@ def __init__(
# temporary handling for DXT disabled cases
# so special error message can be passed
# in place of an encoded image
# NOTE: this code path is now also
# being used for adding the bandwidth
# text, which doesn't really have an image...
self.fig_html = None
self.text_only_color = text_only_color
if self.fig_func:
self.generate_fig()

Expand Down Expand Up @@ -487,6 +497,30 @@ def register_figures(self):
)
self.figures.append(opcount_fig)

try:
# this is really just some text
# so using ReportFigure feels awkward...
bandwidth_fig = ReportFigure(
section_title=sect_title,
fig_title="",
fig_func=None,
fig_args=None,
fig_description=log_get_bytes_bandwidth(log_path=self.log_path,
mod_name=mod),
text_only_color="blue")
self.figures.append(bandwidth_fig)
except (RuntimeError, KeyError):
# the module probably doesn't support derived metrics
# calculations, but the C code doesn't distinguish other
# types of errors

# the KeyError appears to be needed for a subset of logs
# for which _structdefs lacks APMPI or APXC entries;
# for example `e3sm_io_heatmap_only.darshan` in logs
# repo
pass


#########################
# Data Access by Category
if not {"POSIX", "STDIO"}.isdisjoint(set(self.report.modules)):
Expand Down
53 changes: 53 additions & 0 deletions darshan-util/pydarshan/darshan/lib/accum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from darshan.backend.cffi_backend import log_get_derived_metrics


def log_get_bytes_bandwidth(log_path: str, mod_name: str) -> str:
    """
    Build a one-line I/O performance summary for a darshan module.

    The total data volume and the bandwidth are taken from the derived
    metrics returned by ``log_get_derived_metrics`` for the requested
    module.

    Parameters
    ----------
    log_path : str
        Path to the darshan binary log file.
    mod_name : str
        Name of the darshan module whose I/O performance
        should be summarized.

    Returns
    -------
    out: str
        A short sentence naming the module layer, the total data
        transferred (in MiB, one decimal place) and the bandwidth
        (in MiB/s, two decimal places).

    Raises
    ------
    RuntimeError
        When the module is not supported by the accumulator
        interface, or for any other error that occurs in the
        C/CFFI interface.
    ValueError
        When the module name does not exist in the log file.

    Examples
    --------

    >>> from darshan.log_utils import get_log_path
    >>> from darshan.lib.accum import log_get_bytes_bandwidth

    >>> log_path = get_log_path("imbalanced-io.darshan")
    >>> log_get_bytes_bandwidth(log_path, "POSIX")
    'I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s'

    >>> log_get_bytes_bandwidth(log_path, "MPI-IO")
    'I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s'
    """
    # total bytes and bandwidth were commonly reported
    # in the old perl-based summary reports
    metrics = log_get_derived_metrics(log_path=log_path, mod_name=mod_name)
    # bytes -> MiB
    total_mib = metrics.total_bytes / 2 ** 20
    # reported as MiB/s in the summary string
    total_bw = metrics.agg_perf_by_slowest
    return f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s"
93 changes: 93 additions & 0 deletions darshan-util/pydarshan/darshan/tests/test_lib_accum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from darshan.lib.accum import log_get_bytes_bandwidth
from darshan.log_utils import get_log_path

import pytest


@pytest.mark.parametrize("log_path, mod_name, expected_str", [
    # the expected bytes/bandwidth strings are pasted
    # directly from the old perl summary reports;
    # exceptions noted below
    # in some cases we defer to darshan-parser for the expected
    # values; see discussion in gh-839
    ("imbalanced-io.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
    ("imbalanced-io.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"),
    # imbalanced-io.darshan does have LUSTRE data,
    # but it doesn't support derived metrics at time
    # of writing
    ("imbalanced-io.darshan",
     "LUSTRE",
     "RuntimeError"),
    ("imbalanced-io.darshan",
     "POSIX",
     "I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"),
    ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
    ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
     "POSIX",
     "I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"),
    ("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"),
    ("e3sm_io_heatmap_only.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"),
    ("e3sm_io_heatmap_only.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"),
    ("partial_data_stdio.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"),
    ("partial_data_stdio.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"),
    # the C derived metrics code can't distinguish
    # between different kinds of errors at this time,
    # but we can still intercept in some cases...
    ("partial_data_stdio.darshan",
     "GARBAGE",
     "ValueError"),
    # TODO: determine if the lack of APMPI and
    # any other "add-ons" in _structdefs is a bug
    # in the control flow for `log_get_derived_metrics()`?
    ("e3sm_io_heatmap_only.darshan",
     "APMPI",
     "KeyError"),
    ("skew-app.darshan",
     "POSIX",
     "I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"),
    ("skew-app.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"),
])
def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
    # basic scenario: retrieve the total data transferred
    # and the bandwidth over all records of a given module;
    # accumulating derived metrics with filtering
    # (i.e., for a single filename) is not tested here

    # an `expected_str` naming one of these exception types means
    # the call must raise it with a message matching the pattern
    expected_errors = {
        "RuntimeError": (RuntimeError,
                         f"{mod_name} module does not support derived"),
        "ValueError": (ValueError,
                       f"{mod_name} is not in the available log"),
        "KeyError": (KeyError, f"{mod_name}"),
    }

    full_path = get_log_path(log_path)
    error_case = expected_errors.get(expected_str)
    if error_case is None:
        # regular case: the summary string must match exactly
        actual_str = log_get_bytes_bandwidth(log_path=full_path,
                                             mod_name=mod_name)
        assert actual_str == expected_str
        return

    exc_type, pattern = error_case
    with pytest.raises(exc_type, match=pattern):
        log_get_bytes_bandwidth(log_path=full_path,
                                mod_name=mod_name)
6 changes: 6 additions & 0 deletions darshan-util/pydarshan/darshan/tests/test_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,12 @@ def test_main_all_logs_repo_files(tmpdir, log_filepath):
else:
assert actual_runtime_heatmap_titles == 0

# check for presence of bandwidth summary strings
# (more detailed per-module probes are present
# in test_derived_metrics_bytes_and_bandwidth())
assert "I/O performance estimate" in report_str
assert "color: blue" in report_str


class TestReportData:

Expand Down