Skip to content

Commit

Permalink
MAINT: improve C accum error handling
Browse files Browse the repository at this point in the history
* returning `-1` from a public API C function isn't sufficient
to provide useful error information when working in Python/CFFI--it
only tells you something went wrong if you check for a return
code in the first place (which isn't normally done in Python
anyway--normally you end execution at the error point and `exit()`
with an appropriate error message)

* when there are multiple conditions that can trigger the `-1`
return value, the situation is even worse, one literally has
to `printf` sprinkle the source to figure out what went wrong
where

* as a compromise, I'll leave the `-1` approach in since that
is quite common in standard `C`, but I'm going to add in
prints to `stderr` so that Python can then intercept the `-1`
and refer the user to `stderr`

* also, `darshan_accumulator_inject()` assumed that the `module_id`
was reasonable because it was set/checked in
`darshan_accumulator_create()`, however my experience in darshan-hpcgh-839
was that the accumulator memory location can get freed, or not
properly set at CFFI boundary after calling the creation function,
so I think the assumption that a previous function was called
and worked perfectly is too fragile--I'm adding error handling
to prevent a hard segfault on nonsense values of that structure
member as a result
  • Loading branch information
tylerjereddy committed Oct 30, 2022
1 parent 045b9bd commit ca4f3ea
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 1 deletion.
11 changes: 10 additions & 1 deletion darshan-util/darshan-logutils-accumulator.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <stdlib.h>
#include <assert.h>
#include <stdio.h>

#include "darshan-logutils.h"
#include "uthash-1.9.2/src/uthash.h"
Expand Down Expand Up @@ -107,10 +108,16 @@ int darshan_accumulator_inject(darshan_accumulator acc,
int ret;
file_hash_entry_t *hfile = NULL;

if(acc->module_id >= DARSHAN_KNOWN_MODULE_COUNT || acc->module_id < 0) {
fprintf(stderr, "darshan_accumulator_inject received an accumulator struct with an id that is likely corrupted\n");
return(-1);
}

if(!mod_logutils[acc->module_id]->log_agg_records ||
!mod_logutils[acc->module_id]->log_sizeof_record ||
!mod_logutils[acc->module_id]->log_record_metrics) {
/* this module doesn't support this operation */
fprintf(stderr, "darshan_accumulator_inject is operating on a module that doesn't support this operation");
return(-1);
}

Expand All @@ -126,8 +133,10 @@ int darshan_accumulator_inject(darshan_accumulator acc,
ret = mod_logutils[acc->module_id]->log_record_metrics( new_record,
&rec_id, &r_bytes, &w_bytes, &max_offset, &io_total_time,
&md_only_time, &rw_only_time, &rank, &nprocs);
if(ret < 0)
if(ret < 0) {
fprintf(stderr, "darshan_accumulator_inject was unable to retrieve generic metrics from record");
return(-1);
}

/* accumulate performance metrics */

Expand Down
33 changes: 33 additions & 0 deletions darshan-util/pydarshan/darshan/backend/api_def_c.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,34 @@


header = """/* from darshan-logutils.h */
struct darshan_derived_metrics {
int64_t total_bytes;
double unique_io_total_time_by_slowest;
double unique_rw_only_time_by_slowest;
double unique_md_only_time_by_slowest;
int unique_io_slowest_rank;
double shared_io_total_time_by_slowest;
double agg_perf_by_slowest;
double agg_time_by_slowest;
struct darshan_file_category_counters;
};
struct darshan_accumulator {
int64_t module_id;
int64_t job_nprocs;
void* agg_record;
int num_records;
void *file_hash_table;
double shared_io_total_time_by_slowest;
int64_t total_bytes;
double *rank_cumul_io_total_time;
double *rank_cumul_rw_only_time;
double *rank_cumul_md_only_time;
};
struct darshan_mnt_info
{
char mnt_type[3031];
Expand All @@ -23,6 +51,11 @@
int partial_flag;
};
int darshan_accumulator_emit(struct darshan_accumulator, struct darshan_derived_metrics*, void* aggregation_record);
int darshan_accumulator_destroy(struct darshan_accumulator);
int darshan_accumulator_create(enum darshan_module_id, int64_t, struct darshan_accumulator*);
int darshan_accumulator_inject(struct darshan_accumulator, void*, int);
/* from darshan-log-format.h */
typedef uint64_t darshan_record_id;
Expand Down
35 changes: 35 additions & 0 deletions darshan-util/pydarshan/darshan/tests/test_cffi_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,38 @@ def test_log_get_generic_record(dtype):
# make sure the returned key/column names agree
assert actual_counter_names == expected_counter_names
assert actual_fcounter_names == expected_fcounter_names


@pytest.mark.parametrize("log_path", [
"imbalanced-io.darshan",
])
def test_accumulator_invalid_id(capfdbinary, log_path):
# check for proper error handling of invalid
# id in darshan_accumulator_inject() C function
log_path = get_log_path(log_path)
log = backend.log_open(log_path)
jobrec = backend.ffi.new("struct darshan_job *")
backend.libdutil.darshan_log_get_job(log['handle'], jobrec)
modules = backend.log_get_modules(log)
for mod_name in modules:
mod_type = backend._structdefs[mod_name]
darshan_accumulator = backend.ffi.new("struct darshan_accumulator *")
buf = backend.ffi.new("void **")
r = backend.libdutil.darshan_log_get_record(log['handle'], modules[mod_name]['idx'], buf)
if r < 1:
continue
rbuf = backend.ffi.cast(mod_type, buf)
ret_create = backend.libdutil.darshan_accumulator_create(modules[mod_name]['idx'],
jobrec[0].nprocs,
darshan_accumulator)
# creation should work just fine
assert ret_create == 0
# the memory handling around the CFFI interface/struct opacity
# is such that we expect an error here:
r = backend.libdutil.darshan_accumulator_inject(darshan_accumulator[0], rbuf[0], 1)
if r == -1:
# stderr should provide something useful
captured = capfdbinary.readouterr()
assert b"id that is likely corrupted" in captured.err
else:
pytest.skip("darshan_accumulator_inject() is working in this scenario")

0 comments on commit ca4f3ea

Please sign in to comment.