Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/timeout #80

Merged
merged 10 commits into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
schedule:
- cron: 0 8 * * 0

jobs:
jobs:
build:
name: Test Build
runs-on: ubuntu-latest
Expand All @@ -18,7 +18,11 @@ jobs:
uses: actions/checkout@v2

- name: install packages
run: sudo apt update; sudo apt upgrade -y; sudo apt install -y build-essential ninja-build libglib2.0-dev libfdt-dev libpixman-1-dev zlib1g-dev python3-tables python3-pandas python3-prctl python3-json5 python3-protobuf libprotobuf-c-dev protobuf-compiler protobuf-c-compiler python3-tqdm; pip install protobuf==4.21.12
run: |
sudo apt update
sudo apt upgrade -y
sudo apt install -y build-essential ninja-build libglib2.0-dev libfdt-dev libpixman-1-dev zlib1g-dev libprotobuf-c-dev protobuf-compiler protobuf-c-compiler libcap-dev
pip install -r requirements.txt

- name: Checkout submodules
run: git submodule update --init
Expand Down
111 changes: 59 additions & 52 deletions controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
import logging
from multiprocessing import Manager, Process, Value
from pathlib import Path
import psutil
import signal
from statistics import mean
import subprocess
import sys
import tables
Expand Down Expand Up @@ -357,6 +359,7 @@ def read_backup(hdf5_file):
"tb_info": bool(hdf5_flags["tb_info"][0]),
"mem_info": bool(hdf5_flags["mem_info"][0]),
"max_instruction_count": int(hdf5_flags["max_instruction_count"][0]),
"fault_count": int(hdf5_flags["fault_count"][0]),
"start": {
"address": int(hdf5_start_address["address"][0]),
"counter": int(hdf5_start_address["counter"][0]),
Expand Down Expand Up @@ -436,7 +439,7 @@ def read_backup(hdf5_file):
rows = f_in.root.Goldenrun.armregisters.iterrows()
else:
raise tables.NoSuchNodeError(
"No supported register architecture could be found in the HDF5 file"
"No supported register architecture could be found in the HDF5 file, run with the overwrite flag to overwrite"
)

backup_goldenrun[register_backup_name] = []
Expand All @@ -455,6 +458,14 @@ def read_backup(hdf5_file):
backup_goldenrun[register_backup_name].append(registers)

# Process expanded faults
if (
f_in.root.Backup.expanded_faults._v_nchildren
!= backup_config["fault_count"]
):
raise tables.NoSuchNodeError(
f"Out of {backup_config['fault_count']} faults, only {f_in.root.Backup.expanded_faults._v_nchildren} are available in the backup. Run with the overwrite flag to overwrite"
)

backup_expanded_faults = []
exp_n = 0

Expand Down Expand Up @@ -564,7 +575,7 @@ def controller(
"""
clogger.info("Controller start")

t0 = time.time()
total_start_time = time.time()

hdf5path = args.hdf5file
qemu_output = args.debug
Expand Down Expand Up @@ -597,10 +608,8 @@ def controller(
"Backup could not be found in the HDF5 file, run with the overwrite flag to overwrite!"
)
return config_qemu
except tables.NoSuchNodeError:
clogger.warning(
"Invalid/unsupported backup file, run with the overwrite flag to overwrite!"
)
except tables.NoSuchNodeError as e:
clogger.warning(e)
return config_qemu

clogger.info("Checking the backup")
Expand Down Expand Up @@ -746,7 +755,7 @@ def controller(
),
)
p.start()
p_list.append({"process": p, "start_time": time.time()})
p_list.append({"process": p, "start_time": time.time(), "faults": faults})
clogger.debug(f"Started worker {faults['index']}. Running: {len(p_list)}.")
clogger.debug(f"Fault address: {faults['faultlist'][0].address}")
clogger.debug(
Expand All @@ -764,63 +773,60 @@ def controller(
mem_max = max(mem_list)

# Calculate length of running processes
times.clear()
time_max = 0
current_time = time.time()
for i in range(len(p_list)):
p = p_list[i]
tmp = current_time - p["start_time"]
# If the current processing time is lower than moving average, do not punish the time
if tmp < p_time_mean:
times.append(0)
else:
times.append(tmp - p_time_mean)
# Find max time in list (This list will show the longest running
# process minus the moving average)
if len(times) > 0:
time_max = max(times)

for i in range(len(p_list)):
p = p_list[i]
times = [time.time() - p["start_time"] for p in p_list]
# If the current processing time is lower than moving average, do not punish the time
times = [max(0, time - p_time_mean) for time in times]
time_max = max(times) if times else 0

for i, p in enumerate(p_list):
# Find finished processes
p["process"].join(timeout=0)

# Halt experiment if timeout duration exceeded
# If gdb is used the timeout is not applicable
if (
p["process"].is_alive()
and (time.time() - p["start_time"]) > config_qemu["timeout"]
and not config_qemu.get("gdb", False)
):
clogger.warning(f"Experiment {p['faults']['index']} ran into timeout")
# Search for qemu thread and kill qemu process if found
# qemu process is a child process of qemu thread
qemu_thread_name = f"qemu{p['faults']['index']}"
for process in psutil.process_iter():
if process.name() != qemu_thread_name:
continue
clogger.debug(f"{process.name()} killed")
assert (
len(process.children()) == 1
), "qemu thread should only have qemu child process"
process.children()[0].terminate()
break
else:
clogger.debug(f"{qemu_thread_name} not found to kill")
# Wait for worker process terminates
p["process"].join()

if p["process"].is_alive() is False:
# Recalculate moving average
p_time_list.append(current_time - p["start_time"])
len_p_time_list = len(p_time_list)
if len_p_time_list > num_workers + 2:
p_time_list.pop(0)
p_time_mean = sum(p_time_list) / len_p_time_list
clogger.debug("Current running Average {}".format(p_time_mean))
p_time_list.append(time.time() - p["start_time"])
p_time_list = p_time_list[-(num_workers + 2) :] # remove old entries
p_time_mean = mean(p_time_list)
clogger.debug(f"Process time running mean: {p_time_mean:.3f}s")
# Remove process from list
p_list.pop(i)
break

clogger.debug("{} experiments remaining in queue".format(queue_output.qsize()))
p_logger.join()
clogger.debug("Done with qemu and logger, controller exit")

clogger.debug("Done with qemu and logger")

t1 = time.time()
m, s = divmod(t1 - t0, 60)
h, m = divmod(m, 60)
clogger.info(
"Took {}:{}:{} to complete all experiments".format(int(h), int(m), int(s))
)

if faultlist:
tperindex = (t1 - t0) / len(faultlist)
else:
tperindex = t1 - t0
total_runtime = time.time() - total_start_time
total_runtime_hh_mm_ss = time.strftime("%H:%M:%S", time.gmtime(total_runtime))
clogger.info(f"Took {total_runtime_hh_mm_ss} to complete all experiments")

tperworker = tperindex / num_workers
clogger.debug(
"Took average of {}s per fault, python worker rough runtime is {}s".format(
tperindex, tperworker
)
)
time_per_experiment = total_runtime / len(faultlist) if faultlist else total_runtime
clogger.debug(f"Average runtimes\n" f"\tper fault:\t{time_per_experiment:.3f}s")

clogger.debug("controller exit")
return config_qemu


Expand Down Expand Up @@ -1006,6 +1012,7 @@ def process_arguments(args):
qemu_conf["tb_info"] = faultlist.get("tb_info", True)
qemu_conf["mem_info"] = faultlist.get("mem_info", False)
qemu_conf["ring_buffer"] = faultlist.get("ring_buffer", True)
qemu_conf["timeout"] = faultlist.get("timeout", 1200)

# Command line argument takes precedence
if args.disable_ring_buffer:
Expand Down
61 changes: 61 additions & 0 deletions examples/stm32-timeout-wfi/fault.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"max_instruction_count": 100,
"start" : {
"address" : 0x0800006a,
"counter" : 1
},
"end" :[
{
"address" : 0x08000056,
"counter" : 3
},
{
"address" : 0x08000070,
"counter" : 1
}
],
"faults" :[
[
{
"fault_address" : [0x20001fec],
"fault_type" : "data",
"fault_model" : "set0",
"fault_lifespan" : [100],
"fault_mask" : [1],
"trigger_address" : [0x0800004a],
"trigger_counter" : [1]
}
],
[
{
"fault_address" : [0x08000056],
"fault_type" : "instruction",
"fault_model" : "overwrite",
"num_bytes" : 2,
"fault_lifespan" : [100],
"fault_mask" : [0x46c0],
"trigger_address" : [0x08000040],
"trigger_counter" : [1]
}
],
[
{
"fault_address" : [3],
"fault_type" : "register",
"fault_model" : "set0",
"fault_lifespan" : [0],
"fault_mask" : [0xffffffff],
"trigger_address" : [0x08000054],
"trigger_counter" : [1]
}
]
],
"memorydump": [
{
"address" : 0x08000000,
"length" : 1023
}
],
"mem_info": true,
"timeout": 1
}
8 changes: 8 additions & 0 deletions examples/stm32-timeout-wfi/qemuconf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"qemu" : "../../qemu/build/debug/arm-softmmu/qemu-system-arm",
"bios" : "",
"kernel" : "minimal.elf",
"plugin" : "../../faultplugin/libfaultplugin.so",
"machine" : "stm32vldiscovery",
"additional_qemu_args" : ""
}
2 changes: 2 additions & 0 deletions examples/stm32-timeout-wfi/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/sh
python3 ../../controller.py --debug --fault fault.json --qemu qemuconf.json output.hdf5
13 changes: 13 additions & 0 deletions examples/stm32-timeout-wfi/src/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BINARY = minimal

PREFIX :=arm-none-eabi-

CC :=$(PREFIX)gcc

LDSCRIPT = ./stm32f0-discovery.ld

make:
$(CC) -g -mcpu=cortex-m0 -mthumb -Wl,-static -nostartfiles -Wl,--start-group -lc -lgcc -lnosys -Wl,--end-group -Wl,-T,$(LDSCRIPT) minimal.c -o $(BINARY).elf

clean:
rm $(BINARY).elf -f
47 changes: 47 additions & 0 deletions examples/stm32-timeout-wfi/src/minimal.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
typedef void (*vector_table_entry_t)(void);

typedef struct {
unsigned int *initial_sp_value; /**< Initial stack pointer value. */
vector_table_entry_t reset;
vector_table_entry_t nmi;
vector_table_entry_t hard_fault;
vector_table_entry_t memory_manage_fault; /* not in CM0 */
vector_table_entry_t bus_fault; /* not in CM0 */
vector_table_entry_t usage_fault; /* not in CM0 */
vector_table_entry_t reserved_x001c[4];
vector_table_entry_t sv_call;
vector_table_entry_t debug_monitor; /* not in CM0 */
vector_table_entry_t reserved_x0034;
vector_table_entry_t pend_sv;
vector_table_entry_t systick;
vector_table_entry_t irq[0];
} vector_table_t;

extern vector_table_t vector_table;

int main(void) {
volatile int i = 1;
volatile int x = 0;

while (i) {
__asm__("nop");
}

x = 0x10;

return x;
}

void reset_handler(void) {
main();

while(1) {
__asm__("wfi");
}
}

__attribute__ ((section(".vectors")))
vector_table_t vector_table = {
.initial_sp_value = (unsigned *)0x20002000,
.reset = reset_handler
};
21 changes: 21 additions & 0 deletions examples/stm32-timeout-wfi/src/stm32f0-discovery.ld
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MEMORY
{
rom (rx) : ORIGIN = 0x08000000, LENGTH = 64K
ram (rwx) : ORIGIN = 0x20000000, LENGTH = 8K
}

EXTERN (vector_table)

ENTRY(reset_handler)

SECTIONS
{
.text : {
*(.vectors) /* Vector table */
*(.text*) /* Program code */
. = ALIGN(4);
} >rom

end = .;
}

6 changes: 6 additions & 0 deletions fault-readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,3 +231,9 @@ Use of the ring buffer implementation to store the list of executed translation

### mem_info
Enable collection of data on all memory accesses. The configuration property expects to be passed a boolean value. If unspecified, it will default to `false`.

### timeout
Maximum execution duration in seconds for a single experiment.
If exceeded, the experiment will be stopped.
If unspecified, it will default to `1200` seconds.
See the example in `examples/stm32-timeout-wfi` for more details.
Loading
Loading