diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index db7022c..f1c737f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,7 +8,7 @@ on: schedule: - cron: 0 8 * * 0 -jobs: +jobs: build: name: Test Build runs-on: ubuntu-latest @@ -18,7 +18,11 @@ jobs: uses: actions/checkout@v2 - name: install packages - run: sudo apt update; sudo apt upgrade -y; sudo apt install -y build-essential ninja-build libglib2.0-dev libfdt-dev libpixman-1-dev zlib1g-dev python3-tables python3-pandas python3-prctl python3-json5 python3-protobuf libprotobuf-c-dev protobuf-compiler protobuf-c-compiler python3-tqdm; pip install protobuf==4.21.12 + run: | + sudo apt update + sudo apt upgrade -y + sudo apt install -y build-essential ninja-build libglib2.0-dev libfdt-dev libpixman-1-dev zlib1g-dev libprotobuf-c-dev protobuf-compiler protobuf-c-compiler libcap-dev + pip install -r requirements.txt - name: Checkout submodules run: git submodule update --init diff --git a/controller.py b/controller.py index 5f564db..1e656ad 100755 --- a/controller.py +++ b/controller.py @@ -21,7 +21,9 @@ import logging from multiprocessing import Manager, Process, Value from pathlib import Path +import psutil import signal +from statistics import mean import subprocess import sys import tables @@ -357,6 +359,7 @@ def read_backup(hdf5_file): "tb_info": bool(hdf5_flags["tb_info"][0]), "mem_info": bool(hdf5_flags["mem_info"][0]), "max_instruction_count": int(hdf5_flags["max_instruction_count"][0]), + "fault_count": int(hdf5_flags["fault_count"][0]), "start": { "address": int(hdf5_start_address["address"][0]), "counter": int(hdf5_start_address["counter"][0]), @@ -436,7 +439,7 @@ def read_backup(hdf5_file): rows = f_in.root.Goldenrun.armregisters.iterrows() else: raise tables.NoSuchNodeError( - "No supported register architecture could be found in the HDF5 file" + "No supported register architecture could be found in the HDF5 file, run with the overwrite flag to overwrite" ) backup_goldenrun[register_backup_name] = [] @@ -455,6 +458,14 @@ def read_backup(hdf5_file): backup_goldenrun[register_backup_name].append(registers) # Process expanded faults + if ( + f_in.root.Backup.expanded_faults._v_nchildren + != backup_config["fault_count"] + ): + raise tables.NoSuchNodeError( + f"Out of {backup_config['fault_count']} faults, only {f_in.root.Backup.expanded_faults._v_nchildren} are available in the backup. Run with the overwrite flag to overwrite" + ) + backup_expanded_faults = [] exp_n = 0 @@ -564,7 +575,7 @@ def controller( """ clogger.info("Controller start") - t0 = time.time() + total_start_time = time.time() hdf5path = args.hdf5file qemu_output = args.debug @@ -597,10 +608,8 @@ def controller( "Backup could not be found in the HDF5 file, run with the overwrite flag to overwrite!" ) return config_qemu - except tables.NoSuchNodeError: - clogger.warning( - "Invalid/unsupported backup file, run with the overwrite flag to overwrite!" - ) + except tables.NoSuchNodeError as e: + clogger.warning(e) return config_qemu clogger.info("Checking the backup") @@ -746,7 +755,7 @@ def controller( ), ) p.start() - p_list.append({"process": p, "start_time": time.time()}) + p_list.append({"process": p, "start_time": time.time(), "faults": faults}) clogger.debug(f"Started worker {faults['index']}. Running: {len(p_list)}.") clogger.debug(f"Fault address: {faults['faultlist'][0].address}") clogger.debug( @@ -764,63 +773,60 @@ def controller( mem_max = max(mem_list) # Calculate length of running processes - times.clear() - time_max = 0 - current_time = time.time() - for i in range(len(p_list)): - p = p_list[i] - tmp = current_time - p["start_time"] - # If the current processing time is lower than moving average, do not punish the time - if tmp < p_time_mean: - times.append(0) - else: - times.append(tmp - p_time_mean) - # Find max time in list (This list will show the longest running - # process minus the moving average) - if len(times) > 0: - time_max = max(times) - - for i in range(len(p_list)): - p = p_list[i] + times = [time.time() - p["start_time"] for p in p_list] + # If the current processing time is lower than moving average, do not punish the time + times = [max(0, time - p_time_mean) for time in times] + time_max = max(times) if times else 0 + + for i, p in enumerate(p_list): # Find finished processes p["process"].join(timeout=0) + + # Halt experiment if timeout duration exceeded + # If gdb is used the timeout is not applicable + if ( + p["process"].is_alive() + and (time.time() - p["start_time"]) > config_qemu["timeout"] + and not config_qemu.get("gdb", False) + ): + clogger.warning(f"Experiment {p['faults']['index']} ran into timeout") + # Search for qemu thread and kill qemu process if found + # qemu process is a child process of qemu thread + qemu_thread_name = f"qemu{p['faults']['index']}" + for process in psutil.process_iter(): + if process.name() != qemu_thread_name: + continue + clogger.debug(f"{process.name()} killed") + assert ( + len(process.children()) == 1 + ), "qemu thread should only have qemu child process" + process.children()[0].terminate() + break + else: + clogger.debug(f"{qemu_thread_name} not found to kill") + # Wait for worker process terminates + p["process"].join() + if p["process"].is_alive() is False: # Recalculate moving average - p_time_list.append(current_time - p["start_time"]) - len_p_time_list = len(p_time_list) - if len_p_time_list > num_workers + 2: - p_time_list.pop(0) - p_time_mean = sum(p_time_list) / len_p_time_list - clogger.debug("Current running Average {}".format(p_time_mean)) + p_time_list.append(time.time() - p["start_time"]) + p_time_list = p_time_list[-(num_workers + 2) :] # remove old entries + p_time_mean = mean(p_time_list) + clogger.debug(f"Process time running mean: {p_time_mean:.3f}s") # Remove process from list p_list.pop(i) - break clogger.debug("{} experiments remaining in queue".format(queue_output.qsize())) p_logger.join() + clogger.debug("Done with qemu and logger, controller exit") - clogger.debug("Done with qemu and logger") - - t1 = time.time() - m, s = divmod(t1 - t0, 60) - h, m = divmod(m, 60) - clogger.info( - "Took {}:{}:{} to complete all experiments".format(int(h), int(m), int(s)) - ) - - if faultlist: - tperindex = (t1 - t0) / len(faultlist) - else: - tperindex = t1 - t0 + total_runtime = time.time() - total_start_time + total_runtime_hh_mm_ss = time.strftime("%H:%M:%S", time.gmtime(total_runtime)) + clogger.info(f"Took {total_runtime_hh_mm_ss} to complete all experiments") - tperworker = tperindex / num_workers - clogger.debug( - "Took average of {}s per fault, python worker rough runtime is {}s".format( - tperindex, tperworker - ) - ) + time_per_experiment = total_runtime / len(faultlist) if faultlist else total_runtime + clogger.debug(f"Average runtimes\n" f"\tper fault:\t{time_per_experiment:.3f}s") - clogger.debug("controller exit") return config_qemu @@ -1006,6 +1012,7 @@ def process_arguments(args): qemu_conf["tb_info"] = faultlist.get("tb_info", True) qemu_conf["mem_info"] = faultlist.get("mem_info", False) qemu_conf["ring_buffer"] = faultlist.get("ring_buffer", True) + qemu_conf["timeout"] = faultlist.get("timeout", 1200) # Command line argument takes precedence if args.disable_ring_buffer: diff --git a/examples/stm32-timeout-wfi/fault.json b/examples/stm32-timeout-wfi/fault.json new file mode 100644 index 0000000..7b6fae2 --- /dev/null +++ b/examples/stm32-timeout-wfi/fault.json @@ -0,0 +1,61 @@ +{ + "max_instruction_count": 100, + "start" : { + "address" : 0x0800006a, + "counter" : 1 + }, + "end" :[ + { + "address" : 0x08000056, + "counter" : 3 + }, + { + "address" : 0x08000070, + "counter" : 1 + } +], + "faults" :[ + [ + { + "fault_address" : [0x20001fec], + "fault_type" : "data", + "fault_model" : "set0", + "fault_lifespan" : [100], + "fault_mask" : [1], + "trigger_address" : [0x0800004a], + "trigger_counter" : [1] + } + ], + [ + { + "fault_address" : [0x08000056], + "fault_type" : "instruction", + "fault_model" : "overwrite", + "num_bytes" : 2, + "fault_lifespan" : [100], + "fault_mask" : [0x46c0], + "trigger_address" : [0x08000040], + "trigger_counter" : [1] + } + ], + [ + { + "fault_address" : [3], + "fault_type" : "register", + "fault_model" : "set0", + "fault_lifespan" : [0], + "fault_mask" : [0xffffffff], + "trigger_address" : [0x08000054], + "trigger_counter" : [1] + } + ] + ], + "memorydump": [ + { + "address" : 0x08000000, + "length" : 1023 + } + ], + "mem_info": true, + "timeout": 1 +} diff --git a/examples/stm32-timeout-wfi/qemuconf.json b/examples/stm32-timeout-wfi/qemuconf.json new file mode 100644 index 0000000..7bf30fc --- /dev/null +++ b/examples/stm32-timeout-wfi/qemuconf.json @@ -0,0 +1,8 @@ +{ + "qemu" : "../../qemu/build/debug/arm-softmmu/qemu-system-arm", + "bios" : "", + "kernel" : "minimal.elf", + "plugin" : "../../faultplugin/libfaultplugin.so", + "machine" : "stm32vldiscovery", + "additional_qemu_args" : "" +} diff --git a/examples/stm32-timeout-wfi/run.sh b/examples/stm32-timeout-wfi/run.sh new file mode 100755 index 0000000..5de4f1c --- /dev/null +++ b/examples/stm32-timeout-wfi/run.sh @@ -0,0 +1,2 @@ +#!/bin/sh +python3 ../../controller.py --debug --fault fault.json --qemu qemuconf.json output.hdf5 diff --git a/examples/stm32-timeout-wfi/src/Makefile b/examples/stm32-timeout-wfi/src/Makefile new file mode 100644 index 0000000..cfc4d07 --- /dev/null +++ b/examples/stm32-timeout-wfi/src/Makefile @@ -0,0 +1,13 @@ +BINARY = minimal + +PREFIX :=arm-none-eabi- + +CC :=$(PREFIX)gcc + +LDSCRIPT = ./stm32f0-discovery.ld + +make: + $(CC) -g -mcpu=cortex-m0 -mthumb -Wl,-static -nostartfiles -Wl,--start-group -lc -lgcc -lnosys -Wl,--end-group -Wl,-T,$(LDSCRIPT) minimal.c -o $(BINARY).elf + +clean: + rm $(BINARY).elf -f diff --git a/examples/stm32-timeout-wfi/src/minimal.c b/examples/stm32-timeout-wfi/src/minimal.c new file mode 100644 index 0000000..313c941 --- /dev/null +++ b/examples/stm32-timeout-wfi/src/minimal.c @@ -0,0 +1,47 @@ +typedef void (*vector_table_entry_t)(void); + +typedef struct { + unsigned int *initial_sp_value; /**< Initial stack pointer value. */ + vector_table_entry_t reset; + vector_table_entry_t nmi; + vector_table_entry_t hard_fault; + vector_table_entry_t memory_manage_fault; /* not in CM0 */ + vector_table_entry_t bus_fault; /* not in CM0 */ + vector_table_entry_t usage_fault; /* not in CM0 */ + vector_table_entry_t reserved_x001c[4]; + vector_table_entry_t sv_call; + vector_table_entry_t debug_monitor; /* not in CM0 */ + vector_table_entry_t reserved_x0034; + vector_table_entry_t pend_sv; + vector_table_entry_t systick; + vector_table_entry_t irq[0]; +} vector_table_t; + +extern vector_table_t vector_table; + +int main(void) { + volatile int i = 1; + volatile int x = 0; + + while (i) { + __asm__("nop"); + } + + x = 0x10; + + return x; +} + +void reset_handler(void) { + main(); + + while(1) { + __asm__("wfi"); + } +} + +__attribute__ ((section(".vectors"))) +vector_table_t vector_table = { + .initial_sp_value = (unsigned *)0x20002000, + .reset = reset_handler +}; diff --git a/examples/stm32-timeout-wfi/src/stm32f0-discovery.ld b/examples/stm32-timeout-wfi/src/stm32f0-discovery.ld new file mode 100644 index 0000000..649adb0 --- /dev/null +++ b/examples/stm32-timeout-wfi/src/stm32f0-discovery.ld @@ -0,0 +1,21 @@ +MEMORY +{ + rom (rx) : ORIGIN = 0x08000000, LENGTH = 64K + ram (rwx) : ORIGIN = 0x20000000, LENGTH = 8K +} + +EXTERN (vector_table) + +ENTRY(reset_handler) + +SECTIONS +{ + .text : { + *(.vectors) /* Vector table */ + *(.text*) /* Program code */ + . = ALIGN(4); + } >rom + + end = .; +} + diff --git a/fault-readme.md b/fault-readme.md index 51bfcbb..603dc28 100644 --- a/fault-readme.md +++ b/fault-readme.md @@ -231,3 +231,9 @@ Use of the ring buffer implementation to store the list of executed translation ### mem_info Enable collection of data on all memory accesses. The configuration property expects to be passed a boolean value. If unspecified, it will default to `false`. + +### timeout +Maximum execution duration in seconds for a single experiment. +If exceeded, the experiment will be stopped. +If unspecified, it will default to `1200` seconds. +See the example in `examples/stm32-timeout-wfi` for more details. diff --git a/faultclass.py b/faultclass.py index 48fe9ac..ab3f09d 100644 --- a/faultclass.py +++ b/faultclass.py @@ -19,6 +19,7 @@ from multiprocessing import Process import os import shlex +import signal import subprocess import time @@ -82,6 +83,18 @@ def detect_model(fault_model): ) +class Timeout: + raised = False + + def __init__(self): + signal.signal(signal.SIGINT, self.raise_timeout) + signal.signal(signal.SIGTERM, self.raise_timeout) + + def raise_timeout(self, *args): + self.raised = True + raise KeyboardInterrupt + + class Register(IntEnum): ARM = 0 RISCV = 1 @@ -562,6 +575,8 @@ def readout_data( max_ram_usage = 0 regtype = None + timeout = Timeout() + # Load data from the pipe data_protobuf = data_pb2.Data() data_protobuf.ParseFromString(pipe.read()) @@ -655,7 +670,7 @@ def readout_data( max_ram_usage = gather_process_ram_usage(queue_ram_usage, max_ram_usage) - return max_ram_usage + return (max_ram_usage, timeout.raised) def create_fifos(): @@ -825,7 +840,7 @@ def python_worker( # From here Qemu has started execution. Now prepare for # data extraction - mem = readout_data( + (mem, timeout_raised) = readout_data( data_fifo, index, queue_output, @@ -836,6 +851,11 @@ def python_worker( qemu_post=qemu_post, qemu_pre_data=qemu_pre_data, ) + + if timeout_raised: + logger.error(f"Terminate process {index}") + p_qemu.terminate() + p_qemu.join() delete_fifos() diff --git a/hdf5logger.py b/hdf5logger.py index 6a99ff9..3fe2a0c 100644 --- a/hdf5logger.py +++ b/hdf5logger.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import queue import signal import logging import time @@ -141,6 +142,7 @@ class config_table(tables.IsDescription): mem_info = tables.BoolCol() max_instruction_count = tables.UInt64Col() memory_dump = tables.BoolCol() + fault_count = tables.UInt64Col() class hash_table(tables.IsDescription): @@ -396,6 +398,7 @@ def process_config(f, configgroup, exp, myfilter): config_row["tb_info"] = exp["tb_info"] config_row["mem_info"] = exp["mem_info"] config_row["max_instruction_count"] = exp["max_instruction_count"] + config_row["fault_count"] = exp["fault_count"] config_row.append() @@ -441,6 +444,8 @@ def process_config(f, configgroup, exp, myfilter): def process_backup(f, configgroup, exp, myfilter, stop_signal): + exp["config"]["fault_count"] = len(exp["expanded_faultlist"]) + process_config(f, configgroup, exp["config"], myfilter) fault_expanded_group = f.create_group( @@ -511,7 +516,11 @@ def hdf5collector( if stop_signal.value == 1: break # readout queue and get next output from qemu. Will block - exp = queue_output.get() + try: + exp = queue_output.get_nowait() + except queue.Empty: + continue + t1 = time.time() logger.debug( "got exp {}, {} still need to be performed. Took {}s. Elements in queu: {}".format( diff --git a/requirements.txt b/requirements.txt index c70e777..4ac55f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ tables==3.7.0 json5==0.9.10 protobuf==4.21.12 tqdm==4.65.0 +psutil==5.9.6