From 65f3ebcfda1751a6bebbd745962307e0af669ffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20K=C3=BCchler?= Date: Fri, 11 Aug 2023 13:34:47 +0200 Subject: [PATCH] Optimize Run + Design Except Feature + Various (#70) ## Optimize config/dir creation As part of the setup, before running experiments, we create all working directories and place the `config.json` in each of them. Until now, this relied on pure Ansible. However, for many jobs, this creation becomes a bottleneck. Now, there is a custom module that does this more efficiently. ## Optimize result fetching We replaced the slow fetching of results with an additional custom module. More efficiency is possible because `tsp` and `slurm` no longer need to process completed job ids one-by-one. Instead, they can report that a whole list of job ids has finished and can be downloaded. ## New Experiment Design Feature: `except_filters` We added a basic implementation of #71 that allows filtering out certain combinations. ## New Super ETL Feature: `pipelines` filter When running a super_etl via make, it is now possible to run only a subset of pipelines (see `pipelines="a b"`). ## ETL Extract Optimization We only flatten results that require flattening, which speeds up the processing. ## ETL Step We add a new ETL Loader that allows storing a data frame as a pickle. ## Option to save ETL results to Notion We add a utility function that allows storing the results of a loader to a Notion page. 
--------- Co-authored-by: Hidde L --- Makefile | 7 +- ansible.cfg | 5 +- demo_project/demo_latency.py | 7 +- .../designs/example03-format.yml | 28 +- .../suite_design.yml | 1 + .../etl_results/pipeline1/pipeline1.csv | 12 +- .../suite_design.yml | 2 + .../format_cross/run_0/rep_0/config.json | 0 .../format_cross/run_1/rep_0/config.json | 0 .../format_cross/run_10/rep_0/config.json | 14 - .../run_10/rep_0/small/host_0/stderr.log | 0 .../run_10/rep_0/small/host_0/stdout.log | 1 - .../format_cross/run_11/rep_0/config.json | 14 - .../run_11/rep_0/small/host_0/stderr.log | 0 .../run_11/rep_0/small/host_0/stdout.log | 1 - .../format_cross/run_2/rep_0/config.json | 0 .../format_cross/run_3/rep_0/config.json | 0 .../format_cross/run_4/rep_0/config.json | 0 .../format_cross/run_5/rep_0/config.json | 0 .../format_cross/run_6/rep_0/config.json | 0 .../format_cross/run_7/rep_0/config.json | 0 .../format_cross/run_8/rep_0/config.json | 0 .../format_cross/run_9/rep_0/config.json | 0 .../format_cross/state.yml | 4 +- .../format_levellist/run_0/rep_0/config.json | 0 .../format_levellist/run_1/rep_0/config.json | 0 .../format_levellist/run_2/rep_0/config.json | 0 .../format_mixed/run_0/rep_0/config.json | 0 .../format_mixed/run_1/rep_0/config.json | 0 .../format_mixed/run_10/rep_0/config.json | 15 - .../run_10/rep_0/small/host_0/stderr.log | 0 .../run_10/rep_0/small/host_0/stdout.log | 1 - .../format_mixed/run_11/rep_0/config.json | 15 - .../run_11/rep_0/small/host_0/stderr.log | 0 .../run_11/rep_0/small/host_0/stdout.log | 1 - .../format_mixed/run_2/rep_0/config.json | 0 .../format_mixed/run_3/rep_0/config.json | 0 .../format_mixed/run_4/rep_0/config.json | 0 .../format_mixed/run_5/rep_0/config.json | 0 .../format_mixed/run_6/rep_0/config.json | 0 .../format_mixed/run_7/rep_0/config.json | 8 +- .../run_7/rep_0/small/host_0/stdout.log | 2 +- .../format_mixed/run_8/rep_0/config.json | 4 +- .../run_8/rep_0/small/host_0/stdout.log | 2 +- .../format_mixed/run_9/rep_0/config.json | 4 +- 
.../run_9/rep_0/small/host_0/stdout.log | 2 +- .../format_mixed/state.yml | 4 +- .../suite_design.yml | 15 + .../suite_design_ext.yml | 30 -- .../suite_design.yml | 1 + .../suite_design.yml | 3 + .../example06-vars_$expected/suite_design.yml | 1 + .../example07-etl_$expected/suite_design.yml | 4 + doespy/doespy/design/exp_design.py | 31 ++ doespy/doespy/design/extend.py | 24 ++ doespy/doespy/etl/etl_base.py | 40 ++- doespy/doespy/etl/etl_util.py | 120 +++++++ doespy/doespy/etl/steps/loaders.py | 34 +- doespy/doespy/etl/super_etl.py | 16 +- src/experiment-suite.yml | 9 +- .../experiment-job/library/collect_results.py | 113 +++++++ .../experiment-job/library/setup_job_dirs.py | 75 +++++ src/roles/experiment-job/tasks/main.yml | 261 +++++++--------- .../suite-scheduler-enqueue/library/tsp.py | 6 +- .../suite-scheduler-enqueue/tasks/slurm.yml | 1 + .../suite-scheduler-remove/library/tsp.py | 293 ------------------ .../suite-scheduler-remove/tasks/bsub.yml | 2 +- .../suite-scheduler-remove/tasks/slurm.yml | 2 +- .../suite-scheduler-remove/tasks/tsp.yml | 5 +- .../filter_plugins/helpers.py | 42 ++- .../suite-scheduler-status/tasks/bsub.yml | 39 +-- .../suite-scheduler-status/tasks/slurm.yml | 23 +- .../suite-scheduler-status/tasks/tsp.yml | 11 +- 73 files changed, 707 insertions(+), 648 deletions(-) mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_0/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_1/rep_0/config.json delete mode 100755 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/config.json delete mode 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/small/host_0/stderr.log delete mode 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/small/host_0/stdout.log delete mode 100755 
demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/config.json delete mode 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/small/host_0/stderr.log delete mode 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/small/host_0/stdout.log mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_2/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_3/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_4/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_5/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_6/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_7/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_8/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_cross/run_9/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_levellist/run_0/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_levellist/run_1/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_levellist/run_2/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_0/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_1/rep_0/config.json delete mode 100755 
demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/config.json delete mode 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/small/host_0/stderr.log delete mode 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/small/host_0/stdout.log delete mode 100755 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/config.json delete mode 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/small/host_0/stderr.log delete mode 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/small/host_0/stdout.log mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_2/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_3/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_4/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_5/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_6/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_7/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_8/rep_0/config.json mode change 100755 => 100644 demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_9/rep_0/config.json create mode 100644 src/roles/experiment-job/library/collect_results.py create mode 100644 src/roles/experiment-job/library/setup_job_dirs.py delete mode 100644 src/roles/suite-scheduler-remove/library/tsp.py diff --git a/Makefile b/Makefile index a6b5029e..e9d99835 100644 --- a/Makefile 
+++ b/Makefile @@ -32,6 +32,10 @@ ifdef id myid=--id $(id) endif +ifdef pipelines + mypipelines=--pipelines $(pipelines) +endif + # on `make` and `make help` list all targets with information help: @echo 'Running Experiments' @@ -48,6 +52,7 @@ help: @echo ' make etl-design suite= id= - same as `make etl ...` but uses the pipeline from the suite design instead of results' @echo ' make etl-all - run etl pipelines of all results' @echo ' make etl-super config= out= - run the super etl to combine results of multiple suites (for e.g., demo_plots)' + @echo ' make etl-super ... pipelines=" " - run only a subset of pipelines in the super etl' @echo 'Clean ETL' @echo ' make etl-clean suite= id= - delete etl results from specific suite (can be regenerated with make etl ...)' @echo ' make etl-clean-all - delete etl results from all suites (can be regenerated with make etl-all)' @@ -170,7 +175,7 @@ etl-all: install # e.g., make etl-super config=demo_plots out=/home/kuenico/dev/doe-suite/tmp etl-super: install @cd $(does_config_dir) && \ - poetry run python $(PWD)/doespy/doespy/etl/super_etl.py --config $(config) --output_path $(out) + poetry run python $(PWD)/doespy/doespy/etl/super_etl.py --config $(config) --output_path $(out) $(mypipelines) # delete etl results for a specific `suite` and `id` (can be regenerated with `make etl suite= id=`) etl-clean: install diff --git a/ansible.cfg b/ansible.cfg index eaf15fee..2acf0b3f 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -13,11 +13,12 @@ any_errors_fatal_setup = true any_errors_fatal_experiments = false + # TODO [nku] can I control this cfg from the makefile? 
`make run suite=abc id=new` vs `make run-v suite=abc id=new` vs `make run-vvvv suite=abc id=new` stdout_callback = community.general.selective + # speedup by using ssh pipelining -pipelining = True [ssh_connection] pipelining = True -ssh_args = -o ControlMaster=auto -o ControlPersist=1200 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=300s \ No newline at end of file diff --git a/demo_project/demo_latency.py b/demo_project/demo_latency.py index 651435ba..404a3eae 100644 --- a/demo_project/demo_latency.py +++ b/demo_project/demo_latency.py @@ -24,9 +24,10 @@ def main(): print("Measuring Latency...") data = {} - data["latency"] = a * args.size + random.uniform( - -1, 1 - ) # latency depends linear on size + some noise for reps + noise = 0.93290707138428 + #noise = random.uniform(-1, 1) + data["latency"] = a * args.size + noise # latency depends linear on size + some noise for reps + time.sleep(10) # wait 15 seconds diff --git a/demo_project/doe-suite-config/designs/example03-format.yml b/demo_project/doe-suite-config/designs/example03-format.yml index 61d6de2d..df2b3aa2 100644 --- a/demo_project/doe-suite-config/designs/example03-format.yml +++ b/demo_project/doe-suite-config/designs/example03-format.yml @@ -9,6 +9,8 @@ # # The `cross` format uses the keyword `$FACTOR$` as a YAML key, # while the `factor list` uses `$FACTOR$` as a YAML value and expects a corresponding level in the `factor_levels` list. 
+# +# The `except_filters` construct can be used to ignore specific combinations of configuration (e.g., the cross product between two factors except a specific combination should be skipped) # experiment in the pure `cross` format format_cross: @@ -25,8 +27,16 @@ format_cross: name: $FACTOR$: [app1, app2, app3] # varied parameter between runs (factor) # hyperparam: X -> not used in this experiment + except_filters: + # we ignore the combination of vector_size 40 with app2 and app3 and only run it with app1 + - vector_size: 40 + app: + name: app2 + - vector_size: 40 + app: + name: app3 # - # The experiment `format_cross` results in 12 runs: + # The experiment `format_cross` results in 10 runs: # - {"vector_size": 10, "app.name": app1, "seed": 1234} # - {"vector_size": 10, "app.name": app2, "seed": 1234} # - {"vector_size": 10, "app.name": app3, "seed": 1234} @@ -40,8 +50,8 @@ format_cross: # - {"vector_size": 30, "app.name": app3, "seed": 1234} # - {"vector_size": 40, "app.name": app1, "seed": 1234} - # - {"vector_size": 40, "app.name": app2, "seed": 1234} - # - {"vector_size": 40, "app.name": app3, "seed": 1234} + # - {"vector_size": 40, "app.name": app2, "seed": 1234} -> Ignored by except_filters + # - {"vector_size": 40, "app.name": app3, "seed": 1234} -> Ignored by except_filters # experiment in the pure `level list` format @@ -102,6 +112,14 @@ format_mixed: - app: name: app3 hyperparam: 5 + except_filters: + # we ignore the combination of vector_size 40 with app2 and app3 and only run it with app1 + - vector_size: 40 + app: + name: app2 + - vector_size: 40 + app: + name: app3 # The mix between `cross`and `level-list` is the most flexible because it allows to define $FACTORS$ # for which we want to create the cross product (e.g., `vector_size`) and @@ -125,8 +143,8 @@ format_mixed: # - {"vector_size": 30, "app.name": app3, "app.hyperparam": 5 , "seed": 1234} # - {"vector_size": 40, "app.name": app1, "app.hyperparam": 0.1, "seed": 1234} - # - {"vector_size": 40, 
"app.name": app2, "app.hyperparam": 10 , "seed": 1234} - # - {"vector_size": 40, "app.name": app3, "app.hyperparam": 5 , "seed": 1234} + # - {"vector_size": 40, "app.name": app2, "app.hyperparam": 10 , "seed": 1234} -> ignored by except_filters + # - {"vector_size": 40, "app.name": app3, "app.hyperparam": 5 , "seed": 1234} -> ignored by except_filters $ETL$: check_error: # ensures that stderr.log is empty everywhere and that no files are generated except stdout.log diff --git a/demo_project/doe-suite-results/example01-minimal_$expected/suite_design.yml b/demo_project/doe-suite-results/example01-minimal_$expected/suite_design.yml index 6de07ba7..fd003111 100644 --- a/demo_project/doe-suite-results/example01-minimal_$expected/suite_design.yml +++ b/demo_project/doe-suite-results/example01-minimal_$expected/suite_design.yml @@ -20,6 +20,7 @@ minimal: - '!' factor_levels: - {} + except_filters: [] $ETL$: check_error: experiments: diff --git a/demo_project/doe-suite-results/example02-single_$expected/etl_results/pipeline1/pipeline1.csv b/demo_project/doe-suite-results/example02-single_$expected/etl_results/pipeline1/pipeline1.csv index fabb5915..6b60a6a7 100644 --- a/demo_project/doe-suite-results/example02-single_$expected/etl_results/pipeline1/pipeline1.csv +++ b/demo_project/doe-suite-results/example02-single_$expected/etl_results/pipeline1/pipeline1.csv @@ -1,7 +1,7 @@ ,suite_name,suite_id,exp_name,run,host_type,host_idx,factor_columns,source_file,opt,out,payload_size_mb,$CMD$.small,latency_mean,latency_min,latency_max,latency_std,latency_count -0,example02-single,$expected,experiment_1,0,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,True,json,10,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt True --size 10 --out json'}],14.9329070714,14.9329070714,14.9329070714,0.0,2 
-1,example02-single,$expected,experiment_1,1,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,False,json,10,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt False --size 10 --out json'}],27.9329070714,27.9329070714,27.9329070714,0.0,2 -2,example02-single,$expected,experiment_1,2,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,True,json,20,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt True --size 20 --out json'}],28.9329070714,28.9329070714,28.9329070714,0.0,2 -3,example02-single,$expected,experiment_1,3,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,False,json,20,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt False --size 20 --out json'}],54.9329070714,54.9329070714,54.9329070714,0.0,2 -4,example02-single,$expected,experiment_1,4,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,True,json,30,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt True --size 30 --out json'}],42.9329070714,42.9329070714,42.9329070714,0.0,2 -5,example02-single,$expected,experiment_1,5,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,False,json,30,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt False --size 30 --out 
json'}],81.9329070714,81.9329070714,81.9329070714,0.0,2 +0,example02-single,$expected,experiment_1,0,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,True,json,10,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt True --size 10 --out json'}],14.932907071384278,14.932907071384278,14.932907071384278,0.0,2 +1,example02-single,$expected,experiment_1,1,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,False,json,10,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt False --size 10 --out json'}],27.932907071384278,27.932907071384278,27.932907071384278,0.0,2 +2,example02-single,$expected,experiment_1,2,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,True,json,20,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt True --size 20 --out json'}],28.932907071384278,28.932907071384278,28.932907071384278,0.0,2 +3,example02-single,$expected,experiment_1,3,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,False,json,20,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt False --size 20 --out json'}],54.93290707138428,54.93290707138428,54.93290707138428,0.0,2 +4,example02-single,$expected,experiment_1,4,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,True,json,30,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python 
/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt True --size 30 --out json'}],42.93290707138428,42.93290707138428,42.93290707138428,0.0,2 +5,example02-single,$expected,experiment_1,5,small,0,"['payload_size_mb', 'opt']",demo_latency_out.json,False,json,30,[{'main': '/cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/.venv/bin/python /cluster/home/kunicola/doe-suite/example_nku/example02-single/code/demo_project/demo_latency.py --opt False --size 30 --out json'}],81.93290707138428,81.93290707138428,81.93290707138428,0.0,2 diff --git a/demo_project/doe-suite-results/example02-single_$expected/suite_design.yml b/demo_project/doe-suite-results/example02-single_$expected/suite_design.yml index c3c63a32..cb6e1ba4 100644 --- a/demo_project/doe-suite-results/example02-single_$expected/suite_design.yml +++ b/demo_project/doe-suite-results/example02-single_$expected/suite_design.yml @@ -22,6 +22,7 @@ experiment_1: - false factor_levels: - {} + except_filters: [] experiment_2: n_repetitions: 3 common_roles: [] @@ -45,6 +46,7 @@ experiment_2: other: '[0, 1]' factor_levels: - {} + except_filters: [] $ETL$: pipeline1: experiments: diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_0/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_0/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_1/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_1/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/config.json deleted file mode 100755 index 52b58f18..00000000 --- 
a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "$CMD$": { - "small": [ - { - "main": "echo \"run app=app2 with vec=40 seed=1234\"" - } - ] - }, - "app": { - "name": "app2" - }, - "seed": 1234, - "vector_size": 40 -} \ No newline at end of file diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/small/host_0/stderr.log b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/small/host_0/stderr.log deleted file mode 100644 index e69de29b..00000000 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/small/host_0/stdout.log b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/small/host_0/stdout.log deleted file mode 100644 index 1ff24799..00000000 --- a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_10/rep_0/small/host_0/stdout.log +++ /dev/null @@ -1 +0,0 @@ -run app=app2 with vec=40 seed=1234 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/config.json deleted file mode 100755 index ea6c0240..00000000 --- a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "$CMD$": { - "small": [ - { - "main": "echo \"run app=app3 with vec=40 seed=1234\"" - } - ] - }, - "app": { - "name": "app3" - }, - "seed": 1234, - "vector_size": 40 -} \ No newline at end of file diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/small/host_0/stderr.log b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/small/host_0/stderr.log deleted file mode 100644 index e69de29b..00000000 diff --git 
a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/small/host_0/stdout.log b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/small/host_0/stdout.log deleted file mode 100644 index f0016f6a..00000000 --- a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_11/rep_0/small/host_0/stdout.log +++ /dev/null @@ -1 +0,0 @@ -run app=app3 with vec=40 seed=1234 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_2/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_2/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_3/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_3/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_4/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_4/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_5/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_5/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_6/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_6/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_7/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_7/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_8/rep_0/config.json 
b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_8/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_9/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_cross/run_9/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_cross/state.yml b/demo_project/doe-suite-results/example03-format_$expected/format_cross/state.yml index 584d8673..04cc252b 100755 --- a/demo_project/doe-suite-results/example03-format_$expected/format_cross/state.yml +++ b/demo_project/doe-suite-results/example03-format_$expected/format_cross/state.yml @@ -1,8 +1,8 @@ --- -exp_job_ids: [{'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 0, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 1, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 2, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 3, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 4, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 5, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 6, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 7, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 8, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 9, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 
'exp_name': 'format_cross', 'exp_run': 10, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 11, 'exp_run_rep': 0}] +exp_job_ids: [{'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 0, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 1, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 2, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 3, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 4, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 5, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 6, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 7, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 8, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 9, 'exp_run_rep': 0}] exp_job_ids_unfinished: [] # pending + queued + running exp_job_ids_pending: [] exp_job_ids_queued: [] exp_job_ids_running: [] -exp_job_ids_finished: [{'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 0, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 1, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 2, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 3, 'exp_run_rep': 0}, {'suite': 'example03-format', 
'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 4, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 5, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 6, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 7, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 8, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 9, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 10, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 11, 'exp_run_rep': 0}] \ No newline at end of file +exp_job_ids_finished: [{'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 0, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 1, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 2, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 3, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 4, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 5, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 6, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 7, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 8, 'exp_run_rep': 0}, 
{'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_cross', 'exp_run': 9, 'exp_run_rep': 0}] \ No newline at end of file diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_levellist/run_0/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_levellist/run_0/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_levellist/run_1/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_levellist/run_1/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_levellist/run_2/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_levellist/run_2/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_0/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_0/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_1/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_1/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/config.json deleted file mode 100755 index f3c2da77..00000000 --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/config.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "$CMD$": { - "small": [ - { - "main": "echo \"run app=app3 with hyperparam=5 vec=30 seed=1234\"" - } - ] - }, - "app": { - "hyperparam": 5, - "name": "app3" - }, - "seed": 1234, - "vector_size": 30 -} \ No newline at end of file diff --git 
a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/small/host_0/stderr.log b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/small/host_0/stderr.log deleted file mode 100644 index e69de29b..00000000 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/small/host_0/stdout.log b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/small/host_0/stdout.log deleted file mode 100644 index 8f7ad18c..00000000 --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_10/rep_0/small/host_0/stdout.log +++ /dev/null @@ -1 +0,0 @@ -run app=app3 with hyperparam=5 vec=30 seed=1234 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/config.json deleted file mode 100755 index 85c5eb4a..00000000 --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/config.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "$CMD$": { - "small": [ - { - "main": "echo \"run app=app3 with hyperparam=5 vec=40 seed=1234\"" - } - ] - }, - "app": { - "hyperparam": 5, - "name": "app3" - }, - "seed": 1234, - "vector_size": 40 -} \ No newline at end of file diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/small/host_0/stderr.log b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/small/host_0/stderr.log deleted file mode 100644 index e69de29b..00000000 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/small/host_0/stdout.log b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/small/host_0/stdout.log deleted file mode 100644 index 01a1b584..00000000 --- 
a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_11/rep_0/small/host_0/stdout.log +++ /dev/null @@ -1 +0,0 @@ -run app=app3 with hyperparam=5 vec=40 seed=1234 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_2/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_2/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_3/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_3/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_4/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_4/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_5/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_5/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_6/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_6/rep_0/config.json old mode 100755 new mode 100644 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_7/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_7/rep_0/config.json old mode 100755 new mode 100644 index 951f689a..0cb0fdd7 --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_7/rep_0/config.json +++ b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_7/rep_0/config.json @@ -2,14 +2,14 @@ "$CMD$": { "small": [ { - "main": "echo \"run app=app2 with hyperparam=10 vec=40 seed=1234\"" + "main": "echo \"run app=app3 with hyperparam=5 
vec=10 seed=1234\"" } ] }, "app": { - "hyperparam": 10, - "name": "app2" + "hyperparam": 5, + "name": "app3" }, "seed": 1234, - "vector_size": 40 + "vector_size": 10 } \ No newline at end of file diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_7/rep_0/small/host_0/stdout.log b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_7/rep_0/small/host_0/stdout.log index be9f7564..495a88d9 100644 --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_7/rep_0/small/host_0/stdout.log +++ b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_7/rep_0/small/host_0/stdout.log @@ -1 +1 @@ -run app=app2 with hyperparam=10 vec=40 seed=1234 +run app=app3 with hyperparam=5 vec=10 seed=1234 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_8/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_8/rep_0/config.json old mode 100755 new mode 100644 index 0cb0fdd7..eda06b0b --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_8/rep_0/config.json +++ b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_8/rep_0/config.json @@ -2,7 +2,7 @@ "$CMD$": { "small": [ { - "main": "echo \"run app=app3 with hyperparam=5 vec=10 seed=1234\"" + "main": "echo \"run app=app3 with hyperparam=5 vec=20 seed=1234\"" } ] }, @@ -11,5 +11,5 @@ "name": "app3" }, "seed": 1234, - "vector_size": 10 + "vector_size": 20 } \ No newline at end of file diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_8/rep_0/small/host_0/stdout.log b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_8/rep_0/small/host_0/stdout.log index 495a88d9..1c799095 100644 --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_8/rep_0/small/host_0/stdout.log +++ 
b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_8/rep_0/small/host_0/stdout.log @@ -1 +1 @@ -run app=app3 with hyperparam=5 vec=10 seed=1234 +run app=app3 with hyperparam=5 vec=20 seed=1234 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_9/rep_0/config.json b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_9/rep_0/config.json old mode 100755 new mode 100644 index eda06b0b..f3c2da77 --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_9/rep_0/config.json +++ b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_9/rep_0/config.json @@ -2,7 +2,7 @@ "$CMD$": { "small": [ { - "main": "echo \"run app=app3 with hyperparam=5 vec=20 seed=1234\"" + "main": "echo \"run app=app3 with hyperparam=5 vec=30 seed=1234\"" } ] }, @@ -11,5 +11,5 @@ "name": "app3" }, "seed": 1234, - "vector_size": 20 + "vector_size": 30 } \ No newline at end of file diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_9/rep_0/small/host_0/stdout.log b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_9/rep_0/small/host_0/stdout.log index 1c799095..8f7ad18c 100644 --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_9/rep_0/small/host_0/stdout.log +++ b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/run_9/rep_0/small/host_0/stdout.log @@ -1 +1 @@ -run app=app3 with hyperparam=5 vec=20 seed=1234 +run app=app3 with hyperparam=5 vec=30 seed=1234 diff --git a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/state.yml b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/state.yml index 68a1555b..ff1dab04 100755 --- a/demo_project/doe-suite-results/example03-format_$expected/format_mixed/state.yml +++ b/demo_project/doe-suite-results/example03-format_$expected/format_mixed/state.yml @@ -1,8 +1,8 @@ --- 
-exp_job_ids: [{'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 0, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 1, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 2, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 3, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 4, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 5, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 6, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 7, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 8, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 9, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 10, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 11, 'exp_run_rep': 0}] +exp_job_ids: [{'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 0, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 1, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 2, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 3, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 4, 'exp_run_rep': 
0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 5, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 6, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 7, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 8, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 9, 'exp_run_rep': 0}] exp_job_ids_unfinished: [] # pending + queued + running exp_job_ids_pending: [] exp_job_ids_queued: [] exp_job_ids_running: [] -exp_job_ids_finished: [{'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 0, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 1, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 2, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 3, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 4, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 5, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 6, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 7, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 8, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 9, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 10, 
'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 11, 'exp_run_rep': 0}] \ No newline at end of file +exp_job_ids_finished: [{'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 0, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 1, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 2, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 3, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 4, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 5, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 6, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 7, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 8, 'exp_run_rep': 0}, {'suite': 'example03-format', 'suite_id': '$expected', 'exp_name': 'format_mixed', 'exp_run': 9, 'exp_run_rep': 0}] \ No newline at end of file diff --git a/demo_project/doe-suite-results/example03-format_$expected/suite_design.yml b/demo_project/doe-suite-results/example03-format_$expected/suite_design.yml index 56a41a48..4134859a 100644 --- a/demo_project/doe-suite-results/example03-format_$expected/suite_design.yml +++ b/demo_project/doe-suite-results/example03-format_$expected/suite_design.yml @@ -24,6 +24,13 @@ format_cross: - app3 factor_levels: - {} + except_filters: + - vector_size: 40 + app: + name: app2 + - vector_size: 40 + app: + name: app3 format_levellist: n_repetitions: 1 common_roles: [] @@ -49,6 +56,7 @@ format_levellist: - app: name: app3 
hyperparam: 5 + except_filters: [] format_mixed: n_repetitions: 1 common_roles: [] @@ -80,6 +88,13 @@ format_mixed: - app: name: app3 hyperparam: 5 + except_filters: + - vector_size: 40 + app: + name: app2 + - vector_size: 40 + app: + name: app3 $ETL$: check_error: experiments: diff --git a/demo_project/doe-suite-results/example03-format_$expected/suite_design_ext.yml b/demo_project/doe-suite-results/example03-format_$expected/suite_design_ext.yml index 147268a3..04cf9a58 100644 --- a/demo_project/doe-suite-results/example03-format_$expected/suite_design_ext.yml +++ b/demo_project/doe-suite-results/example03-format_$expected/suite_design_ext.yml @@ -69,20 +69,6 @@ format_cross: $CMD$: small: - main: echo "run app=app1 with vec=40 seed=1234" -- seed: 1234 - vector_size: 40 - app: - name: app2 - $CMD$: - small: - - main: echo "run app=app2 with vec=40 seed=1234" -- seed: 1234 - vector_size: 40 - app: - name: app3 - $CMD$: - small: - - main: echo "run app=app3 with vec=40 seed=1234" format_levellist: - seed: 1234 app: @@ -162,14 +148,6 @@ format_mixed: $CMD$: small: - main: echo "run app=app2 with hyperparam=10 vec=30 seed=1234" -- seed: 1234 - vector_size: 40 - app: - name: app2 - hyperparam: 10 - $CMD$: - small: - - main: echo "run app=app2 with hyperparam=10 vec=40 seed=1234" - seed: 1234 vector_size: 10 app: @@ -194,11 +172,3 @@ format_mixed: $CMD$: small: - main: echo "run app=app3 with hyperparam=5 vec=30 seed=1234" -- seed: 1234 - vector_size: 40 - app: - name: app3 - hyperparam: 5 - $CMD$: - small: - - main: echo "run app=app3 with hyperparam=5 vec=40 seed=1234" diff --git a/demo_project/doe-suite-results/example04-multi_$expected/suite_design.yml b/demo_project/doe-suite-results/example04-multi_$expected/suite_design.yml index 8674ce6a..280e56cc 100644 --- a/demo_project/doe-suite-results/example04-multi_$expected/suite_design.yml +++ b/demo_project/doe-suite-results/example04-multi_$expected/suite_design.yml @@ -39,6 +39,7 @@ exp_client_server: host_vars: 
client: msg: good day server + except_filters: [] $ETL$: check_error: experiments: diff --git a/demo_project/doe-suite-results/example05-complex_$expected/suite_design.yml b/demo_project/doe-suite-results/example05-complex_$expected/suite_design.yml index 53fcb640..0f5be827 100644 --- a/demo_project/doe-suite-results/example05-complex_$expected/suite_design.yml +++ b/demo_project/doe-suite-results/example05-complex_$expected/suite_design.yml @@ -19,6 +19,7 @@ exp_single: opt: true - info: run2 - without optimization opt: false + except_filters: [] exp_multi_1: n_repetitions: 3 common_roles: @@ -50,6 +51,7 @@ exp_multi_1: - good day server factor_levels: - {} + except_filters: [] exp_multi_2: n_repetitions: 2 common_roles: @@ -76,6 +78,7 @@ exp_multi_2: postfix: parties - prefix: hello postfix: people + except_filters: [] $ETL$: check_error: experiments: diff --git a/demo_project/doe-suite-results/example06-vars_$expected/suite_design.yml b/demo_project/doe-suite-results/example06-vars_$expected/suite_design.yml index 824668a7..34f29820 100644 --- a/demo_project/doe-suite-results/example06-vars_$expected/suite_design.yml +++ b/demo_project/doe-suite-results/example06-vars_$expected/suite_design.yml @@ -30,4 +30,5 @@ shared_vars: factor_levels: - define_factor: f1 - define_factor: f2 + except_filters: [] $ETL$: {} diff --git a/demo_project/doe-suite-results/example07-etl_$expected/suite_design.yml b/demo_project/doe-suite-results/example07-etl_$expected/suite_design.yml index 6239178b..19ee66a9 100644 --- a/demo_project/doe-suite-results/example07-etl_$expected/suite_design.yml +++ b/demo_project/doe-suite-results/example07-etl_$expected/suite_design.yml @@ -22,6 +22,7 @@ square: - 2 factor_levels: - {} + except_filters: [] plus: n_repetitions: 1 common_roles: [] @@ -48,6 +49,7 @@ plus: - W factor_levels: - {} + except_filters: [] triangle1: n_repetitions: 1 common_roles: [] @@ -71,6 +73,7 @@ triangle1: - 4 factor_levels: - {} + except_filters: [] triangle2: 
n_repetitions: 1 common_roles: [] @@ -92,6 +95,7 @@ triangle2: y: 3 - x: 1 y: 5 + except_filters: [] $ETL$: coord_square: experiments: diff --git a/doespy/doespy/design/exp_design.py b/doespy/doespy/design/exp_design.py index 4a41f002..81c6739f 100644 --- a/doespy/doespy/design/exp_design.py +++ b/doespy/doespy/design/exp_design.py @@ -361,6 +361,10 @@ class Experiment(MyBaseModel): """For the factors of an experiment, lists the different levels. For example, `n_clients` can be a factor with two levels: 1 and 100.""" + except_filters: List[Dict] = [] + """A list of filters that can be used to exclude certain runs from the experiment. + """ + class Config: extra = "forbid" @@ -410,6 +414,33 @@ def check_factor_levels(cls, values): expected={expected_factor_paths} actual={actual_factors}" return values + @root_validator(skip_on_failure=True) + def check_except_filters(cls, values): + """Every entry in ``except_filters`` must be a subset of the actual factors. + """ + + all_factors = set() + + # add level factors + for x in values['ctx'].my_experiment_factor_paths_levellist: + all_factors.add(tuple(x)) + + for x in values['ctx'].my_experiment_factor_paths_cross: + assert x[-1] == "$FACTOR$" + all_factors.add(tuple(x[:-1])) # remove the $FACTOR$ + + for filt in values.get("except_filters"): + filtered_factors = set() + for path, _value in dutil.nested_dict_iter(filt): + filtered_factors.add(tuple(path)) + + + assert filtered_factors.issubset(all_factors), \ + f"except_filters entry is not a subset of the actual factors: \ + except_filter={filtered_factors} all_factors={all_factors}" + + return values + # TODO [nku] could also extract some of them automatically from pydantic models? 
RESERVED_KEYWORDS = ["state", "$FACTOR$", "is_controller_yes", "is_controller_no", "check_status_yes", "check_status_no", "localhost", "n_repetitions", "common_roles", "host_types", "base_experiment", "factor_levels", "n", "init_roles", "check_status", "$CMD$"] def get_keywords(): diff --git a/doespy/doespy/design/extend.py b/doespy/doespy/design/extend.py index ae2cfb52..1c431c93 100644 --- a/doespy/doespy/design/extend.py +++ b/doespy/doespy/design/extend.py @@ -18,6 +18,7 @@ from pydantic import root_validator + def extend(suite_design, exp_specific_vars, use_cmd_shellcheck=False): """ @@ -59,6 +60,14 @@ def extend(suite_design, exp_specific_vars, use_cmd_shellcheck=False): factor_level = merge_hash( factor_level, cross_factor_level, recursive=True ) + + + # introduce except_filters to skip certain runs + skip_run = any(_is_subset_dict(except_level, factor_level) for except_level in exp.get("except_filters", [])) + if skip_run: + print(f"Skipping run with factor_level={factor_level}") + continue # we are skipping this combination + run_config = copy.deepcopy(base_experiment) # overwrite $FACTOR$ with the concrete level of the run @@ -178,6 +187,21 @@ def _nested_dict_iter(nested, p=[]): yield key, value, p +def _is_subset_dict(sub_dict, main_dict): + """Checks if sub_dict is a subset of main_dict""" + + for key, value in sub_dict.items(): + if key not in main_dict: + return False + if isinstance(value, dict): + if not _is_subset_dict(value, main_dict[key]): + return False + else: + if value != main_dict[key]: + return False + return True + + def _insert_config(config, key, parent_path, value): d = config for k in parent_path: diff --git a/doespy/doespy/etl/etl_base.py b/doespy/doespy/etl/etl_base.py index 5ec50a5e..52d731e2 100644 --- a/doespy/doespy/etl/etl_base.py +++ b/doespy/doespy/etl/etl_base.py @@ -10,6 +10,8 @@ import ruamel.yaml from pydantic import ValidationError +from tqdm import tqdm + from doespy import util from doespy import status from 
doespy.design import validate_extend @@ -59,11 +61,30 @@ def run_multi_suite( flag_output_dir_config_name: bool = True, flag_output_dir_pipeline: bool = True, etl_from_design: bool = False, + pipeline_filter: List[str] = None, return_df: bool = False, ): pipeline_design = _load_super_etl_design(name=super_etl) + + + + # filtering out pipelines by the pipeline_filter + if pipeline_filter is not None: + + # check that pipeline_filter is valid + for pipeline_name in pipeline_filter: + assert pipeline_name not in ["$SUITE_ID$", "$ETL$"], "Pipeline filter cannot be $SUITE_ID$ or $ETL$" + assert pipeline_name in pipeline_design["$ETL$"], f"Pipeline filter not found in super etl design: {pipeline_name} (existing={pipeline_design['$ETL$'].keys()})" + + # find pipelines to remove + existing_pipelines = set(pipeline_design["$ETL$"].keys()) + filtered_out_pipelines = existing_pipelines - set(pipeline_filter) + print(f"Filtering our pipelines: {filtered_out_pipelines}") + for p in filtered_out_pipelines: + del pipeline_design["$ETL$"][p] + return run_etl( config_name=super_etl, pipeline_design=pipeline_design, @@ -116,9 +137,12 @@ def run_etl( if not has_exp_result: res_dir = util.get_suite_results_dir(suite=suite, id=suite_id) - suite_status, _etl_error = status.get_suite_status(res_dir) - if suite_status[experiment]["n_jobs_finished"] > 0: - has_exp_result = True + # check that results from at least one run are present + for x in os.listdir(os.path.join(res_dir, experiment)): + if x.startswith("run_"): + has_exp_result = True + break + suite_design = _load_suite_design(suite, suite_id, etl_from_design) @@ -278,6 +302,7 @@ def _extract_experiments_suite(suite, experiments, suite_id_map): raise ValueError(f"Suite Id cannot be None: {d} (set default or suite in suite id map)") return d else: + # TODO [nku] it could also be a feature to have a list of suite ids for the same suite raise ValueError("Suite ids must be a value or dict!") @@ -474,7 +499,7 @@ def extract( runs = 
_list_dir_only(exp_dir) factor_columns = _parse_factors(base_experiments[exp]) - for run in runs: + for run in tqdm(runs, desc=f"processing runs of experiment {exp}"): run_dir = os.path.join(exp_dir, run) reps = _list_dir_only(run_dir) @@ -524,6 +549,8 @@ def extract( res_lst.append(res) df = pd.DataFrame(res_lst) + + res_lst.clear() return df @@ -616,4 +643,7 @@ def _is_file(path, f): def _flatten_d(d): - return json.loads(pd.json_normalize(d, sep=".").iloc[0].to_json()) + if any(isinstance(i, dict) for i in d.values()): + return json.loads(pd.json_normalize(d, sep=".").iloc[0].to_json()) + else: + return d diff --git a/doespy/doespy/etl/etl_util.py b/doespy/doespy/etl/etl_util.py index 88e0df18..f843cad7 100644 --- a/doespy/doespy/etl/etl_util.py +++ b/doespy/doespy/etl/etl_util.py @@ -1,5 +1,11 @@ +import json + import pandas as pd +from tqdm import tqdm +import requests +import time +import os def expand_factors(df: pd.DataFrame, columns: list) -> list: """ @@ -104,3 +110,117 @@ def print_etl_pipeline(etl_pipeline, name): for line in out: print(line) print() + +def escape_tuple_str(tup) -> str: + as_str = "_".join(tup) + # remove any dots + as_str = as_str.replace(".", "") + return as_str + +def save_notion(filenames, etl_info, notion_dict): + etl_output_dir = etl_info["etl_output_dir"] + pipeline = etl_info["pipeline"] + super_etl_name = etl_info["suite"] + + project = notion_dict["project"] + parent_block_id = notion_dict["block_id"] + + s3_urls = save_files_to_s3("doe-suite-plots", project, super_etl_name, pipeline, etl_output_dir, filenames) + + # Your Notion API key + notion_api_key = os.environ["NOTION_API_KEY"] + + url = f"https://api.notion.com/v1/blocks/{parent_block_id}/children" + + # Headers + headers = { + "Authorization": f"Bearer {notion_api_key}", + "Content-Type": "application/json", + "Notion-Version": "2022-06-28" # update if newer version is available + } + + clear_children = True + if clear_children: + notion_remove_children(url, headers) + 
+ for plot_url in tqdm(s3_urls, desc="Adding images to Notion"): + # URL to Notion API + + # Request body + data = { + "children": [ + { + "object": "block", + "type": "embed", + "embed": { + "url": plot_url + } + } + ] + } + # Convert Python dictionary to JSON + data_json = json.dumps(data) + + # Send POST request to Notion API + response = requests.patch(url, headers=headers, data=data_json) + + # Check the response + if response.status_code != 200: + print(f"Failed to add image, status code: {response.status_code}, for url {url}") + print(f"Response: {response.text}") + +def notion_remove_children(url, headers): + # Send GET request to Notion API to retrieve child blocks + response = requests.get(url, headers=headers) + + # Check the response + if response.status_code == 200: + # Convert the response to JSON + children = response.json() + + # Iterate over each child block and delete it + for child in children['results']: + child_id = child['id'] + + # URL to Notion API for deleting a block + delete_url = f"https://api.notion.com/v1/blocks/{child_id}" + + # Send DELETE request to Notion API to delete the child block + delete_response = requests.delete(delete_url, headers=headers) + + # Check the delete response + if delete_response.status_code != 200: + print(f"Failed to delete block with ID {child_id}, status code: {delete_response.status_code}") + print(f"Response: {delete_response.text}") + + # Wait for 0.1 second to prevent rate limit + time.sleep(0.1) + + print("Successfully deleted all existing plots") + + +def save_files_to_s3(bucket, project, super_etl_name, pipeline, local_output_dir, filenames, file_format="png", bucket_region="eu-central-1"): + import boto3 + from datetime import datetime + import urllib.parse + + timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + + urls = [] + for filename in tqdm(filenames, desc="Uploading plots to s3"): + # Set the filename using the current timestamp + file_path_local = os.path.join(local_output_dir, 
f"{filename}.{file_format}") + file_path_s3 = f'{project}/super_etl/{super_etl_name}/{pipeline}/{timestamp}/{filename}.png' + + # Create a session using your AWS credentials + s3 = boto3.resource('s3') + + # Upload the file + with open(file_path_local, "rb") as data: + s3.Bucket(bucket).put_object(Key=file_path_s3, Body=data) + + # url encode + url = f"https://{bucket}.s3.{bucket_region}.amazonaws.com/{urllib.parse.quote(file_path_s3)}" + urls.append(url) + + return urls diff --git a/doespy/doespy/etl/steps/loaders.py b/doespy/doespy/etl/steps/loaders.py index 529be2a2..18e2a8c5 100644 --- a/doespy/doespy/etl/steps/loaders.py +++ b/doespy/doespy/etl/steps/loaders.py @@ -75,7 +75,6 @@ class CsvSummaryLoader(Loader): r"""The `CsvSummaryLoader` creates a CSV file of the data frame from the `Transformer` stage. - :param output_dir: path relative to the etl output directory to store the csv. :param skip_empty: ignore empty df. .. code-block:: yaml @@ -84,8 +83,8 @@ class CsvSummaryLoader(Loader): $ETL$: loaders: CsvSummaryLoader: {} # with default output dir - CsvSummaryLoader: # with custom output dir - output_dir: dir1 + CsvSummaryLoader: # with skip empty df + skip_empty: True """ def load(self, df: pd.DataFrame, options: Dict, etl_info: Dict) -> None: @@ -99,6 +98,35 @@ def load(self, df: pd.DataFrame, options: Dict, etl_info: Dict) -> None: df.to_csv(os.path.join(output_dir, f"{etl_info['pipeline']}.csv")) +class PickleSummaryLoader(Loader): + + skip_empty: bool = False + + r"""The `PickleSummaryLoader` creates a Pickle file of the data frame from the `Transformer` stage. + + :param skip_empty: ignore empty df. + + .. 
code-block:: yaml + :caption: Example ETL Pipeline Design + + $ETL$: + loaders: + PickleSummaryLoader: {} # with default output dir + PickleSummaryLoader: # with skip empty dir + skip_empty: True + """ + + def load(self, df: pd.DataFrame, options: Dict, etl_info: Dict) -> None: + + if self.skip_empty and df.empty: + return + elif df.empty: + raise ValueError("PickleSummaryLoader: DataFrame is empty so not creating an output file.") + else: + output_dir = self.get_output_dir(etl_info) + df.to_pickle(os.path.join(output_dir, f"{etl_info['pipeline']}.pkl")) + + class LatexTableLoader(Loader): r"""The `LatexTableLoader` creates a tex file of the data frame from the `Transformer` stage formatted as a Latex table. diff --git a/doespy/doespy/etl/super_etl.py b/doespy/doespy/etl/super_etl.py index 7227ea8e..4cabbef2 100644 --- a/doespy/doespy/etl/super_etl.py +++ b/doespy/doespy/etl/super_etl.py @@ -4,6 +4,11 @@ def main(): + + # TODO [nku] validate the super-etl design with pydantic? + + # TODO [nku] allow for super-etl to be located in a different directory? -> specifically, the results of the super-etl should be in the same directory as the super-etl config + parser = argparse.ArgumentParser(description="") parser.add_argument("--config", type=str, required=True) parser.add_argument( @@ -19,7 +24,7 @@ def main(): ) parser.add_argument( "--output_dir_pipeline", - action="store_false", + action="store_true", help="Whether to output in a subdir with the name of the pipeline.", ) @@ -29,6 +34,14 @@ def main(): help="Use the pipelines from doe-suite-config/designs or suite_design.yml", ) + + parser.add_argument( + "--pipelines", + nargs="+", + required=False, + help="ETL super pipelines to run. 
If not specified, all pipelines will be run.", + ) + args = parser.parse_args() etl_base.run_multi_suite( @@ -37,6 +50,7 @@ def main(): flag_output_dir_config_name=not args.output_dir_config_name_disabled, flag_output_dir_pipeline=not args.output_dir_pipeline, etl_from_design=args.load_from_design, + pipeline_filter=args.pipelines, return_df=False, ) diff --git a/src/experiment-suite.yml b/src/experiment-suite.yml index d7b59035..f4bcb4aa 100644 --- a/src/experiment-suite.yml +++ b/src/experiment-suite.yml @@ -185,7 +185,7 @@ - name: start jobs for experiment {{ exp_name }} include_role: name: experiment-job - loop: "{{ range(0, (exp_job_ids_unfinished | length), 1) | list }}" + loop: "{{ range(0, (exp_job_ids_unfinished | length), 1) | list }}" # TODO [nku] this is not optimal because of the multi job ids per loop_control: loop_var: unfinished_job_idx @@ -203,6 +203,12 @@ msg: unexpected error occured in experiment = {{ exp_name }} when: is_expected_error is not defined or not is_expected_error +# - name: Save the updated state of the experiment run (save job ids) +# include_role: +# name: experiment-state +# vars: +# expstate: save + # the loop until task in `experiment-job` throws an error if the number of tries are exceeded. # here we catch this error and handle this gracefully. 
(every other error is handled by the previous task) - name: handle expected error if number of tries exceeded @@ -211,6 +217,7 @@ # when: is_expected_error + ########################################################################## # Cleanup Cloud (terminate instances, remove vpc) # ########################################################################## diff --git a/src/roles/experiment-job/library/collect_results.py b/src/roles/experiment-job/library/collect_results.py new file mode 100644 index 00000000..418f000e --- /dev/null +++ b/src/roles/experiment-job/library/collect_results.py @@ -0,0 +1,113 @@ +#!/usr/bin/python + +from __future__ import (absolute_import, division, print_function) +__metaclass__ = type + +import os, json, subprocess, warnings + + +DOCUMENTATION = r''' +--- + +''' + +RETURN = r''' +''' + +from ansible.module_utils.basic import AnsibleModule + + +def jobid2workingdir(job_id, base): + + """ + Derives the path for the working directory corresponding to the job_id within `base`. + + job_id: {'suite': X, 'suite_id': X, 'exp_name': X, ... 
} + base: a path to a directory in which the workingdir resides + + return: path to the working directory for a job + """ + + exp_working_dir = os.path.join(base, + f"{job_id['suite']}_{job_id['suite_id']}", + job_id['exp_name'], + f"run_{job_id['exp_run']}", + f"rep_{job_id['exp_run_rep']}") + + return exp_working_dir + + +def run_module(): + # define available arguments/parameters a user can pass to the module + module_args = dict( + job_ids_ready_to_collect_results=dict(type='list', required=True), + exp_host_lst=dict(type='list', required=True), + local_result_dir=dict(type='str', required=True), + remote_result_dir=dict(type='str', required=True), + ) + + result = dict( + changed=False, + original_message='', + message='' + ) + + module = AnsibleModule( + argument_spec=module_args, + supports_check_mode=True + ) + + if module.check_mode: + module.exit_json(**result) + + + changed = False # we start with changed false + + + for job_id in module.params["job_ids_ready_to_collect_results"]: + + + local_results_dir_base = jobid2workingdir(job_id, module.params["local_result_dir"]) + remote_results_dir = os.path.join(jobid2workingdir(job_id, module.params["remote_result_dir"]), "results") + remote_config_file = os.path.join(jobid2workingdir(job_id, module.params["remote_result_dir"]), "config.json") + + for i, my_host in enumerate(module.params["exp_host_lst"]): + + + # create local result dir + local_results_dir = os.path.join(local_results_dir_base, my_host['host_type'], f"host_{my_host['exp_host_type_idx']}" ) + os.makedirs(local_results_dir, exist_ok=True) + + # fetch results + src_path = f"{my_host['public_dns_name']}:{remote_results_dir}/*" + try: + # -L is needed to follow symlinks + _completed_process = subprocess.run(["rsync", "-azL", src_path, local_results_dir], check=True) + except subprocess.CalledProcessError as e: + warnings.warn(f"Rsync command failed to fetch results with return code {e.returncode} dir={local_results_dir}") + raise e + + + # fetch 
config.json from first host + if i == 0: + src_path = f"{my_host['public_dns_name']}:{remote_config_file}" + try: + _completed_process = subprocess.run(["rsync", "-az", src_path, local_results_dir_base], check=True) + except subprocess.CalledProcessError as e: + warnings.warn(f"Rsync command failed to fetch config.json with return code {e.returncode}") + raise e + + changed = True + + + result['changed'] = changed + + module.exit_json(**result) + + + +def main(): + run_module() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/roles/experiment-job/library/setup_job_dirs.py b/src/roles/experiment-job/library/setup_job_dirs.py new file mode 100644 index 00000000..687eec0f --- /dev/null +++ b/src/roles/experiment-job/library/setup_job_dirs.py @@ -0,0 +1,75 @@ +#!/usr/bin/python + +from __future__ import (absolute_import, division, print_function) +__metaclass__ = type + +import os, json + + +DOCUMENTATION = r''' +--- + +''' + +RETURN = r''' +''' + +from ansible.module_utils.basic import AnsibleModule + + + + +def run_module(): + # define available arguments/parameters a user can pass to the module + module_args = dict( + jobs=dict(type='list', required=True), + ) + + result = dict( + changed=False, + original_message='', + message='' + ) + + module = AnsibleModule( + argument_spec=module_args, + supports_check_mode=True + ) + + # if the user is working with this module in only check mode we do not + # want to make any changes to the environment, just return the current + # state with no modifications + if module.check_mode: + module.exit_json(**result) + + + changed = False # we start with changed false + + jobs = module.params["jobs"] + + for job in jobs: + + if os.path.exists(job["exp_working_dir"]): + continue + + os.makedirs(os.path.join(job["exp_working_dir"], "results")) + os.makedirs(os.path.join(job["exp_working_dir"], "scratch")) + + cfg_file = os.path.join(job["exp_working_dir"], "config.json") + if not 
os.path.isfile(cfg_file): + with open(os.path.join(job["exp_working_dir"], "config.json") , 'w') as f: + json.dump(job["exp_run_config"], f, indent=4, sort_keys=True, separators=(',', ': ')) + + changed = True + + result['changed'] = changed + + module.exit_json(**result) + + + +def main(): + run_module() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/roles/experiment-job/tasks/main.yml b/src/roles/experiment-job/tasks/main.yml index 80941ae7..546c1272 100644 --- a/src/roles/experiment-job/tasks/main.yml +++ b/src/roles/experiment-job/tasks/main.yml @@ -86,40 +86,23 @@ - name: build job schedule list (for each host of experiment, information about job and what is required to run) set_fact: exp_jobs_to_enqueue: "{{ exp_job_ids_to_enqueue | to_job_schedule_lst(exp_host_lst, exp_runs_ext, remote.results_dir) }}" + no_log: true ################################################################### # Prepare Experiment Environment (directories, files, ...) # ################################################################### - - name: Create results and scratch directories (per-host) - ansible.builtin.shell: > - mkdir -m755 -p - {{ jobs_grouped | last | map(attribute='exp_working_dir') | map('regex_replace', '^(.*)$', '\1/scratch') | join(' ') }} - {{ jobs_grouped | last | map(attribute='exp_working_dir') | map('regex_replace', '^(.*)$', '\1/results') | join(' ') }} + + - name: Setup job working dirs (config.json + create results/scratch dirs) + setup_job_dirs: + jobs: "{{ jobs_grouped | last }}" delegate_to: "{{ jobs_grouped | first }}" # first entry in group loop: "{{ exp_jobs_to_enqueue | groupby('host_info.public_dns_name') }}" loop_control: loop_var: jobs_grouped + no_log: true -# - name: Create results and scratch directories (per-host) -# ansible.builtin.command: -# cmd: mkdir -m755 -p {{ '{' }}{{ jobs_grouped | last | map(attribute='exp_working_dir') | map('regex_replace', '^(.*)$', '\1/scratch') | map('quote') | join(',') }}{{ 
'}' }} -## mkdir -m755 -p {{ '{' }}{{ jobs_grouped | last | map(attribute='exp_working_dir') | map('regex_replace', '^(.*)$', '\1/results') | join(',') }}{{ '}' }} -# delegate_to: "{{ jobs_grouped | first }}" # first entry in group -# loop: "{{ exp_jobs_to_enqueue | groupby('host_info.public_dns_name') }}" -# loop_control: -# loop_var: jobs_grouped - - - name: Create run config file in working directory - template: - src: config.json.j2 - dest: "{{ job.exp_working_dir }}/config.json" - mode: 0755 - delegate_to: "{{ job.host_info.public_dns_name }}" - loop: "{{ exp_jobs_to_enqueue }}" - loop_control: - loop_var: job ################################################################### # Enqueue all Jobs # @@ -138,6 +121,8 @@ exp_job_ids_pending: "{{ exp_job_ids_pending | difference(exp_job_ids_to_enqueue) }}" exp_job_ids_to_enqueue: [] + + ################################################################### # End: Schedule new Jobs (enqueue task in task spooler) # ################################################################### @@ -149,164 +134,138 @@ # Wait and the Get Status of All Job # ################################################################### +- name: Wait until new jobs complete + when: exp_job_ids_queued | length > 0 + block: -- name: Set the job id to wait until finished + update state - set_fact: - job_id_to_wait_for: "{{ cur_job_id }}" - exp_job_ids_queued: "{{ exp_job_ids_queued | difference([cur_job_id]) }}" - exp_job_ids_running: "{{ [cur_job_id] }}" - vars: - # if there is still a running_job_id then take this one, else take the first of the queued jobs - cur_job_id: "{{ (exp_job_ids_running | list + exp_job_ids_queued | list) | first }}" + - name: Save the updated state of the experiment run (save job ids) + include_role: + name: experiment-state + vars: + expstate: save -- name: Save the updated state of the experiment run (save job ids) - include_role: - name: experiment-state - vars: - expstate: save + - name: mark that in the next task, 
there could be an expected error (number of retires exceeded) + set_fact: + is_expected_error: True -- name: mark that in the next task, there could be an expected error (number of retires exceeded) - set_fact: - is_expected_error: True + - name: Get status of job + include_role: + name: suite-scheduler-status + tasks_from: "{{ job_scheduler }}" + # In the future, we can define this as an additional config parameter. -- name: Get status of job - include_role: - name: suite-scheduler-status - tasks_from: "{{ job_scheduler }}" - # In the future, we can define this as an additional config parameter. + - name: mark that from here on, any error is unexpected + set_fact: + is_expected_error: False -- name: mark that from here on, any error is unexpected - set_fact: - is_expected_error: False + - assert: + that: + - exp_job_ids_completing is defined # <- these are the job ids that we now need to fetch the results from + ################################################################### # Download Results for newly finished job # ################################################################### -- name: Download Results - block: - - debug: - msg: - - "remote_results_dir={{ remote_results_dir }}" - - "local_results_dir_base={{ local_results_dir_base }}" - - "remote_config_file= {{ remote_config_file }}" - - - name: Create local folder (for results) - delegate_to: localhost - file: - path: "{{ local_results_dir_base }}/{{ my_host.host_type }}/host_{{ my_host.exp_host_type_idx }}" - state: directory - mode: 0755 - loop: "{{ exp_host_lst }}" - loop_control: - loop_var: my_host - - name: Fetch Results (if experiment done) - delegate_to: localhost - local_action: command rsync -az "{{ my_host.public_dns_name }}:{{ remote_results_dir }}/*" "{{ local_results_dir_base }}/{{ my_host.host_type }}/host_{{ my_host.exp_host_type_idx }}" - loop: "{{ exp_host_lst }}" - loop_control: - loop_var: my_host - - name: Save the config of the job - delegate_to: localhost - local_action: 
"command rsync -az '{{ my_host.public_dns_name }}:{{ remote_config_file }}' '{{ local_results_dir_base }}'" - loop: "{{ exp_host_lst[:1] }}" - loop_control: - loop_var: my_host - - - name: Run ETL Pipeline - block: - - - name: Run ETL pipeline over results files (start) - delegate_to: localhost - ansible.builtin.shell: - cmd: python ../doespy/doespy/etl/etl.py --suite {{ suite }} --id {{ suite_id }} - when: ((current_exec_etl | int - previous_exec_etl | default(0) | int) > (etl_minimum_delay_sec|int)) or exp_job_ids_unfinished | list | length == 1 - - - name: update prev timestamp - set_fact: - is_exec: True - previous_exec_etl: "{{ current_exec_etl }}" - when: ((current_exec_etl | int - previous_exec_etl | default(0) | int) > (etl_minimum_delay_sec|int)) or exp_job_ids_unfinished | list | length == 1 - - rescue: - - name: Inform about etl errors - fail: - msg: "Error occured in the etl pipeline -> cannot produce results but we continue with running the experiment \n(error details: {{ etl_error_file }})" - ignore_errors: True - - - name: Creating Error File - delegate_to: localhost - copy: - dest: "{{ etl_error_file }}" - content: "{{ ansible_failed_result | to_nice_yaml }}" - vars: - - etl_error_file: "{{ local.results_dir}}/{{ suite }}_{{ suite_id }}/ETL_ERROR.log" - - current_exec_etl: "{{ lookup('pipe', 'date +%s') }}" +- name: Collect Results + delegate_to: localhost + when: exp_job_ids_queued | length > 0 + collect_results: + job_ids_ready_to_collect_results: "{{ exp_job_ids_completing }}" + exp_host_lst: "{{ exp_host_lst }}" + local_result_dir: "{{ local.results_dir}}" + remote_result_dir: "{{ remote.results_dir }}" + loop: "{{ exp_host_lst }}" + loop_control: + loop_var: my_host +- name: Run ETL Pipeline + when: exp_job_ids_queued | length > 0 + block: + - name: Run ETL pipeline over results files (start) + delegate_to: localhost + ansible.builtin.shell: + cmd: python ../doespy/doespy/etl/etl.py --suite {{ suite }} --id {{ suite_id }} + rescue: + - name: 
Inform about etl errors + fail: + msg: "Error occured in the etl pipeline -> cannot produce results but we continue with running the experiment \n(error details: {{ etl_error_file }})" + ignore_errors: True + + - name: Creating Error File + delegate_to: localhost + copy: + dest: "{{ etl_error_file }}" + content: "{{ ansible_failed_result | to_nice_yaml }}" vars: - - remote_results_dir: "{{ job_id_to_wait_for | jobid2workingdir(remote.results_dir) + '/results' }}" - - local_results_dir_base: "{{ job_id_to_wait_for | jobid2workingdir(local.results_dir) }}" - - remote_config_file: "{{ job_id_to_wait_for | jobid2workingdir(remote.results_dir) + '/config.json' }}" + etl_error_file: "{{ local.results_dir}}/{{ suite }}_{{ suite_id }}/ETL_ERROR.log" +- set_fact: + cur_exp_job_ids_queued: "{{ exp_job_ids_queued }}" -################################################################### -# Cleanup task spooler queue # -################################################################### +- name: Post cleanup / update + when: cur_exp_job_ids_queued | length > 0 + block: -- name: Remove finished job with downloaded results from queue - include_role: - name: suite-scheduler-remove - tasks_from: "{{ job_scheduler }}" - # In the future, we can define this as an additional config parameter. + ################################################################### + # Cleanup task spooler queue # + ################################################################### + - name: Remove finished job with downloaded results from queue + include_role: + name: suite-scheduler-remove + tasks_from: "{{ job_scheduler }}" + # In the future, we can define this as an additional config parameter. 
-################################################################### -# Update experiment state # -################################################################### + ################################################################### + # Update experiment state # + ################################################################### -- name: update vars 1 - set_fact: - exp_job_ids_running: [] - exp_job_ids_finished: "{{ exp_job_ids_finished | list + [job_id_to_wait_for] }}" + - name: update vars 1 + set_fact: + exp_job_ids_running: [] + exp_job_ids_finished: "{{ exp_job_ids_finished | list + exp_job_ids_completing | list }}" + exp_job_ids_queued: "{{ exp_job_ids_queued | difference(exp_job_ids_completing) | list }}" # remove all exp_job_ids_completing from exp_job_ids_queued + exp_job_ids_completing: [] -- name: update vars 2 - set_fact: - exp_job_ids_unfinished: "{{ exp_job_ids_pending | list + exp_job_ids_queued | list + exp_job_ids_running | list}}" + - name: update vars 2 + set_fact: + exp_job_ids_unfinished: "{{ exp_job_ids_pending | list + exp_job_ids_queued | list + exp_job_ids_running | list}}" -- name: Save the updated state of the experiment run (save job ids) - include_role: - name: experiment-state - vars: - expstate: save + - name: Save the updated state of the experiment run (save job ids) + include_role: + name: experiment-state + vars: + expstate: save -################################################################### -# Compute suite progress info and output # -################################################################### + ################################################################### + # Compute suite progress info and output # + ################################################################### -- name: compute overview of progress in different experiments in suite - set_fact: - suite_progress_info: "{{ suite_progress_info | default({}) | combine({my_exp_name: {'require_suite_to_finish': my_require_suite ,'n_finished': 
my_n_finished | int, 'n_unfinished': my_n_unfinished | int, 'progress': (my_n_finished | int / (my_n_finished | int + my_n_unfinished | int) * 100) | round(2) | string + ' %' }})}}" - vars: - my_exp_name: "{{ hostvars[my_controller_host].exp_name }}" - my_n_unfinished: "{{ hostvars[my_controller_host].exp_job_ids_unfinished | length }}" - my_require_suite: "{{ hostvars[my_controller_host].exp_job_ids_pending | length > 0 }}" - my_n_finished: "{{ hostvars[my_controller_host].exp_job_ids_finished | length }}" - loop: "{{ groups['is_controller_yes'] }}" - loop_control: - loop_var: my_controller_host + - name: compute overview of progress in different experiments in suite + set_fact: + suite_progress_info: "{{ suite_progress_info | default({}) | combine({my_exp_name: {'require_suite_to_finish': my_require_suite ,'n_finished': my_n_finished | int, 'n_unfinished': my_n_unfinished | int, 'progress': (my_n_finished | int / (my_n_finished | int + my_n_unfinished | int) * 100) | round(2) | string + ' %' }})}}" + vars: + my_exp_name: "{{ hostvars[my_controller_host].exp_name }}" + my_n_unfinished: "{{ hostvars[my_controller_host].exp_job_ids_unfinished | length }}" + my_require_suite: "{{ hostvars[my_controller_host].exp_job_ids_pending | length > 0 }}" + my_n_finished: "{{ hostvars[my_controller_host].exp_job_ids_finished | length }}" + loop: "{{ groups['is_controller_yes'] }}" + loop_control: + loop_var: my_controller_host -- name: output progress information of experiments - debug: - msg: "{{ suite_progress_info[item] }}" - loop: "{{ suite_progress_info.keys() | sort }}" - tags: [print_action] \ No newline at end of file + - name: output progress information of experiments + debug: + msg: "{{ suite_progress_info[item] }}" + loop: "{{ suite_progress_info.keys() | sort }}" + tags: [print_action] \ No newline at end of file diff --git a/src/roles/suite-scheduler-enqueue/library/tsp.py b/src/roles/suite-scheduler-enqueue/library/tsp.py index e0000863..9df58972 100644 --- 
a/src/roles/suite-scheduler-enqueue/library/tsp.py +++ b/src/roles/suite-scheduler-enqueue/library/tsp.py @@ -143,7 +143,7 @@ def run_module(): # id task to remove (could take list maybe?) remove_task_id=dict(type='str', required=False), - remove_task_label=dict(type='str', required=False), + remove_task_labels=dict(type='list', required=False), # boolean that indicates whether to clear clear_tasks=dict(type='bool', required=False, default=False), @@ -214,12 +214,12 @@ def run_module(): subprocess.run(["tsp", "-r", module.params['remove_task_id']], capture_output=True, text=True) changed = True - elif module.params['remove_task_label']: + elif module.params['remove_task_labels']: tasks = get_tasks(return_pid=False) for task in tasks: - if task["label"] == module.params['remove_task_label']: + if task["label"] in module.params['remove_task_labels']: subprocess.run(["tsp", "-r", task["id"]], capture_output=True, text=True) changed = True diff --git a/src/roles/suite-scheduler-enqueue/tasks/slurm.yml b/src/roles/suite-scheduler-enqueue/tasks/slurm.yml index 76ad4715..741a41ff 100644 --- a/src/roles/suite-scheduler-enqueue/tasks/slurm.yml +++ b/src/roles/suite-scheduler-enqueue/tasks/slurm.yml @@ -25,6 +25,7 @@ shell: scancel {{ jobs_to_cancel | map(attribute='slurm_job_id') | join(' ')}} when: (jobs_to_cancel | list | length) > 0 +# TODO [nku] could replace with custom module that does the enqueue (group by host and then do the enqueue) - name: schedule the job on euler using slurm (sbatch) vars: experiment_name: "{{ job.job_info | safe_job_info_string }}" diff --git a/src/roles/suite-scheduler-remove/library/tsp.py b/src/roles/suite-scheduler-remove/library/tsp.py deleted file mode 100644 index e0000863..00000000 --- a/src/roles/suite-scheduler-remove/library/tsp.py +++ /dev/null @@ -1,293 +0,0 @@ -#!/usr/bin/python - -from __future__ import (absolute_import, division, print_function) -__metaclass__ = type - -import subprocess, os, signal, re, time - - -DOCUMENTATION 
= r''' ---- -module: tsp - -short_description: This module creates an interface to task-spooler - -version_added: "1.0.0" - -description: This module allows to (i) schedule new tasks to the batch job system task-spooler, (ii) clear task-spooler (stop running tasks and remove all queued), (iii) remove an individual task from the queue. For tasks scheduled with this module, the command given to task spooler is extended such that it runs in the provided working_dir and outputs stdout and stderr in separate files. - -options: - remove_task_id: - description: Remove the task (e.g., because it is finished) with the task spooler task id from the task spooler queue - required: false - type: str - remove_task_label: - description: Remove all tasks (e.g., because they are finished) with the task spooler task label from the task spooler queue - required: false - type: str - clear_tasks: - description: Before adding new commands (tasks), stop all running tasks and remove all tasks from the queue. - required: false - type: bool - cmd: - description: The command to add at the end of the task spooler queue. - required: false - type: str - cmd_label: - description: The label of the command to add (required if there is a command) - required: false - type: str - cmd_working_dir: - description: The working dir of the command (required if there is a command) - required: false - type: str - cmd_stdout_file: - description: The file to log stdout, relative to the working dir (required if there is a command). - required: false - type: str - cmd_stderr_file: - description: The file to log stderr, relative to the working dir (required if there is a command). 
- required: false - type: str -''' - -EXAMPLES = r''' -- name: Clear the task-spooler queue (and stop all running) - tsp: - clear_tasks: True - -- name: Add a command but first clear the task-spooler queue (and stop all running) - tsp: - clear_tasks: True - cmd: echo hello - cmd_label: abc - cmd_working_dir: /home/ubuntu - cmd_stdout_file: stdout.log - cmd_stderr_file: stderr.log - -- name: Add a command to task spooler - tsp: - cmd: echo hello - cmd_label: abc - cmd_working_dir: /home/ubuntu - cmd_stdout_file: stdout.log - cmd_stderr_file: stderr.log - - -- name: Remove the task with id 2 from the queue - tsp: - remove_task_id: 2 - -- name: Remove all tasks with label 'abc' from the queue - tsp: - remove_task_label: abc -''' - -RETURN = r''' -''' - -from ansible.module_utils.basic import AnsibleModule - - - -def get_tasks(return_pid): - completed_process = subprocess.run(["tsp"], capture_output=True) - - lines = completed_process.stdout.decode('utf-8').splitlines() - - tasks = [] - - pattern = r'([0-9]+)\s+(queued|running|finished)\s+([^\s]+)\s+(?:([0-9]+)\s+([^\s]+)|\s+)\s(?:\[(.+)\](.+)|(.+))' - - for line in lines[1:]: # ignore the first line (header) - m = re.search(pattern, line) - - if not m: - raise ValueError("not matched: " + line) - - cmd = m.group(7) - if cmd is None: - cmd = m.group(8) # has no label => cmd is in group 7 - - d = { - "id": m.group(1), - "state": m.group(2), - "output": m.group(3), - "error_level": m.group(4), - "times": m.group(5), - "label": m.group(6), - "cmd": cmd, - "pid": None, - #"full_line": m.group(0), - } - - if return_pid and d["state"] == "running": - cp = subprocess.run(["tsp", "-p", d["id"]], capture_output=True) - d["pid"] = cp.stdout.decode('utf-8').strip() - - tasks.append(d) - return tasks - - -def run_module(): - # define available arguments/parameters a user can pass to the module - module_args = dict( - - # used for building commands (schedule command) - cmd=dict(type='str', required=False), - 
cmd_label=dict(type='str', required=False), - cmd_working_dir=dict(type='str', required=False), - cmd_stdout_file=dict(type='str', required=False), - cmd_stderr_file=dict(type='str', required=False), - - # id task to remove (could take list maybe?) - remove_task_id=dict(type='str', required=False), - - remove_task_label=dict(type='str', required=False), - - # boolean that indicates whether to clear - clear_tasks=dict(type='bool', required=False, default=False), - - ) - - result = dict( - changed=False, - original_message='', - message='' - ) - - module = AnsibleModule( - argument_spec=module_args, - supports_check_mode=True - ) - - # if the user is working with this module in only check mode we do not - # want to make any changes to the environment, just return the current - # state with no modifications - if module.check_mode: - module.exit_json(**result) - - - # TSP LOGIC: - changed = False # we start with changed false - - if module.params["clear_tasks"]: - # stop all running tasks and clear the queue - changed = clear_task_spooler() - - - if module.params["cmd"]: # run a command - - if not all(module.params[v] is not None for v in ["cmd_label", "cmd_working_dir", "cmd_stdout_file", "cmd_stderr_file"]): - module.fail_json(msg="for adding a cmd: label, working_dir, stdout_file, stderr_file are required", **result) - - - # get current tsp state - tasks = get_tasks(return_pid=False) - - labels = [d["label"] for d in tasks] - - cmd_label = module.params["cmd_label"] - - # cmd with same label already in task spooler => don't add - if cmd_label is not None and cmd_label in labels: - changed = changed # or False -> here nothing changed - - else: # add cmd to task spooler queue - - # build cmd that changes working directory and outputs stdout to file, stderr to file and everything to stdout - cmd = f"cd {module.params['cmd_working_dir']} && (({module.params['cmd']}) | tee {module.params['cmd_stdout_file']}) 3>&1 1>&2 2>&3 | tee {module.params['cmd_stderr_file']}" - - if 
cmd_label is not None: - tsp_cmd = ["tsp", "-L", cmd_label, "/bin/sh", "-c", cmd] - else: - tsp_cmd = ["tsp", "/bin/sh", "-c", cmd] - - # enqueue command - subprocess.run(tsp_cmd, capture_output=True, text=True, check=True) - - changed = True - - - # remove task from task spooler queue - if module.params['remove_task_id']: - subprocess.run(["tsp", "-r", module.params['remove_task_id']], capture_output=True, text=True) - - changed = True - elif module.params['remove_task_label']: - - tasks = get_tasks(return_pid=False) - - for task in tasks: - if task["label"] == module.params['remove_task_label']: - subprocess.run(["tsp", "-r", task["id"]], capture_output=True, text=True) - changed = True - - # mark whether module changed - result['changed'] = changed - - - # in the event of a successful module execution, you will want to - # simple AnsibleModule.exit_json(), passing the key/value results - module.exit_json(**result) - -def clear_task_spooler(): - - # clear all done - subprocess.run(["tsp", "-C"], capture_output=True, check=True) - - tasks = get_tasks(return_pid=True) - - if len(tasks) == 0: - return False # changed - - # make single slot - subprocess.run(["tsp", "-S", "1"], capture_output=True, check=True) - - # add a dummy task that just waits for one minute (when this is running, then we know that in between cmds, the task is not done because it takes 1 min) - cp = subprocess.run(["tsp", "-L", "DUMMY", "sleep", "60"], capture_output=True, check=True) - - # make the dummy task urgent (next in line) - subprocess.run(["tsp", "-u"], capture_output=True, check=True) - - for task in tasks: - if task["state"] == "running": - # kill the running processes - os.killpg(int(task['pid']), signal.SIGTERM) - - # now the dummy task is running - def get_dummy_task_pid(): - tasks = get_tasks(return_pid=True) - dummy_task_pid = None - for task in tasks: - if task["state"] == "running": - if task["label"] != "DUMMY" or dummy_task_pid is not None: - raise ValueError(f"unexpected 
running task (only single dummy task should run): {task}") - dummy_task_pid = task["pid"] - - if dummy_task_pid is None: - raise ValueError("running dummy task not found") - - return dummy_task_pid - - try: - dummy_task_pid = get_dummy_task_pid() - except ValueError as e: - time.sleep(3) # add some slack for process to finish - # try again - dummy_task_pid = get_dummy_task_pid() - - # clear the task spooler (remove all jobs in queue) - subprocess.run(["tsp", "-K"], capture_output=True, check=True) - - # finally also kill the dummy task by pid - os.killpg(int(dummy_task_pid) , signal.SIGTERM) - - return True # changed - - -def main(): - run_module() - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/src/roles/suite-scheduler-remove/tasks/bsub.yml b/src/roles/suite-scheduler-remove/tasks/bsub.yml index 6047aad4..f39829d5 100644 --- a/src/roles/suite-scheduler-remove/tasks/bsub.yml +++ b/src/roles/suite-scheduler-remove/tasks/bsub.yml @@ -4,7 +4,7 @@ - assert: that: - - job_id_to_wait_for is defined + - exp_job_ids_completing is defined - exp_name is defined - name: Noop diff --git a/src/roles/suite-scheduler-remove/tasks/slurm.yml b/src/roles/suite-scheduler-remove/tasks/slurm.yml index 5802f138..ba013663 100644 --- a/src/roles/suite-scheduler-remove/tasks/slurm.yml +++ b/src/roles/suite-scheduler-remove/tasks/slurm.yml @@ -2,7 +2,7 @@ - assert: that: - - job_id_to_wait_for is defined + - exp_job_ids_completing is defined - exp_name is defined - name: Noop (slurm queue is cleanup automatically) diff --git a/src/roles/suite-scheduler-remove/tasks/tsp.yml b/src/roles/suite-scheduler-remove/tasks/tsp.yml index 4277ce71..3c19ad77 100644 --- a/src/roles/suite-scheduler-remove/tasks/tsp.yml +++ b/src/roles/suite-scheduler-remove/tasks/tsp.yml @@ -2,12 +2,13 @@ - assert: that: - - job_id_to_wait_for is defined + - exp_job_ids_completing is defined - exp_name is defined + - name: Remove finished job with downloaded results from queue tsp: - 
remove_task_label: "{{ job_id_to_wait_for | to_json | string }}" + remove_task_labels: "{{ exp_job_ids_completing | map('to_json') | map('string') }}" delegate_to: "{{ host }}" loop: "{{ groups[exp_name] }}" loop_control: diff --git a/src/roles/suite-scheduler-status/filter_plugins/helpers.py b/src/roles/suite-scheduler-status/filter_plugins/helpers.py index 623cad5a..48a34c35 100644 --- a/src/roles/suite-scheduler-status/filter_plugins/helpers.py +++ b/src/roles/suite-scheduler-status/filter_plugins/helpers.py @@ -4,28 +4,31 @@ # from ...filter_plugins.helpers import safe_job_info_string -def tsp_job_finished(tsp_tasks, job_id): +def tsp_jobs_finished(job_ids, tsp_tasks): - """ - return: True if their is a task in tsp_tasks with the given job_id - """ - for task in tsp_tasks: + completed_jobs = [] - task_job_id = json.loads(task["label"]) + for task in tsp_tasks: - if task_job_id == job_id: - # found matching job + if task["state"] == "finished": + task_job_id = json.loads(task["label"]) - if task["state"] == "running" or task["state"] == "queued": - return False - elif task["state"] == "finished": - return True + if task_job_id in job_ids: + completed_jobs.append(task_job_id) else: - raise ValueError(f"tsp task with unknown task state = {task['state']} (task={task})") + raise ValueError(f"no matching job found in tsp: {job_ids} tsp_tasks={tsp_tasks}") + + elif task["state"] == "running" or task["state"] == "queued": + pass + else: + raise ValueError(f"tsp task with unknown task state = {task['state']} (task={task})") + + + return completed_jobs + - raise ValueError(f"no matching job found in tsp: {job_id} tsp_tasks={tsp_tasks}") def get_tsp_task_id(tsp_tasks, job_id): @@ -112,14 +115,19 @@ def safe_job_info_string(job_info): queued_or_running_labels = [bjob["label"] for bjob in bjobs] + completed_jobs = [] + for queued_job in queued_jobs: safe_id = safe_job_info_string(queued_job) if safe_id not in queued_or_running_labels: # job has finished - return queued_job 
+ completed_jobs.append(queued_job) - return '' + if len(completed_jobs) == 0: + return '' + else: + return completed_jobs class FilterModule(object): @@ -127,7 +135,7 @@ class FilterModule(object): def filters(self): return { - "tsp_job_finished": tsp_job_finished, + "tsp_jobs_finished": tsp_jobs_finished, "bsub_jobs_finished": bsub_jobs_finished, "get_tsp_task_id": get_tsp_task_id, "to_job_schedule_lst": to_job_schedule_lst, diff --git a/src/roles/suite-scheduler-status/tasks/bsub.yml b/src/roles/suite-scheduler-status/tasks/bsub.yml index 8344b0bc..bce716b0 100644 --- a/src/roles/suite-scheduler-status/tasks/bsub.yml +++ b/src/roles/suite-scheduler-status/tasks/bsub.yml @@ -2,45 +2,30 @@ # NOTE: At the moment `bsub` (LSF) is not used for scheduling tasks # but we leave the scheduler if we want to add a cloud later that uses LSF -# Since bjobs runs multiple jobs in parallel, we ignore `job_id_to_wait_for`. -# This script waits for any of the active jobs to complete and then sets job_id_to_wait_for as one of the completed jobs.
+# WARNING: Since the introduction of exp_job_ids_completing, the role is untested - assert: that: - exp_name is defined + - exp_job_ids_queued is defined + - exp_job_ids_queued | length > 0 -- name: Debug bjobs - bjobs_info: - register: bjobs_result -- name: debug bjobs - debug: - msg: "{{ bjobs_result }}" +#- name: Debug bjobs +# bjobs_info: +# register: bjobs_result +#- name: debug bjobs +# debug: +# msg: "{{ bjobs_result }}" -- name: Store job_id that system wants us to wait for - set_fact: - system_job_id: "{{ job_id_to_wait_for }}" - name: Get bjobs bjobs_info: register: bjobs_result - until: (exp_job_ids_queued + exp_job_ids_running) | bsub_jobs_finished(bjobs_result.tasks) | length + until: exp_job_ids_queued | bsub_jobs_finished(bjobs_result.tasks) | length retries: "{{ job_n_tries }}" delay: "{{ job_check_wait_time }}" -#- name: debug bjobs -# debug: -# msg: "{{ bjobs_result }}" - -# Quick hack to be compatible with what experiment-job expects: -# Set job_id_to_wait_for to the jsonified version of job_info -- set_fact: - job_id_to_wait_for: "{{ (exp_job_ids_queued + exp_job_ids_running) | bsub_jobs_finished(bjobs_result.tasks) }}" - -#- name: debug wait for -# debug: -# msg: "{{ job_id_to_wait_for }}" -- name: Override the job id to wait until finished + update state +- name: Marking jobs ids as completing set_fact: - exp_job_ids_queued: "{{ (exp_job_ids_queued + [system_job_id]) | difference([job_id_to_wait_for]) }}" - exp_job_ids_running: "{{ [job_id_to_wait_for] }}" \ No newline at end of file + exp_job_ids_completing: "{{ exp_job_ids_queued | bsub_jobs_finished(bjobs_result.tasks) }}" diff --git a/src/roles/suite-scheduler-status/tasks/slurm.yml b/src/roles/suite-scheduler-status/tasks/slurm.yml index 26ceea29..81cccda2 100644 --- a/src/roles/suite-scheduler-status/tasks/slurm.yml +++ b/src/roles/suite-scheduler-status/tasks/slurm.yml @@ -6,15 +6,14 @@ - assert: that: - exp_name is defined + - exp_job_ids_queued is defined + - exp_job_ids_queued | 
length > 0 -- name: Store job_id that system wants us to wait for - set_fact: - system_job_id: "{{ job_id_to_wait_for }}" - name: Get slurm slurm_info: - job_ids: "{{ exp_job_ids_queued + exp_job_ids_running }}" - job_id_names: "{{ (exp_job_ids_queued + exp_job_ids_running) | map('safe_job_info_string') | list }}" + job_ids: "{{ exp_job_ids_queued }}" + job_id_names: "{{ exp_job_ids_queued | map('safe_job_info_string') | list }}" register: slurm_result until: slurm_result.complete | length > 0 retries: "{{ job_n_tries }}" @@ -26,16 +25,6 @@ # TODO [nku] could raise a warning if slurm_result.error is non empty -# Quick hack to be compatible with what experiment-job expects: -# Set job_id_to_wait_for to the jsonified version of job_info -- set_fact: - job_id_to_wait_for: "{{ slurm_result.complete | first }}" - -- name: debug wait for - debug: - msg: "{{ job_id_to_wait_for }}" - -- name: Override the job id to wait until finished + update state +- name: Marking jobs ids as completing set_fact: - exp_job_ids_queued: "{{ (exp_job_ids_queued + [system_job_id]) | difference([job_id_to_wait_for]) }}" - exp_job_ids_running: "{{ [job_id_to_wait_for] }}" \ No newline at end of file + exp_job_ids_completing: "{{ slurm_result.complete }}" diff --git a/src/roles/suite-scheduler-status/tasks/tsp.yml b/src/roles/suite-scheduler-status/tasks/tsp.yml index 74565b0c..65c05727 100644 --- a/src/roles/suite-scheduler-status/tasks/tsp.yml +++ b/src/roles/suite-scheduler-status/tasks/tsp.yml @@ -2,15 +2,22 @@ - assert: that: - exp_name is defined + - exp_job_ids_queued is defined + - exp_job_ids_queued | length > 0 - name: Get status of job # Note: if the number of tries are exceeded, the task raises an error which stops this role and is caught in the parent tsp_info: register: tsp_result - until: (tsp_result.tasks | tsp_job_finished(job_id_to_wait_for)) | bool + until: exp_job_ids_queued | tsp_jobs_finished(tsp_result.tasks) | length retries: "{{ job_n_tries }}" delay: "{{ 
job_check_wait_time }}" delegate_to: "{{ host }}" loop: "{{ groups[exp_name] | intersect(groups['check_status_yes']) }}" # only check status of jobs for host_types with 'check_status' == true loop_control: - loop_var: host \ No newline at end of file + loop_var: host + + +- name: Marking jobs ids as completing + set_fact: + exp_job_ids_completing: "{{ exp_job_ids_queued | tsp_jobs_finished(tsp_result.results[-1].tasks) }}"