From e827e75f997d76077f83cbaf787fdc98ec71d1f0 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 22 Jan 2024 15:06:30 +0100 Subject: [PATCH 01/97] created remoteRunner class --- .vscode/settings.json | 4 +- kernel_tuner/runners/remote.py | 98 ++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 kernel_tuner/runners/remote.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 3a4d473dd..3089f374a 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -10,8 +10,8 @@ "editor.formatOnType": true, "editor.formatOnSave": false, "editor.codeActionsOnSave": { - "source.fixAll": true, - "source.organizeImports": true, + "source.fixAll": "explicit", + "source.organizeImports": "explicit" } }, "black-formatter.args": [ diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py new file mode 100644 index 000000000..cb4df9ed5 --- /dev/null +++ b/kernel_tuner/runners/remote.py @@ -0,0 +1,98 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray + +from kernel_tuner.core import DeviceInterface +from kernel_tuner.runners.runner import Runner +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache + + +class RemotelRunner(Runner): + + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): + #detect language and create high-level device interface + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + + self.units = self.dev.units + self.quiet = device_options.quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + self.start_time = perf_counter() + self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + + ray.init() + # Get cluster resources + cluster_resources = ray.cluster_resources() + self.num_gpus = cluster_resources.get("GPU", 0) # Default to 0 if no GPUs are found + + def get_environment(self): + return self.dev.get_environment() + + + def run(self, parameter_space, tuning_options): + + logging.debug('remote runner started for ' + self.kernel_options.kernel_name) + + results = [] + + # iterate over parameter space + for element in parameter_space: + results = [self.remote_run.remote(element, tuning_options) for _ in range(self.num_gpus)] + + return ray.get(results) + + @ray.remote(num_gpus=1) # Requesting 1 GPU for this task + def remote_run(self, element, tuning_options): + #move data to the GPU NOT SURE IF TO PUT HERE, IN SEQUENTIAL IS IN INIT + self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params['compile_time'] = 0 + params['verification_time'] = 0 + params['benchmark_time'] = 0 + else: + # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result + if not self.warmed_up: + warmup_time = perf_counter() + self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = self.dev.compile_and_benchmark(self.kernel_source, 
self.gpu_args, params, self.kernel_options, tuning_options) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * (perf_counter() - self.start_time) - warmup_time + params['strategy_time'] = self.last_strategy_time + params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) + params['timestamp'] = str(datetime.now(timezone.utc)) + self.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache + store_cache(x_int, params, tuning_options) + + return params \ No newline at end of file From deab579334c05aa168c50bda226ca9e6562d5a81 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Jan 2024 23:15:53 +0100 Subject: [PATCH 02/97] added remote actor class --- kernel_tuner/runners/remote_actor.py | 79 ++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 kernel_tuner/runners/remote_actor.py diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py new file mode 100644 index 000000000..e6ced0892 --- /dev/null +++ b/kernel_tuner/runners/remote_actor.py @@ -0,0 +1,79 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache +from kernel_tuner.core import DeviceInterface + +@ray.remote(num_gpus=1) +class RemoteActor: + def __init__(self, + units, + quiet, + kernel_source, + kernel_options, + device_options, + iterations, + observers): + + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.units = units + self.quiet = quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + self.start_time = perf_counter() + self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + + def execute(self, element, tuning_options): + #move data to the GPU NOT SURE IF TO PUT HERE, IN SEQUENTIAL IS IN INIT + self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params['compile_time'] = 0 + params['verification_time'] = 0 + params['benchmark_time'] = 0 + else: + # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result + if not self.warmed_up: + warmup_time = perf_counter() + self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = 
self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * (perf_counter() - self.start_time) - warmup_time + params['strategy_time'] = self.last_strategy_time + params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) + params['timestamp'] = str(datetime.now(timezone.utc)) + self.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache + store_cache(x_int, params, tuning_options) + + return params \ No newline at end of file From 52287b9d7b6236b74ff8b5cb6fda7d57fa38dd69 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Jan 2024 23:26:13 +0100 Subject: [PATCH 03/97] update remote runner --- kernel_tuner/runners/remote.py | 88 +++++++++------------------------- 1 file changed, 23 insertions(+), 65 deletions(-) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py index cb4df9ed5..4d81ce877 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/remote.py @@ -5,15 +5,15 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner +from kernel_tuner.runners.remote_actor import RemoteActor from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -class RemotelRunner(Runner): +class RemoteRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): #detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units self.quiet = device_options.quiet self.kernel_source = kernel_source @@ -24,75 +24,33 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_time = 0 self.kernel_options = kernel_options - ray.init() + # Initialize Ray + ray.init(ignore_reinit_error=True) + # Get cluster resources cluster_resources = ray.cluster_resources() - self.num_gpus = cluster_resources.get("GPU", 0) # Default to 0 if no GPUs are found - - def get_environment(self): + self.num_gpus = int(cluster_resources.get("GPU", 0)) # Default to 0 if no GPUs are found + + # Create RemoteActor instances + self.actors = [RemoteActor.remote(self.dev.units, + device_options.quiet, + kernel_source, + kernel_options, + device_options, + iterations, + observers) for _ in range(self.num_gpus)] + + def get_environment(self, tuning_options): return self.dev.get_environment() def run(self, parameter_space, tuning_options): + future_results = [] - logging.debug('remote runner started for ' + self.kernel_options.kernel_name) - - results = [] - - # iterate over parameter space + # Iterate over parameter space and distribute work to actors for element in parameter_space: - results = 
[self.remote_run.remote(element, tuning_options) for _ in range(self.num_gpus)] - - return ray.get(results) - - @ray.remote(num_gpus=1) # Requesting 1 GPU for this task - def remote_run(self, element, tuning_options): - #move data to the GPU NOT SURE IF TO PUT HERE, IN SEQUENTIAL IS IN INIT - self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) - - params = dict(zip(tuning_options.tune_params.keys(), element)) - - result = None - warmup_time = 0 - - # check if configuration is in the cache - x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 - else: - # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result - if not self.warmed_up: - warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - self.warmed_up = True - warmup_time = 1e3 * (perf_counter() - warmup_time) - - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - - params.update(result) - - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - - # only compute metrics on configs that have not errored - if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - params = process_metrics(params, tuning_options.metrics) - - # get the framework time by estimating based on other times - total_time = 1000 * (perf_counter() - self.start_time) - warmup_time - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) - self.start_time = perf_counter() - - if result: - # print configuration to the console - print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) - - # add configuration to cache - store_cache(x_int, params, tuning_options) + future = [actor.execute.remote(element, tuning_options) for actor in self.actors] + future_results.extend(future) - return params \ No newline at end of file + return ray.get(future_results) + \ No newline at end of file From 40cc888896ef5a7a6755f71416139824754c58da Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Jan 2024 23:27:23 +0100 Subject: [PATCH 04/97] added remote_mode function argument to tune_kernel and related remote runner selection logic --- kernel_tuner/interface.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 1d206307b..ec32bafb9 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -34,6 +34,7 @@ from kernel_tuner.integration import get_objective_defaults from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner +from kernel_tuner.runners.remote import RemoteRunner # ADDED HERE from kernel_tuner.searchspace import Searchspace try: @@ -574,6 +575,7 @@ def tune_kernel( cache=None, metrics=None, simulation_mode=False, + remote_mode=False, # ADDED HERE observers=None, objective=None, 
objective_higher_is_better=None, @@ -650,7 +652,7 @@ def tune_kernel( strategy = brute_force # select the runner for this job based on input - selected_runner = SimulationRunner if simulation_mode else SequentialRunner + selected_runner = SimulationRunner if simulation_mode else (RemoteRunner if remote_mode else SequentialRunner) # ADDED HERE tuning_options.simulated_time = 0 runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) From b14aaf01b9f1ab8c931479a21596233d4196ea9a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Jan 2024 23:28:01 +0100 Subject: [PATCH 05/97] added parallel tuning test --- test/test_parallel_tuning.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 test/test_parallel_tuning.py diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py new file mode 100644 index 000000000..f0da92dc5 --- /dev/null +++ b/test/test_parallel_tuning.py @@ -0,0 +1,40 @@ +import numpy as np +import pytest + +from kernel_tuner import tune_kernel +from kernel_tuner.backends import nvcuda +from kernel_tuner.core import KernelInstance, KernelSource +from .context import skip_if_no_pycuda + +try: + import pycuda.driver +except Exception: + pass + +@pytest.fixture +def env(): + kernel_string = """ + extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * block_size_x + threadIdx.x; + if (i 0 \ No newline at end of file From 1a55a5c8e8ba6577bbf545ccc052d9adc221deab Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 24 Jan 2024 00:04:22 +0100 Subject: [PATCH 06/97] added pool of actors --- kernel_tuner/runners/remote.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py index 4d81ce877..60db5a36c 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/remote.py @@ -2,6 +2,7 @@ from datetime import datetime, timezone from time import perf_counter import ray +from ray.util.actor_pool import ActorPool from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner @@ -39,18 +40,16 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob device_options, iterations, observers) for _ in range(self.num_gpus)] + # Create a pool of RemoteActor actors + self.actor_pool = ActorPool(self.actors) def get_environment(self, tuning_options): return self.dev.get_environment() def run(self, parameter_space, tuning_options): - future_results = [] + + results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) - # Iterate over parameter space and distribute work to actors - for element in parameter_space: - future = [actor.execute.remote(element, tuning_options) for actor in self.actors] - future_results.extend(future) - - return ray.get(future_results) + return results \ No newline at end of file From 4fef594d60f5f44e0a6da8315980e0658481528a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 24 Jan 2024 11:42:58 +0100 Subject: [PATCH 07/97] clean up remote runner and actor --- kernel_tuner/runners/remote.py | 18 ------------------ kernel_tuner/runners/remote_actor.py | 6 +++--- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py index 60db5a36c..7c858af99 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/remote.py @@ -1,37 +1,21 @@ import logging -from 
datetime import datetime, timezone -from time import perf_counter import ray from ray.util.actor_pool import ActorPool from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner from kernel_tuner.runners.remote_actor import RemoteActor -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache - class RemoteRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): #detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units - self.quiet = device_options.quiet - self.kernel_source = kernel_source - self.warmed_up = False - self.simulation_mode = False - self.start_time = perf_counter() - self.last_strategy_start_time = self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options - # Initialize Ray ray.init(ignore_reinit_error=True) - # Get cluster resources cluster_resources = ray.cluster_resources() self.num_gpus = int(cluster_resources.get("GPU", 0)) # Default to 0 if no GPUs are found - # Create RemoteActor instances self.actors = [RemoteActor.remote(self.dev.units, device_options.quiet, @@ -48,8 +32,6 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): - results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) - return results \ No newline at end of file diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py index e6ced0892..ce6a06a7e 100644 --- a/kernel_tuner/runners/remote_actor.py +++ b/kernel_tuner/runners/remote_actor.py @@ -26,11 +26,11 @@ def __init__(self, self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options - - def execute(self, element, tuning_options): - #move data to the GPU NOT SURE IF TO PUT HERE, IN SEQUENTIAL IS IN INIT + #move data to the GPU self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + def execute(self, element, tuning_options): + params = dict(zip(tuning_options.tune_params.keys(), element)) result = None From 3f3b9e60135e375963a1902bc877183b25c67960 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 30 Jan 2024 11:02:22 +0100 Subject: [PATCH 08/97] updates on remote code --- kernel_tuner/runners/remote.py | 76 +++++++++++++++++++++++----- kernel_tuner/runners/remote_actor.py | 11 ++-- test/test_parallel_tuning.py | 1 + 3 files changed, 71 insertions(+), 17 deletions(-) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py index 7c858af99..40d72c8c9 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/remote.py @@ -1,6 +1,9 @@ import logging import ray +import sys +import os from ray.util.actor_pool import ActorPool +from time import perf_counter from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner @@ -9,21 +12,32 @@ class RemoteRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): - #detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.units = self.dev.units + self.quiet = device_options.quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + self.start_time = perf_counter() + 
self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + self.observers = observers + self.iterations = iterations + self.device_options = device_options + + # Define cluster resources + self.num_gpus = get_num_devices(kernel_source.lang) + print(f"Number of GPUs in use: {self.num_gpus}", file=sys. stderr) + resources = {} + for id in range(self.num_gpus): + gpu_resource_name = f"gpu_{id}" + resources[gpu_resource_name] = 1 # Initialize Ray - ray.init(ignore_reinit_error=True) - # Get cluster resources - cluster_resources = ray.cluster_resources() - self.num_gpus = int(cluster_resources.get("GPU", 0)) # Default to 0 if no GPUs are found + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True) # Create RemoteActor instances - self.actors = [RemoteActor.remote(self.dev.units, - device_options.quiet, - kernel_source, - kernel_options, - device_options, - iterations, - observers) for _ in range(self.num_gpus)] + self.actors = [self.create_actor_on_gpu(id) for id in range(self.num_gpus)] # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) @@ -32,6 +46,42 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): + print(f"Size parameter_space: {len(parameter_space)}", file=sys. stderr) results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results - \ No newline at end of file + + def create_actor_on_gpu(self, gpu_id): + gpu_resource_name = f"gpu_{gpu_id}" + return RemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, + self.kernel_source, + self.kernel_options, + self.device_options, + self.iterations, + self.observers, + gpu_id) + +# DONT KNOW WHERE TO PUT IT YET +def get_num_devices(lang): + num_devices = 0 + if lang.upper() == "CUDA": + import pycuda.driver as cuda + cuda.init() + num_devices = cuda.Device.count() + elif lang.upper() == "CUPY": + import cupy + num_devices = cupy.cuda.runtime.getDeviceCount() + elif lang.upper() == "NVCUDA": + # NVCUDA usually refers to NVIDIA's CUDA, so you can use pycuda or a similar approach + import pycuda.driver as cuda + cuda.init() + num_devices = cuda.Device.count() + elif lang.upper() == "OPENCL": + import pyopencl as cl + num_devices = sum(len(platform.get_devices()) for platform in cl.get_platforms()) + elif lang.upper() == "HIP": + from pyhip import hip + num_devices = hip.hipGetDeviceCount() + else: + raise ValueError(f"Unsupported language: {lang}") + + return num_devices \ No newline at end of file diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py index ce6a06a7e..57624fe6f 100644 --- a/kernel_tuner/runners/remote_actor.py +++ b/kernel_tuner/runners/remote_actor.py @@ -2,22 +2,25 @@ from datetime import datetime, timezone from time import perf_counter import ray +import sys + from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache from kernel_tuner.core import DeviceInterface @ray.remote(num_gpus=1) class RemoteActor: def __init__(self, - units, quiet, kernel_source, kernel_options, device_options, iterations, - observers): + observers, + gpu_id): + self.gpu_id = gpu_id self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = units + self.units = self.dev.units self.quiet = quiet self.kernel_source = kernel_source self.warmed_up = False @@ -30,7 +33,7 @@ def 
__init__(self, self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) def execute(self, element, tuning_options): - + #print(f"GPU {self.gpu_id} started execution", file=sys. stderr) params = dict(zip(tuning_options.tune_params.keys(), element)) result = None diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py index f0da92dc5..9419d11a1 100644 --- a/test/test_parallel_tuning.py +++ b/test/test_parallel_tuning.py @@ -1,5 +1,6 @@ import numpy as np import pytest +import logging from kernel_tuner import tune_kernel from kernel_tuner.backends import nvcuda From fe5da39289ad18d0552339570a3d248b070db7c5 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 2 Apr 2024 16:07:31 +0200 Subject: [PATCH 09/97] changed naming from remote to parallel --- kernel_tuner/interface.py | 9 ++++++--- kernel_tuner/runners/{remote.py => parallel.py} | 4 ++-- test/test_parallel_tuning.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) rename kernel_tuner/runners/{remote.py => parallel.py} (94%) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index ec32bafb9..6b53e599b 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -34,7 +34,7 @@ from kernel_tuner.integration import get_objective_defaults from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.remote import RemoteRunner # ADDED HERE +from kernel_tuner.runners.parallel import ParallelRunner # ADDED HERE from kernel_tuner.searchspace import Searchspace try: @@ -58,6 +58,7 @@ pso, random_sample, simulated_annealing, + ensemble ) strategy_map = { @@ -76,6 +77,7 @@ "simulated_annealing": simulated_annealing, "firefly_algorithm": firefly_algorithm, "bayes_opt": bayes_opt, + "ensemble": ensemble } @@ -385,6 +387,7 @@ def __deepcopy__(self, _): * "pso" particle swarm optimization * "random_sample" takes a random sample of the search space * "simulated_annealing" simulated annealing strategy + * "ensemble" Ensemble Strategy Strategy-specific parameters and options are explained under strategy_options. 
@@ -575,7 +578,7 @@ def tune_kernel( cache=None, metrics=None, simulation_mode=False, - remote_mode=False, # ADDED HERE + parallel_mode=False, # ADDED HERE observers=None, objective=None, objective_higher_is_better=None, @@ -652,7 +655,7 @@ def tune_kernel( strategy = brute_force # select the runner for this job based on input - selected_runner = SimulationRunner if simulation_mode else (RemoteRunner if remote_mode else SequentialRunner) # ADDED HERE + selected_runner = SimulationRunner if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) # ADDED HERE tuning_options.simulated_time = 0 runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/parallel.py similarity index 94% rename from kernel_tuner/runners/remote.py rename to kernel_tuner/runners/parallel.py index 40d72c8c9..550fdb057 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/parallel.py @@ -9,7 +9,7 @@ from kernel_tuner.runners.runner import Runner from kernel_tuner.runners.remote_actor import RemoteActor -class RemoteRunner(Runner): +class ParallelRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) @@ -47,6 +47,7 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): print(f"Size parameter_space: {len(parameter_space)}", file=sys. stderr) + # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results @@ -71,7 +72,6 @@ def get_num_devices(lang): import cupy num_devices = cupy.cuda.runtime.getDeviceCount() elif lang.upper() == "NVCUDA": - # NVCUDA usually refers to NVIDIA's CUDA, so you can use pycuda or a similar approach import pycuda.driver as cuda cuda.init() num_devices = cuda.Device.count() diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py index 9419d11a1..9a2e6a362 100644 --- a/test/test_parallel_tuning.py +++ b/test/test_parallel_tuning.py @@ -37,5 +37,5 @@ def env(): @skip_if_no_pycuda def test_parallel_tune_kernel(env): - result, _ = tune_kernel(*env, lang="CUDA", verbose=True, remote_mode=True) + result, _ = tune_kernel(*env, lang="CUDA", verbose=True, parallel_mode=True) assert len(result) > 0 \ No newline at end of file From a43dc8491202685953227591e0c142bd89fce1b1 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:07:24 +0200 Subject: [PATCH 10/97] added get_num_devices function --- kernel_tuner/util.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 19b29c0f1..90acb77bb 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1253,3 +1253,27 @@ def cuda_error_check(error): if error != nvrtc.nvrtcResult.NVRTC_SUCCESS: _, desc = nvrtc.nvrtcGetErrorString(error) raise RuntimeError(f"NVRTC error: {desc.decode()}") + +def get_num_devices(lang): + num_devices = 0 + if lang.upper() == "CUDA": + import pycuda.driver as cuda + cuda.init() + num_devices = cuda.Device.count() + elif lang.upper() == "CUPY": + import cupy + num_devices = cupy.cuda.runtime.getDeviceCount() + elif lang.upper() == "NVCUDA": + import pycuda.driver as cuda + cuda.init() + 
num_devices = cuda.Device.count() + elif lang.upper() == "OPENCL": + import pyopencl as cl + num_devices = sum(len(platform.get_devices()) for platform in cl.get_platforms()) + elif lang.upper() == "HIP": + from pyhip import hip + num_devices = hip.hipGetDeviceCount() + else: + raise ValueError(f"Unsupported language: {lang}") + + return num_devices \ No newline at end of file From ab3aa243bc33c929827fc46c96528176838b7910 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:08:57 +0200 Subject: [PATCH 11/97] added ensemble and parallel runner related stuff --- kernel_tuner/interface.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 6b53e599b..d7a97fbe8 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -34,7 +34,7 @@ from kernel_tuner.integration import get_objective_defaults from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.parallel import ParallelRunner # ADDED HERE +from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.searchspace import Searchspace try: @@ -578,7 +578,7 @@ def tune_kernel( cache=None, metrics=None, simulation_mode=False, - parallel_mode=False, # ADDED HERE + parallel_mode=False, observers=None, objective=None, objective_higher_is_better=None, @@ -616,6 +616,8 @@ def tune_kernel( tuning_options["max_fevals"] = strategy_options["max_fevals"] if strategy_options and "time_limit" in strategy_options: tuning_options["time_limit"] = strategy_options["time_limit"] + if strategy_options and "ensemble" in strategy_options: + tuning_options["ensemble"] = strategy_options["ensemble"] logging.debug("tune_kernel called") logging.debug("kernel_options: %s", util.get_config_string(kernel_options)) @@ -655,7 +657,7 @@ def tune_kernel( strategy = brute_force # select the runner for this job based on input - selected_runner = SimulationRunner if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) # ADDED HERE + selected_runner = SimulationRunner if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) tuning_options.simulated_time = 0 runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) From e8a7228fe412bac5697e376ec8df5905dfa593d0 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:11:06 +0200 Subject: [PATCH 12/97] switched to new naming of parallel remote and some clean up --- kernel_tuner/runners/parallel.py | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 550fdb057..65a32cf37 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -7,7 +7,8 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner -from kernel_tuner.runners.remote_actor import RemoteActor +from kernel_tuner.runners.parallel_remote_actor import ParallelRemoteActor +from kernel_tuner.util import get_num_devices class ParallelRunner(Runner): @@ -46,42 +47,17 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): - print(f"Size parameter_space: {len(parameter_space)}", file=sys. stderr) + #print(f"Size parameter_space: {len(parameter_space)}", file=sys. 
stderr) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results def create_actor_on_gpu(self, gpu_id): gpu_resource_name = f"gpu_{gpu_id}" - return RemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, + return ParallelRemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers, gpu_id) - -# DONT KNOW WHERE TO PUT IT YET -def get_num_devices(lang): - num_devices = 0 - if lang.upper() == "CUDA": - import pycuda.driver as cuda - cuda.init() - num_devices = cuda.Device.count() - elif lang.upper() == "CUPY": - import cupy - num_devices = cupy.cuda.runtime.getDeviceCount() - elif lang.upper() == "NVCUDA": - import pycuda.driver as cuda - cuda.init() - num_devices = cuda.Device.count() - elif lang.upper() == "OPENCL": - import pyopencl as cl - num_devices = sum(len(platform.get_devices()) for platform in cl.get_platforms()) - elif lang.upper() == "HIP": - from pyhip import hip - num_devices = hip.hipGetDeviceCount() - else: - raise ValueError(f"Unsupported language: {lang}") - - return num_devices \ No newline at end of file From 3dd748c0a9b4bfa7f07b4a9b0bf1c0cb8770d81f Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:14:29 +0200 Subject: [PATCH 13/97] added class instances needed down the line in the execution of the ensemble strategy --- kernel_tuner/runners/sequential.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index c493a0089..bf4cd6303 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -40,6 +40,9 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options + self.device_options = device_options # needed for the ensemble strategy down the line + self.iterations = iterations # needed for the ensemble strategy down the line + self.observers = observers # needed for the ensemble strategy down the line #move data to the GPU self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments) From e743bec8e18dd85dce672cdddd266709a23311ac Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:16:13 +0200 Subject: [PATCH 14/97] changed naming due to ensemble implementation, this was the original remote_actor --- kernel_tuner/runners/parallel_remote_actor.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 kernel_tuner/runners/parallel_remote_actor.py diff --git a/kernel_tuner/runners/parallel_remote_actor.py b/kernel_tuner/runners/parallel_remote_actor.py new file mode 100644 index 000000000..e913974a7 --- /dev/null +++ b/kernel_tuner/runners/parallel_remote_actor.py @@ -0,0 +1,82 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray +import sys + +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache +from kernel_tuner.core import DeviceInterface + +@ray.remote(num_gpus=1) +class ParallelRemoteActor(): + def __init__(self, + quiet, + kernel_source, + kernel_options, + device_options, + iterations, + observers, + gpu_id): + + self.gpu_id = gpu_id + 
self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.units = self.dev.units + self.quiet = quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + self.start_time = perf_counter() + self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + #move data to the GPU + self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + + def execute(self, element, tuning_options): + #print(f"GPU {self.gpu_id} started execution", file=sys. stderr) + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params['compile_time'] = 0 + params['verification_time'] = 0 + params['benchmark_time'] = 0 + else: + # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result + if not self.warmed_up: + warmup_time = perf_counter() + self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * (perf_counter() - self.start_time) - warmup_time + params['strategy_time'] = self.last_strategy_time + params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) + params['timestamp'] = str(datetime.now(timezone.utc)) + self.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache + store_cache(x_int, params, tuning_options) + + return params \ No newline at end of file From df949d09c1832f38f8791647688ce09c9b73fccb Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:17:23 +0200 Subject: [PATCH 15/97] started ensemble implementation, very basic functionality works --- kernel_tuner/runners/remote_actor.py | 61 +++------------ kernel_tuner/strategies/ensemble.py | 106 +++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 49 deletions(-) create mode 100644 kernel_tuner/strategies/ensemble.py diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py index 57624fe6f..6332972fd 100644 --- a/kernel_tuner/runners/remote_actor.py +++ b/kernel_tuner/runners/remote_actor.py @@ -6,9 +6,11 @@ from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache from kernel_tuner.core import DeviceInterface +from kernel_tuner.runners.sequential import SequentialRunner +from 
kernel_tuner.runners.simulation import SimulationRunner @ray.remote(num_gpus=1) -class RemoteActor: +class RemoteActor(): def __init__(self, quiet, kernel_source, @@ -29,54 +31,15 @@ def __init__(self, self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options + self.device_options = device_options + self.iterations = iterations + self.observers = observers #move data to the GPU self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) - def execute(self, element, tuning_options): - #print(f"GPU {self.gpu_id} started execution", file=sys. stderr) - params = dict(zip(tuning_options.tune_params.keys(), element)) - - result = None - warmup_time = 0 - - # check if configuration is in the cache - x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 - else: - # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result - if not self.warmed_up: - warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - self.warmed_up = True - warmup_time = 1e3 * (perf_counter() - warmup_time) - - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - - params.update(result) - - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - - # only compute metrics on configs that have not errored - if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - params = process_metrics(params, tuning_options.metrics) - - # get the framework time by estimating based on other times - total_time = 1000 * (perf_counter() - self.start_time) - warmup_time - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) - self.start_time = perf_counter() - - if result: - # print configuration to the console - print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) - - # add configuration to cache - store_cache(x_int, params, tuning_options) - - return params \ No newline at end of file + def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): + selected_runner = SimulationRunner if simulation_mode else SequentialRunner + runner = selected_runner(self.kernel_source, self.kernel_options, self.device_options, + self.iterations, self.observers) + results = strategy.tune(searchspace, runner, tuning_options) + return results \ No newline at end of file diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py new file mode 100644 index 000000000..43e83348b --- /dev/null +++ b/kernel_tuner/strategies/ensemble.py @@ -0,0 +1,106 @@ +import random +import sys +import os +import ray +from ray.util.actor_pool import ActorPool + +import numpy as np + +from kernel_tuner import util +from kernel_tuner.searchspace import Searchspace +from kernel_tuner.strategies import common +from kernel_tuner.strategies.common import 
CostFunc, scale_from_params +from kernel_tuner.runners.simulation import SimulationRunner +from kernel_tuner.runners.remote_actor import RemoteActor +from kernel_tuner.util import get_num_devices + +from kernel_tuner.strategies import ( + basinhopping, + bayes_opt, + brute_force, + diff_evo, + dual_annealing, + firefly_algorithm, + genetic_algorithm, + greedy_ils, + greedy_mls, + minimize, + mls, + ordered_greedy_mls, + pso, + random_sample, + simulated_annealing, +) + +strategy_map = { + "brute_force": brute_force, + "random_sample": random_sample, + "minimize": minimize, + "basinhopping": basinhopping, + "diff_evo": diff_evo, + "genetic_algorithm": genetic_algorithm, + "greedy_mls": greedy_mls, + "ordered_greedy_mls": ordered_greedy_mls, + "greedy_ils": greedy_ils, + "dual_annealing": dual_annealing, + "mls": mls, + "pso": pso, + "simulated_annealing": simulated_annealing, + "firefly_algorithm": firefly_algorithm, + "bayes_opt": bayes_opt, +} + +def tune(searchspace: Searchspace, runner, tuning_options): + # Define cluster resources + num_gpus = get_num_devices(runner.kernel_source.lang) + print(f"Number of GPUs in use: {num_gpus}", file=sys. stderr) + resources = {} + for id in range(num_gpus): + gpu_resource_name = f"gpu_{id}" + resources[gpu_resource_name] = 1 + # Initialize Ray + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True) + # Create RemoteActor instances + actors = [create_actor_on_gpu(id, runner) for id in range(num_gpus)] + # Create a pool of RemoteActor actors + #actor_pool = ActorPool(actors) + + if "ensemble" in tuning_options: + ensemble = tuning_options["ensemble"] + else: + ensemble = ["random_sample", "random_sample", "random_sample"] # For now its just a random ensemble not based on any logic + + ensemble = [strategy_map[strategy] for strategy in ensemble] + tasks = [] + simulation_mode = True if isinstance(runner, SimulationRunner) else False + for i in range(len(ensemble)): + strategy = ensemble[i] + actor = actors[i] + task = actor.execute.remote(strategy, searchspace, tuning_options, simulation_mode) + tasks.append(task) + all_results = ray.get(tasks) + + unique_configs = set() + final_results = [] + + for strategy_results in all_results: + for new_result in strategy_results: + config_signature = tuple(new_result[param] for param in searchspace.tune_params) + + if config_signature not in unique_configs: + final_results.append(new_result) + unique_configs.add(config_signature) + + return final_results + +# ITS REPEATING CODE, SAME IN parallel.py +def create_actor_on_gpu(gpu_id, runner): + gpu_resource_name = f"gpu_{gpu_id}" + return RemoteActor.options(resources={gpu_resource_name: 1}).remote(runner.quiet, + runner.kernel_source, + runner.kernel_options, + runner.device_options, + runner.iterations, + runner.observers, + gpu_id) \ No newline at end of file From 45a1747f1929f2f1d608bff0e99c5113d790e0fd Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:17:46 +0200 Subject: [PATCH 16/97] updated tests --- test/test_ensemble_tuning.py | 42 ++++++++++++++++++++++++++++++++++++ test/test_parallel_tuning.py | 1 + 2 files changed, 43 insertions(+) create mode 100644 test/test_ensemble_tuning.py diff --git a/test/test_ensemble_tuning.py b/test/test_ensemble_tuning.py new file mode 100644 index 000000000..e5c807d43 --- /dev/null +++ b/test/test_ensemble_tuning.py @@ -0,0 +1,42 @@ +import numpy as np +import pytest +import logging +import sys + +from kernel_tuner import tune_kernel +from kernel_tuner.backends import 
nvcuda +from kernel_tuner.core import KernelInstance, KernelSource +from .context import skip_if_no_pycuda + +try: + import pycuda.driver +except Exception: + pass + +@pytest.fixture +def env(): + kernel_string = """ + extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * block_size_x + threadIdx.x; + if (i 0 \ No newline at end of file diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py index 9a2e6a362..bbe4d96b7 100644 --- a/test/test_parallel_tuning.py +++ b/test/test_parallel_tuning.py @@ -1,6 +1,7 @@ import numpy as np import pytest import logging +import sys from kernel_tuner import tune_kernel from kernel_tuner.backends import nvcuda From 5fb592785d232e59700aed8a66b45e6b58481d7c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 Apr 2024 20:25:02 +0200 Subject: [PATCH 17/97] clean up in parallel runner --- kernel_tuner/runners/parallel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 65a32cf37..0f7477652 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -7,7 +7,7 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner -from kernel_tuner.runners.parallel_remote_actor import ParallelRemoteActor +from kernel_tuner.runners.ray.parallel_remote_actor import ParallelRemoteActor from kernel_tuner.util import get_num_devices class ParallelRunner(Runner): @@ -47,7 +47,6 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): - #print(f"Size parameter_space: {len(parameter_space)}", file=sys. stderr) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results From a96ef433c58bcebdb28c9f0750d9c55042dc843c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 Apr 2024 20:26:28 +0200 Subject: [PATCH 18/97] moved to sub directory ray --- kernel_tuner/runners/parallel_remote_actor.py | 82 ------------------- kernel_tuner/runners/remote_actor.py | 45 ---------- 2 files changed, 127 deletions(-) delete mode 100644 kernel_tuner/runners/parallel_remote_actor.py delete mode 100644 kernel_tuner/runners/remote_actor.py diff --git a/kernel_tuner/runners/parallel_remote_actor.py b/kernel_tuner/runners/parallel_remote_actor.py deleted file mode 100644 index e913974a7..000000000 --- a/kernel_tuner/runners/parallel_remote_actor.py +++ /dev/null @@ -1,82 +0,0 @@ -import logging -from datetime import datetime, timezone -from time import perf_counter -import ray -import sys - -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -from kernel_tuner.core import DeviceInterface - -@ray.remote(num_gpus=1) -class ParallelRemoteActor(): - def __init__(self, - quiet, - kernel_source, - kernel_options, - device_options, - iterations, - observers, - gpu_id): - - self.gpu_id = gpu_id - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units - self.quiet = quiet - self.kernel_source = kernel_source - self.warmed_up = False - self.simulation_mode = False - self.start_time = perf_counter() - self.last_strategy_start_time = self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options - #move data to the GPU - self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) - - def execute(self, element, tuning_options): - #print(f"GPU {self.gpu_id} started execution", file=sys. 
stderr) - params = dict(zip(tuning_options.tune_params.keys(), element)) - - result = None - warmup_time = 0 - - # check if configuration is in the cache - x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 - else: - # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result - if not self.warmed_up: - warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - self.warmed_up = True - warmup_time = 1e3 * (perf_counter() - warmup_time) - - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - - params.update(result) - - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - - # only compute metrics on configs that have not errored - if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - params = process_metrics(params, tuning_options.metrics) - - # get the framework time by estimating based on other times - total_time = 1000 * (perf_counter() - self.start_time) - warmup_time - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) - self.start_time = perf_counter() - - if result: - # print configuration to the console - print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) - - # add configuration to cache - store_cache(x_int, params, tuning_options) - - return params \ No newline at end of file diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py deleted file mode 100644 index 6332972fd..000000000 --- a/kernel_tuner/runners/remote_actor.py +++ /dev/null @@ -1,45 +0,0 @@ -import logging -from datetime import datetime, timezone -from time import perf_counter -import ray -import sys - -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -from kernel_tuner.core import DeviceInterface -from kernel_tuner.runners.sequential import SequentialRunner -from kernel_tuner.runners.simulation import SimulationRunner - -@ray.remote(num_gpus=1) -class RemoteActor(): - def __init__(self, - quiet, - kernel_source, - kernel_options, - device_options, - iterations, - observers, - gpu_id): - - self.gpu_id = gpu_id - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units - self.quiet = quiet - self.kernel_source = kernel_source - self.warmed_up = False - self.simulation_mode = False - self.start_time = perf_counter() - self.last_strategy_start_time = self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options - self.device_options = device_options - self.iterations = iterations - self.observers = observers - #move data to the GPU - self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) - - def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): - selected_runner = 
SimulationRunner if simulation_mode else SequentialRunner - runner = selected_runner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers) - results = strategy.tune(searchspace, runner, tuning_options) - return results \ No newline at end of file From c831f5f7563483333cdcd74330e0bc0c4207edb6 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 Apr 2024 20:27:29 +0200 Subject: [PATCH 19/97] added subdirectory ray with all 3 actor classes --- kernel_tuner/runners/ray/cache_manager.py | 23 ++++++ .../runners/ray/parallel_remote_actor.py | 82 +++++++++++++++++++ kernel_tuner/runners/ray/remote_actor.py | 39 +++++++++ 3 files changed, 144 insertions(+) create mode 100644 kernel_tuner/runners/ray/cache_manager.py create mode 100644 kernel_tuner/runners/ray/parallel_remote_actor.py create mode 100644 kernel_tuner/runners/ray/remote_actor.py diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py new file mode 100644 index 000000000..437499352 --- /dev/null +++ b/kernel_tuner/runners/ray/cache_manager.py @@ -0,0 +1,23 @@ +import ray +import json + +from kernel_tuner.util import store_cache + +@ray.remote +class CacheManager: + def __init__(self, tuning_options): + self.tuning_options = tuning_options + + def store(self, key, params): + store_cache(key, params, self.tuning_options) + + def check_and_retrieve(self, key): + """Checks if a result exists for the given key and returns it if found.""" + if self.tuning_options.cache: + return self.tuning_options.cache.get(key, None) + else: + return None + + def get_tuning_options(self): + """Returns the current tuning options.""" + return self.tuning_options diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py new file mode 100644 index 000000000..e913974a7 --- /dev/null +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -0,0 +1,82 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray +import sys + +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache +from kernel_tuner.core import DeviceInterface + +@ray.remote(num_gpus=1) +class ParallelRemoteActor(): + def __init__(self, + quiet, + kernel_source, + kernel_options, + device_options, + iterations, + observers, + gpu_id): + + self.gpu_id = gpu_id + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.units = self.dev.units + self.quiet = quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + self.start_time = perf_counter() + self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + #move data to the GPU + self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + + def execute(self, element, tuning_options): + #print(f"GPU {self.gpu_id} started execution", file=sys. 
stderr) + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params['compile_time'] = 0 + params['verification_time'] = 0 + params['benchmark_time'] = 0 + else: + # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result + if not self.warmed_up: + warmup_time = perf_counter() + self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * (perf_counter() - self.start_time) - warmup_time + params['strategy_time'] = self.last_strategy_time + params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) + params['timestamp'] = str(datetime.now(timezone.utc)) + self.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache + store_cache(x_int, params, tuning_options) + + return params \ No newline at end of file diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py new file mode 100644 index 000000000..c092d78e7 --- /dev/null +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -0,0 +1,39 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray +import sys + +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache +from kernel_tuner.core import DeviceInterface +from kernel_tuner.runners.sequential import SequentialRunner +from kernel_tuner.runners.simulation import SimulationRunner + +@ray.remote(num_gpus=1) +class RemoteActor(): + def __init__(self, + kernel_source, + kernel_options, + device_options, + iterations, + observers, + cache_manager): + + self.kernel_source = kernel_source + self.simulation_mode = False + self.kernel_options = kernel_options + self.device_options = device_options + self.iterations = iterations + self.observers = observers + self.cache_manager = cache_manager + + def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): + if simulation_mode: + runner = SimulationRunner(self.kernel_source, self.kernel_options, self.device_options, + self.iterations, self.observers) + else: + runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, + self.iterations, self.observers, cache_manager=self.cache_manager) + results = strategy.tune(searchspace, runner, tuning_options) + return results + \ No 
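
The three files added above split the work between one shared CacheManager actor and per-GPU worker actors. A minimal sketch (not part of the patch) of the round trip a worker makes to the cache manager, using only the check_and_retrieve and store methods defined in cache_manager.py; the helper name and the measure callable are illustrative placeholders:

import ray

def cached_benchmark(cache_manager, x_int, measure):
    # ask the shared cache manager whether this configuration was measured before
    cached = ray.get(cache_manager.check_and_retrieve.remote(x_int))
    if cached is not None:
        return cached
    # otherwise benchmark locally and publish the result for the other actors
    params = measure()  # placeholder for compile_and_benchmark on this GPU
    ray.get(cache_manager.store.remote(x_int, params))
    return params

Because a Ray actor processes its method calls one at a time, routing every lookup and store through this single actor serializes concurrent updates from the GPU workers to the shared cache.
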
newline at end of file From 0cc2a6e273cccc6a602ae3ea43f87424955f0b90 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 Apr 2024 20:28:24 +0200 Subject: [PATCH 20/97] itegrated calls to cache manager functions when running in ensemble --- kernel_tuner/runners/sequential.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index bf4cd6303..0e6855ece 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -2,6 +2,7 @@ import logging from datetime import datetime, timezone from time import perf_counter +import ray from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner @@ -11,7 +12,7 @@ class SequentialRunner(Runner): """SequentialRunner is used for tuning with a single process/thread.""" - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None): """Instantiate the SequentialRunner. :param kernel_source: The kernel source @@ -43,6 +44,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.device_options = device_options # needed for the ensemble strategy down the line self.iterations = iterations # needed for the ensemble strategy down the line self.observers = observers # needed for the ensemble strategy down the line + self.cache_manager = cache_manager #move data to the GPU self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments) @@ -78,8 +80,9 @@ def run(self, parameter_space, tuning_options): # check if configuration is in the cache x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) + cache_result = self.config_in_cache(x_int, tuning_options) + if cache_result: + params.update(cache_result) params['compile_time'] = 0 params['verification_time'] = 0 params['benchmark_time'] = 0 @@ -114,9 +117,23 @@ def run(self, parameter_space, tuning_options): print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) # add configuration to cache - store_cache(x_int, params, tuning_options) + self.store_in_cache(x_int, params, tuning_options) # all visited configurations are added to results to provide a trace for optimization strategies results.append(params) return results + + def config_in_cache(self, x_int, tuning_options): + if self.cache_manager: + return ray.get(self.cache_manager.check_and_retrieve.remote(x_int)) + elif tuning_options.cache and x_int in tuning_options.cache: + return tuning_options.cache[x_int] + else: + return None + + def store_in_cache(self, x_int, params, tuning_options): + if self.cache_manager: + ray.get(self.cache_manager.store.remote(x_int, params)) + else: + store_cache(x_int, params, tuning_options) \ No newline at end of file From b816f3d2cd9ffcf321f1a0ba131140bf29b16d6a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 Apr 2024 20:28:59 +0200 Subject: [PATCH 21/97] added cache manager logic --- kernel_tuner/strategies/ensemble.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 43e83348b..203785ad8 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -11,8 +11,9 @@ from 
kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc, scale_from_params from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.remote_actor import RemoteActor +from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices +from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies import ( basinhopping, @@ -61,15 +62,14 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" ray.init(resources=resources, include_dashboard=True) + cache_manager = CacheManager.remote(tuning_options) # Create RemoteActor instances - actors = [create_actor_on_gpu(id, runner) for id in range(num_gpus)] - # Create a pool of RemoteActor actors - #actor_pool = ActorPool(actors) + actors = [create_actor_on_gpu(id, runner, cache_manager) for id in range(num_gpus)] if "ensemble" in tuning_options: ensemble = tuning_options["ensemble"] else: - ensemble = ["random_sample", "random_sample", "random_sample"] # For now its just a random ensemble not based on any logic + ensemble = ["random_sample", "random_sample", "random_sample"] ensemble = [strategy_map[strategy] for strategy in ensemble] tasks = [] @@ -80,6 +80,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): task = actor.execute.remote(strategy, searchspace, tuning_options, simulation_mode) tasks.append(task) all_results = ray.get(tasks) + tuning_options = ray.get(cache_manager.get_tuning_options.remote()) unique_configs = set() final_results = [] @@ -94,13 +95,11 @@ def tune(searchspace: Searchspace, runner, tuning_options): return final_results -# ITS REPEATING CODE, SAME IN parallel.py -def create_actor_on_gpu(gpu_id, runner): +def create_actor_on_gpu(gpu_id, runner, cache_manager): gpu_resource_name = f"gpu_{gpu_id}" - return RemoteActor.options(resources={gpu_resource_name: 1}).remote(runner.quiet, - runner.kernel_source, + return RemoteActor.options(resources={gpu_resource_name: 1}).remote(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers, - gpu_id) \ No newline at end of file + cache_manager) \ No newline at end of file From 781839ab2e83d782903f05a19aeaafbf41bcd821 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 8 Apr 2024 12:09:00 +0200 Subject: [PATCH 22/97] added instances needed for the ensemble down the line of execution --- kernel_tuner/runners/simulation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py index 27eadf073..c0d9c2d00 100644 --- a/kernel_tuner/runners/simulation.py +++ b/kernel_tuner/runners/simulation.py @@ -58,6 +58,10 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_time = 0 self.units = {} + self.device_options = device_options # needed for the ensemble strategy down the line + self.iterations = iterations # needed for the ensemble strategy down the line + self.observers = observers # needed for the ensemble strategy down the line + def get_environment(self, tuning_options): env = self.dev.get_environment() env["simulation"] = True From 9f8d212ec0590b3602b9de4d92630c5cb7f91979 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 8 Apr 2024 12:09:50 +0200 Subject: [PATCH 23/97] added strategy option to get_options function --- kernel_tuner/strategies/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 2185cb2f7..a45df191c 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -44,7 +44,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" - accepted = list(options.keys()) + ["max_fevals", "time_limit"] + accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble"] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") From d08b5d4b6d72d6d9634270a443cec748850a2d3b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 10 Apr 2024 14:29:04 +0200 Subject: [PATCH 24/97] added ignore_reinit_error to ray init --- kernel_tuner/runners/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 0f7477652..34641298d 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -36,7 +36,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob resources[gpu_resource_name] = 1 # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True) + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) # Create RemoteActor instances self.actors = [self.create_actor_on_gpu(id) for id in range(self.num_gpus)] # Create a pool of RemoteActor actors From 903c981bf3e5d13a09f15c3d49fb2ed83e1cc73c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 10 Apr 2024 14:29:35 +0200 Subject: [PATCH 25/97] added ignore_reinit_error to ray init --- kernel_tuner/strategies/ensemble.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 203785ad8..e31ac60ff 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -61,7 +61,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): resources[gpu_resource_name] = 1 # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True) + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) cache_manager = CacheManager.remote(tuning_options) # Create RemoteActor instances actors = [create_actor_on_gpu(id, runner, cache_manager) for id in range(num_gpus)] From 1a2219a38285c62cc049ae4be2694c0550f4aa77 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 10 Apr 2024 16:32:35 +0200 Subject: [PATCH 26/97] added cache manager to parallel tuning --- kernel_tuner/runners/parallel.py | 8 ++- .../runners/ray/parallel_remote_actor.py | 58 ++++--------------- kernel_tuner/strategies/brute_force.py | 9 ++- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 34641298d..2a25c2104 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -26,6 +26,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.observers = observers self.iterations = iterations self.device_options = device_options + self.cache_manager = None # Define cluster resources self.num_gpus = get_num_devices(kernel_source.lang) @@ -46,7 +47,12 @@ def get_environment(self, tuning_options): return self.dev.get_environment() - def run(self, 
parameter_space, tuning_options): + def run(self, parameter_space, tuning_options, cache_manager): + self.cache_manager = cache_manager + # Distribute the cache manager to all actors and initialize runners of actors + for actor in self.actors: + actor.set_cache_manager.remote(cache_manager) + actor.init_runner.remote() # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py index e913974a7..71b763326 100644 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -6,6 +6,7 @@ from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache from kernel_tuner.core import DeviceInterface +from kernel_tuner.runners.sequential import SequentialRunner @ray.remote(num_gpus=1) class ParallelRemoteActor(): @@ -29,54 +30,17 @@ def __init__(self, self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options - #move data to the GPU - self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + self.cache_manager = None + self.runner = None def execute(self, element, tuning_options): - #print(f"GPU {self.gpu_id} started execution", file=sys. stderr) - params = dict(zip(tuning_options.tune_params.keys(), element)) + self.runner.run(element, tuning_options, self.cache_manager) - result = None - warmup_time = 0 + def set_cache_manager(self, cache_manager): + self.cache_manager = cache_manager - # check if configuration is in the cache - x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 - else: - # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result - if not self.warmed_up: - warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - self.warmed_up = True - warmup_time = 1e3 * (perf_counter() - warmup_time) - - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - - params.update(result) - - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - - # only compute metrics on configs that have not errored - if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - params = process_metrics(params, tuning_options.metrics) - - # get the framework time by estimating based on other times - total_time = 1000 * (perf_counter() - self.start_time) - warmup_time - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) - self.start_time = perf_counter() - - if result: - # print configuration to the console - print_config_output(tuning_options.tune_params, params, 
self.quiet, tuning_options.metrics, self.units) - - # add configuration to cache - store_cache(x_int, params, tuning_options) - - return params \ No newline at end of file + def init_runner(self): + if self.cache_manager is None: + raise ValueError("Cache manager is not set.") + self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, + self.iterations, self.observers, cache_manager=self.cache_manager) \ No newline at end of file diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index a0e3f8ebe..ba3d834ad 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -1,13 +1,18 @@ """ The default strategy that iterates through the whole parameter space """ from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common +from kernel_tuner.runners.parallel import ParallelRunner +from kernel_tuner.runners.ray.cache_manager import CacheManager _options = {} def tune(searchspace: Searchspace, runner, tuning_options): - # call the runner - return runner.run(searchspace.sorted_list(), tuning_options) + if isinstance(runner, ParallelRunner): + cache_manager = CacheManager.remote(tuning_options) + return runner.run(searchspace.sorted_list(), tuning_options, cache_manager) + else: + return runner.run(searchspace.sorted_list(), tuning_options) tune.__doc__ = common.get_strategy_docstring("Brute Force", _options) From a476585a8f4a261372fe9610e8e0d0093de368c3 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 11 Apr 2024 15:10:40 +0200 Subject: [PATCH 27/97] re-assign tuning options to final version from the cache manager at the end of the parallel runner execution --- kernel_tuner/runners/parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 2a25c2104..07e3933ab 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -55,6 +55,7 @@ def run(self, parameter_space, tuning_options, cache_manager): actor.init_runner.remote() # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) + tuning_options = ray.get(cache_manager.get_tuning_options.remote()) return results def create_actor_on_gpu(self, gpu_id): From 6233e09af7f2ba92de4653e9064728d0410c081f Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 11 Apr 2024 15:13:08 +0200 Subject: [PATCH 28/97] small bug fix in execute --- kernel_tuner/runners/ray/parallel_remote_actor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py index 71b763326..81629781b 100644 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -30,11 +30,14 @@ def __init__(self, self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options + self.device_options = device_options + self.iterations = iterations + self.observers = observers self.cache_manager = None self.runner = None def execute(self, element, tuning_options): - self.runner.run(element, tuning_options, self.cache_manager) + return self.runner.run([element], tuning_options)[0] def set_cache_manager(self, cache_manager): self.cache_manager = cache_manager From 14e5f0b3ec0e606a8f1f10bc9b3ae4f2d0017607 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 16 Apr 2024 11:14:44 +0200 Subject: [PATCH 29/97] updates to run ensemble in simulation mode on CPUs --- kernel_tuner/runners/ray/remote_actor.py | 2 +- kernel_tuner/strategies/ensemble.py | 48 +++++++++++++++--------- kernel_tuner/util.py | 6 ++- 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index c092d78e7..a68b63ace 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -9,7 +9,7 @@ from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner -@ray.remote(num_gpus=1) +@ray.remote class RemoteActor(): def __init__(self, kernel_source, diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index e31ac60ff..902402e2c 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -52,28 +52,34 @@ } def tune(searchspace: Searchspace, runner, tuning_options): + simulation_mode = True if isinstance(runner, SimulationRunner) else False + if "ensemble" in tuning_options: + ensemble = tuning_options["ensemble"] + else: + ensemble = ["random_sample", "random_sample"] + # Define cluster resources - num_gpus = get_num_devices(runner.kernel_source.lang) - print(f"Number of GPUs in use: {num_gpus}", file=sys. stderr) + num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + print(f"Number of devices available: {num_devices}", file=sys. 
stderr) + if num_devices < len(ensemble): + raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({len(ensemble)})") + resources = {} - for id in range(num_gpus): - gpu_resource_name = f"gpu_{id}" - resources[gpu_resource_name] = 1 + for id in range(len(ensemble)): + device_resource_name = f"device_{id}" + resources[device_resource_name] = 1 # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) - cache_manager = CacheManager.remote(tuning_options) - # Create RemoteActor instances - actors = [create_actor_on_gpu(id, runner, cache_manager) for id in range(num_gpus)] - - if "ensemble" in tuning_options: - ensemble = tuning_options["ensemble"] + if simulation_mode: + ray.init(num_cpus=len(ensemble) + 1, include_dashboard=True, ignore_reinit_error=True) else: - ensemble = ["random_sample", "random_sample", "random_sample"] + ray.init(num_gpus=len(ensemble), num_cpus=1, include_dashboard=True, ignore_reinit_error=True) + # Create cache manager and actors + cache_manager = CacheManager.remote(tuning_options) + actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] ensemble = [strategy_map[strategy] for strategy in ensemble] tasks = [] - simulation_mode = True if isinstance(runner, SimulationRunner) else False for i in range(len(ensemble)): strategy = ensemble[i] actor = actors[i] @@ -93,11 +99,19 @@ def tune(searchspace: Searchspace, runner, tuning_options): final_results.append(new_result) unique_configs.add(config_signature) + #kill all actors and chache manager + for actor in actors: + ray.kill(actor) + ray.kill(cache_manager) + return final_results -def create_actor_on_gpu(gpu_id, runner, cache_manager): - gpu_resource_name = f"gpu_{gpu_id}" - return RemoteActor.options(resources={gpu_resource_name: 1}).remote(runner.kernel_source, +def create_actor_on_device(device_id, runner, cache_manager, simulation_mode): + if simulation_mode: + resource_options= {"num_cpus": 1} + else: + resource_options= {"num_gpus": 1} + return RemoteActor.options(**resource_options).remote(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 3392dead2..b9ecf9b3a 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1279,9 +1279,11 @@ def cuda_error_check(error): _, desc = nvrtc.nvrtcGetErrorString(error) raise RuntimeError(f"NVRTC error: {desc.decode()}") -def get_num_devices(lang): +def get_num_devices(lang, simulation_mode=False): num_devices = 0 - if lang.upper() == "CUDA": + if simulation_mode: + num_devices = os.cpu_count() + elif lang.upper() == "CUDA": import pycuda.driver as cuda cuda.init() num_devices = cuda.Device.count() From a963dac09eb3d3b48713e61a54c161bcbd22bffb Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Apr 2024 12:44:11 +0200 Subject: [PATCH 30/97] fixed problem with ray resources and stalling actors --- kernel_tuner/runners/ray/cache_manager.py | 2 +- kernel_tuner/strategies/ensemble.py | 25 +++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py index 437499352..882207f02 100644 --- a/kernel_tuner/runners/ray/cache_manager.py +++ b/kernel_tuner/runners/ray/cache_manager.py @@ -3,7 +3,7 @@ from kernel_tuner.util import store_cache -@ray.remote +@ray.remote(num_cpus=1) class 
CacheManager: def __init__(self, tuning_options): self.tuning_options = tuning_options diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 902402e2c..725414048 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -66,16 +66,14 @@ def tune(searchspace: Searchspace, runner, tuning_options): resources = {} for id in range(len(ensemble)): - device_resource_name = f"device_{id}" + device_resource_name = f"gpu_{id}" resources[device_resource_name] = 1 + resources["cache_manager_cpu"] = 1 # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" - if simulation_mode: - ray.init(num_cpus=len(ensemble) + 1, include_dashboard=True, ignore_reinit_error=True) - else: - ray.init(num_gpus=len(ensemble), num_cpus=1, include_dashboard=True, ignore_reinit_error=True) + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) # Create cache manager and actors - cache_manager = CacheManager.remote(tuning_options) + cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] ensemble = [strategy_map[strategy] for strategy in ensemble] @@ -106,14 +104,15 @@ def tune(searchspace: Searchspace, runner, tuning_options): return final_results -def create_actor_on_device(device_id, runner, cache_manager, simulation_mode): +def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): + gpu_resource_name = f"gpu_{gpu_id}" if simulation_mode: resource_options= {"num_cpus": 1} else: resource_options= {"num_gpus": 1} - return RemoteActor.options(**resource_options).remote(runner.kernel_source, - runner.kernel_options, - runner.device_options, - runner.iterations, - runner.observers, - cache_manager) \ No newline at end of file + return RemoteActor.options(**resource_options, resources={gpu_resource_name: 1}).remote(runner.kernel_source, + runner.kernel_options, + runner.device_options, + runner.iterations, + runner.observers, + cache_manager) \ No newline at end of file From c55b8704e6570e4441564b24b236cad433b7cb4e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:17:34 +0200 Subject: [PATCH 31/97] added setup_resources and new impl of costfunc (not yet tested and still have to deal with stop criterion) --- kernel_tuner/strategies/common.py | 90 ++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index e9cdfeab4..5ff3cacaf 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -6,6 +6,7 @@ from kernel_tuner import util from kernel_tuner.searchspace import Searchspace +from kernel_tuner.util import get_num_devices _docstring_template = """ Find the best performing kernel configuration in the parameter space @@ -44,7 +45,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" - accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble"] + accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", "maxiter"] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") @@ -73,6 +74,35 @@ def __call__(self, x, check_restrictions=True): 
util.check_stop_criterion(self.tuning_options) # snap values in x to nearest actual value for each parameter, unscale x if needed + configs = [self._prepare_config(cfg) for cfg in configs] + + legal_configs, illegal_results = self._get_legal_configs(configs) + results = self.runner.run(legal_configs, self.tuning_options) + self.results.extend(results) + + for result in results: + config = {key: result[key] for key in self.tuning_options.tune_params if key in result} + x_int = ",".join([str(i) for i in config]) + # append to tuning results + if x_int not in self.tuning_options.unique_results: + self.tuning_options.unique_results[x_int] = result + + # upon returning from this function control will be given back to the strategy, so reset the start time + self.runner.last_strategy_start_time = perf_counter() + + # get numerical return values, taking optimization direction into account + all_results = results + illegal_results + return_values = [] + for result in all_results: + return_value = result[self.tuning_options.objective] or sys.float_info.max + return_values.append(return_value if not self.tuning_options.objective_higher_is_better else -return_value) + + if len(return_values) == 1: + return return_values[0] + return return_values + + def _prepare_config(self, x): + """Prepare a single configuration by snapping to nearest values and/or scaling.""" if self.snap: if self.scaling: params = unscale_and_snap_to_nearest(x, self.searchspace.tune_params, self.tuning_options.eps) @@ -81,38 +111,21 @@ def __call__(self, x, check_restrictions=True): else: params = x logging.debug('params ' + str(params)) + return params + + def _get_legal_configs(self, configs) -> list: + results = [] + legal_configs = [] + for config in configs: + params_dict = dict(zip(self.searchspace.tune_params.keys(), config)) + legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) + if not legal: + params_dict[self.tuning_options.objective] = util.InvalidConfig() + results.append(params_dict) + else: + legal_configs.append(config) + return legal_configs, results - legal = True - result = {} - x_int = ",".join([str(i) for i in params]) - - # else check if this is a legal (non-restricted) configuration - if check_restrictions and self.searchspace.restrictions: - params_dict = dict(zip(self.searchspace.tune_params.keys(), params)) - legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) - if not legal: - result = params_dict - result[self.tuning_options.objective] = util.InvalidConfig() - - if legal: - # compile and benchmark this instance - res = self.runner.run([params], self.tuning_options) - result = res[0] - - # append to tuning results - if x_int not in self.tuning_options.unique_results: - self.tuning_options.unique_results[x_int] = result - - self.results.append(result) - - # upon returning from this function control will be given back to the strategy, so reset the start time - self.runner.last_strategy_start_time = perf_counter() - - # get numerical return value, taking optimization direction into account - return_value = result[self.tuning_options.objective] or sys.float_info.max - return_value = return_value if not self.tuning_options.objective_higher_is_better else -return_value - - return return_value def get_bounds_x0_eps(self): """Compute bounds, x0 (the initial guess), and eps.""" @@ -243,3 +256,16 @@ def scale_from_params(params, tune_params, eps): for i, v in enumerate(tune_params.values()): x[i] = 0.5 * eps + 
v.index(params[i])*eps return x + +def setup_resources(ensemble_size: int, simulation_mode: bool, runner): + num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + print(f"Number of devices available: {num_devices}", file=sys.stderr) + if num_devices < ensemble_size: + raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({ensemble_size})") + + resources = {} + for id in range(ensemble_size): + device_resource_name = f"gpu_{id}" + resources[device_resource_name] = 1 + resources["cache_manager_cpu"] = 1 + return resources \ No newline at end of file From d8541a07ef38ac4881ba3e4af5d9e17a58a6569e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:20:51 +0200 Subject: [PATCH 32/97] added ensemble and memetic to strategy map and import --- kernel_tuner/interface.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 1267915c5..81ae7de48 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -58,7 +58,8 @@ pso, random_sample, simulated_annealing, - ensemble + ensemble, + memetic ) strategy_map = { @@ -77,7 +78,8 @@ "simulated_annealing": simulated_annealing, "firefly_algorithm": firefly_algorithm, "bayes_opt": bayes_opt, - "ensemble": ensemble + "ensemble": ensemble, + "memetic": memetic, } From c755254a1406d07cee57f3c3acfc5dc2ba85d73a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:22:27 +0200 Subject: [PATCH 33/97] rearranged how parallel runner deals with cache manager and actor's lifecycle --- kernel_tuner/runners/parallel.py | 48 +++++++++++-------- .../runners/ray/parallel_remote_actor.py | 10 ++-- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 07e3933ab..1a2e894c6 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -12,7 +12,7 @@ class ParallelRunner(Runner): - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None, resources=None): self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) self.units = self.dev.units self.quiet = device_options.quiet @@ -26,39 +26,44 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.observers = observers self.iterations = iterations self.device_options = device_options - self.cache_manager = None + self.cache_manager = cache_manager # Define cluster resources self.num_gpus = get_num_devices(kernel_source.lang) print(f"Number of GPUs in use: {self.num_gpus}", file=sys. 
stderr) - resources = {} - for id in range(self.num_gpus): - gpu_resource_name = f"gpu_{id}" - resources[gpu_resource_name] = 1 + if resources is None: + for id in range(self.num_gpus): + gpu_resource_name = f"gpu_{id}" + resources[gpu_resource_name] = 1 # Initialize Ray - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) - # Create RemoteActor instances - self.actors = [self.create_actor_on_gpu(id) for id in range(self.num_gpus)] - # Create a pool of RemoteActor actors - self.actor_pool = ActorPool(self.actors) + if not ray.is_initialized(): + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) def get_environment(self, tuning_options): return self.dev.get_environment() - def run(self, parameter_space, tuning_options, cache_manager): - self.cache_manager = cache_manager - # Distribute the cache manager to all actors and initialize runners of actors - for actor in self.actors: - actor.set_cache_manager.remote(cache_manager) - actor.init_runner.remote() + def run(self, parameter_space, tuning_options, cache_manager=None): + if self.cache_manager is None: + if cache_manager is None: + raise ValueError("A cache manager is required for parallel execution") + self.cache_manager = cache_manager + # Create RemoteActor instances + self.actors = [self.create_actor_on_gpu(id, self.cache_manager) for id in range(self.num_gpus)] + # Create a pool of RemoteActor actors + self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) - tuning_options = ray.get(cache_manager.get_tuning_options.remote()) + new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) + tuning_options.update(new_tuning_options) + + for actor in self.actors: + ray.kill(actor) + return results - def create_actor_on_gpu(self, gpu_id): + def create_actor_on_gpu(self, gpu_id, cache_manager): gpu_resource_name = f"gpu_{gpu_id}" return ParallelRemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, self.kernel_source, @@ -66,4 +71,5 @@ def create_actor_on_gpu(self, gpu_id): self.device_options, self.iterations, self.observers, - gpu_id) + gpu_id, + cache_manager) diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py index 81629781b..051c8689c 100644 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -17,7 +17,8 @@ def __init__(self, device_options, iterations, observers, - gpu_id): + gpu_id, + cache_manager): self.gpu_id = gpu_id self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) @@ -33,15 +34,14 @@ def __init__(self, self.device_options = device_options self.iterations = iterations self.observers = observers - self.cache_manager = None + self.cache_manager = cache_manager self.runner = None def execute(self, element, tuning_options): + if self.runner is None: + self.init_runner() return self.runner.run([element], tuning_options)[0] - def set_cache_manager(self, cache_manager): - self.cache_manager = cache_manager - def init_runner(self): if self.cache_manager is None: raise ValueError("Cache manager is not set.") From a23ef94ca297656798772e27e8d7164c8e4ce42d 
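
With this refactor the ParallelRunner owns the full actor lifecycle: it builds one ParallelRemoteActor per GPU, wraps them in an ActorPool, maps execute over the parameter space, and kills the actors when the run is done. A generic sketch of that fan-out pattern, with a stand-in Worker class instead of the patch's actor classes:

import ray
from ray.util.actor_pool import ActorPool

@ray.remote
class Worker:
    def execute(self, element, tuning_options):
        # placeholder for compiling and benchmarking one configuration
        return {"config": element}

ray.init(ignore_reinit_error=True)
workers = [Worker.remote() for _ in range(4)]
pool = ActorPool(workers)
parameter_space = [(16, 1), (32, 1), (64, 2), (128, 2)]
# map_unordered hands each element to the first idle worker and yields results
# as they complete, so the result order does not match parameter_space
results = list(pool.map_unordered(lambda a, v: a.execute.remote(v, {}), parameter_space))
for w in workers:
    ray.kill(w)
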
Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:25:50 +0200 Subject: [PATCH 34/97] initial adaptions for memetic and cleaned up logic of ensemble --- kernel_tuner/strategies/ensemble.py | 83 +++++++++++++++++------------ 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 725414048..d5933750f 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -2,6 +2,8 @@ import sys import os import ray +import copy +import logging from ray.util.actor_pool import ActorPool import numpy as np @@ -9,7 +11,7 @@ from kernel_tuner import util from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common -from kernel_tuner.strategies.common import CostFunc, scale_from_params +from kernel_tuner.strategies.common import CostFunc, scale_from_params, setup_resources from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices @@ -51,29 +53,20 @@ "bayes_opt": bayes_opt, } -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): simulation_mode = True if isinstance(runner, SimulationRunner) else False if "ensemble" in tuning_options: ensemble = tuning_options["ensemble"] else: ensemble = ["random_sample", "random_sample"] - - # Define cluster resources - num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - print(f"Number of devices available: {num_devices}", file=sys. stderr) - if num_devices < len(ensemble): - raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({len(ensemble)})") - - resources = {} - for id in range(len(ensemble)): - device_resource_name = f"gpu_{id}" - resources[device_resource_name] = 1 - resources["cache_manager_cpu"] = 1 + resources = setup_resources(len(ensemble), simulation_mode, runner) # Initialize Ray - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + if not ray.is_initialized(): + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) # Create cache manager and actors - cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) + if cache_manager is None: + cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] ensemble = [strategy_map[strategy] for strategy in ensemble] @@ -81,27 +74,19 @@ def tune(searchspace: Searchspace, runner, tuning_options): for i in range(len(ensemble)): strategy = ensemble[i] actor = actors[i] - task = actor.execute.remote(strategy, searchspace, tuning_options, simulation_mode) + remote_tuning_options = setup_tuning_options(tuning_options) + task = actor.execute.remote(strategy, searchspace, remote_tuning_options, simulation_mode) tasks.append(task) all_results = ray.get(tasks) - tuning_options = ray.get(cache_manager.get_tuning_options.remote()) - - unique_configs = set() - final_results = [] - - for strategy_results in all_results: - for new_result in strategy_results: - config_signature = tuple(new_result[param] for param in searchspace.tune_params) + new_tuning_options = 
ray.get(cache_manager.get_tuning_options.remote()) + tuning_options.update(new_tuning_options) + final_results, population = process_results(all_results, searchspace) - if config_signature not in unique_configs: - final_results.append(new_result) - unique_configs.add(config_signature) - - #kill all actors and chache manager - for actor in actors: - ray.kill(actor) - ray.kill(cache_manager) + if population: # for memetic strategy + tuning_options.strategy_options["population"] = population + logging.debug(f"tuning_options.strategy_options[population]: {tuning_options.strategy_options['population']}") + clean_up(actors, cache_manager) return final_results def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): @@ -115,4 +100,32 @@ def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): runner.device_options, runner.iterations, runner.observers, - cache_manager) \ No newline at end of file + cache_manager) + +def setup_tuning_options(tuning_options): + new_tuning_options = copy.deepcopy(tuning_options) + if "candidates" in tuning_options.strategy_options: + #new_tuning_options.strategy_options.pop("candidates") + if len(tuning_options.strategy_options["candidates"]) > 0: + new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) + return new_tuning_options + +def process_results(all_results, searchspace): + unique_configs = set() + final_results = [] + population = [] # for memetic strategy + + for (strategy_results, tuning_options) in all_results: + if "candidate" in tuning_options.strategy_options: + population.append(tuning_options.strategy_options["candidate"]) + for new_result in strategy_results: + config_signature = tuple(new_result[param] for param in searchspace.tune_params) + if config_signature not in unique_configs: + final_results.append(new_result) + unique_configs.add(config_signature) + return final_results, population + +def clean_up(actors, cache_manager): + for actor in actors: + ray.kill(actor) + ray.kill(cache_manager) From 697ead0805c44fc2a623bfdc51785677963bd154 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:26:24 +0200 Subject: [PATCH 35/97] returning tuning_options for memetic logic --- kernel_tuner/runners/ray/remote_actor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index a68b63ace..61127d5be 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -35,5 +35,5 @@ def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers, cache_manager=self.cache_manager) results = strategy.tune(searchspace, runner, tuning_options) - return results + return results, tuning_options \ No newline at end of file From b247ed0d8900ebe692850c9334c2730a1ae58682 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:26:52 +0200 Subject: [PATCH 36/97] init impl of memetic strategy --- kernel_tuner/strategies/memetic.py | 118 +++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 kernel_tuner/strategies/memetic.py diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py new file mode 100644 index 000000000..5708099be --- /dev/null +++ b/kernel_tuner/strategies/memetic.py @@ -0,0 +1,118 @@ +import logging +import ray +import os 
+ +from kernel_tuner.searchspace import Searchspace +from kernel_tuner.runners.parallel import ParallelRunner +from kernel_tuner.runners.simulation import SimulationRunner +from kernel_tuner.runners.ray.cache_manager import CacheManager +from kernel_tuner.strategies.common import setup_resources + +from kernel_tuner.strategies import ( + basinhopping, + bayes_opt, + brute_force, + diff_evo, + dual_annealing, + firefly_algorithm, + genetic_algorithm, + greedy_ils, + greedy_mls, + minimize, + mls, + ordered_greedy_mls, + pso, + random_sample, + simulated_annealing, + ensemble, + memetic +) + +strategy_map = { + "brute_force": brute_force, + "random_sample": random_sample, + "minimize": minimize, + "basinhopping": basinhopping, + "diff_evo": diff_evo, + "genetic_algorithm": genetic_algorithm, + "greedy_mls": greedy_mls, + "ordered_greedy_mls": ordered_greedy_mls, + "greedy_ils": greedy_ils, + "dual_annealing": dual_annealing, + "mls": mls, + "pso": pso, + "simulated_annealing": simulated_annealing, + "firefly_algorithm": firefly_algorithm, + "bayes_opt": bayes_opt, +} + +# Pseudo code from "Memetic algorithms and memetic computing optimization: A literature review" by Ferrante Neri and Carlos Cotta +# function BasicMA (in P: Problem, in par: Parameters): +# Solution; +# begin +# pop ← Initialize(par, P); +# repeat +# newpop1 ← Cooperate(pop, par, P); +# newpop2 ← Improve(newpop1, par, P); +# pop ← Compete (pop, newpop2); +# if Converged(pop) then +# pop ← Restart(pop, par); +# end +# until TerminationCriterion(par); +# return GetNthBest(pop, 1); +# end + +ls_strategies_list = { + "greedy_mls", + "ordered_greedy_mls", + "greedy_ils", + "mls", + "hill_climbing" +} + +pop_based_strategies_list = { + "genetic_algorithm", + "differential_evolution", + "pso" +} + + +def tune(searchspace: Searchspace, runner, tuning_options): + simulation_mode = True if isinstance(runner, SimulationRunner) else False + ls_strategies = ["greedy_ils", "greedy_ils", "greedy_ils", "greedy_ils"] + pop_based_strategy = "genetic_algorithm" + iterations = 10 + + if set(ls_strategies) <= ls_strategies_list: + tuning_options["ensemble"] = ls_strategies + else: + raise ValueError("Provided local search ensemble are not all local search strategies") + + if pop_based_strategy in pop_based_strategies_list: + pop_based_strategy = strategy_map[pop_based_strategy] + else: + raise ValueError("Provided population based strategy is not a population based strategy") + + tuning_options.strategy_options["candidates"] = searchspace.get_random_sample(len(ls_strategies)) + tuning_options.strategy_options["max_fevals"] = 10 + tuning_options.strategy_options["maxiter"] = 10 + + resources = setup_resources(len(ls_strategies), simulation_mode, runner) + # Initialize Ray + if not ray.is_initialized(): + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + # Create cache manager and actors + cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) + pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, + runner.iterations, runner.observers, cache_manager=cache_manager, + resources=resources) + + for i in range(iterations): + print(f"Memetic algorithm iteration {i}") + print(f"start local search ensemble with candidates = {tuning_options.strategy_options['candidates']}") + ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager) + print(f"start pop base algo with population = 
{tuning_options.strategy_options['population']}") + results = pop_based_strategy.tune(searchspace, pop_runner, tuning_options) + + return results \ No newline at end of file From 948ab7fc0a3b4004afdd029b626bda8e2ffec854 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:27:35 +0200 Subject: [PATCH 37/97] initial adapion for memetic strategy --- kernel_tuner/strategies/genetic_algorithm.py | 54 +++++++++++++++----- kernel_tuner/strategies/greedy_ils.py | 9 ++-- 2 files changed, 46 insertions(+), 17 deletions(-) diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index c29c150b5..d202a86c8 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -7,39 +7,40 @@ from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc +from kernel_tuner.runners.parallel import ParallelRunner _options = dict( popsize=("population size", 20), maxiter=("maximum number of generations", 100), method=("crossover method to use, choose any from single_point, two_point, uniform, disruptive_uniform", "uniform"), mutation_chance=("chance to mutate is 1 in mutation_chance", 10), + population=("initial population", None), ) def tune(searchspace: Searchspace, runner, tuning_options): options = tuning_options.strategy_options - pop_size, generations, method, mutation_chance = common.get_options(options, _options) + pop_size, generations, method, mutation_chance, population = common.get_options(options, _options) crossover = supported_methods[method] best_score = 1e20 cost_func = CostFunc(searchspace, tuning_options, runner) - population = list(list(p) for p in searchspace.get_random_sample(pop_size)) + if not population: + population = list(list(p) for p in searchspace.get_random_sample(pop_size)) + else: + pop_size = len(population) for generation in range(generations): - # determine fitness of population members - weighted_population = [] - for dna in population: - try: - time = cost_func(dna, check_restrictions=False) - except util.StopCriterionReached as e: - if tuning_options.verbose: - print(e) - return cost_func.results - - weighted_population.append((dna, time)) + # Evaluate the entire population + try: + weighted_population = evaluate_population(runner, cost_func, population) + except util.StopCriterionReached as e: + if tuning_options.verbose: + print(e) + return cost_func.results # population is sorted such that better configs have higher chance of reproducing weighted_population.sort(key=lambda x: x[1]) @@ -69,7 +70,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): break # could combine old + new generation here and do a selection - + tuning_options.strategy_options["candidates"] = population # for memetic strategy return cost_func.results @@ -177,3 +178,28 @@ def disruptive_uniform_crossover(dna1, dna2): "uniform": uniform_crossover, "disruptive_uniform": disruptive_uniform_crossover, } + +def evaluate_population(runner, cost_func, population): + """ + Evaluate the population based on the type of runner. + + Parameters: + - runner: The runner (ParallelRunner or SequentialRunner) determining how to process evaluations. + - cost_func: A function capable of evaluating the population. + - population: List of individuals to be evaluated. + + Returns: + - List of tuples (dna, fitness_score) representing the population and their evaluation results. 
+ """ + print(f"population: {population}") + if isinstance(runner, ParallelRunner): + # Process the whole population at once if using a ParallelRunner + results = cost_func(population, check_restrictions=False) + return list(zip(population, results)) + else: + # Process each individual sequentially for SequentialRunner + weighted_population = [] + for dna in population: + time = cost_func(dna, check_restrictions=False) # Cost function called with a single-element list + weighted_population.append((dna, time)) + return weighted_population \ No newline at end of file diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py index a4c521746..1aa00ec51 100644 --- a/kernel_tuner/strategies/greedy_ils.py +++ b/kernel_tuner/strategies/greedy_ils.py @@ -9,7 +9,8 @@ _options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True), no_improvement=("number of evaluations to exceed without improvement before restarting", 50), - random_walk=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", 0.3)) + random_walk=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", 0.3), + candidate=("initial candidate for the search", None)) def tune(searchspace: Searchspace, runner, tuning_options): @@ -17,7 +18,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): options = tuning_options.strategy_options - neighbor, restart, no_improvement, randomwalk = common.get_options(options, _options) + neighbor, restart, no_improvement, randomwalk, candidate = common.get_options(options, _options) perm_size = int(randomwalk * dna_size) if perm_size == 0: @@ -31,7 +32,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): cost_func = CostFunc(searchspace, tuning_options, runner) #while searching - candidate = searchspace.get_random_sample(1)[0] + if not candidate: + candidate = searchspace.get_random_sample(1)[0] best_score = cost_func(candidate, check_restrictions=False) last_improvement = 0 @@ -53,6 +55,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Instead of full restart, permute the starting candidate candidate = random_walk(candidate, perm_size, no_improvement, last_improvement, searchspace) + tuning_options.strategy_options["candidate"] = candidate # for memetic strategy return cost_func.results From 9e40d4ec9d8efe620a8163355fe5ad1996b961f8 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 14:39:01 +0200 Subject: [PATCH 38/97] removed brute_force from strategy map and import --- kernel_tuner/strategies/ensemble.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index d5933750f..164599040 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -20,7 +20,6 @@ from kernel_tuner.strategies import ( basinhopping, bayes_opt, - brute_force, diff_evo, dual_annealing, firefly_algorithm, @@ -36,7 +35,6 @@ ) strategy_map = { - "brute_force": brute_force, "random_sample": random_sample, "minimize": minimize, "basinhopping": basinhopping, From 3cb428db56904a136089ab9e8980b5d40aa9896a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 14:40:02 +0200 Subject: [PATCH 39/97] fixes of new costfunc and stop criterion is checked retrospectively --- 
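
Patch 37 above already shows the strategy-side counterpart of the reworked cost function: evaluate_population hands the whole population to CostFunc in a single call when the runner is a ParallelRunner. A sketch of the two calling conventions this implies; cost_func, candidate and population are as used in greedy_ils.py and genetic_algorithm.py above, and the single-versus-batch dispatch inside CostFunc.__call__ is only partly visible in the hunks, so treat the exact interface as an assumption:

# one candidate -> one objective value (as greedy_ils uses it)
score = cost_func(candidate, check_restrictions=False)

# a list of candidates -> a list of objective values, evaluated by one
# runner.run() call (as evaluate_population in genetic_algorithm.py uses it)
scores = cost_func(population, check_restrictions=False)
weighted_population = list(zip(population, scores))
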
kernel_tuner/strategies/common.py | 43 ++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 5ff3cacaf..5ff17fea6 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -75,23 +75,16 @@ def __call__(self, x, check_restrictions=True): # snap values in x to nearest actual value for each parameter, unscale x if needed configs = [self._prepare_config(cfg) for cfg in configs] - - legal_configs, illegal_results = self._get_legal_configs(configs) - results = self.runner.run(legal_configs, self.tuning_options) - self.results.extend(results) - - for result in results: - config = {key: result[key] for key in self.tuning_options.tune_params if key in result} - x_int = ",".join([str(i) for i in config]) - # append to tuning results - if x_int not in self.tuning_options.unique_results: - self.tuning_options.unique_results[x_int] = result - - # upon returning from this function control will be given back to the strategy, so reset the start time - self.runner.last_strategy_start_time = perf_counter() + + legal_configs = configs + illegal_results = [] + if check_restrictions and self.searchspace.restrictions: + legal_configs, illegal_results = self._get_legal_configs(configs) + + final_results = self._evaluate_configs(legal_configs) if len(legal_configs) > 0 else [] # get numerical return values, taking optimization direction into account - all_results = results + illegal_results + all_results = final_results + illegal_results return_values = [] for result in all_results: return_value = result[self.tuning_options.objective] or sys.float_info.max @@ -125,7 +118,27 @@ def _get_legal_configs(self, configs) -> list: else: legal_configs.append(config) return legal_configs, results + + def _evaluate_configs(self, configs): + results = self.runner.run(configs, self.tuning_options) + self.results.extend(results) + + final_results = [] + for result in results: + config = {key: result[key] for key in self.tuning_options.tune_params if key in result} + x_int = ",".join([str(i) for i in config]) + # append to tuning results + if x_int not in self.tuning_options.unique_results: + self.tuning_options.unique_results[x_int] = result + # check if max_fevals is reached or time limit is exceeded within the the results + util.check_stop_criterion(self.tuning_options) + final_results.append(result) + + self.results.append(final_results) + # upon returning from this function control will be given back to the strategy, so reset the start time + self.runner.last_strategy_start_time = perf_counter() + return final_results def get_bounds_x0_eps(self): """Compute bounds, x0 (the initial guess), and eps.""" From 2d13fc36db0e6b78468cd1f54dad4e636a194796 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 13:42:06 +0200 Subject: [PATCH 40/97] fixed bug with tuning options cache manager --- kernel_tuner/runners/parallel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 1a2e894c6..98e6a4c63 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -30,7 +30,6 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob # Define cluster resources self.num_gpus = get_num_devices(kernel_source.lang) - print(f"Number of GPUs in use: {self.num_gpus}", file=sys. 
stderr) if resources is None: for id in range(self.num_gpus): gpu_resource_name = f"gpu_{id}" @@ -55,7 +54,7 @@ def run(self, parameter_space, tuning_options, cache_manager=None): self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) - new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) + new_tuning_options = ray.get(self.cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) for actor in self.actors: From 1a2ba539fcf38343d2a6ffea42457575aaa325b8 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 13:45:10 +0200 Subject: [PATCH 41/97] fixed some bugs for memetic algo functioning --- kernel_tuner/strategies/ensemble.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 164599040..53bedd010 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -63,7 +63,9 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): os.environ["RAY_DEDUP_LOGS"] = "0" ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) # Create cache manager and actors + kill_cache_manager = False if cache_manager is None: + kill_cache_manager = True cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] @@ -82,9 +84,8 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): if population: # for memetic strategy tuning_options.strategy_options["population"] = population - logging.debug(f"tuning_options.strategy_options[population]: {tuning_options.strategy_options['population']}") - clean_up(actors, cache_manager) + clean_up(actors, cache_manager, kill_cache_manager) return final_results def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): @@ -103,7 +104,6 @@ def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): def setup_tuning_options(tuning_options): new_tuning_options = copy.deepcopy(tuning_options) if "candidates" in tuning_options.strategy_options: - #new_tuning_options.strategy_options.pop("candidates") if len(tuning_options.strategy_options["candidates"]) > 0: new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) return new_tuning_options @@ -117,13 +117,14 @@ def process_results(all_results, searchspace): if "candidate" in tuning_options.strategy_options: population.append(tuning_options.strategy_options["candidate"]) for new_result in strategy_results: - config_signature = tuple(new_result[param] for param in searchspace.tune_params) + config_signature = tuple(new_result[key] for key in searchspace.tune_params) if config_signature not in unique_configs: final_results.append(new_result) unique_configs.add(config_signature) return final_results, population -def clean_up(actors, cache_manager): +def clean_up(actors, cache_manager, kill_cache_manager): for actor in actors: ray.kill(actor) - ray.kill(cache_manager) + if kill_cache_manager: + ray.kill(cache_manager) From 2aba6f506bdaf44c8590b70f8ce614758c228e98 Mon Sep 17 00:00:00 2001 From: Milo 
Lurati Date: Mon, 29 Apr 2024 13:45:58 +0200 Subject: [PATCH 42/97] removed debug prints --- kernel_tuner/strategies/genetic_algorithm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index d202a86c8..7142ac6cb 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -191,7 +191,6 @@ def evaluate_population(runner, cost_func, population): Returns: - List of tuples (dna, fitness_score) representing the population and their evaluation results. """ - print(f"population: {population}") if isinstance(runner, ParallelRunner): # Process the whole population at once if using a ParallelRunner results = cost_func(population, check_restrictions=False) From cd3f2121b775a5bc2b6695206fc2f2716b1f431e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 13:51:29 +0200 Subject: [PATCH 43/97] fixed problem with single config input and final results data structure --- kernel_tuner/strategies/common.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 5ff17fea6..8db9cf103 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -73,8 +73,8 @@ def __call__(self, x, check_restrictions=True): # check if max_fevals is reached or time limit is exceeded util.check_stop_criterion(self.tuning_options) - # snap values in x to nearest actual value for each parameter, unscale x if needed - configs = [self._prepare_config(cfg) for cfg in configs] + x_list = [x] if self._is_single_configuration(x) else x + configs = [self._prepare_config(cfg) for cfg in x_list] legal_configs = configs illegal_results = [] @@ -82,18 +82,32 @@ def __call__(self, x, check_restrictions=True): legal_configs, illegal_results = self._get_legal_configs(configs) final_results = self._evaluate_configs(legal_configs) if len(legal_configs) > 0 else [] - # get numerical return values, taking optimization direction into account all_results = final_results + illegal_results return_values = [] for result in all_results: return_value = result[self.tuning_options.objective] or sys.float_info.max return_values.append(return_value if not self.tuning_options.objective_higher_is_better else -return_value) - + if len(return_values) == 1: return return_values[0] return return_values + def _is_single_configuration(self, x): + # Check if x is an int or float + if isinstance(x, (int, float)): + return True + + # Check if x is a numpy array with only floats or ints + if isinstance(x, np.ndarray): + return x.dtype.kind in 'if' # Checks for data type being integer ('i') or float ('f') + + # Check if x is a list or tuple and all elements are int or float + if isinstance(x, (list, tuple)): + return all(isinstance(item, (int, float)) for item in x) + + return False + def _prepare_config(self, x): """Prepare a single configuration by snapping to nearest values and/or scaling.""" if self.snap: @@ -103,10 +117,9 @@ def _prepare_config(self, x): params = snap_to_nearest_config(x, self.searchspace.tune_params) else: params = x - logging.debug('params ' + str(params)) return params - def _get_legal_configs(self, configs) -> list: + def _get_legal_configs(self, configs): results = [] legal_configs = [] for config in configs: @@ -121,11 +134,10 @@ def _get_legal_configs(self, configs) -> list: def _evaluate_configs(self, configs): results = 
self.runner.run(configs, self.tuning_options) - self.results.extend(results) final_results = [] for result in results: - config = {key: result[key] for key in self.tuning_options.tune_params if key in result} + config = tuple(result[key] for key in self.tuning_options.tune_params if key in result) x_int = ",".join([str(i) for i in config]) # append to tuning results if x_int not in self.tuning_options.unique_results: @@ -134,7 +146,7 @@ def _evaluate_configs(self, configs): util.check_stop_criterion(self.tuning_options) final_results.append(result) - self.results.append(final_results) + self.results.extend(final_results) # upon returning from this function control will be given back to the strategy, so reset the start time self.runner.last_strategy_start_time = perf_counter() @@ -272,7 +284,6 @@ def scale_from_params(params, tune_params, eps): def setup_resources(ensemble_size: int, simulation_mode: bool, runner): num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - print(f"Number of devices available: {num_devices}", file=sys.stderr) if num_devices < ensemble_size: raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({ensemble_size})") From af9bd5e17eebe2c16f249847850ac1b5a735ebf3 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 13:52:11 +0200 Subject: [PATCH 44/97] added progress prints of memetic algo and kill statement for cache manager --- kernel_tuner/strategies/memetic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 5708099be..6a679a913 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -1,10 +1,12 @@ import logging import ray import os +import sys from kernel_tuner.searchspace import Searchspace from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.runners.simulation import SimulationRunner +from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import setup_resources @@ -109,10 +111,12 @@ def tune(searchspace: Searchspace, runner, tuning_options): resources=resources) for i in range(iterations): - print(f"Memetic algorithm iteration {i}") - print(f"start local search ensemble with candidates = {tuning_options.strategy_options['candidates']}") + print(f"Memetic iteration: {i}", file=sys.stderr) + print(f"Candidates local search: {tuning_options.strategy_options['candidates']}", file=sys.stderr) ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager) - print(f"start pop base algo with population = {tuning_options.strategy_options['population']}") + print(f"Population pop based: {tuning_options.strategy_options['population']}", file=sys.stderr) results = pop_based_strategy.tune(searchspace, pop_runner, tuning_options) + ray.kill(cache_manager) + return results \ No newline at end of file From d382f05be6c7e00a6ec966f67a778176be4af447 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 14:13:29 +0200 Subject: [PATCH 45/97] sort results for retrospective stop criterion check --- kernel_tuner/strategies/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 8db9cf103..d50863f8f 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ 
-134,6 +134,9 @@ def _get_legal_configs(self, configs): def _evaluate_configs(self, configs): results = self.runner.run(configs, self.tuning_options) + # sort based on timestamp, needed because of parallel tuning of populations and restrospective stop criterion check + if "timestamp" in results[0]: + results.sort(key=lambda x: x['timestamp']) final_results = [] for result in results: @@ -142,7 +145,7 @@ def _evaluate_configs(self, configs): # append to tuning results if x_int not in self.tuning_options.unique_results: self.tuning_options.unique_results[x_int] = result - # check if max_fevals is reached or time limit is exceeded within the the results + # check restrospectively if max_fevals is reached or time limit is exceeded within the the results util.check_stop_criterion(self.tuning_options) final_results.append(result) From 218b8acd66f225b6fd90cbcb06b5b31f3f3dec6b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 14:27:41 +0200 Subject: [PATCH 46/97] added comments --- kernel_tuner/strategies/common.py | 72 +++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 13 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index d50863f8f..2f0fe1693 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -109,7 +109,16 @@ def _is_single_configuration(self, x): return False def _prepare_config(self, x): - """Prepare a single configuration by snapping to nearest values and/or scaling.""" + """ + Prepare a single configuration by snapping to nearest values and/or scaling. + + Args: + x (list): The input configuration to be prepared. + + Returns: + list: The prepared configuration. + + """ if self.snap: if self.scaling: params = unscale_and_snap_to_nearest(x, self.searchspace.tune_params, self.tuning_options.eps) @@ -120,19 +129,41 @@ def _prepare_config(self, x): return params def _get_legal_configs(self, configs): - results = [] - legal_configs = [] - for config in configs: - params_dict = dict(zip(self.searchspace.tune_params.keys(), config)) - legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) - if not legal: - params_dict[self.tuning_options.objective] = util.InvalidConfig() - results.append(params_dict) - else: - legal_configs.append(config) - return legal_configs, results + """ + Filters and categorizes configurations into legal and illegal based on defined restrictions. + Configurations are checked against restrictions; illegal ones are modified to indicate an invalid state and + included in the results. Legal configurations are collected and returned for potential use. + + Parameters: + configs (list of tuple): Configurations to be checked, each represented as a tuple of parameter values. + + Returns: + tuple: A pair containing a list of legal configurations and a list of results with illegal configurations marked. + """ + results = [] + legal_configs = [] + for config in configs: + params_dict = dict(zip(self.searchspace.tune_params.keys(), config)) + legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) + if not legal: + params_dict[self.tuning_options.objective] = util.InvalidConfig() + results.append(params_dict) + else: + legal_configs.append(config) + return legal_configs, results def _evaluate_configs(self, configs): + """ + Evaluate and manage configurations based on tuning options. Results are sorted by timestamp to maintain + order during parallel processing. 
The function ensures no duplicates in results and checks for stop criteria + post-processing. Strategy start time is updated upon completion. + + Parameters: + configs (list): Configurations to be evaluated. + + Returns: + list of dict: Processed results of the evaluations. + """ results = self.runner.run(configs, self.tuning_options) # sort based on timestamp, needed because of parallel tuning of populations and restrospective stop criterion check if "timestamp" in results[0]: @@ -145,7 +176,7 @@ def _evaluate_configs(self, configs): # append to tuning results if x_int not in self.tuning_options.unique_results: self.tuning_options.unique_results[x_int] = result - # check restrospectively if max_fevals is reached or time limit is exceeded within the the results + # check retrospectively if max_fevals is reached or time limit is exceeded within the results util.check_stop_criterion(self.tuning_options) final_results.append(result) @@ -286,6 +317,21 @@ def scale_from_params(params, tune_params, eps): return x def setup_resources(ensemble_size: int, simulation_mode: bool, runner): + """ + Configures resources for an ensemble based on device availability and ensemble size. Checks device + availability against the required number and assigns necessary resources to each GPU and the cache manager. + + Parameters: + ensemble_size (int): Required number of devices. + simulation_mode (bool): Indicates if the simulation mode affects device availability. + runner: Provides access to device information. + + Returns: + dict: Resource allocation for GPUs and other components. + + Raises: + ValueError: If available devices are insufficient for the ensemble size. + """ num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) if num_devices < ensemble_size: raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({ensemble_size})") From 79b7a506e9ff068b54fa1b50efc8d2e57ccb6605 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 14:38:15 +0200 Subject: [PATCH 47/97] updated returning results logic in _evaluate_configs() --- kernel_tuner/strategies/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 2f0fe1693..76990b575 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -179,6 +179,8 @@ def _evaluate_configs(self, configs): # check retrospectively if max_fevals is reached or time limit is exceeded within the results util.check_stop_criterion(self.tuning_options) final_results.append(result) + # in case of stop creterion reached, save the results so far + self.results.append(result) self.results.extend(final_results) # upon returning from this function control will be given back to the strategy, so reset the start time From 88f63b4f812123e511f57ca43b786ae69c00a541 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 14:44:23 +0200 Subject: [PATCH 48/97] added comments --- kernel_tuner/strategies/common.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 76990b575..3a6e83612 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -94,18 +94,23 @@ def __call__(self, x, check_restrictions=True): return return_values def _is_single_configuration(self, x): - # Check if x is an int or float + """ + Determines if the input is a single 
configuration based on its type and composition. + + Parameters: + x: The input to check, which can be an int, float, numpy array, list, or tuple. + + Returns: + bool: True if `x` is a single configuration, which includes being a singular int or float, + a numpy array of ints or floats, or a list or tuple where all elements are ints or floats. + Otherwise, returns False. + """ if isinstance(x, (int, float)): return True - - # Check if x is a numpy array with only floats or ints if isinstance(x, np.ndarray): return x.dtype.kind in 'if' # Checks for data type being integer ('i') or float ('f') - - # Check if x is a list or tuple and all elements are int or float if isinstance(x, (list, tuple)): return all(isinstance(item, (int, float)) for item in x) - return False def _prepare_config(self, x): From a2afd1d451fd19a8d34249be2f2315378dc8545d Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 30 Apr 2024 15:54:04 +0200 Subject: [PATCH 49/97] updates to run more strategies than devices available --- kernel_tuner/runners/parallel.py | 31 +++++------ .../runners/ray/parallel_remote_actor.py | 2 - kernel_tuner/runners/ray/remote_actor.py | 7 +-- kernel_tuner/strategies/common.py | 29 ++--------- kernel_tuner/strategies/ensemble.py | 52 ++++++++++++------- kernel_tuner/strategies/memetic.py | 19 +++---- 6 files changed, 64 insertions(+), 76 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 98e6a4c63..fe06a8c2d 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -12,7 +12,7 @@ class ParallelRunner(Runner): - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None, resources=None): + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, num_gpus, cache_manager=None): self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) self.units = self.dev.units self.quiet = device_options.quiet @@ -27,17 +27,12 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.iterations = iterations self.device_options = device_options self.cache_manager = cache_manager + self.num_gpus = num_gpus - # Define cluster resources - self.num_gpus = get_num_devices(kernel_source.lang) - if resources is None: - for id in range(self.num_gpus): - gpu_resource_name = f"gpu_{id}" - resources[gpu_resource_name] = 1 # Initialize Ray if not ray.is_initialized(): os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + ray.init(include_dashboard=True, ignore_reinit_error=True) def get_environment(self, tuning_options): return self.dev.get_environment() @@ -49,7 +44,7 @@ def run(self, parameter_space, tuning_options, cache_manager=None): raise ValueError("A cache manager is required for parallel execution") self.cache_manager = cache_manager # Create RemoteActor instances - self.actors = [self.create_actor_on_gpu(id, self.cache_manager) for id in range(self.num_gpus)] + self.actors = [self.create_actor_on_gpu(self.cache_manager) for _ in range(self.num_gpus)] # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
@@ -62,13 +57,11 @@ def run(self, parameter_space, tuning_options, cache_manager=None): return results - def create_actor_on_gpu(self, gpu_id, cache_manager): - gpu_resource_name = f"gpu_{gpu_id}" - return ParallelRemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, - self.kernel_source, - self.kernel_options, - self.device_options, - self.iterations, - self.observers, - gpu_id, - cache_manager) + def create_actor_on_gpu(self, cache_manager): + return ParallelRemoteActor.remote(self.quiet, + self.kernel_source, + self.kernel_options, + self.device_options, + self.iterations, + self.observers, + cache_manager) diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py index 051c8689c..bc0d192e7 100644 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -17,10 +17,8 @@ def __init__(self, device_options, iterations, observers, - gpu_id, cache_manager): - self.gpu_id = gpu_id self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) self.units = self.dev.units self.quiet = quiet diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 61127d5be..fba5e0069 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -26,14 +26,15 @@ def __init__(self, self.iterations = iterations self.observers = observers self.cache_manager = cache_manager + self.runner = None def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): if simulation_mode: - runner = SimulationRunner(self.kernel_source, self.kernel_options, self.device_options, + self.runner = SimulationRunner(self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers) else: - runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, + self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers, cache_manager=self.cache_manager) - results = strategy.tune(searchspace, runner, tuning_options) + results = strategy.tune(searchspace, self.runner, tuning_options) return results, tuning_options \ No newline at end of file diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 3a6e83612..65db1831c 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -1,6 +1,7 @@ import logging import sys from time import perf_counter +import warnings import numpy as np @@ -323,29 +324,9 @@ def scale_from_params(params, tune_params, eps): x[i] = 0.5 * eps + v.index(params[i])*eps return x -def setup_resources(ensemble_size: int, simulation_mode: bool, runner): - """ - Configures resources for an ensemble based on device availability and ensemble size. Checks device - availability against the required number and assigns necessary resources to each GPU and the cache manager. - - Parameters: - ensemble_size (int): Required number of devices. - simulation_mode (bool): Indicates if the simulation mode affects device availability. - runner: Provides access to device information. - - Returns: - dict: Resource allocation for GPUs and other components. - - Raises: - ValueError: If available devices are insufficient for the ensemble size. 
- """ +def check_num_devices(ensemble_size: int, simulation_mode: bool, runner): + num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) if num_devices < ensemble_size: - raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({ensemble_size})") - - resources = {} - for id in range(ensemble_size): - device_resource_name = f"gpu_{id}" - resources[device_resource_name] = 1 - resources["cache_manager_cpu"] = 1 - return resources \ No newline at end of file + warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) + \ No newline at end of file diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 53bedd010..eaac68284 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -4,14 +4,13 @@ import ray import copy import logging -from ray.util.actor_pool import ActorPool import numpy as np from kernel_tuner import util from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common -from kernel_tuner.strategies.common import CostFunc, scale_from_params, setup_resources +from kernel_tuner.strategies.common import CostFunc, scale_from_params, check_num_devices from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices @@ -53,31 +52,47 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): simulation_mode = True if isinstance(runner, SimulationRunner) else False + num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + ensemble = [] if "ensemble" in tuning_options: ensemble = tuning_options["ensemble"] else: - ensemble = ["random_sample", "random_sample"] - resources = setup_resources(len(ensemble), simulation_mode, runner) + ensemble = ["greedy_ils", "greedy_ils"] + ensemble_size = len(ensemble) + # Initialize Ray if not ray.is_initialized(): + check_num_devices(ensemble_size, simulation_mode, runner) os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + ray.init(include_dashboard=True, ignore_reinit_error=True) + # Create cache manager and actors kill_cache_manager = False if cache_manager is None: kill_cache_manager = True - cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) - actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] + cache_manager = CacheManager.remote(tuning_options) + actors = [create_actor(runner, cache_manager, simulation_mode) for _ in range(ensemble_size)] + # Execute all actor with one strategy each ensemble = [strategy_map[strategy] for strategy in ensemble] - tasks = [] - for i in range(len(ensemble)): + pending_tasks = {} + for i in range(ensemble_size): strategy = ensemble[i] actor = actors[i] remote_tuning_options = setup_tuning_options(tuning_options) task = actor.execute.remote(strategy, searchspace, remote_tuning_options, simulation_mode) - tasks.append(task) - all_results = ray.get(tasks) + pending_tasks[task] = actor + + # As soon as an actor is done we need to kill it to give space to other actors + all_results = [] + while pending_tasks: + done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1) + for done_id in done_ids: + result = 
ray.get(done_id) + all_results.append(result) + actor = pending_tasks.pop(done_id) + ray.kill(actor) + new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) final_results, population = process_results(all_results, searchspace) @@ -88,18 +103,17 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): clean_up(actors, cache_manager, kill_cache_manager) return final_results -def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): - gpu_resource_name = f"gpu_{gpu_id}" +def create_actor(runner, cache_manager, simulation_mode): if simulation_mode: resource_options= {"num_cpus": 1} else: resource_options= {"num_gpus": 1} - return RemoteActor.options(**resource_options, resources={gpu_resource_name: 1}).remote(runner.kernel_source, - runner.kernel_options, - runner.device_options, - runner.iterations, - runner.observers, - cache_manager) + return RemoteActor.options(**resource_options).remote(runner.kernel_source, + runner.kernel_options, + runner.device_options, + runner.iterations, + runner.observers, + cache_manager) def setup_tuning_options(tuning_options): new_tuning_options = copy.deepcopy(tuning_options) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 6a679a913..f582a6eb7 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -8,7 +8,8 @@ from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.ray.cache_manager import CacheManager -from kernel_tuner.strategies.common import setup_resources +from kernel_tuner.strategies.common import check_num_devices +from kernel_tuner.util import get_num_devices from kernel_tuner.strategies import ( basinhopping, @@ -81,7 +82,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False - ls_strategies = ["greedy_ils", "greedy_ils", "greedy_ils", "greedy_ils"] + ls_strategies = ['greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils'] pop_based_strategy = "genetic_algorithm" iterations = 10 @@ -96,19 +97,19 @@ def tune(searchspace: Searchspace, runner, tuning_options): raise ValueError("Provided population based strategy is not a population based strategy") tuning_options.strategy_options["candidates"] = searchspace.get_random_sample(len(ls_strategies)) - tuning_options.strategy_options["max_fevals"] = 10 - tuning_options.strategy_options["maxiter"] = 10 + tuning_options.strategy_options["max_fevals"] = (100 // iterations) // 2 + tuning_options.strategy_options["maxiter"] = (100 // iterations) // 2 - resources = setup_resources(len(ls_strategies), simulation_mode, runner) # Initialize Ray if not ray.is_initialized(): + check_num_devices(len(ls_strategies), simulation_mode, runner) os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + ray.init(include_dashboard=True, ignore_reinit_error=True) + num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) # Create cache manager and actors - cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) + cache_manager = CacheManager.remote(tuning_options) pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, - runner.iterations, 
runner.observers, cache_manager=cache_manager, - resources=resources) + runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager) for i in range(iterations): print(f"Memetic iteration: {i}", file=sys.stderr) From d950b2db42daee5fe893256e0e8aa2d6907d4d5a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 3 May 2024 15:07:31 +0200 Subject: [PATCH 50/97] returning last two lists of candidates for memetic algo --- kernel_tuner/strategies/ensemble.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index eaac68284..810c6fb09 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -95,10 +95,12 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) - final_results, population = process_results(all_results, searchspace) + final_results, population, candidates = process_results(all_results, searchspace) if population: # for memetic strategy tuning_options.strategy_options["population"] = population + if candidates: # for memetic strategy + tuning_options.strategy_options["candidates"] = candidates clean_up(actors, cache_manager, kill_cache_manager) return final_results @@ -126,8 +128,11 @@ def process_results(all_results, searchspace): unique_configs = set() final_results = [] population = [] # for memetic strategy + candidates = [] # for memetic strategy for (strategy_results, tuning_options) in all_results: + if "old_candidate" in tuning_options.strategy_options: + candidates.append(tuning_options.strategy_options["old_candidate"]) if "candidate" in tuning_options.strategy_options: population.append(tuning_options.strategy_options["candidate"]) for new_result in strategy_results: @@ -135,7 +140,7 @@ def process_results(all_results, searchspace): if config_signature not in unique_configs: final_results.append(new_result) unique_configs.add(config_signature) - return final_results, population + return final_results, population, candidates def clean_up(actors, cache_manager, kill_cache_manager): for actor in actors: From 980777fb57e33c2d2a1603f72ba590e9d1ab5534 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 3 May 2024 15:09:21 +0200 Subject: [PATCH 51/97] returning last two candidates for memetic algo --- kernel_tuner/strategies/greedy_ils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py index 1aa00ec51..575b89bd2 100644 --- a/kernel_tuner/strategies/greedy_ils.py +++ b/kernel_tuner/strategies/greedy_ils.py @@ -34,12 +34,14 @@ def tune(searchspace: Searchspace, runner, tuning_options): #while searching if not candidate: candidate = searchspace.get_random_sample(1)[0] + old_candidate = candidate # for memetic strategy best_score = cost_func(candidate, check_restrictions=False) last_improvement = 0 while fevals < max_fevals: try: + old_candidate = candidate # for memetic strategy candidate = base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True) new_score = cost_func(candidate, check_restrictions=False) except util.StopCriterionReached as e: @@ -55,6 +57,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Instead of full restart, permute the starting candidate candidate = random_walk(candidate, perm_size, no_improvement, 
last_improvement, searchspace) + tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy tuning_options.strategy_options["candidate"] = candidate # for memetic strategy return cost_func.results From 95a2f0fe427230c6fee05ead46a51a72e2535d11 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 3 May 2024 15:10:02 +0200 Subject: [PATCH 52/97] returning last two populations for memetic algo --- kernel_tuner/strategies/genetic_algorithm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index 7142ac6cb..310ff820f 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -31,11 +31,12 @@ def tune(searchspace: Searchspace, runner, tuning_options): population = list(list(p) for p in searchspace.get_random_sample(pop_size)) else: pop_size = len(population) - + for generation in range(generations): # Evaluate the entire population try: + old_population = population weighted_population = evaluate_population(runner, cost_func, population) except util.StopCriterionReached as e: if tuning_options.verbose: @@ -70,6 +71,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): break # could combine old + new generation here and do a selection + tuning_options.strategy_options["population"] = old_population # for memetic strategy tuning_options.strategy_options["candidates"] = population # for memetic strategy return cost_func.results From 89c499bbee6df0722de3693371ef54c6d69317f6 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 3 May 2024 15:11:14 +0200 Subject: [PATCH 53/97] implemented adaptive local search depth logic and fix few issues, works also in simulation --- kernel_tuner/strategies/memetic.py | 135 +++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 36 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index f582a6eb7..e49d8ed55 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -2,6 +2,7 @@ import ray import os import sys +import copy from kernel_tuner.searchspace import Searchspace from kernel_tuner.runners.parallel import ParallelRunner @@ -49,22 +50,6 @@ "bayes_opt": bayes_opt, } -# Pseudo code from "Memetic algorithms and memetic computing optimization: A literature review" by Ferrante Neri and Carlos Cotta -# function BasicMA (in P: Problem, in par: Parameters): -# Solution; -# begin -# pop ← Initialize(par, P); -# repeat -# newpop1 ← Cooperate(pop, par, P); -# newpop2 ← Improve(newpop1, par, P); -# pop ← Compete (pop, newpop2); -# if Converged(pop) then -# pop ← Restart(pop, par); -# end -# until TerminationCriterion(par); -# return GetNthBest(pop, 1); -# end - ls_strategies_list = { "greedy_mls", "ordered_greedy_mls", @@ -81,43 +66,121 @@ def tune(searchspace: Searchspace, runner, tuning_options): + options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False - ls_strategies = ['greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils'] - pop_based_strategy = "genetic_algorithm" - iterations = 10 + local_search = options.get('local_search', 'greedy_ils') + global_search = options.get('global_search', "genetic_algorithm") + max_feval = options.get("max_fevals", 100) + alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) + lsd = options.get("lsd", 25) # Local 
Search Depth (LSD) + maxiter = options.get("maxiter", 3) + popsize = options.get("popsize", 10) - if set(ls_strategies) <= ls_strategies_list: - tuning_options["ensemble"] = ls_strategies + if local_search in ls_strategies_list: + tuning_options["ensemble"] = [local_search] * popsize else: raise ValueError("Provided local search ensemble are not all local search strategies") - if pop_based_strategy in pop_based_strategies_list: - pop_based_strategy = strategy_map[pop_based_strategy] + if global_search in pop_based_strategies_list: + global_search = strategy_map[global_search] else: raise ValueError("Provided population based strategy is not a population based strategy") - tuning_options.strategy_options["candidates"] = searchspace.get_random_sample(len(ls_strategies)) - tuning_options.strategy_options["max_fevals"] = (100 // iterations) // 2 - tuning_options.strategy_options["maxiter"] = (100 // iterations) // 2 + tuning_options.strategy_options["population"] = searchspace.get_random_sample(popsize) # Initialize Ray if not ray.is_initialized(): - check_num_devices(len(ls_strategies), simulation_mode, runner) + check_num_devices(popsize, simulation_mode, runner) os.environ["RAY_DEDUP_LOGS"] = "0" ray.init(include_dashboard=True, ignore_reinit_error=True) num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) # Create cache manager and actors cache_manager = CacheManager.remote(tuning_options) - pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, - runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager) + if simulation_mode: + pop_runner = runner + else: + pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, + runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager, + simulation_mode=simulation_mode) - for i in range(iterations): - print(f"Memetic iteration: {i}", file=sys.stderr) - print(f"Candidates local search: {tuning_options.strategy_options['candidates']}", file=sys.stderr) - ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager) - print(f"Population pop based: {tuning_options.strategy_options['population']}", file=sys.stderr) - results = pop_based_strategy.tune(searchspace, pop_runner, tuning_options) + all_results = [] + all_results_dict = {} + feval = 0 + while feval < max_feval: + print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) + if feval + lsd + maxiter * popsize > max_feval: + lsd = max_feval - feval - maxiter * popsize + print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) + # Global Search (GS) + print(f"DEBUG:=================Global Search=================", file=sys.stderr) + tuning_options.strategy_options["maxiter"] = maxiter + pop_start_gs = copy.deepcopy(tuning_options.strategy_options["population"]) + results = global_search.tune(searchspace, pop_runner, tuning_options) + add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) + feval += maxiter * popsize + + pop_start_gs_res = get_pop_results(pop_start_gs, all_results_dict) + pop_end_gs = copy.deepcopy(tuning_options.strategy_options["population"]) + pop_end_gs_res = get_pop_results(pop_end_gs, all_results_dict) + afi_gs = calculate_afi(pop_start_gs_res, pop_end_gs_res, maxiter, all_results_dict) + + # Local Search (LS) + print(f"DEBUG:=================Local Search=================", file=sys.stderr) + 
tuning_options.strategy_options["max_fevals"] = lsd + pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) + results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager) + add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) + feval += lsd + + pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) + pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) + pop_end_ls_res = get_pop_results(pop_end_ls, all_results_dict) + afi_ls = calculate_afi(pop_start_ls_res, pop_end_ls_res, lsd, all_results_dict) + + # Adaptive Local Search Depth (ALSD) + if lsd > 3: + if afi_ls > afi_gs: + lsd += alsd + elif afi_ls < afi_gs: + lsd -= alsd + print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) - return results \ No newline at end of file + return results + +def calculate_afi(pop_before_rs, pop_after_rs, feval, results): + delta_fitness = fitness_increment(pop_before_rs, pop_after_rs) + afi = delta_fitness / feval + print(f"DEBUG:calculate_afi afi: {afi}", file=sys.stderr) + return afi + +def fitness_increment(pop_before, pop_after): + if len(pop_before) != len(pop_after): + raise ValueError("populations must have the same size.") + + sum_before = sum(t for t in pop_before if isinstance(t, float)) + sum_after = sum(t for t in pop_after if isinstance(t, float)) + difference_sum = sum_before - sum_after + print(f"DEBUG:fitness_increment difference_sum: {difference_sum}", file=sys.stderr) + return difference_sum + +def get_pop_results(pop, results): + print(f"DEBUG:get_pop_results pop = {pop}", file=sys.stderr) + times = [] + for entry in pop: + key = ','.join(map(str, entry)) + if key in results: + time = results[key] + times.append(time) + else: + times.append(None) + + print(f"DEBUG:get_pop_results times = {times}", file=sys.stderr) + return times + +def add_to_results(all_results, all_results_dict, results, tune_params): + for result in results: + key = ",".join(str(result[param]) for param in tune_params) + all_results_dict[key] = result["time"] + all_results.append(result) From babba0b95e047b3dd6c1162c6d6f09ca7307d7ce Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 6 May 2024 15:33:40 +0200 Subject: [PATCH 54/97] modifications related to last iteration of memetic algo --- kernel_tuner/strategies/memetic.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index e49d8ed55..8f18c3076 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -73,8 +73,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): max_feval = options.get("max_fevals", 100) alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) lsd = options.get("lsd", 25) # Local Search Depth (LSD) - maxiter = options.get("maxiter", 3) - popsize = options.get("popsize", 10) + maxiter = options.get("maxiter", 2) + popsize = options.get("popsize", 20) if local_search in ls_strategies_list: tuning_options["ensemble"] = [local_search] * popsize @@ -108,9 +108,15 @@ def tune(searchspace: Searchspace, runner, tuning_options): feval = 0 while feval < max_feval: print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) - if feval + lsd + maxiter * popsize > max_feval: - lsd = max_feval - feval - maxiter * popsize + feval_left = max_feval - feval + if feval_left < lsd + maxiter * 
popsize: + maxiter = feval_left // popsize + if maxiter == 1: # It doesnt make sense to have one generation for global search, so we give all final resources to local search + maxiter = 0 + lsd = feval_left + lsd = feval_left - maxiter * popsize print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) + # Global Search (GS) print(f"DEBUG:=================Global Search=================", file=sys.stderr) tuning_options.strategy_options["maxiter"] = maxiter @@ -138,20 +144,21 @@ def tune(searchspace: Searchspace, runner, tuning_options): afi_ls = calculate_afi(pop_start_ls_res, pop_end_ls_res, lsd, all_results_dict) # Adaptive Local Search Depth (ALSD) - if lsd > 3: + if afi_gs is not None and afi_ls is not None: if afi_ls > afi_gs: lsd += alsd elif afi_ls < afi_gs: - lsd -= alsd - print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) + lsd -= alsd if lsd - alsd > 5 else 5 + print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) return results def calculate_afi(pop_before_rs, pop_after_rs, feval, results): + # Average Fitness Increment (AFI) delta_fitness = fitness_increment(pop_before_rs, pop_after_rs) - afi = delta_fitness / feval + afi = delta_fitness / feval if feval > 0 else None print(f"DEBUG:calculate_afi afi: {afi}", file=sys.stderr) return afi From e0e1e61b2db8cea9ce86f3ebfe3b1fcd433d0b9b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 6 May 2024 15:34:27 +0200 Subject: [PATCH 55/97] updates related to old popuation logic --- kernel_tuner/strategies/genetic_algorithm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index 310ff820f..b082ce3c6 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -32,6 +32,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): else: pop_size = len(population) + old_population = population for generation in range(generations): # Evaluate the entire population From 630578253194d3e5662b06811dea654045f9ed11 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:51:29 +0200 Subject: [PATCH 56/97] unified two actors into one --- .../runners/ray/parallel_remote_actor.py | 47 ------------------- kernel_tuner/runners/ray/remote_actor.py | 43 ++++++++++------- 2 files changed, 27 insertions(+), 63 deletions(-) delete mode 100644 kernel_tuner/runners/ray/parallel_remote_actor.py diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py deleted file mode 100644 index bc0d192e7..000000000 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ /dev/null @@ -1,47 +0,0 @@ -import logging -from datetime import datetime, timezone -from time import perf_counter -import ray -import sys - -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -from kernel_tuner.core import DeviceInterface -from kernel_tuner.runners.sequential import SequentialRunner - -@ray.remote(num_gpus=1) -class ParallelRemoteActor(): - def __init__(self, - quiet, - kernel_source, - kernel_options, - device_options, - iterations, - observers, - cache_manager): - - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units - self.quiet = quiet - self.kernel_source = kernel_source - self.warmed_up = False - self.simulation_mode = False - 
self.start_time = perf_counter() - self.last_strategy_start_time = self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options - self.device_options = device_options - self.iterations = iterations - self.observers = observers - self.cache_manager = cache_manager - self.runner = None - - def execute(self, element, tuning_options): - if self.runner is None: - self.init_runner() - return self.runner.run([element], tuning_options)[0] - - def init_runner(self): - if self.cache_manager is None: - raise ValueError("Cache manager is not set.") - self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers, cache_manager=self.cache_manager) \ No newline at end of file diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index fba5e0069..3eceb4414 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -1,11 +1,5 @@ -import logging -from datetime import datetime, timezone -from time import perf_counter import ray -import sys -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner @@ -17,24 +11,41 @@ def __init__(self, device_options, iterations, observers, - cache_manager): - + cache_manager=None, + simulation_mode=False): self.kernel_source = kernel_source - self.simulation_mode = False self.kernel_options = kernel_options self.device_options = device_options self.iterations = iterations self.observers = observers self.cache_manager = cache_manager + self.simulation_mode = simulation_mode self.runner = None + + def execute(self, tuning_options, strategy=None, searchspace=None, element=None): + if self.runner is None: + self.init_runner() + if strategy and searchspace: + results = strategy.tune(searchspace, self.runner, tuning_options) + return results, tuning_options + elif element: + return self.runner.run([element], tuning_options)[0] + else: + raise ValueError("Invalid arguments for ray actor's execute method.") - def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): - if simulation_mode: + def set_cache_manager(self, cache_manager): + if self.cache_manager is None: + self.cache_manager = cache_manager + + def get_cache_magaer(self): + return self.cache_manager + + def init_runner(self): + if self.cache_manager is None: + raise ValueError("Cache manager is not set.") + if self.simulation_mode: self.runner = SimulationRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers) + self.iterations, self.observers) else: self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers, cache_manager=self.cache_manager) - results = strategy.tune(searchspace, self.runner, tuning_options) - return results, tuning_options - \ No newline at end of file + self.iterations, self.observers, cache_manager=self.cache_manager) From 0f2b7e4190833b39b65996f8ea23eb818c28836e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:52:55 +0200 Subject: [PATCH 57/97] updates related to actors unification and memetic algo development --- kernel_tuner/runners/parallel.py | 45 +++++++++++++++----------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git 
a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index fe06a8c2d..76f27c619 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -7,14 +7,16 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner -from kernel_tuner.runners.ray.parallel_remote_actor import ParallelRemoteActor +from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices +from kernel_tuner.runners.ray.cache_manager import CacheManager +from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray class ParallelRunner(Runner): - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, num_gpus, cache_manager=None): - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, + num_gpus=None, cache_manager=None, actors=None, simulation_mode=False): + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if not simulation_mode else None self.quiet = device_options.quiet self.kernel_source = kernel_source self.warmed_up = False @@ -28,11 +30,17 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.device_options = device_options self.cache_manager = cache_manager self.num_gpus = num_gpus + self.actors = actors - # Initialize Ray - if not ray.is_initialized(): - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(include_dashboard=True, ignore_reinit_error=True) + if num_gpus is None: + self.num_gpus = get_num_devices(kernel_source.lang, simulation_mode=self.simulation_mode) + + initialize_ray(num_gpus) + + # Create RemoteActor instances + if actors is None: + runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] + self.actors = [create_actor_on_device(*runner_attributes, self.cache_manager, simulation_mode, id) for id in range(self.num_gpus)] def get_environment(self, tuning_options): return self.dev.get_environment() @@ -41,27 +49,16 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options, cache_manager=None): if self.cache_manager is None: if cache_manager is None: - raise ValueError("A cache manager is required for parallel execution") + cache_manager = CacheManager.remote(tuning_options) self.cache_manager = cache_manager - # Create RemoteActor instances - self.actors = [self.create_actor_on_gpu(self.cache_manager) for _ in range(self.num_gpus)] + # set the cache manager for each actor. Can't be done in constructor because we do not have yet the tuning_options + for actor in self.actors: + ray.get(actor.set_cache_manager.remote(self.cache_manager)) # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
- results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) + results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(element=v, tuning_options=tuning_options), parameter_space)) new_tuning_options = ray.get(self.cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) - - for actor in self.actors: - ray.kill(actor) return results - - def create_actor_on_gpu(self, cache_manager): - return ParallelRemoteActor.remote(self.quiet, - self.kernel_source, - self.kernel_options, - self.device_options, - self.iterations, - self.observers, - cache_manager) From 63ddedb5ba8b364e60cad24244913a712d33b97b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:53:39 +0200 Subject: [PATCH 58/97] added create_actor_on_device and initialize_ray --- kernel_tuner/strategies/common.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 65db1831c..787750825 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -2,12 +2,14 @@ import sys from time import perf_counter import warnings +import ray import numpy as np from kernel_tuner import util from kernel_tuner.searchspace import Searchspace from kernel_tuner.util import get_num_devices +from kernel_tuner.runners.ray.remote_actor import RemoteActor _docstring_template = """ Find the best performing kernel configuration in the parameter space @@ -46,7 +48,8 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" - accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", "maxiter"] + accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", + "maxiter", "lsd", "popsize", "alsd", ] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") @@ -329,4 +332,28 @@ def check_num_devices(ensemble_size: int, simulation_mode: bool, runner): num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) - \ No newline at end of file + +def create_actor_on_device(kernel_source, kernel_options, device_options, iterations, observers, cache_manager, simulation_mode, id): + # Check if Ray is initialized, raise an error if not + if not ray.is_initialized(): + raise RuntimeError("Ray is not initialized. 
Initialize Ray before creating an actor (remember to include resources).") + + if simulation_mode: + resource_options = {"num_cpus": 1} + else: + resource_options = {"num_gpus": 1} + + # Create the actor with the specified options and resources + return RemoteActor.options(**resource_options).remote(kernel_source, + kernel_options, + device_options, + iterations, + observers, + cache_manager=cache_manager, + simulation_mode=simulation_mode) + +def initialize_ray(): + # Initialize Ray + if not ray.is_initialized(): + ray.init(include_dashboard=True, ignore_reinit_error=True) + From d7fe9b40a919cd6b3f8afbe588ac260a4eb4b393 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:55:45 +0200 Subject: [PATCH 59/97] updates realted to unification of actors, memetic algo, and reutilization of actors for performance --- kernel_tuner/strategies/ensemble.py | 59 ++++++++++++----------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 810c6fb09..f678f7b74 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -4,13 +4,15 @@ import ray import copy import logging +import warnings +from collections import deque import numpy as np from kernel_tuner import util from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common -from kernel_tuner.strategies.common import CostFunc, scale_from_params, check_num_devices +from kernel_tuner.strategies.common import CostFunc, scale_from_params, check_num_devices, create_actor_on_device, initialize_ray from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices @@ -50,40 +52,41 @@ "bayes_opt": bayes_opt, } -def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): +def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None): simulation_mode = True if isinstance(runner, SimulationRunner) else False num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + print(f"DEBUG: num_devices={num_devices}", file=sys.stderr) + ensemble = [] if "ensemble" in tuning_options: ensemble = tuning_options["ensemble"] else: ensemble = ["greedy_ils", "greedy_ils"] ensemble_size = len(ensemble) + if num_devices < ensemble_size: + warnings.warn("Number of devices is less than the number of strategies in the ensemble. 
Some strategies will wait until devices are available.", UserWarning) + num_actors = num_devices if ensemble_size > num_devices else ensemble_size - # Initialize Ray - if not ray.is_initialized(): - check_num_devices(ensemble_size, simulation_mode, runner) - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(include_dashboard=True, ignore_reinit_error=True) + initialize_ray(num_devices) # Create cache manager and actors - kill_cache_manager = False if cache_manager is None: - kill_cache_manager = True cache_manager = CacheManager.remote(tuning_options) - actors = [create_actor(runner, cache_manager, simulation_mode) for _ in range(ensemble_size)] + if actors is None: + runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] + actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] # Execute all actor with one strategy each ensemble = [strategy_map[strategy] for strategy in ensemble] + ensemble_queue = deque(ensemble) pending_tasks = {} - for i in range(ensemble_size): - strategy = ensemble[i] - actor = actors[i] + for actor in actors: + strategy = ensemble_queue.popleft() remote_tuning_options = setup_tuning_options(tuning_options) - task = actor.execute.remote(strategy, searchspace, remote_tuning_options, simulation_mode) + task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor - # As soon as an actor is done we need to kill it to give space to other actors + all_results = [] while pending_tasks: done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1) @@ -91,7 +94,12 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): result = ray.get(done_id) all_results.append(result) actor = pending_tasks.pop(done_id) - ray.kill(actor) + + if ensemble_queue: + strategy = ensemble_queue.popleft() + remote_tuning_options = setup_tuning_options(tuning_options) + task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) + pending_tasks[task] = actor new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) @@ -102,21 +110,8 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): if candidates: # for memetic strategy tuning_options.strategy_options["candidates"] = candidates - clean_up(actors, cache_manager, kill_cache_manager) return final_results -def create_actor(runner, cache_manager, simulation_mode): - if simulation_mode: - resource_options= {"num_cpus": 1} - else: - resource_options= {"num_gpus": 1} - return RemoteActor.options(**resource_options).remote(runner.kernel_source, - runner.kernel_options, - runner.device_options, - runner.iterations, - runner.observers, - cache_manager) - def setup_tuning_options(tuning_options): new_tuning_options = copy.deepcopy(tuning_options) if "candidates" in tuning_options.strategy_options: @@ -141,9 +136,3 @@ def process_results(all_results, searchspace): final_results.append(new_result) unique_configs.add(config_signature) return final_results, population, candidates - -def clean_up(actors, cache_manager, kill_cache_manager): - for actor in actors: - ray.kill(actor) - if kill_cache_manager: - ray.kill(cache_manager) From 46fcde17613a9d2529468ece9b0f51a043784eb1 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:56:37 +0200 Subject: [PATCH 60/97] returning 80% of cpus 
for simulation mode in get_num_devices --- kernel_tuner/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index b9ecf9b3a..434feb6ff 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1282,7 +1282,7 @@ def cuda_error_check(error): def get_num_devices(lang, simulation_mode=False): num_devices = 0 if simulation_mode: - num_devices = os.cpu_count() + num_devices = int(round(os.cpu_count() * 0.8)) # keep resources for the main process and other tasks elif lang.upper() == "CUDA": import pycuda.driver as cuda cuda.init() From d54384808fcebc130d268bc90537179c35160141 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:57:18 +0200 Subject: [PATCH 61/97] updates realted to actor unification and reutilization of actors for performance --- kernel_tuner/strategies/memetic.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 8f18c3076..c8d18887c 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -9,8 +9,9 @@ from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.ray.cache_manager import CacheManager -from kernel_tuner.strategies.common import check_num_devices +from kernel_tuner.strategies.common import check_num_devices, create_actor_on_device, initialize_ray from kernel_tuner.util import get_num_devices +from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.strategies import ( basinhopping, @@ -70,7 +71,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False local_search = options.get('local_search', 'greedy_ils') global_search = options.get('global_search', "genetic_algorithm") - max_feval = options.get("max_fevals", 100) + max_feval = options.get("max_fevals", 500) alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) lsd = options.get("lsd", 25) # Local Search Depth (LSD) maxiter = options.get("maxiter", 2) @@ -88,20 +89,17 @@ def tune(searchspace: Searchspace, runner, tuning_options): tuning_options.strategy_options["population"] = searchspace.get_random_sample(popsize) - # Initialize Ray - if not ray.is_initialized(): - check_num_devices(popsize, simulation_mode, runner) - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(include_dashboard=True, ignore_reinit_error=True) num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - # Create cache manager and actors + check_num_devices(num_gpus, simulation_mode, runner) + initialize_ray(num_gpus) + # Create cache manager, actors and parallel runner cache_manager = CacheManager.remote(tuning_options) - if simulation_mode: - pop_runner = runner - else: - pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, + num_actors = num_gpus if num_gpus < popsize else popsize + runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] + actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] + pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager, - 
simulation_mode=simulation_mode) + simulation_mode=simulation_mode, actors=actors) all_results = [] all_results_dict = {} @@ -134,7 +132,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): print(f"DEBUG:=================Local Search=================", file=sys.stderr) tuning_options.strategy_options["max_fevals"] = lsd pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) - results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager) + results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += lsd @@ -152,6 +150,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) + for actor in actors: + ray.kill(actor) return results From 15df6ea9b72f4eee09d4ec1cf0f0107cf037cb43 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 15:46:46 +0200 Subject: [PATCH 62/97] updates on feval counting and distributing --- kernel_tuner/strategies/memetic.py | 48 +++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index c8d18887c..6ecd779e4 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -71,7 +71,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False local_search = options.get('local_search', 'greedy_ils') global_search = options.get('global_search', "genetic_algorithm") - max_feval = options.get("max_fevals", 500) + max_feval = options.get("max_fevals", 2000) alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) lsd = options.get("lsd", 25) # Local Search Depth (LSD) maxiter = options.get("maxiter", 2) @@ -91,7 +91,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) check_num_devices(num_gpus, simulation_mode, runner) - initialize_ray(num_gpus) + initialize_ray() # Create cache manager, actors and parallel runner cache_manager = CacheManager.remote(tuning_options) num_actors = num_gpus if num_gpus < popsize else popsize @@ -104,15 +104,10 @@ def tune(searchspace: Searchspace, runner, tuning_options): all_results = [] all_results_dict = {} feval = 0 + afi_gs, afi_ls = None, None while feval < max_feval: print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) - feval_left = max_feval - feval - if feval_left < lsd + maxiter * popsize: - maxiter = feval_left // popsize - if maxiter == 1: # It doesnt make sense to have one generation for global search, so we give all final resources to local search - maxiter = 0 - lsd = feval_left - lsd = feval_left - maxiter * popsize + maxiter, lsd = distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls) print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) # Global Search (GS) @@ -134,7 +129,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, 
tuning_options.tune_params) - feval += lsd + feval += lsd * popsize pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) @@ -147,7 +142,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): lsd += alsd elif afi_ls < afi_gs: lsd -= alsd if lsd - alsd > 5 else 5 - print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) + print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) for actor in actors: @@ -191,3 +186,34 @@ def add_to_results(all_results, all_results_dict, results, tune_params): key = ",".join(str(result[param]) for param in tune_params) all_results_dict[key] = result["time"] all_results.append(result) + +def distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls): + remaining_feval = max_feval - feval + if remaining_feval < (lsd + maxiter) * popsize: + # Calculate how many full batches of popsize can still be processed + proportion = remaining_feval // popsize + + if afi_gs is None or afi_ls is None: + maxiter = int(proportion * 0.5) + lsd = int(proportion * 0.5) + else: + if afi_gs > afi_ls: + # More evaluations to maxiter + maxiter = int(proportion * 0.6) + lsd = int(proportion * 0.4) + else: + # More evaluations to lsd + maxiter = int(proportion * 0.4) + lsd = int(proportion * 0.6) + + # If maxiter ends up being 1, assign all remaining feval to lsd + if maxiter == 1: + lsd = proportion # Give all available batches to lsd + maxiter = 0 + + # Ensure at least one of maxiter or lsd is non-zero if there are still fevals to be used + if maxiter == 0 and lsd == 0 and remaining_feval > 0: + lsd = 1 # Allocate at least one batch to lsd to ensure progress + + return maxiter, lsd + \ No newline at end of file From ec719a209d9ad3ad43dc6cb8fc7e7ca99578db7c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 10 May 2024 17:37:36 +0200 Subject: [PATCH 63/97] added logic for time limit stop --- kernel_tuner/strategies/memetic.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 6ecd779e4..1800bb0b6 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -10,7 +10,7 @@ from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import check_num_devices, create_actor_on_device, initialize_ray -from kernel_tuner.util import get_num_devices +from kernel_tuner.util import get_num_devices, check_stop_criterion, StopCriterionReached from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.strategies import ( @@ -71,11 +71,11 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False local_search = options.get('local_search', 'greedy_ils') global_search = options.get('global_search', "genetic_algorithm") - max_feval = options.get("max_fevals", 2000) alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) lsd = options.get("lsd", 25) # Local Search Depth (LSD) maxiter = options.get("maxiter", 2) popsize = options.get("popsize", 20) + max_feval = options.get("max_fevals", None if 'time_limit' in options else 2000) if local_search in ls_strategies_list: tuning_options["ensemble"] = [local_search] * popsize @@ -105,9 +105,10 @@ def tune(searchspace: 
Searchspace, runner, tuning_options): all_results_dict = {} feval = 0 afi_gs, afi_ls = None, None - while feval < max_feval: + while (max_feval is None) or feval < max_feval: print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) - maxiter, lsd = distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls) + if max_feval is not None: + maxiter, lsd = distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls) print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) # Global Search (GS) @@ -117,6 +118,12 @@ def tune(searchspace: Searchspace, runner, tuning_options): results = global_search.tune(searchspace, pop_runner, tuning_options) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += maxiter * popsize + try: + check_stop_criterion(tuning_options) + except StopCriterionReached as e: + if tuning_options.verbose: + print(e) + break pop_start_gs_res = get_pop_results(pop_start_gs, all_results_dict) pop_end_gs = copy.deepcopy(tuning_options.strategy_options["population"]) @@ -130,6 +137,13 @@ def tune(searchspace: Searchspace, runner, tuning_options): results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += lsd * popsize + try: + print(f"DEBUG: check for sto criterion in memetic algo", file=sys.stderr) + check_stop_criterion(tuning_options) + except StopCriterionReached as e: + if tuning_options.verbose: + print(e) + break pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) @@ -148,12 +162,13 @@ def tune(searchspace: Searchspace, runner, tuning_options): for actor in actors: ray.kill(actor) - return results + return all_results def calculate_afi(pop_before_rs, pop_after_rs, feval, results): # Average Fitness Increment (AFI) + assert(feval >= 0) delta_fitness = fitness_increment(pop_before_rs, pop_after_rs) - afi = delta_fitness / feval if feval > 0 else None + afi = delta_fitness / feval if feval > 0 else 0 print(f"DEBUG:calculate_afi afi: {afi}", file=sys.stderr) return afi @@ -182,6 +197,8 @@ def get_pop_results(pop, results): return times def add_to_results(all_results, all_results_dict, results, tune_params): + print(f"DEBUG:add_to_results results size = {len(results)}", file=sys.stderr) + print(f"DEBUG:add_to_results all_results size = {len(all_results)}", file=sys.stderr) for result in results: key = ",".join(str(result[param]) for param in tune_params) all_results_dict[key] = result["time"] From 6c2a62b7e3db0c8c0455939be1894edbd4ebbd39 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 10 May 2024 17:40:22 +0200 Subject: [PATCH 64/97] debug prints clean up --- kernel_tuner/strategies/memetic.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 1800bb0b6..3f2922f4b 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -138,7 +138,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += lsd * popsize try: - print(f"DEBUG: check for sto criterion in memetic algo", file=sys.stderr) check_stop_criterion(tuning_options) except StopCriterionReached as e: if tuning_options.verbose: @@ -179,7 +178,6 @@ def 
fitness_increment(pop_before, pop_after): sum_before = sum(t for t in pop_before if isinstance(t, float)) sum_after = sum(t for t in pop_after if isinstance(t, float)) difference_sum = sum_before - sum_after - print(f"DEBUG:fitness_increment difference_sum: {difference_sum}", file=sys.stderr) return difference_sum def get_pop_results(pop, results): @@ -197,8 +195,6 @@ def get_pop_results(pop, results): return times def add_to_results(all_results, all_results_dict, results, tune_params): - print(f"DEBUG:add_to_results results size = {len(results)}", file=sys.stderr) - print(f"DEBUG:add_to_results all_results size = {len(all_results)}", file=sys.stderr) for result in results: key = ",".join(str(result[param]) for param in tune_params) all_results_dict[key] = result["time"] From c7fd2af656f331177b4be55dae534b0b32f58faa Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 10 May 2024 17:41:47 +0200 Subject: [PATCH 65/97] unified parallel tuning and parallel ensemble logic in ParallelRunner --- kernel_tuner/runners/parallel.py | 127 +++++++++++++++++++++-- kernel_tuner/runners/ray/remote_actor.py | 5 +- kernel_tuner/strategies/ensemble.py | 76 ++------------ 3 files changed, 129 insertions(+), 79 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 76f27c619..e6cdce2ab 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -4,11 +4,13 @@ import os from ray.util.actor_pool import ActorPool from time import perf_counter +from collections import deque +import copy from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner from kernel_tuner.runners.ray.remote_actor import RemoteActor -from kernel_tuner.util import get_num_devices +from kernel_tuner.util import get_num_devices, get_nested_types from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray @@ -20,7 +22,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.quiet = device_options.quiet self.kernel_source = kernel_source self.warmed_up = False - self.simulation_mode = False + self.simulation_mode = simulation_mode self.start_time = perf_counter() self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 @@ -35,7 +37,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob if num_gpus is None: self.num_gpus = get_num_devices(kernel_source.lang, simulation_mode=self.simulation_mode) - initialize_ray(num_gpus) + initialize_ray() # Create RemoteActor instances if actors is None: @@ -46,19 +48,126 @@ def get_environment(self, tuning_options): return self.dev.get_environment() - def run(self, parameter_space, tuning_options, cache_manager=None): + def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchspace=None, cache_manager=None): + if tuning_options is None: #HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument + raise ValueError("tuning_options cannot be None") + if self.cache_manager is None: if cache_manager is None: cache_manager = CacheManager.remote(tuning_options) self.cache_manager = cache_manager - # set the cache manager for each actor. Can't be done in constructor because we do not have yet the tuning_options + + # set the cache manager for each actor. 
Can't be done in constructor because we do not always yet have the tuning_options for actor in self.actors: ray.get(actor.set_cache_manager.remote(self.cache_manager)) - # Create a pool of RemoteActor actors - self.actor_pool = ActorPool(self.actors) - # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. - results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(element=v, tuning_options=tuning_options), parameter_space)) + + # Determine what type of parallelism and run appropriately + if parameter_space and not ensemble and not searchspace: + results, tuning_options_list = self.run_parallel_tuning(tuning_options, parameter_space) + elif ensemble and searchspace and not parameter_space: + results, tuning_options_list = self.run_parallel_ensemble(ensemble, tuning_options, searchspace) + else: + raise ValueError("Invalid arguments to parallel runner run method") + + # Update tuning options new_tuning_options = ray.get(self.cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) + if self.simulation_mode: + tuning_options.simulated_time += self._calculate_simulated_time(tuning_options_list) + print(f"DEBUG: simulated_time = {tuning_options.simulated_time}", file=sys.stderr) return results + + def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): + """ + Runs strategies from the ensemble in parallel using distributed actors, + manages dynamic task allocation, and collects results. + """ + ensemble_queue = deque(ensemble) + pending_tasks = {} + all_results = [] + + # Start initial tasks for each actor + for actor in self.actors: + strategy = ensemble_queue.popleft() + remote_tuning_options = self._setup_tuning_options(tuning_options) + task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) + pending_tasks[task] = actor + + # Manage task completion and redistribution + while pending_tasks: + done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1) + for done_id in done_ids: + result = ray.get(done_id) + all_results.append(result) + actor = pending_tasks.pop(done_id) + + # Reassign actors if strategies remain + if ensemble_queue: + strategy = ensemble_queue.popleft() + remote_tuning_options = self._setup_tuning_options(tuning_options) + task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) + pending_tasks[task] = actor + + # Process results to extract population and candidates for further use + results, tuning_options_list, population, candidates = self._process_results_ensemble(all_results) + + # Update tuning options for memetic strategies + if population: + tuning_options.strategy_options["population"] = population + if candidates: + tuning_options.strategy_options["candidates"] = candidates + return results, tuning_options_list + + def _setup_tuning_options(self, tuning_options): + new_tuning_options = copy.deepcopy(tuning_options) + if "candidates" in tuning_options.strategy_options: + if len(tuning_options.strategy_options["candidates"]) > 0: + new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) + return new_tuning_options + + def _process_results_ensemble(self, all_results): + population = [] # for memetic strategy + candidates = [] # for memetic strategy + results = [] + tuning_options_list = [] + + for (strategy_results, tuning_options) in all_results: + if 
"old_candidate" in tuning_options.strategy_options: + candidates.append(tuning_options.strategy_options["old_candidate"]) + if "candidate" in tuning_options.strategy_options: + population.append(tuning_options.strategy_options["candidate"]) + results.extend(strategy_results) + tuning_options_list.append(tuning_options) + + return results, tuning_options_list, population, candidates + + + def run_parallel_tuning(self, tuning_options, parameter_space): + # Create a pool of RemoteActor actors + self.actor_pool = ActorPool(self.actors) + # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. + all_results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(element=v, tuning_options=tuning_options), parameter_space)) + results = [x[0] for x in all_results] + tuning_options_list = [x[1] for x in all_results] + return results, tuning_options_list + + def _process_results(self, all_results, searchspace): + unique_configs = set() + final_results = [] + + for (strategy_results, tuning_options) in all_results: + for new_result in strategy_results: + config_signature = tuple(new_result[key] for key in searchspace.tune_params) + if config_signature not in unique_configs: + final_results.append(new_result) + unique_configs.add(config_signature) + return final_results + + def _calculate_simulated_time(self, tuning_options_list): + simulated_times = [] + for tuning_options in tuning_options_list: + print(f"DEBUG:_calculate_simulated_time tuning_options.simulated_time = {tuning_options.simulated_time}", file=sys.stderr) + simulated_times.append(tuning_options.simulated_time) + #simulated_times = [tuning_options.simulated_time for tuning_options in tuning_options_list] + return max(simulated_times) \ No newline at end of file diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 3eceb4414..3956c8648 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -1,7 +1,9 @@ import ray +import sys from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner +from kernel_tuner.util import get_nested_types @ray.remote class RemoteActor(): @@ -29,7 +31,8 @@ def execute(self, tuning_options, strategy=None, searchspace=None, element=None) results = strategy.tune(searchspace, self.runner, tuning_options) return results, tuning_options elif element: - return self.runner.run([element], tuning_options)[0] + results = self.runner.run([element], tuning_options)[0] + return results, tuning_options else: raise ValueError("Invalid arguments for ray actor's execute method.") diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index f678f7b74..78fd85001 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -17,6 +17,7 @@ from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices from kernel_tuner.runners.ray.cache_manager import CacheManager +from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.strategies import ( basinhopping, @@ -55,84 +56,21 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None): simulation_mode = True if isinstance(runner, SimulationRunner) else False num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - print(f"DEBUG: 
num_devices={num_devices}", file=sys.stderr) ensemble = [] if "ensemble" in tuning_options: - ensemble = tuning_options["ensemble"] + ensemble = tuning_options.ensemble else: ensemble = ["greedy_ils", "greedy_ils"] ensemble_size = len(ensemble) if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) num_actors = num_devices if ensemble_size > num_devices else ensemble_size - - initialize_ray(num_devices) - - # Create cache manager and actors - if cache_manager is None: - cache_manager = CacheManager.remote(tuning_options) - if actors is None: - runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] - actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] - - # Execute all actor with one strategy each + ensemble = [strategy_map[strategy] for strategy in ensemble] - ensemble_queue = deque(ensemble) - pending_tasks = {} - for actor in actors: - strategy = ensemble_queue.popleft() - remote_tuning_options = setup_tuning_options(tuning_options) - task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) - pending_tasks[task] = actor + parallel_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, + runner.iterations, runner.observers, num_gpus=num_actors, cache_manager=cache_manager, + simulation_mode=simulation_mode, actors=actors) + final_results = parallel_runner.run(tuning_options=tuning_options, ensemble=ensemble, searchspace=searchspace) - - all_results = [] - while pending_tasks: - done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1) - for done_id in done_ids: - result = ray.get(done_id) - all_results.append(result) - actor = pending_tasks.pop(done_id) - - if ensemble_queue: - strategy = ensemble_queue.popleft() - remote_tuning_options = setup_tuning_options(tuning_options) - task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) - pending_tasks[task] = actor - - new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) - tuning_options.update(new_tuning_options) - final_results, population, candidates = process_results(all_results, searchspace) - - if population: # for memetic strategy - tuning_options.strategy_options["population"] = population - if candidates: # for memetic strategy - tuning_options.strategy_options["candidates"] = candidates - return final_results - -def setup_tuning_options(tuning_options): - new_tuning_options = copy.deepcopy(tuning_options) - if "candidates" in tuning_options.strategy_options: - if len(tuning_options.strategy_options["candidates"]) > 0: - new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) - return new_tuning_options - -def process_results(all_results, searchspace): - unique_configs = set() - final_results = [] - population = [] # for memetic strategy - candidates = [] # for memetic strategy - - for (strategy_results, tuning_options) in all_results: - if "old_candidate" in tuning_options.strategy_options: - candidates.append(tuning_options.strategy_options["old_candidate"]) - if "candidate" in tuning_options.strategy_options: - population.append(tuning_options.strategy_options["candidate"]) - for new_result in strategy_results: - config_signature = 
tuple(new_result[key] for key in searchspace.tune_params) - if config_signature not in unique_configs: - final_results.append(new_result) - unique_configs.add(config_signature) - return final_results, population, candidates From af532c544e2bdbe6a0da384b5efdca879250051e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:25:55 +0200 Subject: [PATCH 66/97] added self.init_arguments for parallel runner execution --- kernel_tuner/observers/nvml.py | 6 ++++++ kernel_tuner/observers/pmt.py | 3 +++ kernel_tuner/observers/powersensor.py | 3 +++ 3 files changed, 12 insertions(+) diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py index 0fd812a34..2a496441a 100644 --- a/kernel_tuner/observers/nvml.py +++ b/kernel_tuner/observers/nvml.py @@ -315,6 +315,9 @@ def __init__( continous_duration=1, ): """Create an NVMLObserver.""" + # needed for re-initializing observer on ray actor + self.init_arguments = [observables, device, save_all, nvidia_smi_fallback, use_locked_clocks, continous_duration] + if nvidia_smi_fallback: self.nvml = nvml( device, @@ -424,6 +427,9 @@ def __init__(self, observables, parent, nvml_instance, continous_duration=1): self.parent = parent self.nvml = nvml_instance + # needed for re-initializing observer on ray actor + self.init_arguments = [observables, parent, nvml_instance, continous_duration] + supported = ["power_readings", "nvml_power", "nvml_energy"] for obs in observables: if obs not in supported: diff --git a/kernel_tuner/observers/pmt.py b/kernel_tuner/observers/pmt.py index 6efb1209a..750b784bc 100644 --- a/kernel_tuner/observers/pmt.py +++ b/kernel_tuner/observers/pmt.py @@ -33,6 +33,9 @@ class PMTObserver(BenchmarkObserver): def __init__(self, observable=None): if not pmt: raise ImportError("could not import pmt") + + # needed for re-initializing observer on ray actor + self.init_arguments = [observable] # User specifices a dictonary of platforms and corresponding device if type(observable) is dict: diff --git a/kernel_tuner/observers/powersensor.py b/kernel_tuner/observers/powersensor.py index 6d07e8977..e05b854a6 100644 --- a/kernel_tuner/observers/powersensor.py +++ b/kernel_tuner/observers/powersensor.py @@ -27,6 +27,9 @@ class PowerSensorObserver(BenchmarkObserver): def __init__(self, observables=None, device=None): if not powersensor: raise ImportError("could not import powersensor") + + # needed for re-initializing observer on ray actor + self.init_arguments = [observables, device] supported = ["ps_energy", "ps_power"] for obs in observables: From 82d988687f9e7ab9956518c5ef9333a55ffb9e99 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:28:35 +0200 Subject: [PATCH 67/97] fix about non-pickleable observers and other small adjustments --- kernel_tuner/runners/parallel.py | 69 ++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index e6cdce2ab..242d2e1d9 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -10,7 +10,7 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner from kernel_tuner.runners.ray.remote_actor import RemoteActor -from kernel_tuner.util import get_num_devices, get_nested_types +from kernel_tuner.util import get_num_devices from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray @@ -19,14 +19,11 @@ class 
ParallelRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, num_gpus=None, cache_manager=None, actors=None, simulation_mode=False): self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if not simulation_mode else None - self.quiet = device_options.quiet self.kernel_source = kernel_source - self.warmed_up = False self.simulation_mode = simulation_mode + self.kernel_options = kernel_options self.start_time = perf_counter() self.last_strategy_start_time = self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options self.observers = observers self.iterations = iterations self.device_options = device_options @@ -36,49 +33,52 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob if num_gpus is None: self.num_gpus = get_num_devices(kernel_source.lang, simulation_mode=self.simulation_mode) - + initialize_ray() - # Create RemoteActor instances - if actors is None: - runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] - self.actors = [create_actor_on_device(*runner_attributes, self.cache_manager, simulation_mode, id) for id in range(self.num_gpus)] - def get_environment(self, tuning_options): return self.dev.get_environment() - def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchspace=None, cache_manager=None): if tuning_options is None: #HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument raise ValueError("tuning_options cannot be None") + # Create RemoteActor instances + if self.actors is None: + runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] + self.actors = [create_actor_on_device(*runner_attributes, self.cache_manager, self.simulation_mode, id) for id in range(self.num_gpus)] + if self.cache_manager is None: if cache_manager is None: - cache_manager = CacheManager.remote(tuning_options) + cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) self.cache_manager = cache_manager # set the cache manager for each actor. 
Can't be done in constructor because we do not always yet have the tuning_options for actor in self.actors: ray.get(actor.set_cache_manager.remote(self.cache_manager)) + # Some observers can't be pickled + run_tuning_options = copy.deepcopy(tuning_options) + run_tuning_options['observers'] = None # Determine what type of parallelism and run appropriately if parameter_space and not ensemble and not searchspace: - results, tuning_options_list = self.run_parallel_tuning(tuning_options, parameter_space) + results, tuning_options_list = self.parallel_function_evaluation(run_tuning_options, parameter_space) elif ensemble and searchspace and not parameter_space: - results, tuning_options_list = self.run_parallel_ensemble(ensemble, tuning_options, searchspace) + results, tuning_options_list = self.multi_strategy_parallel_execution(ensemble, run_tuning_options, searchspace) else: raise ValueError("Invalid arguments to parallel runner run method") # Update tuning options - new_tuning_options = ray.get(self.cache_manager.get_tuning_options.remote()) - tuning_options.update(new_tuning_options) + # NOTE: tuning options won't have the state of the observers created in the actors as they can't be pickled + cache, cachefile = ray.get(self.cache_manager.get_cache.remote()) + tuning_options.cache = cache + tuning_options.cachefile = cachefile if self.simulation_mode: tuning_options.simulated_time += self._calculate_simulated_time(tuning_options_list) - print(f"DEBUG: simulated_time = {tuning_options.simulated_time}", file=sys.stderr) return results - def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): + def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspace): """ Runs strategies from the ensemble in parallel using distributed actors, manages dynamic task allocation, and collects results. 
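
Side note (not part of the patch): the hunk below shares the max_fevals budget evenly over the ensemble strategies, handing the remainder to the first few. A minimal equivalent sketch, with an illustrative helper name:

def split_budget(max_fevals, num_strategies):
    base, remainder = divmod(max_fevals, num_strategies)
    # the first `remainder` strategies get one extra evaluation
    return [base + (1 if i < remainder else 0) for i in range(num_strategies)]

split_budget(100, 3)  # -> [34, 33, 33]
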
@@ -86,11 +86,20 @@ def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): ensemble_queue = deque(ensemble) pending_tasks = {} all_results = [] + max_feval = tuning_options.strategy_options["max_fevals"] + num_strategies = len(ensemble) + + # distributing feval to all strategies + base_eval_per_strategy = max_feval // num_strategies + remainder = max_feval % num_strategies + evaluations_per_strategy = [base_eval_per_strategy] * num_strategies + for i in range(remainder): + evaluations_per_strategy[i] += 1 # Start initial tasks for each actor for actor in self.actors: strategy = ensemble_queue.popleft() - remote_tuning_options = self._setup_tuning_options(tuning_options) + remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor @@ -105,7 +114,7 @@ def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): # Reassign actors if strategies remain if ensemble_queue: strategy = ensemble_queue.popleft() - remote_tuning_options = self._setup_tuning_options(tuning_options) + remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor @@ -117,13 +126,15 @@ def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): tuning_options.strategy_options["population"] = population if candidates: tuning_options.strategy_options["candidates"] = candidates + return results, tuning_options_list - def _setup_tuning_options(self, tuning_options): + def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): new_tuning_options = copy.deepcopy(tuning_options) if "candidates" in tuning_options.strategy_options: if len(tuning_options.strategy_options["candidates"]) > 0: new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) + new_tuning_options.strategy_options["max_fevals"] = evaluations_per_strategy.pop(0) return new_tuning_options def _process_results_ensemble(self, all_results): @@ -143,11 +154,11 @@ def _process_results_ensemble(self, all_results): return results, tuning_options_list, population, candidates - def run_parallel_tuning(self, tuning_options, parameter_space): + def parallel_function_evaluation(self, tuning_options, parameter_space): # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
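
Side note (not part of the patch): the redistribution loop above follows the usual ray.wait pattern, reusing an idle actor for the next strategy instead of killing it, which is the actor-reutilization goal of these patches. A stripped-down sketch with illustrative names:

import ray
from collections import deque

def run_all(actors, tasks):
    queue = deque(tasks)
    pending = {actor.execute.remote(queue.popleft()): actor for actor in actors if queue}
    results = []
    while pending:
        done, _ = ray.wait(list(pending.keys()), num_returns=1)
        for ref in done:
            results.append(ray.get(ref))
            actor = pending.pop(ref)
            if queue:  # hand the now-idle actor the next task
                pending[actor.execute.remote(queue.popleft())] = actor
    return results
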
- all_results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(element=v, tuning_options=tuning_options), parameter_space)) + all_results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(tuning_options, element=v), parameter_space)) results = [x[0] for x in all_results] tuning_options_list = [x[1] for x in all_results] return results, tuning_options_list @@ -167,7 +178,13 @@ def _process_results(self, all_results, searchspace): def _calculate_simulated_time(self, tuning_options_list): simulated_times = [] for tuning_options in tuning_options_list: - print(f"DEBUG:_calculate_simulated_time tuning_options.simulated_time = {tuning_options.simulated_time}", file=sys.stderr) simulated_times.append(tuning_options.simulated_time) #simulated_times = [tuning_options.simulated_time for tuning_options in tuning_options_list] - return max(simulated_times) \ No newline at end of file + return max(simulated_times) + + def clean_up_ray(self): + if self.actors is not None: + for actor in self.actors: + ray.kill(actor) + if self.cache_manager is not None: + ray.kill(self.cache_manager) \ No newline at end of file From c6a2f36277c26f8fc60c12dfafce2e2a04a82161 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:29:54 +0200 Subject: [PATCH 68/97] now the cache manager deals only with the cache and not with the entire tuning option dict --- kernel_tuner/runners/ray/cache_manager.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py index 882207f02..9aeb56855 100644 --- a/kernel_tuner/runners/ray/cache_manager.py +++ b/kernel_tuner/runners/ray/cache_manager.py @@ -1,23 +1,23 @@ import ray -import json from kernel_tuner.util import store_cache @ray.remote(num_cpus=1) class CacheManager: - def __init__(self, tuning_options): - self.tuning_options = tuning_options + def __init__(self, cache, cachefile): + from kernel_tuner.interface import Options # importing here due to circular import + self.tuning_options = Options({'cache': cache, 'cachefile': cachefile}) def store(self, key, params): store_cache(key, params, self.tuning_options) def check_and_retrieve(self, key): """Checks if a result exists for the given key and returns it if found.""" - if self.tuning_options.cache: - return self.tuning_options.cache.get(key, None) + if self.tuning_options['cache']: + return self.tuning_options['cache'].get(key, None) else: return None - def get_tuning_options(self): + def get_cache(self): """Returns the current tuning options.""" - return self.tuning_options + return self.tuning_options['cache'], self.tuning_options['cachefile'] From 5fe2e56653bb0200a32fa58c33404999b90b2523 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:30:25 +0200 Subject: [PATCH 69/97] fix related to non-pickleable observers --- kernel_tuner/runners/ray/remote_actor.py | 32 +++++++++++++++++++----- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 3956c8648..96d244c3b 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -1,9 +1,11 @@ import ray import sys +import copy from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.util import get_nested_types +from kernel_tuner.core import DeviceInterface +from 
kernel_tuner.observers.register import RegisterObserver @ray.remote class RemoteActor(): @@ -12,26 +14,44 @@ def __init__(self, kernel_options, device_options, iterations, - observers, + observers_type_and_arguments, cache_manager=None, simulation_mode=False): self.kernel_source = kernel_source self.kernel_options = kernel_options self.device_options = device_options self.iterations = iterations - self.observers = observers self.cache_manager = cache_manager self.simulation_mode = simulation_mode self.runner = None - + + # observers can't be pickled to the actor so we need to re-initialize them + register_observer = False + self.observers = [] + for (observer, arguments) in observers_type_and_arguments: + if isinstance(observer, RegisterObserver): + register_observer = True + else: + self.observers.append(observer(*arguments)) + # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line + self.dev = DeviceInterface(kernel_source, iterations=iterations, **device_options) if not simulation_mode else None + # the register observer needs dev to be initialized, that's why its done later + if register_observer: + self.observers.append(RegisterObserver(self.dev)) + def execute(self, tuning_options, strategy=None, searchspace=None, element=None): + tuning_options['observers'] = self.observers if self.runner is None: self.init_runner() if strategy and searchspace: - results = strategy.tune(searchspace, self.runner, tuning_options) + results = strategy.tune(searchspace, self.runner, tuning_options) + # observers can't be pickled + tuning_options['observers'] = None return results, tuning_options elif element: - results = self.runner.run([element], tuning_options)[0] + results = self.runner.run([element], tuning_options)[0] + # observers can't be pickled + tuning_options['observers'] = None return results, tuning_options else: raise ValueError("Invalid arguments for ray actor's execute method.") From 3b3317c4ff7e425eec86512e82810bf298937285 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:31:13 +0200 Subject: [PATCH 70/97] update related to new cache manager --- kernel_tuner/strategies/brute_force.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index ba3d834ad..b08efea03 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -9,8 +9,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): if isinstance(runner, ParallelRunner): - cache_manager = CacheManager.remote(tuning_options) - return runner.run(searchspace.sorted_list(), tuning_options, cache_manager) + cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) + return runner.run(parameter_space=searchspace.sorted_list(), tuning_options=tuning_options, cache_manager=cache_manager) else: return runner.run(searchspace.sorted_list(), tuning_options) From 1593806ba2535864f18e297dd202c623e7ebfea9 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:32:48 +0200 Subject: [PATCH 71/97] added cleanup at the end of the ensemble --- kernel_tuner/strategies/ensemble.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 78fd85001..a5268dc18 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -54,6 +54,8 @@ } def tune(searchspace: Searchspace, runner, tuning_options, 
cache_manager=None, actors=None): + clean_up = True if actors is None and cache_manager is None else False + options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) @@ -63,6 +65,9 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a else: ensemble = ["greedy_ils", "greedy_ils"] ensemble_size = len(ensemble) + + tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) + if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) num_actors = num_devices if ensemble_size > num_devices else ensemble_size @@ -72,5 +77,8 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a runner.iterations, runner.observers, num_gpus=num_actors, cache_manager=cache_manager, simulation_mode=simulation_mode, actors=actors) final_results = parallel_runner.run(tuning_options=tuning_options, ensemble=ensemble, searchspace=searchspace) + + if clean_up: + parallel_runner.clean_up_ray() return final_results From efd5be20147a6162a19c65057fc6d6c7ff0c86db Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:34:13 +0200 Subject: [PATCH 72/97] changes to hyperparameters --- kernel_tuner/strategies/memetic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 3f2922f4b..be5c95db4 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -76,6 +76,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): maxiter = options.get("maxiter", 2) popsize = options.get("popsize", 20) max_feval = options.get("max_fevals", None if 'time_limit' in options else 2000) + print(f"DEBUG: local_search={local_search} global_search={global_search} alsd={alsd} lsd={lsd} maxiter={maxiter} popsize={popsize} max_feval={max_feval}", file=sys.stderr) if local_search in ls_strategies_list: tuning_options["ensemble"] = [local_search] * popsize @@ -93,7 +94,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): check_num_devices(num_gpus, simulation_mode, runner) initialize_ray() # Create cache manager, actors and parallel runner - cache_manager = CacheManager.remote(tuning_options) + cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) num_actors = num_gpus if num_gpus < popsize else popsize runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] @@ -132,7 +133,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Local Search (LS) print(f"DEBUG:=================Local Search=================", file=sys.stderr) - tuning_options.strategy_options["max_fevals"] = lsd + tuning_options.strategy_options["max_fevals"] = lsd * popsize pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) @@ -154,7 +155,10 @@ def tune(searchspace: Searchspace, runner, tuning_options): if afi_ls > afi_gs: 
lsd += alsd elif afi_ls < afi_gs: - lsd -= alsd if lsd - alsd > 5 else 5 + lsd -= alsd + # Less than 5 lsd doesn't make sense + if lsd < 5: + lsd = 5 print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) From bc66244741d2dc1c41b7f140d2c657f1e8d1d95d Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:34:46 +0200 Subject: [PATCH 73/97] changes related to non-pickleable observers --- kernel_tuner/strategies/common.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 787750825..189620649 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -10,6 +10,10 @@ from kernel_tuner.searchspace import Searchspace from kernel_tuner.util import get_num_devices from kernel_tuner.runners.ray.remote_actor import RemoteActor +from kernel_tuner.observers.nvml import NVMLObserver, NVMLPowerObserver +from kernel_tuner.observers.pmt import PMTObserver +from kernel_tuner.observers.powersensor import PowerSensorObserver +from kernel_tuner.observers.register import RegisterObserver _docstring_template = """ Find the best performing kernel configuration in the parameter space @@ -343,12 +347,22 @@ def create_actor_on_device(kernel_source, kernel_options, device_options, iterat else: resource_options = {"num_gpus": 1} + observers_type_and_arguments = [] + if observers is not None: + # observers can't be pickled so we will re-initialize them in the actors + # observers related to backends will be initialized once we call the device interface inside the actor, that is why we skip them here + for i, observer in enumerate(observers): + if isinstance(observer, (NVMLObserver, NVMLPowerObserver, PMTObserver, PowerSensorObserver)): + observers_type_and_arguments.append((observer.__class__, observer.init_arguments)) + if isinstance(observer, RegisterObserver): + observers_type_and_arguments.append((observer.__class__, [])) + # Create the actor with the specified options and resources return RemoteActor.options(**resource_options).remote(kernel_source, kernel_options, device_options, iterations, - observers, + observers_type_and_arguments=observers_type_and_arguments, cache_manager=cache_manager, simulation_mode=simulation_mode) From 9e9f1afe3ef24696225425aa652f165682e20870 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 31 May 2024 11:08:35 +0200 Subject: [PATCH 74/97] updated init_arguments to a dict --- kernel_tuner/observers/nvml.py | 17 ++++++++++++++--- kernel_tuner/observers/pmt.py | 4 +++- kernel_tuner/observers/powersensor.py | 5 ++++- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py index 2a496441a..bc93a275b 100644 --- a/kernel_tuner/observers/nvml.py +++ b/kernel_tuner/observers/nvml.py @@ -316,8 +316,14 @@ def __init__( ): """Create an NVMLObserver.""" # needed for re-initializing observer on ray actor - self.init_arguments = [observables, device, save_all, nvidia_smi_fallback, use_locked_clocks, continous_duration] - + self.init_arguments = { + "observables": observables, + "device": device, + "save_all": save_all, + "nvidia_smi_fallback": nvidia_smi_fallback, + "use_locked_clocks": use_locked_clocks, + "continous_duration": continous_duration + } if nvidia_smi_fallback: self.nvml = nvml( device, @@ -428,7 +434,12 @@ def __init__(self, observables, parent, nvml_instance, continous_duration=1): self.nvml = nvml_instance # 
needed for re-initializing observer on ray actor - self.init_arguments = [observables, parent, nvml_instance, continous_duration] + self.init_arguments = { + "observables": observables, + "parent": parent, + "nvml_instance": nvml_instance, + "continous_duration": continous_duration + } supported = ["power_readings", "nvml_power", "nvml_energy"] for obs in observables: diff --git a/kernel_tuner/observers/pmt.py b/kernel_tuner/observers/pmt.py index 750b784bc..f7f652d89 100644 --- a/kernel_tuner/observers/pmt.py +++ b/kernel_tuner/observers/pmt.py @@ -35,7 +35,9 @@ def __init__(self, observable=None): raise ImportError("could not import pmt") # needed for re-initializing observer on ray actor - self.init_arguments = [observable] + self.init_arguments = { + "observable": observable + } # User specifices a dictonary of platforms and corresponding device if type(observable) is dict: diff --git a/kernel_tuner/observers/powersensor.py b/kernel_tuner/observers/powersensor.py index e05b854a6..c946f9d44 100644 --- a/kernel_tuner/observers/powersensor.py +++ b/kernel_tuner/observers/powersensor.py @@ -29,7 +29,10 @@ def __init__(self, observables=None, device=None): raise ImportError("could not import powersensor") # needed for re-initializing observer on ray actor - self.init_arguments = [observables, device] + self.init_arguments = { + "observables": observables, + "device": device + } supported = ["ps_energy", "ps_power"] for obs in observables: From 3fed66cdf87533b11c8833a56170a2cbc811351e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 31 May 2024 11:10:46 +0200 Subject: [PATCH 75/97] updates for searchspace split, ensemble related fix, and observer executing on correct device in parallel mode --- kernel_tuner/runners/parallel.py | 18 ++++++++-- kernel_tuner/runners/ray/remote_actor.py | 35 +++++++++++-------- kernel_tuner/searchspace.py | 44 ++++++++++++++++++++++++ kernel_tuner/strategies/common.py | 5 +-- kernel_tuner/strategies/ensemble.py | 6 +--- kernel_tuner/strategies/memetic.py | 6 ++-- 6 files changed, 88 insertions(+), 26 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 242d2e1d9..dc579c901 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -46,7 +46,7 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp # Create RemoteActor instances if self.actors is None: runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] - self.actors = [create_actor_on_device(*runner_attributes, self.cache_manager, self.simulation_mode, id) for id in range(self.num_gpus)] + self.actors = [create_actor_on_device(*runner_attributes, id=id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for id in range(self.num_gpus)] if self.cache_manager is None: if cache_manager is None: @@ -86,7 +86,9 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac ensemble_queue = deque(ensemble) pending_tasks = {} all_results = [] - max_feval = tuning_options.strategy_options["max_fevals"] + options = tuning_options.strategy_options + max_feval = options["max_fevals"] + split_searchspace = options["split_searchspace"] if "split_searchspace" in options else False num_strategies = len(ensemble) # distributing feval to all strategies @@ -96,9 +98,17 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac for i in range(remainder): evaluations_per_strategy[i] += 1 + # 
Ensure we always have a list of search spaces + if split_searchspace: + searchspaces = searchspace.split_searchspace(num_strategies) + else: + searchspaces = [searchspace] * num_strategies + searchspaces = deque(searchspaces) + # Start initial tasks for each actor for actor in self.actors: strategy = ensemble_queue.popleft() + searchspace = searchspaces.popleft() remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor @@ -114,6 +124,7 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac # Reassign actors if strategies remain if ensemble_queue: strategy = ensemble_queue.popleft() + searchspace = searchspaces.popleft() remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor @@ -128,6 +139,7 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac tuning_options.strategy_options["candidates"] = candidates return results, tuning_options_list + def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): new_tuning_options = copy.deepcopy(tuning_options) @@ -135,6 +147,8 @@ def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): if len(tuning_options.strategy_options["candidates"]) > 0: new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) new_tuning_options.strategy_options["max_fevals"] = evaluations_per_strategy.pop(0) + # the stop criterion uses the max feval in tuning options for some reason + new_tuning_options["max_fevals"] = new_tuning_options.strategy_options["max_fevals"] return new_tuning_options def _process_results_ensemble(self, all_results): diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 96d244c3b..759a902a1 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -15,6 +15,7 @@ def __init__(self, device_options, iterations, observers_type_and_arguments, + id, cache_manager=None, simulation_mode=False): self.kernel_source = kernel_source @@ -24,20 +25,9 @@ def __init__(self, self.cache_manager = cache_manager self.simulation_mode = simulation_mode self.runner = None - - # observers can't be pickled to the actor so we need to re-initialize them - register_observer = False - self.observers = [] - for (observer, arguments) in observers_type_and_arguments: - if isinstance(observer, RegisterObserver): - register_observer = True - else: - self.observers.append(observer(*arguments)) - # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line - self.dev = DeviceInterface(kernel_source, iterations=iterations, **device_options) if not simulation_mode else None - # the register observer needs dev to be initialized, that's why its done later - if register_observer: - self.observers.append(RegisterObserver(self.dev)) + self.id = id + self._reinitialize_observers(observers_type_and_arguments) + def execute(self, tuning_options, strategy=None, searchspace=None, element=None): tuning_options['observers'] = self.observers @@ -72,3 +62,20 @@ def init_runner(self): else: self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, 
self.iterations, self.observers, cache_manager=self.cache_manager) + + def _reinitialize_observers(self, observers_type_and_arguments): + # observers can't be pickled to the actor so we need to re-initialize them + register_observer = False + self.observers = [] + for (observer, arguments) in observers_type_and_arguments: + if "device" in arguments: + arguments["device"] = self.id + if isinstance(observer, RegisterObserver): + register_observer = True + else: + self.observers.append(observer(**arguments)) + # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line + self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None + # the register observer needs dev to be initialized, that's why its done later + if register_observer: + self.observers.append(RegisterObserver(self.dev)) \ No newline at end of file diff --git a/kernel_tuner/searchspace.py b/kernel_tuner/searchspace.py index 5ee7f7ce2..f68295e93 100644 --- a/kernel_tuner/searchspace.py +++ b/kernel_tuner/searchspace.py @@ -50,6 +50,11 @@ def __init__( restrictions = restrictions if restrictions is not None else [] self.tune_params = tune_params self.restrictions = restrictions + self.max_threads = max_threads + self.block_size_names = block_size_names + self.framework = framework + self.solver_method = solver_method + self.path_to_ATF_cache = path_to_ATF_cache # the searchspace can add commonly used constraints (e.g. maxprod(blocks) <= maxthreads) self._modified_restrictions = restrictions self.param_names = list(self.tune_params.keys()) @@ -727,3 +732,42 @@ def order_param_configs( f"The number of ordered parameter configurations ({len(ordered_param_configs)}) differs from the original number of parameter configurations ({len(param_configs)})" ) return ordered_param_configs + + def split_searchspace(self, n: int) -> List['Searchspace']: + """Splits the searchspace into n more or less equal parts using a round-robin approach.""" + if n <= 0: + raise ValueError("Number of parts must be greater than zero.") + if n > self.size: + raise ValueError(f"Cannot split into more parts ({n}) than the size of the searchspace ({self.size}).") + + # Initialize the parts and their corresponding tune_params + parts = [{param: [] for param in self.tune_params} for _ in range(n)] + + # Distribute configurations in a round-robin fashion + for index, config in enumerate(self.list): + part_index = index % n + for j, param in enumerate(self.param_names): + parts[part_index][param].append(config[j]) + + # Remove duplicates and sort parameters within each part + for part_tune_params in parts: + for param in part_tune_params: + part_tune_params[param] = sorted(list(set(part_tune_params[param]))) + + # Create Searchspace objects for each part + searchspace_parts = [] + for part_tune_params in parts: + part_searchspace = Searchspace( + tune_params=part_tune_params, + restrictions=self.restrictions, + max_threads=self.max_threads, + block_size_names=self.block_size_names, + build_neighbors_index=self.build_neighbors_index, + neighbor_method=self.neighbor_method, + framework=self.framework, + solver_method=self.solver_method, + path_to_ATF_cache=self.path_to_ATF_cache + ) + searchspace_parts.append(part_searchspace) + + return searchspace_parts diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 189620649..47fefd505 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py 
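The observer hand-off used in these hunks works around the fact that observer instances
cannot be pickled into a Ray actor: the caller ships picklable (ObserverClass, init_arguments)
pairs and each actor re-instantiates the observers locally, remapping any "device" argument
to its own GPU id. A minimal sketch of that pattern, with illustrative names rather than the
exact kernel_tuner signatures:

    # caller side: reduce live observers to picklable (class, kwargs) specs
    specs = [(type(obs), dict(obs.init_arguments)) for obs in observers]

    # actor side: rebuild the observers, pointing each at this actor's GPU
    def rebuild_observers(specs, gpu_id):
        rebuilt = []
        for cls, kwargs in specs:
            if "device" in kwargs:
                kwargs = {**kwargs, "device": gpu_id}
            rebuilt.append(cls(**kwargs))
        return rebuilt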
@@ -53,7 +53,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", - "maxiter", "lsd", "popsize", "alsd", ] + "maxiter", "lsd", "popsize", "alsd", "split_searchspace"] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") @@ -364,7 +364,8 @@ def create_actor_on_device(kernel_source, kernel_options, device_options, iterat iterations, observers_type_and_arguments=observers_type_and_arguments, cache_manager=cache_manager, - simulation_mode=simulation_mode) + simulation_mode=simulation_mode, + id=id) def initialize_ray(): # Initialize Ray diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index a5268dc18..d8a919399 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -59,11 +59,7 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a simulation_mode = True if isinstance(runner, SimulationRunner) else False num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - ensemble = [] - if "ensemble" in tuning_options: - ensemble = tuning_options.ensemble - else: - ensemble = ["greedy_ils", "greedy_ils"] + ensemble = options.get('ensemble', ["greedy_ils", "greedy_ils"]) ensemble_size = len(ensemble) tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index be5c95db4..3fd5f2b12 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -79,7 +79,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): print(f"DEBUG: local_search={local_search} global_search={global_search} alsd={alsd} lsd={lsd} maxiter={maxiter} popsize={popsize} max_feval={max_feval}", file=sys.stderr) if local_search in ls_strategies_list: - tuning_options["ensemble"] = [local_search] * popsize + options["ensemble"] = [local_search] * popsize else: raise ValueError("Provided local search ensemble are not all local search strategies") @@ -88,7 +88,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): else: raise ValueError("Provided population based strategy is not a population based strategy") - tuning_options.strategy_options["population"] = searchspace.get_random_sample(popsize) + options["population"] = searchspace.get_random_sample(popsize) num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) check_num_devices(num_gpus, simulation_mode, runner) @@ -97,7 +97,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) num_actors = num_gpus if num_gpus < popsize else popsize runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] - actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] + actors = [create_actor_on_device(*runner_attributes, id=id, cache_manager=cache_manager, simulation_mode=simulation_mode) for id in range(num_actors)] pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, 
runner.observers, num_gpus=num_gpus, cache_manager=cache_manager, simulation_mode=simulation_mode, actors=actors) From 86a9b677b6a6b75a22b4be228cd234d0b6cceb00 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 5 Jun 2024 14:29:11 +0200 Subject: [PATCH 76/97] small corections related to stop criterion for memetic --- kernel_tuner/strategies/greedy_ils.py | 11 ++++++++++- kernel_tuner/strategies/memetic.py | 20 ++++---------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py index 575b89bd2..bbceb76b8 100644 --- a/kernel_tuner/strategies/greedy_ils.py +++ b/kernel_tuner/strategies/greedy_ils.py @@ -35,7 +35,14 @@ def tune(searchspace: Searchspace, runner, tuning_options): if not candidate: candidate = searchspace.get_random_sample(1)[0] old_candidate = candidate # for memetic strategy - best_score = cost_func(candidate, check_restrictions=False) + try: + best_score = cost_func(candidate, check_restrictions=False) + except util.StopCriterionReached as e: + tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy + tuning_options.strategy_options["candidate"] = candidate # for memetic strategy + if tuning_options.verbose: + print(e) + return cost_func.results last_improvement = 0 while fevals < max_fevals: @@ -45,6 +52,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): candidate = base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True) new_score = cost_func(candidate, check_restrictions=False) except util.StopCriterionReached as e: + tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy + tuning_options.strategy_options["candidate"] = candidate # for memetic strategy if tuning_options.verbose: print(e) return cost_func.results diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 3fd5f2b12..ac42cdecd 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -71,15 +71,15 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False local_search = options.get('local_search', 'greedy_ils') global_search = options.get('global_search', "genetic_algorithm") - alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) - lsd = options.get("lsd", 25) # Local Search Depth (LSD) - maxiter = options.get("maxiter", 2) + alsd = options.get("alsd", 5) # Adaptive Local Search Depth (ALSD) + lsd = options.get("lsd", 30) # Local Search Depth (LSD) + maxiter = options.get("maxiter", 3) popsize = options.get("popsize", 20) max_feval = options.get("max_fevals", None if 'time_limit' in options else 2000) print(f"DEBUG: local_search={local_search} global_search={global_search} alsd={alsd} lsd={lsd} maxiter={maxiter} popsize={popsize} max_feval={max_feval}", file=sys.stderr) if local_search in ls_strategies_list: - options["ensemble"] = [local_search] * popsize + tuning_options.strategy_options["ensemble"] = [local_search] * popsize else: raise ValueError("Provided local search ensemble are not all local search strategies") @@ -119,12 +119,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): results = global_search.tune(searchspace, pop_runner, tuning_options) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += maxiter * popsize - try: - 
check_stop_criterion(tuning_options) - except StopCriterionReached as e: - if tuning_options.verbose: - print(e) - break pop_start_gs_res = get_pop_results(pop_start_gs, all_results_dict) pop_end_gs = copy.deepcopy(tuning_options.strategy_options["population"]) @@ -138,12 +132,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += lsd * popsize - try: - check_stop_criterion(tuning_options) - except StopCriterionReached as e: - if tuning_options.verbose: - print(e) - break pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) From de5fc4948ccda1444605a0eb15001c66de3ca51b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 12:22:44 +0200 Subject: [PATCH 77/97] added logic to check if all GPUs are of the same type --- kernel_tuner/runners/parallel.py | 34 ++++++++++++----- kernel_tuner/runners/ray/remote_actor.py | 9 ++++- kernel_tuner/util.py | 47 +++++++++++++----------- 3 files changed, 56 insertions(+), 34 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index dc579c901..e86063ee7 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -10,7 +10,7 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner from kernel_tuner.runners.ray.remote_actor import RemoteActor -from kernel_tuner.util import get_num_devices +from kernel_tuner.util import get_num_devices, GPUTypeMismatchError from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray @@ -30,12 +30,16 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.cache_manager = cache_manager self.num_gpus = num_gpus self.actors = actors - - if num_gpus is None: - self.num_gpus = get_num_devices(kernel_source.lang, simulation_mode=self.simulation_mode) initialize_ray() + if num_gpus is None: + self.num_gpus = get_num_devices(simulation_mode) + + # So we know the number of GPUs in the cache file + if not simulation_mode: + self.dev.name = [self.dev.name] * self.num_gpus + def get_environment(self, tuning_options): return self.dev.get_environment() @@ -46,7 +50,11 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp # Create RemoteActor instances if self.actors is None: runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] - self.actors = [create_actor_on_device(*runner_attributes, id=id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for id in range(self.num_gpus)] + self.actors = [create_actor_on_device(*runner_attributes, id=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for _id in range(self.num_gpus)] + + # Check if all GPUs are of the same type + if not self.simulation_mode and not self._check_gpus_equals(): + raise GPUTypeMismatchError(f"Different GPU types found") if self.cache_manager is None: if cache_manager is None: @@ -88,7 +96,6 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac all_results = [] options = tuning_options.strategy_options max_feval = options["max_fevals"] - split_searchspace = options["split_searchspace"] if 
"split_searchspace" in options else False num_strategies = len(ensemble) # distributing feval to all strategies @@ -99,10 +106,7 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac evaluations_per_strategy[i] += 1 # Ensure we always have a list of search spaces - if split_searchspace: - searchspaces = searchspace.split_searchspace(num_strategies) - else: - searchspaces = [searchspace] * num_strategies + searchspaces = [searchspace] * num_strategies searchspaces = deque(searchspaces) # Start initial tasks for each actor @@ -196,6 +200,16 @@ def _calculate_simulated_time(self, tuning_options_list): #simulated_times = [tuning_options.simulated_time for tuning_options in tuning_options_list] return max(simulated_times) + def _check_gpus_equals(self): + gpu_types = [] + for actor in self.actors: + gpu_types.append(ray.get(actor.get_gpu_type.remote(self.kernel_source.lang))) + if len(set(gpu_types)) == 1: + print(f"DEBUG: Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) + return True + else: + return False + def clean_up_ray(self): if self.actors is not None: for actor in self.actors: diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 759a902a1..8ea23ca1b 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -6,6 +6,7 @@ from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.core import DeviceInterface from kernel_tuner.observers.register import RegisterObserver +from kernel_tuner.util import get_gpu_id, get_gpu_type @ray.remote class RemoteActor(): @@ -25,7 +26,7 @@ def __init__(self, self.cache_manager = cache_manager self.simulation_mode = simulation_mode self.runner = None - self.id = id + self.id = get_gpu_id(kernel_source.lang) if not simulation_mode else None self._reinitialize_observers(observers_type_and_arguments) @@ -78,4 +79,8 @@ def _reinitialize_observers(self, observers_type_and_arguments): self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None # the register observer needs dev to be initialized, that's why its done later if register_observer: - self.observers.append(RegisterObserver(self.dev)) \ No newline at end of file + self.observers.append(RegisterObserver(self.dev)) + + def get_gpu_type(self, lang): + print(f"DEBUG:actor get_gpu_type called", file=sys.stderr) + return get_gpu_type(lang) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 1502521ef..9cd0b0ac2 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -11,6 +11,8 @@ from inspect import signature from types import FunctionType from typing import Optional, Union +import ray +import subprocess import numpy as np from constraint import ( @@ -90,6 +92,9 @@ class SkippableFailure(Exception): class StopCriterionReached(Exception): """Exception thrown when a stop criterion has been reached.""" +class GPUTypeMismatchError(Exception): + """Exception thrown when GPU types are not the same in parallel execution""" + try: import torch @@ -1277,28 +1282,26 @@ def cuda_error_check(error): _, desc = nvrtc.nvrtcGetErrorString(error) raise RuntimeError(f"NVRTC error: {desc.decode()}") -def get_num_devices(lang, simulation_mode=False): - num_devices = 0 +def get_num_devices(simulation_mode=False): + resources = ray.cluster_resources() if simulation_mode: - num_devices = int(round(os.cpu_count() * 0.8)) # keep resources for the main process and other tasks - elif 
lang.upper() == "CUDA": - import pycuda.driver as cuda - cuda.init() - num_devices = cuda.Device.count() - elif lang.upper() == "CUPY": - import cupy - num_devices = cupy.cuda.runtime.getDeviceCount() - elif lang.upper() == "NVCUDA": - import pycuda.driver as cuda - cuda.init() - num_devices = cuda.Device.count() - elif lang.upper() == "OPENCL": - import pyopencl as cl - num_devices = sum(len(platform.get_devices()) for platform in cl.get_platforms()) - elif lang.upper() == "HIP": - from pyhip import hip - num_devices = hip.hipGetDeviceCount() + num_devices = round(resources.get("CPU") * 0.8) else: - raise ValueError(f"Unsupported language: {lang}") + num_devices = resources.get("GPU") + print(f"DEBUG: {num_devices} Ray devices detected", file=sys.stderr) + return int(num_devices) - return num_devices \ No newline at end of file +def get_gpu_id(lang): + if lang == "CUDA" or lang == "CUPY" or lang == "NVCUDA": + gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES") or os.environ.get("NVIDIA_VISIBLE_DEVICES") or "No GPU assigned" + else: + raise NotImplementedError("TODO: implement other languages") + return int(gpu_id) + +def get_gpu_type(lang): + gpu_id = get_gpu_id(lang) + if lang == "CUDA" or lang == "CUPY" or lang == "NVCUDA": + result = subprocess.run(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv,noheader', '-i', str(gpu_id)], capture_output=True, text=True) + return result.stdout.strip() + else: + raise NotImplementedError("TODO: implement other languages") \ No newline at end of file From 1b0adb05b11545a5048cdf40d329e38eae9ef148 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 12:25:08 +0200 Subject: [PATCH 78/97] deleted split searchspace function --- kernel_tuner/searchspace.py | 39 ------------------------------------- 1 file changed, 39 deletions(-) diff --git a/kernel_tuner/searchspace.py b/kernel_tuner/searchspace.py index f68295e93..0317ff434 100644 --- a/kernel_tuner/searchspace.py +++ b/kernel_tuner/searchspace.py @@ -732,42 +732,3 @@ def order_param_configs( f"The number of ordered parameter configurations ({len(ordered_param_configs)}) differs from the original number of parameter configurations ({len(param_configs)})" ) return ordered_param_configs - - def split_searchspace(self, n: int) -> List['Searchspace']: - """Splits the searchspace into n more or less equal parts using a round-robin approach.""" - if n <= 0: - raise ValueError("Number of parts must be greater than zero.") - if n > self.size: - raise ValueError(f"Cannot split into more parts ({n}) than the size of the searchspace ({self.size}).") - - # Initialize the parts and their corresponding tune_params - parts = [{param: [] for param in self.tune_params} for _ in range(n)] - - # Distribute configurations in a round-robin fashion - for index, config in enumerate(self.list): - part_index = index % n - for j, param in enumerate(self.param_names): - parts[part_index][param].append(config[j]) - - # Remove duplicates and sort parameters within each part - for part_tune_params in parts: - for param in part_tune_params: - part_tune_params[param] = sorted(list(set(part_tune_params[param]))) - - # Create Searchspace objects for each part - searchspace_parts = [] - for part_tune_params in parts: - part_searchspace = Searchspace( - tune_params=part_tune_params, - restrictions=self.restrictions, - max_threads=self.max_threads, - block_size_names=self.block_size_names, - build_neighbors_index=self.build_neighbors_index, - neighbor_method=self.neighbor_method, - framework=self.framework, - 
solver_method=self.solver_method, - path_to_ATF_cache=self.path_to_ATF_cache - ) - searchspace_parts.append(part_searchspace) - - return searchspace_parts From 513028679aa2d4a588ec4a33e16399127598978b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 12:25:57 +0200 Subject: [PATCH 79/97] changed place where ray is initialized --- kernel_tuner/strategies/memetic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index ac42cdecd..083b117a7 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -89,10 +89,10 @@ def tune(searchspace: Searchspace, runner, tuning_options): raise ValueError("Provided population based strategy is not a population based strategy") options["population"] = searchspace.get_random_sample(popsize) - - num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - check_num_devices(num_gpus, simulation_mode, runner) + initialize_ray() + num_gpus = get_num_devices(simulation_mode=simulation_mode) + check_num_devices(num_gpus, simulation_mode, runner) # Create cache manager, actors and parallel runner cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) num_actors = num_gpus if num_gpus < popsize else popsize From 5b9d8178b5df7906baceb1f9b9cd09a6eaedd51b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 12:26:44 +0200 Subject: [PATCH 80/97] setting BO to random sampling if needed --- kernel_tuner/strategies/ensemble.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index d8a919399..1ae443240 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -57,11 +57,14 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a clean_up = True if actors is None and cache_manager is None else False options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False - num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + initialize_ray() + num_devices = get_num_devices(simulation_mode=simulation_mode) - ensemble = options.get('ensemble', ["greedy_ils", "greedy_ils"]) + ensemble = options.get('ensemble', ["diff_evo", "diff_evo"]) ensemble_size = len(ensemble) + if 'bayes_opt' in ensemble: # All strategies start from a random sample except for BO + tuning_options.strategy_options["samplingmethod"] = 'random' tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) if num_devices < ensemble_size: From 040a57ec5600ee800e92e0fb02556c028ec4dcce Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 17:39:46 +0200 Subject: [PATCH 81/97] added num_gpus option --- kernel_tuner/interface.py | 10 +++++++--- kernel_tuner/strategies/ensemble.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 81ae7de48..63c4c2fff 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -618,8 +618,8 @@ def tune_kernel( tuning_options["max_fevals"] = strategy_options["max_fevals"] if strategy_options and "time_limit" in strategy_options: tuning_options["time_limit"] = strategy_options["time_limit"] - if strategy_options and "ensemble" in strategy_options: - tuning_options["ensemble"] = 
strategy_options["ensemble"] + if strategy_options and "num_gpus" in strategy_options: + tuning_options["num_gpus"] = strategy_options["num_gpus"] logging.debug("tune_kernel called") logging.debug("kernel_options: %s", util.get_config_string(kernel_options)) @@ -661,7 +661,11 @@ def tune_kernel( # select the runner for this job based on input selected_runner = SimulationRunner if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) tuning_options.simulated_time = 0 - runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) + if parallel_mode: + num_gpus = tuning_options['num_gpus'] if 'num_gpus' in tuning_options else None + runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers, num_gpus=num_gpus) + else: + runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) # the user-specified function may or may not have an optional atol argument; # we normalize it so that it always accepts atol. diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 1ae443240..4c16b4f8f 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -58,7 +58,7 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False initialize_ray() - num_devices = get_num_devices(simulation_mode=simulation_mode) + num_devices = tuning_options['num_gpus'] if 'num_gpus' in tuning_options else get_num_devices(simulation_mode=simulation_mode) ensemble = options.get('ensemble', ["diff_evo", "diff_evo"]) ensemble_size = len(ensemble) From acaaeb12a3102fe48e4ead134aa02999e1e5c548 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 10 Jun 2024 15:38:59 +0200 Subject: [PATCH 82/97] removed debug print --- kernel_tuner/runners/ray/remote_actor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 8ea23ca1b..219fb6732 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -82,5 +82,4 @@ def _reinitialize_observers(self, observers_type_and_arguments): self.observers.append(RegisterObserver(self.dev)) def get_gpu_type(self, lang): - print(f"DEBUG:actor get_gpu_type called", file=sys.stderr) return get_gpu_type(lang) From 63d9f653437398227100d03e9b67c987e1e775cc Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 10 Jun 2024 15:39:33 +0200 Subject: [PATCH 83/97] added check_and_retrive strategy option --- kernel_tuner/runners/sequential.py | 2 +- kernel_tuner/strategies/brute_force.py | 1 + kernel_tuner/strategies/ensemble.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index b4fc18c57..3ee43be0f 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -125,7 +125,7 @@ def run(self, parameter_space, tuning_options): return results def config_in_cache(self, x_int, tuning_options): - if self.cache_manager: + if self.cache_manager and tuning_options.strategy_options['check_and_retrieve']: return ray.get(self.cache_manager.check_and_retrieve.remote(x_int)) elif tuning_options.cache and x_int in tuning_options.cache: return tuning_options.cache[x_int] diff --git a/kernel_tuner/strategies/brute_force.py 
b/kernel_tuner/strategies/brute_force.py index b08efea03..ac5ae985a 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -9,6 +9,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): if isinstance(runner, ParallelRunner): + tuning_options.strategy_options['check_and_retrieve'] = False cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) return runner.run(parameter_space=searchspace.sorted_list(), tuning_options=tuning_options, cache_manager=cache_manager) else: diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 4c16b4f8f..2a19f9f74 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -66,6 +66,7 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a if 'bayes_opt' in ensemble: # All strategies start from a random sample except for BO tuning_options.strategy_options["samplingmethod"] = 'random' tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) + tuning_options.strategy_options['check_and_retrieve'] = True if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) From e604510d7ec5d5d5447ef0953bd23bf1528101b6 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 18 Jun 2024 10:52:39 +0200 Subject: [PATCH 84/97] moved reinitialization of actor observers to execute method, before was in init --- kernel_tuner/runners/ray/remote_actor.py | 11 ++++++++--- kernel_tuner/strategies/common.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 219fb6732..138636def 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -27,10 +27,14 @@ def __init__(self, self.simulation_mode = simulation_mode self.runner = None self.id = get_gpu_id(kernel_source.lang) if not simulation_mode else None - self._reinitialize_observers(observers_type_and_arguments) + self.observers_initialized = False + self.observers_type_and_arguments = observers_type_and_arguments def execute(self, tuning_options, strategy=None, searchspace=None, element=None): + if not self.observers_initialized: + self._reinitialize_observers(self.observers_type_and_arguments) + self.observers_initialized = True tuning_options['observers'] = self.observers if self.runner is None: self.init_runner() @@ -65,6 +69,7 @@ def init_runner(self): self.iterations, self.observers, cache_manager=self.cache_manager) def _reinitialize_observers(self, observers_type_and_arguments): + print("DEBUG: reinit observers called", file=sys.stderr) # observers can't be pickled to the actor so we need to re-initialize them register_observer = False self.observers = [] @@ -75,10 +80,10 @@ def _reinitialize_observers(self, observers_type_and_arguments): register_observer = True else: self.observers.append(observer(**arguments)) - # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line - self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None # the register observer needs dev to be initialized, that's why its done later if register_observer: + # we dont initialize the dev with observers, as this creates a 'invalid resource 
handle' error down the line + self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None self.observers.append(RegisterObserver(self.dev)) def get_gpu_type(self, lang): diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 47fefd505..6d010a0a9 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -53,7 +53,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", - "maxiter", "lsd", "popsize", "alsd", "split_searchspace"] + "maxiter", "lsd", "popsize", "alsd", "split_searchspace", "check_and_retrieve"] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") From 5933a6974d14b4f5779baa52d374f14e9207378e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 18 Jun 2024 18:08:50 +0200 Subject: [PATCH 85/97] changes related to re-initialization of observers in actor init and device interface --- kernel_tuner/runners/parallel.py | 3 ++- kernel_tuner/runners/ray/remote_actor.py | 25 ++++++++++-------------- kernel_tuner/runners/sequential.py | 4 ++-- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index e86063ee7..628c95958 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -203,7 +203,8 @@ def _calculate_simulated_time(self, tuning_options_list): def _check_gpus_equals(self): gpu_types = [] for actor in self.actors: - gpu_types.append(ray.get(actor.get_gpu_type.remote(self.kernel_source.lang))) + env = ray.get(actor.get_environment.remote()) + gpu_types.append(env["device_name"]) if len(set(gpu_types)) == 1: print(f"DEBUG: Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) return True diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 138636def..88aac10b6 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -26,15 +26,14 @@ def __init__(self, self.cache_manager = cache_manager self.simulation_mode = simulation_mode self.runner = None - self.id = get_gpu_id(kernel_source.lang) if not simulation_mode else None - self.observers_initialized = False - self.observers_type_and_arguments = observers_type_and_arguments - + self.id = None + self._reinitialize_observers(observers_type_and_arguments) + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=self.observers, **device_options) if not simulation_mode else None + def get_environment(self): + return self.dev.get_environment() + def execute(self, tuning_options, strategy=None, searchspace=None, element=None): - if not self.observers_initialized: - self._reinitialize_observers(self.observers_type_and_arguments) - self.observers_initialized = True tuning_options['observers'] = self.observers if self.runner is None: self.init_runner() @@ -66,25 +65,21 @@ def init_runner(self): self.iterations, self.observers) else: self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers, cache_manager=self.cache_manager) + self.iterations, self.observers, cache_manager=self.cache_manager, 
dev=self.dev) def _reinitialize_observers(self, observers_type_and_arguments): print("DEBUG: reinit observers called", file=sys.stderr) # observers can't be pickled to the actor so we need to re-initialize them - register_observer = False self.observers = [] for (observer, arguments) in observers_type_and_arguments: if "device" in arguments: + self.id = get_gpu_id(self.kernel_source.lang) if self.id is None else self.id arguments["device"] = self.id if isinstance(observer, RegisterObserver): - register_observer = True + self.observers.append(RegisterObserver()) else: self.observers.append(observer(**arguments)) - # the register observer needs dev to be initialized, that's why its done later - if register_observer: - # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line - self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None - self.observers.append(RegisterObserver(self.dev)) + def get_gpu_type(self, lang): return get_gpu_type(lang) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 3ee43be0f..7fe39858a 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -12,7 +12,7 @@ class SequentialRunner(Runner): """SequentialRunner is used for tuning with a single process/thread.""" - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None): + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None, dev=None): """Instantiate the SequentialRunner. :param kernel_source: The kernel source @@ -30,7 +30,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :type iterations: int """ #detect language and create high-level device interface - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if dev is None else dev self.units = self.dev.units self.quiet = device_options.quiet From 4e4c47b42b776ac5a3e13761737264b92d4be902 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 21 Jun 2024 19:08:11 +0200 Subject: [PATCH 86/97] removed unnecesary blocking ray.get --- kernel_tuner/runners/parallel.py | 13 +++++++++---- kernel_tuner/runners/sequential.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 628c95958..53bf96160 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -51,7 +51,11 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp if self.actors is None: runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] self.actors = [create_actor_on_device(*runner_attributes, id=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for _id in range(self.num_gpus)] - + # actors_ready_futures = [actor.__ray_ready__.remote() for actor in futures] + # ray.wait(actors_ready_futures, num_returns=len(actors_ready_futures), timeout=None) + # self.actors = futures + + # Check if all GPUs are of the same type if not self.simulation_mode and not self._check_gpus_equals(): raise GPUTypeMismatchError(f"Different GPU types found") @@ -63,7 +67,7 @@ def run(self, parameter_space=None, 
tuning_options=None, ensemble=None, searchsp # set the cache manager for each actor. Can't be done in constructor because we do not always yet have the tuning_options for actor in self.actors: - ray.get(actor.set_cache_manager.remote(self.cache_manager)) + actor.set_cache_manager.remote(self.cache_manager) # Some observers can't be pickled run_tuning_options = copy.deepcopy(tuning_options) @@ -202,8 +206,9 @@ def _calculate_simulated_time(self, tuning_options_list): def _check_gpus_equals(self): gpu_types = [] - for actor in self.actors: - env = ray.get(actor.get_environment.remote()) + env_refs = [actor.get_environment.remote() for actor in self.actors] + environments = ray.get(env_refs) + for env in environments: gpu_types.append(env["device_name"]) if len(set(gpu_types)) == 1: print(f"DEBUG: Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 7fe39858a..e19242549 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -134,6 +134,6 @@ def config_in_cache(self, x_int, tuning_options): def store_in_cache(self, x_int, params, tuning_options): if self.cache_manager: - ray.get(self.cache_manager.store.remote(x_int, params)) + self.cache_manager.store.remote(x_int, params) else: store_cache(x_int, params, tuning_options) \ No newline at end of file From 104205d34560cfbbb7649f014518ce10a7b6ec66 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 11:33:51 +0200 Subject: [PATCH 87/97] removed debug prints --- kernel_tuner/runners/parallel.py | 2 +- kernel_tuner/runners/ray/remote_actor.py | 1 - kernel_tuner/util.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 53bf96160..61b0edf50 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -211,7 +211,7 @@ def _check_gpus_equals(self): for env in environments: gpu_types.append(env["device_name"]) if len(set(gpu_types)) == 1: - print(f"DEBUG: Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) + print(f"Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) return True else: return False diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 88aac10b6..533dea5b3 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -68,7 +68,6 @@ def init_runner(self): self.iterations, self.observers, cache_manager=self.cache_manager, dev=self.dev) def _reinitialize_observers(self, observers_type_and_arguments): - print("DEBUG: reinit observers called", file=sys.stderr) # observers can't be pickled to the actor so we need to re-initialize them self.observers = [] for (observer, arguments) in observers_type_and_arguments: diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 9cd0b0ac2..21a6edd08 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1288,7 +1288,6 @@ def get_num_devices(simulation_mode=False): num_devices = round(resources.get("CPU") * 0.8) else: num_devices = resources.get("GPU") - print(f"DEBUG: {num_devices} Ray devices detected", file=sys.stderr) return int(num_devices) def get_gpu_id(lang): From 123fba516e738f31d552ef167ad02284a276c1e3 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 14:19:48 +0200 Subject: [PATCH 88/97] added greedy ils esemble instead of default --- test/test_ensemble_tuning.py | 17 ++++++++++++----- 1 file 
changed, 12 insertions(+), 5 deletions(-) diff --git a/test/test_ensemble_tuning.py b/test/test_ensemble_tuning.py index e5c807d43..69efb5a68 100644 --- a/test/test_ensemble_tuning.py +++ b/test/test_ensemble_tuning.py @@ -17,9 +17,11 @@ def env(): kernel_string = """ extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) { - int i = blockIdx.x * block_size_x + threadIdx.x; - if (i 0 \ No newline at end of file From d381011f9ff628f9cdb7552e402134afc2397561 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 14:20:23 +0200 Subject: [PATCH 89/97] added check on strategy_options --- kernel_tuner/strategies/brute_force.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index ac5ae985a..1ba83a467 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -9,6 +9,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): if isinstance(runner, ParallelRunner): + if tuning_options.strategy_options is None: + tuning_options.strategy_options = {} tuning_options.strategy_options['check_and_retrieve'] = False cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) return runner.run(parameter_space=searchspace.sorted_list(), tuning_options=tuning_options, cache_manager=cache_manager) From 7e832e33ca1882f3319dd79911223dbe9be91141 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 14:20:56 +0200 Subject: [PATCH 90/97] removed all memetic algo related stuff --- kernel_tuner/interface.py | 4 +- kernel_tuner/runners/parallel.py | 28 +-- kernel_tuner/strategies/common.py | 3 +- kernel_tuner/strategies/genetic_algorithm.py | 60 ++--- kernel_tuner/strategies/greedy_ils.py | 25 +-- kernel_tuner/strategies/memetic.py | 224 ------------------- 6 files changed, 27 insertions(+), 317 deletions(-) delete mode 100644 kernel_tuner/strategies/memetic.py diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 63c4c2fff..e40304d08 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -58,8 +58,7 @@ pso, random_sample, simulated_annealing, - ensemble, - memetic + ensemble ) strategy_map = { @@ -79,7 +78,6 @@ "firefly_algorithm": firefly_algorithm, "bayes_opt": bayes_opt, "ensemble": ensemble, - "memetic": memetic, } diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 61b0edf50..8884b89e7 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -51,11 +51,7 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp if self.actors is None: runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] self.actors = [create_actor_on_device(*runner_attributes, id=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for _id in range(self.num_gpus)] - # actors_ready_futures = [actor.__ray_ready__.remote() for actor in futures] - # ray.wait(actors_ready_futures, num_returns=len(actors_ready_futures), timeout=None) - # self.actors = futures - - + # Check if all GPUs are of the same type if not self.simulation_mode and not self._check_gpus_equals(): raise GPUTypeMismatchError(f"Different GPU types found") @@ -137,43 +133,28 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) 
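            # The dispatch around this point follows Ray's standard work-queue idiom:
            # give every actor one strategy up front, then repeatedly wait for the first
            # task to finish and hand the freed actor the next strategy from the queue.
            # A minimal generic sketch (do_work, jobs and queues are illustrative names,
            # not the exact kernel_tuner API):
            #
            #   pending = {actor.do_work.remote(job): actor
            #              for actor, job in zip(actors, initial_jobs)}
            #   while pending:
            #       [done], _ = ray.wait(list(pending.keys()), num_returns=1)
            #       results.append(ray.get(done))
            #       freed_actor = pending.pop(done)
            #       if job_queue:
            #           pending[freed_actor.do_work.remote(job_queue.popleft())] = freed_actor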
pending_tasks[task] = actor - # Process results to extract population and candidates for further use - results, tuning_options_list, population, candidates = self._process_results_ensemble(all_results) - - # Update tuning options for memetic strategies - if population: - tuning_options.strategy_options["population"] = population - if candidates: - tuning_options.strategy_options["candidates"] = candidates + # Process results + results, tuning_options_list = self._process_results_ensemble(all_results) return results, tuning_options_list def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): new_tuning_options = copy.deepcopy(tuning_options) - if "candidates" in tuning_options.strategy_options: - if len(tuning_options.strategy_options["candidates"]) > 0: - new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) new_tuning_options.strategy_options["max_fevals"] = evaluations_per_strategy.pop(0) # the stop criterion uses the max feval in tuning options for some reason new_tuning_options["max_fevals"] = new_tuning_options.strategy_options["max_fevals"] return new_tuning_options def _process_results_ensemble(self, all_results): - population = [] # for memetic strategy - candidates = [] # for memetic strategy results = [] tuning_options_list = [] for (strategy_results, tuning_options) in all_results: - if "old_candidate" in tuning_options.strategy_options: - candidates.append(tuning_options.strategy_options["old_candidate"]) - if "candidate" in tuning_options.strategy_options: - population.append(tuning_options.strategy_options["candidate"]) results.extend(strategy_results) tuning_options_list.append(tuning_options) - return results, tuning_options_list, population, candidates + return results, tuning_options_list def parallel_function_evaluation(self, tuning_options, parameter_space): @@ -201,7 +182,6 @@ def _calculate_simulated_time(self, tuning_options_list): simulated_times = [] for tuning_options in tuning_options_list: simulated_times.append(tuning_options.simulated_time) - #simulated_times = [tuning_options.simulated_time for tuning_options in tuning_options_list] return max(simulated_times) def _check_gpus_equals(self): diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 6d010a0a9..7ea022519 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -52,8 +52,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" - accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", - "maxiter", "lsd", "popsize", "alsd", "split_searchspace", "check_and_retrieve"] + accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "check_and_retrieve"] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index b082ce3c6..52361a744 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -7,42 +7,39 @@ from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc -from kernel_tuner.runners.parallel import ParallelRunner _options = dict( popsize=("population size", 
20), maxiter=("maximum number of generations", 100), method=("crossover method to use, choose any from single_point, two_point, uniform, disruptive_uniform", "uniform"), mutation_chance=("chance to mutate is 1 in mutation_chance", 10), - population=("initial population", None), ) def tune(searchspace: Searchspace, runner, tuning_options): options = tuning_options.strategy_options - pop_size, generations, method, mutation_chance, population = common.get_options(options, _options) + pop_size, generations, method, mutation_chance = common.get_options(options, _options) crossover = supported_methods[method] best_score = 1e20 cost_func = CostFunc(searchspace, tuning_options, runner) - if not population: - population = list(list(p) for p in searchspace.get_random_sample(pop_size)) - else: - pop_size = len(population) - - old_population = population + population = list(list(p) for p in searchspace.get_random_sample(pop_size)) + for generation in range(generations): - # Evaluate the entire population - try: - old_population = population - weighted_population = evaluate_population(runner, cost_func, population) - except util.StopCriterionReached as e: - if tuning_options.verbose: - print(e) - return cost_func.results + # determine fitness of population members + weighted_population = [] + for dna in population: + try: + time = cost_func(dna, check_restrictions=False) + except util.StopCriterionReached as e: + if tuning_options.verbose: + print(e) + return cost_func.results + + weighted_population.append((dna, time)) # population is sorted such that better configs have higher chance of reproducing weighted_population.sort(key=lambda x: x[1]) @@ -72,8 +69,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): break # could combine old + new generation here and do a selection - tuning_options.strategy_options["population"] = old_population # for memetic strategy - tuning_options.strategy_options["candidates"] = population # for memetic strategy + return cost_func.results @@ -180,28 +176,4 @@ def disruptive_uniform_crossover(dna1, dna2): "two_point": two_point_crossover, "uniform": uniform_crossover, "disruptive_uniform": disruptive_uniform_crossover, -} - -def evaluate_population(runner, cost_func, population): - """ - Evaluate the population based on the type of runner. - - Parameters: - - runner: The runner (ParallelRunner or SequentialRunner) determining how to process evaluations. - - cost_func: A function capable of evaluating the population. - - population: List of individuals to be evaluated. - - Returns: - - List of tuples (dna, fitness_score) representing the population and their evaluation results. 
- """ - if isinstance(runner, ParallelRunner): - # Process the whole population at once if using a ParallelRunner - results = cost_func(population, check_restrictions=False) - return list(zip(population, results)) - else: - # Process each individual sequentially for SequentialRunner - weighted_population = [] - for dna in population: - time = cost_func(dna, check_restrictions=False) # Cost function called with a single-element list - weighted_population.append((dna, time)) - return weighted_population \ No newline at end of file +} \ No newline at end of file diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py index bbceb76b8..26d15f591 100644 --- a/kernel_tuner/strategies/greedy_ils.py +++ b/kernel_tuner/strategies/greedy_ils.py @@ -9,8 +9,7 @@ _options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True), no_improvement=("number of evaluations to exceed without improvement before restarting", 50), - random_walk=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", 0.3), - candidate=("initial candidate for the search", None)) + random_walk=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", 0.3)) def tune(searchspace: Searchspace, runner, tuning_options): @@ -18,7 +17,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): options = tuning_options.strategy_options - neighbor, restart, no_improvement, randomwalk, candidate = common.get_options(options, _options) + neighbor, restart, no_improvement, randomwalk = common.get_options(options, _options) perm_size = int(randomwalk * dna_size) if perm_size == 0: @@ -32,28 +31,16 @@ def tune(searchspace: Searchspace, runner, tuning_options): cost_func = CostFunc(searchspace, tuning_options, runner) #while searching - if not candidate: - candidate = searchspace.get_random_sample(1)[0] - old_candidate = candidate # for memetic strategy - try: - best_score = cost_func(candidate, check_restrictions=False) - except util.StopCriterionReached as e: - tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy - tuning_options.strategy_options["candidate"] = candidate # for memetic strategy - if tuning_options.verbose: - print(e) - return cost_func.results + candidate = searchspace.get_random_sample(1)[0] + best_score = cost_func(candidate, check_restrictions=False) last_improvement = 0 while fevals < max_fevals: try: - old_candidate = candidate # for memetic strategy candidate = base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True) new_score = cost_func(candidate, check_restrictions=False) except util.StopCriterionReached as e: - tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy - tuning_options.strategy_options["candidate"] = candidate # for memetic strategy if tuning_options.verbose: print(e) return cost_func.results @@ -66,8 +53,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Instead of full restart, permute the starting candidate candidate = random_walk(candidate, perm_size, no_improvement, last_improvement, searchspace) - tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy - tuning_options.strategy_options["candidate"] = candidate # for memetic strategy return 
cost_func.results @@ -78,4 +63,4 @@ def random_walk(indiv, permutation_size, no_improve, last_improve, searchspace: return searchspace.get_random_sample(1)[0] for _ in range(permutation_size): indiv = mutate(indiv, 0, searchspace, cache=False) - return indiv + return indiv \ No newline at end of file diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py deleted file mode 100644 index 083b117a7..000000000 --- a/kernel_tuner/strategies/memetic.py +++ /dev/null @@ -1,224 +0,0 @@ -import logging -import ray -import os -import sys -import copy - -from kernel_tuner.searchspace import Searchspace -from kernel_tuner.runners.parallel import ParallelRunner -from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.sequential import SequentialRunner -from kernel_tuner.runners.ray.cache_manager import CacheManager -from kernel_tuner.strategies.common import check_num_devices, create_actor_on_device, initialize_ray -from kernel_tuner.util import get_num_devices, check_stop_criterion, StopCriterionReached -from kernel_tuner.runners.ray.remote_actor import RemoteActor - -from kernel_tuner.strategies import ( - basinhopping, - bayes_opt, - brute_force, - diff_evo, - dual_annealing, - firefly_algorithm, - genetic_algorithm, - greedy_ils, - greedy_mls, - minimize, - mls, - ordered_greedy_mls, - pso, - random_sample, - simulated_annealing, - ensemble, - memetic -) - -strategy_map = { - "brute_force": brute_force, - "random_sample": random_sample, - "minimize": minimize, - "basinhopping": basinhopping, - "diff_evo": diff_evo, - "genetic_algorithm": genetic_algorithm, - "greedy_mls": greedy_mls, - "ordered_greedy_mls": ordered_greedy_mls, - "greedy_ils": greedy_ils, - "dual_annealing": dual_annealing, - "mls": mls, - "pso": pso, - "simulated_annealing": simulated_annealing, - "firefly_algorithm": firefly_algorithm, - "bayes_opt": bayes_opt, -} - -ls_strategies_list = { - "greedy_mls", - "ordered_greedy_mls", - "greedy_ils", - "mls", - "hill_climbing" -} - -pop_based_strategies_list = { - "genetic_algorithm", - "differential_evolution", - "pso" -} - - -def tune(searchspace: Searchspace, runner, tuning_options): - options = tuning_options.strategy_options - simulation_mode = True if isinstance(runner, SimulationRunner) else False - local_search = options.get('local_search', 'greedy_ils') - global_search = options.get('global_search', "genetic_algorithm") - alsd = options.get("alsd", 5) # Adaptive Local Search Depth (ALSD) - lsd = options.get("lsd", 30) # Local Search Depth (LSD) - maxiter = options.get("maxiter", 3) - popsize = options.get("popsize", 20) - max_feval = options.get("max_fevals", None if 'time_limit' in options else 2000) - print(f"DEBUG: local_search={local_search} global_search={global_search} alsd={alsd} lsd={lsd} maxiter={maxiter} popsize={popsize} max_feval={max_feval}", file=sys.stderr) - - if local_search in ls_strategies_list: - tuning_options.strategy_options["ensemble"] = [local_search] * popsize - else: - raise ValueError("Provided local search ensemble are not all local search strategies") - - if global_search in pop_based_strategies_list: - global_search = strategy_map[global_search] - else: - raise ValueError("Provided population based strategy is not a population based strategy") - - options["population"] = searchspace.get_random_sample(popsize) - - initialize_ray() - num_gpus = get_num_devices(simulation_mode=simulation_mode) - check_num_devices(num_gpus, simulation_mode, runner) - # Create cache manager, actors and 
parallel runner - cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) - num_actors = num_gpus if num_gpus < popsize else popsize - runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] - actors = [create_actor_on_device(*runner_attributes, id=id, cache_manager=cache_manager, simulation_mode=simulation_mode) for id in range(num_actors)] - pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, - runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager, - simulation_mode=simulation_mode, actors=actors) - - all_results = [] - all_results_dict = {} - feval = 0 - afi_gs, afi_ls = None, None - while (max_feval is None) or feval < max_feval: - print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) - if max_feval is not None: - maxiter, lsd = distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls) - print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) - - # Global Search (GS) - print(f"DEBUG:=================Global Search=================", file=sys.stderr) - tuning_options.strategy_options["maxiter"] = maxiter - pop_start_gs = copy.deepcopy(tuning_options.strategy_options["population"]) - results = global_search.tune(searchspace, pop_runner, tuning_options) - add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) - feval += maxiter * popsize - - pop_start_gs_res = get_pop_results(pop_start_gs, all_results_dict) - pop_end_gs = copy.deepcopy(tuning_options.strategy_options["population"]) - pop_end_gs_res = get_pop_results(pop_end_gs, all_results_dict) - afi_gs = calculate_afi(pop_start_gs_res, pop_end_gs_res, maxiter, all_results_dict) - - # Local Search (LS) - print(f"DEBUG:=================Local Search=================", file=sys.stderr) - tuning_options.strategy_options["max_fevals"] = lsd * popsize - pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) - results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) - add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) - feval += lsd * popsize - - pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) - pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) - pop_end_ls_res = get_pop_results(pop_end_ls, all_results_dict) - afi_ls = calculate_afi(pop_start_ls_res, pop_end_ls_res, lsd, all_results_dict) - - # Adaptive Local Search Depth (ALSD) - if afi_gs is not None and afi_ls is not None: - if afi_ls > afi_gs: - lsd += alsd - elif afi_ls < afi_gs: - lsd -= alsd - # Less than 5 lsd doesn't make sense - if lsd < 5: - lsd = 5 - print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) - - ray.kill(cache_manager) - for actor in actors: - ray.kill(actor) - - return all_results - -def calculate_afi(pop_before_rs, pop_after_rs, feval, results): - # Average Fitness Increment (AFI) - assert(feval >= 0) - delta_fitness = fitness_increment(pop_before_rs, pop_after_rs) - afi = delta_fitness / feval if feval > 0 else 0 - print(f"DEBUG:calculate_afi afi: {afi}", file=sys.stderr) - return afi - -def fitness_increment(pop_before, pop_after): - if len(pop_before) != len(pop_after): - raise ValueError("populations must have the same size.") - - sum_before = sum(t for t in pop_before if isinstance(t, float)) - sum_after = 
sum(t for t in pop_after if isinstance(t, float)) - difference_sum = sum_before - sum_after - return difference_sum - -def get_pop_results(pop, results): - print(f"DEBUG:get_pop_results pop = {pop}", file=sys.stderr) - times = [] - for entry in pop: - key = ','.join(map(str, entry)) - if key in results: - time = results[key] - times.append(time) - else: - times.append(None) - - print(f"DEBUG:get_pop_results times = {times}", file=sys.stderr) - return times - -def add_to_results(all_results, all_results_dict, results, tune_params): - for result in results: - key = ",".join(str(result[param]) for param in tune_params) - all_results_dict[key] = result["time"] - all_results.append(result) - -def distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls): - remaining_feval = max_feval - feval - if remaining_feval < (lsd + maxiter) * popsize: - # Calculate how many full batches of popsize can still be processed - proportion = remaining_feval // popsize - - if afi_gs is None or afi_ls is None: - maxiter = int(proportion * 0.5) - lsd = int(proportion * 0.5) - else: - if afi_gs > afi_ls: - # More evaluations to maxiter - maxiter = int(proportion * 0.6) - lsd = int(proportion * 0.4) - else: - # More evaluations to lsd - maxiter = int(proportion * 0.4) - lsd = int(proportion * 0.6) - - # If maxiter ends up being 1, assign all remaining feval to lsd - if maxiter == 1: - lsd = proportion # Give all available batches to lsd - maxiter = 0 - - # Ensure at least one of maxiter or lsd is non-zero if there are still fevals to be used - if maxiter == 0 and lsd == 0 and remaining_feval > 0: - lsd = 1 # Allocate at least one batch to lsd to ensure progress - - return maxiter, lsd - \ No newline at end of file From 65d32c1daf2f763b3472608d3be48f9186b25e79 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 14:29:02 +0200 Subject: [PATCH 91/97] added ray to pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 3175ed34a..0209574c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ python-constraint2 = "^2.0.0b5" xmltodict = "*" pandas = ">=2.0.0" scikit-learn = ">=1.0.2" +ray = ">=2.9.1" # Torch can be used with Kernel Tuner, but is not a dependency, should be up to the user to use it # List of optional dependencies for user installation, e.g. `pip install kernel_tuner[cuda]`, used in the below `extras`. From 503df1b81b77b40d15d031cf2e8a7acf94540fbc Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 14:42:40 +0200 Subject: [PATCH 92/97] updated toml file with ray dashboard --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b48f7e458..721c60e7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ python-constraint2 = "^2.0.0b5" xmltodict = "*" pandas = ">=2.0.0" scikit-learn = ">=1.0.2" -ray = ">=2.9.1" +ray = { version = ">=2.9.1", extras = ["default"] } # Torch can be used with Kernel Tuner, but is not a dependency, should be up to the user to use it # List of optional dependencies for user installation, e.g. `pip install kernel_tuner[cuda]`, used in the below `extras`. 
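Note on the dependency change above: installing Ray with the "default" extra pulls in the Ray dashboard in addition to the core runtime. A minimal sketch of checking such an installation and of querying the GPU resources Ray can schedule on (illustrative only, not part of the patches, assuming ray>=2.9.1 with the "default" extra is installed):

    import ray

    # Start a local Ray instance; the dashboard is only included with the "default" extra.
    ray.init(include_dashboard=True)

    # Report how many GPUs Ray detects in the cluster (0 if none are found).
    print(ray.cluster_resources().get("GPU", 0))

    ray.shutdown()
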
From c126a011e19a51d5eb31767638c80b1644d7c202 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 18:20:09 +0200 Subject: [PATCH 93/97] fix small bug in _evaluate_configs --- kernel_tuner/strategies/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 7ea022519..5e4dba354 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -194,7 +194,6 @@ def _evaluate_configs(self, configs): # in case of stop creterion reached, save the results so far self.results.append(result) - self.results.extend(final_results) # upon returning from this function control will be given back to the strategy, so reset the start time self.runner.last_strategy_start_time = perf_counter() From 4df1b0d872f85a0a32593620a9f2068ff9ce9e62 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 18:33:49 +0200 Subject: [PATCH 94/97] adapted test for ensemble --- test/strategies/test_strategies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strategies/test_strategies.py b/test/strategies/test_strategies.py index 096be38b0..1001aabec 100644 --- a/test/strategies/test_strategies.py +++ b/test/strategies/test_strategies.py @@ -36,7 +36,7 @@ def vector_add(): @pytest.mark.parametrize('strategy', strategy_map) def test_strategies(vector_add, strategy): - options = dict(popsize=5, neighbor='adjacent') + options = dict(popsize=5) print(f"testing {strategy}") From 29a507cc44812c2d40998c696fb79ea3a1151f9c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 18:46:36 +0200 Subject: [PATCH 95/97] cleaned up not used imports --- kernel_tuner/runners/parallel.py | 3 --- kernel_tuner/runners/ray/remote_actor.py | 2 -- kernel_tuner/strategies/ensemble.py | 15 +-------------- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 8884b89e7..e81341160 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -1,7 +1,5 @@ -import logging import ray import sys -import os from ray.util.actor_pool import ActorPool from time import perf_counter from collections import deque @@ -9,7 +7,6 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner -from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices, GPUTypeMismatchError from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 533dea5b3..c0743ad22 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -1,6 +1,4 @@ import ray -import sys -import copy from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 2a19f9f74..9cdc0b90e 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -1,22 +1,9 @@ -import random -import sys -import os -import ray -import copy -import logging import warnings -from collections import deque -import numpy as np - -from kernel_tuner import util from kernel_tuner.searchspace import Searchspace -from kernel_tuner.strategies import common -from kernel_tuner.strategies.common 
import CostFunc, scale_from_params, check_num_devices, create_actor_on_device, initialize_ray +from kernel_tuner.strategies.common import initialize_ray from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices -from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.strategies import ( From 7c49a29da61886af990abc8c72fb3128619f272e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 18:56:53 +0200 Subject: [PATCH 96/97] added comments --- kernel_tuner/runners/parallel.py | 18 ++++++++++++++++++ kernel_tuner/strategies/ensemble.py | 4 ++++ 2 files changed, 22 insertions(+) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index e81341160..871b93228 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -144,6 +144,9 @@ def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): return new_tuning_options def _process_results_ensemble(self, all_results): + """ + Process the results from the ensemble execution. + """ results = [] tuning_options_list = [] @@ -155,6 +158,9 @@ def _process_results_ensemble(self, all_results): def parallel_function_evaluation(self, tuning_options, parameter_space): + """ + Perform parallel function evaluation. + """ # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. @@ -164,6 +170,9 @@ def parallel_function_evaluation(self, tuning_options, parameter_space): return results, tuning_options_list def _process_results(self, all_results, searchspace): + """ + Process the results and remove duplicates based on the searchspace. + """ unique_configs = set() final_results = [] @@ -176,12 +185,18 @@ def _process_results(self, all_results, searchspace): return final_results def _calculate_simulated_time(self, tuning_options_list): + """ + Calculate the maximum simulated time from the list of tuning options. + """ simulated_times = [] for tuning_options in tuning_options_list: simulated_times.append(tuning_options.simulated_time) return max(simulated_times) def _check_gpus_equals(self): + """ + Check if all GPUs are of the same type. + """ gpu_types = [] env_refs = [actor.get_environment.remote() for actor in self.actors] environments = ray.get(env_refs) @@ -194,6 +209,9 @@ def _check_gpus_equals(self): return False def clean_up_ray(self): + """ + Clean up Ray actors and cache manager. 
+ """ if self.actors is not None: for actor in self.actors: ray.kill(actor) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 9cdc0b90e..7e66f0360 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -50,19 +50,23 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a ensemble = options.get('ensemble', ["diff_evo", "diff_evo"]) ensemble_size = len(ensemble) + # setup strategy options if 'bayes_opt' in ensemble: # All strategies start from a random sample except for BO tuning_options.strategy_options["samplingmethod"] = 'random' tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) tuning_options.strategy_options['check_and_retrieve'] = True + # define number of ray actors needed if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) num_actors = num_devices if ensemble_size > num_devices else ensemble_size ensemble = [strategy_map[strategy] for strategy in ensemble] + parallel_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers, num_gpus=num_actors, cache_manager=cache_manager, simulation_mode=simulation_mode, actors=actors) + final_results = parallel_runner.run(tuning_options=tuning_options, ensemble=ensemble, searchspace=searchspace) if clean_up: From eb5db41fb0bee44ed7a606e4e88c9ecc3d77f877 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Jul 2024 16:52:37 +0200 Subject: [PATCH 97/97] added documentation and related fixes --- doc/source/optimization.rst | 1 + kernel_tuner/interface.py | 1 + kernel_tuner/runners/parallel.py | 99 ++++++++++++++++++++++++-- kernel_tuner/runners/sequential.py | 6 ++ kernel_tuner/strategies/brute_force.py | 2 +- kernel_tuner/strategies/ensemble.py | 21 ++++-- 6 files changed, 119 insertions(+), 11 deletions(-) diff --git a/doc/source/optimization.rst b/doc/source/optimization.rst index 59219ad51..2b8dd8987 100644 --- a/doc/source/optimization.rst +++ b/doc/source/optimization.rst @@ -25,6 +25,7 @@ the ``strategy=`` optional argument of ``tune_kernel()``. Kernel Tuner currently * "pso" particle swarm optimization * "random_sample" takes a random sample of the search space * "simulated_annealing" simulated annealing strategy + * "ensemble" ensemble strategy Most strategies have some mechanism built in to detect when to stop tuning, which may be controlled through specific parameters that can be passed to the strategies using the ``strategy_options=`` optional argument of ``tune_kernel()``. 
You diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index e40304d08..0be907737 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -467,6 +467,7 @@ def __deepcopy__(self, _): ), ("metrics", ("specifies user-defined metrics, please see :ref:`metrics`.", "dict")), ("simulation_mode", ("Simulate an auto-tuning search from an existing cachefile", "bool")), + ("parallel_mode", ("Run the auto-tuning on multiple devices (brute-force execution)", "bool")), ("observers", ("""A list of Observers to use during tuning, please see :ref:`observers`.""", "list")), ] ) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 871b93228..a7f2d95fc 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -12,9 +12,41 @@ from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray class ParallelRunner(Runner): + """ParallelRunner is used for tuning with multiple processes/threads using Ray for distributed computing.""" def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, num_gpus=None, cache_manager=None, actors=None, simulation_mode=False): + """Instantiate the ParallelRunner. + + :param kernel_source: The kernel source + :type kernel_source: kernel_tuner.core.KernelSource + + :param kernel_options: A dictionary with all options for the kernel. + :type kernel_options: kernel_tuner.interface.Options + + :param device_options: A dictionary with all options for the device + on which the kernel should be tuned. + :type device_options: kernel_tuner.interface.Options + + :param iterations: The number of iterations used for benchmarking + each kernel instance. + :type iterations: int + + :param observers: List of observers. + :type observers: list + + :param num_gpus: Number of GPUs to use. Defaults to None. + :type num_gpus: int, optional + + :param cache_manager: Cache manager instance. Defaults to None. + :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional + + :param actors: List of pre-initialized actors. Defaults to None. + :type actors: list, optional + + :param simulation_mode: Flag to indicate simulation mode. Defaults to False. + :type simulation_mode: bool, optional + """ self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if not simulation_mode else None self.kernel_source = kernel_source self.simulation_mode = simulation_mode @@ -41,6 +73,26 @@ def get_environment(self, tuning_options): return self.dev.get_environment() def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchspace=None, cache_manager=None): + """Run the tuning process with parallel execution. + + :param parameter_space: The parameter space to explore. + :type parameter_space: iterable + + :param tuning_options: Tuning options. Defaults to None. + :type tuning_options: dict, optional + + :param ensemble: List of strategies for ensemble. Defaults to None. + :type ensemble: list, optional + + :param searchspace: The search space to explore. Defaults to None. + :type searchspace: kernel_tuner.searchspace.Searchspace, optional + + :param cache_manager: Cache manager instance. Defaults to None. + :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional + + :returns: Results of the tuning process. 
+ :rtype: list of dict + """ if tuning_options is None: #HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument raise ValueError("tuning_options cannot be None") @@ -84,9 +136,20 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp return results def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspace): - """ - Runs strategies from the ensemble in parallel using distributed actors, + """Runs strategies from the ensemble in parallel using distributed actors, manages dynamic task allocation, and collects results. + + :param ensemble: List of strategies to execute. + :type ensemble: list + + :param tuning_options: Tuning options. + :type tuning_options: dict + + :param searchspace: Search space to explore. + :type searchspace: kernel_tuner.searchspace.Searchspace + + :returns: Processed results and tuning options list. + :rtype: tuple """ ensemble_queue = deque(ensemble) pending_tasks = {} @@ -137,6 +200,17 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): + """Set up tuning options for each strategy in the ensemble. + + :param tuning_options: Original tuning options. + :type tuning_options: dict + + :param evaluations_per_strategy: Number of evaluations per strategy. + :type evaluations_per_strategy: list + + :returns: Modified tuning options. + :rtype: dict + """ new_tuning_options = copy.deepcopy(tuning_options) new_tuning_options.strategy_options["max_fevals"] = evaluations_per_strategy.pop(0) # the stop criterion uses the max feval in tuning options for some reason @@ -144,8 +218,13 @@ def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): return new_tuning_options def _process_results_ensemble(self, all_results): - """ - Process the results from the ensemble execution. + """Process the results from the ensemble execution. + + :param all_results: List of results from all strategies. + :type all_results: list + + :returns: Processed results and tuning options list. + :rtype: tuple """ results = [] tuning_options_list = [] @@ -158,8 +237,16 @@ def _process_results_ensemble(self, all_results): def parallel_function_evaluation(self, tuning_options, parameter_space): - """ - Perform parallel function evaluation. + """Perform parallel function evaluation. + + :param tuning_options: Tuning options. + :type tuning_options: dict + + :param parameter_space: Parameter space to explore. + :type parameter_space: list + + :returns: Results and tuning options list. + :rtype: tuple """ # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index e19242549..46ba17e0a 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -28,6 +28,12 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :param iterations: The number of iterations used for benchmarking each kernel instance. :type iterations: int + + :param observers: List of observers. + :type observers: list + + :param cache_manager: Cache manager instance. Defaults to None. 
+ :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional """ #detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if dev is None else dev diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index 1ba83a467..cf6ba521b 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -4,7 +4,7 @@ from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.runners.ray.cache_manager import CacheManager -_options = {} +_options = dict(num_gpus=("Number of gpus to run parallel execution", None)) def tune(searchspace: Searchspace, runner, tuning_options): diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 7e66f0360..2dab125f4 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -1,6 +1,11 @@ +""" +The ensemble strategy that optimizes the search through the parameter space using a combination of multiple strategies. +""" + import warnings from kernel_tuner.searchspace import Searchspace +from kernel_tuner.strategies import common from kernel_tuner.strategies.common import initialize_ray from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.util import get_num_devices @@ -40,20 +45,26 @@ "bayes_opt": bayes_opt, } +_options = dict( + ensemble=("List of strategies to be used in the ensemble", ["random_sample", "random_sample"]), + max_fevals=("Maximum number of function evaluations", None), + num_gpus=("Number of gpus to run the parallel ensemble on", None) +) + def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None): clean_up = True if actors is None and cache_manager is None else False options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False initialize_ray() - num_devices = tuning_options['num_gpus'] if 'num_gpus' in tuning_options else get_num_devices(simulation_mode=simulation_mode) - - ensemble = options.get('ensemble', ["diff_evo", "diff_evo"]) + + ensemble, max_fevals, num_gpus =common.get_options(tuning_options.strategy_options, _options) + num_devices = num_gpus if num_gpus is not None else get_num_devices(simulation_mode=simulation_mode) ensemble_size = len(ensemble) # setup strategy options if 'bayes_opt' in ensemble: # All strategies start from a random sample except for BO tuning_options.strategy_options["samplingmethod"] = 'random' - tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) + tuning_options.strategy_options["max_fevals"] = 100 * ensemble_size if max_fevals is None else max_fevals tuning_options.strategy_options['check_and_retrieve'] = True # define number of ray actors needed @@ -73,3 +84,5 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a parallel_runner.clean_up_ray() return final_results + +tune.__doc__ = common.get_strategy_docstring("Ensemble", _options) \ No newline at end of file
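
Usage note for the ensemble strategy documented above: a call through tune_kernel could look roughly like the sketch below. The kernel source, problem size, and tuning parameters are placeholders invented for the illustration; only the strategy name and the strategy_options keys (ensemble, max_fevals, num_gpus) follow the options introduced in these patches, and the two strategy names are merely examples of entries the ensemble accepts.

    import numpy as np
    import kernel_tuner

    # Placeholder CUDA kernel used only to illustrate the call; any tunable kernel would do.
    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 10000000
    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(a)
    n = np.int32(size)

    tune_params = {"block_size_x": [32, 64, 128, 256, 512]}

    # Run two strategies side by side, capping the total number of function
    # evaluations at 200 and using at most two GPUs via Ray.
    results, env = kernel_tuner.tune_kernel(
        "vector_add", kernel_string, size, [c, a, b, n], tune_params,
        strategy="ensemble",
        strategy_options={"ensemble": ["greedy_ils", "genetic_algorithm"],
                          "max_fevals": 200,
                          "num_gpus": 2},
    )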