Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement priority mechanism in new scheduler #8139

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
description: Implement priority mechanism on scheduler
change-type: minor
destination-branches: [master]

15 changes: 11 additions & 4 deletions src/inmanta/agent/agent_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from inmanta.const import AGENT_SCHEDULER_ID
from inmanta.data.model import AttributeStateChange, ResourceVersionIdStr
from inmanta.deploy.scheduler import ResourceScheduler
from inmanta.deploy.work import TaskPriority
from inmanta.protocol import SessionEndpoint, methods, methods_v2
from inmanta.types import Apireturn
from inmanta.util import CronSchedule, IntervalSchedule, ScheduledTask, Scheduler, TaskMethod, TaskSchedule, join_threadpools
Expand Down Expand Up @@ -132,8 +133,14 @@ def periodic_schedule(
return True
return False

periodic_schedule("deploy", self.scheduler.deploy, self._deploy_interval, self._deploy_splay_value)
periodic_schedule("repair", self.scheduler.repair, self._repair_interval, self._repair_splay_value)
# Thin wrappers that bind the interval-triggered priority levels, so the
# periodic scheduler can invoke deploy/repair without knowing about priorities.
async def interval_deploy() -> None:
    await self.scheduler.deploy(TaskPriority.INTERVAL_DEPLOY)

async def interval_repair() -> None:
    await self.scheduler.repair(TaskPriority.INTERVAL_REPAIR)
Comment on lines +136 to +140
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a slight preference for functools.partial over defining custom methods for this. But if you prefer it like this I'm fine with it.


periodic_schedule("deploy", interval_deploy, self._deploy_interval, self._deploy_splay_value)
periodic_schedule("repair", interval_repair, self._repair_interval, self._repair_splay_value)

def _enable_time_trigger(self, action: TaskMethod, schedule: TaskSchedule) -> None:
    """Register *action* with this agent's task scheduler to run on *schedule*."""
    self._sched.add_action(action, schedule)
Expand Down Expand Up @@ -240,10 +247,10 @@ async def trigger_update(self, env: uuid.UUID, agent: str, incremental_deploy: b
assert agent == AGENT_SCHEDULER_ID
if incremental_deploy:
LOGGER.info("Agent %s got a trigger to run deploy in environment %s", agent, env)
await self.scheduler.deploy()
await self.scheduler.deploy(TaskPriority.USER_DEPLOY)
else:
LOGGER.info("Agent %s got a trigger to run repair in environment %s", agent, env)
await self.scheduler.repair()
await self.scheduler.repair(TaskPriority.USER_REPAIR)
return 200

@protocol.handle(methods.trigger_read_version, env="tid", agent="id")
Expand Down
21 changes: 13 additions & 8 deletions src/inmanta/deploy/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from inmanta.deploy import work
from inmanta.deploy.state import DeploymentResult, ModelState, ResourceDetails, ResourceState, ResourceStatus
from inmanta.deploy.tasks import DryRun, RefreshFact, Task
from inmanta.deploy.work import PrioritizedTask
from inmanta.deploy.work import PrioritizedTask, TaskPriority
from inmanta.protocol import Client
from inmanta.resources import Id

Expand Down Expand Up @@ -169,20 +169,20 @@ async def stop(self) -> None:
self._work.agent_queues.send_shutdown()
await asyncio.gather(*self._workers.values())

async def deploy(self, priority: TaskPriority = TaskPriority.USER_DEPLOY) -> None:
    """
    Trigger a deploy of all resources currently marked as dirty.

    :param priority: Priority with which to schedule the resulting deploy tasks.
        Defaults to the user-triggered level so outside callers don't need to be
        concerned with priorities unless they explicitly want to override it.
    """
    async with self._scheduler_lock:
        # NOTE(review): `deploying=self._deploying_stale` here was flagged as an
        # incorrectly resolved merge conflict; `self._deploying` is consistent
        # with the other deploy_with_context() call sites in this class.
        self._work.deploy_with_context(self._state.dirty, priority, deploying=self._deploying)
async def repair(self, priority: TaskPriority = TaskPriority.USER_REPAIR) -> None:
    """
    Trigger a repair, i.e. mark all resources as dirty, then trigger a deploy.

    :param priority: Priority with which to schedule the resulting deploy tasks.
        Defaults to the user-triggered repair level so outside callers don't need
        to be concerned with priorities unless they explicitly want to override it.
    """
    async with self._scheduler_lock:
        self._state.dirty.update(self._state.resources.keys())
        # NOTE(review): use `self._deploying` (not `_deploying_stale`), consistent
        # with the other deploy_with_context() call sites in this class.
        self._work.deploy_with_context(self._state.dirty, priority, deploying=self._deploying)

async def dryrun(self, dry_run_id: uuid.UUID, version: int) -> None:
resources = await self._build_resource_mappings_from_db(version)
Expand All @@ -195,7 +195,7 @@ async def dryrun(self, dry_run_id: uuid.UUID, version: int) -> None:
resource_details=resource,
dry_run_id=dry_run_id,
),
priority=10,
priority=TaskPriority.DRYRUN,
)
)

Expand All @@ -204,7 +204,7 @@ async def get_facts(self, resource: dict[str, object]) -> None:
self._work.agent_queues.queue_put_nowait(
PrioritizedTask(
task=RefreshFact(resource=rid),
priority=10,
priority=TaskPriority.FACT_REFRESH,
)
)

Expand Down Expand Up @@ -305,6 +305,7 @@ async def _new_version(
# ensure deploy for ALL dirty resources, not just the new ones
self._work.deploy_with_context(
self._state.dirty,
TaskPriority.NEW_VERSION_DEPLOY,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, I hadn't considered this to be a separate case.

deploying=self._deploying,
added_requires=added_requires,
dropped_requires=dropped_requires,
Expand Down Expand Up @@ -384,7 +385,11 @@ async def report_resource_state(
if details.attributes.get("send_event", False):
provides: Set[ResourceIdStr] = self._state.requires.provides_view().get(resource, set())
if provides:
self._work.deploy_with_context(provides, deploying=self._deploying)
# Not sure about this priority. I believe this method is called when a resource has a new state
# and hence, new version.
Comment on lines +388 to +389
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reminder to drop this comment.

As to the question: this method is called when a resource finishes deploy (or at least we only reach this branch of it in that case). The deploy that is triggered here is the propagation of events to its dependents. e.g. a service's resource deploys successfully -> we schedule the lsm state transfer resource for deploy.

I wonder if we might want a separate priority level for event propagation. I would rank it pretty high (urgent) for fast event response, but perhaps given how eagerly we currently send them out (because we'll still default to receive_events=True), that might not be the best idea at the moment.

Overall, perhaps we should simply schedule it with the same priority as the resource that just finished? We could pass in the information to this method, or we could just query the agent queues mapping. The first is more direct, but it feels a bit clunky, given how generic this method is, so I think I prefer the second.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, we don't even keep that information at the moment. I need to give this some more thought. Preliminary suggestion: make AgentQueues._in_progress a mapping from Task to PrioritizedTask instead of the current set, then query that here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or perhaps have the queue_get method return PrioritizedTask instead of Task, and pass on the task context to this method?

I'll get back to this. I have to consider what fits best.

self._work.deploy_with_context(
provides, priority=TaskPriority.NEW_VERSION_DEPLOY, deploying=self._deploying
)

def get_types_for_agent(self, agent: str) -> Collection[ResourceType]:
return list(self._state.types_per_agent[agent])
38 changes: 30 additions & 8 deletions src/inmanta/deploy/work.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import itertools
from collections.abc import Iterator, Mapping, Set
from dataclasses import dataclass
from enum import IntEnum
from typing import Callable, Generic, Optional, TypeVar

from inmanta.data.model import ResourceIdStr
Expand All @@ -34,6 +35,17 @@
T = TypeVar("T", bound=tasks.Task, covariant=True)


class TaskPriority(IntEnum):
    """
    Priority levels for tasks placed on the scheduler's agent queues.
    Lower values are picked up first.

    Only the *relative* order of these members matters: the numeric values are
    never persisted (e.g. to the database), so they can be freely changed to
    insert new levels in between existing ones if the need arises.
    """

    TERMINATED = -1
    USER_DEPLOY = 0
    NEW_VERSION_DEPLOY = 1
    USER_REPAIR = 2
    DRYRUN = 3
    INTERVAL_DEPLOY = 4
    FACT_REFRESH = 5
    INTERVAL_REPAIR = 6
Comment on lines +39 to +46
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a non-issue, but should we reserve some leeway by using these numbers * 10 in case we need to insert some other tasks in-between these ones in the future?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is much of an issue, we can change the numbers how we like because in the code we just use the task's name. @sanderr what do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, I care mostly about the relative order. I do think that a comment stating that these can be freely updated would be useful.

If we ever start writing priorities to the database it's different, but afaik we don't plan to.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jptrindade could you add such a comment?



@dataclass(frozen=True, kw_only=True)
class PrioritizedTask(Generic[T]):
"""
Expand All @@ -45,7 +57,7 @@ class PrioritizedTask(Generic[T]):
"""

task: T
priority: int
priority: TaskPriority


@functools.total_ordering
Expand Down Expand Up @@ -198,7 +210,8 @@ def send_shutdown(self) -> None:
"""
poison_pill = TaskQueueItem(
task=PrioritizedTask(
task=tasks.PoisonPill(resource=ResourceIdStr("system::Terminate[all,stop=True]")), priority=-1
task=tasks.PoisonPill(resource=ResourceIdStr("system::Terminate[all,stop=True]")),
priority=TaskPriority.TERMINATED,
),
insert_order=0,
)
Expand Down Expand Up @@ -294,6 +307,7 @@ def reset(self) -> None:
def deploy_with_context(
self,
resources: Set[ResourceIdStr],
priority: TaskPriority,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm very much in favor of making this kw-only like the other args.

*,
# TODO: update docstring + consider in_progress_deploys for name?
deploying: Optional[Set[ResourceIdStr]] = None,
Expand All @@ -307,6 +321,7 @@ def deploy_with_context(

:param resources: Set of resources that should be deployed. Adds a deploy task to the scheduled work for each
of these, unless it is already scheduled.
:param priority: The priority of this deploy.
:param stale_deploys: Set of resources for which a stale deploy is in progress, i.e. a deploy for an outdated resource
intent.
:param added_requires: Requires edges that were added since the previous state update, if any.
Expand Down Expand Up @@ -377,11 +392,12 @@ def extend_blocked_on(resource: ResourceIdStr, new_blockers: set[ResourceIdStr])
# discard rather than remove because task may already be running, in which case we leave it run its course
# and simply add a new one
task: tasks.Deploy = tasks.Deploy(resource=resource)
priority: Optional[int] = self.agent_queues.discard(task)
task_priority: Optional[int] = self.agent_queues.discard(task)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we update the typing as well ? e.g.:

Suggested change
task_priority: Optional[int] = self.agent_queues.discard(task)
task_priority: Optional[TaskPriority] = self.agent_queues.discard(task)

And similarly for the method's signature ?

queued.remove(resource)
self._waiting[resource] = BlockedDeploy(
# FIXME[#8015]: default priority
task=PrioritizedTask(task=task, priority=priority if priority is not None else 0),
task=PrioritizedTask(
task=task, priority=TaskPriority(task_priority) if task_priority is not None else priority
),
# task was previously ready to execute => assume no other blockers than this one
blocked_on=new_blockers,
)
Expand All @@ -391,17 +407,23 @@ def extend_blocked_on(resource: ResourceIdStr, new_blockers: set[ResourceIdStr])

# ensure desired resource deploys are scheduled
for resource in resources:
prioritized_task = PrioritizedTask(task=tasks.Deploy(resource=resource), priority=priority)
if is_scheduled(resource):
# Deploy is already scheduled / running. No need to do anything here. If any of its dependencies are to be
# Deploy is already scheduled / running. Check to see if this task has a higher priority than the one already
# scheduled. If it has, update the priority. If any of its dependencies are to be
# scheduled as well, they will follow the provides relation to ensure this deploy waits its turn.
if prioritized_task.task in self.agent_queues:
self.agent_queues.queue_put_nowait(prioritized_task)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self.agent_queues.queue_put_nowait(prioritized_task)
# simply add it again, the queue will make sure only the highest priority is kept
self.agent_queues.queue_put_nowait(prioritized_task)

if resource in self._waiting:
if self._waiting[resource].task.priority > priority:
self._waiting[resource].task = prioritized_task
continue
# task is not yet scheduled, schedule it now
blocked_on: set[ResourceIdStr] = {
dependency for dependency in self.requires.get(resource, ()) if is_scheduled(dependency)
}
self._waiting[resource] = BlockedDeploy(
# FIXME[#8015]: priority
task=PrioritizedTask(task=tasks.Deploy(resource=resource), priority=0),
task=prioritized_task,
blocked_on=blocked_on,
)
not_scheduled.discard(resource)
Expand Down
115 changes: 112 additions & 3 deletions tests/agent_server/deploy/test_scheduler_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
from inmanta.agent.executor import ResourceDetails, ResourceInstallSpec
from inmanta.config import Config
from inmanta.data import ResourceIdStr
from inmanta.deploy import state
from inmanta.deploy import state, tasks
from inmanta.deploy.work import TaskPriority
from inmanta.protocol.common import custom_json_encoder
from inmanta.util import retry_limited

Expand Down Expand Up @@ -167,7 +168,7 @@ async def build_resource_mappings_from_db(version: int | None) -> Mapping[Resour

@pytest.fixture
def environment() -> uuid.UUID:
    """Return a fixed environment id, as a uuid.UUID to match the annotation."""
    return uuid.UUID("83d604a0-691a-11ef-ae04-c8f750463317")


@pytest.fixture
Expand Down Expand Up @@ -396,7 +397,7 @@ async def test_get_facts(agent: TestAgent, make_resource_minimal):
ResourceIdStr(rid2): make_resource_minimal(rid2, {"value": "a"}, [rid1]),
}

await agent.scheduler._new_version(5, resources, make_requires(resources))
await agent.scheduler._new_version(1, resources, make_requires(resources))

await agent.scheduler.get_facts({"id": rid1})

Expand All @@ -409,3 +410,111 @@ async def done():
await retry_limited(done, 5)

assert agent.executor_manager.executors["agent1"].facts_count == 1


async def test_scheduler_priority(agent: TestAgent, environment, make_resource_minimal):
    """
    Ensure that the tasks are placed in the queue in the correct order
    and that existing tasks in the queue are replaced if a task that
    does the same thing with higher priority is added to the queue.
    """

    rid1 = ResourceIdStr("test::Resource[agent1,name=1]")
    resources = {
        rid1: make_resource_minimal(rid1, values={"value": "a"}, requires=[]),
    }

    executor1: ManagedExecutor = ManagedExecutor()
    agent.executor_manager.register_managed_executor("agent1", executor1)

    # We add two different tasks and assert that they are consumed in the correct order
    # Add a new version deploy to the queue
    await agent.scheduler._new_version(1, resources, make_requires(resources))
    agent.scheduler.mock_versions[1] = resources

    # And then a dryrun
    dryrun = uuid.uuid4()
    await agent.scheduler.dryrun(dryrun, 1)

    # The tasks are consumed in the priority order
    first_task = await agent.scheduler._work.agent_queues.queue_get("agent1")
    assert isinstance(first_task, tasks.Deploy)
    second_task = await agent.scheduler._work.agent_queues.queue_get("agent1")
    assert isinstance(second_task, tasks.DryRun)

    # The same is true if a task with lesser priority is added first
    # Add a fact refresh task to the queue
    await agent.scheduler.get_facts({"id": rid1})

    # Then add an interval deploy task to the queue
    agent.scheduler._state.dirty.add(rid1)
    await agent.scheduler.deploy(TaskPriority.INTERVAL_DEPLOY)

    # The tasks are consumed in the priority order
    first_task = await agent.scheduler._work.agent_queues.queue_get("agent1")
    assert isinstance(first_task, tasks.Deploy)
    second_task = await agent.scheduler._work.agent_queues.queue_get("agent1")
    assert isinstance(second_task, tasks.RefreshFact)
    # Assert that all tasks were consumed
    queue = agent.scheduler._work.agent_queues._get_queue("agent1")._queue
    assert len(queue) == 0

    # Add an interval deploy task to the queue
    agent.scheduler._state.dirty.add(rid1)
    await agent.scheduler.deploy(TaskPriority.INTERVAL_DEPLOY)

    # Add a dryrun to the queue (which has more priority)
    dryrun = uuid.uuid4()
    await agent.scheduler.dryrun(dryrun, 1)

    # Assert that we have both tasks in the queue
    queue = agent.scheduler._work.agent_queues._get_queue("agent1")._queue
    assert len(queue) == 2

    # Add a user deploy
    # It has more priority than interval deploy, so it will replace it in the queue
    # It also has more priority than dryrun, so it will be consumed first
    await agent.trigger_update(environment, "$__scheduler", incremental_deploy=True)

    first_task = await agent.scheduler._work.agent_queues.queue_get("agent1")
    assert isinstance(first_task, tasks.Deploy)
    second_task = await agent.scheduler._work.agent_queues.queue_get("agent1")
    assert isinstance(second_task, tasks.DryRun)

    # Interval deploy is still in the queue but marked as deleted
    queue = agent.scheduler._work.agent_queues._get_queue("agent1")._queue
    assert len(queue) == 1
    assert queue[0].deleted

    # Force clean queue
    agent.scheduler._work.agent_queues._get_queue("agent1")._queue = []

    # If a task to deploy a resource is added to the queue,
    # but a task to deploy that same resource is already present with higher priority,
    # it will be ignored and not added to the queue

    # Add a dryrun to the queue
    dryrun = uuid.uuid4()
    await agent.scheduler.dryrun(dryrun, 1)

    # Add a user deploy
    await agent.trigger_update(environment, "$__scheduler", incremental_deploy=True)

    # Try to add an interval deploy task to the queue
    agent.scheduler._state.dirty.add(rid1)
    await agent.scheduler.deploy(TaskPriority.INTERVAL_DEPLOY)

    # Assert that we still have only 2 tasks in the queue
    queue = agent.scheduler._work.agent_queues._get_queue("agent1")._queue
    assert len(queue) == 2

    # The order is unaffected, the interval deploy was essentially ignored
    first_task = await agent.scheduler._work.agent_queues.queue_get("agent1")
    assert isinstance(first_task, tasks.Deploy)
    second_task = await agent.scheduler._work.agent_queues.queue_get("agent1")
    assert isinstance(second_task, tasks.DryRun)

    # All tasks were consumed
    queue = agent.scheduler._work.agent_queues._get_queue("agent1")._queue
    assert len(queue) == 0