Skip to content

Commit

Permalink
planner: introduce SPOT VMs policy
Browse files Browse the repository at this point in the history
  • Loading branch information
csegarragonz committed May 2, 2024
1 parent 35af765 commit 0c454a9
Show file tree
Hide file tree
Showing 26 changed files with 1,603 additions and 56 deletions.
4 changes: 2 additions & 2 deletions .env
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FAABRIC_VERSION=0.18.0
FAABRIC_CLI_IMAGE=faasm.azurecr.io/faabric:0.18.0
FAABRIC_VERSION=0.19.0
FAABRIC_CLI_IMAGE=faasm.azurecr.io/faabric:0.19.0
COMPOSE_PROJECT_NAME=faabric-dev
CONAN_CACHE_MOUNT_SOURCE=./conan-cache/
12 changes: 6 additions & 6 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
env:
DEPLOYMENT_TYPE: gha-ci
steps:
Expand All @@ -34,7 +34,7 @@ jobs:
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
steps:
- name: "Check out code"
uses: actions/checkout@v4
Expand All @@ -45,7 +45,7 @@ jobs:
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
steps:
- name: "Check out code"
uses: actions/checkout@v4
Expand All @@ -65,7 +65,7 @@ jobs:
REDIS_QUEUE_HOST: redis
REDIS_STATE_HOST: redis
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
options: --privileged
services:
redis:
Expand Down Expand Up @@ -104,7 +104,7 @@ jobs:
REDIS_QUEUE_HOST: redis
REDIS_STATE_HOST: redis
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
options: --privileged
services:
redis:
Expand Down Expand Up @@ -156,7 +156,7 @@ jobs:
REDIS_QUEUE_HOST: redis
REDIS_STATE_HOST: redis
container:
image: faasm.azurecr.io/faabric:0.18.0
image: faasm.azurecr.io/faabric:0.19.0
services:
redis:
image: redis
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.18.0
0.19.0
5 changes: 5 additions & 0 deletions include/faabric/batch-scheduler/BatchScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
#define NOT_ENOUGH_SLOTS_DECISION \
faabric::batch_scheduler::SchedulingDecision(NOT_ENOUGH_SLOTS, \
NOT_ENOUGH_SLOTS)
// Sentinel integer used to build MUST_FREEZE_DECISION below
#define MUST_FREEZE -97
// SchedulingDecision constructed with MUST_FREEZE for both of its arguments.
// NOTE(review): presumably returned by a batch scheduler to signal that the
// request must be frozen rather than scheduled; confirm against the callers
#define MUST_FREEZE_DECISION \
faabric::batch_scheduler::SchedulingDecision(MUST_FREEZE, MUST_FREEZE)

// NOTE(review): looks like a placeholder IP string (not a valid address) used
// to mark hosts for eviction; confirm against the SPOT scheduler
#define MUST_EVICT_IP "E.VI.CT.ME"

namespace faabric::batch_scheduler {

Expand Down
33 changes: 33 additions & 0 deletions include/faabric/batch-scheduler/SpotScheduler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include <faabric/batch-scheduler/BatchScheduler.h>
#include <faabric/batch-scheduler/SchedulingDecision.h>
#include <faabric/util/batch.h>

namespace faabric::batch_scheduler {

// This batch scheduler behaves in the same way as BinPack for NEW and
// SCALE_CHANGE requests, but for DIST_CHANGE it considers whether any of the
// hosts in the Host Map have been tainted with the eviction mark. In that
// case it first tries to migrate them to other running hosts and, if not
// enough hosts are available, freezes the messages.
class SpotScheduler final : public BatchScheduler
{
public:
// Entry point overriding the BatchScheduler method: returns a scheduling
// decision for the given request, based on the host map and the requests
// currently in flight. The host map is taken by non-const reference, so the
// implementation is allowed to modify it.
std::shared_ptr<SchedulingDecision> makeSchedulingDecision(
HostMap& hostMap,
const InFlightReqs& inFlightReqs,
std::shared_ptr<faabric::BatchExecuteRequest> req) override;

private:
// Overrides the BatchScheduler hook that compares two candidate decisions.
// NOTE(review): presumably returns true when decisionA is preferable to
// decisionB; confirm the criterion in SpotScheduler.cpp
bool isFirstDecisionBetter(
std::shared_ptr<SchedulingDecision> decisionA,
std::shared_ptr<SchedulingDecision> decisionB) override;

// Overrides the BatchScheduler hook that returns the hosts as a vector for
// the given request and decision type.
// NOTE(review): the sort order is policy-specific and not visible from this
// header; confirm in SpotScheduler.cpp
std::vector<Host> getSortedHosts(
HostMap& hostMap,
const InFlightReqs& inFlightReqs,
std::shared_ptr<faabric::BatchExecuteRequest> req,
const DecisionType& decisionType) override;
};
}
13 changes: 13 additions & 0 deletions include/faabric/planner/Planner.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ class Planner

void printConfig() const;

std::string getPolicy();

void setPolicy(const std::string& newPolicy);

// ----------
Expand Down Expand Up @@ -87,10 +89,21 @@ class Planner
// the planner was last reset
int getNumMigrations();

// Helper method to get the set of host IPs that will be evicted next
std::set<std::string> getNextEvictedHostIps();

std::map<int32_t, std::shared_ptr<BatchExecuteRequest>> getEvictedReqs();

// Main entrypoint to request the execution of batches
std::shared_ptr<faabric::batch_scheduler::SchedulingDecision> callBatch(
std::shared_ptr<BatchExecuteRequest> req);

// ----------
// API exclusive to SPOT policy mode
// ----------

void setNextEvictedVm(const std::set<std::string>& vmIp);

private:
// There's a singleton instance of the planner running, but it must allow
// concurrent requests
Expand Down
18 changes: 18 additions & 0 deletions include/faabric/planner/PlannerState.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ namespace faabric::planner {
*/
struct PlannerState
{
// Policy the planner operates under. It mostly determines the batch
// scheduler's behaviour, but in some cases also the planner's
std::string policy;

// Accounting of the hosts that are registered in the system and responsive
// We deliberately use the host's IP as unique key, but assign a unique host
// id for redundancy
Expand All @@ -36,5 +40,19 @@ struct PlannerState

// Helper counter of the total number of migrations
std::atomic<int> numMigrations = 0;

// -----
// Data structures used only under the SPOT policy
// -----

// Map containing the BER that have been evicted due to a SPOT VM eviction.
// All messages in the VM have been checkpointed, are in the snapshot
// registry in the planner, and are ready to be scheduled when capacity
// appears
std::map<int, std::shared_ptr<BatchExecuteRequest>> evictedRequests;

// This variable simulates the values we would get from a cloud provider's
// API indicating the (set of) VM to be evicted next
std::set<std::string> nextEvictedHostIps;
};
}
9 changes: 9 additions & 0 deletions include/faabric/util/func.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,18 @@
#include <vector>

#define MIGRATED_FUNCTION_RETURN_VALUE -99
#define FROZEN_FUNCTION_RETURN_VALUE -98

namespace faabric::util {

// Exception type that carries a message and forwards it to FaabricException.
// NOTE(review): presumably thrown when a function is frozen (see
// FROZEN_FUNCTION_RETURN_VALUE); confirm against the throw sites
class FunctionFrozenException : public faabric::util::FaabricException
{
public:
// Takes the message by value and moves it into the base class; explicit so
// that strings are not implicitly converted into this exception type
explicit FunctionFrozenException(std::string message)
: FaabricException(std::move(message))
{}
};

class FunctionMigratedException : public faabric::util::FaabricException
{
public:
Expand Down
3 changes: 3 additions & 0 deletions src/batch-scheduler/BatchScheduler.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <faabric/batch-scheduler/BatchScheduler.h>
#include <faabric/batch-scheduler/BinPackScheduler.h>
#include <faabric/batch-scheduler/CompactScheduler.h>
#include <faabric/batch-scheduler/SpotScheduler.h>
#include <faabric/util/config.h>
#include <faabric/util/logging.h>

Expand All @@ -23,6 +24,8 @@ std::shared_ptr<BatchScheduler> getBatchScheduler()
batchScheduler = std::make_shared<BinPackScheduler>();
} else if (mode == "compact") {
batchScheduler = std::make_shared<CompactScheduler>();
} else if (mode == "spot") {
batchScheduler = std::make_shared<SpotScheduler>();
} else {
SPDLOG_ERROR("Unrecognised batch scheduler mode: {}", mode);
throw std::runtime_error("Unrecognised batch scheduler mode");
Expand Down
1 change: 1 addition & 0 deletions src/batch-scheduler/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ faabric_lib(batch_scheduler
BatchScheduler.cpp
BinPackScheduler.cpp
CompactScheduler.cpp
SpotScheduler.cpp
)

target_link_libraries(batch_scheduler PRIVATE
Expand Down
8 changes: 4 additions & 4 deletions src/batch-scheduler/CompactScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ bool CompactScheduler::isFirstDecisionBetter(
throw std::runtime_error("Method not supported for COMPACT scheduler");
}

HostMap deepCopyHostMap(const HostMap& hostMap)
static HostMap deepCopyHostMap(const HostMap& hostMap)
{
HostMap newHostMap;

Expand Down Expand Up @@ -173,9 +173,9 @@ bool CompactScheduler::isFirstDecisionBetter(

// Filter-out from the host map all nodes that are executing requests from a
// different user
void filterHosts(HostMap& hostMap,
const InFlightReqs& inFlightReqs,
std::shared_ptr<faabric::BatchExecuteRequest> req)
static void filterHosts(HostMap& hostMap,
const InFlightReqs& inFlightReqs,
std::shared_ptr<faabric::BatchExecuteRequest> req)
{
// We temporarily use the request subtype field to attach a user id for our
// multi-tenant simulations
Expand Down
Loading

0 comments on commit 0c454a9

Please sign in to comment.