From 8e523538f38596d3ff798771559764cdf0d9edd0 Mon Sep 17 00:00:00 2001 From: Albin Severinson Date: Wed, 28 Feb 2024 11:53:18 +0000 Subject: [PATCH] Updated failure estimator (#3427) * Updated failure estimator * Added simulator script * Cleanup simulator script * Update SortFunc * Update SortFunc * Update SortFunc * Update SortFunc * Update SortFunc * Update SortFunc * Update SortFunc * Update SortFunc * Update SortFunc * Comments and cleanup --- config/scheduler/config.yaml | 12 +- go.mod | 17 +- go.sum | 34 +- internal/armada/configuration/types.go | 12 +- internal/common/interfaces/interfaces.go | 11 +- internal/common/linalg/linalg.go | 19 + internal/common/linalg/lingalg_test.go | 45 +++ .../common/optimisation/descent/descent.go | 44 +++ .../optimisation/descent/descent_test.go | 44 +++ .../common/optimisation/nesterov/nesterov.go | 60 +++ .../optimisation/nesterov/nesterov_test.go | 83 +++++ internal/common/optimisation/optimisation.go | 11 + internal/common/slices/slices.go | 21 ++ internal/common/slices/slices_test.go | 15 + internal/scheduler/context/context.go | 10 +- .../database/executor_repository_test.go | 10 +- .../scheduler/database/job_repository_test.go | 20 +- .../database/queue_repository_test.go | 10 +- .../redis_executor_repository_test.go | 10 +- .../failureestimator/failureestimator.go | 294 +++++++-------- .../failureestimator/failureestimator_test.go | 229 +++--------- .../scheduler/failureestimator/simulator.jl | 352 ++++++++++++++++++ internal/scheduler/jobdb/jobdb_test.go | 20 +- internal/scheduler/jobiteration.go | 4 +- .../scheduler/nodedb/nodeiteration_test.go | 10 +- .../preempting_queue_scheduler_test.go | 10 +- internal/scheduler/scheduler.go | 6 +- internal/scheduler/schedulerapp.go | 16 +- .../scheduler/schedulerobjects/nodetype.go | 10 +- .../scheduler/schedulerobjects/podutils.go | 36 +- internal/scheduler/simulator/simulator.go | 14 +- 31 files changed, 1069 insertions(+), 420 deletions(-) create mode 100644 internal/common/linalg/linalg.go create mode 100644 internal/common/linalg/lingalg_test.go create mode 100644 internal/common/optimisation/descent/descent.go create mode 100644 internal/common/optimisation/descent/descent_test.go create mode 100644 internal/common/optimisation/nesterov/nesterov.go create mode 100644 internal/common/optimisation/nesterov/nesterov_test.go create mode 100644 internal/common/optimisation/optimisation.go create mode 100644 internal/scheduler/failureestimator/simulator.jl diff --git a/config/scheduler/config.yaml b/config/scheduler/config.yaml index 7f29baa6db9..739127ce978 100644 --- a/config/scheduler/config.yaml +++ b/config/scheduler/config.yaml @@ -140,10 +140,8 @@ scheduling: gangIdAnnotation: armadaproject.io/gangId gangCardinalityAnnotation: armadaproject.io/gangCardinality failureEstimatorConfig: - nodeSuccessProbabilityCordonThreshold: 0.1 - queueSuccessProbabilityCordonThreshold: 0.05 - nodeCordonTimeout: "10m" - queueCordonTimeout: "1m" - nodeEquilibriumFailureRate: 0.0167 # 1 per minute. - queueEquilibriumFailureRate: 0.0167 # 1 per minute. - + # Optimised default parameters. 
+ numInnerIterations: 10 + innerOptimiserStepSize: 0.05 + outerOptimiserStepSize: 0.05 + outerOptimiserNesterovAcceleration: 0.2 diff --git a/go.mod b/go.mod index 02768131905..a896a0a5309 100644 --- a/go.mod +++ b/go.mod @@ -49,11 +49,11 @@ require ( github.com/spf13/viper v1.15.0 github.com/stretchr/testify v1.8.4 github.com/weaveworks/promrus v1.2.0 - golang.org/x/exp v0.0.0-20221031165847-c99f073a8326 - golang.org/x/net v0.20.0 + golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 + golang.org/x/net v0.21.0 golang.org/x/oauth2 v0.16.0 - golang.org/x/sync v0.5.0 - golang.org/x/tools v0.6.0 // indirect + golang.org/x/sync v0.6.0 + golang.org/x/tools v0.18.0 // indirect google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 // indirect google.golang.org/grpc v1.57.1 gopkg.in/yaml.v2 v2.4.0 @@ -89,6 +89,7 @@ require ( github.com/segmentio/fasthash v1.0.3 github.com/xitongsys/parquet-go v1.6.2 golang.org/x/time v0.3.0 + gonum.org/v1/gonum v0.14.0 google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9 gopkg.in/yaml.v3 v3.0.1 ) @@ -184,10 +185,10 @@ require ( github.com/xitongsys/parquet-go-source v0.0.0-20200817004010-026bad9b25d0 // indirect go.mongodb.org/mongo-driver v1.13.1 // indirect go.uber.org/atomic v1.9.0 // indirect - golang.org/x/crypto v0.18.0 // indirect - golang.org/x/mod v0.9.0 // indirect - golang.org/x/sys v0.16.0 // indirect - golang.org/x/term v0.16.0 // indirect + golang.org/x/crypto v0.19.0 // indirect + golang.org/x/mod v0.15.0 // indirect + golang.org/x/sys v0.17.0 // indirect + golang.org/x/term v0.17.0 // indirect golang.org/x/text v0.14.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect diff --git a/go.sum b/go.sum index 564da9d52ab..6b0692e99e5 100644 --- a/go.sum +++ b/go.sum @@ -857,8 +857,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= -golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= +golang.org/x/crypto v0.19.0 h1:ENy+Az/9Y1vSrlrvBSyna3PITt4tiZLf7sgCjZBX7Wo= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -869,8 +869,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20221031165847-c99f073a8326 h1:QfTh0HpN6hlw6D3vu8DAwC8pBIwikq0AI1evdm+FksE= -golang.org/x/exp v0.0.0-20221031165847-c99f073a8326/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= 
+golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= +golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -897,8 +897,8 @@ golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.9.0 h1:KENHtAZL2y3NLMYZeHY9DW8HW8V+kQyJsY/V9JlKvCs= -golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0 h1:SernR4v+D55NyBH2QiEQrlBAnj1ECL6AGrA5+dPaMY8= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -947,8 +947,8 @@ golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qx golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= -golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= +golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -974,8 +974,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= -golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180821044426-4ea2f632f6e9/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -1049,14 +1049,14 @@ golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= -golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= -golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= +golang.org/x/term v0.17.0 h1:mkTF7LCd6WGJNL3K1Ad7kwxNfYAW6a8a8QqtMblp/4U= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.0.0-20180810153555-6e3c4e7365dd/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1138,8 +1138,8 @@ golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= -golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.18.0 h1:k8NLag8AGHnn+PHbl7g43CtqZAwG60vZkLqgyZgIHgQ= +golang.org/x/tools v0.18.0/go.mod h1:GL7B4CwcLLeo59yx/9UWWuNOW1n3VZ4f5axWfML7Lcg= golang.org/x/xerrors v0.0.0-20190410155217-1f06c39b4373/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190513163551-3ee3066db522/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -1148,6 +1148,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= +gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0= +gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api 
v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= diff --git a/internal/armada/configuration/types.go b/internal/armada/configuration/types.go index 3ac46696dd7..4cbcdb5f257 100644 --- a/internal/armada/configuration/types.go +++ b/internal/armada/configuration/types.go @@ -295,13 +295,11 @@ type PreemptionConfig struct { // FailureEstimatorConfig contains config controlling node and queue success probability estimation. // See the internal/scheduler/failureestimator package for details. type FailureEstimatorConfig struct { - Disabled bool - NodeSuccessProbabilityCordonThreshold float64 - QueueSuccessProbabilityCordonThreshold float64 - NodeCordonTimeout time.Duration - QueueCordonTimeout time.Duration - NodeEquilibriumFailureRate float64 - QueueEquilibriumFailureRate float64 + Disabled bool + NumInnerIterations int `validate:"gt=0"` + InnerOptimiserStepSize float64 `validate:"gt=0"` + OuterOptimiserStepSize float64 `validate:"gt=0"` + OuterOptimiserNesterovAcceleration float64 `validate:"gte=0"` } // TODO: we can probably just typedef this to map[string]string diff --git a/internal/common/interfaces/interfaces.go b/internal/common/interfaces/interfaces.go index 91abe495611..3275e3622bd 100644 --- a/internal/common/interfaces/interfaces.go +++ b/internal/common/interfaces/interfaces.go @@ -1,13 +1,20 @@ package interfaces -// DeepCopier expresses that the object can be deep-copied. +import "golang.org/x/exp/constraints" + +// DeepCopier represents object that can be deep-copied. type DeepCopier[T any] interface { // DeepCopy returns a deep copy of the object. DeepCopy() T } -// Equaler expresses that objects can be compared for equality via the Equals method. +// Equaler represents objects can be compared for equality via the Equals method. type Equaler[T any] interface { // Returns true if both objects are equal. Equal(T) bool } + +// Number represents any integer or floating-point number. +type Number interface { + constraints.Integer | constraints.Float +} diff --git a/internal/common/linalg/linalg.go b/internal/common/linalg/linalg.go new file mode 100644 index 00000000000..c807ca651d0 --- /dev/null +++ b/internal/common/linalg/linalg.go @@ -0,0 +1,19 @@ +package linalg + +import "gonum.org/v1/gonum/mat" + +// ExtendVecDense extends the length of vec in-place to be at least n. +func ExtendVecDense(vec *mat.VecDense, n int) *mat.VecDense { + if vec == nil { + return mat.NewVecDense(n, make([]float64, n)) + } + rawVec := vec.RawVector() + d := n - rawVec.N + if d <= 0 { + return vec + } + rawVec.Data = append(rawVec.Data, make([]float64, d)...) 
+ rawVec.N = n + vec.SetRawVector(rawVec) + return vec +} diff --git a/internal/common/linalg/lingalg_test.go b/internal/common/linalg/lingalg_test.go new file mode 100644 index 00000000000..200d9c0aacf --- /dev/null +++ b/internal/common/linalg/lingalg_test.go @@ -0,0 +1,45 @@ +package linalg + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "gonum.org/v1/gonum/mat" + + armadaslices "github.com/armadaproject/armada/internal/common/slices" +) + +func TestExtendVecDense(t *testing.T) { + tests := map[string]struct { + vec *mat.VecDense + n int + expected *mat.VecDense + }{ + "nil vec": { + vec: nil, + n: 3, + expected: mat.NewVecDense(3, armadaslices.Zeros[float64](3)), + }, + "extend": { + vec: mat.NewVecDense(1, armadaslices.Zeros[float64](1)), + n: 3, + expected: mat.NewVecDense(3, armadaslices.Zeros[float64](3)), + }, + "extend unnecessary due to greater length": { + vec: mat.NewVecDense(3, armadaslices.Zeros[float64](3)), + n: 1, + expected: mat.NewVecDense(3, armadaslices.Zeros[float64](3)), + }, + "extend unnecessary due to equal length": { + vec: mat.NewVecDense(3, armadaslices.Zeros[float64](3)), + n: 3, + expected: mat.NewVecDense(3, armadaslices.Zeros[float64](3)), + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + actual := ExtendVecDense(tc.vec, tc.n) + assert.Equal(t, tc.expected, actual) + }) + } +} diff --git a/internal/common/optimisation/descent/descent.go b/internal/common/optimisation/descent/descent.go new file mode 100644 index 00000000000..c57051084ac --- /dev/null +++ b/internal/common/optimisation/descent/descent.go @@ -0,0 +1,44 @@ +package descent + +import ( + "fmt" + + "github.com/pkg/errors" + "gonum.org/v1/gonum/mat" + + "github.com/armadaproject/armada/internal/common/armadaerrors" +) + +// Gradient descent optimiser; see the following link for details: +// https://fluxml.ai/Flux.jl/stable/training/optimisers/ +type Descent struct { + eta float64 +} + +func New(eta float64) (*Descent, error) { + if eta < 0 { + return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ + Name: "eta", + Value: eta, + Message: fmt.Sprintf("outside allowed range [0, Inf)"), + }) + } + return &Descent{eta: eta}, nil +} + +func MustNew(eta float64) *Descent { + opt, err := New(eta) + if err != nil { + panic(err) + } + return opt +} + +func (o *Descent) Update(out, p *mat.VecDense, g mat.Vector) *mat.VecDense { + out.AddScaledVec(p, -o.eta, g) + return p +} + +func (o *Descent) Extend(_ int) { + return +} diff --git a/internal/common/optimisation/descent/descent_test.go b/internal/common/optimisation/descent/descent_test.go new file mode 100644 index 00000000000..83571b7ea46 --- /dev/null +++ b/internal/common/optimisation/descent/descent_test.go @@ -0,0 +1,44 @@ +package descent + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "gonum.org/v1/gonum/mat" + + armadaslices "github.com/armadaproject/armada/internal/common/slices" +) + +func TestDescent(t *testing.T) { + tests := map[string]struct { + eta float64 + p *mat.VecDense + g *mat.VecDense + expected *mat.VecDense + }{ + "eta is zero": { + eta: 0.0, + p: mat.NewVecDense(2, armadaslices.Ones[float64](2)), + g: mat.NewVecDense(2, armadaslices.Ones[float64](2)), + expected: mat.NewVecDense(2, armadaslices.Ones[float64](2)), + }, + "eta is non-zero": { + eta: 2.0, + p: mat.NewVecDense(2, armadaslices.Zeros[float64](2)), + g: mat.NewVecDense(2, armadaslices.Ones[float64](2)), + expected: func() *mat.VecDense { + rv := mat.NewVecDense(2, armadaslices.Ones[float64](2)) + 
rv.ScaleVec(-2, rv) + return rv + }(), + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + opt := MustNew(tc.eta) + rv := opt.Update(tc.p, tc.p, tc.g) + assert.Equal(t, tc.p, rv) + assert.Equal(t, tc.expected, tc.p) + }) + } +} diff --git a/internal/common/optimisation/nesterov/nesterov.go b/internal/common/optimisation/nesterov/nesterov.go new file mode 100644 index 00000000000..050f91f5e7f --- /dev/null +++ b/internal/common/optimisation/nesterov/nesterov.go @@ -0,0 +1,60 @@ +package nesterov + +import ( + "fmt" + "math" + + "github.com/pkg/errors" + "gonum.org/v1/gonum/mat" + + "github.com/armadaproject/armada/internal/common/armadaerrors" + "github.com/armadaproject/armada/internal/common/linalg" +) + +// Nesterov accelerated gradient descent optimiser; see the following link for details: +// https://fluxml.ai/Flux.jl/stable/training/optimisers/ +type Nesterov struct { + eta float64 + rho float64 + vel *mat.VecDense +} + +func New(eta, rho float64) (*Nesterov, error) { + if eta < 0 { + return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ + Name: "eta", + Value: eta, + Message: fmt.Sprintf("outside allowed range [0, Inf)"), + }) + } + if rho < 0 || rho >= 1 { + return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ + Name: "rho", + Value: rho, + Message: fmt.Sprintf("outside allowed range [0, 1)"), + }) + } + return &Nesterov{eta: eta, rho: rho}, nil +} + +func MustNew(eta, rho float64) *Nesterov { + opt, err := New(eta, rho) + if err != nil { + panic(err) + } + return opt +} + +func (o *Nesterov) Update(out, p *mat.VecDense, g mat.Vector) *mat.VecDense { + out.CopyVec(p) + out.AddScaledVec(out, math.Pow(o.rho, 2), o.vel) + out.AddScaledVec(out, -(1+o.rho)*o.eta, g) + + o.vel.ScaleVec(o.rho, o.vel) + o.vel.AddScaledVec(o.vel, -o.eta, g) + return p +} + +func (o *Nesterov) Extend(n int) { + o.vel = linalg.ExtendVecDense(o.vel, n) +} diff --git a/internal/common/optimisation/nesterov/nesterov_test.go b/internal/common/optimisation/nesterov/nesterov_test.go new file mode 100644 index 00000000000..24df99f2120 --- /dev/null +++ b/internal/common/optimisation/nesterov/nesterov_test.go @@ -0,0 +1,83 @@ +package nesterov + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "gonum.org/v1/gonum/mat" + + armadaslices "github.com/armadaproject/armada/internal/common/slices" +) + +func TestNesterov(t *testing.T) { + tests := map[string]struct { + eta float64 + rho float64 + p0 *mat.VecDense + gs []*mat.VecDense + expecteds []*mat.VecDense + }{ + "eta is zero": { + eta: 0.0, + rho: 0.9, + p0: mat.NewVecDense(2, armadaslices.Ones[float64](2)), + gs: []*mat.VecDense{ + mat.NewVecDense(2, armadaslices.Ones[float64](2)), + mat.NewVecDense(2, armadaslices.Ones[float64](2)), + }, + expecteds: []*mat.VecDense{ + mat.NewVecDense(2, armadaslices.Ones[float64](2)), + mat.NewVecDense(2, armadaslices.Ones[float64](2)), + }, + }, + "rho is zero": { + eta: 2.0, + rho: 0.0, + p0: mat.NewVecDense(2, armadaslices.Zeros[float64](2)), + gs: []*mat.VecDense{ + mat.NewVecDense(2, armadaslices.Ones[float64](2)), + mat.NewVecDense(2, armadaslices.Ones[float64](2)), + }, + expecteds: func() []*mat.VecDense { + rv := make([]*mat.VecDense, 2) + rv[0] = mat.NewVecDense(2, armadaslices.Ones[float64](2)) + rv[0].ScaleVec(-2, rv[0]) + rv[1] = mat.NewVecDense(2, armadaslices.Ones[float64](2)) + rv[1].ScaleVec(-4, rv[1]) + return rv + }(), + }, + "eta and rho non-zero": { + eta: 2.0, + rho: 0.5, + p0: mat.NewVecDense(2, armadaslices.Zeros[float64](2)), + gs: []*mat.VecDense{ 
+ mat.NewVecDense(2, armadaslices.Ones[float64](2)), + mat.NewVecDense(2, armadaslices.Ones[float64](2)), + mat.NewVecDense(2, armadaslices.Ones[float64](2)), + }, + expecteds: func() []*mat.VecDense { + rv := make([]*mat.VecDense, 3) + rv[0] = mat.NewVecDense(2, armadaslices.Ones[float64](2)) + rv[0].ScaleVec(-3, rv[0]) + rv[1] = mat.NewVecDense(2, armadaslices.Ones[float64](2)) + rv[1].ScaleVec(-6.5, rv[1]) + rv[2] = mat.NewVecDense(2, armadaslices.Ones[float64](2)) + rv[2].ScaleVec(-10.25, rv[2]) + return rv + }(), + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + opt := MustNew(tc.eta, tc.rho) + p := tc.p0 + for i, g := range tc.gs { + opt.Extend(g.Len()) + rv := opt.Update(p, p, g) + assert.Equal(t, p, rv) + assert.Equal(t, tc.expecteds[i], p) + } + }) + } +} diff --git a/internal/common/optimisation/optimisation.go b/internal/common/optimisation/optimisation.go new file mode 100644 index 00000000000..4bad4826f72 --- /dev/null +++ b/internal/common/optimisation/optimisation.go @@ -0,0 +1,11 @@ +package optimisation + +import "gonum.org/v1/gonum/mat" + +// Optimiser represents a first-order optimisation algorithm. +type Optimiser interface { + // Update the parameters using gradient and store the result in out. + Update(out, parameters *mat.VecDense, gradient mat.Vector) *mat.VecDense + // Extend the internal state of the optimiser to accommodate at least n parameters. + Extend(n int) +} diff --git a/internal/common/slices/slices.go b/internal/common/slices/slices.go index 2cfaf7c5710..9063bf85053 100644 --- a/internal/common/slices/slices.go +++ b/internal/common/slices/slices.go @@ -6,6 +6,8 @@ import ( "math/rand" goslices "golang.org/x/exp/slices" + + "github.com/armadaproject/armada/internal/common/interfaces" ) // PartitionToLen partitions the elements of s into non-overlapping slices, @@ -178,3 +180,22 @@ func AnyFunc[S ~[]T, T any](s S, predicate func(val T) bool) bool { } return false } + +// Zeros returns a slice T[] of length n with all elements equal to zero. +func Zeros[T any](n int) []T { + return make([]T, n) +} + +// Fill returns a slice T[] of length n with all elements equal to v. +func Fill[T any](v T, n int) []T { + rv := make([]T, n) + for i := range rv { + rv[i] = v + } + return rv +} + +// Ones returns a slice T[] of length n with all elements equal to 1. 
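For reference, the Update methods of the two optimisers added above (internal/common/optimisation/descent and internal/common/optimisation/nesterov) implement the following rules, restated here from the AddScaledVec calls in the code, with \(\eta\) the step size eta, \(\rho\) the momentum rho, \(g\) the gradient and \(v\) the velocity buffer:

\[ \text{Descent:} \quad p \leftarrow p - \eta\, g \]
\[ \text{Nesterov:} \quad p \leftarrow p + \rho^{2} v - (1+\rho)\,\eta\, g, \qquad v \leftarrow \rho\, v - \eta\, g \]

With eta = 2, rho = 0.5, p and v starting at zero and a constant gradient of ones, this gives -3, -6.5 and -10.25 over three steps, which is exactly what the "eta and rho non-zero" case in nesterov_test.go asserts.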
+func Ones[T interfaces.Number](n int) []T { + return Fill[T](1, n) +} diff --git a/internal/common/slices/slices_test.go b/internal/common/slices/slices_test.go index 4d57dbf7fac..c7bf81cc00c 100644 --- a/internal/common/slices/slices_test.go +++ b/internal/common/slices/slices_test.go @@ -326,3 +326,18 @@ func TestAny(t *testing.T) { AnyFunc([]int{1, 2, 3}, func(v int) bool { return v > 3 }), ) } + +func TestZeros(t *testing.T) { + assert.Equal(t, make([]int, 3), Zeros[int](3)) + assert.Equal(t, make([]string, 3), Zeros[string](3)) +} + +func TestOnes(t *testing.T) { + assert.Equal(t, []int{1, 1, 1}, Ones[int](3)) + assert.Equal(t, []float64{1, 1, 1}, Ones[float64](3)) +} + +func TestFill(t *testing.T) { + assert.Equal(t, []int{2, 2, 2}, Fill[int](2, 3)) + assert.Equal(t, []float64{0.5, 0.5, 0.5}, Fill[float64](0.5, 3)) +} diff --git a/internal/scheduler/context/context.go b/internal/scheduler/context/context.go index 571338f8963..1db201cb4e6 100644 --- a/internal/scheduler/context/context.go +++ b/internal/scheduler/context/context.go @@ -440,7 +440,15 @@ func (qctx *QueueSchedulingContext) ReportString(verbosity int32) string { }, ) reasons := maps.Keys(jobIdsByReason) - slices.SortFunc(reasons, func(a, b string) bool { return len(jobIdsByReason[a]) < len(jobIdsByReason[b]) }) + slices.SortFunc(reasons, func(a, b string) int { + if len(jobIdsByReason[a]) < len(jobIdsByReason[b]) { + return -1 + } else if len(jobIdsByReason[a]) > len(jobIdsByReason[b]) { + return 1 + } else { + return 0 + } + }) for i := len(reasons) - 1; i >= 0; i-- { reason := reasons[i] jobIds := jobIdsByReason[reason] diff --git a/internal/scheduler/database/executor_repository_test.go b/internal/scheduler/database/executor_repository_test.go index 76a0e14c9f9..b31bd67caef 100644 --- a/internal/scheduler/database/executor_repository_test.go +++ b/internal/scheduler/database/executor_repository_test.go @@ -61,8 +61,14 @@ func TestExecutorRepository_LoadAndSave(t *testing.T) { } retrievedExecutors, err := repo.GetExecutors(ctx) require.NoError(t, err) - executorSort := func(a *schedulerobjects.Executor, b *schedulerobjects.Executor) bool { - return a.Id > b.Id + executorSort := func(a *schedulerobjects.Executor, b *schedulerobjects.Executor) int { + if a.Id > b.Id { + return -1 + } else if a.Id < b.Id { + return 1 + } else { + return 0 + } } slices.SortFunc(retrievedExecutors, executorSort) slices.SortFunc(tc.executors, executorSort) diff --git a/internal/scheduler/database/job_repository_test.go b/internal/scheduler/database/job_repository_test.go index 9be871caa67..63d8b7ce335 100644 --- a/internal/scheduler/database/job_repository_test.go +++ b/internal/scheduler/database/job_repository_test.go @@ -367,7 +367,15 @@ func TestFindInactiveRuns(t *testing.T) { inactive, err := repo.FindInactiveRuns(ctx, tc.runsToCheck) require.NoError(t, err) - uuidSort := func(a uuid.UUID, b uuid.UUID) bool { return a.String() > b.String() } + uuidSort := func(a uuid.UUID, b uuid.UUID) int { + if a.String() > b.String() { + return -1 + } else if a.String() < b.String() { + return 1 + } else { + return 0 + } + } slices.SortFunc(inactive, uuidSort) slices.SortFunc(tc.expectedInactive, uuidSort) assert.Equal(t, tc.expectedInactive, inactive) @@ -518,7 +526,15 @@ func TestFetchJobRunLeases(t *testing.T) { leases, err := repo.FetchJobRunLeases(ctx, tc.executor, tc.maxRowsToFetch, tc.excludedRuns) require.NoError(t, err) - leaseSort := func(a *JobRunLease, b *JobRunLease) bool { return a.RunID.String() > b.RunID.String() } + leaseSort := 
func(a *JobRunLease, b *JobRunLease) int { + if a.RunID.String() > b.RunID.String() { + return -1 + } else if a.RunID.String() < b.RunID.String() { + return 1 + } else { + return 0 + } + } slices.SortFunc(leases, leaseSort) slices.SortFunc(tc.expectedLeases, leaseSort) assert.Equal(t, tc.expectedLeases, leases) diff --git a/internal/scheduler/database/queue_repository_test.go b/internal/scheduler/database/queue_repository_test.go index edd33be3d41..718d0904290 100644 --- a/internal/scheduler/database/queue_repository_test.go +++ b/internal/scheduler/database/queue_repository_test.go @@ -58,7 +58,15 @@ func TestLegacyQueueRepository_GetAllQueues(t *testing.T) { } retrievedQueues, err := repo.GetAllQueues() require.NoError(t, err) - sortFunc := func(a, b *Queue) bool { return a.Name > b.Name } + sortFunc := func(a, b *Queue) int { + if a.Name > b.Name { + return -1 + } else if a.Name > b.Name { + return 1 + } else { + return 0 + } + } slices.SortFunc(tc.expectedQueues, sortFunc) slices.SortFunc(retrievedQueues, sortFunc) assert.Equal(t, tc.expectedQueues, retrievedQueues) diff --git a/internal/scheduler/database/redis_executor_repository_test.go b/internal/scheduler/database/redis_executor_repository_test.go index bf5b0ea9629..be1eb42d9b6 100644 --- a/internal/scheduler/database/redis_executor_repository_test.go +++ b/internal/scheduler/database/redis_executor_repository_test.go @@ -61,8 +61,14 @@ func TestRedisExecutorRepository_LoadAndSave(t *testing.T) { } retrievedExecutors, err := repo.GetExecutors(ctx) require.NoError(t, err) - executorSort := func(a *schedulerobjects.Executor, b *schedulerobjects.Executor) bool { - return a.Id > b.Id + executorSort := func(a *schedulerobjects.Executor, b *schedulerobjects.Executor) int { + if a.Id > b.Id { + return -1 + } else if a.Id < b.Id { + return 1 + } else { + return 0 + } } slices.SortFunc(retrievedExecutors, executorSort) slices.SortFunc(tc.executors, executorSort) diff --git a/internal/scheduler/failureestimator/failureestimator.go b/internal/scheduler/failureestimator/failureestimator.go index 9e35f5158e0..c131ded8da8 100644 --- a/internal/scheduler/failureestimator/failureestimator.go +++ b/internal/scheduler/failureestimator/failureestimator.go @@ -4,12 +4,17 @@ import ( "fmt" "math" "sync" - "time" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" + "gonum.org/v1/gonum/mat" "github.com/armadaproject/armada/internal/common/armadaerrors" + "github.com/armadaproject/armada/internal/common/linalg" + armadamath "github.com/armadaproject/armada/internal/common/math" + "github.com/armadaproject/armada/internal/common/optimisation" + "github.com/armadaproject/armada/internal/common/slices" + armadaslices "github.com/armadaproject/armada/internal/common/slices" ) const ( @@ -18,8 +23,6 @@ const ( // Floating point tolerance. Also used when applying limits to avoid divide-by-zero issues. eps = 1e-15 - // Assumed success probability of "good" nodes (queues) used when calculating step size. - healthySuccessProbability = 0.95 ) // FailureEstimator is a system for answering the following question: @@ -28,53 +31,41 @@ const ( // Denote by // - P_q the probability of a job from queue q succeeding when running on a perfect node and // - P_n is the probability of a perfect job succeeding on node n. 
-// The success probability of a job from queue q on node n is then Pr(p_q*p_n=1), +// The success probability of a job from queue q on node n is then Pr(p_q*p_n = 1), // where p_q and p_n are drawn from Bernoulli distributions with parameter P_q and P_n, respectively. // -// Now, the goal is to jointly estimate P_q and P_n for each queue and node using observed successes and failures. -// The method used is statistical and only relies on knowing which queue a job belongs to and on which node it ran. -// The intuition of the method is that: -// - A job from a queue with many failures doesn't say much about the node; likely it's the job that's the problem. -// - A job failing on a node with many failures doesn't say much about the job; likely it's the node that's the problem. +// Now, the goal is to jointly estimate P_q and P_n for each queue and node from observed successes and failures. +// We do so here with a statistical method. The intuition of the method is that: +// - A job from a queue with many failures failing doesn't say much about the node; likely the problem is with the job. +// - A job failing on a node with many failures doesn't say much about the job; likely the problem is with the node. // And vice versa. // -// Specifically, we maximise the log-likelihood function of P_q and P_n using observed successes and failures. +// Specifically, we maximise the log-likelihood function of P_q and P_n over observed successes and failures. // This maximisation is performed using online gradient descent, where for each success or failure, -// we update the corresponding P_q and P_n by taking a gradient step. -// See New(...) for more details regarding step size. -// -// Finally, we exponentially decay P_q and P_N towards 1 over time, -// such that nodes and queues for which we observe no failures appear to become healthier over time. -// See New(...) function for details regarding decay. +// we update the corresponding P_q and P_n by taking a gradient step. See the Update() function for details. // // This module internally only maintains success probability estimates, as this makes the maths cleaner. -// When exporting these via API calls we convert to failure probabilities as these are more intuitive to reason about. +// We convert these to failure probabilities when exporting these via API calls. type FailureEstimator struct { - // Map from node (queue) name to the estimated success probability of that node (queue). For example: - // - successProbabilityByNode["myNode"] = 0.85]: estimated failure probability of a perfect job run on "myNode" is 15%. - // - successProbabilityByQueue["myQueue"] = 0.60]: estimated failure probability of a job from "myQueue" run on a perfect node is 40%. - successProbabilityByNode map[string]float64 - successProbabilityByQueue map[string]float64 - - // Success probability below which to consider nodes (jobs) unhealthy. - nodeSuccessProbabilityCordonThreshold float64 - queueSuccessProbabilityCordonThreshold float64 - - // Exponential decay factor controlling how quickly estimated node (queue) success probability decays towards 1. - // Computed from: - // - {node, queue}SuccessProbabilityCordonThreshold - // - {node, queue}CordonTimeout - nodeFailureProbabilityDecayRate float64 - queueFailureProbabilityDecayRate float64 - timeOfLastDecay time.Time - - // Gradient descent step size. Controls the extent to which new data affects successProbabilityBy{Node, Queue}. 
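Spelling out the objective sketched in the comment above: for a single observed run of a job from queue q on node n, the negative log-likelihood that the estimator descends on, and its gradient as computed by negLogLikelihoodGradient later in this file, are

\[ -\log L(P_n, P_q) = \begin{cases} -\log(P_n P_q) & \text{on success} \\ -\log(1 - P_n P_q) & \text{on failure} \end{cases} \]

\[ \frac{\partial (-\log L)}{\partial P_n} = \begin{cases} -1/P_n & \text{on success} \\ P_q/(1 - P_n P_q) & \text{on failure} \end{cases} \qquad \frac{\partial (-\log L)}{\partial P_q} = \begin{cases} -1/P_q & \text{on success} \\ P_n/(1 - P_n P_q) & \text{on failure} \end{cases} \]

Parameters are clipped to [eps, 1-eps] after each step (see applyBounds), which keeps these expressions finite.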
- // Computed from: - // - {node, queue}SuccessProbabilityCordonThreshold - // - {node, queue}FailureProbabilityDecayRate - // - {node, queue}EquilibriumFailureRate - nodeStepSize float64 - queueStepSize float64 + // Success probability estimates for all nodes and queues. + parameters *mat.VecDense + intermediateParameters *mat.VecDense + + // Gradient buffer. + gradient *mat.VecDense + + // Maps node (queue) names to the corresponding index of parameters. + // E.g., if parameterIndexByNode["myNode"] = 10, then parameters[10] is the estimated success probability of myNode. + parameterIndexByNode map[string]int + parameterIndexByQueue map[string]int + + // Samples that have not been processed yet. + samples []Sample + + // Optimisation settings. + numInnerIterations int + innerOptimiser optimisation.Optimiser + outerOptimiser optimisation.Optimiser // Prometheus metrics. failureProbabilityByNodeDesc *prometheus.Desc @@ -84,88 +75,41 @@ type FailureEstimator struct { disabled bool // Mutex protecting the above fields. - // Prevents concurrent map modification issues when scraping metrics. + // Prevents concurrent map modification when scraping metrics. mu sync.Mutex } -// New returns a new FailureEstimator. Parameters have the following meaning: -// - {node, queue}SuccessProbabilityCordonThreshold: Success probability below which nodes (queues) are considered unhealthy. -// - {node, queue}CordonTimeout: Amount of time for which nodes (queues) remain unhealthy in the absence of any job successes or failures for that node (queue). -// - {node, queue}EquilibriumFailureRate: Job failures per second necessary for a node (queue) to remain unhealthy in the absence of any successes for that node (queue). +type Sample struct { + i int + j int + c bool +} + +// New returns a new FailureEstimator. 
func New( - nodeSuccessProbabilityCordonThreshold float64, - queueSuccessProbabilityCordonThreshold float64, - nodeCordonTimeout time.Duration, - queueCordonTimeout time.Duration, - nodeEquilibriumFailureRate float64, - queueEquilibriumFailureRate float64, + numInnerIterations int, + innerOptimiser optimisation.Optimiser, + outerOptimiser optimisation.Optimiser, ) (*FailureEstimator, error) { - if nodeSuccessProbabilityCordonThreshold < 0 || nodeSuccessProbabilityCordonThreshold > 1 { - return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ - Name: "nodeSuccessProbabilityCordonThreshold", - Value: nodeSuccessProbabilityCordonThreshold, - Message: fmt.Sprintf("outside allowed range [0, 1]"), - }) - } - if queueSuccessProbabilityCordonThreshold < 0 || queueSuccessProbabilityCordonThreshold > 1 { + if numInnerIterations < 1 { return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ - Name: "queueSuccessProbabilityCordonThreshold", - Value: queueSuccessProbabilityCordonThreshold, - Message: fmt.Sprintf("outside allowed range [0, 1]"), - }) - } - if nodeCordonTimeout < 0 { - return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ - Name: "nodeCordonTimeout", - Value: nodeCordonTimeout, - Message: fmt.Sprintf("outside allowed range [0, Inf)"), - }) - } - if queueCordonTimeout < 0 { - return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ - Name: "queueCordonTimeout", - Value: queueCordonTimeout, - Message: fmt.Sprintf("outside allowed range [0, Inf)"), - }) - } - if nodeEquilibriumFailureRate <= 0 { - return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ - Name: "nodeEquilibriumFailureRate", - Value: nodeEquilibriumFailureRate, - Message: fmt.Sprintf("outside allowed range (0, Inf)"), - }) - } - if queueEquilibriumFailureRate <= 0 { - return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ - Name: "queueEquilibriumFailureRate", - Value: queueEquilibriumFailureRate, - Message: fmt.Sprintf("outside allowed range (0, Inf)"), + Name: "numInnerIterations", + Value: numInnerIterations, + Message: fmt.Sprintf("outside allowed range [1, Inf)"), }) } + return &FailureEstimator{ + parameters: mat.NewVecDense(32, armadaslices.Fill[float64](0.5, 32)), + intermediateParameters: mat.NewVecDense(32, armadaslices.Zeros[float64](32)), + gradient: mat.NewVecDense(32, armadaslices.Zeros[float64](32)), - // Compute decay rates such that a node (queue) with success probability 0 will over {node, queue}CordonTimeout time - // decay to a success probability of {node, queue}SuccessProbabilityCordonThreshold. - nodeFailureProbabilityDecayRate := math.Exp(math.Log(1-nodeSuccessProbabilityCordonThreshold) / nodeCordonTimeout.Seconds()) - queueFailureProbabilityDecayRate := math.Exp(math.Log(1-queueSuccessProbabilityCordonThreshold) / queueCordonTimeout.Seconds()) + parameterIndexByNode: make(map[string]int, 16), + parameterIndexByQueue: make(map[string]int, 16), - // Compute step size such that a node (queue) with success probability {node, queue}SuccessProbabilityCordonThreshold - // for which we observe 0 successes and {node, queue}EquilibriumFailureRate failures per second from "good" nodes (queues) - // will remain at exactly {node, queue}SuccessProbabilityCordonThreshold success probability. 
- dNodeSuccessProbability := healthySuccessProbability / (1 - nodeSuccessProbabilityCordonThreshold*healthySuccessProbability) - dQueueSuccessProbability := healthySuccessProbability / (1 - queueSuccessProbabilityCordonThreshold*healthySuccessProbability) - nodeStepSize := (1 - nodeSuccessProbabilityCordonThreshold - (1-nodeSuccessProbabilityCordonThreshold)*nodeFailureProbabilityDecayRate) / dNodeSuccessProbability / nodeEquilibriumFailureRate - queueStepSize := (1 - queueSuccessProbabilityCordonThreshold - (1-queueSuccessProbabilityCordonThreshold)*queueFailureProbabilityDecayRate) / dQueueSuccessProbability / queueEquilibriumFailureRate + numInnerIterations: numInnerIterations, + innerOptimiser: innerOptimiser, + outerOptimiser: outerOptimiser, - return &FailureEstimator{ - successProbabilityByNode: make(map[string]float64, 1024), - successProbabilityByQueue: make(map[string]float64, 128), - nodeSuccessProbabilityCordonThreshold: nodeSuccessProbabilityCordonThreshold, - queueSuccessProbabilityCordonThreshold: queueSuccessProbabilityCordonThreshold, - nodeFailureProbabilityDecayRate: nodeFailureProbabilityDecayRate, - queueFailureProbabilityDecayRate: queueFailureProbabilityDecayRate, - timeOfLastDecay: time.Now(), - nodeStepSize: nodeStepSize, - queueStepSize: queueStepSize, failureProbabilityByNodeDesc: prometheus.NewDesc( fmt.Sprintf("%s_%s_node_failure_probability", namespace, subsystem), "Estimated per-node failure probability.", @@ -195,61 +139,93 @@ func (fe *FailureEstimator) IsDisabled() bool { return fe.disabled } -// Decay moves the success probabilities of nodes (queues) closer to 1, depending on the configured cordon timeout. -// Periodically calling Decay() ensures nodes (queues) considered unhealthy are eventually considered healthy again, -// even if we observe no successes for those nodes (queues). -func (fe *FailureEstimator) Decay() { +// Push adds a sample to the internal buffer of the failure estimator. +// Samples added via Push are processed on the next call to Update. 
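To illustrate the indexing scheme implemented by Push below (hypothetical node and queue names; the behaviour matches the expectations in failureestimator_test.go): nodes and queues share a single flat parameter vector, each previously unseen name is appended in arrival order, and new entries start at a success probability of 0.5.

fe.Push("node-a", "queue-a", false) // node-a -> parameters[0], queue-a -> parameters[1]
fe.Push("node-b", "queue-a", true)  // node-b -> parameters[2], queue-a reuses parameters[1]
fe.Push("node-c", "queue-b", false) // node-c -> parameters[3], queue-b -> parameters[4]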
+func (fe *FailureEstimator) Push(node, queue string, success bool) { fe.mu.Lock() defer fe.mu.Unlock() - t := time.Now() - fe.decay(t.Sub(fe.timeOfLastDecay).Seconds()) - fe.timeOfLastDecay = t - return -} -func (fe *FailureEstimator) decay(secondsSinceLastDecay float64) { - nodeFailureProbabilityDecay := math.Pow(fe.nodeFailureProbabilityDecayRate, secondsSinceLastDecay) - for k, v := range fe.successProbabilityByNode { - failureProbability := 1 - v - failureProbability *= nodeFailureProbabilityDecay - successProbability := 1 - failureProbability - fe.successProbabilityByNode[k] = applyBounds(successProbability) + i, ok := fe.parameterIndexByNode[node] + if !ok { + i = len(fe.parameterIndexByNode) + len(fe.parameterIndexByQueue) + fe.parameterIndexByNode[node] = i + } + j, ok := fe.parameterIndexByQueue[queue] + if !ok { + j = len(fe.parameterIndexByNode) + len(fe.parameterIndexByQueue) + fe.parameterIndexByQueue[queue] = j } + fe.extendParameters(armadamath.Max(i, j) + 1) + fe.samples = append(fe.samples, Sample{ + i: i, + j: j, + c: success, + }) +} - queueFailureProbabilityDecay := math.Pow(fe.queueFailureProbabilityDecayRate, secondsSinceLastDecay) - for k, v := range fe.successProbabilityByQueue { - failureProbability := 1 - v - failureProbability *= queueFailureProbabilityDecay - successProbability := 1 - failureProbability - fe.successProbabilityByQueue[k] = applyBounds(successProbability) +func (fe *FailureEstimator) extendParameters(n int) { + oldN := fe.parameters.Len() + fe.parameters = linalg.ExtendVecDense(fe.parameters, n) + if oldN < n { + for i := oldN; i < n; i++ { + // Initialise new parameters with 50% success probability. + fe.parameters.SetVec(i, 0.5) + } } - return + fe.intermediateParameters = linalg.ExtendVecDense(fe.intermediateParameters, n) + fe.gradient = linalg.ExtendVecDense(fe.gradient, n) } -// Update with success=false decreases the estimated success probability of the provided node and queue. -// Calling with success=true increases the estimated success probability of the provided node and queue. -// This update is performed by taking one gradient descent step. -func (fe *FailureEstimator) Update(node, queue string, success bool) { +// Update success probability estimates based on pushed samples. +func (fe *FailureEstimator) Update() { fe.mu.Lock() defer fe.mu.Unlock() - - // Assume that nodes (queues) we haven't seen before have a 50% success probability. - // Avoiding extreme values for new nodes (queues) helps avoid drastic changes to existing estimates. - nodeSuccessProbability, ok := fe.successProbabilityByNode[node] - if !ok { - nodeSuccessProbability = 0.5 + if len(fe.samples) == 0 { + // Nothing to do. + return } - queueSuccessProbability, ok := fe.successProbabilityByQueue[queue] - if !ok { - queueSuccessProbability = 0.5 + + // Inner loop to compute intermediateParameters from samples. + // Passing over samples multiple times in random order helps improve convergence. + fe.intermediateParameters.CopyVec(fe.parameters) + for k := 0; k < fe.numInnerIterations; k++ { + + // Compute gradient with respect to updates. + fe.gradient.Zero() + slices.Shuffle(fe.samples) + for _, sample := range fe.samples { + gi, gj := fe.negLogLikelihoodGradient( + fe.intermediateParameters.AtVec(sample.i), + fe.intermediateParameters.AtVec(sample.j), + sample.c, + ) + fe.gradient.SetVec(sample.i, fe.gradient.AtVec(sample.i)+gi) + fe.gradient.SetVec(sample.j, fe.gradient.AtVec(sample.j)+gj) + } + + // Update intermediateParameters using this gradient. 
+ fe.innerOptimiser.Extend(fe.intermediateParameters.Len()) + fe.intermediateParameters = fe.innerOptimiser.Update(fe.intermediateParameters, fe.intermediateParameters, fe.gradient) + applyBoundsVec(fe.intermediateParameters) } - dNodeSuccessProbability, dQueueSuccessProbability := fe.negLogLikelihoodGradient(nodeSuccessProbability, queueSuccessProbability, success) - nodeSuccessProbability -= fe.nodeStepSize * dNodeSuccessProbability - queueSuccessProbability -= fe.queueStepSize * dQueueSuccessProbability + // Let the gradient be the difference between parameters and intermediateParameters, + // i.e., we use the inner loop as a method to estimate the gradient, + // and then update parameters using this gradient and the outer optimiser. + fe.gradient.CopyVec(fe.parameters) + fe.gradient.SubVec(fe.gradient, fe.intermediateParameters) + fe.outerOptimiser.Extend(fe.parameters.Len()) + fe.parameters = fe.outerOptimiser.Update(fe.parameters, fe.parameters, fe.gradient) + applyBoundsVec(fe.parameters) + + // Empty the buffer. + fe.samples = fe.samples[0:0] +} - fe.successProbabilityByNode[node] = applyBounds(nodeSuccessProbability) - fe.successProbabilityByQueue[queue] = applyBounds(queueSuccessProbability) +func applyBoundsVec(vec *mat.VecDense) { + for i := 0; i < vec.Len(); i++ { + vec.SetVec(i, applyBounds(vec.AtVec(i))) + } } // applyBounds ensures values stay in the range [eps, 1-eps]. @@ -264,7 +240,7 @@ func applyBounds(v float64) float64 { } } -// negLogLikelihoodGradient returns the gradient of the negated log-likelihood function with respect to P_q and P_n. +// negLogLikelihoodGradient returns the gradient of the negated log-likelihood function. func (fe *FailureEstimator) negLogLikelihoodGradient(nodeSuccessProbability, queueSuccessProbability float64, success bool) (float64, float64) { if success { dNodeSuccessProbability := -1 / nodeSuccessProbability @@ -294,13 +270,13 @@ func (fe *FailureEstimator) Collect(ch chan<- prometheus.Metric) { // Report failure probability rounded to nearest multiple of 0.01. // (As it's unlikely the estimate is accurate to within less than this.) - for k, v := range fe.successProbabilityByNode { - failureProbability := 1 - v + for k, i := range fe.parameterIndexByNode { + failureProbability := 1 - fe.parameters.AtVec(i) failureProbability = math.Round(failureProbability*100) / 100 ch <- prometheus.MustNewConstMetric(fe.failureProbabilityByNodeDesc, prometheus.GaugeValue, failureProbability, k) } - for k, v := range fe.successProbabilityByQueue { - failureProbability := 1 - v + for k, j := range fe.parameterIndexByQueue { + failureProbability := 1 - fe.parameters.AtVec(j) failureProbability = math.Round(failureProbability*100) / 100 ch <- prometheus.MustNewConstMetric(fe.failureProbabilityByQueueDesc, prometheus.GaugeValue, failureProbability, k) } diff --git a/internal/scheduler/failureestimator/failureestimator_test.go b/internal/scheduler/failureestimator/failureestimator_test.go index 3b02f2a096a..5a4bbb3c8bd 100644 --- a/internal/scheduler/failureestimator/failureestimator_test.go +++ b/internal/scheduler/failureestimator/failureestimator_test.go @@ -1,190 +1,81 @@ package failureestimator import ( - "math" + "fmt" "testing" - "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" -) - -func TestNew(t *testing.T) { - successProbabilityCordonThreshold := 0.05 - cordonTimeout := 10 * time.Minute - equilibriumFailureRate := 0.1 - - // Node decay rate and step size tests. 
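Restating the two-level scheme in the Update method above in equation form: starting from the current parameters \(\theta\), the inner optimiser runs numInnerIterations passes over the buffered samples to produce intermediate parameters \(\tilde{\theta}\); the difference is then treated as a gradient for the outer (Nesterov-accelerated) optimiser,

\[ g \leftarrow \theta - \tilde{\theta}, \qquad \theta \leftarrow \mathrm{outerOptimiser.Update}(\theta, g), \]

with \(\theta\) clipped to \([\varepsilon, 1-\varepsilon]\) after both the inner and outer updates.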
- fe, err := New( - successProbabilityCordonThreshold, - 0, - cordonTimeout, - 0, - equilibriumFailureRate, - eps, - ) - require.NoError(t, err) - assert.LessOrEqual(t, fe.nodeFailureProbabilityDecayRate, 0.9999145148300861+eps) - assert.GreaterOrEqual(t, fe.nodeFailureProbabilityDecayRate, 0.9999145148300861-eps) - assert.LessOrEqual(t, fe.nodeStepSize, 0.0008142462434300169+eps) - assert.GreaterOrEqual(t, fe.nodeStepSize, 0.0008142462434300169-eps) - - // Queue decay rate and step size tests. - fe, err = New( - 0, - successProbabilityCordonThreshold, - 0, - cordonTimeout, - eps, - equilibriumFailureRate, - ) - require.NoError(t, err) - assert.LessOrEqual(t, fe.queueFailureProbabilityDecayRate, 0.9999145148300861+eps) - assert.GreaterOrEqual(t, fe.queueFailureProbabilityDecayRate, 0.9999145148300861-eps) - assert.LessOrEqual(t, fe.queueStepSize, 0.0008142462434300169+eps) - assert.GreaterOrEqual(t, fe.queueStepSize, 0.0008142462434300169-eps) -} - -func TestDecay(t *testing.T) { - successProbabilityCordonThreshold := 0.05 - cordonTimeout := 10 * time.Minute - equilibriumFailureRate := 0.1 - // Node decay tests. - fe, err := New( - successProbabilityCordonThreshold, - 0, - cordonTimeout, - 0, - equilibriumFailureRate, - eps, - ) - require.NoError(t, err) - - fe.successProbabilityByNode["foo"] = 0.0 - fe.successProbabilityByNode["bar"] = 0.4 - fe.successProbabilityByNode["baz"] = 1.0 - - fe.decay(0) - assert.Equal( - t, - fe.successProbabilityByNode, - map[string]float64{ - "foo": eps, - "bar": 0.4, - "baz": 1.0 - eps, - }, - ) - - fe.decay(1) - assert.Equal( - t, - fe.successProbabilityByNode, - map[string]float64{ - "foo": 1 - (1-eps)*math.Pow(fe.nodeFailureProbabilityDecayRate, 1), - "bar": 1 - (1-0.4)*math.Pow(fe.nodeFailureProbabilityDecayRate, 1), - "baz": 1.0 - eps, - }, - ) - - fe.decay(1e6) - assert.Equal( - t, - fe.successProbabilityByNode, - map[string]float64{ - "foo": 1.0 - eps, - "bar": 1.0 - eps, - "baz": 1.0 - eps, - }, - ) - - // Queue decay tests. - fe, err = New( - 0, - successProbabilityCordonThreshold, - 0, - cordonTimeout, - eps, - equilibriumFailureRate, - ) - require.NoError(t, err) - - fe.successProbabilityByQueue["foo"] = 0.0 - fe.successProbabilityByQueue["bar"] = 0.4 - fe.successProbabilityByQueue["baz"] = 1.0 - - fe.decay(0) - assert.Equal( - t, - fe.successProbabilityByQueue, - map[string]float64{ - "foo": eps, - "bar": 0.4, - "baz": 1.0 - eps, - }, - ) - - fe.decay(1) - assert.Equal( - t, - fe.successProbabilityByQueue, - map[string]float64{ - "foo": 1 - (1-eps)*math.Pow(fe.queueFailureProbabilityDecayRate, 1), - "bar": 1 - (1-0.4)*math.Pow(fe.queueFailureProbabilityDecayRate, 1), - "baz": 1.0 - eps, - }, - ) - - fe.decay(1e6) - assert.Equal( - t, - fe.successProbabilityByQueue, - map[string]float64{ - "foo": 1.0 - eps, - "bar": 1.0 - eps, - "baz": 1.0 - eps, - }, - ) -} + "github.com/armadaproject/armada/internal/common/optimisation/descent" + "github.com/armadaproject/armada/internal/common/optimisation/nesterov" +) func TestUpdate(t *testing.T) { - successProbabilityCordonThreshold := 0.05 - cordonTimeout := 10 * time.Minute - equilibriumFailureRate := 0.1 - fe, err := New( - successProbabilityCordonThreshold, - successProbabilityCordonThreshold, - cordonTimeout, - cordonTimeout, - equilibriumFailureRate, - equilibriumFailureRate, + 10, + descent.MustNew(0.05), + nesterov.MustNew(0.05, 0.2), ) require.NoError(t, err) - fe.Update("node", "queue", false) - nodeSuccessProbability, ok := fe.successProbabilityByNode["node"] + // Test initialisation. 
+ fe.Push("node", "queue", false) + nodeParameterIndex, ok := fe.parameterIndexByNode["node"] + require.True(t, ok) + queueParameterIndex, ok := fe.parameterIndexByQueue["queue"] + require.True(t, ok) + require.Equal(t, 0, nodeParameterIndex) + require.Equal(t, 1, queueParameterIndex) + require.Equal(t, 0.5, fe.parameters.AtVec(0)) + require.Equal(t, 0.5, fe.parameters.AtVec(1)) + + for i := 0; i < 100; i++ { + fe.Push(fmt.Sprintf("node-%d", i), "queue-0", false) + } + nodeParameterIndex, ok = fe.parameterIndexByNode["node-99"] require.True(t, ok) - queueSuccessProbability, ok := fe.successProbabilityByQueue["queue"] + queueParameterIndex, ok = fe.parameterIndexByQueue["queue-0"] require.True(t, ok) + require.Equal(t, 2+100, nodeParameterIndex) + require.Equal(t, 3, queueParameterIndex) + require.Equal(t, 0.5, fe.parameters.AtVec(102)) + require.Equal(t, 0.5, fe.parameters.AtVec(3)) + + // Test that the estimates move in the expected direction on failure. + fe.Update() + nodeSuccessProbability := fe.parameters.AtVec(0) + queueSuccessProbability := fe.parameters.AtVec(1) assert.Greater(t, nodeSuccessProbability, eps) assert.Greater(t, queueSuccessProbability, eps) - assert.Less(t, nodeSuccessProbability, healthySuccessProbability-eps) - assert.Less(t, queueSuccessProbability, healthySuccessProbability-eps) - - fe.Update("node", "queue", true) - assert.Greater(t, fe.successProbabilityByNode["node"], nodeSuccessProbability) - assert.Greater(t, fe.successProbabilityByQueue["queue"], queueSuccessProbability) - - for i := 0; i < 100000; i++ { - fe.Update("node", "queue", false) + assert.Less(t, nodeSuccessProbability, 0.5-eps) + assert.Less(t, queueSuccessProbability, 0.5-eps) + + // Test that the estimates move in the expected direction on success. + fe.Push("node", "queue", true) + fe.Update() + assert.Greater(t, fe.parameters.AtVec(0), nodeSuccessProbability) + assert.Greater(t, fe.parameters.AtVec(1), queueSuccessProbability) + + for i := 0; i < 1000; i++ { + for i := 0; i < 10; i++ { + fe.Push("node", "queue", false) + } + fe.Update() } - assert.Equal(t, fe.successProbabilityByNode["node"], eps) - assert.Equal(t, fe.successProbabilityByQueue["queue"], eps) - - for i := 0; i < 100000; i++ { - fe.Update("node", "queue", true) + assert.Greater(t, fe.parameters.AtVec(0), 0.0) + assert.Greater(t, fe.parameters.AtVec(1), 0.0) + assert.Less(t, fe.parameters.AtVec(0), 2*eps) + assert.Less(t, fe.parameters.AtVec(1), 2*eps) + + for i := 0; i < 1000; i++ { + for i := 0; i < 10; i++ { + fe.Push("node", "queue", true) + } + fe.Update() } - assert.Equal(t, fe.successProbabilityByNode["node"], 1-eps) - assert.Equal(t, fe.successProbabilityByQueue["queue"], 1-eps) + assert.Greater(t, fe.parameters.AtVec(0), 1-2*eps) + assert.Greater(t, fe.parameters.AtVec(1), 1-2*eps) + assert.Less(t, fe.parameters.AtVec(0), 1.0) + assert.Less(t, fe.parameters.AtVec(1), 1.0) } diff --git a/internal/scheduler/failureestimator/simulator.jl b/internal/scheduler/failureestimator/simulator.jl new file mode 100644 index 00000000000..726a0ad8991 --- /dev/null +++ b/internal/scheduler/failureestimator/simulator.jl @@ -0,0 +1,352 @@ +using Random +using StatsBase +using Optimization +using OptimizationOptimJL +using OptimizationBBO +using ForwardDiff +using Plots + +# Julia script to simulate failure estimation and optimise parameters. 
+ +function neg_log_likelihood(As, Bs, Is, cs) + llsum = 0 + for k = 1:length(cs) + i = Is[k, 1] + j = Is[k, 2] + if cs[k] + llsum += log(As[i]*Bs[j]) + else + llsum += log(1 - As[i]*Bs[j]) + end + end + return -llsum +end + +function neg_log_likelihood_gradient!(dAs, dBs, As, Bs, Is, cs) + dAs .= 0 + dBs .= 0 + for k = 1:length(cs) + i = Is[k, 1] + j = Is[k, 2] + dA, dB = neg_log_likelihood_gradient_inner(As[i], Bs[j], cs[k]) + dAs[i] += dA + dBs[j] += dB + end + return dAs, dBs +end + +function neg_log_likelihood_gradient_inner(A, B, c) + if c + dA = -1 / A + dB = -1 / B + return dA, dB + else + dA = B / (1 - A*B) + dB = A / (1 - A*B) + return dA, dB + end +end + +function f(u, p) + Is, cs, n = p + As = view(u, 1:n) + Bs = view(u, n+1:length(u)) + return neg_log_likelihood(As, Bs, Is, cs) +end + +function g(G, u, p) + Is, cs, n = p + dAs = view(G, 1:n) + dBs = view(G, n+1:length(u)) + neg_log_likelihood_gradient!(dAs, dBs, As, Bs, Is, cs) + return G +end + +function global_optimization_solution(n, k, Is, cs) + prob = OptimizationProblem( + OptimizationFunction(f, grad=g), + ones(n+k)./2, + (Is, cs, n), + lb=zeros(n+k), + ub=ones(n+k), + ) + return solve(prob, NelderMead()) +end + +function gd(Is, Js, cs, ts, n, k; inner_opt, outer_opt, update_interval::Float64=60.0, num_sub_iterations::Integer=1) + nsamples = length(cs) + x = fill(0.5, n+k) + xk = zeros(n+k) + g = zeros(n+k) + As = zeros(n, nsamples) + Bs = zeros(k, nsamples) + Ak = view(x, 1:n) + Bk = view(x, (n+1):(n+k)) + Aki = view(xk, 1:n) + Bki = view(xk, (n+1):(n+k)) + Agki = view(g, 1:n) + Bgki = view(g, (n+1):(n+k)) + + index_of_last_update = 0 + time_of_last_update = 0.0 + for sample_index = 1:nsamples + As[:, sample_index] .= Ak + Bs[:, sample_index] .= Bk + + if !(ts[sample_index] - time_of_last_update >= update_interval) + # Not yet time to update. + continue + end + + # Perform several gradient descent steps over the collected data. + xk .= x + batch_indices = collect((index_of_last_update+1):sample_index) + for _ = 1:num_sub_iterations + g .= 0 + shuffle!(batch_indices) + for batch_index = batch_indices + i = Is[batch_index] + j = Js[batch_index] + dA, dB = neg_log_likelihood_gradient_inner(Aki[i], Bki[j], cs[batch_index]) + Agki[i] += dA + Bgki[j] += dB + end + + Flux.Optimise.update!(inner_opt, xk, g) + xk .= min.(max.(xk, 0), 1) + end + + g .= x .- xk + Flux.Optimise.update!(outer_opt, x, g) + x .= min.(max.(x, 0), 1) + + index_of_last_update = sample_index + time_of_last_update = ts[sample_index] + + As[:, sample_index] .= Ak + Bs[:, sample_index] .= Bk + end + return As, Bs +end + +struct NodeTemplate + n::Int + p::Float64 + df::Float64 +end + +struct QueueTemplate + k::Int + p::Float64 + df::Float64 + ds::Float64 + ws::Vector{Int} +end + +struct Parameters + # Minimum simulation time. 
+ t::Float64 + node_templates::Vector{NodeTemplate} + queue_templates::Vector{QueueTemplate} +end + +function scenario1() + return Parameters( + 36000, + [ + NodeTemplate(1, 0.9, 60), + NodeTemplate(1, 0.01, 60), + ], + [ + QueueTemplate(1, 0.9, 60, 60, [1]), + QueueTemplate(1, 0.01, 60, 60, [1]), + ], + ) +end + +function scenario2() + return Parameters( + 36000, + [ + NodeTemplate(18, 0.9, 60), + NodeTemplate(2, 0.1, 60), + ], + [ + QueueTemplate(1, 0.9, 60, 60, [1]), + QueueTemplate(1, 0.1, 60, 60, [1]), + ], + ) +end + +function scenario3() + return Parameters( + 36000, + [ + NodeTemplate(90, 0.9, 60), + NodeTemplate(10, 0.1, 60), + ], + [ + QueueTemplate(8, 0.9, 60, 600, ones(Int, 8)), + QueueTemplate(2, 0.1, 60, 600, ones(Int, 2)), + ], + ) +end + +function simulate(p::Parameters) + for tmpl in p.queue_templates + if length(tmpl.ws) != tmpl.k + throw(ArgumentError("ws must be of length k")) + end + end + n = sum(tmpl.n for tmpl in p.node_templates) + k = sum(tmpl.k for tmpl in p.queue_templates) + ws = [sum(tmpl.ws) for tmpl in p.queue_templates] + ncs = cumsum(tmpl.n for tmpl in p.node_templates) + kcs = cumsum(tmpl.k for tmpl in p.queue_templates) + + Is = Vector{Int}() + Js = Vector{Int}() + as = Vector{Bool}() + bs = Vector{Bool}() + cs = Vector{Bool}() + ts = Vector{Float64}() + for (node_template_index, node_template) in enumerate(p.node_templates) + for node_template_i = 1:node_template.n + i = node_template_i + if node_template_index > 1 + i += ncs[node_template_index-1] + end + + t = 0.0 + while t < p.t + queue_template_index = sample(1:length(p.queue_templates), Weights(ws)) + queue_template = p.queue_templates[queue_template_index] + j = sample(1:queue_template.k, Weights(queue_template.ws)) + if queue_template_index > 1 + j += kcs[queue_template_index-1] + end + + # Sample node and queue failure. + a = rand() < node_template.p + b = rand() < queue_template.p + c = a*b + + # Compute duration until job termination. + d = 0.0 + if !a + # Node failure. + d = node_template.df + elseif !b + # Queue failure. + d = queue_template.df + else + # Job success. + d = queue_template.ds + end + + t += d + push!(Is, i) + push!(Js, j) + push!(as, a) + push!(bs, b) + push!(cs, c) + push!(ts, t) + end + end + end + + # Sort by time and return. + p = sortperm(ts) + return Is[p], Js[p], as[p], bs[p], cs[p], ts[p] +end + +function rate_by_index(Is, as) + as_by_i = Dict{Int, Vector{Bool}}() + for k = 1:length(Is) + i = Is[k] + a = as[k] + vs = get(as_by_i, i, Vector{Bool}()) + as_by_i[i] = push!(vs, a) + end + rate_by_index = Dict{Int, Float64}() + for (i, vs) in as_by_i + rate_by_index[i] = sum(vs)/length(vs) + end + return rate_by_index +end + +function squared_error(p::Parameters, As, Bs) + Ase = copy(As) + Bse = copy(Bs) + i = 1 + for node_template = p.node_templates + for _ = 1:node_template.n + Ase[i, :] .-= node_template.p + Ase[i, :] .^= 2 + i += 1 + end + end + j = 1 + for queue_template = p.queue_templates + for _ = 1:queue_template.k + Bse[j, :] .-= queue_template.p + Bse[j, :] .^= 2 + j += 1 + end + end + return Ase, Bse +end + +function grid_search(p::Parameters; num_simulations=10) + n = sum(tmpl.n for tmpl in p.node_templates) + k = sum(tmpl.k for tmpl in p.queue_templates) + + it = Base.Iterators.product( + # Inner step-size. + range(1e-5, 1e-2, length=10), + # Number of sub-iterations. + range(1, 10), + # Outer step-size. + range(1e-2, 0.25, length=10), + # Outer Nesterov acceleration. 
+ range(0.0, 0.99, length=10) + ) + mses = zeros(length(it)) + for _ = num_simulations + Is, Js, as, bs, cs, ts = simulate(p) + for (i, (inner_step_size, num_sub_iterations, outer_step_size, outer_nesterov_acceleration)) = enumerate(it) + As, Bs = gd(Is, Js, cs, ts, n, k, inner_opt=Descent(inner_step_size), outer_opt=Nesterov(outer_step_size, outer_nesterov_acceleration), update_interval=60.0, num_sub_iterations=num_sub_iterations) + Ase, Bse = squared_error(p, As, Bs) + mses[i] += (sum(Ase) + sum(Bse)) / ((n+k)*length(cs)) + end + end + mses ./= num_simulations + return it, mses +end + +function main() + p = scenario3() + n = sum(tmpl.n for tmpl in p.node_templates) + k = sum(tmpl.k for tmpl in p.queue_templates) + + Is, Js, as, bs, cs, ts = simulate(p) + + plots = Vector{Plots.Plot}() + + As, Bs = gd(Is, Js, cs, ts, n, k, inner_opt=Descent(0.05), outer_opt=Nesterov(0.05, 0.2), update_interval=60.0, num_sub_iterations=10) + Ase, Bse = squared_error(p, As, Bs) + push!(plots, plot!(plot(ts, As', legend=false), ts, mean(Ase, dims=1)', color="black", legend=false)) + push!(plots, plot!(plot(ts, Bs', legend=false), ts, mean(Bse, dims=1)', color="black", legend=false)) + + As, Bs = gd(Is, Js, cs, ts, n, k, inner_opt=Descent(0.05), outer_opt=Nesterov(0.01, 0.9), update_interval=60.0, num_sub_iterations=10) + Ase, Bse = squared_error(p, As, Bs) + push!(plots, plot!(plot(ts, As', legend=false), ts, mean(Ase, dims=1)', color="black", legend=false)) + push!(plots, plot!(plot(ts, Bs', legend=false), ts, mean(Bse, dims=1)', color="black", legend=false)) + + As, Bs = gd(Is, Js, cs, ts, n, k, inner_opt=Descent(0.05), outer_opt=Nesterov(0.005, 0.99), update_interval=60.0, num_sub_iterations=10) + Ase, Bse = squared_error(p, As, Bs) + push!(plots, plot!(plot(ts, As', legend=false), ts, mean(Ase, dims=1)', color="black", legend=false)) + push!(plots, plot!(plot(ts, Bs', legend=false), ts, mean(Bse, dims=1)', color="black", legend=false)) + + plot(plots..., layout=(3, 2)) +end diff --git a/internal/scheduler/jobdb/jobdb_test.go b/internal/scheduler/jobdb/jobdb_test.go index abd6497dc2a..eda76e4e0b2 100644 --- a/internal/scheduler/jobdb/jobdb_test.go +++ b/internal/scheduler/jobdb/jobdb_test.go @@ -170,11 +170,23 @@ func TestJobDb_TestGetAll(t *testing.T) { require.NoError(t, err) actual := txn.GetAll() expected := []*Job{job1, job2} - slices.SortFunc(expected, func(a, b *Job) bool { - return a.id > b.id + slices.SortFunc(expected, func(a, b *Job) int { + if a.id > b.id { + return -1 + } else if a.id < b.id { + return 1 + } else { + return 0 + } }) - slices.SortFunc(actual, func(a, b *Job) bool { - return a.id > b.id + slices.SortFunc(actual, func(a, b *Job) int { + if a.id > b.id { + return -1 + } else if a.id < b.id { + return 1 + } else { + return 0 + } }) assert.Equal(t, expected, actual) } diff --git a/internal/scheduler/jobiteration.go b/internal/scheduler/jobiteration.go index e914988e4a3..f6733c26e39 100644 --- a/internal/scheduler/jobiteration.go +++ b/internal/scheduler/jobiteration.go @@ -72,8 +72,8 @@ func (repo *InMemoryJobRepository) EnqueueMany(jctxs []*schedulercontext.JobSche // sortQueue sorts jobs in a specified queue by the order in which they should be scheduled. 
func (repo *InMemoryJobRepository) sortQueue(queue string) { - slices.SortFunc(repo.jctxsByQueue[queue], func(a, b *schedulercontext.JobSchedulingContext) bool { - return a.Job.SchedulingOrderCompare(b.Job) == -1 + slices.SortFunc(repo.jctxsByQueue[queue], func(a, b *schedulercontext.JobSchedulingContext) int { + return a.Job.SchedulingOrderCompare(b.Job) }) } diff --git a/internal/scheduler/nodedb/nodeiteration_test.go b/internal/scheduler/nodedb/nodeiteration_test.go index 54447741808..5c23f158a88 100644 --- a/internal/scheduler/nodedb/nodeiteration_test.go +++ b/internal/scheduler/nodedb/nodeiteration_test.go @@ -45,7 +45,15 @@ func TestNodesIterator(t *testing.T) { } sortedNodes := slices.Clone(tc.Nodes) - slices.SortFunc(sortedNodes, func(a, b *schedulerobjects.Node) bool { return a.Id < b.Id }) + slices.SortFunc(sortedNodes, func(a, b *schedulerobjects.Node) int { + if a.Id < b.Id { + return -1 + } else if a.Id > b.Id { + return 1 + } else { + return 0 + } + }) expected := make([]int, len(sortedNodes)) for i, node := range sortedNodes { expected[i] = indexById[node.Id] diff --git a/internal/scheduler/preempting_queue_scheduler_test.go b/internal/scheduler/preempting_queue_scheduler_test.go index ea9650c66f9..9b4d622d7e2 100644 --- a/internal/scheduler/preempting_queue_scheduler_test.go +++ b/internal/scheduler/preempting_queue_scheduler_test.go @@ -2022,8 +2022,14 @@ func TestPreemptingQueueScheduler(t *testing.T) { // which jobs are preempted). slices.SortFunc( result.ScheduledJobs, - func(a, b *schedulercontext.JobSchedulingContext) bool { - return a.Job.GetSubmitTime().Before(b.Job.GetSubmitTime()) + func(a, b *schedulercontext.JobSchedulingContext) int { + if a.Job.GetSubmitTime().Before(b.Job.GetSubmitTime()) { + return -1 + } else if b.Job.GetSubmitTime().Before(a.Job.GetSubmitTime()) { + return 1 + } else { + return 0 + } }, ) var scheduledJobs []*jobdb.Job diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 556ce651b61..8598c1a8d60 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -280,7 +280,6 @@ func (s *Scheduler) cycle(ctx *armadacontext.Context, updateAll bool, leaderToke // Update success probability estimates. if !s.failureEstimator.IsDisabled() { - s.failureEstimator.Decay() for _, jst := range jsts { if jst.Job == nil { continue @@ -290,12 +289,13 @@ func (s *Scheduler) cycle(ctx *armadacontext.Context, updateAll bool, leaderToke continue } if jst.Failed { - s.failureEstimator.Update(run.NodeName(), jst.Job.GetQueue(), false) + s.failureEstimator.Push(run.NodeName(), jst.Job.GetQueue(), false) } if jst.Succeeded { - s.failureEstimator.Update(run.NodeName(), jst.Job.GetQueue(), true) + s.failureEstimator.Push(run.NodeName(), jst.Job.GetQueue(), true) } } + s.failureEstimator.Update() } // Generate any eventSequences that came out of synchronising the db state. 
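The scheduler cycle above now only records observations via Push and defers all optimisation to a single Update call per round. The Julia gd routine earlier in this patch mirrors the scheme Update is expected to follow: replay the samples collected since the last update through several plain gradient-descent passes on a copy of the parameters, then take one Nesterov-accelerated outer step from the current parameters towards that copy, keeping every estimate strictly inside (0, 1). The following is a minimal Go sketch under those assumptions; the sample struct and the gradient, clamp and update helpers are illustrative names rather than the estimator's real internals, and the outer step is shown as a plain interpolation where the real code presumably applies its Nesterov optimiser.

package main

import (
	"fmt"
	"math"

	"gonum.org/v1/gonum/mat"
)

// sample is one (node, queue, success) observation, as buffered by Push.
type sample struct {
	nodeIdx, queueIdx int
	success           bool
}

// clamp keeps every parameter strictly inside (0, 1), which is what the
// updated tests assert after long runs of failures or successes.
func clamp(v *mat.VecDense, eps float64) {
	for i := 0; i < v.Len(); i++ {
		v.SetVec(i, math.Min(math.Max(v.AtVec(i), eps), 1-eps))
	}
}

// gradient accumulates the negative log-likelihood gradient of the buffered
// samples under the product model P(success) = A_node * B_queue.
func gradient(params *mat.VecDense, samples []sample) *mat.VecDense {
	g := mat.NewVecDense(params.Len(), nil)
	for _, s := range samples {
		a, b := params.AtVec(s.nodeIdx), params.AtVec(s.queueIdx)
		if s.success {
			g.SetVec(s.nodeIdx, g.AtVec(s.nodeIdx)-1/a)
			g.SetVec(s.queueIdx, g.AtVec(s.queueIdx)-1/b)
		} else {
			g.SetVec(s.nodeIdx, g.AtVec(s.nodeIdx)+b/(1-a*b))
			g.SetVec(s.queueIdx, g.AtVec(s.queueIdx)+a/(1-a*b))
		}
	}
	return g
}

// update runs innerIterations descent passes on a copy of the parameters and
// then moves the parameters a fraction outerStep towards that copy.
func update(params *mat.VecDense, samples []sample, innerIterations int, innerStep, outerStep, eps float64) {
	inner := mat.VecDenseCopyOf(params)
	for it := 0; it < innerIterations; it++ {
		g := gradient(inner, samples)
		inner.AddScaledVec(inner, -innerStep, g)
		clamp(inner, eps)
	}
	direction := mat.NewVecDense(params.Len(), nil)
	direction.SubVec(inner, params)
	params.AddScaledVec(params, outerStep, direction) // real code: Nesterov step
	clamp(params, eps)
}

func main() {
	params := mat.NewVecDense(2, []float64{0.5, 0.5}) // index 0: node, index 1: queue
	samples := []sample{{0, 1, false}, {0, 1, false}, {0, 1, false}}
	update(params, samples, 10, 0.05, 0.05, 1e-4)
	fmt.Println(params.RawVector().Data) // both estimates fall below 0.5
}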
diff --git a/internal/scheduler/schedulerapp.go b/internal/scheduler/schedulerapp.go index 057da09276c..bc408ef156e 100644 --- a/internal/scheduler/schedulerapp.go +++ b/internal/scheduler/schedulerapp.go @@ -25,6 +25,8 @@ import ( grpcCommon "github.com/armadaproject/armada/internal/common/grpc" "github.com/armadaproject/armada/internal/common/health" "github.com/armadaproject/armada/internal/common/logging" + "github.com/armadaproject/armada/internal/common/optimisation/descent" + "github.com/armadaproject/armada/internal/common/optimisation/nesterov" "github.com/armadaproject/armada/internal/common/profiling" "github.com/armadaproject/armada/internal/common/pulsarutils" "github.com/armadaproject/armada/internal/common/serve" @@ -220,12 +222,14 @@ func Run(config schedulerconfig.Configuration) error { } failureEstimator, err := failureestimator.New( - config.Scheduling.FailureEstimatorConfig.NodeSuccessProbabilityCordonThreshold, - config.Scheduling.FailureEstimatorConfig.QueueSuccessProbabilityCordonThreshold, - config.Scheduling.FailureEstimatorConfig.NodeCordonTimeout, - config.Scheduling.FailureEstimatorConfig.QueueCordonTimeout, - config.Scheduling.FailureEstimatorConfig.NodeEquilibriumFailureRate, - config.Scheduling.FailureEstimatorConfig.QueueEquilibriumFailureRate, + config.Scheduling.FailureEstimatorConfig.NumInnerIterations, + // Invalid config will have failed validation. + descent.MustNew(config.Scheduling.FailureEstimatorConfig.InnerOptimiserStepSize), + // Invalid config will have failed validation. + nesterov.MustNew( + config.Scheduling.FailureEstimatorConfig.OuterOptimiserStepSize, + config.Scheduling.FailureEstimatorConfig.OuterOptimiserNesterovAcceleration, + ), ) if err != nil { return err diff --git a/internal/scheduler/schedulerobjects/nodetype.go b/internal/scheduler/schedulerobjects/nodetype.go index d52d35e36fb..020c57e675d 100644 --- a/internal/scheduler/schedulerobjects/nodetype.go +++ b/internal/scheduler/schedulerobjects/nodetype.go @@ -33,7 +33,15 @@ func NewNodeType(taints []v1.Taint, labels map[string]string, indexedTaints map[ // Sort taints to ensure node type id is consistent regardless of // the order in which taints are set on the node. - slices.SortFunc(taints, func(a, b v1.Taint) bool { return a.Key < b.Key }) // TODO: Use less ambiguous sorting. + slices.SortFunc(taints, func(a, b v1.Taint) int { + if a.Key < b.Key { + return -1 + } else if a.Key > b.Key { + return 1 + } else { + return 0 + } + }) // TODO: Use less ambiguous sorting. // Filter out any labels that should not be indexed. 
if indexedLabels != nil { diff --git a/internal/scheduler/schedulerobjects/podutils.go b/internal/scheduler/schedulerobjects/podutils.go index bbd087702db..1892c959b5b 100644 --- a/internal/scheduler/schedulerobjects/podutils.go +++ b/internal/scheduler/schedulerobjects/podutils.go @@ -227,45 +227,45 @@ func (skg *PodRequirementsSerialiser) AppendResourceList(out []byte, resourceLis return out } -func lessToleration(a, b v1.Toleration) bool { +func lessToleration(a, b v1.Toleration) int { if a.Key < b.Key { - return true + return -1 } else if a.Key > b.Key { - return false + return 1 } if a.Value < b.Value { - return true + return -1 } else if a.Value > b.Value { - return false + return 1 } if string(a.Operator) < string(b.Operator) { - return true + return -1 } else if string(a.Operator) > string(b.Operator) { - return false + return 1 } if string(a.Effect) < string(b.Effect) { - return true + return -1 } else if string(a.Effect) > string(b.Effect) { - return false + return 1 } - return true + return 0 } -func lessNodeSelectorRequirement(a, b v1.NodeSelectorRequirement) bool { +func lessNodeSelectorRequirement(a, b v1.NodeSelectorRequirement) int { if a.Key < b.Key { - return true + return -1 } else if a.Key > b.Key { - return false + return 1 } if string(a.Operator) < string(b.Operator) { - return true + return -1 } else if string(a.Operator) > string(b.Operator) { - return false + return 1 } if len(a.Values) < len(b.Values) { - return true + return -1 } else if len(a.Values) > len(b.Values) { - return false + return 1 } - return true + return 0 } diff --git a/internal/scheduler/simulator/simulator.go b/internal/scheduler/simulator/simulator.go index bf364bb75f1..b270b54bfd6 100644 --- a/internal/scheduler/simulator/simulator.go +++ b/internal/scheduler/simulator/simulator.go @@ -499,21 +499,21 @@ func (s *Simulator) handleScheduleEvent(ctx *armadacontext.Context) error { preemptedJobs := scheduler.PreemptedJobsFromSchedulerResult[*jobdb.Job](result) scheduledJobs := slices.Clone(result.ScheduledJobs) failedJobs := scheduler.FailedJobsFromSchedulerResult[*jobdb.Job](result) - lessJob := func(a, b *jobdb.Job) bool { + lessJob := func(a, b *jobdb.Job) int { if a.Queue() < b.Queue() { - return true + return -1 } else if a.Queue() > b.Queue() { - return false + return 1 } if a.Id() < b.Id() { - return true + return -1 } else if a.Id() > b.Id() { - return false + return 1 } - return false + return 0 } slices.SortFunc(preemptedJobs, lessJob) - slices.SortFunc(scheduledJobs, func(a, b *schedulercontext.JobSchedulingContext) bool { + slices.SortFunc(scheduledJobs, func(a, b *schedulercontext.JobSchedulingContext) int { return lessJob(a.Job.(*jobdb.Job), b.Job.(*jobdb.Job)) }) slices.SortFunc(failedJobs, lessJob)
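A note on the recurring SortFunc rewrites in this patch: slices.SortFunc now takes a three-way comparator returning a negative, zero or positive int instead of a less-than bool, which is why every call site above returns -1, 0 or 1. As a side effect this also tightens lessToleration and lessNodeSelectorRequirement, which previously returned true for equal values and so did not define a strict ordering. Where the if/else ladders get long, the same contract can be expressed with cmp.Compare; the sketch below uses the standard library cmp and slices packages, which require Go 1.21 or newer, whereas the patch itself may still import golang.org/x/exp/slices (the comparator contract is identical either way), and the job type stands in for *jobdb.Job with only the two sort keys used by lessJob.

package main

import (
	"cmp"
	"fmt"
	"slices"
)

// job models the two keys that lessJob in simulator.go sorts by.
type job struct {
	queue, id string
}

// compareJobs implements the three-way contract expected by SortFunc:
// negative if a sorts before b, zero if they are equal, positive otherwise.
func compareJobs(a, b job) int {
	if c := cmp.Compare(a.queue, b.queue); c != 0 {
		return c
	}
	return cmp.Compare(a.id, b.id)
}

func main() {
	jobs := []job{{"queue-b", "2"}, {"queue-a", "2"}, {"queue-a", "1"}}
	slices.SortFunc(jobs, compareJobs)
	fmt.Println(jobs) // [{queue-a 1} {queue-a 2} {queue-b 2}]
}

Returning 0 for genuinely equal elements also makes the same comparator reusable with binary-search helpers, which a bool "less" that reports true for equal values cannot safely do.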