From f6759d96afd584651dded37519c27cc1ea04a7a7 Mon Sep 17 00:00:00 2001 From: Bakhadyr Date: Fri, 28 Jun 2024 14:50:51 +0300 Subject: [PATCH 1/2] fail live probe if fail-fast mechanism was triggered Signed-off-by: Bakhadyr --- .../java/com/ibm/watson/modelmesh/ModelMesh.java | 16 +++++++++++++--- .../ibm/watson/modelmesh/ModelMeshEnvVars.java | 1 + 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/ibm/watson/modelmesh/ModelMesh.java b/src/main/java/com/ibm/watson/modelmesh/ModelMesh.java index f8779651..020d5888 100644 --- a/src/main/java/com/ibm/watson/modelmesh/ModelMesh.java +++ b/src/main/java/com/ibm/watson/modelmesh/ModelMesh.java @@ -1292,6 +1292,14 @@ boolean isLeader() { return le != null && le.isLeader(); } + @Override + protected boolean isLive() { + if (failFastUpgradeEnabled && failLiveOnFailFastEnabled) { + return btspSuccessCount != null || !abortStartup; + } + return true; + } + /* * We don't begin to return READY until no other members of the same logical * model-mesh deployment are in a terminating state. We can still receive @@ -1332,6 +1340,8 @@ protected boolean isReady() { /* -------------------------- "fail-fast" startup probation period feature -------------------- */ + protected volatile boolean failFastUpgradeEnabled; + protected volatile boolean failLiveOnFailFastEnabled; protected volatile boolean abortStartup; // flag used to abort startup in case of unexpected model loading failures protected AtomicInteger btspSuccessCount; // count of all succeeded load while bootstrap protected AtomicInteger btspFatalCount; // count of all fatal failures load while bootstrap @@ -1343,9 +1353,9 @@ protected boolean isReady() { BOOTSTRAP_CLEARANCE_PERIOD_MS = Long.parseLong(btspClearanceStr); } - boolean failfastUpgradeEnabled = !"false".equalsIgnoreCase( - System.getenv(FAILFAST_UPGRADE_ENV_VAR)); - if (failfastUpgradeEnabled) { + failFastUpgradeEnabled = !"false".equalsIgnoreCase(System.getenv(FAILFAST_UPGRADE_ENV_VAR)); + failLiveOnFailFastEnabled = "true".equalsIgnoreCase(System.getenv(FAIL_LIVE_ON_FAILFAST_ENV_VAR)); + if (failFastUpgradeEnabled) { btspSuccessCount = new AtomicInteger(); btspFatalCount = new AtomicInteger(); btspFailureCount = new AtomicInteger(); diff --git a/src/main/java/com/ibm/watson/modelmesh/ModelMeshEnvVars.java b/src/main/java/com/ibm/watson/modelmesh/ModelMeshEnvVars.java index 24c65e4c..30dca1bf 100644 --- a/src/main/java/com/ibm/watson/modelmesh/ModelMeshEnvVars.java +++ b/src/main/java/com/ibm/watson/modelmesh/ModelMeshEnvVars.java @@ -61,6 +61,7 @@ private ModelMeshEnvVars() {} public static final String BOOTSTRAP_CLEARANCE_PERIOD_ENV_VAR = "BOOTSTRAP_CLEARANCE_PERIOD_MS"; public static final String FAILFAST_UPGRADE_ENV_VAR = "MM_FAILFAST_UPGRADE_ENABLED"; + public static final String FAIL_LIVE_ON_FAILFAST_ENV_VAR = "MM_FAIL_LIVE_ON_FAILFAST_ENABLED"; public static final String GRPC_MAX_CONNECTION_AGE_SECS_ENV_VAR = "MM_SVC_GRPC_MAX_CONNECTION_AGE_SECS"; public static final String GRPC_MAX_CONNECTION_AGE_GRACE_SECS_ENV_VAR = "MM_SVC_GRPC_MAX_CONNECTION_AGE_GRACE_SECS"; From 390be673ea6675d6b6752d69007e58bb626e0d8f Mon Sep 17 00:00:00 2001 From: Bakhadyr Date: Fri, 25 Oct 2024 14:51:29 +0300 Subject: [PATCH 2/2] retrigger CI Signed-off-by: Bakhadyr