-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implementation of zombie cluster detection (#587)
* Implementation of zombie cluster detection
* Minor tweaks
* Don't zombify creating clusters, and fix error message
* Also check the project's billing
* Update libs dependency for bugfix
* PR feedback
* Add unit test to make sure Google errors don't cause zombification
* Update wb-libs to non-SNAP
- Loading branch information
Showing
10 changed files
with
403 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
6 changes: 6 additions & 0 deletions
6
src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/ZombieClusterConfig.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
package org.broadinstitute.dsde.workbench.leonardo.config | ||
|
||
import scala.concurrent.duration.FiniteDuration | ||
|
||
/**
 * Configuration for zombie cluster detection.
 *
 * @param enableZombieClusterDetection on/off switch for the feature (presumably consulted by whatever
 *                                     starts the monitor -- TODO confirm against the caller)
 * @param zombieCheckPeriod            period used for the zombie check
 */
case class ZombieClusterConfig(
  enableZombieClusterDetection: Boolean,
  zombieCheckPeriod: FiniteDuration
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
122 changes: 122 additions & 0 deletions
122
src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/ZombieClusterMonitor.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
package org.broadinstitute.dsde.workbench.leonardo.monitor | ||
|
||
import java.time.Instant | ||
|
||
import akka.actor.{Actor, Props, Timers} | ||
import cats.implicits._ | ||
import com.typesafe.scalalogging.LazyLogging | ||
import org.broadinstitute.dsde.workbench.google.GoogleProjectDAO | ||
import org.broadinstitute.dsde.workbench.leonardo.config.ZombieClusterConfig | ||
import org.broadinstitute.dsde.workbench.leonardo.dao.google.GoogleDataprocDAO | ||
import org.broadinstitute.dsde.workbench.leonardo.db.DbReference | ||
import org.broadinstitute.dsde.workbench.leonardo.model.google.ClusterStatus | ||
import org.broadinstitute.dsde.workbench.leonardo.model.{Cluster, ClusterError} | ||
import org.broadinstitute.dsde.workbench.leonardo.monitor.ZombieClusterMonitor._ | ||
import org.broadinstitute.dsde.workbench.model.google.GoogleProject | ||
|
||
import scala.concurrent.Future | ||
|
||
/**
 * Companion of the ZombieClusterMonitor actor: message protocol plus a Props factory.
 */
object ZombieClusterMonitor {

  /** Marker trait for everything declared in this actor's protocol. */
  sealed trait ZombieClusterMonitorMessage

  /** Tick message: tells the monitor to run one zombie detection sweep. */
  case object DetectZombieClusters extends ZombieClusterMonitorMessage

  /** Key under which the monitor registers its periodic timer. */
  case object TimerKey extends ZombieClusterMonitorMessage

  /**
   * Builds the Props used to create a ZombieClusterMonitor actor.
   *
   * All four arguments are forwarded unchanged to the actor's constructor.
   */
  def props(config: ZombieClusterConfig,
            gdDAO: GoogleDataprocDAO,
            googleProjectDAO: GoogleProjectDAO,
            dbRef: DbReference): Props =
    Props(new ZombieClusterMonitor(config, gdDAO, googleProjectDAO, dbRef))
}
|
||
/**
 * This monitor periodically sweeps the Leo database and checks for clusters which no longer exist in Google.
 *
 * On every DetectZombieClusters tick it:
 *  1. lists the active clusters from the database, grouped by Google project;
 *  2. checks each project (project status and billing) and, for active projects, each cluster;
 *  3. marks every detected zombie cluster as Error in the database and records a ClusterError for it.
 *
 * Failures while querying Google are treated as "still active", so a Google error never
 * causes a cluster to be zombified.
 *
 * @param config           supplies the period of the detection timer
 * @param gdDAO            used to look up the status of individual clusters
 * @param googleProjectDAO used to check whether a project and its billing are active
 * @param dbRef            database handle used to list active clusters and to record errors
 */
class ZombieClusterMonitor(config: ZombieClusterConfig,
                           gdDAO: GoogleDataprocDAO,
                           googleProjectDAO: GoogleProjectDAO,
                           dbRef: DbReference) extends Actor with Timers with LazyLogging {
  // Brings the actor's dispatcher into implicit scope as the ExecutionContext for the Futures below.
  import context._

  /** Schedules the recurring DetectZombieClusters tick when the actor starts. */
  override def preStart(): Unit = {
    super.preStart()
    timers.startPeriodicTimer(TimerKey, DetectZombieClusters, config.zombieCheckPeriod)
  }

  override def receive: Receive = {
    case DetectZombieClusters =>
      // Get active clusters from the Leo DB, grouped by project
      val zombieClusters = getActiveClustersFromDatabase.flatMap { clusterMap =>
        clusterMap.toList.flatTraverse { case (project, clusters) =>
          // Check if the project is active
          isProjectActiveInGoogle(project).flatMap {
            case true =>
              // If the project is active, check each individual cluster
              logger.debug(s"Project ${project.value} containing ${clusters.size} clusters is active in Google")
              clusters.toList.traverseFilter { cluster =>
                isClusterActiveInGoogle(cluster).map {
                  case true =>
                    logger.debug(s"Cluster ${cluster.projectNameString} is active in Google")
                    None
                  case false =>
                    logger.debug(s"Cluster ${cluster.projectNameString} is a zombie!")
                    Some(cluster)
                }
              }
            case false =>
              // If the project is inactive, all clusters in the project are zombies
              logger.debug(s"Project ${project.value} containing ${clusters.size} clusters is inactive in Google")
              Future.successful(clusters.toList)
          }
        }
      }

      // Error out each detected zombie cluster
      val sweep = zombieClusters.flatMap { cs =>
        logger.info(s"Detected ${cs.size} zombie clusters across ${cs.map(_.googleProject).toSet.size} projects.")
        cs.traverse { cluster =>
          handleZombieCluster(cluster)
        }
      }

      // Nothing else observes this Future, so without this a failed sweep (for example a database
      // error while listing or updating clusters) would disappear silently. Only logging happens
      // here; no actor state is touched from the callback.
      sweep.failed.foreach { e =>
        logger.error("Zombie cluster detection sweep failed", e)
      }
  }

  /** Lists the active clusters from the database and groups them by Google project. */
  private def getActiveClustersFromDatabase: Future[Map[GoogleProject, Seq[Cluster]]] = {
    dbRef.inTransaction {
      _.clusterQuery.listActive
    } map { clusters =>
      clusters.groupBy(_.googleProject)
    }
  }

  /**
   * Checks whether the project is usable: both the project itself and its billing must be active.
   *
   * @return true if both checks pass, and also true if either check fails with an error,
   *         so that a Google error never results in clusters being marked as zombies
   */
  private def isProjectActiveInGoogle(googleProject: GoogleProject): Future[Boolean] = {
    // Check the project and its billing info
    (googleProjectDAO.isProjectActive(googleProject.value) |@| googleProjectDAO.isBillingActive(googleProject.value))
      .map(_ && _)
      .recover { case e =>
        logger.warn(s"Unable to check status of project ${googleProject.value} for zombie cluster detection", e)
        true
      }
  }

  /**
   * Checks whether the cluster still has an active status in Google.
   *
   * @return true for clusters in Creating status (without querying Google), true if the status
   *         reported by GoogleDataprocDAO is one of ClusterStatus.activeStatuses, and true if the
   *         status lookup fails; false otherwise
   */
  private def isClusterActiveInGoogle(cluster: Cluster): Future[Boolean] = {
    // Clusters in Creating status may not yet exist in Google. Therefore treat all Creating clusters as active.
    if (cluster.status == ClusterStatus.Creating) {
      Future.successful(true)
    } else {
      // Check if status returned by GoogleDataprocDAO is an "active" status.
      gdDAO.getClusterStatus(cluster.googleProject, cluster.clusterName) map { clusterStatus =>
        ClusterStatus.activeStatuses contains clusterStatus
      } recover { case e =>
        logger.warn(s"Unable to check status of cluster ${cluster.projectNameString} for zombie cluster detection", e)
        true
      }
    }
  }

  /**
   * Marks a zombie cluster as errored: sets its status to Error and saves a ClusterError
   * explaining that an underlying Google resource was removed. Both writes are issued inside
   * a single dbRef.inTransaction call.
   */
  private def handleZombieCluster(cluster: Cluster): Future[Unit] = {
    logger.info(s"Erroring zombie cluster: ${cluster.projectNameString}")
    dbRef.inTransaction { dataAccess =>
      for {
        _ <- dataAccess.clusterQuery.updateClusterStatus(cluster.id, ClusterStatus.Error)
        error = ClusterError("An underlying resource was removed in Google. Please delete and recreate your cluster.", -1, Instant.now)
        _ <- dataAccess.clusterErrorQuery.save(cluster.id, error)
      } yield ()
    }.void
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.