Skip to content

Commit 7da53ac

Browse files
authored
Issue 733: If google marks cluster as Error but doesn't return error code then don't continue to monitor the cluster (#764)
* fix clustermonitoractor
1 parent 311df2b commit 7da53ac

File tree

2 files changed

+32
-19
lines changed

2 files changed

+32
-19
lines changed

src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/ClusterMonitorActor.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ class ClusterMonitorActor(val cluster: Cluster,
7070
// the Retry trait needs a reference to the ActorSystem
7171
override val system = context.system
7272

73+
private val internalError = "Internal Error"
74+
7375
override def preStart(): Unit = {
7476
super.preStart()
7577
scheduleInitialMonitorPass
@@ -303,7 +305,7 @@ class ClusterMonitorActor(val cluster: Cluster,
303305
case Error if leoClusterStatus != Deleting && leoClusterStatus != Stopping =>
304306
gdDAO.getClusterErrorDetails(cluster.dataprocInfo.operationName).map {
305307
case Some(errorDetails) => FailedCluster(errorDetails, googleInstances)
306-
case None => NotReadyCluster(ClusterStatus.Error, googleInstances)
308+
case None => FailedCluster(ClusterErrorDetails(Code.INTERNAL.value, Some(internalError)), googleInstances)
307309
}
308310
// Take care we don't delete a Creating cluster if google hasn't updated their status yet
309311
case Deleted if leoClusterStatus == Creating =>

src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/ClusterMonitorSpec.scala

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -278,62 +278,73 @@ class ClusterMonitorSpec extends TestKit(ActorSystem("leonardotest")) with FlatS
278278

279279
// Pre:
280280
// - cluster exists in the DB with status Creating
281-
// - dataproc DAO returns status ERROR, but no error code
281+
// - dataproc DAO returns status ERROR and error code CANCELLED
282282
// - compute DAO returns RUNNING
283283
// Post:
284-
// - cluster is not changed in the DB
284+
// - cluster status is set to Error in the DB
285285
// - instances are populated in the DB
286-
// - monitor actor does not shut down
287-
it should "keep monitoring in ERROR state with no error code" in isolatedDbTest {
286+
// - monitor actor shuts down
287+
it should "monitor until ERROR state with no restart and getClusterStatus returns Some(ErrorDetail) " in isolatedDbTest {
288288
val savedCreatingCluster = creatingCluster.save()
289289
creatingCluster shouldEqual savedCreatingCluster
290-
291-
val dao = mock[GoogleDataprocDAO]
290+
291+
val gdDAO = mock[GoogleDataprocDAO]
292292
when {
293-
dao.getClusterStatus(mockitoEq(creatingCluster.googleProject), mockitoEq(creatingCluster.clusterName))
293+
gdDAO.getClusterStatus(mockitoEq(creatingCluster.googleProject), mockitoEq(creatingCluster.clusterName))
294294
} thenReturn Future.successful(ClusterStatus.Error)
295295

296296
when {
297-
dao.getClusterInstances(mockitoEq(creatingCluster.googleProject), mockitoEq(creatingCluster.clusterName))
298-
} thenReturn Future.successful(clusterInstances)
297+
gdDAO.getClusterErrorDetails(mockitoEq(creatingCluster.dataprocInfo.operationName))
298+
} thenReturn Future.successful(Some(ClusterErrorDetails(Code.CANCELLED.value, Some("test message"))))
299299

300300
when {
301-
dao.getClusterErrorDetails(mockitoEq(creatingCluster.dataprocInfo.operationName))
302-
} thenReturn Future.successful(None)
301+
gdDAO.deleteCluster(mockitoEq(creatingCluster.googleProject), mockitoEq(creatingCluster.clusterName))
302+
} thenReturn Future.successful(())
303+
304+
when {
305+
gdDAO.getClusterInstances(mockitoEq(creatingCluster.googleProject), mockitoEq(creatingCluster.clusterName))
306+
} thenReturn Future.successful(clusterInstances)
303307

304308
val iamDAO = mock[GoogleIamDAO]
305309
when {
306310
iamDAO.removeIamRolesForUser(any[GoogleProject], any[WorkbenchEmail], mockitoEq(Set("roles/dataproc.worker")))
307311
} thenReturn Future.successful(())
308312

313+
when {
314+
iamDAO.removeServiceAccountKey(any[GoogleProject], any[WorkbenchEmail], any[ServiceAccountKeyId])
315+
} thenReturn Future.successful(())
316+
309317
val computeDAO = stubComputeDAO(InstanceStatus.Running)
310318
val storageDAO = mock[GoogleStorageDAO]
311319
val authProvider = mock[LeoAuthProvider]
312320

313-
withClusterSupervisor(dao, computeDAO, iamDAO, storageDAO, authProvider, mockJupyterDAO, true) { actor =>
321+
withClusterSupervisor(gdDAO, computeDAO, iamDAO, storageDAO, authProvider, mockJupyterDAO, false) { actor =>
314322

315323
eventually {
316324
val updatedCluster = dbFutureValue {
317325
_.clusterQuery.getActiveClusterByName(creatingCluster.googleProject, creatingCluster.clusterName)
318326
}
319327
updatedCluster shouldBe 'defined
320-
updatedCluster shouldBe Some(savedCreatingCluster.copy(instances = Set(masterInstance, workerInstance1, workerInstance2)))
328+
updatedCluster.map(_.status) shouldBe Some(ClusterStatus.Error)
329+
updatedCluster.flatMap(_.dataprocInfo.hostIp) shouldBe None
330+
updatedCluster.map(_.instances) shouldBe Some(Set(masterInstance, workerInstance1, workerInstance2))
321331
}
322332
verify(storageDAO, never).deleteBucket(any[GcsBucketName], any[Boolean])
323-
verify(iamDAO, never()).removeIamRolesForUser(any[GoogleProject], any[WorkbenchEmail], mockitoEq(Set("roles/dataproc.worker")))
324-
verify(iamDAO, never()).removeServiceAccountKey(any[GoogleProject], any[WorkbenchEmail], any[ServiceAccountKeyId])
333+
verify(iamDAO, if (clusterServiceAccount(creatingCluster.googleProject).isDefined) times(1) else never()).removeIamRolesForUser(any[GoogleProject], any[WorkbenchEmail], mockitoEq(Set("roles/dataproc.worker")))
334+
verify(iamDAO, if (notebookServiceAccount(creatingCluster.googleProject).isDefined) times(1) else never()).removeServiceAccountKey(any[GoogleProject], any[WorkbenchEmail], any[ServiceAccountKeyId])
325335
}
326336
}
327337

338+
328339
// Pre:
329340
// - cluster exists in the DB with status Creating
330-
// - dataproc DAO returns status ERROR and error code CANCELLED
341+
// - dataproc DAO returns status ERROR and no ERROR CODE
331342
// - compute DAO returns RUNNING
332343
// Post:
333344
// - cluster status is set to Error in the DB
334345
// - instances are populated in the DB
335346
// - monitor actor shuts down
336-
it should "monitor until ERROR state with no restart" in isolatedDbTest {
347+
it should "monitor until ERROR state with no restart and getClusterErrorDetails returns None " in isolatedDbTest {
337348
val savedCreatingCluster = creatingCluster.save()
338349
creatingCluster shouldEqual savedCreatingCluster
339350

@@ -344,7 +355,7 @@ class ClusterMonitorSpec extends TestKit(ActorSystem("leonardotest")) with FlatS
344355

345356
when {
346357
gdDAO.getClusterErrorDetails(mockitoEq(creatingCluster.dataprocInfo.operationName))
347-
} thenReturn Future.successful(Some(ClusterErrorDetails(Code.CANCELLED.value, Some("test message"))))
358+
} thenReturn Future.successful(None)
348359

349360
when {
350361
gdDAO.deleteCluster(mockitoEq(creatingCluster.googleProject), mockitoEq(creatingCluster.clusterName))

0 commit comments

Comments
 (0)