Skip to content

Commit 1406bd3

Browse files
hellolittlej and ggao authored
Add job id to log and add running worker failure metrics (#790)
Co-authored-by: ggao <ggao@netflix.com>
1 parent 0d13219 commit 1406bd3

File tree

1 file changed

+21
-2
lines changed

1 file changed

+21
-2
lines changed

mantis-runtime-executor/src/main/java/io/mantisrx/server/worker/RunningWorker.java

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818

1919
import static io.mantisrx.server.core.utils.StatusConstants.STATUS_MESSAGE_FORMAT;
2020

21+
import com.netflix.spectator.api.Tag;
22+
import io.mantisrx.common.metrics.Metrics;
23+
import io.mantisrx.common.metrics.MetricsRegistry;
2124
import io.mantisrx.runtime.Context;
2225
import io.mantisrx.runtime.Job;
2326
import io.mantisrx.runtime.MantisJobState;
@@ -28,12 +31,14 @@
2831
import io.mantisrx.server.core.JobSchedulingInfo;
2932
import io.mantisrx.server.core.Status;
3033
import io.mantisrx.server.core.Status.TYPE;
31-
import java.util.Iterator;
34+
35+
import java.util.*;
3236
import java.util.concurrent.CountDownLatch;
3337
import org.slf4j.Logger;
3438
import org.slf4j.LoggerFactory;
3539
import rx.Observable;
3640
import rx.Observer;
41+
import rx.exceptions.OnErrorThrowable;
3742
import rx.functions.Action0;
3843
import rx.functions.Action1;
3944
import rx.subjects.PublishSubject;
@@ -43,6 +48,8 @@
4348
public class RunningWorker {
4449

4550
private static final Logger logger = LoggerFactory.getLogger(RunningWorker.class);
51+
private static final String workerFailureMetricName = "workerFailure";
52+
private static final String workerMonitorMetricId = "runningWorkerMonitor";
4653
private final int totalStagesNet;
4754
private Action0 onTerminateCallback;
4855
private Action0 onCompleteCallback;
@@ -150,7 +157,19 @@ public void signalCompleted() {
150157
public void signalFailed(Throwable t) {
151158
logger.info("JobId: " + jobId + ", stage: " + stageNum + " workerIndex: " + workerIndex + " workerNumber: " + workerNum + ","
152159
+ " signaling failed");
153-
logger.error("Worker failure detected, shutting down job", t);
160+
logger.error("Worker failure detected, shutting down job: {}", jobId, t);
161+
// Emit a failure metric when data emission fails
162+
if (t instanceof OnErrorThrowable) {
163+
Metrics jobFailureMetrics = new Metrics.Builder()
164+
.id(workerMonitorMetricId, Tag.of("jobId", this.jobId),
165+
Tag.of("workerIndex", String.valueOf(this.workerIndex)),
166+
Tag.of("stageNum", String.valueOf(this.stageNum)))
167+
.addCounter(workerFailureMetricName)
168+
.build();
169+
170+
MetricsRegistry.getInstance().registerAndGet(jobFailureMetrics).getCounter(workerFailureMetricName).increment();
171+
}
172+
154173
jobStatus.onNext(new Status(jobId, stageNum, workerIndex, workerNum,
155174
TYPE.INFO, String.format(STATUS_MESSAGE_FORMAT, stageNum, workerIndex, workerNum, "failed. error: " + t.getMessage()),
156175
MantisJobState.Failed));

0 commit comments

Comments
 (0)