[opt](log) add warn log for saving and pushing image failure (apache#41216)

We already have some metrics on FE that record the number of image saving or pushing failure events, such as:
```
doris_fe_image_push{type="failed"} 0
doris_fe_image_push{type="success"} 0
```
But these are counters, which makes them hard to monitor when a user wants to send an alert on failure.

This kind of event-driven alerting is better done with logs, so I added warning logs for these failures. Users can then monitor the logs and send an alert on failure (see the sketch after the list below).

- Saving image failure:
    `Save image failed: xxx`

- Pushing image failure:
    `Push image failed: xxx`

- Deleting old edit log failure:
    `Delete old edit log failed: xxx`

- Deleting old image failure:
    `Delete old image failed: xxx`
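
Each of these failure paths follows the same warn-and-count pattern: catch the `Throwable`, log a WARN message with a stable prefix that a log-monitoring rule can match, and still bump the existing failure counter. Below is a minimal sketch of that pattern, not the actual `Checkpoint.java` code; it assumes a Log4j 2 logger, and the `saveImage()` helper plus the plain `AtomicLong` standing in for `MetricRepo.COUNTER_IMAGE_WRITE_FAILED` are hypothetical, for illustration only.

```java
import java.util.concurrent.atomic.AtomicLong;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

public class CheckpointFailureLoggingSketch {
    private static final Logger LOG = LogManager.getLogger(CheckpointFailureLoggingSketch.class);

    // Hypothetical stand-in for MetricRepo.COUNTER_IMAGE_WRITE_FAILED, for illustration only.
    private static final AtomicLong IMAGE_WRITE_FAILED = new AtomicLong();

    public void doCheckpointStep() {
        try {
            saveImage(); // hypothetical step that may fail
        } catch (Throwable t) {
            // The stable "Save image failed:" prefix lets a log-based alert rule match this event.
            LOG.warn("Save image failed: " + t.getMessage(), t);
            // Keep incrementing the failure counter so existing dashboards still work.
            IMAGE_WRITE_FAILED.incrementAndGet();
        }
    }

    private void saveImage() throws Exception {
        // Placeholder for the real image-saving logic.
        throw new Exception("disk full");
    }
}
```

An alert can then be keyed directly off WARN lines containing `Save image failed` (or the other prefixes above) instead of off counter deltas.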
morningman committed Sep 27, 2024
1 parent 705f9c2 commit f3e7fa3
Showing 1 changed file with 17 additions and 15 deletions: fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
```diff
@@ -104,14 +104,17 @@ public synchronized void doCheckpoint() throws CheckpointException {
                 return;
             }
         } catch (Throwable e) {
-            LOG.error("Does not get storage info", e);
+            LOG.warn("Save image failed: " + e.getMessage(), e);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
             }
             return;
         }
 
-        if (!checkMemoryEnoughToDoCheckpoint()) {
+        try {
+            checkMemoryEnoughToDoCheckpoint();
+        } catch (Throwable t) {
+            LOG.warn("Save image failed: " + t.getMessage(), t);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
             }
@@ -155,7 +158,7 @@ public synchronized void doCheckpoint() throws CheckpointException {
             LOG.info("checkpoint finished save image.{}", replayedJournalId);
         } catch (Throwable e) {
             exceptionCaught = true;
-            LOG.error("Exception when generate new image file", e);
+            LOG.warn("Save image failed: " + e.getMessage(), e);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
             }
@@ -174,8 +177,8 @@ public synchronized void doCheckpoint() throws CheckpointException {
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_CLEAN_SUCCESS.increase(1L);
             }
-        } catch (Throwable ex) {
-            LOG.error("Master delete latest invalid image file failed.", ex);
+        } catch (Throwable t) {
+            LOG.warn("Delete old image failed: " + t.getMessage(), t);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_CLEAN_FAILED.increase(1L);
             }
@@ -210,7 +213,7 @@ public synchronized void doCheckpoint() throws CheckpointException {
                     LOG.warn("Failed when pushing image file. url = {},responseBody = {}", url, responseBody);
                 }
             } catch (IOException e) {
-                LOG.error("Exception when pushing image file. url = {}", url, e);
+                LOG.warn("Exception when pushing image file. url = {}", url, e);
             }
         }
 
@@ -222,6 +225,7 @@ public synchronized void doCheckpoint() throws CheckpointException {
                 MetricRepo.COUNTER_IMAGE_PUSH_SUCCESS.increase(1L);
             }
         } else {
+            LOG.warn("Push image failed: totally {} nodes, push succeeded {} nodes", otherNodesCount, successPushed);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_PUSH_FAILED.increase(1L);
             }
@@ -281,8 +285,8 @@ public synchronized void doCheckpoint() throws CheckpointException {
             }
             LOG.info("journals <= {} are deleted. image version {}, other nodes min version {}",
                     deleteVersion, checkPointVersion, minOtherNodesJournalId);
-        } catch (Throwable e) {
-            LOG.error("failed to delete old edit log", e);
+        } catch (Throwable t) {
+            LOG.warn("Delete old edit log failed: " + t.getMessage(), t);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_EDIT_LOG_CLEAN_FAILED.increase(1L);
             }
@@ -297,7 +301,7 @@ public synchronized void doCheckpoint() throws CheckpointException {
                 MetricRepo.COUNTER_IMAGE_CLEAN_SUCCESS.increase(1L);
             }
         } catch (Throwable e) {
-            LOG.error("Master delete old image file fail.", e);
+            LOG.warn("Master delete old image file fail.", e);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_CLEAN_FAILED.increase(1L);
             }
@@ -319,17 +323,15 @@ private void destroyStaticFieldForCkpt() {
     /*
      * Check whether can we do the checkpoint due to the memory used percent.
      */
-    private boolean checkMemoryEnoughToDoCheckpoint() {
+    private void checkMemoryEnoughToDoCheckpoint() throws CheckpointException {
         long memUsedPercent = getMemoryUsedPercent();
         LOG.info("get jvm memory used percent: {} %", memUsedPercent);
 
         if (memUsedPercent > Config.metadata_checkpoint_memory_threshold && !Config.force_do_metadata_checkpoint) {
-            LOG.warn("the memory used percent {} exceed the checkpoint memory threshold: {}",
-                    memUsedPercent, Config.metadata_checkpoint_memory_threshold);
-            return false;
+            throw new CheckpointException(String.format(
+                    "the memory used percent %d exceed the checkpoint memory threshold: %d",
+                    memUsedPercent, Config.metadata_checkpoint_memory_threshold));
         }
-
-        return true;
     }
 
     /*
```
