From 8c5c2f0fe6e5c9eb6ba19b25e121798b6658b75b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 25 Sep 2025 18:35:04 +0000
Subject: [PATCH 1/2] Initial plan
From ebbcd55a9960f5646c57778282cbd2f93796dd02 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 25 Sep 2025 18:48:33 +0000
Subject: [PATCH 2/2] Update failure detector docs to remove
first_heartbeat_estimate option
Co-authored-by: waynexia <15380403+waynexia@users.noreply.github.com>
---
.../configuration.md | 20 +++++++++----------
.../configuration.md | 20 +++++++++----------
.../configuration.md | 20 +++++++++----------
.../configuration.md | 20 +++++++++----------
4 files changed, 40 insertions(+), 40 deletions(-)
diff --git a/docs/user-guide/deployments-administration/configuration.md b/docs/user-guide/deployments-administration/configuration.md
index aa3e9d28d..953ec32e8 100644
--- a/docs/user-guide/deployments-administration/configuration.md
+++ b/docs/user-guide/deployments-administration/configuration.md
@@ -638,20 +638,21 @@ retry_delay = "500ms"
max_running_procedures = 128
# Failure detectors options.
+# GreptimeDB uses the Phi Accrual Failure Detector algorithm to detect datanode failures.
[failure_detector]
-## The threshold value used by the failure detector to determine failure conditions.
+## Maximum acceptable φ before the peer is treated as failed.
+## Lower values react faster but yield more false positives.
threshold = 8.0
-## The minimum standard deviation of the heartbeat intervals, used to calculate acceptable variations.
+## The minimum standard deviation of the heartbeat intervals.
+## Acts as a floor so that tiny variations don't make φ explode, preventing hypersensitivity when heartbeat intervals barely vary.
min_std_deviation = "100ms"
-## The acceptable pause duration between heartbeats, used to determine if a heartbeat interval is acceptable.
+## The acceptable pause duration between heartbeats.
+## Extra grace period added to the learned mean interval before φ rises, absorbing temporary network hiccups or GC pauses.
acceptable_heartbeat_pause = "10000ms"
-## The initial estimate of the heartbeat interval used by the failure detector.
-first_heartbeat_estimate = "1000ms"
-
## Datanode options.
[datanode]
@@ -739,10 +740,9 @@ create_topic_timeout = "30s"
| `procedure.retry_delay` | String | `500ms` | Initial retry delay of procedures, increases exponentially |
| `procedure.max_running_procedures` | Integer | `128` | The maximum number of procedures that can be running at the same time. If the number of running procedures exceeds this limit, the procedure will be rejected. |
| `failure_detector` | -- | -- | -- |
-| `failure_detector.threshold` | Float | `8.0` | The threshold value used by the failure detector to determine failure conditions. |
-| `failure_detector.min_std_deviation` | String | `100ms` | The minimum standard deviation of the heartbeat intervals, used to calculate acceptable variations. |
-| `failure_detector.acceptable_heartbeat_pause` | String | `10000ms` | The acceptable pause duration between heartbeats, used to determine if a heartbeat interval is acceptable. |
-| `failure_detector.first_heartbeat_estimate` | String | `1000ms` | The initial estimate of the heartbeat interval used by the failure detector. |
+| `failure_detector.threshold` | Float | `8.0` | Maximum acceptable φ before the peer is treated as failed.<br/>Lower values react faster but yield more false positives. |
+| `failure_detector.min_std_deviation` | String | `100ms` | The minimum standard deviation of the heartbeat intervals.<br/>Acts as a floor so that tiny variations don't make φ explode, preventing hypersensitivity when heartbeat intervals barely vary. |
+| `failure_detector.acceptable_heartbeat_pause` | String | `10000ms` | The acceptable pause duration between heartbeats.<br/>Extra grace period added to the learned mean interval before φ rises, absorbing temporary network hiccups or GC pauses. |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.timeout` | String | `10s` | Operation timeout. |
diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/deployments-administration/configuration.md b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/deployments-administration/configuration.md
index d500fc0aa..145b5d0ec 100644
--- a/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/deployments-administration/configuration.md
+++ b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/deployments-administration/configuration.md
@@ -630,20 +630,21 @@ max_running_procedures = 128
# Failure detector 选项
+# GreptimeDB 使用 Phi 累积故障检测器算法来检测数据节点故障。
[failure_detector]
-## Failure detector 检测阈值
+## 判定节点故障前可接受的最大 φ 值。
+## 较低的值反应更快但会产生更多误报。
threshold = 8.0
-## 心跳间隔的最小标准差,用于计算可接受的变化。
+## 心跳间隔的最小标准差。
+## 防止微小变化导致 φ 值激增。在心跳间隔变化很小时防止过度敏感。
min_std_deviation = "100ms"
-## 心跳之间可接受的暂停时间长度。
+## 心跳之间可接受的暂停时长。
+## 在 φ 值上升前为学习到的平均间隔提供额外的宽限期,吸收临时网络故障或 GC 暂停。
acceptable_heartbeat_pause = "10000ms"
-## 首次心跳间隔的估计值。
-first_heartbeat_estimate = "1000ms"
-
## Datanode 选项。
[datanode]
@@ -712,10 +713,9 @@ create_topic_timeout = "30s"
| `procedure.retry_delay` | 字符串 | `500ms` | Procedure 初始重试延迟,延迟会指数增长。 |
| `procedure.max_running_procedures` | Integer | `128` | 同一时间可以运行的程序最大数量。如果运行的程序数量超过此限制,程序将被拒绝。 |
| `failure_detector` | -- | -- | 故障检测选项。 |
-| `failure_detector.threshold` | 浮点数 | `8.0` | Failure detector 用来判断故障条件的阈值。 |
-| `failure_detector.min_std_deviation` | 字符串 | `100ms` | 心跳间隔的最小标准差,用于计算可接受的变动范围。 |
-| `failure_detector.acceptable_heartbeat_pause` | 字符串 | `10000ms` | 允许的最大心跳暂停时间,用于确定心跳间隔是否可接受。 |
-| `failure_detector.first_heartbeat_estimate` | 字符串 | `1000ms` | 初始心跳间隔估算值。 |
+| `failure_detector.threshold` | 浮点数 | `8.0` | 判定节点故障前可接受的最大 φ 值。<br/>较低的值反应更快但会产生更多误报。 |
+| `failure_detector.min_std_deviation` | 字符串 | `100ms` | 心跳间隔的最小标准差。<br/>防止微小变化导致 φ 值激增。在心跳间隔变化很小时防止过度敏感。 |
+| `failure_detector.acceptable_heartbeat_pause` | 字符串 | `10000ms` | 心跳之间可接受的暂停时长。<br/>在 φ 值上升前为学习到的平均间隔提供额外的宽限期,吸收临时网络故障或 GC 暂停。 |
| `datanode` | -- | -- | |
| `datanode.client` | -- | -- | Datanode 客户端选项。 |
| `datanode.client.timeout` | 字符串 | `10s` | 操作超时。 |
diff --git a/i18n/zh/docusaurus-plugin-content-docs/version-0.17/user-guide/deployments-administration/configuration.md b/i18n/zh/docusaurus-plugin-content-docs/version-0.17/user-guide/deployments-administration/configuration.md
index d500fc0aa..145b5d0ec 100644
--- a/i18n/zh/docusaurus-plugin-content-docs/version-0.17/user-guide/deployments-administration/configuration.md
+++ b/i18n/zh/docusaurus-plugin-content-docs/version-0.17/user-guide/deployments-administration/configuration.md
@@ -630,20 +630,21 @@ max_running_procedures = 128
# Failure detector 选项
+# GreptimeDB 使用 Phi 累积故障检测器算法来检测数据节点故障。
[failure_detector]
-## Failure detector 检测阈值
+## 判定节点故障前可接受的最大 φ 值。
+## 较低的值反应更快但会产生更多误报。
threshold = 8.0
-## 心跳间隔的最小标准差,用于计算可接受的变化。
+## 心跳间隔的最小标准差。
+## 防止微小变化导致 φ 值激增。在心跳间隔变化很小时防止过度敏感。
min_std_deviation = "100ms"
-## 心跳之间可接受的暂停时间长度。
+## 心跳之间可接受的暂停时长。
+## 在 φ 值上升前为学习到的平均间隔提供额外的宽限期,吸收临时网络故障或 GC 暂停。
acceptable_heartbeat_pause = "10000ms"
-## 首次心跳间隔的估计值。
-first_heartbeat_estimate = "1000ms"
-
## Datanode 选项。
[datanode]
@@ -712,10 +713,9 @@ create_topic_timeout = "30s"
| `procedure.retry_delay` | 字符串 | `500ms` | Procedure 初始重试延迟,延迟会指数增长。 |
| `procedure.max_running_procedures` | Integer | `128` | 同一时间可以运行的程序最大数量。如果运行的程序数量超过此限制,程序将被拒绝。 |
| `failure_detector` | -- | -- | 故障检测选项。 |
-| `failure_detector.threshold` | 浮点数 | `8.0` | Failure detector 用来判断故障条件的阈值。 |
-| `failure_detector.min_std_deviation` | 字符串 | `100ms` | 心跳间隔的最小标准差,用于计算可接受的变动范围。 |
-| `failure_detector.acceptable_heartbeat_pause` | 字符串 | `10000ms` | 允许的最大心跳暂停时间,用于确定心跳间隔是否可接受。 |
-| `failure_detector.first_heartbeat_estimate` | 字符串 | `1000ms` | 初始心跳间隔估算值。 |
+| `failure_detector.threshold` | 浮点数 | `8.0` | 判定节点故障前可接受的最大 φ 值。<br/>较低的值反应更快但会产生更多误报。 |
+| `failure_detector.min_std_deviation` | 字符串 | `100ms` | 心跳间隔的最小标准差。<br/>防止微小变化导致 φ 值激增。在心跳间隔变化很小时防止过度敏感。 |
+| `failure_detector.acceptable_heartbeat_pause` | 字符串 | `10000ms` | 心跳之间可接受的暂停时长。<br/>在 φ 值上升前为学习到的平均间隔提供额外的宽限期,吸收临时网络故障或 GC 暂停。 |
| `datanode` | -- | -- | |
| `datanode.client` | -- | -- | Datanode 客户端选项。 |
| `datanode.client.timeout` | 字符串 | `10s` | 操作超时。 |
diff --git a/versioned_docs/version-0.17/user-guide/deployments-administration/configuration.md b/versioned_docs/version-0.17/user-guide/deployments-administration/configuration.md
index aa3e9d28d..953ec32e8 100644
--- a/versioned_docs/version-0.17/user-guide/deployments-administration/configuration.md
+++ b/versioned_docs/version-0.17/user-guide/deployments-administration/configuration.md
@@ -638,20 +638,21 @@ retry_delay = "500ms"
max_running_procedures = 128
# Failure detectors options.
+# GreptimeDB uses the Phi Accrual Failure Detector algorithm to detect datanode failures.
[failure_detector]
-## The threshold value used by the failure detector to determine failure conditions.
+## Maximum acceptable φ before the peer is treated as failed.
+## Lower values react faster but yield more false positives.
threshold = 8.0
-## The minimum standard deviation of the heartbeat intervals, used to calculate acceptable variations.
+## The minimum standard deviation of the heartbeat intervals.
+## Acts as a floor so that tiny variations don't make φ explode, preventing hypersensitivity when heartbeat intervals barely vary.
min_std_deviation = "100ms"
-## The acceptable pause duration between heartbeats, used to determine if a heartbeat interval is acceptable.
+## The acceptable pause duration between heartbeats.
+## Extra grace period added to the learned mean interval before φ rises, absorbing temporary network hiccups or GC pauses.
acceptable_heartbeat_pause = "10000ms"
-## The initial estimate of the heartbeat interval used by the failure detector.
-first_heartbeat_estimate = "1000ms"
-
## Datanode options.
[datanode]
@@ -739,10 +740,9 @@ create_topic_timeout = "30s"
| `procedure.retry_delay` | String | `500ms` | Initial retry delay of procedures, increases exponentially |
| `procedure.max_running_procedures` | Integer | `128` | The maximum number of procedures that can be running at the same time. If the number of running procedures exceeds this limit, the procedure will be rejected. |
| `failure_detector` | -- | -- | -- |
-| `failure_detector.threshold` | Float | `8.0` | The threshold value used by the failure detector to determine failure conditions. |
-| `failure_detector.min_std_deviation` | String | `100ms` | The minimum standard deviation of the heartbeat intervals, used to calculate acceptable variations. |
-| `failure_detector.acceptable_heartbeat_pause` | String | `10000ms` | The acceptable pause duration between heartbeats, used to determine if a heartbeat interval is acceptable. |
-| `failure_detector.first_heartbeat_estimate` | String | `1000ms` | The initial estimate of the heartbeat interval used by the failure detector. |
+| `failure_detector.threshold` | Float | `8.0` | Maximum acceptable φ before the peer is treated as failed.<br/>Lower values react faster but yield more false positives. |
+| `failure_detector.min_std_deviation` | String | `100ms` | The minimum standard deviation of the heartbeat intervals.<br/>Acts as a floor so that tiny variations don't make φ explode, preventing hypersensitivity when heartbeat intervals barely vary. |
+| `failure_detector.acceptable_heartbeat_pause` | String | `10000ms` | The acceptable pause duration between heartbeats.<br/>Extra grace period added to the learned mean interval before φ rises, absorbing temporary network hiccups or GC pauses. |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.timeout` | String | `10s` | Operation timeout. |