Skip to content

Commit

Permalink
[CELEBORN-983] Rename PrometheusMetric configuration
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Replace
```properties
celeborn.metrics.master.prometheus.host
celeborn.metrics.master.prometheus.port
celeborn.metrics.worker.prometheus.host
celeborn.metrics.worker.prometheus.port
```
With
```properties
celeborn.master.http.host
celeborn.master.http.port
celeborn.worker.http.host
celeborn.worker.http.port
```

### Why are the changes needed?
The `celeborn.master.metrics.prometheus.port` and `celeborn.metrics.worker.prometheus.port` bind port not only serve prometheus metrics, but also provide some useful API services.

https://celeborn.apache.org/docs/latest/monitoring/#rest-api

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Closes apache#1919 from cxzl25/CELEBORN-983.

Lead-authored-by: sychen <[email protected]>
Co-authored-by: Keyong Zhou <[email protected]>
Signed-off-by: zky.zhoukeyong <[email protected]>
  • Loading branch information
cxzl25 and waitinfuture committed Oct 13, 2023
1 parent 2b79692 commit dd65e74
Show file tree
Hide file tree
Showing 14 changed files with 97 additions and 82 deletions.
2 changes: 1 addition & 1 deletion charts/celeborn/templates/master-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ spec:
{{- toYaml .Values.resources.master | nindent 12 }}
ports:
- containerPort: {{ .Values.service.port }}
- containerPort: {{ get .Values.celeborn "celeborn.master.metrics.prometheus.port" | default 9098 }}
- containerPort: {{ get .Values.celeborn "celeborn.master.http.port" | default 9098 }}
name: metrics
protocol: TCP
volumeMounts:
Expand Down
2 changes: 1 addition & 1 deletion charts/celeborn/templates/worker-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ spec:
resources:
{{- toYaml .Values.resources.worker | nindent 12 }}
ports:
- containerPort: {{ get .Values.celeborn "celeborn.worker.metrics.prometheus.port" | default 9096 }}
- containerPort: {{ get .Values.celeborn "celeborn.worker.http.port" | default 9096 }}
name: metrics
protocol: TCP
volumeMounts:
Expand Down
4 changes: 2 additions & 2 deletions charts/celeborn/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ volumes:
celeborn:
celeborn.master.ha.enabled: true
celeborn.metrics.enabled: true
celeborn.master.metrics.prometheus.port: 9098
celeborn.worker.metrics.prometheus.port: 9096
celeborn.master.http.port: 9098
celeborn.worker.http.port: 9096
celeborn.worker.monitor.disk.enabled: false
celeborn.shuffle.chunk.size: 8m
celeborn.rpc.io.serverThreads: 64
Expand Down
94 changes: 50 additions & 44 deletions common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -540,8 +540,13 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se

def masterHost: String = get(MASTER_HOST).replace("<localhost>", Utils.localHostName(this))

def masterHttpHost: String =
get(MASTER_HTTP_HOST).replace("<localhost>", Utils.localHostName(this))

def masterPort: Int = get(MASTER_PORT)

def masterHttpPort: Int = get(MASTER_HTTP_PORT)

def haEnabled: Boolean = get(HA_ENABLED)

def haMasterNodeId: Option[String] = get(HA_MASTER_NODE_ID)
Expand Down Expand Up @@ -624,6 +629,9 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se
// //////////////////////////////////////////////////////
// Worker //
// //////////////////////////////////////////////////////
def workerHttpHost: String =
get(WORKER_HTTP_HOST).replace("<localhost>", Utils.localHostName(this))
def workerHttpPort: Int = get(WORKER_HTTP_PORT)
def workerRpcPort: Int = get(WORKER_RPC_PORT)
def workerPushPort: Int = get(WORKER_PUSH_PORT)
def workerFetchPort: Int = get(WORKER_FETCH_PORT)
Expand Down Expand Up @@ -669,12 +677,6 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se
def metricsSlidingWindowSize: Int = get(METRICS_SLIDING_WINDOW_SIZE)
def metricsCollectCriticalEnabled: Boolean = get(METRICS_COLLECT_CRITICAL_ENABLED)
def metricsCapacity: Int = get(METRICS_CAPACITY)
def masterPrometheusMetricHost: String =
get(MASTER_PROMETHEUS_HOST).replace("<localhost>", Utils.localHostName(this))
def masterPrometheusMetricPort: Int = get(MASTER_PROMETHEUS_PORT)
def workerPrometheusMetricHost: String =
get(WORKER_PROMETHEUS_HOST).replace("<localhost>", Utils.localHostName(this))
def workerPrometheusMetricPort: Int = get(WORKER_PROMETHEUS_PORT)
def metricsExtraLabels: Map[String, String] =
get(METRICS_EXTRA_LABELS).map(Utils.parseMetricLabels).toMap
def metricsAppTopDiskUsageCount: Int = get(METRICS_APP_TOP_DISK_USAGE_COUNT)
Expand Down Expand Up @@ -1600,6 +1602,16 @@ object CelebornConf extends Logging {
.stringConf
.createWithDefaultString("<localhost>")

val MASTER_HTTP_HOST: ConfigEntry[String] =
buildConf("celeborn.master.http.host")
.withAlternative("celeborn.metrics.master.prometheus.host")
.withAlternative("celeborn.master.metrics.prometheus.host")
.categories("master")
.version("0.4.0")
.doc("Master's http host.")
.stringConf
.createWithDefaultString("<localhost>")

val MASTER_PORT: ConfigEntry[Int] =
buildConf("celeborn.master.port")
.categories("master")
Expand All @@ -1609,6 +1621,17 @@ object CelebornConf extends Logging {
.checkValue(p => p >= 1024 && p < 65535, "Invalid port")
.createWithDefault(9097)

val MASTER_HTTP_PORT: ConfigEntry[Int] =
buildConf("celeborn.master.http.port")
.withAlternative("celeborn.metrics.master.prometheus.port")
.withAlternative("celeborn.master.metrics.prometheus.port")
.categories("master")
.version("0.4.0")
.doc("Master's http port.")
.intConf
.checkValue(p => p >= 1024 && p < 65535, "Invalid port")
.createWithDefault(9098)

val HA_ENABLED: ConfigEntry[Boolean] =
buildConf("celeborn.master.ha.enabled")
.withAlternative("celeborn.ha.enabled")
Expand Down Expand Up @@ -2088,6 +2111,27 @@ object CelebornConf extends Logging {
.timeConf(TimeUnit.MILLISECONDS)
.createWithDefaultString("1000ms")

val WORKER_HTTP_HOST: ConfigEntry[String] =
buildConf("celeborn.worker.http.host")
.withAlternative("celeborn.metrics.worker.prometheus.host")
.withAlternative("celeborn.worker.metrics.prometheus.host")
.categories("worker")
.doc("Worker's http host.")
.version("0.4.0")
.stringConf
.createWithDefault("<localhost>")

val WORKER_HTTP_PORT: ConfigEntry[Int] =
buildConf("celeborn.worker.http.port")
.withAlternative("celeborn.metrics.worker.prometheus.port")
.withAlternative("celeborn.worker.metrics.prometheus.port")
.categories("worker")
.doc("Worker's http port.")
.version("0.4.0")
.intConf
.checkValue(p => p >= 1024 && p < 65535, "Invalid port")
.createWithDefault(9096)

val WORKER_RPC_PORT: ConfigEntry[Int] =
buildConf("celeborn.worker.rpc.port")
.categories("worker")
Expand Down Expand Up @@ -3578,44 +3622,6 @@ object CelebornConf extends Logging {
.intConf
.createWithDefault(4096)

val MASTER_PROMETHEUS_HOST: ConfigEntry[String] =
buildConf("celeborn.metrics.master.prometheus.host")
.withAlternative("celeborn.master.metrics.prometheus.host")
.categories("metrics")
.doc("Master's Prometheus host.")
.version("0.3.0")
.stringConf
.createWithDefault("<localhost>")

val MASTER_PROMETHEUS_PORT: ConfigEntry[Int] =
buildConf("celeborn.metrics.master.prometheus.port")
.withAlternative("celeborn.master.metrics.prometheus.port")
.categories("metrics")
.doc("Master's Prometheus port.")
.version("0.3.0")
.intConf
.checkValue(p => p >= 1024 && p < 65535, "Invalid port")
.createWithDefault(9098)

val WORKER_PROMETHEUS_HOST: ConfigEntry[String] =
buildConf("celeborn.metrics.worker.prometheus.host")
.withAlternative("celeborn.worker.metrics.prometheus.host")
.categories("metrics")
.doc("Worker's Prometheus host.")
.version("0.3.0")
.stringConf
.createWithDefault("<localhost>")

val WORKER_PROMETHEUS_PORT: ConfigEntry[Int] =
buildConf("celeborn.metrics.worker.prometheus.port")
.withAlternative("celeborn.worker.metrics.prometheus.port")
.categories("metrics")
.doc("Worker's Prometheus port.")
.version("0.3.0")
.intConf
.checkValue(p => p >= 1024 && p < 65535, "Invalid port")
.createWithDefault(9096)

val METRICS_EXTRA_LABELS: ConfigEntry[Seq[String]] =
buildConf("celeborn.metrics.extraLabels")
.categories("metrics")
Expand Down
6 changes: 3 additions & 3 deletions conf/celeborn-defaults.conf.template
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@ celeborn.push.io.numConnectionsPerPeer 2
celeborn.worker.push.io.threads 8

celeborn.metrics.enabled true
celeborn.metrics.master.prometheus.port 9098
celeborn.metrics.worker.prometheus.port 9096

celeborn.worker.storage.dirs /mnt/disk1,/mnt/disk2,/mnt/disk3,/mnt/disk4,/mnt/disk5,/mnt/disk6,/mnt/disk7,/mnt/disk8,/mnt/disk9,/mnt/disk10
celeborn.worker.http.port 9096

celeborn.master.endpoints clb-1:9097,clb-2:9098,clb-3:9099
celeborn.master.http.port 9098

celeborn.master.ha.enabled true
celeborn.master.ha.enabled true
celeborn.master.ha.node.1.host clb-1
celeborn.master.ha.node.1.port 9097
celeborn.master.ha.node.1.ratis.port 9872
Expand Down
2 changes: 2 additions & 0 deletions docs/configuration/master.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ license: |
| celeborn.master.heartbeat.application.timeout | 300s | Application heartbeat timeout. | 0.3.0 |
| celeborn.master.heartbeat.worker.timeout | 120s | Worker heartbeat timeout. | 0.3.0 |
| celeborn.master.host | &lt;localhost&gt; | Hostname for master to bind. | 0.2.0 |
| celeborn.master.http.host | &lt;localhost&gt; | Master's http host. | 0.4.0 |
| celeborn.master.http.port | 9098 | Master's http port. | 0.4.0 |
| celeborn.master.port | 9097 | Port for master to bind. | 0.2.0 |
| celeborn.master.slot.assign.extraSlots | 2 | Extra slots number when master assign slots. | 0.3.0 |
| celeborn.master.slot.assign.loadAware.diskGroupGradient | 0.1 | This value means how many more workload will be placed into a faster disk group than a slower group. | 0.3.0 |
Expand Down
4 changes: 0 additions & 4 deletions docs/configuration/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,8 @@ license: |
| celeborn.metrics.conf | &lt;undefined&gt; | Custom metrics configuration file path. Default use `metrics.properties` in classpath. | 0.3.0 |
| celeborn.metrics.enabled | true | When true, enable metrics system. | 0.2.0 |
| celeborn.metrics.extraLabels | | If default metric labels are not enough, extra metric labels can be customized. Labels' pattern is: `<label1_key>=<label1_value>[,<label2_key>=<label2_value>]*`; e.g. `env=prod,version=1` | 0.3.0 |
| celeborn.metrics.master.prometheus.host | &lt;localhost&gt; | Master's Prometheus host. | 0.3.0 |
| celeborn.metrics.master.prometheus.port | 9098 | Master's Prometheus port. | 0.3.0 |
| celeborn.metrics.prometheus.path | /metrics/prometheus | URI context path of prometheus metrics HTTP server. | 0.4.0 |
| celeborn.metrics.sample.rate | 1.0 | It controls if Celeborn collect timer metrics for some operations. Its value should be in [0.0, 1.0]. | 0.2.0 |
| celeborn.metrics.timer.slidingWindow.size | 4096 | The sliding window size of timer metric. | 0.2.0 |
| celeborn.metrics.worker.pauseSpentTime.forceAppend.threshold | 10 | Force append worker pause spent time even if worker still in pause serving state.Help user can find worker pause spent time increase, when worker always been pause state. | |
| celeborn.metrics.worker.prometheus.host | &lt;localhost&gt; | Worker's Prometheus host. | 0.3.0 |
| celeborn.metrics.worker.prometheus.port | 9096 | Worker's Prometheus port. | 0.3.0 |
<!--end-include-->
2 changes: 2 additions & 0 deletions docs/configuration/worker.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ license: |
| celeborn.worker.graceful.shutdown.saveCommittedFileInfo.interval | 5s | Interval for a Celeborn worker to flush committed file infos into Level DB. | 0.3.1 |
| celeborn.worker.graceful.shutdown.saveCommittedFileInfo.sync | false | Whether to call sync method to save committed file infos into Level DB to handle OS crash. | 0.3.1 |
| celeborn.worker.graceful.shutdown.timeout | 600s | The worker's graceful shutdown timeout time. | 0.2.0 |
| celeborn.worker.http.host | &lt;localhost&gt; | Worker's http host. | 0.4.0 |
| celeborn.worker.http.port | 9096 | Worker's http port. | 0.4.0 |
| celeborn.worker.monitor.disk.check.interval | 30s | Intervals between device monitor to check disk. | 0.3.0 |
| celeborn.worker.monitor.disk.check.timeout | 30s | Timeout time for worker check device status. | 0.3.0 |
| celeborn.worker.monitor.disk.checklist | readwrite,diskusage | Monitor type for disk, available items are: iohang, readwrite and diskusage. | 0.2.0 |
Expand Down
9 changes: 9 additions & 0 deletions docs/migration.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ license: |

- Since 0.4.0, Celeborn changed the default value of `celeborn.<module>.io.numConnectionsPerPeer` from `2` to `1`.

- Since 0.4.0, Celeborn has changed the names of the prometheus master and worker configuration as shown in the table below:

| Key Before v0.4.0 | Key After v0.4.0 |
|-------------------------------------------|-----------------------------|
| `celeborn.metrics.master.prometheus.host` | `celeborn.master.http.host` |
| `celeborn.metrics.master.prometheus.port` | `celeborn.master.http.port` |
| `celeborn.metrics.worker.prometheus.host` | `celeborn.worker.http.host` |
| `celeborn.metrics.worker.prometheus.port` | `celeborn.worker.http.port` |

## Upgrading from 0.3.0 to 0.3.1

- Since 0.3.1, Celeborn changed the default value of `celeborn.worker.directMemoryRatioToResume` from `0.5` to `0.7`.
Expand Down
18 changes: 9 additions & 9 deletions docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -269,17 +269,17 @@ an easy way to create new visualizations and monitoring tools for Celeborn and
also easy for users to get the running status of the service. The REST API is available for
both master and worker. The endpoints are mounted at `host:port`. For example,
for the master, they would typically be accessible
at `http://<master-prometheus-host>:<master-prometheus-port><path>`, and
for the worker, at `http://<worker-prometheus-host>:<worker-prometheus-port><path>`.
at `http://<master-http-host>:<master-http-port><path>`, and
for the worker, at `http://<worker-http-host>:<worker-http-port><path>`.

The configuration of `<master-prometheus-host>`, `<master-prometheus-port>`, `<worker-prometheus-host>`, `<worker-prometheus-port>` as below:
The configuration of `<master-http-host>`, `<master-http-port>`, `<worker-http-host>`, `<worker-http--port>` as below:

| Key | Default | Description | Since |
|-----------------------------------------|---------|----------------------------|-------|
| celeborn.metrics.master.prometheus.host | 0.0.0.0 | Master's Prometheus host. | 0.2.0 |
| celeborn.metrics.master.prometheus.port | 9098 | Master's Prometheus port. | 0.2.0 |
| celeborn.metrics.worker.prometheus.host | 0.0.0.0 | Worker's Prometheus host. | 0.2.0 |
| celeborn.metrics.worker.prometheus.port | 9096 | Worker's Prometheus port. | 0.2.0 |
| Key | Default | Description | Since |
|---------------------------|---------|---------------------|-------|
| celeborn.master.http.host | 0.0.0.0 | Master's http host. | 0.4.0 |
| celeborn.master.http.port | 9098 | Master's http port. | 0.4.0 |
| celeborn.worker.http.host | 0.0.0.0 | Worker's http host. | 0.4.0 |
| celeborn.worker.http.port | 9096 | Worker's http port. | 0.4.0 |

### Available API providers

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ class MasterSuite extends AnyFunSuite
conf.set(CelebornConf.HA_MASTER_RATIS_STORAGE_DIR.key, getTmpDir())
conf.set(CelebornConf.WORKER_STORAGE_DIRS.key, getTmpDir())
conf.set(CelebornConf.METRICS_ENABLED.key, "true")
conf.set(CelebornConf.MASTER_PROMETHEUS_HOST.key, "127.0.0.1")
conf.set(CelebornConf.MASTER_PROMETHEUS_PORT.key, "11112")
conf.set(CelebornConf.MASTER_HTTP_HOST.key, "127.0.0.1")
conf.set(CelebornConf.MASTER_HTTP_PORT.key, "11112")

val args = Array("-h", "localhost", "-p", "9097")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,27 +80,27 @@ abstract class HttpService extends Service with Logging {
}
httpServer = new HttpServer(
serviceName,
prometheusHost(),
prometheusPort(),
httpHost(),
httpPort(),
new HttpServerInitializer(handlers))
httpServer.start()
}

private def prometheusHost(): String = {
private def httpHost(): String = {
serviceName match {
case Service.MASTER =>
conf.masterPrometheusMetricHost
conf.masterHttpHost
case Service.WORKER =>
conf.workerPrometheusMetricHost
conf.workerHttpHost
}
}

private def prometheusPort(): Int = {
private def httpPort(): Int = {
serviceName match {
case Service.MASTER =>
conf.masterPrometheusMetricPort
conf.masterHttpPort
case Service.WORKER =>
conf.workerPrometheusMetricPort
conf.workerHttpPort
}
}

Expand Down
4 changes: 2 additions & 2 deletions tests/kubernetes-it/docker/helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ volumes:
celeborn:
celeborn.master.ha.enabled: false
celeborn.metrics.enabled: false
celeborn.master.metrics.prometheus.port: 9098
celeborn.worker.metrics.prometheus.port: 9096
celeborn.master.http.port: 9098
celeborn.worker.http.port: 9096
celeborn.worker.monitor.disk.enabled: false
celeborn.shuffle.chunk.size: 8m
celeborn.rpc.io.serverThreads: 64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ import org.apache.celeborn.service.deploy.worker.{Worker, WorkerArguments}
import org.apache.celeborn.service.deploy.worker.memory.MemoryManager

trait MiniClusterFeature extends Logging {
val workerPrometheusPort = new AtomicInteger(12378)
val masterPrometheusPort = new AtomicInteger(22378)
val masterHttpPort = new AtomicInteger(22378)
val workerHttpPort = new AtomicInteger(12378)
var masterInfo: (Master, Thread) = _
val workerInfos = new mutable.HashMap[Worker, Thread]()

Expand All @@ -51,9 +51,9 @@ trait MiniClusterFeature extends Logging {
private def createMaster(map: Map[String, String] = null): Master = {
val conf = new CelebornConf()
conf.set(CelebornConf.METRICS_ENABLED.key, "false")
val prometheusPort = masterPrometheusPort.getAndIncrement()
conf.set(CelebornConf.MASTER_PROMETHEUS_PORT.key, s"$prometheusPort")
logInfo(s"set celeborn.master.metrics.prometheus.port to $prometheusPort")
val httpPort = masterHttpPort.getAndIncrement()
conf.set(CelebornConf.MASTER_HTTP_PORT.key, s"$httpPort")
logInfo(s"set ${CelebornConf.MASTER_HTTP_PORT.key} to $httpPort")
if (map != null) {
map.foreach(m => conf.set(m._1, m._2))
}
Expand All @@ -72,7 +72,7 @@ trait MiniClusterFeature extends Logging {
conf.set(CelebornConf.WORKER_STORAGE_DIRS.key, createTmpDir())
conf.set(CelebornConf.WORKER_DISK_MONITOR_ENABLED.key, "false")
conf.set(CelebornConf.CLIENT_PUSH_BUFFER_MAX_SIZE.key, "256K")
conf.set(CelebornConf.WORKER_PROMETHEUS_PORT.key, s"${workerPrometheusPort.incrementAndGet()}")
conf.set(CelebornConf.WORKER_HTTP_PORT.key, s"${workerHttpPort.incrementAndGet()}")
conf.set("celeborn.fetch.io.threads", "4")
conf.set("celeborn.push.io.threads", "4")
if (map != null) {
Expand Down

0 comments on commit dd65e74

Please sign in to comment.