diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index 764b1121767..65e55ef51d9 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -2474,7 +2474,7 @@ object CelebornConf extends Logging { .doc("If direct memory usage is less than this limit, worker will resume.") .version("0.2.0") .doubleConf - .createWithDefault(0.5) + .createWithDefault(0.7) val WORKER_CONGESTION_CONTROL_ENABLED: ConfigEntry[Boolean] = buildConf("celeborn.worker.congestionControl.enabled") diff --git a/docs/assets/img/backpressure.svg b/docs/assets/img/backpressure.svg index 67596a081eb..9463f236945 100644 --- a/docs/assets/img/backpressure.svg +++ b/docs/assets/img/backpressure.svg @@ -1,4 +1,4 @@ -
Check
Memory
Check...
Y
Y
N
N
> 0.85?
> 0.85?
Y
Y
N
N
> 0.95?
> 0.95?
Pause Receive &&
Pause Replicate &&
Force Flush
Pause Receive &&...
Y
Y
N
N
Under
presure?
Under...
N
N
Y
Y
< 0.5?
< 0.5?
Pause Receive &&
Force Flush
Pause Receive &&...
Resume
Resume
Text is not SVG - cannot display
\ No newline at end of file +
> pause replicate ratio
default 0.95
> pause replicate ratio...
Check Worker Memory
Check Worker Memory
No
No
Yes
Yes
No
No
Yes
Yes
> pause push ratio
default 0.85
> pause push ratio...
Yes
Yes
No
No
< resume ratio
default 0.7
< resume ratio...
Yes
Yes
No
No
is paused before this check
is paused before this check
Marked as paused
Marked as paused
Pause both push && replicate 
Force flush
Pause both push && replicate...
Pause push
Force flush
Pause push...
Resume all
Resume all
Marked as not paused
Marked as not paused
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/configuration/worker.md b/docs/configuration/worker.md index f846268cd85..994838b3fef 100644 --- a/docs/configuration/worker.md +++ b/docs/configuration/worker.md @@ -39,7 +39,7 @@ license: | | celeborn.worker.directMemoryRatioForReadBuffer | 0.1 | Max ratio of direct memory for read buffer | 0.2.0 | | celeborn.worker.directMemoryRatioToPauseReceive | 0.85 | If direct memory usage reaches this limit, the worker will stop to receive data from Celeborn shuffle clients. | 0.2.0 | | celeborn.worker.directMemoryRatioToPauseReplicate | 0.95 | If direct memory usage reaches this limit, the worker will stop to receive replication data from other workers. This value should be higher than celeborn.worker.directMemoryRatioToPauseReceive. | 0.2.0 | -| celeborn.worker.directMemoryRatioToResume | 0.5 | If direct memory usage is less than this limit, worker will resume. | 0.2.0 | +| celeborn.worker.directMemoryRatioToResume | 0.7 | If direct memory usage is less than this limit, worker will resume. | 0.2.0 | | celeborn.worker.fetch.heartbeat.enabled | false | enable the heartbeat from worker to client when fetching data | 0.3.0 | | celeborn.worker.fetch.io.threads | <undefined> | Netty IO thread number of worker to handle client fetch data. The default threads number is the number of flush thread. | 0.2.0 | | celeborn.worker.fetch.port | 0 | Server port for Worker to receive fetch data request from ShuffleClient. | 0.2.0 |