Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scheduler: add conflict detect for grant hot leader scheduler #4903

Merged
merged 6 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions server/api/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
package api

import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
Expand All @@ -24,7 +27,9 @@
"github.com/pingcap/errors"
"github.com/pingcap/log"
"github.com/tikv/pd/pkg/errs"
"github.com/tikv/pd/pkg/mcs/utils/constant"
"github.com/tikv/pd/pkg/schedule/types"
"github.com/tikv/pd/pkg/slice"
"github.com/tikv/pd/pkg/utils/apiutil"
"github.com/tikv/pd/server"
"github.com/unrolled/render"
Expand Down Expand Up @@ -143,6 +148,15 @@
}
collector(strconv.FormatUint(limit, 10))
case types.GrantHotRegionScheduler:
schedulers, err := h.getSchedulersOnDifferentMode()
if err != nil {
h.r.JSON(w, http.StatusInternalServerError, err.Error())
return

Check warning on line 154 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L153-L154

Added lines #L153 - L154 were not covered by tests
}
if slice.Contains(schedulers, types.BalanceHotRegionScheduler.String()) {
h.r.JSON(w, http.StatusBadRequest, "balance-hot-region-scheduler is running, please remove it first")
return
}
leaderID, ok := input["store-leader-id"].(string)
if !ok {
h.r.JSON(w, http.StatusBadRequest, "missing leader id")
Expand All @@ -155,6 +169,16 @@
}
collector(leaderID)
collector(peerIDs)
case types.BalanceHotRegionScheduler:
schedulers, err := h.getSchedulersOnDifferentMode()
if err != nil {
h.r.JSON(w, http.StatusInternalServerError, err.Error())
return

Check warning on line 176 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L175-L176

Added lines #L175 - L176 were not covered by tests
}
if slice.Contains(schedulers, types.GrantHotRegionScheduler.String()) {
h.r.JSON(w, http.StatusBadRequest, "grant-hot-region-scheduler is running, please remove it first")
return
}
}

if err := h.AddScheduler(tp, args...); err != nil {
Expand Down Expand Up @@ -246,6 +270,48 @@
h.r.JSON(w, http.StatusOK, "Pause or resume the scheduler successfully.")
}

func (h *schedulerHandler) getSchedulersOnDifferentMode() ([]string, error) {
rc, err := h.GetRaftCluster()
if err != nil {
return nil, err

Check warning on line 276 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L276

Added line #L276 was not covered by tests
}
if !rc.IsServiceIndependent(constant.SchedulingServiceName) {
schedulers, err := h.GetSchedulerByStatus("", false)
if err != nil {
return nil, err

Check warning on line 281 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L281

Added line #L281 was not covered by tests
}
return schedulers.([]string), nil
}

addr, ok := h.svr.GetServicePrimaryAddr(h.svr.Context(), constant.SchedulingServiceName)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to query schedulers from the scheduling service?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we need to check whether hot scheduler is disabled.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PD has the same configuration as the scheduling service?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

if !ok {
return nil, errs.ErrNotFoundSchedulingAddr.FastGenByArgs()

Check warning on line 288 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L288

Added line #L288 was not covered by tests
}
url := fmt.Sprintf("%s/scheduling/api/v1/schedulers", addr)
req, err := http.NewRequest(http.MethodGet, url, http.NoBody)
if err != nil {
return nil, err

Check warning on line 293 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L293

Added line #L293 was not covered by tests
}
resp, err := h.svr.GetHTTPClient().Do(req)
if err != nil {
return nil, err

Check warning on line 297 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L297

Added line #L297 was not covered by tests
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, errs.ErrSchedulingServer.FastGenByArgs(resp.StatusCode)

Check warning on line 301 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L301

Added line #L301 was not covered by tests
}
bs, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err

Check warning on line 305 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L305

Added line #L305 was not covered by tests
}
var schedulers []string
err = json.Unmarshal(bs, &schedulers)
if err != nil {
return nil, err

Check warning on line 310 in server/api/scheduler.go

View check run for this annotation

Codecov / codecov/patch

server/api/scheduler.go#L310

Added line #L310 was not covered by tests
}
return schedulers, nil
}

type schedulerConfigHandler struct {
svr *server.Server
rd *render.Render
Expand Down
7 changes: 4 additions & 3 deletions tools/pd-ctl/pdctl/command/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,9 +354,10 @@ func NewSplitBucketSchedulerCommand() *cobra.Command {
// NewGrantHotRegionSchedulerCommand returns a command to add a grant-hot-region-scheduler.
func NewGrantHotRegionSchedulerCommand() *cobra.Command {
c := &cobra.Command{
Use: "grant-hot-region-scheduler <store_leader_id> <store_leader_id,store_peer_id_1,store_peer_id_2>",
Short: "add a scheduler to grant hot region to fixed stores",
Run: addSchedulerForGrantHotRegionCommandFunc,
Use: "grant-hot-region-scheduler <store_leader_id> <store_leader_id,store_peer_id_1,store_peer_id_2>",
Copy link
Member

@rleungx rleungx Sep 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, is it necessary to set store_leader_id again?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I also think it is not necessary. I will modify it in another pr.

Short: `add a scheduler to grant hot region to fixed stores.
Note: If there is balance-hot-region-scheduler running, please remove it first, otherwise grant-hot-region-scheduler will not work.`,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it ok if the hot scheduler is not work but exist, for example being pause status?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this pr, even though the hot-scheduler is paused, adding grant-hot-leader-scheduler is still not allowed.

If this is allowed, let's look at an example where I suspend the hot-scheduler for ten seconds and then add grant-hot-leader-scheduler, which is fine for ten seconds, but after ten seconds the hot-scheduler resumes scheduling and they will conflict again.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, understand. BTW, the hot scheduler config still exist if I enable it again.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the configuration still exists. If you think this pr is ok, I will cancel /hold.

Run: addSchedulerForGrantHotRegionCommandFunc,
}
return c
}
Expand Down
34 changes: 32 additions & 2 deletions tools/pd-ctl/tests/scheduler/scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -488,12 +488,32 @@ func (suite *schedulerTestSuite) checkSchedulerConfig(cluster *pdTests.TestClust
})

// test grant hot region scheduler config
checkSchedulerCommand(re, cmd, pdAddr, []string{"-u", pdAddr, "scheduler", "add", "grant-hot-region-scheduler", "1", "1,2,3"}, map[string]bool{
// case 1: add grant-hot-region-scheduler when balance-hot-region-scheduler is running
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should not belong to the scheduler config test.

echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "grant-hot-region-scheduler", "1", "1,2,3"}, nil)
re.Contains(echo, "balance-hot-region-scheduler is running, please remove it first")

// case 2: add grant-hot-region-scheduler when balance-hot-region-scheduler is paused
checkSchedulerCommand(re, cmd, pdAddr, []string{"-u", pdAddr, "scheduler", "pause", "balance-hot-region-scheduler", "60"}, map[string]bool{
"balance-region-scheduler": true,
"balance-leader-scheduler": true,
"balance-hot-region-scheduler": true,
"grant-hot-region-scheduler": true,
})
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "grant-hot-region-scheduler", "1", "1,2,3"}, nil)
re.Contains(echo, "balance-hot-region-scheduler is running, please remove it first")

// case 3: add grant-hot-region-scheduler when balance-hot-region-scheduler is disabled
checkSchedulerCommand(re, cmd, pdAddr, []string{"-u", pdAddr, "scheduler", "remove", "balance-hot-region-scheduler"}, map[string]bool{
"balance-region-scheduler": true,
"balance-leader-scheduler": true,
})

checkSchedulerCommand(re, cmd, pdAddr, []string{"-u", pdAddr, "scheduler", "add", "grant-hot-region-scheduler", "1", "1,2,3"}, map[string]bool{
"balance-region-scheduler": true,
"balance-leader-scheduler": true,
"grant-hot-region-scheduler": true,
})

// case 4: test grant-hot-region-scheduler config
var conf3 map[string]any
expected3 := map[string]any{
"store-id": []any{float64(1), float64(2), float64(3)},
Expand All @@ -509,7 +529,17 @@ func (suite *schedulerTestSuite) checkSchedulerConfig(cluster *pdTests.TestClust
mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler"}, &conf3)
return reflect.DeepEqual(expected3, conf3)
})

// case 5: remove grant-hot-region-scheduler
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-hot-region-scheduler"}, nil)
re.Contains(echo, "grant-hot-region-scheduler is running, please remove it first")

checkSchedulerCommand(re, cmd, pdAddr, []string{"-u", pdAddr, "scheduler", "remove", "grant-hot-region-scheduler"}, map[string]bool{
"balance-region-scheduler": true,
"balance-leader-scheduler": true,
})

checkSchedulerCommand(re, cmd, pdAddr, []string{"-u", pdAddr, "scheduler", "add", "balance-hot-region-scheduler"}, map[string]bool{
"balance-region-scheduler": true,
"balance-leader-scheduler": true,
"balance-hot-region-scheduler": true,
Expand Down