From e9e27797c85ab33651247edb6430aac90c82b578 Mon Sep 17 00:00:00 2001
From: zhangyangyu
Date: Sat, 13 Apr 2024 07:04:11 +0000
Subject: [PATCH] deploy: 8c11565d5ce618fefe510c6ceb837a678716e3e2

---
 .../code-style-and-quality-guide.html          |  18 +-
 contribute-to-tidb/committer-guide.html        |   2 +-
 contribute-to-tidb/contribute-code.html        |   4 +-
 contribute-to-tidb/make-a-proposal.html        |   4 +-
 .../release-notes-style-guide.html             |   4 +-
 contribute-to-tidb/report-an-issue.html        |   4 +-
 contribute-to-tidb/review-a-pr.html            |   4 +-
 contribute-to-tidb/write-document.html         |  28 +-
 extending-tidb/add-a-function.html             |  14 +-
 get-started/build-tidb-from-source.html        |   4 +-
 ...commit-code-and-submit-a-pull-request.html  |  10 +-
 get-started/debug-and-profile.html             |  10 +-
 get-started/install-golang.html                |   4 +-
 get-started/setup-an-ide.html                  |  68 +-
 get-started/write-and-run-unit-tests.html      |  16 +-
 index.html                                     |   3 +-
 print.html                                     | 639 +++++++++---------
 project-management/release-train-model.html    |   4 +-
 searcher.js                                    |   2 +-
 searchindex.js                                 |   2 +-
 searchindex.json                               |   2 +-
 understand-tidb/1pc.html                       |   6 +-
 understand-tidb/async-commit.html              |  10 +-
 understand-tidb/cbo.html                       |  14 +-
 understand-tidb/ddl.html                       | 230 +++----
 understand-tidb/dql.html                       |   2 +-
 understand-tidb/execution.html                 |   2 +-
 ...mplementation-of-vectorized-execution.html  |   6 +-
 understand-tidb/introduction.html              |   6 +-
 understand-tidb/lock-resolver.html             |   6 +-
 .../memory-management-mechanism.html           |   2 +-
 understand-tidb/mvcc-garbage-collection.html   |  18 +-
 understand-tidb/optimistic-transaction.html    |  38 +-
 understand-tidb/parser.html                    |  10 +-
 understand-tidb/plan-cache.html                |   6 +-
 understand-tidb/planner.html                   |   4 +-
 understand-tidb/plugin.html                    |  22 +-
 understand-tidb/privilege.html                 |  10 +-
 understand-tidb/rbo.html                       |  22 +-
 understand-tidb/session.html                   |   8 +-
 understand-tidb/table-statistics.html          |   4 +-
 .../the-lifecycle-of-a-statement.html          |   2 +-
 understand-tidb/transaction-on-tikv.html       |   6 +-
 understand-tidb/transaction.html               |   4 +-
 44 files changed, 643 insertions(+), 641 deletions(-)

diff --git a/contribute-to-tidb/code-style-and-quality-guide.html b/contribute-to-tidb/code-style-and-quality-guide.html
index cab4700a..957cf006 100644
--- a/contribute-to-tidb/code-style-and-quality-guide.html
+++ b/contribute-to-tidb/code-style-and-quality-guide.html
@@ -185,25 +185,25 @@

package main

import (
	"fmt"
	"strings"
)

type Email string

func newEmail(email string) (Email, error) {
	if !strings.Contains(email, "@") {
		return Email(""), fmt.Errorf("Expected @ in the email")
	}
	return Email(email), nil
}

func (email Email) Domain() string {
	return strings.Split(string(email), "@")[1]
}

func main() {
	ping, err := newEmail("go@pingcap.com")
	if err != nil {
		panic(err)
	}
	fmt.Println(ping.Domain())
}
@@ -226,7 +226,7 @@

Parallel For-Loop

-

There are two types of for-range loops: "with index" and "without index". Let's see an example of range with index.

+

There are two types of for-range loops: "with index" and "without index". Let's see an example of range with index.

func TestRangeWithIndex(t *testing.T) {
 	rows := []struct{ index int }{{index: 0}, {index: 1}, {index: 2}}
 	for _, row := range rows {
@@ -248,7 +248,7 @@ 

Parallel

The same instance of the variable

Since the loop iterator variable is the same instance across iterations, it may result in tricky errors with a parallel for-loop.

done := make(chan bool)
-values := []string{"a", "b", "c"}
+values := []string{"a", "b", "c"}
 for _, v := range values {
 	go func() {
 		fmt.Println(v)
@@ -259,7 +259,7 @@ 

Parallel section of Write and run unit tests

A clone of iteration target value
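A common way to apply this fix (a minimal sketch based on the snippet above; the done-draining loop at the end is added here only so the example is self-contained) is to clone the iteration variable inside the loop body, so that each goroutine captures its own copy:

done := make(chan bool)
values := []string{"a", "b", "c"}
for _, v := range values {
	v := v // clone the iteration variable; each goroutine now captures its own copy
	go func() {
		fmt.Println(v)
		done <- true
	}()
}
for range values {
	<-done
}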

diff --git a/contribute-to-tidb/committer-guide.html b/contribute-to-tidb/committer-guide.html index a84307ed..d4d2fbd0 100644 --- a/contribute-to-tidb/committer-guide.html +++ b/contribute-to-tidb/committer-guide.html @@ -205,7 +205,7 @@

Time Management

There are many things that a committer can do, such as moderating discussions, reviewing pull requests, and contributing code.

-

Working on an open source project can be rewarding, but it can also be a bit overwhelming sometimes. A little bit of time management might help alleviate the problem. For example, some committers have a "community day" each week when they actively manage outstanding PRs, but watch the community less frequently the rest of the time.

+

Working on an open source project can be rewarding, but it can also be a bit overwhelming sometimes. A little bit of time management might help alleviate the problem. For example, some committers have a "community day" each week when they actively manage outstanding PRs, but watch the community less frequently the rest of the time.

Remember that your merit will never go away, so please take your time and go at your own pace when contributing to the project :)

diff --git a/contribute-to-tidb/contribute-code.html b/contribute-to-tidb/contribute-code.html index bbe4c013..bc97b837 100644 --- a/contribute-to-tidb/contribute-code.html +++ b/contribute-to-tidb/contribute-code.html @@ -180,7 +180,7 @@

Contribute Co

TiDB is maintained, improved, and extended by code contributions. We welcome code contributions to TiDB. TiDB uses a workflow based on pull requests.

Before contributing

Contributing to TiDB does not start with opening a pull request. We expect contributors to reach out to us first to discuss the overall approach together. Without consensus with the TiDB committers, contributions might require substantial rework or will not be reviewed. So please create a GitHub issue, discuss under an existing issue, or create a topic on the internal.tidb.io and reach consensus.

-

For newcomers, you can check the starter issues, which are annotated with a "good first issue" label. These are issues suitable for new contributors to work with and won't take long to fix. But because the label is typically added at triage time it can turn out to be inaccurate, so do feel free to leave a comment if you think the classification no longer applies.

+

For newcomers, you can check the starter issues, which are annotated with a "good first issue" label. These are issues suitable for new contributors to work with and won't take long to fix. But because the label is typically added at triage time it can turn out to be inaccurate, so do feel free to leave a comment if you think the classification no longer applies.

To get your change merged you need to sign the CLA to grant PingCAP ownership of your code.

Contributing process

After a consensus is reached in issues, it's time to start the code contributing process:

@@ -215,7 +215,7 @@

Making good commits

diff --git a/contribute-to-tidb/make-a-proposal.html b/contribute-to-tidb/make-a-proposal.html index effdc925..76a5dc4c 100644 --- a/contribute-to-tidb/make-a-proposal.html +++ b/contribute-to-tidb/make-a-proposal.html @@ -180,7 +180,7 @@

Make a Propos

This page defines the best-practice procedure for making a proposal in TiDB projects. This text is based on the content of TiDB Design Document.

Motivation

Many changes, including bug fixes and documentation improvements can be implemented and reviewed via the normal GitHub pull request workflow.

-

Some changes though are "substantial", and we ask that these be put through a bit of a design process and produce a consensus among the TiDB community.

+

Some changes though are "substantial", and we ask that these be put through a bit of a design process and produce a consensus among the TiDB community.

The process described in this page is intended to provide a consistent and controlled path for new features to enter the TiDB projects, so that all stakeholders can be confident about the direction the projects are evolving in.

Who should initiate the design document?

Everyone is encouraged to initiate a design document, but before doing it, please make sure you have an intention of getting the work done to implement it.

@@ -200,7 +200,7 @@

What
  • Start the implementation.
  • Please refer to the tracking issue from subtasks to track the progress.

    -

An example that almost fits into this model is the proposal "Support global index for partition table", though it does not follow the latest template.

    +

An example that almost fits into this model is the proposal "Support global index for partition table", though it does not follow the latest template.

    -

    To find this type of failure, enter grep -i "FAIL" to search the report output.

    +

    To find this type of failure, enter grep -i "FAIL" to search the report output.

    Data race

Golang testing supports detecting data races by running tests with the -race flag. Its failure report looks like:

    [2021-06-21T15:36:38.766Z] ==================
    @@ -661,7 +662,7 @@ 
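For reference, here is a minimal test (not taken from the TiDB code base) showing the kind of code that go test -race flags: two goroutines write the same variable without synchronization. It assumes the standard testing and sync imports.

func TestDataRaceExample(t *testing.T) {
	counter := 0
	var wg sync.WaitGroup
	for i := 0; i < 2; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			counter++ // unsynchronized write shared by two goroutines; reported by -race
		}()
	}
	wg.Wait()
	t.Log(counter)
}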

    dlv attach 1394942

You might get error messages about the kernel security setting as follows:

    -
    Could not attach to pid 1394942: this could be caused by a kernel security setting, try writing "0" to /proc/sys/kernel/yama/ptrace_scope
    +
    Could not attach to pid 1394942: this could be caused by a kernel security setting, try writing "0" to /proc/sys/kernel/yama/ptrace_scope
     

    To resolve the error, follow the instructions provided in the error message and execute the following command as the root user to override the kernel security setting:

    echo 0 > /proc/sys/kernel/yama/ptrace_scope
    @@ -708,7 +709,7 @@ 

    Gather runtime profiling information through HTTP end points

    Usually, when TiDB server is running, it exposes a profiling end point through HTTP at http://127.0.0.1:10080/debug/pprof/profile. You can get the profile result by running the following commands:

    -
    curl -G "127.0.0.1:10080/debug/pprof/profile?seconds=45" > profile.profile
    +
    curl -G "127.0.0.1:10080/debug/pprof/profile?seconds=45" > profile.profile
     go tool pprof -http 127.0.0.1:4001 profile.profile
     

    The commands capture the profiling information for 45 seconds, and then provide a web view of the profiling result at 127.0.0.1:4001. This view contains a flame graph of the execution and more views that can help you diagnose the performance bottleneck.
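If you prefer collecting the same profile from Go code instead of curl, a small standalone sketch like the following works against the same endpoint (the URL and duration mirror the command above; error handling is kept minimal):

package main

import (
	"io"
	"net/http"
	"os"
)

func main() {
	// Fetch a 45-second CPU profile from a running tidb-server.
	resp, err := http.Get("http://127.0.0.1:10080/debug/pprof/profile?seconds=45")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, err := os.Create("profile.profile")
	if err != nil {
		panic(err)
	}
	defer out.Close()

	if _, err := io.Copy(out, resp.Body); err != nil {
		panic(err)
	}
}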

    @@ -716,18 +717,18 @@

    curl -G "127.0.0.1:10080/debug/pprof/goroutine" > goroutine.profile +
    curl -G "127.0.0.1:10080/debug/pprof/goroutine" > goroutine.profile
     
    • Trace:
    -
    curl -G "127.0.0.1:10080/debug/pprof/trace?seconds=3" > trace.profile
    +
    curl -G "127.0.0.1:10080/debug/pprof/trace?seconds=3" > trace.profile
     go tool trace -http 127.0.0.1:4001 trace.profile
     
    • Heap:
    -
    curl -G "127.0.0.1:10080/debug/pprof/heap" > heap.profile
    +
    curl -G "127.0.0.1:10080/debug/pprof/heap" > heap.profile
     go tool pprof -http 127.0.0.1:4001 heap.profile
     

    To learn how the runtime information is analyzed, see Go's diagnostics document.

    @@ -745,7 +746,7 @@

    Git to manage its source code. To contribute to the project, you need to get familiar with Git features so that your changes can be incorporated into the codebase.

    This section addresses some of the most common questions and problems that new contributors might face. This section also covers some Git basics; however if you find that the content is a little difficult to understand, we recommend that you first read the following introductions to Git:

    @@ -787,7 +788,7 @@

Standard Pr
git add path/to/changed/file.go path/to/another/changed/file.go
# commit changes staged, make sure the commit message is meaningful and readable
git commit -s -m "pkg, pkg2, pkg3: what's changed"
# optionally use `git status` to check if the change set is correct
# git status
@@ -824,7 +825,7 @@

    Conflicts

    When this happens, you need to resolve the conflicts before your changes can be merged. First, get a local copy of the conflicting changes: checkout your local master branch with git checkout master, then git pull master to update it with the most recent changes.

    Rebasing

    You're now ready to start the rebasing process. Checkout the branch with your changes and execute git rebase master.

    -

    When you rebase a branch on master, all the changes on your branch are reapplied to the most recent version of master. In other words, Git tries to pretend that the changes you made to the old version of master were instead made to the new version of master. During this process, you should expect to encounter at least one "rebase conflict." This happens when Git's attempt to reapply the changes fails because your changes conflict with other changes that have been made. You can tell that this happened because you'll see lines in the output that look like:

    +

    When you rebase a branch on master, all the changes on your branch are reapplied to the most recent version of master. In other words, Git tries to pretend that the changes you made to the old version of master were instead made to the new version of master. During this process, you should expect to encounter at least one "rebase conflict." This happens when Git's attempt to reapply the changes fails because your changes conflict with other changes that have been made. You can tell that this happened because you'll see lines in the output that look like:

    CONFLICT (content): Merge conflict in file.go
     

    When you open these files, you'll see sections of the form
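The marker block itself is elided in this rendering; in general (this is standard Git behavior, nothing TiDB-specific), the conflicted region of file.go looks like:

<<<<<<< HEAD
	// the version of the code that your branch is being rebased onto
=======
	// the version of the code from your own commit
>>>>>>> your commit message

You keep whichever lines are correct (possibly a mix of both) and delete the three marker lines.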

    @@ -839,7 +840,7 @@

    Rebasing

    Once you're all done fixing the conflicts, you need to stage the files that had conflicts in them via git add. Afterwards, run git rebase --continue to let Git know that you've resolved the conflicts and it should finish the rebase.

    Once the rebase has succeeded, you'll want to update the associated branch on your fork with git push --force-with-lease.

    Advanced rebasing

    -

    If your branch contains multiple consecutive rewrites of the same code, or if the rebase conflicts are extremely severe, you can use git rebase --interactive master to gain more control over the process. This allows you to choose to skip commits, edit the commits that you do not skip, change the order in which they are applied, or "squash" them into each other.

    +

    If your branch contains multiple consecutive rewrites of the same code, or if the rebase conflicts are extremely severe, you can use git rebase --interactive master to gain more control over the process. This allows you to choose to skip commits, edit the commits that you do not skip, change the order in which they are applied, or "squash" them into each other.

    Alternatively, you can sacrifice the commit history like this:

    # squash all the changes into one commit so you only have to worry about conflicts once
     git rebase -i $(git merge-base master HEAD)  # and squash all changes along the way
    @@ -848,7 +849,7 @@ 

    Advanced git rebase --continue

    Squashing commits into each other causes them to be merged into a single commit. Both the upside and downside of this is that it simplifies the history. On the one hand, you lose track of the steps in which changes were made, but the history becomes easier to work with.

    -

    You also may want to squash together just the last few commits, possibly because they only represent "fixups" and not real changes. For example, git rebase --interactive HEAD~2 allows you to edit the two commits only.

    +

    You also may want to squash together just the last few commits, possibly because they only represent "fixups" and not real changes. For example, git rebase --interactive HEAD~2 allows you to edit the two commits only.

    Setting pre-commit

    We use pre-commit to check the code style before committing. To install pre-commit, run:

    # Using pip:
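# The exact command is elided in this rendering; pre-commit's standard pip installation is:
pip install pre-commit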
    @@ -919,8 +920,8 @@ 

    Filing an iss
  • What did you see instead?
  • What is your TiDB version?
  • -

Answering these questions gives the details about your problem so that other contributors or TiDB users can pick up your issue more easily.

    -

As the previous section shows, duplicate issues should be reduced. To help others who encounter the problem find your issue, in addition to the problem details answered in the issue template, a descriptive title containing information unique to the issue also helps. This can be the components your issue belongs to or the database features used in your issue, the conditions that trigger the bug, or part of the error message if there is any.

    +

Answering these questions gives the details about your problem so that other contributors or TiDB users can pick up your issue more easily.

    +

As the previous section shows, duplicate issues should be reduced. To help others who encounter the problem find your issue, in addition to the problem details answered in the issue template, a descriptive title containing information unique to the issue also helps. This can be the components your issue belongs to or the database features used in your issue, the conditions that trigger the bug, or part of the error message if there is any.

    Making good issues

In addition to a good title and a detailed issue message, you can also add suitable labels to your issue via /label, especially which component the issue belongs to and which versions the issue affects. Many committers and contributors only focus on certain subsystems of TiDB, so setting the appropriate component is important for getting their attention. Some issues might affect multiple releases; you can refer to the Issue Triage chapter for more information about what needs to be done with such issues.

If you are able to, you should give more consideration to your issue:

    @@ -1137,7 +1138,7 @@

    pull requests.

    Before contributing

    Contributing to TiDB does not start with opening a pull request. We expect contributors to reach out to us first to discuss the overall approach together. Without consensus with the TiDB committers, contributions might require substantial rework or will not be reviewed. So please create a GitHub issue, discuss under an existing issue, or create a topic on the internal.tidb.io and reach consensus.

    -

    For newcomers, you can check the starter issues, which are annotated with a "good first issue" label. These are issues suitable for new contributors to work with and won't take long to fix. But because the label is typically added at triage time it can turn out to be inaccurate, so do feel free to leave a comment if you think the classification no longer applies.

    +

    For newcomers, you can check the starter issues, which are annotated with a "good first issue" label. These are issues suitable for new contributors to work with and won't take long to fix. But because the label is typically added at triage time it can turn out to be inaccurate, so do feel free to leave a comment if you think the classification no longer applies.

    To get your change merged you need to sign the CLA to grant PingCAP ownership of your code.

    Contributing process

    After a consensus is reached in issues, it's time to start the code contributing process:

    @@ -1172,7 +1173,7 @@

    Making good commits

    @@ -1223,7 +1224,7 @@

    Triaging pull requests

Some pull request authors may not be familiar with TiDB, the TiDB development workflow, or the TiDB community. They may not know which labels should be added to their pull requests and which experts can be asked for review. If you are able to, it would be great for you to triage the pull requests: add suitable labels and ask the corresponding experts to review them. These actions can help more contributors notice the pull requests and respond quickly.

    Checking pull requests

    @@ -1242,7 +1243,7 @@

    Accepting pull requests

Once you think the pull request is ready, you can approve it; commenting with /lgtm is also valid.

    @@ -1252,7 +1253,7 @@

    TiDB Design Document.

    Motivation

    Many changes, including bug fixes and documentation improvements can be implemented and reviewed via the normal GitHub pull request workflow.

    -

    Some changes though are "substantial", and we ask that these be put through a bit of a design process and produce a consensus among the TiDB community.

    +

    Some changes though are "substantial", and we ask that these be put through a bit of a design process and produce a consensus among the TiDB community.

The process described in this page is intended to provide a consistent and controlled path for new features to enter the TiDB projects, so that all stakeholders can be confident about the direction the projects are evolving in.

    Who should initiate the design document?

    Everyone is encouraged to initiate a design document, but before doing it, please make sure you have an intention of getting the work done to implement it.

    @@ -1272,7 +1273,7 @@

    What
  • Start the implementation.
  • Please refer to the tracking issue from subtasks to track the progress.

    -

An example that almost fits into this model is the proposal "Support global index for partition table", though it does not follow the latest template.

    +

An example that almost fits into this model is the proposal "Support global index for partition table", though it does not follow the latest template.

    • tracking issue
    • pull request of design document
    • @@ -1286,25 +1287,25 @@

package main

import (
	"fmt"
	"strings"
)

type Email string

func newEmail(email string) (Email, error) {
	if !strings.Contains(email, "@") {
		return Email(""), fmt.Errorf("Expected @ in the email")
	}
	return Email(email), nil
}

func (email Email) Domain() string {
	return strings.Split(string(email), "@")[1]
}

func main() {
	ping, err := newEmail("go@pingcap.com")
	if err != nil {
		panic(err)
	}
	fmt.Println(ping.Domain())
}
@@ -1327,7 +1328,7 @@

      Parallel For-Loop

      -

There are two types of for-range loops: "with index" and "without index". Let's see an example of range with index.

      +

There are two types of for-range loops: "with index" and "without index". Let's see an example of range with index.

      func TestRangeWithIndex(t *testing.T) {
       	rows := []struct{ index int }{{index: 0}, {index: 1}, {index: 2}}
       	for _, row := range rows {
      @@ -1349,7 +1350,7 @@ 

      Parallel

      The same instance of the variable

Since the loop iterator variable is the same instance across iterations, it may result in tricky errors with a parallel for-loop.

      done := make(chan bool)
      -values := []string{"a", "b", "c"}
      +values := []string{"a", "b", "c"}
       for _, v := range values {
       	go func() {
       		fmt.Println(v)
      @@ -1360,7 +1361,7 @@ 

      Parallel section of Write and run unit tests

      A clone of iteration target value

      @@ -1477,8 +1478,8 @@

      Step 2: Clone the forked repository to local storage

      -
cd $working_dir # Go to the directory where you want to put the fork, for example, "cd ~/Documents/GitHub"
      -git clone git@github.com:$user/docs.git # Replace "$user" with your GitHub ID
      +
cd $working_dir # Go to the directory where you want to put the fork, for example, "cd ~/Documents/GitHub"
      +git clone git@github.com:$user/docs.git # Replace "$user" with your GitHub ID
       
       cd $working_dir/docs
       git remote add upstream git@github.com:pingcap/docs.git # Adds the upstream repo
      @@ -1505,7 +1506,7 @@ 

      Step 5: Commit your changes

      git status # Checks the local status
git add <file> ... # Adds the file(s) you want to commit. If you want to commit all changes, you can directly use `git add .`
      -git commit -m "commit-message: update the xx"
      +git commit -m "commit-message: update the xx"
       

      See Commit Message Style.

      Step 6: Keep your branch in sync with upstream/master

      @@ -1514,7 +1515,7 @@

      Step 7: Push your changes to the remote

      -
      git push -u origin new-branch-name # "-u" is used to track the remote branch from origin
      +
      git push -u origin new-branch-name # "-u" is used to track the remote branch from origin
       

      Step 8: Create a pull request

        @@ -1546,31 +1547,31 @@
        Step 5: Commit your changes in stage 1.

        git status # Checks the local status
git add <file> ... # Adds the file(s) you want to commit. If you want to commit all changes, you can directly use `git add .`
        -git commit -m "commit-message: update the xx"
        +git commit -m "commit-message: update the xx"
         
      1. Push your changes to the remote origin:

        -
        git push -u origin new-branch-name # "-u" is used to track the remote branch from origin
        +
        git push -u origin new-branch-name # "-u" is used to track the remote branch from origin
         
      2. -

        After all comments are addressed, reply on the PR page: "All comments are addressed. PTAL."

        -

        "PTAL" is short for "Please take a look".

        +

        After all comments are addressed, reply on the PR page: "All comments are addressed. PTAL."

        +

        "PTAL" is short for "Please take a look".

      Accept comments on the PR page
      -

      If a review comment is in the suggestion mode where the reviewer has already made the suggested change for you (with highlighted differences), to accept the suggestion, you only need to click the "Commit suggestion" button. Then the suggested change is automatically committed to your PR.

      +

      If a review comment is in the suggestion mode where the reviewer has already made the suggested change for you (with highlighted differences), to accept the suggestion, you only need to click the "Commit suggestion" button. Then the suggested change is automatically committed to your PR.

      If multiple review comments are in the suggestion mode, it is recommended to accept them in a batch. To do that, perform the following steps on the PR page:

      1. -

        Click the "Files changed" tab and see the file changes. You can see multiple review comments in suggestion mode.

        +

        Click the "Files changed" tab and see the file changes. You can see multiple review comments in suggestion mode.

      2. -

        Choose the suggestions you want to commit by clicking the "Add suggestion to batch" button on each suggestion.

        +

        Choose the suggestions you want to commit by clicking the "Add suggestion to batch" button on each suggestion.

      3. -

        After all suggestions to be committed are chosen, click the "Commit suggestions" button on the upper right corner of the PR page. Then, you have successfully committed all suggested changes.

        +

        After all suggestions to be committed are chosen, click the "Commit suggestions" button on the upper right corner of the PR page. Then, you have successfully committed all suggested changes.

      @@ -1582,7 +1583,7 @@

    • If most of your changes apply to multiple documentation versions but some differences exist among versions, make changes by commenting in the cherry-picked PR instructing how you would like to make version-specific changes. Then the repository committers will commit to the PR according to your comment before you approve it.

      @@ -1616,7 +1617,7 @@

• After the docs PR is merged, the translation tracking process is finished, and the updates in the Chinese documentation are synchronized to the English documentation.

      @@ -1665,7 +1666,7 @@

      Com

      Bug fix

      -

A bug fix note means that your PR fixes an existing bug or issue. This type of note starts with "Fix" followed by "the issue/bug".

      +

A bug fix note means that your PR fixes an existing bug or issue. This type of note starts with "Fix" followed by "the issue/bug".

      Write your note clearly and adequately so that your target readers can get the main point of your bug fix. The bug or issue must be directly perceivable to the users, and you can refer to the associated GitHub issues.

      In addition, it is recommended to highlight the bug trigger condition or the workaround if there is any.

      Examples:

      @@ -1678,7 +1679,7 @@

      Bug fix

      Improvement

      An improvement note means that your PR improves stability or performance of the product, or enhances an existing feature. In addition to describing what your PR has changed, you should also mention how users can benefit from it.

      -

This type of release note consists of two parts: what you have changed + the benefit of your change. This type of release note often starts with "support", "increase", "improve", "optimize", etc.

      +

This type of release note consists of two parts: what you have changed + the benefit of your change. This type of release note often starts with "support", "increase", "improve", "optimize", etc.

      Examples:

      @@ -1715,7 +1716,7 @@

      Time Management

There are many things that a committer can do, such as moderating discussions, reviewing pull requests, and contributing code.

      -

Working on an open source project can be rewarding, but it can also be a bit overwhelming sometimes. A little bit of time management might help alleviate the problem. For example, some committers have a "community day" each week when they actively manage outstanding PRs, but watch the community less frequently the rest of the time.

      +

Working on an open source project can be rewarding, but it can also be a bit overwhelming sometimes. A little bit of time management might help alleviate the problem. For example, some committers have a "community day" each week when they actively manage outstanding PRs, but watch the community less frequently the rest of the time.

Remember that your merit will never go away, so please take your time and go at your own pace when contributing to the project :)

      Miscellaneous Topics

      Communication channels

      @@ -1803,11 +1804,11 @@

      Protocol Layer<

      The entry method for a single session processing command is to call the dispatch method of the clientConn class, where the protocol is parsed and passed to a different handler.

      SQL Layer

      -

      Generally speaking, a SQL statement needs to go through a series of processes:

      +

      Generally speaking, a SQL statement needs to go through a series of processes:

1. syntax parsing
2. validity verification
3. building query plan
4. optimizing query plan
5. generating executor according to plan
6. executing and returning results

@@ -1825,7 +1826,7 @@

        SQL Layer

        KV API Layer

        TiDB relies on the underlying storage engine to store and load data. It does not rely on a specific storage engine (such as TiKV), but has some requirements for the storage engine, and any engine that meets these requirements can be used (TiKV is the most suitable one).

        -

        The most basic requirement is "Key-Value engine with transactions and Golang driver". The more advanced requirement is "support for distributed computation interface", so that TiDB can push some computation requests down to the storage engine.

        +

        The most basic requirement is "Key-Value engine with transactions and Golang driver". The more advanced requirement is "support for distributed computation interface", so that TiDB can push some computation requests down to the storage engine.

        These requirements can be found in the interfaces of the kv package, and the storage engine needs to provide a Golang driver that implements these interfaces, which TiDB then uses to manipulate the underlying data.

        As for the most basic requirement, these interfaces are related:
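The concrete interface list is elided here; as a purely hypothetical sketch (the names below are invented for illustration and are not the actual definitions in the kv package), a transactional Key-Value driver along these lines would satisfy the basic requirement:

// Hypothetical shapes only; see the kv package for the real interfaces.
type Storage interface {
	Begin() (Transaction, error) // start a new transaction
	Close() error
}

type Transaction interface {
	Get(key []byte) ([]byte, error) // point read
	Set(key, value []byte) error    // buffered write
	Delete(key []byte) error
	Commit() error // commit the buffered mutations to the engine
	Rollback() error
}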

          @@ -1975,7 +1976,7 @@

          @@ -2112,134 +2113,134 @@

          TableInfo struct according to the table definition in the statement and create a DDL job and call doDDLJob which goes through the limitDDLJobs goroutine and adds one or more jobs to the DDL job queue in addBatchDDLJobs

          DDL job encoded as JSON:

          {
          -  "id": 56,
          -  "type": 3,
          -  "schema_id": 1,
          -  "table_id": 55,
          -  "schema_name": "test",
          -  "state": 0,
          -  "err": null,
          -  "err_count": 0,
          -  "row_count": 0,
          -  "raw_args": [
          +  "id": 56,
          +  "type": 3,
          +  "schema_id": 1,
          +  "table_id": 55,
          +  "schema_name": "test",
          +  "state": 0,
          +  "err": null,
          +  "err_count": 0,
          +  "row_count": 0,
          +  "raw_args": [
               {
          -      "id": 55,
          -      "name": {
          -        "O": "t",
          -        "L": "t"
          +      "id": 55,
          +      "name": {
          +        "O": "t",
          +        "L": "t"
                 },
          -      "charset": "utf8mb4",
          -      "collate": "utf8mb4_bin",
          -      "cols": [
          +      "charset": "utf8mb4",
          +      "collate": "utf8mb4_bin",
          +      "cols": [
                   {
          -          "id": 1,
          -          "name": {
          -            "O": "id",
          -            "L": "id"
          +          "id": 1,
          +          "name": {
          +            "O": "id",
          +            "L": "id"
                     },
          -          "offset": 0,
          -          "origin_default": null,
          -          "origin_default_bit": null,
          -          "default": null,
          -          "default_bit": null,
          -          "default_is_expr": false,
          -          "generated_expr_string": "",
          -          "generated_stored": false,
          -          "dependences": null,
          -          "type": {
          -            "Tp": 3,
          -            "Flag": 4131,
          -            "Flen": 10,
          -            "Decimal": 0,
          -            "Charset": "binary",
          -            "Collate": "binary",
          -            "Elems": null
          +          "offset": 0,
          +          "origin_default": null,
          +          "origin_default_bit": null,
          +          "default": null,
          +          "default_bit": null,
          +          "default_is_expr": false,
          +          "generated_expr_string": "",
          +          "generated_stored": false,
          +          "dependences": null,
          +          "type": {
          +            "Tp": 3,
          +            "Flag": 4131,
          +            "Flen": 10,
          +            "Decimal": 0,
          +            "Charset": "binary",
          +            "Collate": "binary",
          +            "Elems": null
                     },
          -          "state": 5,
          -          "comment": "",
          -          "hidden": false,
          -          "change_state_info": null,
          -          "version": 2
          +          "state": 5,
          +          "comment": "",
          +          "hidden": false,
          +          "change_state_info": null,
          +          "version": 2
                   },
                   {
          -          "id": 2,
          -          "name": {
          -            "O": "notes",
          -            "L": "notes"
          +          "id": 2,
          +          "name": {
          +            "O": "notes",
          +            "L": "notes"
                     },
          -          "offset": 1,
          -          "origin_default": null,
          -          "origin_default_bit": null,
          -          "default": null,
          -          "default_bit": null,
          -          "default_is_expr": false,
          -          "generated_expr_string": "",
          -          "generated_stored": false,
          -          "dependences": null,
          -          "type": {
          -            "Tp": 15,
          -            "Flag": 0,
          -            "Flen": 255,
          -            "Decimal": 0,
          -            "Charset": "utf8mb4",
          -            "Collate": "utf8mb4_bin",
          -            "Elems": null
          +          "offset": 1,
          +          "origin_default": null,
          +          "origin_default_bit": null,
          +          "default": null,
          +          "default_bit": null,
          +          "default_is_expr": false,
          +          "generated_expr_string": "",
          +          "generated_stored": false,
          +          "dependences": null,
          +          "type": {
          +            "Tp": 15,
          +            "Flag": 0,
          +            "Flen": 255,
          +            "Decimal": 0,
          +            "Charset": "utf8mb4",
          +            "Collate": "utf8mb4_bin",
          +            "Elems": null
                     },
          -          "state": 5,
          -          "comment": "",
          -          "hidden": false,
          -          "change_state_info": null,
          -          "version": 2
          +          "state": 5,
          +          "comment": "",
          +          "hidden": false,
          +          "change_state_info": null,
          +          "version": 2
                   }
                 ],
          -      "index_info": null,
          -      "constraint_info": null,
          -      "fk_info": null,
          -      "state": 0,
          -      "pk_is_handle": true,
          -      "is_common_handle": false,
          -      "common_handle_version": 0,
          -      "comment": "",
          -      "auto_inc_id": 0,
          -      "auto_id_cache": 0,
          -      "auto_rand_id": 0,
          -      "max_col_id": 2,
          -      "max_idx_id": 0,
          -      "max_cst_id": 0,
          -      "update_timestamp": 0,
          -      "ShardRowIDBits": 0,
          -      "max_shard_row_id_bits": 0,
          -      "auto_random_bits": 0,
          -      "pre_split_regions": 0,
          -      "partition": null,
          -      "compression": "",
          -      "view": null,
          -      "sequence": null,
          -      "Lock": null,
          -      "version": 4,
          -      "tiflash_replica": null,
          -      "is_columnar": false,
          -      "temp_table_type": 0,
          -      "policy_ref_info": null,
          -      "placement_settings": null
          +      "index_info": null,
          +      "constraint_info": null,
          +      "fk_info": null,
          +      "state": 0,
          +      "pk_is_handle": true,
          +      "is_common_handle": false,
          +      "common_handle_version": 0,
          +      "comment": "",
          +      "auto_inc_id": 0,
          +      "auto_id_cache": 0,
          +      "auto_rand_id": 0,
          +      "max_col_id": 2,
          +      "max_idx_id": 0,
          +      "max_cst_id": 0,
          +      "update_timestamp": 0,
          +      "ShardRowIDBits": 0,
          +      "max_shard_row_id_bits": 0,
          +      "auto_random_bits": 0,
          +      "pre_split_regions": 0,
          +      "partition": null,
          +      "compression": "",
          +      "view": null,
          +      "sequence": null,
          +      "Lock": null,
          +      "version": 4,
          +      "tiflash_replica": null,
          +      "is_columnar": false,
          +      "temp_table_type": 0,
          +      "policy_ref_info": null,
          +      "placement_settings": null
               }
             ],
          -  "schema_state": 0,
          -  "snapshot_ver": 0,
          -  "real_start_ts": 0,
          -  "start_ts": 428310284267159550,
          -  "dependency_id": 0,
          -  "query": "CREATE TABLE t (id int unsigned NOT NULL PRIMARY KEY, notes varchar(255))",
          -  "binlog": {
          -    "SchemaVersion": 0,
          -    "DBInfo": null,
          -    "TableInfo": null,
          -    "FinishedTS": 0
          +  "schema_state": 0,
          +  "snapshot_ver": 0,
          +  "real_start_ts": 0,
          +  "start_ts": 428310284267159550,
          +  "dependency_id": 0,
          +  "query": "CREATE TABLE t (id int unsigned NOT NULL PRIMARY KEY, notes varchar(255))",
          +  "binlog": {
          +    "SchemaVersion": 0,
          +    "DBInfo": null,
          +    "TableInfo": null,
          +    "FinishedTS": 0
             },
          -  "version": 1,
          -  "reorg_meta": null,
          -  "multi_schema_info": null,
          -  "priority": 0
          +  "version": 1,
          +  "reorg_meta": null,
          +  "multi_schema_info": null,
          +  "priority": 0
           }
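To poke at such a dump offline, a small sketch like the following decodes a few of the top-level fields shown above into an ad-hoc struct (the field names are taken from the JSON; the struct is illustrative only and is not TiDB's actual model.Job type):

package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// ddlJob mirrors a few top-level fields of the dump above.
type ddlJob struct {
	ID         int64  `json:"id"`
	Type       int    `json:"type"`
	SchemaID   int64  `json:"schema_id"`
	TableID    int64  `json:"table_id"`
	SchemaName string `json:"schema_name"`
	State      int    `json:"state"`
	StartTS    uint64 `json:"start_ts"`
	Query      string `json:"query"`
}

func main() {
	raw, err := os.ReadFile("job.json") // the JSON dump saved to a local file
	if err != nil {
		panic(err)
	}
	var job ddlJob
	if err := json.Unmarshal(raw, &job); err != nil {
		panic(err)
	}
	fmt.Printf("job %d on table %d in schema %q: %s\n", job.ID, job.TableID, job.SchemaName, job.Query)
}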
           

          Execution in the TiDB DDL Owner

          @@ -2251,7 +2252,7 @@

          SQL Layer

          Session

          The most important function in Session is ExecuteStmt. It wraps calls to other modules. The SQL execution will respect environment variables in Session like AutoCommit and timezone.

          Parser

          -

Parser consists of Lexer and Yacc. It turns the SQL text into an AST:

          +

Parser consists of Lexer and Yacc. It turns the SQL text into an AST:

          p := parserPool.Get().(*parser.Parser)
           defer parserPool.Put(p)
           p.SetSQLMode(s.sessionVars.SQLMode)
          @@ -2569,14 +2570,14 @@ 
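Outside of a session, the parser can also be used standalone. A minimal sketch (assuming the github.com/pingcap/tidb/pkg/parser import path, which has moved between releases; older code imports github.com/pingcap/parser):

package main

import (
	"fmt"

	"github.com/pingcap/tidb/pkg/parser"
	_ "github.com/pingcap/tidb/pkg/parser/test_driver" // registers the value-expression driver the parser requires
)

func main() {
	p := parser.New()
	// Parse returns the AST nodes of the statements in the SQL text.
	stmtNodes, _, err := p.Parse("SELECT * FROM t WHERE id = 1", "", "")
	if err != nil {
		panic(err)
	}
	fmt.Printf("parsed %d statement(s); first node: %T\n", len(stmtNodes), stmtNodes[0])
}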

          Understan

          Parser is generated by a parser generator named yacc. It takes the grammar file parser.y as the input and outputs the source code file parser.go, which is the real parser imported by TiDB. Thus, the core file is parser.y because when the SQL syntax changes, most of the changes take place in parser.y.

          In case you are unfamiliar with yacc, some concepts are listed here:

            -
• Terminal Symbol is also known as "token". When a SQL string reaches the parser, the first step is to tokenize it into an array of tokens. For example, "SELECT * FROM t" is tokenized to [selectKeyword, '*', fromKeyword, identifier(t)] by lexer.Lex().
          • +
• Terminal Symbol is also known as "token". When a SQL string reaches the parser, the first step is to tokenize it into an array of tokens. For example, "SELECT * FROM t" is tokenized to [selectKeyword, '*', fromKeyword, identifier(t)] by lexer.Lex().
          • Non-terminal Symbol is a syntactic variable, which can represent a group of terminal/non-terminal symbols.
          • Grammar Rule specifies which symbols can replace which other non-terminal symbol.
          • Semantic Action defines how an AST node is constructed.

          An example of a grammar rule is as follows:

          AlterDatabaseStmt:
          -	"ALTER" DatabaseSym DBName DatabaseOptionList
          +	"ALTER" DatabaseSym DBName DatabaseOptionList
           	{
           		$$ = &ast.AlterDatabaseStmt{
           			Name:                 $3,
          @@ -2587,7 +2588,7 @@ 

          Understan

          • AlterDatabaseStmt is a non-terminal symbol because there is no such token.
          • -
          • "ALTER" is a terminal symbol.
          • +
          • "ALTER" is a terminal symbol.
          • DatabaseSym, DBName and DatabaseOptionList are non-terminal symbols that are defined in other grammar rules.
• The pseudo-code in brackets is the semantic action. It means an AST node ast.AlterDatabaseStmt will be constructed when the rule is reduced by the parser. Note that a dollar character $ followed by a number represents the Golang value previously bound (in other rules), where the number is the 1-based index of the symbol in the rule. $$ represents the current binding value. After goyacc substitution, this code snippet will be valid Golang code.
          @@ -2624,8 +2625,8 @@

          FAQ

          1. How to resolve shift-reduce or reduce-reduce conflicts?
          -

Shift means "move the next token in" to match the current rule. Reduce means "replace the current tokens/symbols with a non-terminal symbol". Shift-reduce conflicts occur when the parser cannot decide whether the next step is to shift or to reduce.

          -

When yacc reports such conflicts, it also keeps the file y.output. You can search "conflict on" in the file to locate which rule conflicts with other rules. Then you can try to annotate tokens with %precedence, rewrite the grammar rule, or ask for help on GitHub.

          +

Shift means "move the next token in" to match the current rule. Reduce means "replace the current tokens/symbols with a non-terminal symbol". Shift-reduce conflicts occur when the parser cannot decide whether the next step is to shift or to reduce.

          +

When yacc reports such conflicts, it also keeps the file y.output. You can search "conflict on" in the file to locate which rule conflicts with other rules. Then you can try to annotate tokens with %precedence, rewrite the grammar rule, or ask for help on GitHub.

          Planner

The planner package contains most of the code related to SQL optimization. The input of the planner is an AST of the query returned from the parser, and the output of the planner is a plan tree that is used for further execution.

          Package Structure

          @@ -2638,7 +2639,7 @@

          Package S

Not recommended | Clear in type | Adequate and clear in meaning | User perspective | Recommended
Not use the stale read request's start_ts to update max_ts to avoid commit request keep retrying | ✅ | ✅ | ❌ | Improve commit performance in some edge cases
tidb/pkg/planner/util | Common utility functions / structures shared by the two planners
      -

We can see that TiDB has two planners: one follows the System R model, which is used by default, and the other follows the Cascades model, which is still under development. The unified entry function of the planner module is Optimize(). Before diving into either of the two planners, it first checks whether there is any intervention for the planner from the "SQL Plan Management" module; if so, the AST of the query is modified before going through the optimization procedures. The "SQL Plan Management" module is beyond the scope of this article, and it will be introduced in the SQL Plan Management section.

      +

We can see that TiDB has two planners: one follows the System R model, which is used by default, and the other follows the Cascades model, which is still under development. The unified entry function of the planner module is Optimize(). Before diving into either of the two planners, it first checks whether there is any intervention for the planner from the "SQL Plan Management" module; if so, the AST of the query is modified before going through the optimization procedures. The "SQL Plan Management" module is beyond the scope of this article, and it will be introduced in the SQL Plan Management section.

This article focuses only on the System R planner, i.e., the core package. Readers who are interested in the Cascades planner can refer to this design doc.

      Optimization Procedures

Ignoring the trivial steps, the query optimization procedure can be briefly divided into 4 phases:

      @@ -2655,7 +2656,7 @@

      Plan Building

      During the plan building process, optimization flags would be collected for each operator built. For example, if a Selection operator is built, then an optimization flag like flagPredicatePushDown would be set in the plan builder. These saved flags would be used later in the logical optimization phase.

      Logical Optimization

      The entry function of this phase (also known as rule-based optimization) is logicalOptimize(). This function would do logically equivalent transformations for the initial plan tree according to relational algebra, and the result plan tree should be better than the initial one from the execution efficiency perspective in principle. Specifically, logicalOptimize() would traverse all the logical optimization rules predefined as optRuleList in order, and check if a rule is applicable by referring to the optimization flags saved during the plan building phase. If the flag is set for a rule, planner would traverse the plan tree from top down, and apply the transformations implied by the rule to the subtree satisfying the rule prerequisites.

      -

An example logical optimization rule is "column pruning": for each operator in the plan tree, it collects the columns needed by the upper operators and prunes the unneeded columns from the output. Another example rule is "decorrelation": it tries to pull up operators that refer to correlated columns and resolve the column dependency, hence converting the LogicalApply operator to a regular LogicalJoin.

      +

An example logical optimization rule is "column pruning": for each operator in the plan tree, it collects the columns needed by the upper operators and prunes the unneeded columns from the output. Another example rule is "decorrelation": it tries to pull up operators that refer to correlated columns and resolve the column dependency, hence converting the LogicalApply operator to a regular LogicalJoin.

      Physical Optimization

The entry function of this phase (also known as cost-based optimization) is physicalOptimize(). It does cost-based enumeration of the implementations of each logical operator, and finds the combination of operators with the lowest cost as the final physical plan. Specifically, each logical operator implements an interface function exhaustPhysicalPlans() to list all the possible physical algorithms, e.g., LogicalAggregation has two possible implementations, PhysicalStreamAggregation and PhysicalHashAggregation. Each implementation may require specific properties for its child's output, e.g., PhysicalStreamAggregation requires that the child's output rows be in the order of the GROUP BY columns. These properties are recorded in the PhysicalProperty structure and passed down to the enumeration procedure of the child operators.

Once the planner knows the specific implementation of the plan tree, or of a subtree, it can compute a cost for this implementation. The cost of one implementation is calculated as a sum of its resource consumptions, including CPU, Memory, Network, IO, etc. For each kind of resource specifically, the consumption is measured based on a unit factor (e.g., scanFactor is the unit factor for IO consumption, which means the cost of scanning 1 byte of data on TiKV or TiFlash) and the estimated number of rows / bytes to be processed by this operator. Note that these unit factors can be customized by setting system variables like tidb_opt_xxx_factor to fit clusters of different hardware configurations. Each implementation of the whole logical plan tree then has a cost, and the planner chooses the one with the lowest cost for execution.
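As a purely illustrative sketch (the factor names echo the description above, but the formula is simplified and is not TiDB's actual cost model), combining unit factors with estimated rows and bytes might look like this:

// Illustrative only: a toy cost for a table scan feeding a hash aggregation.
const (
	scanFactor = 1.5 // assumed cost of scanning 1 byte on the storage engine
	cpuFactor  = 3.0 // assumed cost of processing 1 row in TiDB
)

func scanCost(estimatedBytes float64) float64 {
	return scanFactor * estimatedBytes
}

func hashAggCost(inputRows float64) float64 {
	return cpuFactor * inputRows
}

func planCost(estimatedBytes, estimatedRows float64) float64 {
	// The cost of a (sub)tree is the sum of its operators' costs;
	// the planner keeps the implementation with the lowest total.
	return scanCost(estimatedBytes) + hashAggCost(estimatedRows)
}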

      @@ -2677,7 +2678,7 @@

      Count-Min S

The Count-Min Sketch (CM sketch) is a data structure used for query cardinality estimation for equal predicates, joins, etc., and provides strong accuracy guarantees. Since its introduction in 2003 in the paper An improved data stream summary: The count-min sketch and its applications, it has gained widespread use given its simplicity of construction and use.

      CM sketch maintains an array of d*w counts, and for each value, maps it to a column in each row using d separate hash functions and modifies the count value at those d positions. This is shown in the following figure.

      count-min sketch

      -

      This way, when querying how many times a value appears, the d hash functions are still used to find the position mapped to in each row, and the minimum of these d values is used as the estimate.

      +

      This way, when querying how many times a value appears, the d hash functions are still used to find the position mapped to in each row, and the minimum of these d values is used as the estimate.

      Please note that CM sketch is not used as default statistics since version 5.1 given the increasing concerns on estimation bias under the scenarios with large distinct values of a column.
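To make the update/query flow concrete, here is a compact, self-contained sketch of a count-min sketch (the parameters and hashing are simplified for illustration and do not mirror TiDB's statistics package):

package stats

import (
	"hash/fnv"
	"math"
)

// cmSketch is a minimal d x w count-min sketch.
type cmSketch struct {
	depth, width uint64
	counts       [][]uint32
}

func newCMSketch(depth, width uint64) *cmSketch {
	c := make([][]uint32, depth)
	for i := range c {
		c[i] = make([]uint32, width)
	}
	return &cmSketch{depth: depth, width: width, counts: c}
}

// hash derives the i-th hash function by salting an FNV-1a hash with the row index.
func (s *cmSketch) hash(value []byte, i uint64) uint64 {
	h := fnv.New64a()
	h.Write([]byte{byte(i)})
	h.Write(value)
	return h.Sum64() % s.width
}

// Insert bumps one counter per row for the value.
func (s *cmSketch) Insert(value []byte) {
	for i := uint64(0); i < s.depth; i++ {
		s.counts[i][s.hash(value, i)]++
	}
}

// Query returns the minimum of the d counters, an upper bound on the true count.
func (s *cmSketch) Query(value []byte) uint32 {
	est := uint32(math.MaxUint32)
	for i := uint64(0); i < s.depth; i++ {
		if v := s.counts[i][s.hash(value, i)]; v < est {
			est = v
		}
	}
	return est
}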

      Top-N Value (Most Frequent Value)

The CM sketch would encounter severe hash collisions as the dataset grows, while the histogram has its limits in estimating the selectivity of equal predicates. Thus we extract the Top-N values (a.k.a. the most frequent values) of the dataset out of the histogram to improve the accuracy of the estimation of an equal predicate. Here, the top-n statistics are stored as pairs of (value, cnt). For example, for the dataset 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 5, 6, 7, if the n of top-n is 1, the top-n value pair will be [(1, 7)], and the rest of the histogram is constructed using the remaining data 2, 2, 3, 4, 4, 5, 6, 7. You may refer to the paper Synopses for Massive Data: Samples, Histograms, Wavelets, Sketches for additional information.

      @@ -2706,7 +2707,7 @@

      merged into one.

    Statistics maintenance

    -

    From version 2.0, TiDB has introduced a dynamic update mechanism, which can dynamically adjust statistics based on the results of the query. In addition, from version 4.0, we have introduced the auto analyze function, which can automatically trigger the collection of (incremental) statistics based on the percentage of table data change.

    +

    From version 2.0, TiDB has introduced a dynamic update mechanism, which can dynamically adjust statistics based on the results of the query. In addition, from version 4.0, we have introduced the auto analyze function, which can automatically trigger the collection of (incremental) statistics based on the percentage of table data change.

    As data changes significantly, the settings of statistics collection may need to be modified accordingly, too. For example, the histogram needs to adjust the bucket height and the boundaries of the bucket; and the CM Sketch needs to adjust the count array so that the estimated value is equal to the result of the query.

    What needs to be pointed out here is that dynamically adjusting statistics based on query results is turned off by default in version 4.0, and will be revisited for a possible redesign in future versions.

    The application of the statistics

    @@ -2823,7 +2824,7 @@

    Appendix

    As stated in the planner overview, rule-based optimization (usually used interchangeably with logical optimization in TiDB code) consists of logical optimization rules. These rules have predefined order to be iterated. Each rule has a responding flag, and a rule will be applied only if it is flagged and not disabled. The flag is set according to the SQL in the plan building stage.

    The rule-based optimization will produce a logical plan tree that is logically equal to the original one. Besides the original plan tree, it will also make use of table schema information to make optimizations, but it doesn't rely on the statistics to do optimization (join reorder is the only exception, we'll talk about it later).

    Implementation Patterns

    -

Code for each rule is placed in a separate file named "rule_xxx_xxx.go".

    +

Code for each rule is placed in a separate file named "rule_xxx_xxx.go".

All logical rules implement the logicalOptRule interface.

    It is defined as follows:

    type logicalOptRule interface {
    @@ -2839,10 +2840,10 @@ 
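The interface body is elided in this rendering; conceptually, applying the flagged rules looks roughly like the following sketch (simplified, hypothetical signatures rather than the actual logicalOptimize code):

// Sketch only: simplified stand-ins for the real planner types.
type LogicalPlan interface{}

type logicalRule interface {
	name() string
	optimize(plan LogicalPlan) (LogicalPlan, error)
}

func logicalOptimizeSketch(flag uint64, plan LogicalPlan, rules []logicalRule) (LogicalPlan, error) {
	for i, rule := range rules {
		// Skip a rule unless its flag was set during plan building.
		if flag&(1<<uint(i)) == 0 {
			continue
		}
		var err error
		if plan, err = rule.optimize(plan); err != nil {
			return nil, err
		}
	}
	return plan, nil
}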

    Rules

    Column Pruning

    -

    This is a very fundamental optimization. It will prune unneeded columns for each operator.

    +

    This is a very fundamental optimization. It will prune unneeded columns for each operator.

The main logic is in the PruneColumns(parentUsedCols []*expression.Column) error method of the LogicalPlan interface. It traverses the plan tree from top to bottom. Each operator receives the columns used by its parent operator, uses this information to prune unneeded columns from itself (different kinds of operators have different behaviors), then collects the columns it needs and passes them to its children.
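A toy version of this top-down traversal (hypothetical node types, not the real LogicalPlan implementations) might look like:

type Column string

// PlanNode is a stand-in for LogicalPlan in this sketch.
type PlanNode interface {
	PruneColumns(parentUsedCols []Column) error
}

// selectionNode needs its filter columns in addition to whatever its parent asks for.
type selectionNode struct {
	conditionCols []Column // columns referenced by this operator itself
	child         PlanNode
}

func (s *selectionNode) PruneColumns(parentUsedCols []Column) error {
	// Columns required from the child = what the parent needs + what this operator uses.
	need := append(append([]Column{}, parentUsedCols...), s.conditionCols...)
	return s.child.PruneColumns(need)
}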

    Decorrelation

As stated in the planner overview, a correlated subquery in the SQL becomes an Apply operator, which is a special kind of Join operator, in the plan tree. If we can transform it to a normal Join while keeping it logically equivalent to the Apply, we can make more optimizations that are only available to normal join operators.

An Apply is equivalent to a Join if there are no correlated columns in its inner side. Here we try to pull operators with correlated columns in the inner side up across the Apply, so that the Apply can be changed to a Join directly. This kind of transformation is called decorrelation.

    The main logic is in (*decorrelateSolver).optimize(). It finds Apply and tries to decorrelate it.
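The following toy sketch (all types invented for illustration) shows only the final conversion step described above: once no correlated columns remain in the inner side, the Apply can be replaced by a normal Join. The pull-up of correlated Selections and Projections that the real rule performs is omitted.

package main

import "fmt"

// A toy Apply: for every outer row, the inner side is re-evaluated and may
// reference columns of the outer side (correlated columns).
type toyApply struct {
	outer          string
	inner          string
	correlatedCols []string // outer columns referenced inside the inner side
}

// tryDecorrelate: if the inner side no longer references any outer column,
// the Apply is equivalent to a plain Join.
func tryDecorrelate(a toyApply) string {
	if len(a.correlatedCols) == 0 {
		return fmt.Sprintf("Join(%s, %s)", a.outer, a.inner)
	}
	return fmt.Sprintf("Apply(%s, %s correlated on %v)", a.outer, a.inner, a.correlatedCols)
}

func main() {
	fmt.Println(tryDecorrelate(toyApply{outer: "t1", inner: "Selection(t2)", correlatedCols: nil}))
	fmt.Println(tryDecorrelate(toyApply{outer: "t1", inner: "Selection(t2)", correlatedCols: []string{"t1.a"}}))
}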

Currently, there are several cases we can decorrelate.

    Decorrelation

    This rule will keep trying to decorrelate an Apply until it can't be decorrelated anymore. If there are no correlated columns in its inner side now, it is converted to a Join.

    Decorrelation can't guarantee a better plan

    It might be intuitive to think that a decorrelated Join is more efficient than the nested-loop style Apply. That's probably true in most cases. However, as we said above, decorrelation just enables us to make more optimizations that are only available for normal Join. This doesn't mean Apply is always a worse plan.

Decorrelation involves some "pull-up" operations, which usually make the execution of the inner sub-tree of the Apply/Join less efficient. And in some cases, for example, when the outer side of the Apply only has one row of data, the nested-loop style Apply execution doesn't incur inefficiency compared with a normal Join. In such cases, the decorrelated plan is worse than the original one.

    Aggregation Elimination

This rule finds Aggregations and tries to remove useless Aggregation operators or useless DISTINCTs of aggregate functions.

A DISTINCT of an aggregate function is useless when the argument of the aggregate function is a unique column. In this case, we can set the AggFuncDesc.HasDistinct to false directly.

    Example:

    CREATE TABLE t(a INT, b INT UNIQUE);
     EXPLAIN SELECT count(distinct b) FROM t;

    Generally, there are two cases we can optimize. First, if there are two Projections in a row, we can merge them into one Projection. Second, if all expressions of a Projection are Column, which means there are no extra calculations, we can remove this Projection.

    Note that for the second case, not all Projection can be eliminated. For example, the Projection at the top of the plan tree or below UnionAll can't be removed. This is indicated by the canEliminate parameter.
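As a hedged toy sketch of the first case (merging two adjacent Projections), the parent's expressions can be rewritten in terms of the child's inputs. The real code works on expression trees rather than on strings; the string substitution below is only meant to show the composition idea, and all names here are invented.

package main

import (
	"fmt"
	"strings"
)

// A toy Projection: each output column is defined by an expression over the
// input columns, written here as a simple string keyed by column name.
type toyProjection struct {
	exprs map[string]string // output column -> expression over input columns
}

// mergeProjections composes two Projections that appear in a row: every input
// column referenced by the parent is replaced by the child's definition of it.
func mergeProjections(parent, child toyProjection) toyProjection {
	merged := toyProjection{exprs: make(map[string]string)}
	for out, expr := range parent.exprs {
		for col, def := range child.exprs {
			expr = strings.ReplaceAll(expr, col, "("+def+")")
		}
		merged.exprs[out] = expr
	}
	return merged
}

func main() {
	child := toyProjection{exprs: map[string]string{"x": "a + b"}}
	parent := toyProjection{exprs: map[string]string{"y": "x * 2"}}
	fmt.Println(mergeProjections(parent, child).exprs) // map[y:(a + b) * 2]
}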

    Max/Min Elimination

Max/Min elimination finds Aggregations with max() or min() aggregate functions. It doesn't actually "eliminate" the Aggregation. It adds Limit and Sort to get the same effect as max() and min(), but the Aggregation is retained to ensure correctness.

    Example:

    CREATE TABLE t(a int, b int UNIQUE NOT NULL);
     EXPLAIN SELECT MAX(a) FROM t;

    Predicate Pushdown

    This is a very fundamental and important optimization. It traverses the plan tree from top to bottom, collects predicates (filter conditions), and tries to push them down as deep as possible.

The main logic is in the PredicatePushDown([]expression.Expression) ([]expression.Expression, LogicalPlan) method of the LogicalPlan interface. The parameter is the pushed-down predicates. The return values are the predicates that can't be pushed down any further and the child operator after pushing down predicates.

The predicates mainly come from Selection. The predicates can be pushed across some operators, like Projection and UnionAll. For some operators, we can only push down predicates when some requirements are met. For example, we can only push predicates across Window if all Columns in the predicates are Window's PartitionBy columns. For some operators, we can't push predicates across them, like Limit.
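Here is a toy sketch of that contract, with an invented operator type and drastically simplified rules: predicates travel top-down, transparent operators forward them, and operators such as Limit stop them. It is not the real PredicatePushDown implementation.

package main

import "fmt"

// toyOp is a bare-bones operator: just a kind and a single child.
type toyOp struct {
	kind     string // "Projection", "UnionAll", "Limit", "DataSource", ...
	child    *toyOp
	retained []string // predicates that stopped at this operator
}

// pushDown: operators the predicates can cross forward them unchanged
// (a simplification; e.g. a real Projection needs column substitution),
// operators they cannot cross keep them in a Selection on top.
func pushDown(op *toyOp, preds []string) {
	if op == nil || len(preds) == 0 {
		return
	}
	switch op.kind {
	case "Projection", "UnionAll":
		pushDown(op.child, preds) // transparent in this toy example
	case "DataSource":
		op.retained = append(op.retained, preds...) // best case: reaches the storage layer
	default: // Limit and anything else we don't know how to cross
		op.retained = append(op.retained, preds...)
	}
}

func main() {
	ds := &toyOp{kind: "DataSource"}
	proj := &toyOp{kind: "Projection", child: ds}
	pushDown(proj, []string{"a > 1"})
	fmt.Println(ds.retained) // [a > 1]

	limit := &toyOp{kind: "Limit", child: &toyOp{kind: "DataSource"}}
	pushDown(limit, []string{"a > 1"})
	fmt.Println(limit.retained) // [a > 1] — cannot be pushed below Limit
}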

    In the optimal case, the predicates reach DataSource and can be pushed down to the storage layer in the physical optimization stage.

    Join is a special case in this rule. We not only push down predicates for Join, but we also make some other optimizations here. They are implemented in (*LogicalJoin).PredicatePushDown. Two of them are important and explained as follows.

First, we will try to "simplify" outer joins, which means converting outer joins to inner joins. As we know, an outer join differs from an inner join because we will pad NULLs for unmatched rows from the outer side. If the predicates are guaranteed to filter out such rows, this join makes no difference from an inner join. In this case, we can directly change it to an inner join.

    Example:

    CREATE TABLE t(a int, b int);
     CREATE TABLE t1(a int, b int);
Partition Pruning

    The rationale of this rule is rather simple, but there are different kinds of partition types and the pushed-down conditions can be very complicated. So the implementation details of this rule are also complicated. Some descriptions of these details can be found in the official docs.

Note that there is a feature called dynamic pruning. As of the time this section was written, it is an experimental feature and is not enabled by default. In this mode, we no longer build a DataSource for every partition. Accessing partitions is done in one operator, and the partition pruning work is done at the execution stage. So this rule is not needed in this mode.

    Aggregation Pushdown

This rule finds LogicalAggregation and tries to push it down. Currently, we can push it across Join, Projection, UnionAll, and PartitionUnionAll. Note that pushdown here doesn't mean "move this operator below other operators". One Aggregation remains in the original position and another Aggregation is pushed down, to ensure correctness.

Pushing Aggregation across Join is the most complicated case of them. The aggregate functions are separated into left and right sides and we try to push them to the left and right side of the Join respectively. There are many requirements for this transformation. For example, the join type should be among InnerJoin, LeftOuterJoin and RightOuterJoin. Only specific types of aggregate functions can be pushed down. And when we try to push aggregate functions down to one side of the Join, there can't be count() and sum() functions on the other side. If all requirements are met, we can generate and push down the Aggregation. The new Aggregation is also transformed and different from the original Aggregation. For example, the columns in the join conditions should be added into GroupByItems.

    Example:

    CREATE TABLE t1(a int, b int);
Join Reorder

Join reorder tries to find the most efficient order to join several tables together. In fact, it's not a rule-based optimization. It makes use of statistics to estimate row counts of join results. We put join reorder in this stage out of convenience.

    Currently, we have implemented two join reorder algorithms: greedy and dynamic programming. The dynamic programming one is not mature enough now and is disabled by default. We focus on the greedy algorithm here.

    There are three files relevant to join reorder. rule_join_reorder.go contains the entry and common logic of join reorder. rule_join_reorder_dp.go contains the dynamic programming algorithm. rule_join_reorder_greedy.go contains the greedy algorithm.

At the beginning of join reorder, we extract "join groups" from the plan tree. A join group is a set of sub-trees connected together directly by inner Joins, which means there can't be any other kind of operator between the inner Joins. The join reorder algorithm optimizes one join group at a time, and join groups are optimized from bottom to top.

    For every node in a join group, the row count is estimated. The join result row count is estimated using the simple and classic leftRowCount * rightRowCount / max(leftNDV, rightNDV) formula. Then two of them, which can get the minimum cost (calculated in (*baseSingleGroupJoinOrderSolver).baseNodeCumCost()), are chosen, connected by an inner join, then added into the join group. This process is repeated until all nodes in the join group are joined together.
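A small sketch of the quoted row-count formula and one greedy step follows, with made-up numbers; the real cost in (*baseSingleGroupJoinOrderSolver).baseNodeCumCost() is more elaborate, so treat this only as the flavor of the algorithm.

package main

import "fmt"

// node is a toy join-group member with an estimated row count and a rough
// NDV for the join key.
type node struct {
	name     string
	rowCount float64
	ndv      float64
}

// estimateJoinRows implements leftRowCount * rightRowCount / max(leftNDV, rightNDV).
func estimateJoinRows(l, r node) float64 {
	maxNDV := l.ndv
	if r.ndv > maxNDV {
		maxNDV = r.ndv
	}
	return l.rowCount * r.rowCount / maxNDV
}

// greedyPick shows one step of a greedy strategy: among all pairs in the join
// group, pick the pair whose estimated join result is the smallest.
func greedyPick(group []node) (int, int, float64) {
	bi, bj, best := -1, -1, -1.0
	for i := 0; i < len(group); i++ {
		for j := i + 1; j < len(group); j++ {
			if rows := estimateJoinRows(group[i], group[j]); best < 0 || rows < best {
				bi, bj, best = i, j, rows
			}
		}
	}
	return bi, bj, best
}

func main() {
	group := []node{{"t1", 10000, 100}, {"t2", 500, 500}, {"t3", 200000, 1000}}
	i, j, rows := greedyPick(group)
	fmt.Printf("join %s and %s first, estimated %.0f rows\n", group[i].name, group[j].name, rows)
}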

    Build Key Information

    This one is actually not an optimization rule. It collects information from bottom to top that is needed by other optimizations. Two kinds of information are collected and set up for each operator.

The first piece of information is the unique key. This is collected in DataSource from table schema information and stored as KeyInfo in the Schema for each operator. There is one tricky thing about the unique key: when you declare UNIQUE for one column when creating a table, there can actually be duplicated NULLs in this column. You should declare UNIQUE NOT NULL to get "true" uniqueness.

The second is the MaxOneRow attribute, which indicates whether this operator is guaranteed to output no more than one row.

    Ending

Currently, our rule-based optimization is a batch of rules executed in a fixed order. This is not enough to make some optimizations when the query is complicated. So we usually do more things than what the name of a rule implies. As stated above, we specially optimize Joins in predicate pushdown. Besides that, we also try to eliminate aggregations in aggregation pushdown and build key information for the newly generated Aggregations. There are more examples like that.

    Derive Statistics

The entry function of deriving statistics is the baseLogicalPlan.recursiveDeriveStats function. It is used to pass the statistics of the logical plan from the bottom up. Within each operator, the statistics are kept for subsequent cost calculations.

For each operator, the logic for deriving statistics is in the DeriveStats(childStats []*property.StatsInfo, ...) (*property.StatsInfo, error) method of the LogicalPlan interface. Their specific implementations are in the planner/core/stats.go file.

The function calculates its own statistics based on the statistics of the child nodes. Each operator needs to save its statistics into the property.StatsInfo structure, whose main fields include RowCount (the number of rows), ColNDVs (the NDV of each column), and HistColl (the histogram; only the DataSource keeps this). You can read the Table Statistics chapter to get more information about statistics.

In addition, we need to pay more attention to the implementation of the DataSource.DeriveStats function. DataSource.DeriveStats shows where the plan's statistics originally came from. Other operators have their own special implementations, and you can read them when you need to.
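A toy sketch of bottom-up statistics derivation follows, assuming the selectivity has already been estimated elsewhere; the struct below only mimics the spirit of property.StatsInfo (a row count plus per-column NDVs), not its real definition.

package main

import "fmt"

// statsInfo is a toy stand-in for property.StatsInfo.
type statsInfo struct {
	rowCount float64
	colNDVs  map[string]float64
}

// deriveSelectionStats sketches the general idea: an operator computes its own
// statistics from its child's statistics. Here a Selection scales the row
// count by an (assumed, pre-computed) selectivity and caps the NDVs.
func deriveSelectionStats(child statsInfo, selectivity float64) statsInfo {
	out := statsInfo{rowCount: child.rowCount * selectivity, colNDVs: make(map[string]float64)}
	for col, ndv := range child.colNDVs {
		if ndv > out.rowCount {
			ndv = out.rowCount
		}
		out.colNDVs[col] = ndv
	}
	return out
}

func main() {
	ds := statsInfo{rowCount: 10000, colNDVs: map[string]float64{"a": 100, "b": 8000}}
	sel := deriveSelectionStats(ds, 0.1)
	fmt.Printf("%.0f rows, NDVs: %v\n", sel.rowCount, sel.colNDVs) // 1000 rows, NDVs: map[a:100 b:1000]
}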

    Prepare Properties

The entry function of preparing properties is the preparePossibleProperties(logic) function. It is used to generate the possible physical properties of each operator from the bottom up. It prunes property-related paths that do not need to be considered, so that impossible paths can be cut as early as possible while searching the physical plan, thereby speeding up the search.

    For each operator, the logic for preparing properties is in PreparePossibleProperties(schema *expression.Schema, childrenProperties ...[][]*expression.Column) [][]*expression.Column method of the LogicalPlan interface. Their specific implementation is in the planner/core/property_cols_prune.go file.

The preparePossibleProperties function is only used for Join and Aggregation operators, because only these two operators have corresponding sort-based physical implementations. The properties originally come from the DataSource operator, such as the corresponding primary key and index information in the DataSource operator. They are passed from bottom to top, so that each operator in the plan gets its own properties. Besides, some operators with a sorting effect can still generate properties for transmission, such as the Sort operator and the Window operator with ORDER BY.

    We will illustrate with examples:

Find Best Task

    The logical plan corresponding to this SQL is as follows:

    logical-plan

Then we will combine the following picture and code for a detailed introduction to how the baseLogicalPlan.findBestTask function works.

In the figure, a black font operator is a logical operator (e.g., Agg, Join and DS), a blue font operator is a physical operator (e.g., Stream Agg, Hash Join and Index Scan(a)), and a yellow arrow marks an operator for which the cost has been calculated. A red dashed arrow marks an operator that does not satisfy the required prop. The font on an arrow describes the property requirements for the lower operators (e.g., s.a means the output of the lower operator needs to be ordered according to s.a).

    1. Step1: The getTask function corresponds to the yellow arrow in the figure, which means that the calculated part can be used directly without repeated calculations.

    find-best-plan

    Exhaust Physical Plans

For each operator, the logic for exhausting physical plans is in the exhaustPhysicalPlans(*property.PhysicalProperty) (physicalPlans []PhysicalPlan) method of the LogicalPlan interface. Their specific implementations are in the planner/core/exhaust_physical_plans.go file.

    exhaustPhysicalPlans generates all possible plans that can match the required property. Different operators have different implementations. You can learn more about how different logical operators generate corresponding physical operators based on the required property by reading the code.

    Enumerate Physical Plans For Task

Enumerate Physical Plans For Task will use the dynamic programming algorithm to select the lowest-cost plan among all generated physical plans, and it will return a task which encapsulates the selected physical plan. A task is a new version of a physical plan: it stores cost information for the physical plan. You can see the interface definition of task in the planner/core/task.go file. A task may be a copTask, a rootTask, or an mppTask. copTask is a task that runs in the distributed kv store. rootTask is a task that runs in the TiDB compute layer. mppTask is a task that is related to the MPP plan.

After we find the best children tasks, we use pp.attach2Task to combine them with the current physical operator pp. The attach2Task(...task) task method of the PhysicalPlan interface makes the current physical plan the father of the task's physical plan and updates the cost of the current task. For each operator, you can see their specific implementations in the planner/core/task.go file.

In addition, the cost of one implementation is calculated as a sum of its resource consumptions, including CPU, Memory, Network, IO, etc. For each kind of resource, the consumption is measured based on a unit factor (e.g., scanFactor is the unit factor for IO consumption, which means the cost of scanning 1 byte of data on TiKV or TiFlash) and the estimated number of rows/bytes to be processed by this operator. Note that these unit factors, defined in the sessionctx/variable/session.go file, can be customized by setting system variables like tidb_opt_xxx_factor to fit clusters of different hardware configurations.
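A minimal sketch of this cost-as-weighted-sum idea is shown below, with invented factor names and values; the real factors and formulas live in the planner and differ per operator.

package main

import "fmt"

// costFactors is a toy set of unit factors, in the spirit of the
// tidb_opt_xxx_factor variables mentioned above (names and values made up).
type costFactors struct {
	cpuFactor  float64 // cost of processing one row in TiDB
	scanFactor float64 // cost of scanning one byte on the storage layer
	netFactor  float64 // cost of transferring one byte over the network
}

// scanCost sums the per-resource consumptions: rows processed on CPU, bytes
// scanned on disk, bytes sent over the network.
func scanCost(f costFactors, rows, scanBytes, netBytes float64) float64 {
	return rows*f.cpuFactor + scanBytes*f.scanFactor + netBytes*f.netFactor
}

func main() {
	f := costFactors{cpuFactor: 3, scanFactor: 1.5, netFactor: 1}
	fmt.Println(scanCost(f, 10000, 10000*64, 10000*16)) // 30000 + 960000 + 160000 = 1.15e+06
}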


    The main function of handling execute is common_plans.go:Execute.getPhysicalPlan().

    Plan Rebuilding

A cached plan cannot be reused directly unless it is rebuilt. The main goal of rebuilding is to re-calculate the access range.

For example, if the query is select * from t where a<? and you first execute it with 1, a TableScan with range (-INF, 1) is generated and cached. When you later execute it with 2, the range has to be re-calculated to (-INF, 2) so that it reads the correct data this time, and that is what plan rebuilding does.

    plan-cache-rebuilding

    The entry function of plan rebuilding is common_plans.go:Execute.RebuildPlan().
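A toy sketch of the rebuilding idea for this exact example (select * from t where a<?) is given below: the cached plan shape stays, and only the range is recomputed from the current parameter. The type and method here are invented for illustration and are not the RebuildPlan implementation.

package main

import "fmt"

// cachedScan is a toy cached plan for "select * from t where a < ?": the shape
// (a TableScan with an upper-bounded range) is reusable, but the concrete
// range depends on the parameter.
type cachedScan struct {
	upperExclusive int64
}

// rebuildRange recomputes the access range from the current parameter before
// the cached plan is reused.
func (s *cachedScan) rebuildRange(param int64) {
	s.upperExclusive = param
}

func main() {
	plan := &cachedScan{}
	for _, param := range []int64{1, 2} {
		plan.rebuildRange(param)
		fmt.Printf("TableScan range: (-INF, %d)\n", plan.upperExclusive)
	}
}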


    Parameter Makers are used to propagate current parameter values to executors(operators).

    For example, if the query is select * from t where a+?>1, then the filter a+?>1 will be converted to a Selection operator with an expression-tree a+?>1:

    plan-cache-parameter

The parameter placeholder (?) is converted to a ParamMaker in Constant.

    You can regard it as a special kind of pointer, and when the plan is processed and the parameter's value is needed, we can use it to get the corresponding value of this parameter.

    type Constant struct {
         ...

It's easy to understand how the Volcano model works for single-process execution. For parallelism issues, the Volcano model introduces an operator called Exchange at any desired point in a plan tree. Further explanation of the parallelism-related issues will be given in the Parallel Execution Framework section.

    Vectorized Execution

    Vectorization uses the Volcano iteration model where each operator has a Next method that produces result tuples. However, each Next call fetches a block of tuples instead of just one tuple.

The main principle of vectorized execution is batched execution on a columnar data representation: every "work" primitive function that manipulates data does not work on a single data item, but on a vector (an array) of such data items that represents multiple tuples. The idea behind vectorized execution is to amortize the iterator call overhead by performing as much as possible inside the data manipulation methods. For example, this work can be to hash 1000s of values, compare 1000s of string pairs, update 1000 aggregates, or fetch 1000 values from 1000s of addresses.
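A minimal sketch of such a work primitive is shown below, assuming a plain int64 slice stands in for a Chunk column: one call processes the whole vector in a tight loop instead of one value per iterator call.

package main

import "fmt"

// mulInt64Vec is a toy "work primitive" in the vectorized style: it multiplies
// a whole column of int64 values by a constant in one pass, amortizing the
// per-call overhead over the batch.
func mulInt64Vec(input []int64, factor int64, result []int64) {
	for i, v := range input {
		result[i] = v * factor
	}
}

func main() {
	col := []int64{1, 2, 3, 4, 5}
	res := make([]int64, len(col))
	mulInt64Vec(col, 10, res)
	fmt.Println(res) // [10 20 30 40 50]
}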

Columnar

Different from the row-oriented data representation, the columnar format organizes data by column rather than by row. By storing data in columns rather than rows, the database can more precisely access the data it needs to answer a query rather than scanning and discarding unwanted data in rows. The in-memory columnar data representation in TiDB is defined as Chunk, which is inspired by Apache Arrow.

    The detailed definition and usage of Chunk will be introduced in the Implementation of Vectorized Execution section.

    Memory Management Mechanism


    Vectorized Execution

    In vectorized execution, the interface to evaluate an expression in a batch manner in TiDB looks like this:

    type VecNode interface {

    How operators are evaluated

In this section, we'll dive deeper into the evaluation of operators, focusing on HashJoin as an example.

    HashJoin in vectorized execution consists of the following steps:

    Hashing Phase

Let's consider the table used for constructing the hash table to be 't'. The data from table 't' is read into Chunks in batches. First, the data in the Chunk is filtered column by column based on the predicates on table 't'. The filtered results for these columns are then combined into a selected array. In the selected array, true values indicate valid rows. The relevant code is available in VectorizedFilter.
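A toy sketch of the selected-array idea follows, assuming a plain int64 slice stands in for a Chunk column: each predicate is evaluated over the whole batch and AND-ed into the selected bitmap instead of materializing filtered rows immediately. The function is invented for illustration, not the VectorizedFilter code.

package main

import "fmt"

// applyFilter evaluates a predicate over a batch of rows (here a single int64
// column) and marks which rows survive, combining with any earlier filters.
func applyFilter(col []int64, pred func(int64) bool, selected []bool) {
	for i, v := range col {
		selected[i] = selected[i] && pred(v)
	}
}

func main() {
	a := []int64{5, 12, 7, 20}
	selected := []bool{true, true, true, true}
	applyFilter(a, func(v int64) bool { return v > 6 }, selected)  // predicate on column a
	applyFilter(a, func(v int64) bool { return v < 15 }, selected) // another predicate, AND-combined
	fmt.Println(selected) // [false true true false]
}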

b. If the processing key doesn't exist in the Map, spill the data to disk.

• After all data have been processed, output the aggregate results in the map and clear the map. Then read the spilled data from disk and repeat Step 1 to Step 3 until all data gets aggregated.
As we can see, unlike other spilling implementations, AggSpillDiskAction does not make memory usage drop immediately, but keeps it from growing further.

    Log/Cancel

    When the above methods cannot control the memory within the threshold, we will try to use PanicOnExceed to cancel the SQL or use LogOnExceed to log the SQL info.

    Implementation of Typical Operators


    Probe stage

  • In the absence of potential matching rows, the onMissMatch function is invoked. Otherwise, the tryToMatch function is executed.
Transaction

The transaction engine in TiDB is responsible for providing ACID guarantees for all the read and write requests. It consists of the client/coordinator part in the TiDB repository and the server/participant part in the TiKV repository.

    This document is mainly about the TiDB part.

    The Architecture

    In TiDB the transaction write flow is like this:

The Interface

    These are common interfaces the transaction will provide.

For example, Commit will be used to commit the current ongoing transaction. The transaction is considered ongoing before the Commit operation is triggered. The two-phase commit processing will be used to commit a transaction and it will finally become committed or aborted.

LazyTxn is a wrapper of the transaction implementations. When the SQL statements are executed using a standalone session context, LazyTxn will be used to do things like:

    • Return the memory buffer for write.

The received transaction requests will be translated into transaction commands. Then the transaction scheduler handles these transaction commands: it first tries to fetch the needed key latches (a latch is used to sequence all the transaction commands on the same key), then tries to fetch a storage snapshot for the current transaction processing.

The task will be processed as a future. The future processing is done in the transaction scheduler thread-pool. Usually, there will be some work like conflict and constraint checks and write mutation generation. For example, processing a prewrite request needs to check if there is already a conflicting lock or a conflicting committed write record.

      Transaction Log Replication

In TiDB, the key space is split into different ranges or regions. Each region is a raft group and its leader will be responsible for handling its key range related read/write requests.

If the transaction command processing in the transaction scheduler is successful, the generated transaction writes will be written into the raft log engine by the region leaders in raftStore (the raft store will be introduced in other documents in detail). The work flow is like this:

      RaftStore

The writes generated by transaction commands will be sent to the raft peer message task queue first, then the raft batch system will poll each raft peer and handle these requests in the raft thread-pool. After the raft logs are persisted on a majority of the raft group members, they are regarded as committed. Then the corresponding apply task is delivered to the apply worker pool to apply the actual write contents to the storage engine. After that, the transaction command processing is considered successful and the callback is invoked to respond with OK results to the RPC client.


{user_key}{commit_ts} => write_info

After prewrite, the lock-corresponding records for the transaction will be written into the storage. Reads and writes that encounter the "locked" key need to consider whether it's safe to bypass the lock or whether they must try to resolve it. As commit_ts is part of the stored key, there could be different historical versions for it, and GC is responsible for cleaning up all the old versions which are no longer needed. The GC mechanism will be introduced in another document.

      Transaction Recovery

In TiDB, the transaction coordinator (in tidb-server) is stateless and it will not persist any information. If the transaction coordinator fails, for example if the tidb-server crashes, the transaction context in memory is lost, and as the coordinator is gone the normal commit processing will stop. How do we recover the transaction state and decide whether it should commit or abort?

Actually, there is no special mechanism to recover undetermined-status transactions. The recovery is done by other concurrent conflicting transactions, which help decide the actual states of the undetermined-status transactions. The lock resolve process will be triggered if the current ongoing transaction encounters locks of other transactions while doing reads or writes. The whole resolve process will be introduced in other documents in detail.

      Transaction Optimizations

Normally the transaction commit needs two phases, the prewrite phase and the commit phase. Under certain circumstances the transaction commit can be done in a single phase, for example when the generated transaction mutations can be processed by a single region leader. This optimization is called one-phase commit in TiDB.

The function (e *SimpleExec) executeBegin does the main work for a "BEGIN" statement. The important comments and simplified code are as follows. The complete code is here.

/*
Check that the syntax "start transaction read only" and "as of timestamp" is used correctly.
If a stale read timestamp was set, create a new stale read transaction, set the "in transaction" state, and return.
Create a new transaction and set some properties like snapshot, startTS, etc.
*/

func (e *SimpleExec) executeBegin(ctx context.Context, s *ast.BeginStmt) error {
    if s.ReadOnly {
        // the statement "start transaction read only" must be used with tidb_enable_noop_functions set to true
        // the statement "start transaction read only as of timestamp" can be used whether tidb_enable_noop_functions is true or false, but tx_read_ts mustn't be set
        // the statement "start transaction read only as of timestamp" must ensure the timestamp is in the legal safe point range
        enableNoopFuncs := e.ctx.GetSessionVars().EnableNoopFuncs
        if !enableNoopFuncs && s.AsOf == nil {
            return expression.ErrFunctionsNoopImpl.GenWithStackByArgs("READ ONLY")
        }

        if s.AsOf != nil {
            if e.ctx.GetSessionVars().TxnReadTS.PeakTxnReadTS() > 0 {
                return errors.New("start transaction read only as of is forbidden after set transaction read only as of")
            }
        }
    }

      DML Executed In Optimistic Transaction

There are three kinds of DML operations: update, delete and insert. For simplicity, this article only describes the update statement execution process. DML mutations are not sent to TiKV directly; they are buffered in TiDB temporarily, and the commit operation makes the mutations effective at last.

The main function stack to execute an update statement such as "update t1 set id2 = 2 where pk = 1" is as follows.

      (a *ExecStmt) Exec
       ​    (a *ExecStmt) handleNoDelay
       ​        (a *ExecStmt) handleNoDelayExecutor

      Commit Optimistic Transaction

Committing a transaction includes two phases, "prewrite" and "commit", which are explained separately below. The function (c *twoPhaseCommitter) execute does the main work for committing the transaction. The important comments and simplified code are as follows. The complete code is here.

/*
do the "prewrite" operation first
if OnePC transaction, return immediately
if AsyncCommit transaction, commit the transaction asynchronously and return
if not OnePC and AsyncCommit transaction, commit the transaction synchronously.
*/

func (c *twoPhaseCommitter) execute(ctx context.Context) (err error) {
    // do the "prewrite" operation
    c.prewriteStarted = true

    start := time.Now()

As described in other sections, TiDB's transaction system is based on Google's Percolator algorithm, which makes it necessary to resolve locks when a reading transaction meets locked keys or during GC.

      So we introduced Lock Resolver in TiDB to resolve locks.

      The Data Structure

Lock Resolver is a quite simple struct:

      type LockResolver struct {
       	store storage
       	mu    struct {

      TiKV part

      collapse continuous rollbacks (tikv#3290)

Instead of keeping all rollbacks in the write column family when a key is rolled back several times, this PR collapses continuous rollbacks and only keeps the latest rollback.

      protect primary locks (tikv#5575)
After we have pessimistic locks in TiKV, if the rollback record of the primary lock is collapsed and TiKV receives stale acquire_pessimistic_lock and prewrite requests, the transaction will commit successfully even if secondary locks are rolled back. To solve this problem, we can prevent the rollback records of the primary key of pessimistic transactions from being collapsed, by setting the short value of these rollback records to "protected".

      gc_fence (tikv#9207)

      See the super detailed comment here.

After understanding these, you can finally understand the code:

// In this case, if the current transaction is to be rolled back, the `overlapped_write` must not be overwritten.
TxnCommitRecord::None { overlapped_write } => overlapped_write,
TxnCommitRecord::SingleRecord { write, .. } if write.write_type != WriteType::Rollback => {
    panic!("txn record found but not expected: {:?}", txn)
}
_ => return txn.unlock_key(key, is_pessimistic_txn),
};

      Prewrite

    c.mu.Unlock()
}
However, if any response of prewrite is finally lost due to RPC reasons, it is impossible for us to know whether the prewriting succeeds. And it also means we cannot know whether the transaction succeeds. In this case, we can only return an "undetermined error" and the client connection will be closed:

      defer func() {
           if err != nil {
               // If we fail to receive response for async commit prewrite, it will be undetermined whether this

      But don't worry, this does not happen very often. It is safe to retry a prewrite which temporarily fails due to network reasons. The above problem only happens if a prewrite request has been sent, but later retries all fail due to RPC errors.

      Commit

The whole commit process is done asynchronously in the background. This is why the optimization is called "async commit":

      if c.isAsyncCommit() {
           // For async commit protocol, the commit is considered success here.
           c.txn.commitTS = c.commitTS

      TiKV part

      Concurrency manager

      As discussed in the theory blog, TiKV needs to record the max TS and set some memory locks for ongoing prewrite requests.

For simplicity, we use a global component to implement it. We call it the "concurrency manager".

      The methods provided by the concurrency manager can be found in this file.

      It is very easy to update the max TS. It's just an atomic operation:

      #![allow(unused)]

      Prewrite

      The operation is done while locked to guarantee the atomicity of getting the max TS and setting the min_commit_ts in the lock.

      The key guard is saved until the lock is successfully written into RocksDB. Before that, readers are able to check the locks in order not to break any constraint. We can release the guard to remove the lock in the memory table after the readers can read the lock from the RocksDB.

      Fallback to non-async commit

The client may provide a max_commit_ts constraint. If the calculated min_commit_ts is larger than the max_commit_ts, we need to fall back to non-async commit.

      When the CommitTsTooLarge error happens, the lock will still be written, but in the lock there will be no use_async_commit flag and no secondary keys will be recorded:

      #![allow(unused)]
       fn main() {

      becomes leader or a region is merged.

      Before the max TS is updated, the corresponding region is not allowed to proceed an async-commit prewrite. The property is checked here.

      Summary

This is how the "async commit" optimization is implemented in TiDB.

      Due to limited space, some subtle problems such as non-unique timestamps and the compatibility with follower read are not involved.

      During the implementation of async commit, many problems blocking one-phase commit (1PC) are solved. So it becomes relatively easy to implement 1PC in TiDB. The next document will introduce the implementation details of 1PC.

      1PC


      TiDB Part

    • doActionOnGroupMutations
      • Divide mutations into batches
• ⭐ checkOnePCFallBack
      • doActionOnBatches

There is an inner table named mysql.tidb in TiDB which stores a lot of runtime information. Actually, these variables are stored in TiKV as common KV variables. We'll talk about the usage of these variables in the GC workflow later.

MySQL [test]> select * from mysql.tidb where variable_name like "tikv_gc_%";
         +--------------------------+----------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+
         | VARIABLE_NAME            | VARIABLE_VALUE                                                                         | COMMENT                                                                                     |
         +--------------------------+----------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+

| tikv_gc_last_run_time    | 20210929-17:03:35 +0800 | The time when last GC starts. (DO NOT EDIT)                                                 |
| tikv_gc_safe_point       | 20210929-16:53:35 +0800 | All versions after safe point can be accessed. (DO NOT EDIT)                                |
| tikv_gc_auto_concurrency | true                    | Let TiDB pick the concurrency automatically. If set false, tikv_gc_concurrency will be used |
| tikv_gc_scan_lock_mode   | legacy                  | Mode of scanning locks, "physical" or "legacy"                                              |
| tikv_gc_mode             | distributed             | Mode of GC, "central" or "distributed"                                                      |
+--------------------------+----------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+
11 rows in set (0.003 sec)
GC Safepoint

	if err := s.storage.SaveGCSafePoint(newSafePoint); err != nil {
		return nil, err
	}
	log.Info("updated gc safe point",
		zap.Uint64("safe-point", newSafePoint))
} else if newSafePoint < oldSafePoint {
	log.Warn("trying to update gc safe point",
		zap.Uint64("old-safe-point", oldSafePoint),
		zap.Uint64("new-safe-point", newSafePoint))
	newSafePoint = oldSafePoint
}


      The state of the variables is stored in sessionVars. The raw string values are stored in a map named systems. This string value is used for persistence in the mysql.global_variables table.

      For many variables, as well as a string value there is a typed field in sessionVars. For example:

SessionVars.systems["tidb_skip_utf8_check"] (string) maps to SessionVars.SkipUTF8Check (bool).

      The typed value is set when the SetSession attached to the system variable definition is called. For tidb_skip_utf8_check this is as follows:

      {Scope: ScopeGlobal | ScopeSession, Name: TiDBSkipUTF8Check, Value: BoolToOnOff(DefSkipUTF8Check), Type: TypeBool, SetSession: func(s *SessionVars, val string) error {
       	s.SkipUTF8Check = TiDBOptOn(val)
Because GLOBAL scoped variables are propagated to other servers, TiDB also has a special concept of "instance-scoped variables". An instance scoped variable is actually a SESSION scoped variable that has a GetSession method which returns data that is specific to an instance. For example, tidb_general_log:

      {Scope: ScopeSession, Name: TiDBGeneralLog, Value: BoolToOnOff(DefTiDBGeneralLog), Type: TypeBool, skipInit: true, SetSession: func(s *SessionVars, val string) error {
       	ProcessGeneralLog.Store(TiDBOptOn(val))
       	return nil

      The decision to make an option such as tidb_general_log instance scoped is because it references a file on the local filesystem. This may create issues when global, as the path may not be writable on each tidb-server in the cluster.

As you can see by the Scope: Session, instance-scoped variables are not natively handled by the sysvar framework, but are instead denoted by the GetSession() function reading from a global location. The documentation for tidb_general_log also notes it as "instance" scoped by convention.

      Transaction state

      The session struct (s.txn) is responsible for keeping modified key-value pairs in a LazyTxn until the transaction commits. A commit statement only sets the session variable state that it is no longer in an active transaction:

      func (e *SimpleExec) executeCommit(s *ast.CommitStmt) {
      if !sessVars.InTxn() {
       	if err := se.CommitTxn(ctx); err != nil {
       		if _, ok := sql.(*executor.ExecStmt).StmtNode.(*ast.CommitStmt); ok {
			err = errors.Annotatef(err, "previous statement: %s", se.GetSessionVars().PrevStmt)
       		}
       		return err
       	}

      Manually checking with the privilege manager

      For (2) above, manual checks should follow the following pattern:

checker := privilege.GetPrivilegeManager(e.ctx)
if checker != nil && !checker.RequestVerification(ctx.GetSessionVars().ActiveRoles, schema.Name.L, table.Name.L, "", mysql.AllPrivMask) {
    /* do something */
}
..
if checker == nil || !checker.RequestDynamicVerification(ctx.GetSessionVars().ActiveRoles, "RESTRICTED_TABLES_ADMIN", false) {
    /* do something */
}

      Static and dynamic privileges

      Privileges fall into two categories:

• Static privileges: These are the "traditional" privileges such as INSERT, UPDATE, SELECT, DELETE, SUPER, PROCESS which have existed in MySQL for a long time. They can usually be assigned to a user on either a global or database/table level.
• Dynamic privileges: These are new privileges such as BACKUP_ADMIN, RESTORE_ADMIN, CONNECTION_ADMIN. They can only be assigned on a global level, and each has its own "grantable" attribute.

      Dynamic privileges were introduced in MySQL 8.0 (and TiDB 5.1) to solve a specific issue, which is that the SUPER privilege is too coarse. There are many scenarios where a user needs to be assigned the SUPER privilege to perform a specific action, but too many other privileges are granted at the same time.

Any statements added to TiDB should no longer require the SUPER privilege directly. Instead, a dynamic privilege should be added which will be satisfied by the SUPER privilege.

      Security Enhanced Mode

TiDB features an extension to MySQL called Security Enhanced Mode (SEM), which is disabled by default. One of the main aims of SEM is to reduce the privileges of SUPER and require specific "restricted" dynamic privileges instead. The design is inspired by features such as "Security Enhanced Linux" (SELinux) and AppArmor.

      SEM plugs directly into the privilege manager, but the hard coded list of restricted objects lives in ./util/sem/*. It is expected that over time SEM will protect against additional operations which are considered to be high risk or too broad.
