From 5984f1ec6f49d8a2e5ae3a362f9c71efe1c8c36e Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Nov 2022 21:00:42 -0800 Subject: [PATCH 01/21] Rename `"model_state_dict"` to `"model"` --- .../train/examples/pytorch/tune_cifar_torch_pbt_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py index 71472227a249..46ea6ab3947a 100644 --- a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py @@ -83,7 +83,7 @@ def train_func(config): checkpoint_dict = session.get_checkpoint().to_dict() # Load in model - model_state = checkpoint_dict["model_state_dict"] + model_state = checkpoint_dict["model"] model.load_state_dict(model_state) # Load in optimizer @@ -146,7 +146,7 @@ def train_func(config): checkpoint = Checkpoint.from_dict( { "epoch": epoch, - "model_state_dict": model.state_dict(), + "model": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), } ) From 8f58490187070bc8122629f97f9c9ff07b716f85 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Nov 2022 21:00:49 -0800 Subject: [PATCH 02/21] Revert "Rename `"model_state_dict"` to `"model"`" This reverts commit 5984f1ec6f49d8a2e5ae3a362f9c71efe1c8c36e. 
--- .../train/examples/pytorch/tune_cifar_torch_pbt_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py index 46ea6ab3947a..71472227a249 100644 --- a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py @@ -83,7 +83,7 @@ def train_func(config): checkpoint_dict = session.get_checkpoint().to_dict() # Load in model - model_state = checkpoint_dict["model"] + model_state = checkpoint_dict["model_state_dict"] model.load_state_dict(model_state) # Load in optimizer @@ -146,7 +146,7 @@ def train_func(config): checkpoint = Checkpoint.from_dict( { "epoch": epoch, - "model": model.state_dict(), + "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), } ) From de05655b003c96b3cb9194e6cf21155e04ee22f5 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 26 Jan 2023 11:56:49 -0800 Subject: [PATCH 03/21] Update annotations.py Signed-off-by: Balaji Veeramani --- python/ray/util/annotations.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/ray/util/annotations.py b/python/ray/util/annotations.py index 9996b092fcab..f7b93746f910 100644 --- a/python/ray/util/annotations.py +++ b/python/ray/util/annotations.py @@ -49,7 +49,7 @@ def PublicAPI(*args, **kwargs): def wrap(obj): if stability in ["alpha", "beta"]: message = ( - f"PublicAPI ({stability}): This API is in {stability} " + f"**PublicAPI ({stability}):** This API is in {stability} " "and may change before becoming stable." ) else: @@ -80,7 +80,8 @@ def DeveloperAPI(*args, **kwargs): def wrap(obj): _append_doc( - obj, message="DeveloperAPI: This API may change across minor Ray releases." 
+ obj, + message="**DeveloperAPI:** This API may change across minor Ray releases.", ) _mark_annotated(obj) return obj From fd2ff917e1cc3258554c56b283db8e8e155cff9a Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 26 Jan 2023 12:02:30 -0800 Subject: [PATCH 04/21] Revert "Update annotations.py" This reverts commit de05655b003c96b3cb9194e6cf21155e04ee22f5. Signed-off-by: Balaji Veeramani --- python/ray/util/annotations.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/ray/util/annotations.py b/python/ray/util/annotations.py index f7b93746f910..9996b092fcab 100644 --- a/python/ray/util/annotations.py +++ b/python/ray/util/annotations.py @@ -49,7 +49,7 @@ def PublicAPI(*args, **kwargs): def wrap(obj): if stability in ["alpha", "beta"]: message = ( - f"**PublicAPI ({stability}):** This API is in {stability} " + f"PublicAPI ({stability}): This API is in {stability} " "and may change before becoming stable." ) else: @@ -80,8 +80,7 @@ def DeveloperAPI(*args, **kwargs): def wrap(obj): _append_doc( - obj, - message="**DeveloperAPI:** This API may change across minor Ray releases.", + obj, message="DeveloperAPI: This API may change across minor Ray releases." 
) _mark_annotated(obj) return obj From f4ddcc986eadb86f4587903357aabc6d3ff05a48 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 23 May 2023 11:20:37 -0700 Subject: [PATCH 05/21] Initial commit Signed-off-by: Balaji Veeramani --- .github/styles/Google/AMPM.yml | 9 + .github/styles/Google/Acronyms.yml | 64 +++++++ .github/styles/Google/Colons.yml | 8 + .github/styles/Google/Contractions.yml | 30 +++ .github/styles/Google/DateFormat.yml | 9 + .github/styles/Google/Ellipses.yml | 9 + .github/styles/Google/EmDash.yml | 12 ++ .github/styles/Google/EnDash.yml | 13 ++ .github/styles/Google/Exclamation.yml | 9 + .github/styles/Google/FirstPerson.yml | 13 ++ .github/styles/Google/Gender.yml | 9 + .github/styles/Google/GenderBias.yml | 47 +++++ .github/styles/Google/HeadingPunctuation.yml | 13 ++ .github/styles/Google/Headings.yml | 29 +++ .github/styles/Google/Latin.yml | 11 ++ .github/styles/Google/LyHyphens.yml | 14 ++ .github/styles/Google/OptionalPlurals.yml | 12 ++ .github/styles/Google/Ordinal.yml | 7 + .github/styles/Google/OxfordComma.yml | 7 + .github/styles/Google/Parens.yml | 7 + .github/styles/Google/Passive.yml | 184 +++++++++++++++++++ .github/styles/Google/Periods.yml | 7 + .github/styles/Google/Quotes.yml | 7 + .github/styles/Google/Ranges.yml | 7 + .github/styles/Google/Semicolons.yml | 8 + .github/styles/Google/Slang.yml | 11 ++ .github/styles/Google/Spacing.yml | 10 + .github/styles/Google/Spelling.yml | 10 + .github/styles/Google/Units.yml | 8 + .github/styles/Google/We.yml | 11 ++ .github/styles/Google/Will.yml | 7 + .github/styles/Google/WordList.yml | 81 ++++++++ .github/styles/Google/meta.json | 4 + .github/styles/Google/vocab.txt | 0 .github/styles/Vocab/ray/accept.txt | 21 +++ .github/workflows/vale.yml | 0 .vale.ini | 10 + 37 files changed, 718 insertions(+) create mode 100644 .github/styles/Google/AMPM.yml create mode 100644 .github/styles/Google/Acronyms.yml create mode 100644 .github/styles/Google/Colons.yml create mode 100644 
.github/styles/Google/Contractions.yml create mode 100644 .github/styles/Google/DateFormat.yml create mode 100644 .github/styles/Google/Ellipses.yml create mode 100644 .github/styles/Google/EmDash.yml create mode 100644 .github/styles/Google/EnDash.yml create mode 100644 .github/styles/Google/Exclamation.yml create mode 100644 .github/styles/Google/FirstPerson.yml create mode 100644 .github/styles/Google/Gender.yml create mode 100644 .github/styles/Google/GenderBias.yml create mode 100644 .github/styles/Google/HeadingPunctuation.yml create mode 100644 .github/styles/Google/Headings.yml create mode 100644 .github/styles/Google/Latin.yml create mode 100644 .github/styles/Google/LyHyphens.yml create mode 100644 .github/styles/Google/OptionalPlurals.yml create mode 100644 .github/styles/Google/Ordinal.yml create mode 100644 .github/styles/Google/OxfordComma.yml create mode 100644 .github/styles/Google/Parens.yml create mode 100644 .github/styles/Google/Passive.yml create mode 100644 .github/styles/Google/Periods.yml create mode 100644 .github/styles/Google/Quotes.yml create mode 100644 .github/styles/Google/Ranges.yml create mode 100644 .github/styles/Google/Semicolons.yml create mode 100644 .github/styles/Google/Slang.yml create mode 100644 .github/styles/Google/Spacing.yml create mode 100644 .github/styles/Google/Spelling.yml create mode 100644 .github/styles/Google/Units.yml create mode 100644 .github/styles/Google/We.yml create mode 100644 .github/styles/Google/Will.yml create mode 100644 .github/styles/Google/WordList.yml create mode 100644 .github/styles/Google/meta.json create mode 100644 .github/styles/Google/vocab.txt create mode 100644 .github/styles/Vocab/ray/accept.txt create mode 100644 .github/workflows/vale.yml create mode 100644 .vale.ini diff --git a/.github/styles/Google/AMPM.yml b/.github/styles/Google/AMPM.yml new file mode 100644 index 000000000000..fbdc6e4f84b9 --- /dev/null +++ b/.github/styles/Google/AMPM.yml @@ -0,0 +1,9 @@ +extends: existence 
+message: "Use 'AM' or 'PM' (preceded by a space)." +link: 'https://developers.google.com/style/word-list' +level: error +nonword: true +tokens: + - '\d{1,2}[AP]M' + - '\d{1,2} ?[ap]m' + - '\d{1,2} ?[aApP]\.[mM]\.' diff --git a/.github/styles/Google/Acronyms.yml b/.github/styles/Google/Acronyms.yml new file mode 100644 index 000000000000..f41af0189b07 --- /dev/null +++ b/.github/styles/Google/Acronyms.yml @@ -0,0 +1,64 @@ +extends: conditional +message: "Spell out '%s', if it's unfamiliar to the audience." +link: 'https://developers.google.com/style/abbreviations' +level: suggestion +ignorecase: false +# Ensures that the existence of 'first' implies the existence of 'second'. +first: '\b([A-Z]{3,5})\b' +second: '(?:\b[A-Z][a-z]+ )+\(([A-Z]{3,5})\)' +# ... with the exception of these: +exceptions: + - API + - ASP + - CLI + - CPU + - CSS + - CSV + - DEBUG + - DOM + - DPI + - FAQ + - GCC + - GDB + - GET + - GPU + - GTK + - GUI + - HTML + - HTTP + - HTTPS + - IDE + - JAR + - JSON + - JSX + - LESS + - LLDB + - NET + - NOTE + - NVDA + - OSS + - PATH + - PDF + - PHP + - POST + - RAM + - REPL + - RSA + - SCM + - SCSS + - SDK + - SQL + - SSH + - SSL + - SVG + - TBD + - TCP + - TODO + - URI + - URL + - USB + - UTF + - XML + - XSS + - YAML + - ZIP diff --git a/.github/styles/Google/Colons.yml b/.github/styles/Google/Colons.yml new file mode 100644 index 000000000000..99363fbd46d7 --- /dev/null +++ b/.github/styles/Google/Colons.yml @@ -0,0 +1,8 @@ +extends: existence +message: "'%s' should be in lowercase." +link: 'https://developers.google.com/style/colons' +nonword: true +level: warning +scope: sentence +tokens: + - ':\s[A-Z]' diff --git a/.github/styles/Google/Contractions.yml b/.github/styles/Google/Contractions.yml new file mode 100644 index 000000000000..4f6fd5d489dc --- /dev/null +++ b/.github/styles/Google/Contractions.yml @@ -0,0 +1,30 @@ +extends: substitution +message: "Use '%s' instead of '%s'." 
+link: 'https://developers.google.com/style/contractions' +level: suggestion +ignorecase: true +action: + name: replace +swap: + are not: aren't + cannot: can't + could not: couldn't + did not: didn't + do not: don't + does not: doesn't + has not: hasn't + have not: haven't + how is: how's + is not: isn't + it is: it's + should not: shouldn't + that is: that's + they are: they're + was not: wasn't + we are: we're + we have: we've + were not: weren't + what is: what's + when is: when's + where is: where's + will not: won't diff --git a/.github/styles/Google/DateFormat.yml b/.github/styles/Google/DateFormat.yml new file mode 100644 index 000000000000..e9d227fa13d5 --- /dev/null +++ b/.github/styles/Google/DateFormat.yml @@ -0,0 +1,9 @@ +extends: existence +message: "Use 'July 31, 2016' format, not '%s'." +link: 'https://developers.google.com/style/dates-times' +ignorecase: true +level: error +nonword: true +tokens: + - '\d{1,2}(?:\.|/)\d{1,2}(?:\.|/)\d{4}' + - '\d{1,2} (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)|May|Jun(?:e)|Jul(?:y)|Aug(?:ust)|Sep(?:tember)?|Oct(?:ober)|Nov(?:ember)?|Dec(?:ember)?) \d{4}' diff --git a/.github/styles/Google/Ellipses.yml b/.github/styles/Google/Ellipses.yml new file mode 100644 index 000000000000..1e070517bfe4 --- /dev/null +++ b/.github/styles/Google/Ellipses.yml @@ -0,0 +1,9 @@ +extends: existence +message: "In general, don't use an ellipsis." +link: 'https://developers.google.com/style/ellipses' +nonword: true +level: warning +action: + name: remove +tokens: + - '\.\.\.' diff --git a/.github/styles/Google/EmDash.yml b/.github/styles/Google/EmDash.yml new file mode 100644 index 000000000000..1befe72aa881 --- /dev/null +++ b/.github/styles/Google/EmDash.yml @@ -0,0 +1,12 @@ +extends: existence +message: "Don't put a space before or after a dash." 
+link: 'https://developers.google.com/style/dashes' +nonword: true +level: error +action: + name: edit + params: + - remove + - ' ' +tokens: + - '\s[—–]\s' diff --git a/.github/styles/Google/EnDash.yml b/.github/styles/Google/EnDash.yml new file mode 100644 index 000000000000..b314dc4e98ab --- /dev/null +++ b/.github/styles/Google/EnDash.yml @@ -0,0 +1,13 @@ +extends: existence +message: "Use an em dash ('—') instead of '–'." +link: 'https://developers.google.com/style/dashes' +nonword: true +level: error +action: + name: edit + params: + - replace + - '-' + - '—' +tokens: + - '–' diff --git a/.github/styles/Google/Exclamation.yml b/.github/styles/Google/Exclamation.yml new file mode 100644 index 000000000000..eea5fd24bd5f --- /dev/null +++ b/.github/styles/Google/Exclamation.yml @@ -0,0 +1,9 @@ +extends: existence +message: "Don't use exclamation points in text." +link: 'https://developers.google.com/style/exclamation-points' +nonword: true +level: error +action: + name: remove +tokens: + - '\w+!(?:\s|$)' diff --git a/.github/styles/Google/FirstPerson.yml b/.github/styles/Google/FirstPerson.yml new file mode 100644 index 000000000000..0b7b8828ca5f --- /dev/null +++ b/.github/styles/Google/FirstPerson.yml @@ -0,0 +1,13 @@ +extends: existence +message: "Avoid first-person pronouns such as '%s'." +link: 'https://developers.google.com/style/pronouns#personal-pronouns' +ignorecase: true +level: warning +nonword: true +tokens: + - (?:^|\s)I\s + - (?:^|\s)I,\s + - \bI'm\b + - \bme\b + - \bmy\b + - \bmine\b diff --git a/.github/styles/Google/Gender.yml b/.github/styles/Google/Gender.yml new file mode 100644 index 000000000000..c8486181d697 --- /dev/null +++ b/.github/styles/Google/Gender.yml @@ -0,0 +1,9 @@ +extends: existence +message: "Don't use '%s' as a gender-neutral pronoun." 
+link: 'https://developers.google.com/style/pronouns#gender-neutral-pronouns' +level: error +ignorecase: true +tokens: + - he/she + - s/he + - \(s\)he diff --git a/.github/styles/Google/GenderBias.yml b/.github/styles/Google/GenderBias.yml new file mode 100644 index 000000000000..9e7019086302 --- /dev/null +++ b/.github/styles/Google/GenderBias.yml @@ -0,0 +1,47 @@ +extends: substitution +message: "Consider using '%s' instead of '%s'." +link: 'https://developers.google.com/style/inclusive-documentation' +ignorecase: true +level: error +action: + name: replace +swap: + (?:alumna|alumnus): graduate + (?:alumnae|alumni): graduates + air(?:m[ae]n|wom[ae]n): pilot(s) + anchor(?:m[ae]n|wom[ae]n): anchor(s) + authoress: author + camera(?:m[ae]n|wom[ae]n): camera operator(s) + chair(?:m[ae]n|wom[ae]n): chair(s) + congress(?:m[ae]n|wom[ae]n): member(s) of congress + door(?:m[ae]|wom[ae]n): concierge(s) + draft(?:m[ae]n|wom[ae]n): drafter(s) + fire(?:m[ae]n|wom[ae]n): firefighter(s) + fisher(?:m[ae]n|wom[ae]n): fisher(s) + fresh(?:m[ae]n|wom[ae]n): first-year student(s) + garbage(?:m[ae]n|wom[ae]n): waste collector(s) + lady lawyer: lawyer + ladylike: courteous + landlord: building manager + mail(?:m[ae]n|wom[ae]n): mail carriers + man and wife: husband and wife + man enough: strong enough + mankind: human kind + manmade: manufactured + manpower: personnel + men and girls: men and women + middle(?:m[ae]n|wom[ae]n): intermediary + news(?:m[ae]n|wom[ae]n): journalist(s) + ombuds(?:man|woman): ombuds + oneupmanship: upstaging + poetess: poet + police(?:m[ae]n|wom[ae]n): police officer(s) + repair(?:m[ae]n|wom[ae]n): technician(s) + sales(?:m[ae]n|wom[ae]n): salesperson or sales people + service(?:m[ae]n|wom[ae]n): soldier(s) + steward(?:ess)?: flight attendant + tribes(?:m[ae]n|wom[ae]n): tribe member(s) + waitress: waiter + woman doctor: doctor + woman scientist[s]?: scientist(s) + work(?:m[ae]n|wom[ae]n): worker(s) diff --git a/.github/styles/Google/HeadingPunctuation.yml 
b/.github/styles/Google/HeadingPunctuation.yml new file mode 100644 index 000000000000..b538be5b42a2 --- /dev/null +++ b/.github/styles/Google/HeadingPunctuation.yml @@ -0,0 +1,13 @@ +extends: existence +message: "Don't put a period at the end of a heading." +link: 'https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings' +nonword: true +level: warning +scope: heading +action: + name: edit + params: + - remove + - '.' +tokens: + - '[a-z0-9][.]\s*$' diff --git a/.github/styles/Google/Headings.yml b/.github/styles/Google/Headings.yml new file mode 100644 index 000000000000..a53301338a47 --- /dev/null +++ b/.github/styles/Google/Headings.yml @@ -0,0 +1,29 @@ +extends: capitalization +message: "'%s' should use sentence-style capitalization." +link: 'https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings' +level: warning +scope: heading +match: $sentence +indicators: + - ':' +exceptions: + - Azure + - CLI + - Code + - Cosmos + - Docker + - Emmet + - gRPC + - I + - Kubernetes + - Linux + - macOS + - Marketplace + - MongoDB + - REPL + - Studio + - TypeScript + - URLs + - Visual + - VS + - Windows diff --git a/.github/styles/Google/Latin.yml b/.github/styles/Google/Latin.yml new file mode 100644 index 000000000000..ca03b9154b16 --- /dev/null +++ b/.github/styles/Google/Latin.yml @@ -0,0 +1,11 @@ +extends: substitution +message: "Use '%s' instead of '%s'." +link: 'https://developers.google.com/style/abbreviations' +ignorecase: true +level: error +nonword: true +action: + name: replace +swap: + '\b(?:eg|e\.g\.)(?=[\s,;])': for example + '\b(?:ie|i\.e\.)(?=[\s,;])': that is diff --git a/.github/styles/Google/LyHyphens.yml b/.github/styles/Google/LyHyphens.yml new file mode 100644 index 000000000000..ac8f557a4af7 --- /dev/null +++ b/.github/styles/Google/LyHyphens.yml @@ -0,0 +1,14 @@ +extends: existence +message: "'%s' doesn't need a hyphen." 
+link: 'https://developers.google.com/style/hyphens' +level: error +ignorecase: false +nonword: true +action: + name: edit + params: + - replace + - '-' + - ' ' +tokens: + - '\s[^\s-]+ly-' diff --git a/.github/styles/Google/OptionalPlurals.yml b/.github/styles/Google/OptionalPlurals.yml new file mode 100644 index 000000000000..f858ea6fee16 --- /dev/null +++ b/.github/styles/Google/OptionalPlurals.yml @@ -0,0 +1,12 @@ +extends: existence +message: "Don't use plurals in parentheses such as in '%s'." +link: 'https://developers.google.com/style/plurals-parentheses' +level: error +nonword: true +action: + name: edit + params: + - remove + - '(s)' +tokens: + - '\b\w+\(s\)' diff --git a/.github/styles/Google/Ordinal.yml b/.github/styles/Google/Ordinal.yml new file mode 100644 index 000000000000..d1ac7d27e80d --- /dev/null +++ b/.github/styles/Google/Ordinal.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Spell out all ordinal numbers ('%s') in text." +link: 'https://developers.google.com/style/numbers' +level: error +nonword: true +tokens: + - \d+(?:st|nd|rd|th) diff --git a/.github/styles/Google/OxfordComma.yml b/.github/styles/Google/OxfordComma.yml new file mode 100644 index 000000000000..b9ba21ebb25a --- /dev/null +++ b/.github/styles/Google/OxfordComma.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Use the Oxford comma in '%s'." +link: 'https://developers.google.com/style/commas' +scope: sentence +level: warning +tokens: + - '(?:[^,]+,){1,}\s\w+\s(?:and|or)' diff --git a/.github/styles/Google/Parens.yml b/.github/styles/Google/Parens.yml new file mode 100644 index 000000000000..3b8711d0c88f --- /dev/null +++ b/.github/styles/Google/Parens.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Use parentheses judiciously." 
+link: 'https://developers.google.com/style/parentheses' +nonword: true +level: suggestion +tokens: + - '\(.+\)' diff --git a/.github/styles/Google/Passive.yml b/.github/styles/Google/Passive.yml new file mode 100644 index 000000000000..3265890e5202 --- /dev/null +++ b/.github/styles/Google/Passive.yml @@ -0,0 +1,184 @@ +extends: existence +link: 'https://developers.google.com/style/voice' +message: "In general, use active voice instead of passive voice ('%s')." +ignorecase: true +level: suggestion +raw: + - \b(am|are|were|being|is|been|was|be)\b\s* +tokens: + - '[\w]+ed' + - awoken + - beat + - become + - been + - begun + - bent + - beset + - bet + - bid + - bidden + - bitten + - bled + - blown + - born + - bought + - bound + - bred + - broadcast + - broken + - brought + - built + - burnt + - burst + - cast + - caught + - chosen + - clung + - come + - cost + - crept + - cut + - dealt + - dived + - done + - drawn + - dreamt + - driven + - drunk + - dug + - eaten + - fallen + - fed + - felt + - fit + - fled + - flown + - flung + - forbidden + - foregone + - forgiven + - forgotten + - forsaken + - fought + - found + - frozen + - given + - gone + - gotten + - ground + - grown + - heard + - held + - hidden + - hit + - hung + - hurt + - kept + - knelt + - knit + - known + - laid + - lain + - leapt + - learnt + - led + - left + - lent + - let + - lighted + - lost + - made + - meant + - met + - misspelt + - mistaken + - mown + - overcome + - overdone + - overtaken + - overthrown + - paid + - pled + - proven + - put + - quit + - read + - rid + - ridden + - risen + - run + - rung + - said + - sat + - sawn + - seen + - sent + - set + - sewn + - shaken + - shaven + - shed + - shod + - shone + - shorn + - shot + - shown + - shrunk + - shut + - slain + - slept + - slid + - slit + - slung + - smitten + - sold + - sought + - sown + - sped + - spent + - spilt + - spit + - split + - spoken + - spread + - sprung + - spun + - stolen + - stood + - stridden + - striven + - struck + - 
strung + - stuck + - stung + - stunk + - sung + - sunk + - swept + - swollen + - sworn + - swum + - swung + - taken + - taught + - thought + - thrived + - thrown + - thrust + - told + - torn + - trodden + - understood + - upheld + - upset + - wed + - wept + - withheld + - withstood + - woken + - won + - worn + - wound + - woven + - written + - wrung diff --git a/.github/styles/Google/Periods.yml b/.github/styles/Google/Periods.yml new file mode 100644 index 000000000000..d24a6a6c0335 --- /dev/null +++ b/.github/styles/Google/Periods.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Don't use periods with acronyms or initialisms such as '%s'." +link: 'https://developers.google.com/style/abbreviations' +level: error +nonword: true +tokens: + - '\b(?:[A-Z]\.){3,}' diff --git a/.github/styles/Google/Quotes.yml b/.github/styles/Google/Quotes.yml new file mode 100644 index 000000000000..3cb6f1abd182 --- /dev/null +++ b/.github/styles/Google/Quotes.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Commas and periods go inside quotation marks." +link: 'https://developers.google.com/style/quotation-marks' +level: error +nonword: true +tokens: + - '"[^"]+"[.,?]' diff --git a/.github/styles/Google/Ranges.yml b/.github/styles/Google/Ranges.yml new file mode 100644 index 000000000000..3ec045e777d9 --- /dev/null +++ b/.github/styles/Google/Ranges.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Don't add words such as 'from' or 'between' to describe a range of numbers." +link: 'https://developers.google.com/style/hyphens' +nonword: true +level: warning +tokens: + - '(?:from|between)\s\d+\s?-\s?\d+' diff --git a/.github/styles/Google/Semicolons.yml b/.github/styles/Google/Semicolons.yml new file mode 100644 index 000000000000..bb8b85b420ee --- /dev/null +++ b/.github/styles/Google/Semicolons.yml @@ -0,0 +1,8 @@ +extends: existence +message: "Use semicolons judiciously." 
+link: 'https://developers.google.com/style/semicolons' +nonword: true +scope: sentence +level: suggestion +tokens: + - ';' diff --git a/.github/styles/Google/Slang.yml b/.github/styles/Google/Slang.yml new file mode 100644 index 000000000000..63f4c248a841 --- /dev/null +++ b/.github/styles/Google/Slang.yml @@ -0,0 +1,11 @@ +extends: existence +message: "Don't use internet slang abbreviations such as '%s'." +link: 'https://developers.google.com/style/abbreviations' +ignorecase: true +level: error +tokens: + - 'tl;dr' + - ymmv + - rtfm + - imo + - fwiw diff --git a/.github/styles/Google/Spacing.yml b/.github/styles/Google/Spacing.yml new file mode 100644 index 000000000000..66e45a6b72a9 --- /dev/null +++ b/.github/styles/Google/Spacing.yml @@ -0,0 +1,10 @@ +extends: existence +message: "'%s' should have one space." +link: 'https://developers.google.com/style/sentence-spacing' +level: error +nonword: true +action: + name: remove +tokens: + - '[a-z][.?!] {2,}[A-Z]' + - '[a-z][.?!][A-Z]' diff --git a/.github/styles/Google/Spelling.yml b/.github/styles/Google/Spelling.yml new file mode 100644 index 000000000000..527ac07d318c --- /dev/null +++ b/.github/styles/Google/Spelling.yml @@ -0,0 +1,10 @@ +extends: existence +message: "In general, use American spelling instead of '%s'." +link: 'https://developers.google.com/style/spelling' +ignorecase: true +level: warning +tokens: + - '(?:\w+)nised?' + - 'colour' + - 'labour' + - 'centre' diff --git a/.github/styles/Google/Units.yml b/.github/styles/Google/Units.yml new file mode 100644 index 000000000000..53522ab2dea3 --- /dev/null +++ b/.github/styles/Google/Units.yml @@ -0,0 +1,8 @@ +extends: existence +message: "Put a nonbreaking space between the number and the unit in '%s'." 
+link: "https://developers.google.com/style/units-of-measure" +nonword: true +level: error +tokens: + - \b\d+(?:B|kB|MB|GB|TB) + - \b\d+(?:ns|ms|s|min|h|d) diff --git a/.github/styles/Google/We.yml b/.github/styles/Google/We.yml new file mode 100644 index 000000000000..c7ac7d36221d --- /dev/null +++ b/.github/styles/Google/We.yml @@ -0,0 +1,11 @@ +extends: existence +message: "Try to avoid using first-person plural like '%s'." +link: 'https://developers.google.com/style/pronouns#personal-pronouns' +level: warning +ignorecase: true +tokens: + - we + - we'(?:ve|re) + - ours? + - us + - let's diff --git a/.github/styles/Google/Will.yml b/.github/styles/Google/Will.yml new file mode 100644 index 000000000000..128a918362b8 --- /dev/null +++ b/.github/styles/Google/Will.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Avoid using '%s'." +link: 'https://developers.google.com/style/tense' +ignorecase: true +level: warning +tokens: + - will diff --git a/.github/styles/Google/WordList.yml b/.github/styles/Google/WordList.yml new file mode 100644 index 000000000000..0d675f2372a2 --- /dev/null +++ b/.github/styles/Google/WordList.yml @@ -0,0 +1,81 @@ +extends: substitution +message: "Use '%s' instead of '%s'." 
+link: "https://developers.google.com/style/word-list" +level: warning +ignorecase: false +action: + name: replace +swap: + "(?:API Console|dev|developer) key": API key + "(?:cell ?phone|smart ?phone)": phone|mobile phone + "(?:dev|developer|APIs) console": API console + "(?:e-mail|Email|E-mail)": email + "(?:file ?path|path ?name)": path + "(?:kill|terminate|abort)": stop|exit|cancel|end + "(?:OAuth ?2|Oauth)": OAuth 2.0 + "(?:ok|Okay)": OK|okay + "(?:WiFi|wifi)": Wi-Fi + '[\.]+apk': APK + '3\-D': 3D + 'Google (?:I\-O|IO)': Google I/O + "tap (?:&|and) hold": touch & hold + "un(?:check|select)": clear + above: preceding + account name: username + action bar: app bar + admin: administrator + Ajax: AJAX + a\.k\.a|aka: or|also known as + Android device: Android-powered device + android: Android + API explorer: APIs Explorer + application: app + approx\.: approximately + authN: authentication + authZ: authorization + autoupdate: automatically update + cellular data: mobile data + cellular network: mobile network + chapter: documents|pages|sections + check box: checkbox + check: select + CLI: command-line tool + click on: click|click in + Cloud: Google Cloud Platform|GCP + Container Engine: Kubernetes Engine + content type: media type + curated roles: predefined roles + data are: data is + Developers Console: Google API Console|API Console + disabled?: turn off|off + ephemeral IP address: ephemeral external IP address + fewer data: less data + file name: filename + firewalls: firewall rules + functionality: capability|feature + Google account: Google Account + Google accounts: Google Accounts + Googling: search with Google + grayed-out: unavailable + HTTPs: HTTPS + in order to: to + ingest: import|load + k8s: Kubernetes + long press: touch & hold + network IP address: internal IP address + omnibox: address bar + open-source: open source + overview screen: recents screen + regex: regular expression + SHA1: SHA-1|HAS-SHA1 + sign into: sign in to + sign-?on: single sign-on 
+ static IP address: static external IP address + stylesheet: style sheet + synch: sync + tablename: table name + tablet: device + touch: tap + url: URL + vs\.: versus + World Wide Web: web diff --git a/.github/styles/Google/meta.json b/.github/styles/Google/meta.json new file mode 100644 index 000000000000..a5da2a8480ef --- /dev/null +++ b/.github/styles/Google/meta.json @@ -0,0 +1,4 @@ +{ + "feed": "https://github.com/errata-ai/Google/releases.atom", + "vale_version": ">=1.0.0" +} diff --git a/.github/styles/Google/vocab.txt b/.github/styles/Google/vocab.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt new file mode 100644 index 000000000000..3c9afb35b90a --- /dev/null +++ b/.github/styles/Vocab/ray/accept.txt @@ -0,0 +1,21 @@ +[aA]utoscaling +[cC]onfig +Anyscale +APIs +Autoscaler +Conda +Databricks +Datadog +Dockerfile +Github +Grafana +hostname +Metaflow +MLflow +plaintext +RLlib +VSCode +cron +MLOps +Readonly +Webterminal diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/.vale.ini b/.vale.ini new file mode 100644 index 000000000000..f06807039099 --- /dev/null +++ b/.vale.ini @@ -0,0 +1,10 @@ +StylesPath = .github/styles + +Vocab = ray + +MinAlertLevel = error + +Packages = Google + +[*.{md,rst,py}] +BasedOnStyles = Vale, Google From 05862e28d67c85cea6ffa63df5296e924bf1b409 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:00:35 -0700 Subject: [PATCH 06/21] Update stuff Signed-off-by: Balaji Veeramani --- .github/styles/Google/Spacing.yml | 7 +++++-- .github/styles/Vocab/ray/accept.txt | 23 +++-------------------- .github/workflows/vale.yml | 16 ++++++++++++++++ .vale.ini | 4 ++-- doc/source/data/getting-started.rst | 10 +++++----- 5 files changed, 31 insertions(+), 29 deletions(-) diff --git a/.github/styles/Google/Spacing.yml 
b/.github/styles/Google/Spacing.yml index 66e45a6b72a9..e0d26537eb99 100644 --- a/.github/styles/Google/Spacing.yml +++ b/.github/styles/Google/Spacing.yml @@ -5,6 +5,9 @@ level: error nonword: true action: name: remove +# FIXME: This rule complains about Sphinx directives like +# ":class:`Dataset `". tokens: - - '[a-z][.?!] {2,}[A-Z]' - - '[a-z][.?!][A-Z]' + - '^(?!a)b$' # This regex is impossible to match. + # - '[a-z][.?!] {2,}[A-Z]' + # - '[a-z][.?!][A-Z]' diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt index 3c9afb35b90a..ff1e24b785c6 100644 --- a/.github/styles/Vocab/ray/accept.txt +++ b/.github/styles/Vocab/ray/accept.txt @@ -1,21 +1,4 @@ -[aA]utoscaling -[cC]onfig -Anyscale +Data's APIs -Autoscaler -Conda -Databricks -Datadog -Dockerfile -Github -Grafana -hostname -Metaflow -MLflow -plaintext -RLlib -VSCode -cron -MLOps -Readonly -Webterminal +UDFs +Ray Data diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index e69de29bb2d1..2dcbd3b57915 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -0,0 +1,16 @@ +name: reviewdog +on: [pull_request] + +jobs: + vale: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: errata-ai/vale-action@reviewdog + with: + files: docs/source/data/ + fail_on_error: true + # Report level for reviewdog [info,warning,error]. + level: info + # Error on introduced changes only. + filter_mode: added diff --git a/.vale.ini b/.vale.ini index f06807039099..993e968b4972 100644 --- a/.vale.ini +++ b/.vale.ini @@ -1,8 +1,8 @@ StylesPath = .github/styles -Vocab = ray +Vocab = Ray -MinAlertLevel = error +MinAlertLevel = suggestion Packages = Google diff --git a/doc/source/data/getting-started.rst b/doc/source/data/getting-started.rst index f1db53b5e81d..0486b22ae153 100644 --- a/doc/source/data/getting-started.rst +++ b/doc/source/data/getting-started.rst @@ -1,6 +1,6 @@ .. 
_data_getting_started: -Getting Started +Getting started =============== Ray Data's main abstraction is a :class:`Dataset `, which @@ -21,7 +21,7 @@ To learn more about installing Ray and its libraries, read :ref:`Installing Ray `. Create a dataset -------------------- +---------------- Create datasets from on-disk files, Python objects, and cloud storage services like S3. Ray Data can read from any `filesystem supported by Arrow @@ -43,7 +43,7 @@ To learn more about creating datasets, read :ref:`Loading data `. Transform the dataset ------------------------- +--------------------- Apply :ref:`user-defined functions ` (UDFs) to transform datasets. Ray executes transformations in parallel for performance. @@ -82,7 +82,7 @@ To learn more about transforming datasets, read :ref:`Transforming data `. Consume the dataset ----------------------- +------------------- Pass datasets to Ray tasks or actors, and access records with methods like :meth:`~ray.data.Dataset.take_batch` and :meth:`~ray.data.Dataset.iter_batches`. @@ -138,7 +138,7 @@ To learn more about consuming datasets, read :ref:`Consuming data `. Save the dataset -------------------- +---------------- Call methods like :meth:`~ray.data.Dataset.write_parquet` to save dataset contents to local or remote filesystems. From 508ff9fce51702f3a963cafe7bd97aeef4c9540a Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:05:20 -0700 Subject: [PATCH 07/21] Update vale.yml Signed-off-by: Balaji Veeramani --- .github/workflows/vale.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index 2dcbd3b57915..37999b702303 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -10,7 +10,5 @@ jobs: with: files: docs/source/data/ fail_on_error: true - # Report level for reviewdog [info,warning,error]. - level: info # Error on introduced changes only. 
filter_mode: added From 4ceee8aa63f08ca04503bf66ffd72c45f88dffa1 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:09:57 -0700 Subject: [PATCH 08/21] Update stuff Signed-off-by: Balaji Veeramani --- .github/styles/Vocab/Ray/reject.txt | 0 .github/workflows/vale.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 .github/styles/Vocab/Ray/reject.txt diff --git a/.github/styles/Vocab/Ray/reject.txt b/.github/styles/Vocab/Ray/reject.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index 37999b702303..ed0ca20750c8 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -8,7 +8,7 @@ jobs: - uses: actions/checkout@v3 - uses: errata-ai/vale-action@reviewdog with: - files: docs/source/data/ + files: doc/source/data/ fail_on_error: true # Error on introduced changes only. filter_mode: added From f602b8ea5cf6f6cea2df845873bd826e8c5e2cf6 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:14:08 -0700 Subject: [PATCH 09/21] Test stuff Signed-off-by: Balaji Veeramani --- .github/workflows/vale.yml | 1 + doc/source/data/transforming-data.rst | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index ed0ca20750c8..794b6663e897 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -12,3 +12,4 @@ jobs: fail_on_error: true # Error on introduced changes only. filter_mode: added + level: warning diff --git a/doc/source/data/transforming-data.rst b/doc/source/data/transforming-data.rst index 0b9305dabbef..72d7a255ea9e 100644 --- a/doc/source/data/transforming-data.rst +++ b/doc/source/data/transforming-data.rst @@ -18,7 +18,7 @@ There are two main types of supported transforms: * One-to-one: each input block will contribute to only one output block, such as :meth:`ds.map_batches() `. 
-* All-to-all: input blocks can contribute to multiple output blocks, +* All-to-all: input blocks will contribute to multiple output blocks, such as :meth:`ds.random_shuffle() `. .. list-table:: Common Ray Data transforms. @@ -225,7 +225,7 @@ globally shuffle the order of data records. >>> dataset.random_shuffle().take_batch() # doctest: +SKIP {'id': array([7, 0, 9, 3, 5, 1, 4, 2, 8, 6])} -For reduced overhead during training ingest, use local shuffles. Read +For reduced overhead during training ingest, use local shuffles. Read :ref:`Shuffling Data ` in the AIR user guide to learn more. .. _data-groupbys: From 2a13086de8aaf8a7745e40cbb57d24455c78b641 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:17:25 -0700 Subject: [PATCH 10/21] Update stuff Signed-off-by: Balaji Veeramani --- .github/workflows/vale.yml | 2 +- .vale.ini | 2 +- doc/source/data/transforming-data.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index 794b6663e897..2f96a92fd797 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -8,7 +8,7 @@ jobs: - uses: actions/checkout@v3 - uses: errata-ai/vale-action@reviewdog with: - files: doc/source/data/ + files: doc/source/data/getting-started.rst fail_on_error: true # Error on introduced changes only. 
filter_mode: added diff --git a/.vale.ini b/.vale.ini index 993e968b4972..1189591d35d4 100644 --- a/.vale.ini +++ b/.vale.ini @@ -6,5 +6,5 @@ MinAlertLevel = suggestion Packages = Google -[*.{md,rst,py}] +[*.{md,rst}] BasedOnStyles = Vale, Google diff --git a/doc/source/data/transforming-data.rst b/doc/source/data/transforming-data.rst index 72d7a255ea9e..dcf7742dd1fd 100644 --- a/doc/source/data/transforming-data.rst +++ b/doc/source/data/transforming-data.rst @@ -18,7 +18,7 @@ There are two main types of supported transforms: * One-to-one: each input block will contribute to only one output block, such as :meth:`ds.map_batches() `. -* All-to-all: input blocks will contribute to multiple output blocks, +* All-to-all: input blocks can contribute to multiple output blocks, such as :meth:`ds.random_shuffle() `. .. list-table:: Common Ray Data transforms. From 9a4740d89b3f13f82558ad4a5d3ba1e9bc3f07ee Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 13 Jun 2023 13:10:31 -0700 Subject: [PATCH 11/21] Update stuff Signed-off-by: Balaji Veeramani --- .github/styles/Vocab/ray/accept.txt | 6 ++++++ .github/workflows/vale.yml | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt index ff1e24b785c6..088291ccf700 100644 --- a/.github/styles/Vocab/ray/accept.txt +++ b/.github/styles/Vocab/ray/accept.txt @@ -2,3 +2,9 @@ Data's APIs UDFs Ray Data +:(?:class|meth):([`~\w\s]+?)<([\w\.]+?)> +API[s] +UDF[s] +Data's +CPUs +app diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index 2f96a92fd797..5f3f0454d464 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -10,6 +10,4 @@ jobs: with: files: doc/source/data/getting-started.rst fail_on_error: true - # Error on introduced changes only. 
- filter_mode: added level: warning From a91cd78ac165b94ef70133334354842b3e212c82 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 11 Jul 2023 12:27:32 -0700 Subject: [PATCH 12/21] Update files Signed-off-by: Balaji Veeramani --- .github/styles/Google/Colons.yml | 2 +- .github/styles/Google/Headings.yml | 2 +- .github/styles/Vocab/Ray/reject.txt | 1 + .github/styles/Vocab/ray/accept.txt | 27 +++++++-- doc/source/data/batch_inference.rst | 16 +++--- doc/source/data/data-internals.rst | 47 ++++++++-------- doc/source/data/inspecting-data.rst | 5 +- doc/source/data/key-concepts.rst | 2 +- doc/source/data/loading-data.rst | 8 +-- doc/source/data/performance-tips.rst | 25 ++++----- doc/source/data/user-guide.rst | 4 +- doc/source/data/working-with-pytorch.rst | 56 +++++++++---------- doc/source/data/working-with-tensors.rst | 7 +-- doc/source/data/working-with-text.rst | 2 +- .../data/tests/test_dynamic_block_split.py | 5 +- 15 files changed, 112 insertions(+), 97 deletions(-) diff --git a/.github/styles/Google/Colons.yml b/.github/styles/Google/Colons.yml index 99363fbd46d7..dc6ba867b3c3 100644 --- a/.github/styles/Google/Colons.yml +++ b/.github/styles/Google/Colons.yml @@ -2,7 +2,7 @@ extends: existence message: "'%s' should be in lowercase." link: 'https://developers.google.com/style/colons' nonword: true -level: warning +level: suggestion scope: sentence tokens: - ':\s[A-Z]' diff --git a/.github/styles/Google/Headings.yml b/.github/styles/Google/Headings.yml index a53301338a47..168eb6c050bd 100644 --- a/.github/styles/Google/Headings.yml +++ b/.github/styles/Google/Headings.yml @@ -1,7 +1,7 @@ extends: capitalization message: "'%s' should use sentence-style capitalization." 
link: 'https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings' -level: warning +level: suggestion scope: heading match: $sentence indicators: diff --git a/.github/styles/Vocab/Ray/reject.txt b/.github/styles/Vocab/Ray/reject.txt index e69de29bb2d1..12c6d5d5eac2 100644 --- a/.github/styles/Vocab/Ray/reject.txt +++ b/.github/styles/Vocab/Ray/reject.txt @@ -0,0 +1 @@ +torch diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt index 088291ccf700..bfc654ae0956 100644 --- a/.github/styles/Vocab/ray/accept.txt +++ b/.github/styles/Vocab/ray/accept.txt @@ -1,10 +1,27 @@ Data's APIs -UDFs Ray Data -:(?:class|meth):([`~\w\s]+?)<([\w\.]+?)> API[s] UDF[s] -Data's -CPUs -app +CPU[s] +GPU[s] +performant +config +ingest +application +touch +ndarray[s] +dataset's +URI[s] +codec +interoperates +Spotify's +preprocess +Predibase +pushdown +dicts +[gG]roupby +parallelization +prefetching +indexable +dtype diff --git a/doc/source/data/batch_inference.rst b/doc/source/data/batch_inference.rst index c26eba992203..a5330685c83f 100644 --- a/doc/source/data/batch_inference.rst +++ b/doc/source/data/batch_inference.rst @@ -31,7 +31,7 @@ Using Ray Data for offline inference involves four basic steps: - **Step 3:** Transform your dataset using the pre-trained model by calling :meth:`ds.map_batches() `. For more details, see :ref:`Transforming Data `. - **Step 4:** Get the final predictions by either iterating through the output or saving the results. For more details, see the :ref:`Iterating over data ` and :ref:`Saving data ` user guides. -For more in-depth examples for your use case, see :ref:`our batch inference examples`. +For more in-depth examples for your use case, see :ref:`the batch inference examples`. For how to configure batch inference, see :ref:`the configuration guide`. .. 
tabs:: @@ -184,7 +184,7 @@ More examples ------------- - :doc:`Image Classification Batch Inference with PyTorch ResNet18 ` - :doc:`Object Detection Batch Inference with PyTorch FasterRCNN_ResNet50 ` -- :doc:`Image Classification Batch Inference with Huggingface Vision Transformer ` +- :doc:`Image Classification Batch Inference with Hugging Face Vision Transformer ` .. _batch_inference_configuration: @@ -199,8 +199,8 @@ Using GPUs for inference To use GPUs for inference, make the following changes to your code: 1. Update the class implementation to move the model and data to and from GPU. -2. Specify `num_gpus=1` in the :meth:`ds.map_batches() ` call to indicate that each actor should use 1 GPU. -3. Specify a `batch_size` for inference. For more details on how to configure the batch size, see `batch_inference_batch_size`_. +2. Specify ``num_gpus=1`` in the :meth:`ds.map_batches() ` call to indicate that each actor should use 1 GPU. +3. Specify a ``batch_size`` for inference. For more details on how to configure the batch size, see :ref:`Configuring Batch Size `. The remaining is the same as the :ref:`Quickstart `. @@ -342,7 +342,7 @@ Configuring Batch Size Configure the size of the input batch that is passed to ``__call__`` by setting the ``batch_size`` argument for :meth:`ds.map_batches() ` -Increasing batch size results in faster execution because inference is a vectorized operation. For GPU inference, increasing batch size increases GPU utilization. Set the batch size to as large possible without running out of memory. If you encounter OOMs, decreasing ``batch_size`` may help. +Increasing batch size results in faster execution because inference is a vectorized operation. For GPU inference, increasing batch size increases GPU utilization. Set the batch size to as large as possible without running out of memory. If you encounter out-of-memory errors, decreasing ``batch_size`` may help. ..
testcode:: @@ -361,7 +361,7 @@ Increasing batch size results in faster execution because inference is a vectori .. caution:: The default ``batch_size`` of ``4096`` may be too large for datasets with large rows - (e.g., tables with many columns or a collection of large images). + (for example, tables with many columns or a collection of large images). Handling GPU out-of-memory failures ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -383,9 +383,9 @@ Handling CPU out-of-memory failures ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you run out of CPU RAM, you likely that you have too many model replicas that are running concurrently on the same node. For example, if a model -uses 5GB of RAM when created / run, and a machine has 16GB of RAM total, then no more +uses 5 GB of RAM when created / run, and a machine has 16 GB of RAM total, then no more than three of these models can be run at the same time. The default resource assignments -of one CPU per task/actor will likely lead to `OutOfMemoryError` from Ray in this situation. +of one CPU per task/actor might lead to `OutOfMemoryError` from Ray in this situation. Suppose your cluster has 4 nodes, each with 16 CPUs. To limit to at most 3 of these actors per node, you can override the CPU or memory: diff --git a/doc/source/data/data-internals.rst b/doc/source/data/data-internals.rst index 8c96c74d1908..833bb19624df 100644 --- a/doc/source/data/data-internals.rst +++ b/doc/source/data/data-internals.rst @@ -40,7 +40,7 @@ task reads one or more files and produces an output block: .. https://docs.google.com/drawings/d/15B4TB8b5xN15Q9S8-s0MjW6iIvo_PrH7JtV1fL123pU/edit -To handle transient errors from remote datasources, Ray Data retries application-level +To handle transient errors from remote data sources, Ray Data retries application-level exceptions. For more information on loading data, see :ref:`Loading data `. 
@@ -87,9 +87,9 @@ Ray Data uses Ray Core for execution, and is subject to the same scheduling cons Ray Data and placement groups ----------------------------- -By default, Ray Data configures its tasks and actors to use the cluster-default scheduling strategy ("DEFAULT"). You can inspect this configuration variable here: +By default, Ray Data configures its tasks and actors to use the cluster-default scheduling strategy (``"DEFAULT"``). You can inspect this configuration variable here: :class:`ray.data.DataContext.get_current().scheduling_strategy `. This scheduling strategy schedules these Tasks and Actors outside any present -placement group. To force Ray Data to schedule tasks within the current placement group (i.e., to use current placement group resources specifically for Ray Data), set ``ray.data.DataContext.get_current().scheduling_strategy = None``. +placement group. To use current placement group resources specifically for Ray Data, set ``ray.data.DataContext.get_current().scheduling_strategy = None``. Consider this override only for advanced use cases to improve performance predictability. The general recommendation is to let Ray Data run outside placement groups. @@ -98,9 +98,9 @@ Consider this override only for advanced use cases to improve performance predic Ray Data and Tune ----------------- -When using Ray Data in conjunction with :ref:`Ray Tune `, it is important to ensure there are enough free CPUs for Ray Data to run on. By default, Tune will try to fully utilize cluster CPUs. This can prevent Ray Data from scheduling tasks, reducing performance or causing workloads to hang. +When using Ray Data in conjunction with :ref:`Ray Tune `, it's important to ensure there are enough free CPUs for Ray Data to run on. By default, Tune tries to fully utilize cluster CPUs. This can prevent Ray Data from scheduling tasks, reducing performance or causing workloads to hang. 
-To ensure CPU resources are always available for Ray Data execution, limit the number of concurrent Tune trials. This can be done using the ``max_concurrent_trials`` Tune option. +To ensure CPU resources are always available for Ray Data execution, limit the number of concurrent Tune trials with the ``max_concurrent_trials`` Tune option. .. literalinclude:: ./doc_code/key_concepts.py :language: python @@ -114,11 +114,11 @@ Execution Ray Data execution by default is: -- **Lazy**: This means that transformations on Dataset are not executed until a - consumption operation (e.g. :meth:`ds.iter_batches() `) - or :meth:`Dataset.materialize() ` is called. This creates - opportunities for optimizing the execution plan (e.g. :ref:`stage fusion `). -- **Streaming**: This means that Dataset transformations will be executed in a +- **Lazy**: This means that transformations on Dataset aren't executed until you call a + consumption operation like :meth:`ds.iter_batches() ` + or :meth:`Dataset.materialize() `. This creates + opportunities for optimizing the execution plan like :ref:`stage fusion `. +- **Streaming**: This means that Dataset transformations are executed in a streaming way, incrementally on the base data, instead of on all of the data at once, and overlapping the execution of operations. This can be used for streaming data loading into ML training to overlap the data preprocessing and model training, @@ -139,11 +139,10 @@ writing (:meth:`ds.write_parquet() `), or manual :meth:`ds.materialize() `. There are a few exceptions to this rule, where transformations such as :meth:`ds.union() ` and -:meth:`ds.limit() ` trigger execution; we plan to make these -operations lazy in the future. +:meth:`ds.limit() ` trigger execution. Check the API docs for Ray Data methods to see if they -trigger execution. Those that do trigger execution will have a ``Note`` indicating as +trigger execution. Those that do trigger execution have a ``Note`` indicating as much. .. 
_streaming_execution: @@ -152,7 +151,7 @@ Streaming Execution ------------------- The following code is a hello world example which invokes the execution with -:meth:`ds.iter_batches() ` consumption. We will also enable verbose progress reporting, which shows per-operator progress in addition to overall progress. +:meth:`ds.iter_batches() ` consumption. The example also enables verbose progress reporting, which shows per-operator progress in addition to overall progress. .. code-block:: @@ -177,19 +176,19 @@ The following code is a hello world example which invokes the execution with ): pass -This launches a simple 4-stage pipeline. We use different compute args for each stage, which forces them to be run as separate operators instead of getting fused together. You should see a log message indicating streaming execution is being used: +This launches a simple 4-stage pipeline. The example uses different compute arguments for each stage, which forces them to be run as separate operators instead of getting fused together. You should see a log message indicating streaming execution is being used: .. code-block:: 2023-03-30 16:40:10,076 INFO streaming_executor.py:83 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadRange] -> TaskPoolMapOperator[MapBatches(sleep)] -> ActorPoolMapOperator[MapBatches(sleep)] -> TaskPoolMapOperator[MapBatches(sleep)] -The next few lines will show execution progress. Here is how to interpret the output: +The next few lines show execution progress. Here is how to interpret the output: .. code-block:: Running: 7.0/16.0 CPU, 0.0/0.0 GPU, 76.91 MiB/2.25 GiB object_store_memory 65%|██▊ | 130/200 [00:08<00:02, 22.52it/s] -This line tells you how many resources are currently being used by the streaming executor out of the limits, as well as the number of completed output blocks. The streaming executor will attempt to keep resource usage under the printed limits by throttling task executions.
+This line tells you how many resources are currently being used by the streaming executor out of the limits, as well as the number of completed output blocks. The streaming executor attempts to keep resource usage under the printed limits by throttling task executions. .. code-block:: @@ -202,7 +201,7 @@ These lines are only shown when verbose progress reporting is enabled. The `acti .. tip:: - Avoid returning large outputs from the final operation of a pipeline you are iterating over, since the consumer process will be a serial bottleneck. + Avoid returning large outputs from the final operation of a pipeline you are iterating over, since the consumer process is a serial bottleneck. Fault tolerance --------------- @@ -221,19 +220,19 @@ system failure occurs, Ray Data recreates blocks by re-executing tasks. Stage Fusion Optimization ------------------------- -In order to reduce memory usage and task overheads, Ray Data will automatically fuse together +In order to reduce memory usage and task overheads, Ray Data automatically fuses together lazy operations that are compatible: * Same compute pattern: embarrassingly parallel map vs. all-to-all shuffle * Same compute strategy: Ray tasks vs Ray actors -* Same resource specification, e.g. ``num_cpus`` or ``num_gpus`` requests +* Same resource specification, for example, ``num_cpus`` or ``num_gpus`` requests -Read stages and subsequent map-like transformations will usually be fused together. +Read stages and subsequent map-like transformations are usually fused together. All-to-all transformations such as :meth:`ds.random_shuffle() ` can be fused with earlier map-like stages, but not later stages. -You can tell if stage fusion is enabled by checking the :ref:`Dataset stats ` and looking for fused stages (e.g., ``read->map_batches``). +You can tell if stage fusion is enabled by checking the :ref:`Dataset stats ` and looking for fused stages (for example, ``read->map_batches``). .. 
code-block:: @@ -252,7 +251,7 @@ Execution Memory During execution, a task can read multiple input blocks, and write multiple output blocks. Input and output blocks consume both worker heap memory and shared memory via Ray's object store. -Ray Data attempts to bound its heap memory usage to `num_execution_slots * max_block_size`. The number of execution slots is by default equal to the number of CPUs, unless custom resources are specified. The maximum block size is set by the configuration parameter `ray.data.DataContext.target_max_block_size` and is set to 512MiB by default. When a task's output is larger than this value, the worker will automatically split the output into multiple smaller blocks to avoid running out of heap memory. +Ray Data attempts to bound its heap memory usage to ``num_execution_slots * max_block_size``. The number of execution slots is by default equal to the number of CPUs, unless custom resources are specified. The maximum block size is set by the configuration parameter `ray.data.DataContext.target_max_block_size` and is set to 512MiB by default. When a task's output is larger than this value, the worker automatically splits the output into multiple smaller blocks to avoid running out of heap memory. Large block size can lead to potential out-of-memory situations. To avoid these issues, make sure no single item in your Ray Data is too large, and always call :meth:`ds.map_batches() ` with batch size small enough such that the output batch can comfortably fit into memory. @@ -262,5 +261,5 @@ Object Store Memory Ray Data uses the Ray object store to store data blocks, which means it inherits the memory management features of the Ray object store. This section discusses the relevant features: * Object Spilling: Since Ray Data uses the Ray object store to store data blocks, any blocks that can't fit into object store memory are automatically spilled to disk. 
The objects are automatically reloaded when needed by downstream compute tasks: -* Locality Scheduling: Ray will preferentially schedule compute tasks on nodes that already have a local copy of the object, reducing the need to transfer objects between nodes in the cluster. +* Locality Scheduling: Ray preferentially schedules compute tasks on nodes that already have a local copy of the object, reducing the need to transfer objects between nodes in the cluster. * Reference Counting: Dataset blocks are kept alive by object store reference counting as long as there is any Dataset that references them. To free memory, delete any Python references to the Dataset object. diff --git a/doc/source/data/inspecting-data.rst b/doc/source/data/inspecting-data.rst index 271ad1f28201..105f75e538b3 100644 --- a/doc/source/data/inspecting-data.rst +++ b/doc/source/data/inspecting-data.rst @@ -18,7 +18,7 @@ This guide shows you how to: Describing datasets =================== -:class:`Datasets ` are tabular. To view a Dataset's column names and +:class:`Datasets ` are tabular. To view a dataset's column names and types, call :meth:`Dataset.schema() `. .. testcode:: @@ -149,7 +149,7 @@ For more information on working with batches, see Inspecting execution statistics =============================== -Ray Data calculates statistics during execution like the wall clock time and memory usage for the different stages. +Ray Data calculates statistics during execution like the wall clock time and memory usage for the different stages. To view stats about your :class:`Datasets `, call :meth:`Dataset.stats() ` on an executed dataset. The stats are also persisted under `/tmp/ray/session_*/logs/ray-data.log`. 
@@ -195,4 +195,3 @@ To view stats about your :class:`Datasets `, call :meth:`Datas * In ray.get(): 2.16ms min, 2.16ms max, 2.16ms avg, 2.16ms total * In batch creation: 897.67us min, 897.67us max, 897.67us avg, 897.67us total * In batch formatting: 836.87us min, 836.87us max, 836.87us avg, 836.87us total - diff --git a/doc/source/data/key-concepts.rst b/doc/source/data/key-concepts.rst index bc9cc118e9a6..7aa66784e91d 100644 --- a/doc/source/data/key-concepts.rst +++ b/doc/source/data/key-concepts.rst @@ -3,7 +3,7 @@ Key Concepts ============ -Learn about :class:`Dataset ` and the functionality it provides. +Learn about :class:`Dataset ` and the capabilities it provides. This guide provides a lightweight introduction to: diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index fe854c99b5cd..e41c89e98b29 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -309,7 +309,7 @@ Handling compressed files ~~~~~~~~~~~~~~~~~~~~~~~~~ To read a compressed file, specify ``compression`` in ``arrow_open_stream_args``. -You can use any `Codec supported by Arrow `__. +You can use any `codec supported by Arrow `__. .. testcode:: @@ -640,7 +640,7 @@ Ray Data interoperates with HuggingFace and TensorFlow datasets. Reading databases ================= -Ray Data reads from databases like MySQL, Postgres, and MongoDB. +Ray Data reads from databases like MySQL, PostgreSQL, and MongoDB. .. _reading_sql: @@ -944,8 +944,8 @@ For an example, see :ref:`Implementing a Custom Datasource ` Performance considerations ========================== -The dataset ``parallelism`` determines the number of blocks the base data will be split -into for parallel reads. Ray Data will decide internally how many read tasks to run +The dataset ``parallelism`` determines the number of blocks the base data is split +into for parallel reads. 
Ray Data decides internally how many read tasks to run concurrently to best utilize the cluster, ranging from ``1...parallelism`` tasks. In other words, the higher the parallelism, the smaller the data blocks in the Dataset and hence the more opportunity for parallel execution. diff --git a/doc/source/data/performance-tips.rst b/doc/source/data/performance-tips.rst index 2c0ce87c3a58..ae7b513ca6a1 100644 --- a/doc/source/data/performance-tips.rst +++ b/doc/source/data/performance-tips.rst @@ -36,13 +36,13 @@ Tuning read resources ~~~~~~~~~~~~~~~~~~~~~ By default, Ray requests 1 CPU per read task, which means one read tasks per CPU can execute concurrently. -For datasources that can benefit from higher degress of IO parallelism, you can specify a lower ``num_cpus`` value for the read function with the ``ray_remote_args`` parameter. +For data sources that benefit from more IO parallelism, you can specify a lower ``num_cpus`` value for the read function with the ``ray_remote_args`` parameter. For example, use ``ray.data.read_parquet(path, ray_remote_args={"num_cpus": 0.25})`` to allow up to four read tasks per CPU. Parquet column pruning ~~~~~~~~~~~~~~~~~~~~~~ -Current Dataset will read all Parquet columns into memory. +Current Dataset reads all Parquet columns into memory. If you only need a subset of the columns, make sure to specify the list of columns explicitly when calling :meth:`ray.data.read_parquet() ` to avoid loading unnecessary data (projection pushdown). @@ -55,8 +55,8 @@ Parquet row pruning ~~~~~~~~~~~~~~~~~~~ Similarly, you can pass in a filter to :meth:`ray.data.read_parquet() ` (filter pushdown) -which will be applied at the file scan so only rows that match the filter predicate -will be returned. +which is applied at the file scan so only rows that match the filter predicate +are returned. 
For example, use ``ray.data.read_parquet("example://iris.parquet", filter=pyarrow.dataset.field("sepal.length") > 5.0)`` (where ``pyarrow`` has to be imported) to read rows with sepal.length greater than 5.0. @@ -65,8 +65,8 @@ This can be used in conjunction with column pruning when appropriate to get the Optimizing shuffles ------------------- -When should I use global per-epoch shuffling? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +When should you use global per-epoch shuffling? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use global per-epoch shuffling only if your model is sensitive to the randomness of the training data. Based on a @@ -87,8 +87,7 @@ particular model under different shuffling policies: * windowed (pseudo-global) shuffling, and * fully global shuffling. -From the perspective of keeping preprocessing time in check, as long as your data -loading and shuffling throughput is higher than your training throughput, your GPU should +As long as your data loading and shuffling throughput is higher than your training throughput, your GPU should be saturated. If you have shuffle-sensitive models, push the shuffle quality higher until this threshold is hit. @@ -103,10 +102,10 @@ These operations include :meth:`Dataset.random_shuffle `_ that Ray runs for :meth:`Dataset.random_shuffle ` and :meth:`Dataset.sort `. -To get an idea of the performance you can expect, here are some run time results for :meth:`Dataset.random_shuffle ` on 1-10TB of data on 20 machines (m5.4xlarge instances on AWS EC2, each with 16 vCPUs, 64GB RAM). +To get an idea of the performance you can expect, here are some run time results for :meth:`Dataset.random_shuffle ` on 1-10 TB of data on 20 machines (m5.4xlarge instances on AWS EC2, each with 16 vCPUs, 64 GB RAM). .. 
image:: https://docs.google.com/spreadsheets/d/e/2PACX-1vQvBWpdxHsW0-loasJsBpdarAixb7rjoo-lTgikghfCeKPQtjQDDo2fY51Yc1B6k_S4bnYEoChmFrH2/pubchart?oid=598567373&format=image :align: center @@ -149,7 +148,7 @@ By default, the CPU and GPU limits are set to the cluster size, and the object s You may want to customize these limits in the following scenarios: - If running multiple concurrent jobs on the cluster, setting lower limits can avoid resource contention between the jobs. - If you want to fine-tune the memory limit to maximize performance. -For data loading into training jobs, you may want to set the object store memory to a low value (e.g., 2GB) to limit resource usage. +For data loading into training jobs, you may want to set the object store memory to a low value (for example, 2 GB) to limit resource usage. You can configure execution options with the global DataContext. The options are applied for future jobs launched in the process: @@ -168,7 +167,7 @@ Locality with output (ML ingest use case) ctx.execution_options.locality_with_output = True -Setting this parameter to True tells Ray Data to prefer placing operator tasks onto the consumer node in the cluster, rather than spreading them evenly across the cluster. This setting can be useful if you know you are consuming the output data directly on the consumer node (i.e., for ML training ingest). However, other use cases may incur a performance penalty with this setting. +Setting this parameter to True tells Ray Data to prefer placing operator tasks onto the consumer node in the cluster, rather than spreading them evenly across the cluster. This setting can be useful if you know you are consuming the output data directly on the consumer node (that is, for ML training ingest). However, other use cases may incur a performance penalty with this setting. Reproducibility --------------- @@ -181,7 +180,7 @@ Deterministic execution # By default, this is set to False.
ctx.execution_options.preserve_order = True -To enable deterministic execution, set the above to True. This setting may decrease performance, but ensures block ordering is preserved through execution. This flag defaults to False. +To enable deterministic execution, set the preceding to True. This setting may decrease performance, but ensures block ordering is preserved through execution. This flag defaults to False. Monitoring your application --------------------------- diff --git a/doc/source/data/user-guide.rst b/doc/source/data/user-guide.rst index 83c5cd459dac..86f8cb49b8fb 100644 --- a/doc/source/data/user-guide.rst +++ b/doc/source/data/user-guide.rst @@ -4,9 +4,9 @@ User Guides =========== -If you’re new to Ray Data, we recommend starting with the +If you’re new to Ray Data, start with the :ref:`Ray Data Key Concepts `. -This user guide will help you navigate the Ray Data project and +This user guide helps you navigate the Ray Data project and show you how achieve several tasks. .. toctree:: diff --git a/doc/source/data/working-with-pytorch.rst b/doc/source/data/working-with-pytorch.rst index 1f41e3afa4b7..d3012eba7230 100644 --- a/doc/source/data/working-with-pytorch.rst +++ b/doc/source/data/working-with-pytorch.rst @@ -7,19 +7,19 @@ Ray Data integrates with the PyTorch ecosystem. This guide describes how to: -* :ref:`Iterate over your dataset as torch tensors for model training ` -* :ref:`Write transformations that deal with torch tensors ` -* :ref:`Perform batch inference with torch models ` -* :ref:`Save Datasets containing torch tensors ` +* :ref:`Iterate over your dataset as Torch tensors for model training ` +* :ref:`Write transformations that deal with Torch tensors ` +* :ref:`Perform batch inference with Torch models ` +* :ref:`Save Datasets containing Torch tensors ` * :ref:`Migrate from PyTorch Datasets to Ray Data ` .. 
_iterating_pytorch: -Iterating over torch tensors for training +Iterating over Torch tensors for training ----------------------------------------- -To iterate over batches of data in torch format, call :meth:`Dataset.iter_torch_batches() `. Each batch is represented as `Dict[str, torch.Tensor]`, with one tensor per column in the dataset. +To iterate over batches of data in Torch format, call :meth:`Dataset.iter_torch_batches() `. Each batch is represented as `Dict[str, torch.Tensor]`, with one tensor per column in the dataset. -This is useful for training torch models with batches from your dataset. For configuration details such as providing a `collate_fn` for customizing the conversion, see `the API reference `. +This is useful for training Torch models with batches from your dataset. For configuration details such as providing a ``collate_fn`` for customizing the conversion, see `the API reference `. .. testcode:: @@ -40,12 +40,12 @@ This is useful for training torch models with batches from your dataset. For con Integration with Ray Train ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Ray Data integrates with :ref:`Ray Train ` for easy data ingest for data parallel training, with support for PyTorch, PyTorch Lightning, or Huggingface training. +Ray Data integrates with :ref:`Ray Train ` for easy data ingest for data parallel training, with support for PyTorch, PyTorch Lightning, or Hugging Face training. .. testcode:: import torch - from torch import nn + from torch import nn import ray from ray.air import session, ScalingConfig from ray.train.torch import TorchTrainer @@ -85,13 +85,13 @@ For more details, see the :ref:`Ray Train user guide `. .. _transform_pytorch: -Transformations with torch tensors +Transformations with Torch tensors ---------------------------------- -Transformations applied with `map` or `map_batches` can return torch tensors. +Transformations applied with `map` or ``map_batches`` can return Torch tensors. ..
caution:: - Under the hood, Ray Data automatically converts torch tensors to numpy arrays. Subsequent transformations accept numpy arrays as input, not torch tensors. + Under the hood, Ray Data automatically converts Torch tensors to NumPy arrays. Subsequent transformations accept NumPy arrays as input, not Torch tensors. .. tab-set:: @@ -162,7 +162,7 @@ For more information on transforming data, see :ref:`Transforming data `. @@ -338,12 +338,12 @@ Migrating from PyTorch Datasets and DataLoaders If you're currently using PyTorch Datasets and DataLoaders, you can migrate to Ray Data for working with distributed datasets. -PyTorch Datasets are replaced by the :class:`Dataset ` abtraction, and the PyTorch DataLoader is replaced by :meth:`Dataset.iter_torch_batches() `. +PyTorch Datasets are replaced by the :class:`Dataset ` abstraction, and the PyTorch DataLoader is replaced by :meth:`Dataset.iter_torch_batches() `. Built-in PyTorch Datasets ~~~~~~~~~~~~~~~~~~~~~~~~~ -If you are using built-in PyTorch datasets, for example from `torchvision`, these can be converted to a Ray Dataset using the :meth:`from_torch() ` API. +If you are using built-in PyTorch datasets, for example from ``torchvision``, these can be converted to a Ray Dataset using the :meth:`from_torch() ` API. .. caution:: @@ -357,7 +357,7 @@ If you are using built-in PyTorch datasets, for example from `torchvision`, thes mnist = torchvision.datasets.MNIST(root="/tmp/", download=True) ds = ray.data.from_torch(mnist) - # The data for each item of the torch dataset is under the "item" key. + # The data for each item of the Torch dataset is under the "item" key. print(ds.schema()) .. @@ -378,11 +378,11 @@ If you have a custom PyTorch Dataset, you can migrate to Ray Data by converting Any logic for reading data from cloud storage and disk can be replaced by one of the Ray Data ``read_*`` APIs, and any transformation logic can be applied as a :meth:`map ` call on the Dataset. 
-The following example shows a custom PyTorch Dataset, and what the analagous would look like with Ray Data. +The following example shows a custom PyTorch Dataset, and what the analogous would look like with Ray Data. .. note:: - Unlike PyTorch Map-style datasets, Ray Datasets are not indexable. + Unlike PyTorch Map-style datasets, Ray Datasets aren't indexable. .. tab-set:: @@ -477,7 +477,7 @@ PyTorch DataLoader The PyTorch DataLoader can be replaced by calling :meth:`Dataset.iter_torch_batches() ` to iterate over batches of the dataset. -The following table describes how the arguments for PyTorch DataLoader map to Ray Data. Note the the behavior may not necessarily be identical. For exact semantics and usage, :meth:`see the API reference `. +The following table describes how the arguments for PyTorch DataLoader map to Ray Data. Note the behavior may not necessarily be identical. For exact semantics and usage, :meth:`see the API reference `. .. list-table:: :header-rows: 1 @@ -485,20 +485,20 @@ The following table describes how the arguments for PyTorch DataLoader map to Ra * - PyTorch DataLoader arguments - Ray Data API * - ``batch_size`` - - ``batch_size`` arg to :meth:`ds.iter_torch_batches() ` + - ``batch_size`` argument to :meth:`ds.iter_torch_batches() ` * - ``shuffle`` - - ``local_shuffle_buffer_size`` arg to :meth:`ds.iter_torch_batches() ` + - ``local_shuffle_buffer_size`` argument to :meth:`ds.iter_torch_batches() ` * - ``collate_fn`` - - ``collate_fn`` arg to :meth:`ds.iter_torch_batches() ` + - ``collate_fn`` argument to :meth:`ds.iter_torch_batches() ` * - ``sampler`` - Not supported. Can be manually implemented after iterating through the dataset with :meth:`ds.iter_torch_batches() `. * - ``batch_sampler`` - Not supported. Can be manually implemented after iterating through the dataset with :meth:`ds.iter_torch_batches() `. 
* - ``drop_last`` - - ``drop_last`` arg to :meth:`ds.iter_torch_batches() ` + - ``drop_last`` argument to :meth:`ds.iter_torch_batches() ` * - ``num_workers`` - - Use ``prefetch_batches`` arg to :meth:`ds.iter_torch_batches() ` to indicate how many batches to prefetch. The number of prefetching threads will automatically be configured according to ``prefetch_batches``. + - Use ``prefetch_batches`` argument to :meth:`ds.iter_torch_batches() ` to indicate how many batches to prefetch. The number of prefetching threads is automatically configured according to ``prefetch_batches``. * - ``prefetch_factor`` - - Use ``prefetch_batches`` arg to :meth:`ds.iter_torch_batches() ` to indicate how many batches to prefetch. The number of prefetching threads will automatically be configured according to ``prefetch_batches``. + - Use ``prefetch_batches`` argument to :meth:`ds.iter_torch_batches() ` to indicate how many batches to prefetch. The number of prefetching threads is automatically configured according to ``prefetch_batches``. * - ``pin_memory`` - Pass in ``device`` to :meth:`ds.iter_torch_batches() ` to get tensors that have already been moved to the correct device. diff --git a/doc/source/data/working-with-tensors.rst b/doc/source/data/working-with-tensors.rst index 810eaab894ee..28d29d29c74d 100644 --- a/doc/source/data/working-with-tensors.rst +++ b/doc/source/data/working-with-tensors.rst @@ -3,7 +3,7 @@ Working with Tensors ==================== -N-dimensional arrays (i.e., tensors) are ubiquitous in ML workloads. This guide +N-dimensional arrays (that is, tensors) are ubiquitous in ML workloads. This guide describes the limitations and best practices of working with such data. Tensor data representation @@ -98,9 +98,8 @@ Call :meth:`~ray.data.Dataset.map` or :meth:`~ray.data.Dataset.map_batches` to t # Increase the brightness, batch at a time. ds.map_batches(batch_increase_brightness) -In this example, we return ``np.ndarray`` directly as the output.
Ray Data will also treat -returned lists of ``np.ndarray`` and objects implementing ``__array__`` (e.g., ``torch.Tensor``) -as tensor data. +In addition to NumPy ndarrays, Ray Data also treats returned lists of NumPy ndarrays and +objects implementing ``__array__`` (for example, ``torch.Tensor``) as tensor data. For more information on transforming data, read :ref:`Transforming data `. diff --git a/doc/source/data/working-with-text.rst b/doc/source/data/working-with-text.rst index 33276c792248..da639c45a173 100644 --- a/doc/source/data/working-with-text.rst +++ b/doc/source/data/working-with-text.rst @@ -15,7 +15,7 @@ This guide shows you how to: Reading text files ------------------ -Ray Data can read lines of text and JSONL. Alternatiely, you can read raw binary +Ray Data can read lines of text and JSONL. Alternatively, you can read raw binary files and manually decode data. .. tab-set:: diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index 2b25c74c37ae..2deb573fef2a 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -1,10 +1,11 @@ import time -import numpy as np import pandas as pd import pyarrow as pa import pytest +import numpy as np + import ray from ray.data._internal.lazy_block_list import LazyBlockList from ray.data.block import BlockMetadata @@ -14,7 +15,7 @@ from ray.tests.conftest import * # noqa -# Data source generates random bytes data +# Datasource generates random bytes data class RandomBytesDatasource(Datasource): def create_reader(self, **read_args): return RandomBytesReader( From a895099f7483946a4b9f5bbef8e6560a7a30526f Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 11:53:36 -0700 Subject: [PATCH 13/21] Update styles Signed-off-by: Balaji Veeramani --- .github/styles/Google/Acronyms.yml | 8 +++++- .github/styles/Google/Contractions.yml | 2 +- .github/styles/Google/Headings.yml | 2 +- 
.github/styles/Google/Spacing.yml | 9 +++---- .github/styles/Google/WordList.yml | 2 +- .github/styles/Vocab/Data/accept.txt | 21 +++++++++++++++ .github/styles/Vocab/General/accept.txt | 14 ++++++++++ .github/styles/Vocab/Ray/reject.txt | 1 - .github/styles/Vocab/ray/accept.txt | 27 ------------------- .vale.ini | 7 ++++- doc/requirements-doc.txt | 4 +-- doc/source/data/api/from_other_data_libs.rst | 22 +++++++-------- doc/source/data/api/grouped_data.rst | 3 ++- doc/source/data/api/random_access_dataset.rst | 3 ++- doc/source/data/batch_inference.rst | 4 +-- .../data/examples/custom-datasource.rst | 5 +++- doc/source/data/examples/index.rst | 4 --- doc/source/data/examples/random-access.rst | 12 ++++----- doc/source/data/performance-tips.rst | 6 ++--- doc/source/data/preprocessors.rst | 8 +++--- doc/source/data/saving-data.rst | 2 +- doc/source/data/working-with-images.rst | 4 +-- doc/source/data/working-with-tensors.rst | 2 +- doc/source/data/working-with-text.rst | 2 ++ 24 files changed, 96 insertions(+), 78 deletions(-) create mode 100644 .github/styles/Vocab/Data/accept.txt create mode 100644 .github/styles/Vocab/General/accept.txt delete mode 100644 .github/styles/Vocab/Ray/reject.txt delete mode 100644 .github/styles/Vocab/ray/accept.txt diff --git a/.github/styles/Google/Acronyms.yml b/.github/styles/Google/Acronyms.yml index f41af0189b07..f15489e94c99 100644 --- a/.github/styles/Google/Acronyms.yml +++ b/.github/styles/Google/Acronyms.yml @@ -1,7 +1,7 @@ extends: conditional message: "Spell out '%s', if it's unfamiliar to the audience." link: 'https://developers.google.com/style/abbreviations' -level: suggestion +level: warning ignorecase: false # Ensures that the existence of 'first' implies the existence of 'second'. 
first: '\b([A-Z]{3,5})\b' @@ -10,13 +10,16 @@ second: '(?:\b[A-Z][a-z]+ )+\(([A-Z]{3,5})\)' exceptions: - API - ASP + - AWS - CLI - CPU - CSS - CSV + - CUDA - DEBUG - DOM - DPI + - ETL - FAQ - GCC - GDB @@ -30,16 +33,19 @@ exceptions: - IDE - JAR - JSON + - JSONL - JSX - LESS - LLDB - NET + - NFS - NOTE - NVDA - OSS - PATH - PDF - PHP + - PNG - POST - RAM - REPL diff --git a/.github/styles/Google/Contractions.yml b/.github/styles/Google/Contractions.yml index 4f6fd5d489dc..07a604d4e837 100644 --- a/.github/styles/Google/Contractions.yml +++ b/.github/styles/Google/Contractions.yml @@ -1,7 +1,7 @@ extends: substitution message: "Use '%s' instead of '%s'." link: 'https://developers.google.com/style/contractions' -level: suggestion +level: warning ignorecase: true action: name: replace diff --git a/.github/styles/Google/Headings.yml b/.github/styles/Google/Headings.yml index 168eb6c050bd..a53301338a47 100644 --- a/.github/styles/Google/Headings.yml +++ b/.github/styles/Google/Headings.yml @@ -1,7 +1,7 @@ extends: capitalization message: "'%s' should use sentence-style capitalization." link: 'https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings' -level: suggestion +level: warning scope: heading match: $sentence indicators: diff --git a/.github/styles/Google/Spacing.yml b/.github/styles/Google/Spacing.yml index e0d26537eb99..b430011c2ca5 100644 --- a/.github/styles/Google/Spacing.yml +++ b/.github/styles/Google/Spacing.yml @@ -1,13 +1,10 @@ extends: existence message: "'%s' should have one space." link: 'https://developers.google.com/style/sentence-spacing' -level: error +level: warning nonword: true action: name: remove -# FIXME: This rule complains about Sphinx directives like -# ":class:`Dataset `". tokens: - - '^(?!a)b$' # This regex is impossible to match. - # - '[a-z][.?!] {2,}[A-Z]' - # - '[a-z][.?!][A-Z]' + - '[a-z][.?!] 
{2,}[A-Z]' + - '[a-z][.?!][A-Z]' diff --git a/.github/styles/Google/WordList.yml b/.github/styles/Google/WordList.yml index 0d675f2372a2..1b502744ec33 100644 --- a/.github/styles/Google/WordList.yml +++ b/.github/styles/Google/WordList.yml @@ -75,7 +75,7 @@ swap: synch: sync tablename: table name tablet: device - touch: tap + # touch: tap # We rarely use touch in the sense of "tap" in our docs. url: URL vs\.: versus World Wide Web: web diff --git a/.github/styles/Vocab/Data/accept.txt b/.github/styles/Vocab/Data/accept.txt new file mode 100644 index 000000000000..8270e8b56dc4 --- /dev/null +++ b/.github/styles/Vocab/Data/accept.txt @@ -0,0 +1,21 @@ +[Pp]ushdown +[Ii]ngest +[Gg]roupby +TFRecord(s)? +Dask +Modin +[Dd]atasource +[Pp]refetch +[Pp]refetching +[Ii]ndexable +[Pp]reprocess +[Pp]reprocessor(s)? +Spotify('s)? +Predibase('s)? +UDF(s)? +ndarray(s)? +dtype +[Ll]ookup(s)? +[Mm]ultiget(s)? +[Ss]calers +Data('s)? \ No newline at end of file diff --git a/.github/styles/Vocab/General/accept.txt b/.github/styles/Vocab/General/accept.txt new file mode 100644 index 000000000000..9bc01c2bdc16 --- /dev/null +++ b/.github/styles/Vocab/General/accept.txt @@ -0,0 +1,14 @@ +[Ii]nteroperates +CPU[s] +GPU[s] +# Use 'API' judiciously: https://developers.google.com/style/word-list#api. +API[s] +[Aa]pplication +NumPy +[Pp]erformant +[Cc]odec +URI[s] +[Ii]nterpretability +[Pp]arallelization +[Ss]ubclassing +[Dd]ict(s)? 
\ No newline at end of file diff --git a/.github/styles/Vocab/Ray/reject.txt b/.github/styles/Vocab/Ray/reject.txt deleted file mode 100644 index 12c6d5d5eac2..000000000000 --- a/.github/styles/Vocab/Ray/reject.txt +++ /dev/null @@ -1 +0,0 @@ -torch diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt deleted file mode 100644 index bfc654ae0956..000000000000 --- a/.github/styles/Vocab/ray/accept.txt +++ /dev/null @@ -1,27 +0,0 @@ -Data's -APIs -Ray Data -API[s] -UDF[s] -CPU[s] -GPU[s] -performant -config -ingest -application -touch -ndarray[s] -dataset's -URI[s] -codec -interoperates -Spotify's -preprocess -Predibase -pushdown -dicts -[gG]roupby -parallelization -prefetching -indexable -dtype diff --git a/.vale.ini b/.vale.ini index 1189591d35d4..119c309ca4c8 100644 --- a/.vale.ini +++ b/.vale.ini @@ -1,10 +1,15 @@ StylesPath = .github/styles -Vocab = Ray +Vocab = General, Data MinAlertLevel = suggestion Packages = Google +[*.rst] +TokenIgnores = (:class:`.*`)|(:.*:`.*`)|(`.*`) + [*.{md,rst}] BasedOnStyles = Vale, Google +Google.Colons = No +Google.Headings = No diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 648984d6ed0f..2003a3a8d7e6 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -18,14 +18,14 @@ mock numpy scikit-image pandas -pickle5 +# pickle5 pillow pyarrow pydantic < 1.10.0 # Note: more recent typing-extensions does not work well with pinned pydantic <1.10.0 typing-extensions < 4.6.0 pyyaml -pytorch-lightning==1.6.5 +pytorch-lightning scikit-optimize redis starlette diff --git a/doc/source/data/api/from_other_data_libs.rst b/doc/source/data/api/from_other_data_libs.rst index 1a62249bb750..30a611f3b1d9 100644 --- a/doc/source/data/api/from_other_data_libs.rst +++ b/doc/source/data/api/from_other_data_libs.rst @@ -5,15 +5,15 @@ API Guide for Users from Other Data Libraries Ray Data is a data loading and preprocessing library for ML. 
It shares certain similarities with other ETL data processing libraries, but also has its own focus. -In this API guide, we will provide API mappings for users who come from those data +This guide provides API mappings for users who come from those data libraries, so you can quickly map what you may already know to Ray Data APIs. .. note:: - This is meant to map APIs that perform comparable but not necessarily identical operations. - Please check the API reference for exact semantics and usage. - - This list may not be exhaustive: Ray Data is not a traditional ETL data processing library, so not all data processing APIs can map to Datasets. - In addition, we try to focus on common APIs or APIs that are less obvious to see a connection. + See the API reference for exact semantics and usage. + - This list may not be exhaustive: Ray Data isn't a traditional ETL data processing library, so not all data processing APIs can map to Datasets. + In addition, this list focuses on common APIs or APIs that are less obvious to see a connection. .. _api-guide-for-pandas-users: @@ -72,19 +72,19 @@ For PyArrow Users * - PyArrow Table API - Ray Data API - * - pa.Table.schema + * - ``pa.Table.schema`` - :meth:`ds.schema() ` - * - pa.Table.num_rows + * - ``pa.Table.num_rows`` - :meth:`ds.count() ` - * - pa.Table.filter() + * - ``pa.Table.filter()`` - :meth:`ds.filter() ` - * - pa.Table.drop() + * - ``pa.Table.drop()`` - :meth:`ds.drop_columns() ` - * - pa.Table.add_column() + * - ``pa.Table.add_column()`` - :meth:`ds.add_column() ` - * - pa.Table.groupby() + * - ``pa.Table.groupby()`` - :meth:`ds.groupby() ` - * - pa.Table.sort_by() + * - ``pa.Table.sort_by()`` - :meth:`ds.sort() ` diff --git a/doc/source/data/api/grouped_data.rst b/doc/source/data/api/grouped_data.rst index fce6a8d9705e..e7abb1f9187e 100644 --- a/doc/source/data/api/grouped_data.rst +++ b/doc/source/data/api/grouped_data.rst @@ -5,7 +5,8 @@ GroupedData API ..
currentmodule:: ray.data -GroupedData objects are returned by groupby call: Dataset.groupby(). +GroupedData objects are returned by groupby call: +:meth:`Dataset.groupby() `. Constructor ----------- diff --git a/doc/source/data/api/random_access_dataset.rst b/doc/source/data/api/random_access_dataset.rst index 6bfbdba1585c..82c3bf1d14da 100644 --- a/doc/source/data/api/random_access_dataset.rst +++ b/doc/source/data/api/random_access_dataset.rst @@ -5,7 +5,8 @@ RandomAccessDataset (experimental) .. currentmodule:: ray.data -RandomAccessDataset objects are returned by call: Dataset.to_random_access_dataset(). +RandomAccessDataset objects are returned by call: +:meth:`Dataset.to_random_access_dataset() `. Constructor ----------- diff --git a/doc/source/data/batch_inference.rst b/doc/source/data/batch_inference.rst index 8b8e5a0236e1..1f54d5c14214 100644 --- a/doc/source/data/batch_inference.rst +++ b/doc/source/data/batch_inference.rst @@ -340,7 +340,7 @@ The remaining is the same as the :ref:`Quickstart `. Configuring Batch Size ~~~~~~~~~~~~~~~~~~~~~~ -Configure the size of the input batch that is passed to ``__call__`` by setting the ``batch_size`` argument for :meth:`ds.map_batches() ` +Configure the size of the input batch that's passed to ``__call__`` by setting the ``batch_size`` argument for :meth:`ds.map_batches() ` Increasing batch size results in faster execution because inference is a vectorized operation. For GPU inference, increasing batch size increases GPU utilization. Set the batch size to as large possible without running out of memory. If you encounter out-of-memory errors, decreasing ``batch_size`` may help. @@ -464,7 +464,7 @@ Models that have been trained with :ref:`Ray Train ` can then be use **Step 3:** Use Ray Data for batch inference. To load in the model from the :class:`Checkpoint ` inside the Python class, use one of the framework-specific Checkpoint classes. -In this case, we use the :class:`XGBoostCheckpoint ` to load the model.
+In this case, use :class:`XGBoostCheckpoint ` to load the model. The rest of the logic looks the same as in the `Quickstart <#quickstart>`_. diff --git a/doc/source/data/examples/custom-datasource.rst b/doc/source/data/examples/custom-datasource.rst index e60b7b423fa8..2ed79fa572c7 100644 --- a/doc/source/data/examples/custom-datasource.rst +++ b/doc/source/data/examples/custom-datasource.rst @@ -1,5 +1,8 @@ .. _custom_datasources: +.. TODO: Re-write this guide with correct editorial style. +.. vale off + ================================ Implementing a Custom Datasource ================================ @@ -38,7 +41,7 @@ By the end of the guide, you will have a ``MongoDatasource`` that you can use to There are a few MongoDB concepts involved here. The `URI `__ points to a MongoDB instance, which hosts `Databases and Collections `__. A collection is analogous to a table in SQL databases. MongoDB also has a `pipeline `__ concept, - which expresses document processing in a series of stages (e.g. match documents with a predicate, sort results, and then select a few fields). + which expresses document processing in a series of stages (for example, match documents with a predicate, sort results, and then select a few fields). The execution results of the pipelines are used to create dataset. A custom datasource is an implementation of :class:`~ray.data.Datasource`. In this diff --git a/doc/source/data/examples/index.rst b/doc/source/data/examples/index.rst index bcaeb5b8ccbb..01bccaff9774 100644 --- a/doc/source/data/examples/index.rst +++ b/doc/source/data/examples/index.rst @@ -26,10 +26,6 @@ Ray Data Examples .. _data-recipes: -Ray Data is a data processing engine that supports multiple data -modalities and types. Here you will find a few end-to-end examples of some basic data -processing with Ray Data on tabular data, text (coming soon), and images. - Computer Vision --------------- .. 
grid:: 1 2 2 3 diff --git a/doc/source/data/examples/random-access.rst b/doc/source/data/examples/random-access.rst index 2b985bb85a86..dd33fe114ea8 100644 --- a/doc/source/data/examples/random-access.rst +++ b/doc/source/data/examples/random-access.rst @@ -36,7 +36,7 @@ Similar to Dataset, a RandomAccessDataset can be passed to and used from any Ray Architecture ------------ -RandomAccessDataset spreads its workers evenly across the cluster. Each worker fetches and pins in shared memory all blocks of the sorted source data found on its node. In addition, it is ensured that each block is assigned to at least one worker. A central index of block to key-range assignments is computed, which is used to serve lookups. +RandomAccessDataset spreads its workers evenly across the cluster. Each worker fetches and pins in shared memory all blocks of the sorted source data found on its node. In addition, it's ensured that each block is assigned to at least one worker. A central index of block to key-range assignments is computed, which is used to serve lookups. Lookups occur as follows: @@ -44,16 +44,16 @@ Lookups occur as follows: * Second, an actor that has the block pinned is selected (this is done randomly). * A method call is sent to the actor, which then performs binary search to locate the record for the key. -This means that each random lookup costs ~1 network RTT as well as a small amount of computation on both the client and server side. +This means that each random lookup costs ~1 network round-trip time, as well as a small amount of computation on both the client and server side. Performance ----------- -Since actor communication goes directly from worker to worker in Ray, the throughput of a RandomAccessDataset scales linearly with the number of workers available. 
As a rough measure, a single worker can provide ~2k individual gets/s and serve ~10k records/s for multigets, and this scales linearly as you increase the number of clients and workers for a single RandomAccessDataset. Large workloads may require hundreds of workers for sufficient throughput. You will also generally want more workers than clients, since the client does less computation than worker actors do. +Since actor communication goes directly from worker to worker in Ray, the throughput of a RandomAccessDataset scales linearly with the number of workers available. As a rough measure, a single worker can provide ~2k individual gets/s and serve ~10k records/s for multigets, and this scales linearly as you increase the number of clients and workers for a single RandomAccessDataset. Large workloads may require hundreds of workers for sufficient throughput. You also generally want more workers than clients, since the client does less computation than worker actors do. -To debug performance problems, use ``random_access_ds.stats()``. This will return a string showing the actor-side measured latencies as well as the distribution of data blocks and queries across the actors. Load imbalances can cause bottlenecks as certain actors receive more requests than others. Ensure that load is evenly distributed across the key space to avoid this. +To debug performance problems, use ``random_access_ds.stats()``. This returns a string showing the actor-side measured latencies as well as the distribution of data blocks and queries across the actors. Load imbalances can cause bottlenecks as certain actors receive more requests than others. Ensure that load is evenly distributed across the key space to avoid this. -It is important to note that the client (Ray worker process) can also be a bottleneck. 
To scale past the throughput of a single client, use multiple tasks to gather the data, for example: +It's important to note that the client (Ray worker process) can also be a bottleneck. To scale past the throughput of a single client, use multiple tasks to gather the data, for example: .. testcode:: @@ -81,4 +81,4 @@ It is important to note that the client (Ray worker process) can also be a bottl Fault Tolerance --------------- -Currently, RandomAccessDataset is not fault-tolerant. Losing any of the worker actors invalidates the dataset, and it must be re-created from the source data. +Currently, RandomAccessDataset isn't fault-tolerant. Losing any of the worker actors invalidates the dataset, and it must be re-created from the source data. diff --git a/doc/source/data/performance-tips.rst b/doc/source/data/performance-tips.rst index bf97e7fdae3d..18a25fb115f1 100644 --- a/doc/source/data/performance-tips.rst +++ b/doc/source/data/performance-tips.rst @@ -26,10 +26,10 @@ Tuning read parallelism By default, Ray Data automatically selects the read ``parallelism`` according to the following procedure: 1. The number of available CPUs is estimated. If in a placement group, the number of CPUs in the cluster is scaled by the size of the placement group compared to the cluster size. If not in a placement group, this is the number of CPUs in the cluster. -2. The parallelism is set to the estimated number of CPUs multiplied by 2. If the parallelism is less than 8, it is set to 8. +2. The parallelism is set to the estimated number of CPUs multiplied by 2. If the parallelism is less than 8, it's set to 8. 3. The in-memory data size is estimated. If the parallelism would create in-memory blocks that are larger on average than the target block size (512MiB), the parallelism is increased until the blocks are < 512MiB in size. -Occasionally, it is advantageous to manually tune the parallelism to optimize the application. 
This can be done when loading data via the ``parallelism`` parameter. +Occasionally, it's advantageous to manually tune the parallelism to optimize the application. This can be done when loading data via the ``parallelism`` parameter. For example, use ``ray.data.read_parquet(path, parallelism=1000)`` to force up to 1000 read tasks to be created. Tuning read resources @@ -101,7 +101,7 @@ Enabling push-based shuffle Some Dataset operations require a *shuffle* operation, meaning that data is shuffled from all of the input partitions to all of the output partitions. These operations include :meth:`Dataset.random_shuffle `, :meth:`Dataset.sort ` and :meth:`Dataset.groupby `. -Shuffle can be challenging to scale to large data sizes and clusters, especially when the total dataset size cannot fit into memory. +Shuffle can be challenging to scale to large data sizes and clusters, especially when the total dataset size can't fit into memory. Datasets provides an alternative shuffle implementation known as push-based shuffle for improving large-scale performance. Try this out if your dataset has more than 1000 blocks or is larger than 1 TB in size. diff --git a/doc/source/data/preprocessors.rst b/doc/source/data/preprocessors.rst index 8b775916543a..e53fcc937061 100644 --- a/doc/source/data/preprocessors.rst +++ b/doc/source/data/preprocessors.rst @@ -6,7 +6,7 @@ Using Preprocessors Data preprocessing is a common technique for transforming raw data into features for a machine learning model. In general, you may want to apply the same preprocessing logic to your offline training data and online inference data. -This page covers *preprocessors*, which are a higher level API on top of existing Ray Data operations like `map_batches`, +This page covers *preprocessors*, which are a higher level API on top of existing Ray Data operations like ``map_batches``, targeted towards tabular and structured data use cases. 
If you are working with tabular data, you should use Ray Data preprocessors. However, the recommended way to perform preprocessing @@ -23,7 +23,7 @@ Overview The :class:`Preprocessor ` class has four public methods: -#. :meth:`fit() `: Compute state information about a :class:`Dataset ` (e.g., the mean or standard deviation of a column) +#. :meth:`fit() `: Compute state information about a :class:`Dataset ` (for example, the mean or standard deviation of a column) and save it to the :class:`Preprocessor `. This information is used to perform :meth:`transform() `, and the method is typically called on a training dataset. #. :meth:`transform() `: Apply a transformation to a :class:`Dataset `. @@ -32,7 +32,7 @@ The :class:`Preprocessor ` class has four pu #. :meth:`transform_batch() `: Apply a transformation to a single :class:`batch ` of data. This method is typically called on online or offline inference data. #. :meth:`fit_transform() `: Syntactic sugar for calling both :meth:`fit() ` and :meth:`transform() ` on a :class:`Dataset `. -To show these methods in action, let's walk through a basic example. First, we'll set up two simple Ray ``Dataset``\s. +To show these methods in action, walk through a basic example. First, you'll set up two simple Ray ``Dataset``\s. .. literalinclude:: doc_code/preprocessors.py :language: python @@ -266,7 +266,7 @@ If you want to implement a custom preprocessor that needs to be fit, extend the If your preprocessor doesn't need to be fit, construct a :class:`~ray.data.preprocessors.BatchMapper` to apply a UDF in parallel over your data. :class:`~ray.data.preprocessors.BatchMapper` can drop, add, or modify columns, and you -can specify a `batch_size` to control the size of the data batches provided to your UDF. +can specify a ``batch_size`` to control the size of the data batches provided to your UDF. .. 
literalinclude:: doc_code/preprocessors.py :language: python diff --git a/doc/source/data/saving-data.rst b/doc/source/data/saving-data.rst index a2ce55261606..5fd2529bd7e8 100644 --- a/doc/source/data/saving-data.rst +++ b/doc/source/data/saving-data.rst @@ -7,7 +7,7 @@ Saving Data Ray Data lets you save data in files or other Python objects. This guide shows you how to: - + * `Write data to files <#writing-data-to-files>`_ * `Convert Datasets to other Python libraries <#converting-datasets-to-other-python-libraries>`_ diff --git a/doc/source/data/working-with-images.rst b/doc/source/data/working-with-images.rst index 1d458c47bc16..57b1521c0ab9 100644 --- a/doc/source/data/working-with-images.rst +++ b/doc/source/data/working-with-images.rst @@ -242,7 +242,7 @@ Finally, call :meth:`Dataset.map_batches() `. {'class': 296} For more information on performing inference, see -:ref:`End-to-end: Offline Batch Inference ` +:ref:`End-to-end: offline batch inference ` and :ref:`Transforming batches with actors `. .. _saving_images: @@ -250,7 +250,7 @@ and :ref:`Transforming batches with actors `. Saving images ------------- -Save images with formats like PNG, Parquet, and Numpy. To view all supported formats, +Save images with formats like PNG, Parquet, and NumPy. To view all supported formats, see the :ref:`Input/Output reference `. .. tab-set:: diff --git a/doc/source/data/working-with-tensors.rst b/doc/source/data/working-with-tensors.rst index 1bf606869c14..392765f4e56a 100644 --- a/doc/source/data/working-with-tensors.rst +++ b/doc/source/data/working-with-tensors.rst @@ -3,7 +3,7 @@ Working with Tensors ==================== -N-dimensional arrays (that is, tensors) are ubiquitous in ML workloads. This guide +N-dimensional arrays (in other words, tensors) are ubiquitous in ML workloads. This guide describes the limitations and best practices of working with such data. 
Tensor data representation diff --git a/doc/source/data/working-with-text.rst b/doc/source/data/working-with-text.rst index da639c45a173..6540f6748289 100644 --- a/doc/source/data/working-with-text.rst +++ b/doc/source/data/working-with-text.rst @@ -15,6 +15,8 @@ This guide shows you how to: Reading text files ------------------ +Food: Spam ham eggs. + Ray Data can read lines of text and JSONL. Alternatively, you can read raw binary files and manually decode data. From bbf60f83616c7c632074d29b9cfba187802b80a0 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 11:56:17 -0700 Subject: [PATCH 14/21] Update stuff Signed-off-by: Balaji Veeramani --- doc/source/data/batch_inference.rst | 2 +- doc/source/data/data-internals.rst | 2 +- doc/source/data/loading-data.rst | 2 +- doc/source/data/performance-tips.rst | 2 +- python/ray/data/tests/test_dynamic_block_split.py | 3 +-- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/data/batch_inference.rst b/doc/source/data/batch_inference.rst index 1f54d5c14214..4effd4f338a5 100644 --- a/doc/source/data/batch_inference.rst +++ b/doc/source/data/batch_inference.rst @@ -26,7 +26,7 @@ To start, install Ray Data: Using Ray Data for offline inference involves four basic steps: -- **Step 1:** Load your data into a Ray Dataset. Ray Data supports many different data sources and formats. For more details, see :ref:`Loading Data `. +- **Step 1:** Load your data into a Ray Dataset. Ray Data supports many different datasources and formats. For more details, see :ref:`Loading Data `. - **Step 2:** Define a Python class to load the pre-trained model. - **Step 3:** Transform your dataset using the pre-trained model by calling :meth:`ds.map_batches() `. For more details, see :ref:`Transforming Data `. - **Step 4:** Get the final predictions by either iterating through the output or saving the results. For more details, see the :ref:`Iterating over data ` and :ref:`Saving data ` user guides. 
diff --git a/doc/source/data/data-internals.rst b/doc/source/data/data-internals.rst index 833bb19624df..033612279e9e 100644 --- a/doc/source/data/data-internals.rst +++ b/doc/source/data/data-internals.rst @@ -40,7 +40,7 @@ task reads one or more files and produces an output block: .. https://docs.google.com/drawings/d/15B4TB8b5xN15Q9S8-s0MjW6iIvo_PrH7JtV1fL123pU/edit -To handle transient errors from remote data sources, Ray Data retries application-level +To handle transient errors from remote datasources, Ray Data retries application-level exceptions. For more information on loading data, see :ref:`Loading data `. diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index 1e57a8904d6b..4ce1e75119d1 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -925,7 +925,7 @@ Synthetic datasets can be useful for testing and benchmarking. ------ ---- data numpy.ndarray(shape=(64, 64), dtype=int64) -Loading other data sources +Loading other datasources ========================== If Ray Data can't load your data, subclass diff --git a/doc/source/data/performance-tips.rst b/doc/source/data/performance-tips.rst index 18a25fb115f1..20abfe47668c 100644 --- a/doc/source/data/performance-tips.rst +++ b/doc/source/data/performance-tips.rst @@ -36,7 +36,7 @@ Tuning read resources ~~~~~~~~~~~~~~~~~~~~~ By default, Ray requests 1 CPU per read task, which means one read tasks per CPU can execute concurrently. -For data sources that benefit from more IO parallelism, you can specify a lower ``num_cpus`` value for the read function with the ``ray_remote_args`` parameter. +For datasources that benefit from more IO parallelism, you can specify a lower ``num_cpus`` value for the read function with the ``ray_remote_args`` parameter. For example, use ``ray.data.read_parquet(path, ray_remote_args={"num_cpus": 0.25})`` to allow up to four read tasks per CPU. 
Parquet column pruning diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index 8643db172f84..0e76ab42e2c3 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -1,12 +1,11 @@ import os import time +import numpy as np import pandas as pd import pyarrow as pa import pytest -import numpy as np - import ray from ray.data import Dataset from ray.data._internal.lazy_block_list import LazyBlockList From 13ed8dc48aaa07ff3153f87f0e2d094568232fe4 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 12:00:27 -0700 Subject: [PATCH 15/21] Fix stuff Signed-off-by: Balaji Veeramani --- .github/styles/Vocab/General/reject.txt | 1 + .github/workflows/vale.yml | 13 ------------- doc/requirements-doc.txt | 4 ++-- doc/source/data/api/from_other_data_libs.rst | 2 +- doc/source/data/working-with-images.rst | 2 +- doc/source/data/working-with-pytorch.rst | 2 +- doc/source/data/working-with-text.rst | 2 -- python/ray/data/tests/test_dynamic_block_split.py | 2 +- 8 files changed, 7 insertions(+), 21 deletions(-) create mode 100644 .github/styles/Vocab/General/reject.txt delete mode 100644 .github/workflows/vale.yml diff --git a/.github/styles/Vocab/General/reject.txt b/.github/styles/Vocab/General/reject.txt new file mode 100644 index 000000000000..02cc4c0883fe --- /dev/null +++ b/.github/styles/Vocab/General/reject.txt @@ -0,0 +1 @@ +[Pp]lease \ No newline at end of file diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml deleted file mode 100644 index 5f3f0454d464..000000000000 --- a/.github/workflows/vale.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: reviewdog -on: [pull_request] - -jobs: - vale: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: errata-ai/vale-action@reviewdog - with: - files: doc/source/data/getting-started.rst - fail_on_error: true - level: warning diff --git 
a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 2003a3a8d7e6..648984d6ed0f 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -18,14 +18,14 @@ mock numpy scikit-image pandas -# pickle5 +pickle5 pillow pyarrow pydantic < 1.10.0 # Note: more recent typing-extensions does not work well with pinned pydantic <1.10.0 typing-extensions < 4.6.0 pyyaml -pytorch-lightning +pytorch-lightning==1.6.5 scikit-optimize redis starlette diff --git a/doc/source/data/api/from_other_data_libs.rst b/doc/source/data/api/from_other_data_libs.rst index 30a611f3b1d9..500cb123ebbd 100644 --- a/doc/source/data/api/from_other_data_libs.rst +++ b/doc/source/data/api/from_other_data_libs.rst @@ -11,7 +11,7 @@ libraries, so you can quickly map what you may already know to Ray Data APIs. .. note:: - This is meant to map APIs that perform comparable but not necessarily identical operations. - Please select the API reference for exact semantics and usage. + Select the API reference for exact semantics and usage. - This list may not be exhaustive: Ray Data isn't a traditional ETL data processing library, so not all data processing APIs can map to Datasets. In addition, this list focuses on common APIs or APIs that are less obvious to see a connection. diff --git a/doc/source/data/working-with-images.rst b/doc/source/data/working-with-images.rst index 57b1521c0ab9..a66fd695815a 100644 --- a/doc/source/data/working-with-images.rst +++ b/doc/source/data/working-with-images.rst @@ -242,7 +242,7 @@ Finally, call :meth:`Dataset.map_batches() `. {'class': 296} For more information on performing inference, see -:ref:`End-to-end: offline batch inference ` +:ref:`End-to-end: Offline Batch Inference ` and :ref:`Transforming batches with actors `. .. 
_saving_images: diff --git a/doc/source/data/working-with-pytorch.rst b/doc/source/data/working-with-pytorch.rst index 1ae1515981c8..4f92e01a875e 100644 --- a/doc/source/data/working-with-pytorch.rst +++ b/doc/source/data/working-with-pytorch.rst @@ -45,7 +45,7 @@ Ray Data integrates with :ref:`Ray Train ` for easy data ingest for .. testcode:: import torch - from Torch import nn + from torch import nn import ray from ray import train from ray.train import ScalingConfig diff --git a/doc/source/data/working-with-text.rst b/doc/source/data/working-with-text.rst index 6540f6748289..da639c45a173 100644 --- a/doc/source/data/working-with-text.rst +++ b/doc/source/data/working-with-text.rst @@ -15,8 +15,6 @@ This guide shows you how to: Reading text files ------------------ -Food: Spam ham eggs. - Ray Data can read lines of text and JSONL. Alternatively, you can read raw binary files and manually decode data. diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index 0e76ab42e2c3..0d5de4acbf6e 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -16,7 +16,7 @@ from ray.tests.conftest import * # noqa -# Datasource generates random bytes data +# Data source generates random bytes data class RandomBytesDatasource(Datasource): def create_reader(self, **read_args): return RandomBytesReader( From df1b0e0a30d1463d99727cfd9ca46dc357a8c8b4 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 12:03:18 -0700 Subject: [PATCH 16/21] Sort files Signed-off-by: Balaji Veeramani --- .github/styles/Vocab/Data/accept.txt | 26 ++++++++++++------------- .github/styles/Vocab/General/accept.txt | 14 ++++++------- .github/styles/Vocab/General/reject.txt | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/styles/Vocab/Data/accept.txt b/.github/styles/Vocab/Data/accept.txt index 8270e8b56dc4..e612b3460d46 100644 --- 
a/.github/styles/Vocab/Data/accept.txt +++ b/.github/styles/Vocab/Data/accept.txt @@ -1,21 +1,21 @@ -[Pp]ushdown -[Ii]ngest -[Gg]roupby -TFRecord(s)? Dask +Data('s)? Modin +Predibase('s)? +Spotify('s)? +TFRecord(s)? +UDF(s)? [Dd]atasource +[Gg]roupby +[Ii]ndexable +[Ii]ngest +[Ll]ookup(s)? +[Mm]ultiget(s)? [Pp]refetch [Pp]refetching -[Ii]ndexable [Pp]reprocess [Pp]reprocessor(s)? -Spotify('s)? -Predibase('s)? -UDF(s)? -ndarray(s)? -dtype -[Ll]ookup(s)? -[Mm]ultiget(s)? +[Pp]ushdown [Ss]calers -Data('s)? \ No newline at end of file +dtype +ndarray(s)? diff --git a/.github/styles/Vocab/General/accept.txt b/.github/styles/Vocab/General/accept.txt index 9bc01c2bdc16..208355bd18fb 100644 --- a/.github/styles/Vocab/General/accept.txt +++ b/.github/styles/Vocab/General/accept.txt @@ -1,14 +1,14 @@ -[Ii]nteroperates -CPU[s] -GPU[s] # Use 'API' judiciously: https://developers.google.com/style/word-list#api. API[s] -[Aa]pplication +CPU[s] +GPU[s] NumPy -[Pp]erformant -[Cc]odec URI[s] +[Aa]pplication +[Cc]odec +[Dd]ict(s)? +[Ii]nteroperates [Ii]nterpretability [Pp]arallelization +[Pp]erformant [Ss]ubclassing -[Dd]ict(s)? 
\ No newline at end of file diff --git a/.github/styles/Vocab/General/reject.txt b/.github/styles/Vocab/General/reject.txt index 02cc4c0883fe..c2b33abf6daf 100644 --- a/.github/styles/Vocab/General/reject.txt +++ b/.github/styles/Vocab/General/reject.txt @@ -1 +1 @@ -[Pp]lease \ No newline at end of file +[Pp]lease From 4dcf8f2a241960da2429744165e86542bd18b0c2 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 12:08:19 -0700 Subject: [PATCH 17/21] Add notes Signed-off-by: Balaji Veeramani --- .vale.ini | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.vale.ini b/.vale.ini index 119c309ca4c8..14c01c4dbf5e 100644 --- a/.vale.ini +++ b/.vale.ini @@ -7,9 +7,15 @@ MinAlertLevel = suggestion Packages = Google [*.rst] +# HACK(@bveeramani): I have no clue why we need to include `(:class:`.*`)` in addition +# to `(:.*:`.*`)`, but we get false positives if we don't. `TokenIgnores` is weird. TokenIgnores = (:class:`.*`)|(:.*:`.*`)|(`.*`) [*.{md,rst}] BasedOnStyles = Vale, Google +# We're disabling "Colons" because we disagree with Google's suggestion to lowercase the +# first word after a colon. Google.Colons = No +# TODO(@bveeramani): We're temporarily disabling "Heading". In the future, we'll update +# all headings and enable this rule. Google.Headings = No From 7808303ecebbd5dd1813a11bd0d3c5d856fb5300 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 12:11:08 -0700 Subject: [PATCH 18/21] Remove whitespace Signed-off-by: Balaji Veeramani --- doc/source/data/saving-data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/data/saving-data.rst b/doc/source/data/saving-data.rst index 5fd2529bd7e8..a2ce55261606 100644 --- a/doc/source/data/saving-data.rst +++ b/doc/source/data/saving-data.rst @@ -7,7 +7,7 @@ Saving Data Ray Data lets you save data in files or other Python objects. 
This guide shows you how to: - + * `Write data to files <#writing-data-to-files>`_ * `Convert Datasets to other Python libraries <#converting-datasets-to-other-python-libraries>`_ From 6ac1000a72a74fb425fc1e9913c6a16c9e9f7c9c Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 22:50:07 -0700 Subject: [PATCH 19/21] Update files Address review comments Signed-off-by: Balaji Veeramani --- .github/styles/Google/Acronyms.yml | 2 ++ .github/styles/Google/Will.yml | 1 + .github/styles/Vocab/General/reject.txt | 1 + doc/source/data/preprocessors.rst | 2 +- 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/styles/Google/Acronyms.yml b/.github/styles/Google/Acronyms.yml index f15489e94c99..695bb5864f2c 100644 --- a/.github/styles/Google/Acronyms.yml +++ b/.github/styles/Google/Acronyms.yml @@ -22,6 +22,8 @@ exceptions: - ETL - FAQ - GCC + - GCE + - GCP - GDB - GET - GPU diff --git a/.github/styles/Google/Will.yml b/.github/styles/Google/Will.yml index 128a918362b8..20f6bc55294d 100644 --- a/.github/styles/Google/Will.yml +++ b/.github/styles/Google/Will.yml @@ -5,3 +5,4 @@ ignorecase: true level: warning tokens: - will + - "'ll" diff --git a/.github/styles/Vocab/General/reject.txt b/.github/styles/Vocab/General/reject.txt index c2b33abf6daf..b94f0057f42e 100644 --- a/.github/styles/Vocab/General/reject.txt +++ b/.github/styles/Vocab/General/reject.txt @@ -1 +1,2 @@ [Pp]lease +[Cc]ongratulations diff --git a/doc/source/data/preprocessors.rst b/doc/source/data/preprocessors.rst index e53fcc937061..aca5aa0377ab 100644 --- a/doc/source/data/preprocessors.rst +++ b/doc/source/data/preprocessors.rst @@ -32,7 +32,7 @@ The :class:`Preprocessor ` class has four pu #. :meth:`transform_batch() `: Apply a transformation to a single :class:`batch ` of data. This method is typically called on online or offline inference data. #. :meth:`fit_transform() `: Syntactic sugar for calling both :meth:`fit() ` and :meth:`transform() ` on a :class:`Dataset `. 
-To show these methods in action, walk through a basic example. First, you'll set up two simple Ray ``Dataset``\s. +To show these methods in action, walk through a basic example. First, set up two simple Ray ``Dataset``\s. .. literalinclude:: doc_code/preprocessors.py :language: python From 6e2dea9b6d78d7c843f6bb601a94916290ce8a36 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 23:14:59 -0700 Subject: [PATCH 20/21] Update files Update dashes Signed-off-by: Balaji Veeramani --- .github/styles/Google/EmDash.yml | 2 +- .github/styles/Google/EnDash.yml | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/styles/Google/EmDash.yml b/.github/styles/Google/EmDash.yml index 1befe72aa881..e7231cff5f7a 100644 --- a/.github/styles/Google/EmDash.yml +++ b/.github/styles/Google/EmDash.yml @@ -9,4 +9,4 @@ action: - remove - ' ' tokens: - - '\s[—–]\s' + - '\s-{2,3}\s' diff --git a/.github/styles/Google/EnDash.yml b/.github/styles/Google/EnDash.yml index b314dc4e98ab..0480d79eb9b8 100644 --- a/.github/styles/Google/EnDash.yml +++ b/.github/styles/Google/EnDash.yml @@ -1,5 +1,5 @@ extends: existence -message: "Use an em dash ('—') instead of '–'." +message: "Use an em dash ('---') instead of '--'." link: 'https://developers.google.com/style/dashes' nonword: true level: error @@ -7,7 +7,8 @@ action: name: edit params: - replace - - '-' - - '—' -tokens: - - '–' + - '--' + - '---' +raw: + - '(? Date: Mon, 21 Aug 2023 23:23:46 -0700 Subject: [PATCH 21/21] Update files Remove trailing newine Signed-off-by: Balaji Veeramani --- .github/styles/Google/EnDash.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/styles/Google/EnDash.yml b/.github/styles/Google/EnDash.yml index 0480d79eb9b8..e01331ffcfee 100644 --- a/.github/styles/Google/EnDash.yml +++ b/.github/styles/Google/EnDash.yml @@ -11,4 +11,3 @@ action: - '---' raw: - '(?