From 5984f1ec6f49d8a2e5ae3a362f9c71efe1c8c36e Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Nov 2022 21:00:42 -0800 Subject: [PATCH 01/21] Rename `"model_state_dict"` to `"model"` --- .../train/examples/pytorch/tune_cifar_torch_pbt_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py index 71472227a249..46ea6ab3947a 100644 --- a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py @@ -83,7 +83,7 @@ def train_func(config): checkpoint_dict = session.get_checkpoint().to_dict() # Load in model - model_state = checkpoint_dict["model_state_dict"] + model_state = checkpoint_dict["model"] model.load_state_dict(model_state) # Load in optimizer @@ -146,7 +146,7 @@ def train_func(config): checkpoint = Checkpoint.from_dict( { "epoch": epoch, - "model_state_dict": model.state_dict(), + "model": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), } ) From 8f58490187070bc8122629f97f9c9ff07b716f85 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Nov 2022 21:00:49 -0800 Subject: [PATCH 02/21] Revert "Rename `"model_state_dict"` to `"model"`" This reverts commit 5984f1ec6f49d8a2e5ae3a362f9c71efe1c8c36e. 
--- .../train/examples/pytorch/tune_cifar_torch_pbt_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py index 46ea6ab3947a..71472227a249 100644 --- a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py @@ -83,7 +83,7 @@ def train_func(config): checkpoint_dict = session.get_checkpoint().to_dict() # Load in model - model_state = checkpoint_dict["model"] + model_state = checkpoint_dict["model_state_dict"] model.load_state_dict(model_state) # Load in optimizer @@ -146,7 +146,7 @@ def train_func(config): checkpoint = Checkpoint.from_dict( { "epoch": epoch, - "model": model.state_dict(), + "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), } ) From de05655b003c96b3cb9194e6cf21155e04ee22f5 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 26 Jan 2023 11:56:49 -0800 Subject: [PATCH 03/21] Update annotations.py Signed-off-by: Balaji Veeramani --- python/ray/util/annotations.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/ray/util/annotations.py b/python/ray/util/annotations.py index 9996b092fcab..f7b93746f910 100644 --- a/python/ray/util/annotations.py +++ b/python/ray/util/annotations.py @@ -49,7 +49,7 @@ def PublicAPI(*args, **kwargs): def wrap(obj): if stability in ["alpha", "beta"]: message = ( - f"PublicAPI ({stability}): This API is in {stability} " + f"**PublicAPI ({stability}):** This API is in {stability} " "and may change before becoming stable." ) else: @@ -80,7 +80,8 @@ def DeveloperAPI(*args, **kwargs): def wrap(obj): _append_doc( - obj, message="DeveloperAPI: This API may change across minor Ray releases." 
+ obj, + message="**DeveloperAPI:** This API may change across minor Ray releases.", ) _mark_annotated(obj) return obj From fd2ff917e1cc3258554c56b283db8e8e155cff9a Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 26 Jan 2023 12:02:30 -0800 Subject: [PATCH 04/21] Revert "Update annotations.py" This reverts commit de05655b003c96b3cb9194e6cf21155e04ee22f5. Signed-off-by: Balaji Veeramani --- python/ray/util/annotations.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/ray/util/annotations.py b/python/ray/util/annotations.py index f7b93746f910..9996b092fcab 100644 --- a/python/ray/util/annotations.py +++ b/python/ray/util/annotations.py @@ -49,7 +49,7 @@ def PublicAPI(*args, **kwargs): def wrap(obj): if stability in ["alpha", "beta"]: message = ( - f"**PublicAPI ({stability}):** This API is in {stability} " + f"PublicAPI ({stability}): This API is in {stability} " "and may change before becoming stable." ) else: @@ -80,8 +80,7 @@ def DeveloperAPI(*args, **kwargs): def wrap(obj): _append_doc( - obj, - message="**DeveloperAPI:** This API may change across minor Ray releases.", + obj, message="DeveloperAPI: This API may change across minor Ray releases." 
) _mark_annotated(obj) return obj From f4ddcc986eadb86f4587903357aabc6d3ff05a48 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 23 May 2023 11:20:37 -0700 Subject: [PATCH 05/21] Initial commit Signed-off-by: Balaji Veeramani --- .github/styles/Google/AMPM.yml | 9 + .github/styles/Google/Acronyms.yml | 64 +++++++ .github/styles/Google/Colons.yml | 8 + .github/styles/Google/Contractions.yml | 30 +++ .github/styles/Google/DateFormat.yml | 9 + .github/styles/Google/Ellipses.yml | 9 + .github/styles/Google/EmDash.yml | 12 ++ .github/styles/Google/EnDash.yml | 13 ++ .github/styles/Google/Exclamation.yml | 9 + .github/styles/Google/FirstPerson.yml | 13 ++ .github/styles/Google/Gender.yml | 9 + .github/styles/Google/GenderBias.yml | 47 +++++ .github/styles/Google/HeadingPunctuation.yml | 13 ++ .github/styles/Google/Headings.yml | 29 +++ .github/styles/Google/Latin.yml | 11 ++ .github/styles/Google/LyHyphens.yml | 14 ++ .github/styles/Google/OptionalPlurals.yml | 12 ++ .github/styles/Google/Ordinal.yml | 7 + .github/styles/Google/OxfordComma.yml | 7 + .github/styles/Google/Parens.yml | 7 + .github/styles/Google/Passive.yml | 184 +++++++++++++++++++ .github/styles/Google/Periods.yml | 7 + .github/styles/Google/Quotes.yml | 7 + .github/styles/Google/Ranges.yml | 7 + .github/styles/Google/Semicolons.yml | 8 + .github/styles/Google/Slang.yml | 11 ++ .github/styles/Google/Spacing.yml | 10 + .github/styles/Google/Spelling.yml | 10 + .github/styles/Google/Units.yml | 8 + .github/styles/Google/We.yml | 11 ++ .github/styles/Google/Will.yml | 7 + .github/styles/Google/WordList.yml | 81 ++++++++ .github/styles/Google/meta.json | 4 + .github/styles/Google/vocab.txt | 0 .github/styles/Vocab/ray/accept.txt | 21 +++ .github/workflows/vale.yml | 0 .vale.ini | 10 + 37 files changed, 718 insertions(+) create mode 100644 .github/styles/Google/AMPM.yml create mode 100644 .github/styles/Google/Acronyms.yml create mode 100644 .github/styles/Google/Colons.yml create mode 100644 
.github/styles/Google/Contractions.yml create mode 100644 .github/styles/Google/DateFormat.yml create mode 100644 .github/styles/Google/Ellipses.yml create mode 100644 .github/styles/Google/EmDash.yml create mode 100644 .github/styles/Google/EnDash.yml create mode 100644 .github/styles/Google/Exclamation.yml create mode 100644 .github/styles/Google/FirstPerson.yml create mode 100644 .github/styles/Google/Gender.yml create mode 100644 .github/styles/Google/GenderBias.yml create mode 100644 .github/styles/Google/HeadingPunctuation.yml create mode 100644 .github/styles/Google/Headings.yml create mode 100644 .github/styles/Google/Latin.yml create mode 100644 .github/styles/Google/LyHyphens.yml create mode 100644 .github/styles/Google/OptionalPlurals.yml create mode 100644 .github/styles/Google/Ordinal.yml create mode 100644 .github/styles/Google/OxfordComma.yml create mode 100644 .github/styles/Google/Parens.yml create mode 100644 .github/styles/Google/Passive.yml create mode 100644 .github/styles/Google/Periods.yml create mode 100644 .github/styles/Google/Quotes.yml create mode 100644 .github/styles/Google/Ranges.yml create mode 100644 .github/styles/Google/Semicolons.yml create mode 100644 .github/styles/Google/Slang.yml create mode 100644 .github/styles/Google/Spacing.yml create mode 100644 .github/styles/Google/Spelling.yml create mode 100644 .github/styles/Google/Units.yml create mode 100644 .github/styles/Google/We.yml create mode 100644 .github/styles/Google/Will.yml create mode 100644 .github/styles/Google/WordList.yml create mode 100644 .github/styles/Google/meta.json create mode 100644 .github/styles/Google/vocab.txt create mode 100644 .github/styles/Vocab/ray/accept.txt create mode 100644 .github/workflows/vale.yml create mode 100644 .vale.ini diff --git a/.github/styles/Google/AMPM.yml b/.github/styles/Google/AMPM.yml new file mode 100644 index 000000000000..fbdc6e4f84b9 --- /dev/null +++ b/.github/styles/Google/AMPM.yml @@ -0,0 +1,9 @@ +extends: existence 
+message: "Use 'AM' or 'PM' (preceded by a space)." +link: 'https://developers.google.com/style/word-list' +level: error +nonword: true +tokens: + - '\d{1,2}[AP]M' + - '\d{1,2} ?[ap]m' + - '\d{1,2} ?[aApP]\.[mM]\.' diff --git a/.github/styles/Google/Acronyms.yml b/.github/styles/Google/Acronyms.yml new file mode 100644 index 000000000000..f41af0189b07 --- /dev/null +++ b/.github/styles/Google/Acronyms.yml @@ -0,0 +1,64 @@ +extends: conditional +message: "Spell out '%s', if it's unfamiliar to the audience." +link: 'https://developers.google.com/style/abbreviations' +level: suggestion +ignorecase: false +# Ensures that the existence of 'first' implies the existence of 'second'. +first: '\b([A-Z]{3,5})\b' +second: '(?:\b[A-Z][a-z]+ )+\(([A-Z]{3,5})\)' +# ... with the exception of these: +exceptions: + - API + - ASP + - CLI + - CPU + - CSS + - CSV + - DEBUG + - DOM + - DPI + - FAQ + - GCC + - GDB + - GET + - GPU + - GTK + - GUI + - HTML + - HTTP + - HTTPS + - IDE + - JAR + - JSON + - JSX + - LESS + - LLDB + - NET + - NOTE + - NVDA + - OSS + - PATH + - PDF + - PHP + - POST + - RAM + - REPL + - RSA + - SCM + - SCSS + - SDK + - SQL + - SSH + - SSL + - SVG + - TBD + - TCP + - TODO + - URI + - URL + - USB + - UTF + - XML + - XSS + - YAML + - ZIP diff --git a/.github/styles/Google/Colons.yml b/.github/styles/Google/Colons.yml new file mode 100644 index 000000000000..99363fbd46d7 --- /dev/null +++ b/.github/styles/Google/Colons.yml @@ -0,0 +1,8 @@ +extends: existence +message: "'%s' should be in lowercase." +link: 'https://developers.google.com/style/colons' +nonword: true +level: warning +scope: sentence +tokens: + - ':\s[A-Z]' diff --git a/.github/styles/Google/Contractions.yml b/.github/styles/Google/Contractions.yml new file mode 100644 index 000000000000..4f6fd5d489dc --- /dev/null +++ b/.github/styles/Google/Contractions.yml @@ -0,0 +1,30 @@ +extends: substitution +message: "Use '%s' instead of '%s'." 
+link: 'https://developers.google.com/style/contractions' +level: suggestion +ignorecase: true +action: + name: replace +swap: + are not: aren't + cannot: can't + could not: couldn't + did not: didn't + do not: don't + does not: doesn't + has not: hasn't + have not: haven't + how is: how's + is not: isn't + it is: it's + should not: shouldn't + that is: that's + they are: they're + was not: wasn't + we are: we're + we have: we've + were not: weren't + what is: what's + when is: when's + where is: where's + will not: won't diff --git a/.github/styles/Google/DateFormat.yml b/.github/styles/Google/DateFormat.yml new file mode 100644 index 000000000000..e9d227fa13d5 --- /dev/null +++ b/.github/styles/Google/DateFormat.yml @@ -0,0 +1,9 @@ +extends: existence +message: "Use 'July 31, 2016' format, not '%s'." +link: 'https://developers.google.com/style/dates-times' +ignorecase: true +level: error +nonword: true +tokens: + - '\d{1,2}(?:\.|/)\d{1,2}(?:\.|/)\d{4}' + - '\d{1,2} (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)|May|Jun(?:e)|Jul(?:y)|Aug(?:ust)|Sep(?:tember)?|Oct(?:ober)|Nov(?:ember)?|Dec(?:ember)?) \d{4}' diff --git a/.github/styles/Google/Ellipses.yml b/.github/styles/Google/Ellipses.yml new file mode 100644 index 000000000000..1e070517bfe4 --- /dev/null +++ b/.github/styles/Google/Ellipses.yml @@ -0,0 +1,9 @@ +extends: existence +message: "In general, don't use an ellipsis." +link: 'https://developers.google.com/style/ellipses' +nonword: true +level: warning +action: + name: remove +tokens: + - '\.\.\.' diff --git a/.github/styles/Google/EmDash.yml b/.github/styles/Google/EmDash.yml new file mode 100644 index 000000000000..1befe72aa881 --- /dev/null +++ b/.github/styles/Google/EmDash.yml @@ -0,0 +1,12 @@ +extends: existence +message: "Don't put a space before or after a dash." 
+link: 'https://developers.google.com/style/dashes' +nonword: true +level: error +action: + name: edit + params: + - remove + - ' ' +tokens: + - '\s[—–]\s' diff --git a/.github/styles/Google/EnDash.yml b/.github/styles/Google/EnDash.yml new file mode 100644 index 000000000000..b314dc4e98ab --- /dev/null +++ b/.github/styles/Google/EnDash.yml @@ -0,0 +1,13 @@ +extends: existence +message: "Use an em dash ('—') instead of '–'." +link: 'https://developers.google.com/style/dashes' +nonword: true +level: error +action: + name: edit + params: + - replace + - '-' + - '—' +tokens: + - '–' diff --git a/.github/styles/Google/Exclamation.yml b/.github/styles/Google/Exclamation.yml new file mode 100644 index 000000000000..eea5fd24bd5f --- /dev/null +++ b/.github/styles/Google/Exclamation.yml @@ -0,0 +1,9 @@ +extends: existence +message: "Don't use exclamation points in text." +link: 'https://developers.google.com/style/exclamation-points' +nonword: true +level: error +action: + name: remove +tokens: + - '\w+!(?:\s|$)' diff --git a/.github/styles/Google/FirstPerson.yml b/.github/styles/Google/FirstPerson.yml new file mode 100644 index 000000000000..0b7b8828ca5f --- /dev/null +++ b/.github/styles/Google/FirstPerson.yml @@ -0,0 +1,13 @@ +extends: existence +message: "Avoid first-person pronouns such as '%s'." +link: 'https://developers.google.com/style/pronouns#personal-pronouns' +ignorecase: true +level: warning +nonword: true +tokens: + - (?:^|\s)I\s + - (?:^|\s)I,\s + - \bI'm\b + - \bme\b + - \bmy\b + - \bmine\b diff --git a/.github/styles/Google/Gender.yml b/.github/styles/Google/Gender.yml new file mode 100644 index 000000000000..c8486181d697 --- /dev/null +++ b/.github/styles/Google/Gender.yml @@ -0,0 +1,9 @@ +extends: existence +message: "Don't use '%s' as a gender-neutral pronoun." 
+link: 'https://developers.google.com/style/pronouns#gender-neutral-pronouns' +level: error +ignorecase: true +tokens: + - he/she + - s/he + - \(s\)he diff --git a/.github/styles/Google/GenderBias.yml b/.github/styles/Google/GenderBias.yml new file mode 100644 index 000000000000..9e7019086302 --- /dev/null +++ b/.github/styles/Google/GenderBias.yml @@ -0,0 +1,47 @@ +extends: substitution +message: "Consider using '%s' instead of '%s'." +link: 'https://developers.google.com/style/inclusive-documentation' +ignorecase: true +level: error +action: + name: replace +swap: + (?:alumna|alumnus): graduate + (?:alumnae|alumni): graduates + air(?:m[ae]n|wom[ae]n): pilot(s) + anchor(?:m[ae]n|wom[ae]n): anchor(s) + authoress: author + camera(?:m[ae]n|wom[ae]n): camera operator(s) + chair(?:m[ae]n|wom[ae]n): chair(s) + congress(?:m[ae]n|wom[ae]n): member(s) of congress + door(?:m[ae]|wom[ae]n): concierge(s) + draft(?:m[ae]n|wom[ae]n): drafter(s) + fire(?:m[ae]n|wom[ae]n): firefighter(s) + fisher(?:m[ae]n|wom[ae]n): fisher(s) + fresh(?:m[ae]n|wom[ae]n): first-year student(s) + garbage(?:m[ae]n|wom[ae]n): waste collector(s) + lady lawyer: lawyer + ladylike: courteous + landlord: building manager + mail(?:m[ae]n|wom[ae]n): mail carriers + man and wife: husband and wife + man enough: strong enough + mankind: human kind + manmade: manufactured + manpower: personnel + men and girls: men and women + middle(?:m[ae]n|wom[ae]n): intermediary + news(?:m[ae]n|wom[ae]n): journalist(s) + ombuds(?:man|woman): ombuds + oneupmanship: upstaging + poetess: poet + police(?:m[ae]n|wom[ae]n): police officer(s) + repair(?:m[ae]n|wom[ae]n): technician(s) + sales(?:m[ae]n|wom[ae]n): salesperson or sales people + service(?:m[ae]n|wom[ae]n): soldier(s) + steward(?:ess)?: flight attendant + tribes(?:m[ae]n|wom[ae]n): tribe member(s) + waitress: waiter + woman doctor: doctor + woman scientist[s]?: scientist(s) + work(?:m[ae]n|wom[ae]n): worker(s) diff --git a/.github/styles/Google/HeadingPunctuation.yml 
b/.github/styles/Google/HeadingPunctuation.yml new file mode 100644 index 000000000000..b538be5b42a2 --- /dev/null +++ b/.github/styles/Google/HeadingPunctuation.yml @@ -0,0 +1,13 @@ +extends: existence +message: "Don't put a period at the end of a heading." +link: 'https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings' +nonword: true +level: warning +scope: heading +action: + name: edit + params: + - remove + - '.' +tokens: + - '[a-z0-9][.]\s*$' diff --git a/.github/styles/Google/Headings.yml b/.github/styles/Google/Headings.yml new file mode 100644 index 000000000000..a53301338a47 --- /dev/null +++ b/.github/styles/Google/Headings.yml @@ -0,0 +1,29 @@ +extends: capitalization +message: "'%s' should use sentence-style capitalization." +link: 'https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings' +level: warning +scope: heading +match: $sentence +indicators: + - ':' +exceptions: + - Azure + - CLI + - Code + - Cosmos + - Docker + - Emmet + - gRPC + - I + - Kubernetes + - Linux + - macOS + - Marketplace + - MongoDB + - REPL + - Studio + - TypeScript + - URLs + - Visual + - VS + - Windows diff --git a/.github/styles/Google/Latin.yml b/.github/styles/Google/Latin.yml new file mode 100644 index 000000000000..ca03b9154b16 --- /dev/null +++ b/.github/styles/Google/Latin.yml @@ -0,0 +1,11 @@ +extends: substitution +message: "Use '%s' instead of '%s'." +link: 'https://developers.google.com/style/abbreviations' +ignorecase: true +level: error +nonword: true +action: + name: replace +swap: + '\b(?:eg|e\.g\.)(?=[\s,;])': for example + '\b(?:ie|i\.e\.)(?=[\s,;])': that is diff --git a/.github/styles/Google/LyHyphens.yml b/.github/styles/Google/LyHyphens.yml new file mode 100644 index 000000000000..ac8f557a4af7 --- /dev/null +++ b/.github/styles/Google/LyHyphens.yml @@ -0,0 +1,14 @@ +extends: existence +message: "'%s' doesn't need a hyphen." 
+link: 'https://developers.google.com/style/hyphens' +level: error +ignorecase: false +nonword: true +action: + name: edit + params: + - replace + - '-' + - ' ' +tokens: + - '\s[^\s-]+ly-' diff --git a/.github/styles/Google/OptionalPlurals.yml b/.github/styles/Google/OptionalPlurals.yml new file mode 100644 index 000000000000..f858ea6fee16 --- /dev/null +++ b/.github/styles/Google/OptionalPlurals.yml @@ -0,0 +1,12 @@ +extends: existence +message: "Don't use plurals in parentheses such as in '%s'." +link: 'https://developers.google.com/style/plurals-parentheses' +level: error +nonword: true +action: + name: edit + params: + - remove + - '(s)' +tokens: + - '\b\w+\(s\)' diff --git a/.github/styles/Google/Ordinal.yml b/.github/styles/Google/Ordinal.yml new file mode 100644 index 000000000000..d1ac7d27e80d --- /dev/null +++ b/.github/styles/Google/Ordinal.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Spell out all ordinal numbers ('%s') in text." +link: 'https://developers.google.com/style/numbers' +level: error +nonword: true +tokens: + - \d+(?:st|nd|rd|th) diff --git a/.github/styles/Google/OxfordComma.yml b/.github/styles/Google/OxfordComma.yml new file mode 100644 index 000000000000..b9ba21ebb25a --- /dev/null +++ b/.github/styles/Google/OxfordComma.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Use the Oxford comma in '%s'." +link: 'https://developers.google.com/style/commas' +scope: sentence +level: warning +tokens: + - '(?:[^,]+,){1,}\s\w+\s(?:and|or)' diff --git a/.github/styles/Google/Parens.yml b/.github/styles/Google/Parens.yml new file mode 100644 index 000000000000..3b8711d0c88f --- /dev/null +++ b/.github/styles/Google/Parens.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Use parentheses judiciously." 
+link: 'https://developers.google.com/style/parentheses' +nonword: true +level: suggestion +tokens: + - '\(.+\)' diff --git a/.github/styles/Google/Passive.yml b/.github/styles/Google/Passive.yml new file mode 100644 index 000000000000..3265890e5202 --- /dev/null +++ b/.github/styles/Google/Passive.yml @@ -0,0 +1,184 @@ +extends: existence +link: 'https://developers.google.com/style/voice' +message: "In general, use active voice instead of passive voice ('%s')." +ignorecase: true +level: suggestion +raw: + - \b(am|are|were|being|is|been|was|be)\b\s* +tokens: + - '[\w]+ed' + - awoken + - beat + - become + - been + - begun + - bent + - beset + - bet + - bid + - bidden + - bitten + - bled + - blown + - born + - bought + - bound + - bred + - broadcast + - broken + - brought + - built + - burnt + - burst + - cast + - caught + - chosen + - clung + - come + - cost + - crept + - cut + - dealt + - dived + - done + - drawn + - dreamt + - driven + - drunk + - dug + - eaten + - fallen + - fed + - felt + - fit + - fled + - flown + - flung + - forbidden + - foregone + - forgiven + - forgotten + - forsaken + - fought + - found + - frozen + - given + - gone + - gotten + - ground + - grown + - heard + - held + - hidden + - hit + - hung + - hurt + - kept + - knelt + - knit + - known + - laid + - lain + - leapt + - learnt + - led + - left + - lent + - let + - lighted + - lost + - made + - meant + - met + - misspelt + - mistaken + - mown + - overcome + - overdone + - overtaken + - overthrown + - paid + - pled + - proven + - put + - quit + - read + - rid + - ridden + - risen + - run + - rung + - said + - sat + - sawn + - seen + - sent + - set + - sewn + - shaken + - shaven + - shed + - shod + - shone + - shorn + - shot + - shown + - shrunk + - shut + - slain + - slept + - slid + - slit + - slung + - smitten + - sold + - sought + - sown + - sped + - spent + - spilt + - spit + - split + - spoken + - spread + - sprung + - spun + - stolen + - stood + - stridden + - striven + - struck + - 
strung + - stuck + - stung + - stunk + - sung + - sunk + - swept + - swollen + - sworn + - swum + - swung + - taken + - taught + - thought + - thrived + - thrown + - thrust + - told + - torn + - trodden + - understood + - upheld + - upset + - wed + - wept + - withheld + - withstood + - woken + - won + - worn + - wound + - woven + - written + - wrung diff --git a/.github/styles/Google/Periods.yml b/.github/styles/Google/Periods.yml new file mode 100644 index 000000000000..d24a6a6c0335 --- /dev/null +++ b/.github/styles/Google/Periods.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Don't use periods with acronyms or initialisms such as '%s'." +link: 'https://developers.google.com/style/abbreviations' +level: error +nonword: true +tokens: + - '\b(?:[A-Z]\.){3,}' diff --git a/.github/styles/Google/Quotes.yml b/.github/styles/Google/Quotes.yml new file mode 100644 index 000000000000..3cb6f1abd182 --- /dev/null +++ b/.github/styles/Google/Quotes.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Commas and periods go inside quotation marks." +link: 'https://developers.google.com/style/quotation-marks' +level: error +nonword: true +tokens: + - '"[^"]+"[.,?]' diff --git a/.github/styles/Google/Ranges.yml b/.github/styles/Google/Ranges.yml new file mode 100644 index 000000000000..3ec045e777d9 --- /dev/null +++ b/.github/styles/Google/Ranges.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Don't add words such as 'from' or 'between' to describe a range of numbers." +link: 'https://developers.google.com/style/hyphens' +nonword: true +level: warning +tokens: + - '(?:from|between)\s\d+\s?-\s?\d+' diff --git a/.github/styles/Google/Semicolons.yml b/.github/styles/Google/Semicolons.yml new file mode 100644 index 000000000000..bb8b85b420ee --- /dev/null +++ b/.github/styles/Google/Semicolons.yml @@ -0,0 +1,8 @@ +extends: existence +message: "Use semicolons judiciously." 
+link: 'https://developers.google.com/style/semicolons' +nonword: true +scope: sentence +level: suggestion +tokens: + - ';' diff --git a/.github/styles/Google/Slang.yml b/.github/styles/Google/Slang.yml new file mode 100644 index 000000000000..63f4c248a841 --- /dev/null +++ b/.github/styles/Google/Slang.yml @@ -0,0 +1,11 @@ +extends: existence +message: "Don't use internet slang abbreviations such as '%s'." +link: 'https://developers.google.com/style/abbreviations' +ignorecase: true +level: error +tokens: + - 'tl;dr' + - ymmv + - rtfm + - imo + - fwiw diff --git a/.github/styles/Google/Spacing.yml b/.github/styles/Google/Spacing.yml new file mode 100644 index 000000000000..66e45a6b72a9 --- /dev/null +++ b/.github/styles/Google/Spacing.yml @@ -0,0 +1,10 @@ +extends: existence +message: "'%s' should have one space." +link: 'https://developers.google.com/style/sentence-spacing' +level: error +nonword: true +action: + name: remove +tokens: + - '[a-z][.?!] {2,}[A-Z]' + - '[a-z][.?!][A-Z]' diff --git a/.github/styles/Google/Spelling.yml b/.github/styles/Google/Spelling.yml new file mode 100644 index 000000000000..527ac07d318c --- /dev/null +++ b/.github/styles/Google/Spelling.yml @@ -0,0 +1,10 @@ +extends: existence +message: "In general, use American spelling instead of '%s'." +link: 'https://developers.google.com/style/spelling' +ignorecase: true +level: warning +tokens: + - '(?:\w+)nised?' + - 'colour' + - 'labour' + - 'centre' diff --git a/.github/styles/Google/Units.yml b/.github/styles/Google/Units.yml new file mode 100644 index 000000000000..53522ab2dea3 --- /dev/null +++ b/.github/styles/Google/Units.yml @@ -0,0 +1,8 @@ +extends: existence +message: "Put a nonbreaking space between the number and the unit in '%s'." 
+link: "https://developers.google.com/style/units-of-measure" +nonword: true +level: error +tokens: + - \b\d+(?:B|kB|MB|GB|TB) + - \b\d+(?:ns|ms|s|min|h|d) diff --git a/.github/styles/Google/We.yml b/.github/styles/Google/We.yml new file mode 100644 index 000000000000..c7ac7d36221d --- /dev/null +++ b/.github/styles/Google/We.yml @@ -0,0 +1,11 @@ +extends: existence +message: "Try to avoid using first-person plural like '%s'." +link: 'https://developers.google.com/style/pronouns#personal-pronouns' +level: warning +ignorecase: true +tokens: + - we + - we'(?:ve|re) + - ours? + - us + - let's diff --git a/.github/styles/Google/Will.yml b/.github/styles/Google/Will.yml new file mode 100644 index 000000000000..128a918362b8 --- /dev/null +++ b/.github/styles/Google/Will.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Avoid using '%s'." +link: 'https://developers.google.com/style/tense' +ignorecase: true +level: warning +tokens: + - will diff --git a/.github/styles/Google/WordList.yml b/.github/styles/Google/WordList.yml new file mode 100644 index 000000000000..0d675f2372a2 --- /dev/null +++ b/.github/styles/Google/WordList.yml @@ -0,0 +1,81 @@ +extends: substitution +message: "Use '%s' instead of '%s'." 
+link: "https://developers.google.com/style/word-list" +level: warning +ignorecase: false +action: + name: replace +swap: + "(?:API Console|dev|developer) key": API key + "(?:cell ?phone|smart ?phone)": phone|mobile phone + "(?:dev|developer|APIs) console": API console + "(?:e-mail|Email|E-mail)": email + "(?:file ?path|path ?name)": path + "(?:kill|terminate|abort)": stop|exit|cancel|end + "(?:OAuth ?2|Oauth)": OAuth 2.0 + "(?:ok|Okay)": OK|okay + "(?:WiFi|wifi)": Wi-Fi + '[\.]+apk': APK + '3\-D': 3D + 'Google (?:I\-O|IO)': Google I/O + "tap (?:&|and) hold": touch & hold + "un(?:check|select)": clear + above: preceding + account name: username + action bar: app bar + admin: administrator + Ajax: AJAX + a\.k\.a|aka: or|also known as + Android device: Android-powered device + android: Android + API explorer: APIs Explorer + application: app + approx\.: approximately + authN: authentication + authZ: authorization + autoupdate: automatically update + cellular data: mobile data + cellular network: mobile network + chapter: documents|pages|sections + check box: checkbox + check: select + CLI: command-line tool + click on: click|click in + Cloud: Google Cloud Platform|GCP + Container Engine: Kubernetes Engine + content type: media type + curated roles: predefined roles + data are: data is + Developers Console: Google API Console|API Console + disabled?: turn off|off + ephemeral IP address: ephemeral external IP address + fewer data: less data + file name: filename + firewalls: firewall rules + functionality: capability|feature + Google account: Google Account + Google accounts: Google Accounts + Googling: search with Google + grayed-out: unavailable + HTTPs: HTTPS + in order to: to + ingest: import|load + k8s: Kubernetes + long press: touch & hold + network IP address: internal IP address + omnibox: address bar + open-source: open source + overview screen: recents screen + regex: regular expression + SHA1: SHA-1|HAS-SHA1 + sign into: sign in to + sign-?on: single sign-on 
+ static IP address: static external IP address + stylesheet: style sheet + synch: sync + tablename: table name + tablet: device + touch: tap + url: URL + vs\.: versus + World Wide Web: web diff --git a/.github/styles/Google/meta.json b/.github/styles/Google/meta.json new file mode 100644 index 000000000000..a5da2a8480ef --- /dev/null +++ b/.github/styles/Google/meta.json @@ -0,0 +1,4 @@ +{ + "feed": "https://github.com/errata-ai/Google/releases.atom", + "vale_version": ">=1.0.0" +} diff --git a/.github/styles/Google/vocab.txt b/.github/styles/Google/vocab.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt new file mode 100644 index 000000000000..3c9afb35b90a --- /dev/null +++ b/.github/styles/Vocab/ray/accept.txt @@ -0,0 +1,21 @@ +[aA]utoscaling +[cC]onfig +Anyscale +APIs +Autoscaler +Conda +Databricks +Datadog +Dockerfile +Github +Grafana +hostname +Metaflow +MLflow +plaintext +RLlib +VSCode +cron +MLOps +Readonly +Webterminal diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/.vale.ini b/.vale.ini new file mode 100644 index 000000000000..f06807039099 --- /dev/null +++ b/.vale.ini @@ -0,0 +1,10 @@ +StylesPath = .github/styles + +Vocab = ray + +MinAlertLevel = error + +Packages = Google + +[*.{md,rst,py}] +BasedOnStyles = Vale, Google From 05862e28d67c85cea6ffa63df5296e924bf1b409 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:00:35 -0700 Subject: [PATCH 06/21] Update stuff Signed-off-by: Balaji Veeramani --- .github/styles/Google/Spacing.yml | 7 +++++-- .github/styles/Vocab/ray/accept.txt | 23 +++-------------------- .github/workflows/vale.yml | 16 ++++++++++++++++ .vale.ini | 4 ++-- doc/source/data/getting-started.rst | 10 +++++----- 5 files changed, 31 insertions(+), 29 deletions(-) diff --git a/.github/styles/Google/Spacing.yml 
b/.github/styles/Google/Spacing.yml index 66e45a6b72a9..e0d26537eb99 100644 --- a/.github/styles/Google/Spacing.yml +++ b/.github/styles/Google/Spacing.yml @@ -5,6 +5,9 @@ level: error nonword: true action: name: remove +# FIXME: This rule complains about Sphinx directives like +# ":class:`Dataset `". tokens: - - '[a-z][.?!] {2,}[A-Z]' - - '[a-z][.?!][A-Z]' + - '^(?!a)b$' # This regex is impossible to match. + # - '[a-z][.?!] {2,}[A-Z]' + # - '[a-z][.?!][A-Z]' diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt index 3c9afb35b90a..ff1e24b785c6 100644 --- a/.github/styles/Vocab/ray/accept.txt +++ b/.github/styles/Vocab/ray/accept.txt @@ -1,21 +1,4 @@ -[aA]utoscaling -[cC]onfig -Anyscale +Data's APIs -Autoscaler -Conda -Databricks -Datadog -Dockerfile -Github -Grafana -hostname -Metaflow -MLflow -plaintext -RLlib -VSCode -cron -MLOps -Readonly -Webterminal +UDFs +Ray Data diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index e69de29bb2d1..2dcbd3b57915 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -0,0 +1,16 @@ +name: reviewdog +on: [pull_request] + +jobs: + vale: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: errata-ai/vale-action@reviewdog + with: + files: docs/source/data/ + fail_on_error: true + # Report level for reviewdog [info,warning,error]. + level: info + # Error on introduced changes only. + filter_mode: added diff --git a/.vale.ini b/.vale.ini index f06807039099..993e968b4972 100644 --- a/.vale.ini +++ b/.vale.ini @@ -1,8 +1,8 @@ StylesPath = .github/styles -Vocab = ray +Vocab = Ray -MinAlertLevel = error +MinAlertLevel = suggestion Packages = Google diff --git a/doc/source/data/getting-started.rst b/doc/source/data/getting-started.rst index f1db53b5e81d..0486b22ae153 100644 --- a/doc/source/data/getting-started.rst +++ b/doc/source/data/getting-started.rst @@ -1,6 +1,6 @@ .. 
_data_getting_started: -Getting Started +Getting started =============== Ray Data's main abstraction is a :class:`Dataset `, which @@ -21,7 +21,7 @@ To learn more about installing Ray and its libraries, read :ref:`Installing Ray `. Create a dataset -------------------- +---------------- Create datasets from on-disk files, Python objects, and cloud storage services like S3. Ray Data can read from any `filesystem supported by Arrow @@ -43,7 +43,7 @@ To learn more about creating datasets, read :ref:`Loading data `. Transform the dataset ------------------------- +--------------------- Apply :ref:`user-defined functions ` (UDFs) to transform datasets. Ray executes transformations in parallel for performance. @@ -82,7 +82,7 @@ To learn more about transforming datasets, read :ref:`Transforming data `. Consume the dataset ----------------------- +------------------- Pass datasets to Ray tasks or actors, and access records with methods like :meth:`~ray.data.Dataset.take_batch` and :meth:`~ray.data.Dataset.iter_batches`. @@ -138,7 +138,7 @@ To learn more about consuming datasets, read :ref:`Consuming data `. Save the dataset -------------------- +---------------- Call methods like :meth:`~ray.data.Dataset.write_parquet` to save dataset contents to local or remote filesystems. From 508ff9fce51702f3a963cafe7bd97aeef4c9540a Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:05:20 -0700 Subject: [PATCH 07/21] Update vale.yml Signed-off-by: Balaji Veeramani --- .github/workflows/vale.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index 2dcbd3b57915..37999b702303 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -10,7 +10,5 @@ jobs: with: files: docs/source/data/ fail_on_error: true - # Report level for reviewdog [info,warning,error]. - level: info # Error on introduced changes only. 
filter_mode: added From 4ceee8aa63f08ca04503bf66ffd72c45f88dffa1 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:09:57 -0700 Subject: [PATCH 08/21] Update stuff Signed-off-by: Balaji Veeramani --- .github/styles/Vocab/Ray/reject.txt | 0 .github/workflows/vale.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 .github/styles/Vocab/Ray/reject.txt diff --git a/.github/styles/Vocab/Ray/reject.txt b/.github/styles/Vocab/Ray/reject.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index 37999b702303..ed0ca20750c8 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -8,7 +8,7 @@ jobs: - uses: actions/checkout@v3 - uses: errata-ai/vale-action@reviewdog with: - files: docs/source/data/ + files: doc/source/data/ fail_on_error: true # Error on introduced changes only. filter_mode: added From f602b8ea5cf6f6cea2df845873bd826e8c5e2cf6 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:14:08 -0700 Subject: [PATCH 09/21] Test stuff Signed-off-by: Balaji Veeramani --- .github/workflows/vale.yml | 1 + doc/source/data/transforming-data.rst | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index ed0ca20750c8..794b6663e897 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -12,3 +12,4 @@ jobs: fail_on_error: true # Error on introduced changes only. filter_mode: added + level: warning diff --git a/doc/source/data/transforming-data.rst b/doc/source/data/transforming-data.rst index 0b9305dabbef..72d7a255ea9e 100644 --- a/doc/source/data/transforming-data.rst +++ b/doc/source/data/transforming-data.rst @@ -18,7 +18,7 @@ There are two main types of supported transforms: * One-to-one: each input block will contribute to only one output block, such as :meth:`ds.map_batches() `. 
-* All-to-all: input blocks can contribute to multiple output blocks, +* All-to-all: input blocks will contribute to multiple output blocks, such as :meth:`ds.random_shuffle() `. .. list-table:: Common Ray Data transforms. @@ -225,7 +225,7 @@ globally shuffle the order of data records. >>> dataset.random_shuffle().take_batch() # doctest: +SKIP {'id': array([7, 0, 9, 3, 5, 1, 4, 2, 8, 6])} -For reduced overhead during training ingest, use local shuffles. Read +For reduced overhead during training ingest, use local shuffles. Read :ref:`Shuffling Data ` in the AIR user guide to learn more. .. _data-groupbys: From 2a13086de8aaf8a7745e40cbb57d24455c78b641 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 30 May 2023 18:17:25 -0700 Subject: [PATCH 10/21] Update stuff Signed-off-by: Balaji Veeramani --- .github/workflows/vale.yml | 2 +- .vale.ini | 2 +- doc/source/data/transforming-data.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index 794b6663e897..2f96a92fd797 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -8,7 +8,7 @@ jobs: - uses: actions/checkout@v3 - uses: errata-ai/vale-action@reviewdog with: - files: doc/source/data/ + files: doc/source/data/getting-started.rst fail_on_error: true # Error on introduced changes only. 
filter_mode: added diff --git a/.vale.ini b/.vale.ini index 993e968b4972..1189591d35d4 100644 --- a/.vale.ini +++ b/.vale.ini @@ -6,5 +6,5 @@ MinAlertLevel = suggestion Packages = Google -[*.{md,rst,py}] +[*.{md,rst}] BasedOnStyles = Vale, Google diff --git a/doc/source/data/transforming-data.rst b/doc/source/data/transforming-data.rst index 72d7a255ea9e..dcf7742dd1fd 100644 --- a/doc/source/data/transforming-data.rst +++ b/doc/source/data/transforming-data.rst @@ -18,7 +18,7 @@ There are two main types of supported transforms: * One-to-one: each input block will contribute to only one output block, such as :meth:`ds.map_batches() `. -* All-to-all: input blocks will contribute to multiple output blocks, +* All-to-all: input blocks can contribute to multiple output blocks, such as :meth:`ds.random_shuffle() `. .. list-table:: Common Ray Data transforms. From 9a4740d89b3f13f82558ad4a5d3ba1e9bc3f07ee Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 13 Jun 2023 13:10:31 -0700 Subject: [PATCH 11/21] Update stuff Signed-off-by: Balaji Veeramani --- .github/styles/Vocab/ray/accept.txt | 6 ++++++ .github/workflows/vale.yml | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt index ff1e24b785c6..088291ccf700 100644 --- a/.github/styles/Vocab/ray/accept.txt +++ b/.github/styles/Vocab/ray/accept.txt @@ -2,3 +2,9 @@ Data's APIs UDFs Ray Data +:(?:class|meth):([`~\w\s]+?)<([\w\.]+?)> +API[s] +UDF[s] +Data's +CPUs +app diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml index 2f96a92fd797..5f3f0454d464 100644 --- a/.github/workflows/vale.yml +++ b/.github/workflows/vale.yml @@ -10,6 +10,4 @@ jobs: with: files: doc/source/data/getting-started.rst fail_on_error: true - # Error on introduced changes only. 
- filter_mode: added level: warning From a91cd78ac165b94ef70133334354842b3e212c82 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 11 Jul 2023 12:27:32 -0700 Subject: [PATCH 12/21] Update files Signed-off-by: Balaji Veeramani --- .github/styles/Google/Colons.yml | 2 +- .github/styles/Google/Headings.yml | 2 +- .github/styles/Vocab/Ray/reject.txt | 1 + .github/styles/Vocab/ray/accept.txt | 27 +++++++-- doc/source/data/batch_inference.rst | 16 +++--- doc/source/data/data-internals.rst | 47 ++++++++-------- doc/source/data/inspecting-data.rst | 5 +- doc/source/data/key-concepts.rst | 2 +- doc/source/data/loading-data.rst | 8 +-- doc/source/data/performance-tips.rst | 25 ++++----- doc/source/data/user-guide.rst | 4 +- doc/source/data/working-with-pytorch.rst | 56 +++++++++---------- doc/source/data/working-with-tensors.rst | 7 +-- doc/source/data/working-with-text.rst | 2 +- .../data/tests/test_dynamic_block_split.py | 5 +- 15 files changed, 112 insertions(+), 97 deletions(-) diff --git a/.github/styles/Google/Colons.yml b/.github/styles/Google/Colons.yml index 99363fbd46d7..dc6ba867b3c3 100644 --- a/.github/styles/Google/Colons.yml +++ b/.github/styles/Google/Colons.yml @@ -2,7 +2,7 @@ extends: existence message: "'%s' should be in lowercase." link: 'https://developers.google.com/style/colons' nonword: true -level: warning +level: suggestion scope: sentence tokens: - ':\s[A-Z]' diff --git a/.github/styles/Google/Headings.yml b/.github/styles/Google/Headings.yml index a53301338a47..168eb6c050bd 100644 --- a/.github/styles/Google/Headings.yml +++ b/.github/styles/Google/Headings.yml @@ -1,7 +1,7 @@ extends: capitalization message: "'%s' should use sentence-style capitalization." 
link: 'https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings' -level: warning +level: suggestion scope: heading match: $sentence indicators: diff --git a/.github/styles/Vocab/Ray/reject.txt b/.github/styles/Vocab/Ray/reject.txt index e69de29bb2d1..12c6d5d5eac2 100644 --- a/.github/styles/Vocab/Ray/reject.txt +++ b/.github/styles/Vocab/Ray/reject.txt @@ -0,0 +1 @@ +torch diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt index 088291ccf700..bfc654ae0956 100644 --- a/.github/styles/Vocab/ray/accept.txt +++ b/.github/styles/Vocab/ray/accept.txt @@ -1,10 +1,27 @@ Data's APIs -UDFs Ray Data -:(?:class|meth):([`~\w\s]+?)<([\w\.]+?)> API[s] UDF[s] -Data's -CPUs -app +CPU[s] +GPU[s] +performant +config +ingest +application +touch +ndarray[s] +dataset's +URI[s] +codec +interoperates +Spotify's +preprocess +Predibase +pushdown +dicts +[gG]roupby +parallelization +prefetching +indexable +dtype diff --git a/doc/source/data/batch_inference.rst b/doc/source/data/batch_inference.rst index c26eba992203..a5330685c83f 100644 --- a/doc/source/data/batch_inference.rst +++ b/doc/source/data/batch_inference.rst @@ -31,7 +31,7 @@ Using Ray Data for offline inference involves four basic steps: - **Step 3:** Transform your dataset using the pre-trained model by calling :meth:`ds.map_batches() `. For more details, see :ref:`Transforming Data `. - **Step 4:** Get the final predictions by either iterating through the output or saving the results. For more details, see the :ref:`Iterating over data ` and :ref:`Saving data ` user guides. -For more in-depth examples for your use case, see :ref:`our batch inference examples`. +For more in-depth examples for your use case, see :ref:`the batch inference examples`. For how to configure batch inference, see :ref:`the configuration guide`. .. 
tabs:: @@ -184,7 +184,7 @@ More examples ------------- - :doc:`Image Classification Batch Inference with PyTorch ResNet18 ` - :doc:`Object Detection Batch Inference with PyTorch FasterRCNN_ResNet50 ` -- :doc:`Image Classification Batch Inference with Huggingface Vision Transformer ` +- :doc:`Image Classification Batch Inference with Hugging Face Vision Transformer ` .. _batch_inference_configuration: @@ -199,8 +199,8 @@ Using GPUs for inference To use GPUs for inference, make the following changes to your code: 1. Update the class implementation to move the model and data to and from GPU. -2. Specify `num_gpus=1` in the :meth:`ds.map_batches() ` call to indicate that each actor should use 1 GPU. -3. Specify a `batch_size` for inference. For more details on how to configure the batch size, see `batch_inference_batch_size`_. +2. Specify ``num_gpus=1`` in the :meth:`ds.map_batches() ` call to indicate that each actor should use 1 GPU. +3. Specify a ``batch_size`` for inference. For more details on how to configure the batch size, see :ref:`Configuring Batch Size `. The remaining is the same as the :ref:`Quickstart `. @@ -342,7 +342,7 @@ Configuring Batch Size Configure the size of the input batch that is passed to ``__call__`` by setting the ``batch_size`` argument for :meth:`ds.map_batches() ` -Increasing batch size results in faster execution because inference is a vectorized operation. For GPU inference, increasing batch size increases GPU utilization. Set the batch size to as large possible without running out of memory. If you encounter OOMs, decreasing ``batch_size`` may help. +Increasing batch size results in faster execution because inference is a vectorized operation. For GPU inference, increasing batch size increases GPU utilization. Set the batch size to as large as possible without running out of memory. If you encounter out-of-memory errors, decreasing ``batch_size`` may help. ..
testcode:: @@ -361,7 +361,7 @@ Increasing batch size results in faster execution because inference is a vectori .. caution:: The default ``batch_size`` of ``4096`` may be too large for datasets with large rows - (e.g., tables with many columns or a collection of large images). + (for example, tables with many columns or a collection of large images). Handling GPU out-of-memory failures ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -383,9 +383,9 @@ Handling CPU out-of-memory failures ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you run out of CPU RAM, you likely that you have too many model replicas that are running concurrently on the same node. For example, if a model -uses 5GB of RAM when created / run, and a machine has 16GB of RAM total, then no more +uses 5 GB of RAM when created / run, and a machine has 16 GB of RAM total, then no more than three of these models can be run at the same time. The default resource assignments -of one CPU per task/actor will likely lead to `OutOfMemoryError` from Ray in this situation. +of one CPU per task/actor might lead to `OutOfMemoryError` from Ray in this situation. Suppose your cluster has 4 nodes, each with 16 CPUs. To limit to at most 3 of these actors per node, you can override the CPU or memory: diff --git a/doc/source/data/data-internals.rst b/doc/source/data/data-internals.rst index 8c96c74d1908..833bb19624df 100644 --- a/doc/source/data/data-internals.rst +++ b/doc/source/data/data-internals.rst @@ -40,7 +40,7 @@ task reads one or more files and produces an output block: .. https://docs.google.com/drawings/d/15B4TB8b5xN15Q9S8-s0MjW6iIvo_PrH7JtV1fL123pU/edit -To handle transient errors from remote datasources, Ray Data retries application-level +To handle transient errors from remote data sources, Ray Data retries application-level exceptions. For more information on loading data, see :ref:`Loading data `. 
@@ -87,9 +87,9 @@ Ray Data uses Ray Core for execution, and is subject to the same scheduling cons Ray Data and placement groups ----------------------------- -By default, Ray Data configures its tasks and actors to use the cluster-default scheduling strategy ("DEFAULT"). You can inspect this configuration variable here: +By default, Ray Data configures its tasks and actors to use the cluster-default scheduling strategy (``"DEFAULT"``). You can inspect this configuration variable here: :class:`ray.data.DataContext.get_current().scheduling_strategy `. This scheduling strategy schedules these Tasks and Actors outside any present -placement group. To force Ray Data to schedule tasks within the current placement group (i.e., to use current placement group resources specifically for Ray Data), set ``ray.data.DataContext.get_current().scheduling_strategy = None``. +placement group. To use current placement group resources specifically for Ray Data, set ``ray.data.DataContext.get_current().scheduling_strategy = None``. Consider this override only for advanced use cases to improve performance predictability. The general recommendation is to let Ray Data run outside placement groups. @@ -98,9 +98,9 @@ Consider this override only for advanced use cases to improve performance predic Ray Data and Tune ----------------- -When using Ray Data in conjunction with :ref:`Ray Tune `, it is important to ensure there are enough free CPUs for Ray Data to run on. By default, Tune will try to fully utilize cluster CPUs. This can prevent Ray Data from scheduling tasks, reducing performance or causing workloads to hang. +When using Ray Data in conjunction with :ref:`Ray Tune `, it's important to ensure there are enough free CPUs for Ray Data to run on. By default, Tune tries to fully utilize cluster CPUs. This can prevent Ray Data from scheduling tasks, reducing performance or causing workloads to hang. 
-To ensure CPU resources are always available for Ray Data execution, limit the number of concurrent Tune trials. This can be done using the ``max_concurrent_trials`` Tune option. +To ensure CPU resources are always available for Ray Data execution, limit the number of concurrent Tune trials with the ``max_concurrent_trials`` Tune option. .. literalinclude:: ./doc_code/key_concepts.py :language: python @@ -114,11 +114,11 @@ Execution Ray Data execution by default is: -- **Lazy**: This means that transformations on Dataset are not executed until a - consumption operation (e.g. :meth:`ds.iter_batches() `) - or :meth:`Dataset.materialize() ` is called. This creates - opportunities for optimizing the execution plan (e.g. :ref:`stage fusion `). -- **Streaming**: This means that Dataset transformations will be executed in a +- **Lazy**: This means that transformations on Dataset aren't executed until you call a + consumption operation like :meth:`ds.iter_batches() ` + or :meth:`Dataset.materialize() `. This creates + opportunities for optimizing the execution plan like :ref:`stage fusion `. +- **Streaming**: This means that Dataset transformations are executed in a streaming way, incrementally on the base data, instead of on all of the data at once, and overlapping the execution of operations. This can be used for streaming data loading into ML training to overlap the data preprocessing and model training, @@ -139,11 +139,10 @@ writing (:meth:`ds.write_parquet() `), or manual :meth:`ds.materialize() `. There are a few exceptions to this rule, where transformations such as :meth:`ds.union() ` and -:meth:`ds.limit() ` trigger execution; we plan to make these -operations lazy in the future. +:meth:`ds.limit() ` trigger execution. Check the API docs for Ray Data methods to see if they -trigger execution. Those that do trigger execution will have a ``Note`` indicating as +trigger execution. Those that do trigger execution have a ``Note`` indicating as much. .. 
_streaming_execution: @@ -152,7 +151,7 @@ Streaming Execution ------------------- The following code is a hello world example which invokes the execution with -:meth:`ds.iter_batches() ` consumption. We will also enable verbose progress reporting, which shows per-operator progress in addition to overall progress. +:meth:`ds.iter_batches() ` consumption. The example also enables verbose progress reporting, which shows per-operator progress in addition to overall progress. .. code-block:: @@ -177,19 +176,19 @@ The following code is a hello world example which invokes the execution with ): pass -This launches a simple 4-stage pipeline. We use different compute args for each stage, which forces them to be run as separate operators instead of getting fused together. You should see a log message indicating streaming execution is being used: +This launches a simple 4-stage pipeline. The example uses different compute arguments for each stage, which forces them to be run as separate operators instead of getting fused together. You should see a log message indicating streaming execution is being used: .. code-block:: 2023-03-30 16:40:10,076 INFO streaming_executor.py:83 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadRange] -> TaskPoolMapOperator[MapBatches(sleep)] -> ActorPoolMapOperator[MapBatches(sleep)] -> TaskPoolMapOperator[MapBatches(sleep)] -The next few lines will show execution progress. Here is how to interpret the output: +The next few lines show execution progress. Here is how to interpret the output: .. code-block:: Running: 7.0/16.0 CPU, 0.0/0.0 GPU, 76.91 MiB/2.25 GiB object_store_memory 65%|██▊ | 130/200 [00:08<00:02, 22.52it/s] -This line tells you how many resources are currently being used by the streaming executor out of the limits, as well as the number of completed output blocks. The streaming executor will attempt to keep resource usage under the printed limits by throttling task executions.
+This line tells you how many resources are currently being used by the streaming executor out of the limits, as well as the number of completed output blocks. The streaming executor attempts to keep resource usage under the printed limits by throttling task executions. .. code-block:: @@ -202,7 +201,7 @@ These lines are only shown when verbose progress reporting is enabled. The `acti .. tip:: - Avoid returning large outputs from the final operation of a pipeline you are iterating over, since the consumer process will be a serial bottleneck. + Avoid returning large outputs from the final operation of a pipeline you are iterating over, since the consumer process is a serial bottleneck. Fault tolerance --------------- @@ -221,19 +220,19 @@ system failure occurs, Ray Data recreates blocks by re-executing tasks. Stage Fusion Optimization ------------------------- -In order to reduce memory usage and task overheads, Ray Data will automatically fuse together +In order to reduce memory usage and task overheads, Ray Data automatically fuses together lazy operations that are compatible: * Same compute pattern: embarrassingly parallel map vs. all-to-all shuffle * Same compute strategy: Ray tasks vs Ray actors -* Same resource specification, e.g. ``num_cpus`` or ``num_gpus`` requests +* Same resource specification, for example, ``num_cpus`` or ``num_gpus`` requests -Read stages and subsequent map-like transformations will usually be fused together. +Read stages and subsequent map-like transformations are usually fused together. All-to-all transformations such as :meth:`ds.random_shuffle() ` can be fused with earlier map-like stages, but not later stages. -You can tell if stage fusion is enabled by checking the :ref:`Dataset stats ` and looking for fused stages (e.g., ``read->map_batches``). +You can tell if stage fusion is enabled by checking the :ref:`Dataset stats ` and looking for fused stages (for example, ``read->map_batches``). .. 
code-block:: @@ -252,7 +251,7 @@ Execution Memory During execution, a task can read multiple input blocks, and write multiple output blocks. Input and output blocks consume both worker heap memory and shared memory via Ray's object store. -Ray Data attempts to bound its heap memory usage to `num_execution_slots * max_block_size`. The number of execution slots is by default equal to the number of CPUs, unless custom resources are specified. The maximum block size is set by the configuration parameter `ray.data.DataContext.target_max_block_size` and is set to 512MiB by default. When a task's output is larger than this value, the worker will automatically split the output into multiple smaller blocks to avoid running out of heap memory. +Ray Data attempts to bound its heap memory usage to ``num_execution_slots * max_block_size``. The number of execution slots is by default equal to the number of CPUs, unless custom resources are specified. The maximum block size is set by the configuration parameter `ray.data.DataContext.target_max_block_size` and is set to 512MiB by default. When a task's output is larger than this value, the worker automatically splits the output into multiple smaller blocks to avoid running out of heap memory. Large block size can lead to potential out-of-memory situations. To avoid these issues, make sure no single item in your Ray Data is too large, and always call :meth:`ds.map_batches() ` with batch size small enough such that the output batch can comfortably fit into memory. @@ -262,5 +261,5 @@ Object Store Memory Ray Data uses the Ray object store to store data blocks, which means it inherits the memory management features of the Ray object store. This section discusses the relevant features: * Object Spilling: Since Ray Data uses the Ray object store to store data blocks, any blocks that can't fit into object store memory are automatically spilled to disk. 
The objects are automatically reloaded when needed by downstream compute tasks: -* Locality Scheduling: Ray will preferentially schedule compute tasks on nodes that already have a local copy of the object, reducing the need to transfer objects between nodes in the cluster. +* Locality Scheduling: Ray preferentially schedules compute tasks on nodes that already have a local copy of the object, reducing the need to transfer objects between nodes in the cluster. * Reference Counting: Dataset blocks are kept alive by object store reference counting as long as there is any Dataset that references them. To free memory, delete any Python references to the Dataset object. diff --git a/doc/source/data/inspecting-data.rst b/doc/source/data/inspecting-data.rst index 271ad1f28201..105f75e538b3 100644 --- a/doc/source/data/inspecting-data.rst +++ b/doc/source/data/inspecting-data.rst @@ -18,7 +18,7 @@ This guide shows you how to: Describing datasets =================== -:class:`Datasets ` are tabular. To view a Dataset's column names and +:class:`Datasets ` are tabular. To view a dataset's column names and types, call :meth:`Dataset.schema() `. .. testcode:: @@ -149,7 +149,7 @@ For more information on working with batches, see Inspecting execution statistics =============================== -Ray Data calculates statistics during execution like the wall clock time and memory usage for the different stages. +Ray Data calculates statistics during execution like the wall clock time and memory usage for the different stages. To view stats about your :class:`Datasets `, call :meth:`Dataset.stats() ` on an executed dataset. The stats are also persisted under `/tmp/ray/session_*/logs/ray-data.log`. 
@@ -195,4 +195,3 @@ To view stats about your :class:`Datasets `, call :meth:`Datas * In ray.get(): 2.16ms min, 2.16ms max, 2.16ms avg, 2.16ms total * In batch creation: 897.67us min, 897.67us max, 897.67us avg, 897.67us total * In batch formatting: 836.87us min, 836.87us max, 836.87us avg, 836.87us total - diff --git a/doc/source/data/key-concepts.rst b/doc/source/data/key-concepts.rst index bc9cc118e9a6..7aa66784e91d 100644 --- a/doc/source/data/key-concepts.rst +++ b/doc/source/data/key-concepts.rst @@ -3,7 +3,7 @@ Key Concepts ============ -Learn about :class:`Dataset ` and the functionality it provides. +Learn about :class:`Dataset ` and the capabilities it provides. This guide provides a lightweight introduction to: diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index fe854c99b5cd..e41c89e98b29 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -309,7 +309,7 @@ Handling compressed files ~~~~~~~~~~~~~~~~~~~~~~~~~ To read a compressed file, specify ``compression`` in ``arrow_open_stream_args``. -You can use any `Codec supported by Arrow `__. +You can use any `codec supported by Arrow `__. .. testcode:: @@ -640,7 +640,7 @@ Ray Data interoperates with HuggingFace and TensorFlow datasets. Reading databases ================= -Ray Data reads from databases like MySQL, Postgres, and MongoDB. +Ray Data reads from databases like MySQL, PostgreSQL, and MongoDB. .. _reading_sql: @@ -944,8 +944,8 @@ For an example, see :ref:`Implementing a Custom Datasource ` Performance considerations ========================== -The dataset ``parallelism`` determines the number of blocks the base data will be split -into for parallel reads. Ray Data will decide internally how many read tasks to run +The dataset ``parallelism`` determines the number of blocks the base data is split +into for parallel reads. 
Ray Data decides internally how many read tasks to run concurrently to best utilize the cluster, ranging from ``1...parallelism`` tasks. In other words, the higher the parallelism, the smaller the data blocks in the Dataset and hence the more opportunity for parallel execution. diff --git a/doc/source/data/performance-tips.rst b/doc/source/data/performance-tips.rst index 2c0ce87c3a58..ae7b513ca6a1 100644 --- a/doc/source/data/performance-tips.rst +++ b/doc/source/data/performance-tips.rst @@ -36,13 +36,13 @@ Tuning read resources ~~~~~~~~~~~~~~~~~~~~~ By default, Ray requests 1 CPU per read task, which means one read tasks per CPU can execute concurrently. -For datasources that can benefit from higher degress of IO parallelism, you can specify a lower ``num_cpus`` value for the read function with the ``ray_remote_args`` parameter. +For data sources that benefit from more IO parallelism, you can specify a lower ``num_cpus`` value for the read function with the ``ray_remote_args`` parameter. For example, use ``ray.data.read_parquet(path, ray_remote_args={"num_cpus": 0.25})`` to allow up to four read tasks per CPU. Parquet column pruning ~~~~~~~~~~~~~~~~~~~~~~ -Current Dataset will read all Parquet columns into memory. +Current Dataset reads all Parquet columns into memory. If you only need a subset of the columns, make sure to specify the list of columns explicitly when calling :meth:`ray.data.read_parquet() ` to avoid loading unnecessary data (projection pushdown). @@ -55,8 +55,8 @@ Parquet row pruning ~~~~~~~~~~~~~~~~~~~ Similarly, you can pass in a filter to :meth:`ray.data.read_parquet() ` (filter pushdown) -which will be applied at the file scan so only rows that match the filter predicate -will be returned. +which is applied at the file scan so only rows that match the filter predicate +are returned. 
For example, use ``ray.data.read_parquet("example://iris.parquet", filter=pyarrow.dataset.field("sepal.length") > 5.0)`` (where ``pyarrow`` has to be imported) to read rows with sepal.length greater than 5.0. @@ -65,8 +65,8 @@ This can be used in conjunction with column pruning when appropriate to get the Optimizing shuffles ------------------- -When should I use global per-epoch shuffling? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +When should you use global per-epoch shuffling? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use global per-epoch shuffling only if your model is sensitive to the randomness of the training data. Based on a @@ -87,8 +87,7 @@ particular model under different shuffling policies: * windowed (pseudo-global) shuffling, and * fully global shuffling. -From the perspective of keeping preprocessing time in check, as long as your data -loading and shuffling throughput is higher than your training throughput, your GPU should +As long as your data loading and shuffling throughput is higher than your training throughput, your GPU should be saturated. If you have shuffle-sensitive models, push the shuffle quality higher until this threshold is hit. @@ -103,10 +102,10 @@ These operations include :meth:`Dataset.random_shuffle `_ that Ray runs for :meth:`Dataset.random_shuffle ` and :meth:`Dataset.sort `. -To get an idea of the performance you can expect, here are some run time results for :meth:`Dataset.random_shuffle ` on 1-10TB of data on 20 machines (m5.4xlarge instances on AWS EC2, each with 16 vCPUs, 64GB RAM). +To get an idea of the performance you can expect, here are some run time results for :meth:`Dataset.random_shuffle ` on 1-10 TB of data on 20 machines (m5.4xlarge instances on AWS EC2, each with 16 vCPUs, 64 GB RAM). .. 
image:: https://docs.google.com/spreadsheets/d/e/2PACX-1vQvBWpdxHsW0-loasJsBpdarAixb7rjoo-lTgikghfCeKPQtjQDDo2fY51Yc1B6k_S4bnYEoChmFrH2/pubchart?oid=598567373&format=image :align: center @@ -149,7 +148,7 @@ By default, the CPU and GPU limits are set to the cluster size, and the object s You may want to customize these limits in the following scenarios: - If running multiple concurrent jobs on the cluster, setting lower limits can avoid resource contention between the jobs. - If you want to fine-tune the memory limit to maximize performance. -For data loading into training jobs, you may want to set the object store memory to a low value (e.g., 2GB) to limit resource usage. +For data loading into training jobs, you may want to set the object store memory to a low value (for example, 2 GB) to limit resource usage. You can configure execution options with the global DataContext. The options are applied for future jobs launched in the process: @@ -168,7 +167,7 @@ Locality with output (ML ingest use case) ctx.execution_options.locality_with_output = True -Setting this parameter to True tells Ray Data to prefer placing operator tasks onto the consumer node in the cluster, rather than spreading them evenly across the cluster. This setting can be useful if you know you are consuming the output data directly on the consumer node (i.e., for ML training ingest). However, other use cases may incur a performance penalty with this setting. +Setting this parameter to True tells Ray Data to prefer placing operator tasks onto the consumer node in the cluster, rather than spreading them evenly across the cluster. This setting can be useful if you know you are consuming the output data directly on the consumer node (that is, for ML training ingest). However, other use cases may incur a performance penalty with this setting. Reproducibility --------------- @@ -181,7 +180,7 @@ Deterministic execution # By default, this is set to False.
ctx.execution_options.preserve_order = True -To enable deterministic execution, set the above to True. This setting may decrease performance, but ensures block ordering is preserved through execution. This flag defaults to False. +To enable deterministic execution, set the preceding to True. This setting may decrease performance, but ensures block ordering is preserved through execution. This flag defaults to False. Monitoring your application --------------------------- diff --git a/doc/source/data/user-guide.rst b/doc/source/data/user-guide.rst index 83c5cd459dac..86f8cb49b8fb 100644 --- a/doc/source/data/user-guide.rst +++ b/doc/source/data/user-guide.rst @@ -4,9 +4,9 @@ User Guides =========== -If you’re new to Ray Data, we recommend starting with the +If you’re new to Ray Data, start with the :ref:`Ray Data Key Concepts `. -This user guide will help you navigate the Ray Data project and +This user guide helps you navigate the Ray Data project and show you how achieve several tasks. .. toctree:: diff --git a/doc/source/data/working-with-pytorch.rst b/doc/source/data/working-with-pytorch.rst index 1f41e3afa4b7..d3012eba7230 100644 --- a/doc/source/data/working-with-pytorch.rst +++ b/doc/source/data/working-with-pytorch.rst @@ -7,19 +7,19 @@ Ray Data integrates with the PyTorch ecosystem. This guide describes how to: -* :ref:`Iterate over your dataset as torch tensors for model training ` -* :ref:`Write transformations that deal with torch tensors ` -* :ref:`Perform batch inference with torch models ` -* :ref:`Save Datasets containing torch tensors ` +* :ref:`Iterate over your dataset as Torch tensors for model training ` +* :ref:`Write transformations that deal with Torch tensors ` +* :ref:`Perform batch inference with Torch models ` +* :ref:`Save Datasets containing Torch tensors ` * :ref:`Migrate from PyTorch Datasets to Ray Data ` .. 
_iterating_pytorch: -Iterating over torch tensors for training +Iterating over Torch tensors for training ----------------------------------------- -To iterate over batches of data in torch format, call :meth:`Dataset.iter_torch_batches() `. Each batch is represented as `Dict[str, torch.Tensor]`, with one tensor per column in the dataset. +To iterate over batches of data in Torch format, call :meth:`Dataset.iter_torch_batches() `. Each batch is represented as `Dict[str, torch.Tensor]`, with one tensor per column in the dataset. -This is useful for training torch models with batches from your dataset. For configuration details such as providing a `collate_fn` for customizing the conversion, see `the API reference `. +This is useful for training Torch models with batches from your dataset. For configuration details such as providing a ``collate_fn`` for customizing the conversion, see `the API reference `. .. testcode:: @@ -40,12 +40,12 @@ This is useful for training torch models with batches from your dataset. For con Integration with Ray Train ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Ray Data integrates with :ref:`Ray Train ` for easy data ingest for data parallel training, with support for PyTorch, PyTorch Lightning, or Huggingface training. +Ray Data integrates with :ref:`Ray Train ` for easy data ingest for data parallel training, with support for PyTorch, PyTorch Lightning, or Hugging Face training. .. testcode:: import torch - from torch import nn + from torch import nn import ray from ray.air import session, ScalingConfig from ray.train.torch import TorchTrainer @@ -85,13 +85,13 @@ For more details, see the :ref:`Ray Train user guide `. .. _transform_pytorch: -Transformations with torch tensors +Transformations with Torch tensors ---------------------------------- -Transformations applied with `map` or `map_batches` can return torch tensors. +Transformations applied with `map` or ``map_batches`` can return Torch tensors. ..
caution:: - Under the hood, Ray Data automatically converts torch tensors to numpy arrays. Subsequent transformations accept numpy arrays as input, not torch tensors. + Under the hood, Ray Data automatically converts Torch tensors to NumPy arrays. Subsequent transformations accept NumPy arrays as input, not Torch tensors. .. tab-set:: @@ -162,7 +162,7 @@ For more information on transforming data, see :ref:`Transforming data `. @@ -338,12 +338,12 @@ Migrating from PyTorch Datasets and DataLoaders If you're currently using PyTorch Datasets and DataLoaders, you can migrate to Ray Data for working with distributed datasets. -PyTorch Datasets are replaced by the :class:`Dataset ` abtraction, and the PyTorch DataLoader is replaced by :meth:`Dataset.iter_torch_batches() `. +PyTorch Datasets are replaced by the :class:`Dataset ` abstraction, and the PyTorch DataLoader is replaced by :meth:`Dataset.iter_torch_batches() `. Built-in PyTorch Datasets ~~~~~~~~~~~~~~~~~~~~~~~~~ -If you are using built-in PyTorch datasets, for example from `torchvision`, these can be converted to a Ray Dataset using the :meth:`from_torch() ` API. +If you are using built-in PyTorch datasets, for example from ``torchvision``, these can be converted to a Ray Dataset using the :meth:`from_torch() ` API. .. caution:: @@ -357,7 +357,7 @@ If you are using built-in PyTorch datasets, for example from `torchvision`, thes mnist = torchvision.datasets.MNIST(root="/tmp/", download=True) ds = ray.data.from_torch(mnist) - # The data for each item of the torch dataset is under the "item" key. + # The data for each item of the Torch dataset is under the "item" key. print(ds.schema()) .. @@ -378,11 +378,11 @@ If you have a custom PyTorch Dataset, you can migrate to Ray Data by converting Any logic for reading data from cloud storage and disk can be replaced by one of the Ray Data ``read_*`` APIs, and any transformation logic can be applied as a :meth:`map ` call on the Dataset. 
-The following example shows a custom PyTorch Dataset, and what the analagous would look like with Ray Data. +The following example shows a custom PyTorch Dataset, and what the analogous would look like with Ray Data. .. note:: - Unlike PyTorch Map-style datasets, Ray Datasets are not indexable. + Unlike PyTorch Map-style datasets, Ray Datasets aren't indexable. .. tab-set:: @@ -477,7 +477,7 @@ PyTorch DataLoader The PyTorch DataLoader can be replaced by calling :meth:`Dataset.iter_torch_batches() ` to iterate over batches of the dataset. -The following table describes how the arguments for PyTorch DataLoader map to Ray Data. Note the the behavior may not necessarily be identical. For exact semantics and usage, :meth:`see the API reference `. +The following table describes how the arguments for PyTorch DataLoader map to Ray Data. Note the behavior may not necessarily be identical. For exact semantics and usage, :meth:`see the API reference `. .. list-table:: :header-rows: 1 @@ -485,20 +485,20 @@ The following table describes how the arguments for PyTorch DataLoader map to Ra * - PyTorch DataLoader arguments - Ray Data API * - ``batch_size`` - - ``batch_size`` arg to :meth:`ds.iter_torch_batches() ` + - ``batch_size`` argument to :meth:`ds.iter_torch_batches() ` * - ``shuffle`` - - ``local_shuffle_buffer_size`` arg to :meth:`ds.iter_torch_batches() ` + - ``local_shuffle_buffer_size`` argument to :meth:`ds.iter_torch_batches() ` * - ``collate_fn`` - - ``collate_fn`` arg to :meth:`ds.iter_torch_batches() ` + - ``collate_fn`` argument to :meth:`ds.iter_torch_batches() ` * - ``sampler`` - Not supported. Can be manually implemented after iterating through the dataset with :meth:`ds.iter_torch_batches() `. * - ``batch_sampler`` - Not supported. Can be manually implemented after iterating through the dataset with :meth:`ds.iter_torch_batches() `. 
* - ``drop_last`` - - ``drop_last`` arg to :meth:`ds.iter_torch_batches() ` + - ``drop_last`` argument to :meth:`ds.iter_torch_batches() ` * - ``num_workers`` - - Use ``prefetch_batches`` arg to :meth:`ds.iter_torch_batches() ` to indicate how many batches to prefetch. The number of prefetching threads will automatically be configured according to ``prefetch_batches``. + - Use ``prefetch_batches`` argument to :meth:`ds.iter_torch_batches() ` to indicate how many batches to prefetch. The number of prefetching threads is automatically configured according to ``prefetch_batches``. * - ``prefetch_factor`` - - Use ``prefetch_batches`` arg to :meth:`ds.iter_torch_batches() ` to indicate how many batches to prefetch. The number of prefetching threads will automatically be configured according to ``prefetch_batches``. + - Use ``prefetch_batches`` argument to :meth:`ds.iter_torch_batches() ` to indicate how many batches to prefetch. The number of prefetching threads is automatically configured according to ``prefetch_batches``. * - ``pin_memory`` - Pass in ``device`` to :meth:`ds.iter_torch_batches() ` to get tensors that have already been moved to the correct device. diff --git a/doc/source/data/working-with-tensors.rst b/doc/source/data/working-with-tensors.rst index 810eaab894ee..28d29d29c74d 100644 --- a/doc/source/data/working-with-tensors.rst +++ b/doc/source/data/working-with-tensors.rst @@ -3,7 +3,7 @@ Working with Tensors ==================== -N-dimensional arrays (i.e., tensors) are ubiquitous in ML workloads. This guide +N-dimensional arrays (that is, tensors) are ubiquitous in ML workloads. This guide describes the limitations and best practices of working with such data. Tensor data representation @@ -98,9 +98,8 @@ Call :meth:`~ray.data.Dataset.map` or :meth:`~ray.data.Dataset.map_batches` to t # Increase the brightness, batch at a time. ds.map_batches(batch_increase_brightness) -In this example, we return ``np.ndarray`` directly as the output.
Ray Data will also treat -returned lists of ``np.ndarray`` and objects implementing ``__array__`` (e.g., ``torch.Tensor``) -as tensor data. +In addition to NumPy ndarrays, Ray Data also treats returned lists of NumPy ndarrays and +objects implementing ``__array__`` (for example, ``torch.Tensor``) as tensor data. For more information on transforming data, read :ref:`Transforming data `. diff --git a/doc/source/data/working-with-text.rst b/doc/source/data/working-with-text.rst index 33276c792248..da639c45a173 100644 --- a/doc/source/data/working-with-text.rst +++ b/doc/source/data/working-with-text.rst @@ -15,7 +15,7 @@ This guide shows you how to: Reading text files ------------------ -Ray Data can read lines of text and JSONL. Alternatiely, you can read raw binary +Ray Data can read lines of text and JSONL. Alternatively, you can read raw binary files and manually decode data. .. tab-set:: diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index 2b25c74c37ae..2deb573fef2a 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -1,10 +1,11 @@ import time -import numpy as np import pandas as pd import pyarrow as pa import pytest +import numpy as np + import ray from ray.data._internal.lazy_block_list import LazyBlockList from ray.data.block import BlockMetadata @@ -14,7 +15,7 @@ from ray.tests.conftest import * # noqa -# Data source generates random bytes data +# Datasource generates random bytes data class RandomBytesDatasource(Datasource): def create_reader(self, **read_args): return RandomBytesReader( From a895099f7483946a4b9f5bbef8e6560a7a30526f Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 11:53:36 -0700 Subject: [PATCH 13/21] Update styles Signed-off-by: Balaji Veeramani --- .github/styles/Google/Acronyms.yml | 8 +++++- .github/styles/Google/Contractions.yml | 2 +- .github/styles/Google/Headings.yml | 2 +- 
.github/styles/Google/Spacing.yml | 9 +++---- .github/styles/Google/WordList.yml | 2 +- .github/styles/Vocab/Data/accept.txt | 21 +++++++++++++++ .github/styles/Vocab/General/accept.txt | 14 ++++++++++ .github/styles/Vocab/Ray/reject.txt | 1 - .github/styles/Vocab/ray/accept.txt | 27 ------------------- .vale.ini | 7 ++++- doc/requirements-doc.txt | 4 +-- doc/source/data/api/from_other_data_libs.rst | 22 +++++++-------- doc/source/data/api/grouped_data.rst | 3 ++- doc/source/data/api/random_access_dataset.rst | 3 ++- doc/source/data/batch_inference.rst | 4 +-- .../data/examples/custom-datasource.rst | 5 +++- doc/source/data/examples/index.rst | 4 --- doc/source/data/examples/random-access.rst | 12 ++++----- doc/source/data/performance-tips.rst | 6 ++--- doc/source/data/preprocessors.rst | 8 +++--- doc/source/data/saving-data.rst | 2 +- doc/source/data/working-with-images.rst | 4 +-- doc/source/data/working-with-tensors.rst | 2 +- doc/source/data/working-with-text.rst | 2 ++ 24 files changed, 96 insertions(+), 78 deletions(-) create mode 100644 .github/styles/Vocab/Data/accept.txt create mode 100644 .github/styles/Vocab/General/accept.txt delete mode 100644 .github/styles/Vocab/Ray/reject.txt delete mode 100644 .github/styles/Vocab/ray/accept.txt diff --git a/.github/styles/Google/Acronyms.yml b/.github/styles/Google/Acronyms.yml index f41af0189b07..f15489e94c99 100644 --- a/.github/styles/Google/Acronyms.yml +++ b/.github/styles/Google/Acronyms.yml @@ -1,7 +1,7 @@ extends: conditional message: "Spell out '%s', if it's unfamiliar to the audience." link: 'https://developers.google.com/style/abbreviations' -level: suggestion +level: warning ignorecase: false # Ensures that the existence of 'first' implies the existence of 'second'. 
first: '\b([A-Z]{3,5})\b' @@ -10,13 +10,16 @@ second: '(?:\b[A-Z][a-z]+ )+\(([A-Z]{3,5})\)' exceptions: - API - ASP + - AWS - CLI - CPU - CSS - CSV + - CUDA - DEBUG - DOM - DPI + - ETL - FAQ - GCC - GDB @@ -30,16 +33,19 @@ exceptions: - IDE - JAR - JSON + - JSONL - JSX - LESS - LLDB - NET + - NFS - NOTE - NVDA - OSS - PATH - PDF - PHP + - PNG - POST - RAM - REPL diff --git a/.github/styles/Google/Contractions.yml b/.github/styles/Google/Contractions.yml index 4f6fd5d489dc..07a604d4e837 100644 --- a/.github/styles/Google/Contractions.yml +++ b/.github/styles/Google/Contractions.yml @@ -1,7 +1,7 @@ extends: substitution message: "Use '%s' instead of '%s'." link: 'https://developers.google.com/style/contractions' -level: suggestion +level: warning ignorecase: true action: name: replace diff --git a/.github/styles/Google/Headings.yml b/.github/styles/Google/Headings.yml index 168eb6c050bd..a53301338a47 100644 --- a/.github/styles/Google/Headings.yml +++ b/.github/styles/Google/Headings.yml @@ -1,7 +1,7 @@ extends: capitalization message: "'%s' should use sentence-style capitalization." link: 'https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings' -level: suggestion +level: warning scope: heading match: $sentence indicators: diff --git a/.github/styles/Google/Spacing.yml b/.github/styles/Google/Spacing.yml index e0d26537eb99..b430011c2ca5 100644 --- a/.github/styles/Google/Spacing.yml +++ b/.github/styles/Google/Spacing.yml @@ -1,13 +1,10 @@ extends: existence message: "'%s' should have one space." link: 'https://developers.google.com/style/sentence-spacing' -level: error +level: warning nonword: true action: name: remove -# FIXME: This rule complains about Sphinx directives like -# ":class:`Dataset `". tokens: - - '^(?!a)b$' # This regex is impossible to match. - # - '[a-z][.?!] {2,}[A-Z]' - # - '[a-z][.?!][A-Z]' + - '[a-z][.?!] 
{2,}[A-Z]' + - '[a-z][.?!][A-Z]' diff --git a/.github/styles/Google/WordList.yml b/.github/styles/Google/WordList.yml index 0d675f2372a2..1b502744ec33 100644 --- a/.github/styles/Google/WordList.yml +++ b/.github/styles/Google/WordList.yml @@ -75,7 +75,7 @@ swap: synch: sync tablename: table name tablet: device - touch: tap + # touch: tap # We rarely use touch in the sense of "tap" in our docs. url: URL vs\.: versus World Wide Web: web diff --git a/.github/styles/Vocab/Data/accept.txt b/.github/styles/Vocab/Data/accept.txt new file mode 100644 index 000000000000..8270e8b56dc4 --- /dev/null +++ b/.github/styles/Vocab/Data/accept.txt @@ -0,0 +1,21 @@ +[Pp]ushdown +[Ii]ngest +[Gg]roupby +TFRecord(s)? +Dask +Modin +[Dd]atasource +[Pp]refetch +[Pp]refetching +[Ii]ndexable +[Pp]reprocess +[Pp]reprocessor(s)? +Spotify('s)? +Predibase('s)? +UDF(s)? +ndarray(s)? +dtype +[Ll]ookup(s)? +[Mm]ultiget(s)? +[Ss]calers +Data('s)? \ No newline at end of file diff --git a/.github/styles/Vocab/General/accept.txt b/.github/styles/Vocab/General/accept.txt new file mode 100644 index 000000000000..9bc01c2bdc16 --- /dev/null +++ b/.github/styles/Vocab/General/accept.txt @@ -0,0 +1,14 @@ +[Ii]nteroperates +CPU[s] +GPU[s] +# Use 'API' judiciously: https://developers.google.com/style/word-list#api. +API[s] +[Aa]pplication +NumPy +[Pp]erformant +[Cc]odec +URI[s] +[Ii]nterpretability +[Pp]arallelization +[Ss]ubclassing +[Dd]ict(s)? 
\ No newline at end of file diff --git a/.github/styles/Vocab/Ray/reject.txt b/.github/styles/Vocab/Ray/reject.txt deleted file mode 100644 index 12c6d5d5eac2..000000000000 --- a/.github/styles/Vocab/Ray/reject.txt +++ /dev/null @@ -1 +0,0 @@ -torch diff --git a/.github/styles/Vocab/ray/accept.txt b/.github/styles/Vocab/ray/accept.txt deleted file mode 100644 index bfc654ae0956..000000000000 --- a/.github/styles/Vocab/ray/accept.txt +++ /dev/null @@ -1,27 +0,0 @@ -Data's -APIs -Ray Data -API[s] -UDF[s] -CPU[s] -GPU[s] -performant -config -ingest -application -touch -ndarray[s] -dataset's -URI[s] -codec -interoperates -Spotify's -preprocess -Predibase -pushdown -dicts -[gG]roupby -parallelization -prefetching -indexable -dtype diff --git a/.vale.ini b/.vale.ini index 1189591d35d4..119c309ca4c8 100644 --- a/.vale.ini +++ b/.vale.ini @@ -1,10 +1,15 @@ StylesPath = .github/styles -Vocab = Ray +Vocab = General, Data MinAlertLevel = suggestion Packages = Google +[*.rst] +TokenIgnores = (:class:`.*`)|(:.*:`.*`)|(`.*`) + [*.{md,rst}] BasedOnStyles = Vale, Google +Google.Colons = No +Google.Headings = No diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 648984d6ed0f..2003a3a8d7e6 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -18,14 +18,14 @@ mock numpy scikit-image pandas -pickle5 +# pickle5 pillow pyarrow pydantic < 1.10.0 # Note: more recent typing-extensions does not work well with pinned pydantic <1.10.0 typing-extensions < 4.6.0 pyyaml -pytorch-lightning==1.6.5 +pytorch-lightning scikit-optimize redis starlette diff --git a/doc/source/data/api/from_other_data_libs.rst b/doc/source/data/api/from_other_data_libs.rst index 1a62249bb750..30a611f3b1d9 100644 --- a/doc/source/data/api/from_other_data_libs.rst +++ b/doc/source/data/api/from_other_data_libs.rst @@ -5,15 +5,15 @@ API Guide for Users from Other Data Libraries Ray Data is a data loading and preprocessing library for ML. 
It shares certain similarities with other ETL data processing libraries, but also has its own focus. -In this API guide, we will provide API mappings for users who come from those data +This guide provides API mappings for users who come from those data libraries, so you can quickly map what you may already know to Ray Data APIs. .. note:: - This is meant to map APIs that perform comparable but not necessarily identical operations. - Please check the API reference for exact semantics and usage. - - This list may not be exhaustive: Ray Data is not a traditional ETL data processing library, so not all data processing APIs can map to Datasets. - In addition, we try to focus on common APIs or APIs that are less obvious to see a connection. + See the API reference for exact semantics and usage. + - This list may not be exhaustive: Ray Data isn't a traditional ETL data processing library, so not all data processing APIs can map to Datasets. + In addition, this list focuses on common APIs or APIs that are less obvious to see a connection. .. _api-guide-for-pandas-users: @@ -72,19 +72,19 @@ For PyArrow Users * - PyArrow Table API - Ray Data API - * - pa.Table.schema + * - ``pa.Table.schema`` - :meth:`ds.schema() ` - * - pa.Table.num_rows + * - ``pa.Table.num_rows`` - :meth:`ds.count() ` - * - pa.Table.filter() + * - ``pa.Table.filter()`` - :meth:`ds.filter() ` - * - pa.Table.drop() + * - ``pa.Table.drop()`` - :meth:`ds.drop_columns() ` - * - pa.Table.add_column() + * - ``pa.Table.add_column()`` - :meth:`ds.add_column() ` - * - pa.Table.groupby() + * - ``pa.Table.groupby()`` - :meth:`ds.groupby() ` - * - pa.Table.sort_by() + * - ``pa.Table.sort_by()`` - :meth:`ds.sort() ` diff --git a/doc/source/data/api/grouped_data.rst b/doc/source/data/api/grouped_data.rst index fce6a8d9705e..e7abb1f9187e 100644 --- a/doc/source/data/api/grouped_data.rst +++ b/doc/source/data/api/grouped_data.rst @@ -5,7 +5,8 @@ GroupedData API ..
currentmodule:: ray.data -GroupedData objects are returned by groupby call: Dataset.groupby(). +GroupedData objects are returned by groupby call: +:meth:`Dataset.groupby() `. Constructor ----------- diff --git a/doc/source/data/api/random_access_dataset.rst b/doc/source/data/api/random_access_dataset.rst index 6bfbdba1585c..82c3bf1d14da 100644 --- a/doc/source/data/api/random_access_dataset.rst +++ b/doc/source/data/api/random_access_dataset.rst @@ -5,7 +5,8 @@ RandomAccessDataset (experimental) .. currentmodule:: ray.data -RandomAccessDataset objects are returned by call: Dataset.to_random_access_dataset(). +RandomAccessDataset objects are returned by call: +:meth:`Dataset.to_random_access_dataset() `. Constructor ----------- diff --git a/doc/source/data/batch_inference.rst b/doc/source/data/batch_inference.rst index 8b8e5a0236e1..1f54d5c14214 100644 --- a/doc/source/data/batch_inference.rst +++ b/doc/source/data/batch_inference.rst @@ -340,7 +340,7 @@ The remaining is the same as the :ref:`Quickstart `. Configuring Batch Size ~~~~~~~~~~~~~~~~~~~~~~ -Configure the size of the input batch that is passed to ``__call__`` by setting the ``batch_size`` argument for :meth:`ds.map_batches() ` +Configure the size of the input batch that's passed to ``__call__`` by setting the ``batch_size`` argument for :meth:`ds.map_batches() ` Increasing batch size results in faster execution because inference is a vectorized operation. For GPU inference, increasing batch size increases GPU utilization. Set the batch size to as large possible without running out of memory. If you encounter out-of-memory errors, decreasing ``batch_size`` may help. @@ -464,7 +464,7 @@ Models that have been trained with :ref:`Ray Train ` can then be use **Step 3:** Use Ray Data for batch inference. To load in the model from the :class:`Checkpoint ` inside the Python class, use one of the framework-specific Checkpoint classes. -In this case, we use the :class:`XGBoostCheckpoint ` to load the model.
+In this case, use :class:`XGBoostCheckpoint ` to load the model. The rest of the logic looks the same as in the `Quickstart <#quickstart>`_. diff --git a/doc/source/data/examples/custom-datasource.rst b/doc/source/data/examples/custom-datasource.rst index e60b7b423fa8..2ed79fa572c7 100644 --- a/doc/source/data/examples/custom-datasource.rst +++ b/doc/source/data/examples/custom-datasource.rst @@ -1,5 +1,8 @@ .. _custom_datasources: +.. TODO: Re-write this guide with correct editorial style. +.. vale off + ================================ Implementing a Custom Datasource ================================ @@ -38,7 +41,7 @@ By the end of the guide, you will have a ``MongoDatasource`` that you can use to There are a few MongoDB concepts involved here. The `URI `__ points to a MongoDB instance, which hosts `Databases and Collections `__. A collection is analogous to a table in SQL databases. MongoDB also has a `pipeline `__ concept, - which expresses document processing in a series of stages (e.g. match documents with a predicate, sort results, and then select a few fields). + which expresses document processing in a series of stages (for example, match documents with a predicate, sort results, and then select a few fields). The execution results of the pipelines are used to create dataset. A custom datasource is an implementation of :class:`~ray.data.Datasource`. In this diff --git a/doc/source/data/examples/index.rst b/doc/source/data/examples/index.rst index bcaeb5b8ccbb..01bccaff9774 100644 --- a/doc/source/data/examples/index.rst +++ b/doc/source/data/examples/index.rst @@ -26,10 +26,6 @@ Ray Data Examples .. _data-recipes: -Ray Data is a data processing engine that supports multiple data -modalities and types. Here you will find a few end-to-end examples of some basic data -processing with Ray Data on tabular data, text (coming soon), and images. - Computer Vision --------------- .. 
grid:: 1 2 2 3 diff --git a/doc/source/data/examples/random-access.rst b/doc/source/data/examples/random-access.rst index 2b985bb85a86..dd33fe114ea8 100644 --- a/doc/source/data/examples/random-access.rst +++ b/doc/source/data/examples/random-access.rst @@ -36,7 +36,7 @@ Similar to Dataset, a RandomAccessDataset can be passed to and used from any Ray Architecture ------------ -RandomAccessDataset spreads its workers evenly across the cluster. Each worker fetches and pins in shared memory all blocks of the sorted source data found on its node. In addition, it is ensured that each block is assigned to at least one worker. A central index of block to key-range assignments is computed, which is used to serve lookups. +RandomAccessDataset spreads its workers evenly across the cluster. Each worker fetches and pins in shared memory all blocks of the sorted source data found on its node. In addition, it's ensured that each block is assigned to at least one worker. A central index of block to key-range assignments is computed, which is used to serve lookups. Lookups occur as follows: @@ -44,16 +44,16 @@ Lookups occur as follows: * Second, an actor that has the block pinned is selected (this is done randomly). * A method call is sent to the actor, which then performs binary search to locate the record for the key. -This means that each random lookup costs ~1 network RTT as well as a small amount of computation on both the client and server side. +This means that each random lookup costs ~1 network round-trip time, as well as a small amount of computation on both the client and server side. Performance ----------- -Since actor communication goes directly from worker to worker in Ray, the throughput of a RandomAccessDataset scales linearly with the number of workers available. 
As a rough measure, a single worker can provide ~2k individual gets/s and serve ~10k records/s for multigets, and this scales linearly as you increase the number of clients and workers for a single RandomAccessDataset. Large workloads may require hundreds of workers for sufficient throughput. You will also generally want more workers than clients, since the client does less computation than worker actors do. +Since actor communication goes directly from worker to worker in Ray, the throughput of a RandomAccessDataset scales linearly with the number of workers available. As a rough measure, a single worker can provide ~2k individual gets/s and serve ~10k records/s for multigets, and this scales linearly as you increase the number of clients and workers for a single RandomAccessDataset. Large workloads may require hundreds of workers for sufficient throughput. You also generally want more workers than clients, since the client does less computation than worker actors do. -To debug performance problems, use ``random_access_ds.stats()``. This will return a string showing the actor-side measured latencies as well as the distribution of data blocks and queries across the actors. Load imbalances can cause bottlenecks as certain actors receive more requests than others. Ensure that load is evenly distributed across the key space to avoid this. +To debug performance problems, use ``random_access_ds.stats()``. This returns a string showing the actor-side measured latencies as well as the distribution of data blocks and queries across the actors. Load imbalances can cause bottlenecks as certain actors receive more requests than others. Ensure that load is evenly distributed across the key space to avoid this. -It is important to note that the client (Ray worker process) can also be a bottleneck. 
To scale past the throughput of a single client, use multiple tasks to gather the data, for example: +It's important to note that the client (Ray worker process) can also be a bottleneck. To scale past the throughput of a single client, use multiple tasks to gather the data, for example: .. testcode:: @@ -81,4 +81,4 @@ It is important to note that the client (Ray worker process) can also be a bottl Fault Tolerance --------------- -Currently, RandomAccessDataset is not fault-tolerant. Losing any of the worker actors invalidates the dataset, and it must be re-created from the source data. +Currently, RandomAccessDataset isn't fault-tolerant. Losing any of the worker actors invalidates the dataset, and it must be re-created from the source data. diff --git a/doc/source/data/performance-tips.rst b/doc/source/data/performance-tips.rst index bf97e7fdae3d..18a25fb115f1 100644 --- a/doc/source/data/performance-tips.rst +++ b/doc/source/data/performance-tips.rst @@ -26,10 +26,10 @@ Tuning read parallelism By default, Ray Data automatically selects the read ``parallelism`` according to the following procedure: 1. The number of available CPUs is estimated. If in a placement group, the number of CPUs in the cluster is scaled by the size of the placement group compared to the cluster size. If not in a placement group, this is the number of CPUs in the cluster. -2. The parallelism is set to the estimated number of CPUs multiplied by 2. If the parallelism is less than 8, it is set to 8. +2. The parallelism is set to the estimated number of CPUs multiplied by 2. If the parallelism is less than 8, it's set to 8. 3. The in-memory data size is estimated. If the parallelism would create in-memory blocks that are larger on average than the target block size (512MiB), the parallelism is increased until the blocks are < 512MiB in size. -Occasionally, it is advantageous to manually tune the parallelism to optimize the application. 
This can be done when loading data via the ``parallelism`` parameter. +Occasionally, it's advantageous to manually tune the parallelism to optimize the application. This can be done when loading data via the ``parallelism`` parameter. For example, use ``ray.data.read_parquet(path, parallelism=1000)`` to force up to 1000 read tasks to be created. Tuning read resources @@ -101,7 +101,7 @@ Enabling push-based shuffle Some Dataset operations require a *shuffle* operation, meaning that data is shuffled from all of the input partitions to all of the output partitions. These operations include :meth:`Dataset.random_shuffle `, :meth:`Dataset.sort ` and :meth:`Dataset.groupby `. -Shuffle can be challenging to scale to large data sizes and clusters, especially when the total dataset size cannot fit into memory. +Shuffle can be challenging to scale to large data sizes and clusters, especially when the total dataset size can't fit into memory. Datasets provides an alternative shuffle implementation known as push-based shuffle for improving large-scale performance. Try this out if your dataset has more than 1000 blocks or is larger than 1 TB in size. diff --git a/doc/source/data/preprocessors.rst b/doc/source/data/preprocessors.rst index 8b775916543a..e53fcc937061 100644 --- a/doc/source/data/preprocessors.rst +++ b/doc/source/data/preprocessors.rst @@ -6,7 +6,7 @@ Using Preprocessors Data preprocessing is a common technique for transforming raw data into features for a machine learning model. In general, you may want to apply the same preprocessing logic to your offline training data and online inference data. -This page covers *preprocessors*, which are a higher level API on top of existing Ray Data operations like `map_batches`, +This page covers *preprocessors*, which are a higher level API on top of existing Ray Data operations like ``map_batches``, targeted towards tabular and structured data use cases. 
If you are working with tabular data, you should use Ray Data preprocessors. However, the recommended way to perform preprocessing @@ -23,7 +23,7 @@ Overview The :class:`Preprocessor ` class has four public methods: -#. :meth:`fit() `: Compute state information about a :class:`Dataset ` (e.g., the mean or standard deviation of a column) +#. :meth:`fit() `: Compute state information about a :class:`Dataset ` (for example, the mean or standard deviation of a column) and save it to the :class:`Preprocessor `. This information is used to perform :meth:`transform() `, and the method is typically called on a training dataset. #. :meth:`transform() `: Apply a transformation to a :class:`Dataset `. @@ -32,7 +32,7 @@ The :class:`Preprocessor ` class has four pu #. :meth:`transform_batch() `: Apply a transformation to a single :class:`batch ` of data. This method is typically called on online or offline inference data. #. :meth:`fit_transform() `: Syntactic sugar for calling both :meth:`fit() ` and :meth:`transform() ` on a :class:`Dataset `. -To show these methods in action, let's walk through a basic example. First, we'll set up two simple Ray ``Dataset``\s. +To show these methods in action, walk through a basic example. First, you'll set up two simple Ray ``Dataset``\s. .. literalinclude:: doc_code/preprocessors.py :language: python @@ -266,7 +266,7 @@ If you want to implement a custom preprocessor that needs to be fit, extend the If your preprocessor doesn't need to be fit, construct a :class:`~ray.data.preprocessors.BatchMapper` to apply a UDF in parallel over your data. :class:`~ray.data.preprocessors.BatchMapper` can drop, add, or modify columns, and you -can specify a `batch_size` to control the size of the data batches provided to your UDF. +can specify a ``batch_size`` to control the size of the data batches provided to your UDF. .. 
literalinclude:: doc_code/preprocessors.py :language: python diff --git a/doc/source/data/saving-data.rst b/doc/source/data/saving-data.rst index a2ce55261606..5fd2529bd7e8 100644 --- a/doc/source/data/saving-data.rst +++ b/doc/source/data/saving-data.rst @@ -7,7 +7,7 @@ Saving Data Ray Data lets you save data in files or other Python objects. This guide shows you how to: - + * `Write data to files <#writing-data-to-files>`_ * `Convert Datasets to other Python libraries <#converting-datasets-to-other-python-libraries>`_ diff --git a/doc/source/data/working-with-images.rst b/doc/source/data/working-with-images.rst index 1d458c47bc16..57b1521c0ab9 100644 --- a/doc/source/data/working-with-images.rst +++ b/doc/source/data/working-with-images.rst @@ -242,7 +242,7 @@ Finally, call :meth:`Dataset.map_batches() `. {'class': 296} For more information on performing inference, see -:ref:`End-to-end: Offline Batch Inference ` +:ref:`End-to-end: offline batch inference ` and :ref:`Transforming batches with actors `. .. _saving_images: @@ -250,7 +250,7 @@ and :ref:`Transforming batches with actors `. Saving images ------------- -Save images with formats like PNG, Parquet, and Numpy. To view all supported formats, +Save images with formats like PNG, Parquet, and NumPy. To view all supported formats, see the :ref:`Input/Output reference `. .. tab-set:: diff --git a/doc/source/data/working-with-tensors.rst b/doc/source/data/working-with-tensors.rst index 1bf606869c14..392765f4e56a 100644 --- a/doc/source/data/working-with-tensors.rst +++ b/doc/source/data/working-with-tensors.rst @@ -3,7 +3,7 @@ Working with Tensors ==================== -N-dimensional arrays (that is, tensors) are ubiquitous in ML workloads. This guide +N-dimensional arrays (in other words, tensors) are ubiquitous in ML workloads. This guide describes the limitations and best practices of working with such data. 
Tensor data representation diff --git a/doc/source/data/working-with-text.rst b/doc/source/data/working-with-text.rst index da639c45a173..6540f6748289 100644 --- a/doc/source/data/working-with-text.rst +++ b/doc/source/data/working-with-text.rst @@ -15,6 +15,8 @@ This guide shows you how to: Reading text files ------------------ +Food: Spam ham eggs. + Ray Data can read lines of text and JSONL. Alternatively, you can read raw binary files and manually decode data. From bbf60f83616c7c632074d29b9cfba187802b80a0 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 11:56:17 -0700 Subject: [PATCH 14/21] Update stuff Signed-off-by: Balaji Veeramani --- doc/source/data/batch_inference.rst | 2 +- doc/source/data/data-internals.rst | 2 +- doc/source/data/loading-data.rst | 2 +- doc/source/data/performance-tips.rst | 2 +- python/ray/data/tests/test_dynamic_block_split.py | 3 +-- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/data/batch_inference.rst b/doc/source/data/batch_inference.rst index 1f54d5c14214..4effd4f338a5 100644 --- a/doc/source/data/batch_inference.rst +++ b/doc/source/data/batch_inference.rst @@ -26,7 +26,7 @@ To start, install Ray Data: Using Ray Data for offline inference involves four basic steps: -- **Step 1:** Load your data into a Ray Dataset. Ray Data supports many different data sources and formats. For more details, see :ref:`Loading Data `. +- **Step 1:** Load your data into a Ray Dataset. Ray Data supports many different datasources and formats. For more details, see :ref:`Loading Data `. - **Step 2:** Define a Python class to load the pre-trained model. - **Step 3:** Transform your dataset using the pre-trained model by calling :meth:`ds.map_batches() `. For more details, see :ref:`Transforming Data `. - **Step 4:** Get the final predictions by either iterating through the output or saving the results. For more details, see the :ref:`Iterating over data ` and :ref:`Saving data ` user guides. 
diff --git a/doc/source/data/data-internals.rst b/doc/source/data/data-internals.rst index 833bb19624df..033612279e9e 100644 --- a/doc/source/data/data-internals.rst +++ b/doc/source/data/data-internals.rst @@ -40,7 +40,7 @@ task reads one or more files and produces an output block: .. https://docs.google.com/drawings/d/15B4TB8b5xN15Q9S8-s0MjW6iIvo_PrH7JtV1fL123pU/edit -To handle transient errors from remote data sources, Ray Data retries application-level +To handle transient errors from remote datasources, Ray Data retries application-level exceptions. For more information on loading data, see :ref:`Loading data `. diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index 1e57a8904d6b..4ce1e75119d1 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -925,7 +925,7 @@ Synthetic datasets can be useful for testing and benchmarking. ------ ---- data numpy.ndarray(shape=(64, 64), dtype=int64) -Loading other data sources +Loading other datasources ========================== If Ray Data can't load your data, subclass diff --git a/doc/source/data/performance-tips.rst b/doc/source/data/performance-tips.rst index 18a25fb115f1..20abfe47668c 100644 --- a/doc/source/data/performance-tips.rst +++ b/doc/source/data/performance-tips.rst @@ -36,7 +36,7 @@ Tuning read resources ~~~~~~~~~~~~~~~~~~~~~ By default, Ray requests 1 CPU per read task, which means one read tasks per CPU can execute concurrently. -For data sources that benefit from more IO parallelism, you can specify a lower ``num_cpus`` value for the read function with the ``ray_remote_args`` parameter. +For datasources that benefit from more IO parallelism, you can specify a lower ``num_cpus`` value for the read function with the ``ray_remote_args`` parameter. For example, use ``ray.data.read_parquet(path, ray_remote_args={"num_cpus": 0.25})`` to allow up to four read tasks per CPU. 
Parquet column pruning diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index 8643db172f84..0e76ab42e2c3 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -1,12 +1,11 @@ import os import time +import numpy as np import pandas as pd import pyarrow as pa import pytest -import numpy as np - import ray from ray.data import Dataset from ray.data._internal.lazy_block_list import LazyBlockList From 13ed8dc48aaa07ff3153f87f0e2d094568232fe4 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 12:00:27 -0700 Subject: [PATCH 15/21] Fix stuff Signed-off-by: Balaji Veeramani --- .github/styles/Vocab/General/reject.txt | 1 + .github/workflows/vale.yml | 13 ------------- doc/requirements-doc.txt | 4 ++-- doc/source/data/api/from_other_data_libs.rst | 2 +- doc/source/data/working-with-images.rst | 2 +- doc/source/data/working-with-pytorch.rst | 2 +- doc/source/data/working-with-text.rst | 2 -- python/ray/data/tests/test_dynamic_block_split.py | 2 +- 8 files changed, 7 insertions(+), 21 deletions(-) create mode 100644 .github/styles/Vocab/General/reject.txt delete mode 100644 .github/workflows/vale.yml diff --git a/.github/styles/Vocab/General/reject.txt b/.github/styles/Vocab/General/reject.txt new file mode 100644 index 000000000000..02cc4c0883fe --- /dev/null +++ b/.github/styles/Vocab/General/reject.txt @@ -0,0 +1 @@ +[Pp]lease \ No newline at end of file diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml deleted file mode 100644 index 5f3f0454d464..000000000000 --- a/.github/workflows/vale.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: reviewdog -on: [pull_request] - -jobs: - vale: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: errata-ai/vale-action@reviewdog - with: - files: doc/source/data/getting-started.rst - fail_on_error: true - level: warning diff --git 
a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 2003a3a8d7e6..648984d6ed0f 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -18,14 +18,14 @@ mock numpy scikit-image pandas -# pickle5 +pickle5 pillow pyarrow pydantic < 1.10.0 # Note: more recent typing-extensions does not work well with pinned pydantic <1.10.0 typing-extensions < 4.6.0 pyyaml -pytorch-lightning +pytorch-lightning==1.6.5 scikit-optimize redis starlette diff --git a/doc/source/data/api/from_other_data_libs.rst b/doc/source/data/api/from_other_data_libs.rst index 30a611f3b1d9..500cb123ebbd 100644 --- a/doc/source/data/api/from_other_data_libs.rst +++ b/doc/source/data/api/from_other_data_libs.rst @@ -11,7 +11,7 @@ libraries, so you can quickly map what you may already know to Ray Data APIs. .. note:: - This is meant to map APIs that perform comparable but not necessarily identical operations. - Please select the API reference for exact semantics and usage. + Select the API reference for exact semantics and usage. - This list may not be exhaustive: Ray Data isn't a traditional ETL data processing library, so not all data processing APIs can map to Datasets. In addition, this list focuses on common APIs or APIs that are less obvious to see a connection. diff --git a/doc/source/data/working-with-images.rst b/doc/source/data/working-with-images.rst index 57b1521c0ab9..a66fd695815a 100644 --- a/doc/source/data/working-with-images.rst +++ b/doc/source/data/working-with-images.rst @@ -242,7 +242,7 @@ Finally, call :meth:`Dataset.map_batches() `. {'class': 296} For more information on performing inference, see -:ref:`End-to-end: offline batch inference ` +:ref:`End-to-end: Offline Batch Inference ` and :ref:`Transforming batches with actors `. .. 
_saving_images: diff --git a/doc/source/data/working-with-pytorch.rst b/doc/source/data/working-with-pytorch.rst index 1ae1515981c8..4f92e01a875e 100644 --- a/doc/source/data/working-with-pytorch.rst +++ b/doc/source/data/working-with-pytorch.rst @@ -45,7 +45,7 @@ Ray Data integrates with :ref:`Ray Train ` for easy data ingest for .. testcode:: import torch - from Torch import nn + from torch import nn import ray from ray import train from ray.train import ScalingConfig diff --git a/doc/source/data/working-with-text.rst b/doc/source/data/working-with-text.rst index 6540f6748289..da639c45a173 100644 --- a/doc/source/data/working-with-text.rst +++ b/doc/source/data/working-with-text.rst @@ -15,8 +15,6 @@ This guide shows you how to: Reading text files ------------------ -Food: Spam ham eggs. - Ray Data can read lines of text and JSONL. Alternatively, you can read raw binary files and manually decode data. diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index 0e76ab42e2c3..0d5de4acbf6e 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -16,7 +16,7 @@ from ray.tests.conftest import * # noqa -# Datasource generates random bytes data +# Data source generates random bytes data class RandomBytesDatasource(Datasource): def create_reader(self, **read_args): return RandomBytesReader( From df1b0e0a30d1463d99727cfd9ca46dc357a8c8b4 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 12:03:18 -0700 Subject: [PATCH 16/21] Sort files Signed-off-by: Balaji Veeramani --- .github/styles/Vocab/Data/accept.txt | 26 ++++++++++++------------- .github/styles/Vocab/General/accept.txt | 14 ++++++------- .github/styles/Vocab/General/reject.txt | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/styles/Vocab/Data/accept.txt b/.github/styles/Vocab/Data/accept.txt index 8270e8b56dc4..e612b3460d46 100644 --- 
a/.github/styles/Vocab/Data/accept.txt +++ b/.github/styles/Vocab/Data/accept.txt @@ -1,21 +1,21 @@ -[Pp]ushdown -[Ii]ngest -[Gg]roupby -TFRecord(s)? Dask +Data('s)? Modin +Predibase('s)? +Spotify('s)? +TFRecord(s)? +UDF(s)? [Dd]atasource +[Gg]roupby +[Ii]ndexable +[Ii]ngest +[Ll]ookup(s)? +[Mm]ultiget(s)? [Pp]refetch [Pp]refetching -[Ii]ndexable [Pp]reprocess [Pp]reprocessor(s)? -Spotify('s)? -Predibase('s)? -UDF(s)? -ndarray(s)? -dtype -[Ll]ookup(s)? -[Mm]ultiget(s)? +[Pp]ushdown [Ss]calers -Data('s)? \ No newline at end of file +dtype +ndarray(s)? diff --git a/.github/styles/Vocab/General/accept.txt b/.github/styles/Vocab/General/accept.txt index 9bc01c2bdc16..208355bd18fb 100644 --- a/.github/styles/Vocab/General/accept.txt +++ b/.github/styles/Vocab/General/accept.txt @@ -1,14 +1,14 @@ -[Ii]nteroperates -CPU[s] -GPU[s] # Use 'API' judiciously: https://developers.google.com/style/word-list#api. API[s] -[Aa]pplication +CPU[s] +GPU[s] NumPy -[Pp]erformant -[Cc]odec URI[s] +[Aa]pplication +[Cc]odec +[Dd]ict(s)? +[Ii]nteroperates [Ii]nterpretability [Pp]arallelization +[Pp]erformant [Ss]ubclassing -[Dd]ict(s)? 
\ No newline at end of file diff --git a/.github/styles/Vocab/General/reject.txt b/.github/styles/Vocab/General/reject.txt index 02cc4c0883fe..c2b33abf6daf 100644 --- a/.github/styles/Vocab/General/reject.txt +++ b/.github/styles/Vocab/General/reject.txt @@ -1 +1 @@ -[Pp]lease \ No newline at end of file +[Pp]lease From 4dcf8f2a241960da2429744165e86542bd18b0c2 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 12:08:19 -0700 Subject: [PATCH 17/21] Add notes Signed-off-by: Balaji Veeramani --- .vale.ini | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.vale.ini b/.vale.ini index 119c309ca4c8..14c01c4dbf5e 100644 --- a/.vale.ini +++ b/.vale.ini @@ -7,9 +7,15 @@ MinAlertLevel = suggestion Packages = Google [*.rst] +# HACK(@bveeramani): I have no clue why we need to include `(:class:`.*`)` in addition +# to `(:.*:`.*`)`, but we get false positives if we don't. `TokenIgnores` is weird. TokenIgnores = (:class:`.*`)|(:.*:`.*`)|(`.*`) [*.{md,rst}] BasedOnStyles = Vale, Google +# We're disabling "Colons" because we disagree with Google's suggestion to lowercase the +# first word after a colon. Google.Colons = No +# TODO(@bveeramani): We're temporarily disabling "Heading". In the future, we'll update +# all headings and enable this rule. Google.Headings = No From 7808303ecebbd5dd1813a11bd0d3c5d856fb5300 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 12:11:08 -0700 Subject: [PATCH 18/21] Remove whitespace Signed-off-by: Balaji Veeramani --- doc/source/data/saving-data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/data/saving-data.rst b/doc/source/data/saving-data.rst index 5fd2529bd7e8..a2ce55261606 100644 --- a/doc/source/data/saving-data.rst +++ b/doc/source/data/saving-data.rst @@ -7,7 +7,7 @@ Saving Data Ray Data lets you save data in files or other Python objects. 
This guide shows you how to: - + * `Write data to files <#writing-data-to-files>`_ * `Convert Datasets to other Python libraries <#converting-datasets-to-other-python-libraries>`_ From 6ac1000a72a74fb425fc1e9913c6a16c9e9f7c9c Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 22:50:07 -0700 Subject: [PATCH 19/21] Update files Address review comments Signed-off-by: Balaji Veeramani --- .github/styles/Google/Acronyms.yml | 2 ++ .github/styles/Google/Will.yml | 1 + .github/styles/Vocab/General/reject.txt | 1 + doc/source/data/preprocessors.rst | 2 +- 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/styles/Google/Acronyms.yml b/.github/styles/Google/Acronyms.yml index f15489e94c99..695bb5864f2c 100644 --- a/.github/styles/Google/Acronyms.yml +++ b/.github/styles/Google/Acronyms.yml @@ -22,6 +22,8 @@ exceptions: - ETL - FAQ - GCC + - GCE + - GCP - GDB - GET - GPU diff --git a/.github/styles/Google/Will.yml b/.github/styles/Google/Will.yml index 128a918362b8..20f6bc55294d 100644 --- a/.github/styles/Google/Will.yml +++ b/.github/styles/Google/Will.yml @@ -5,3 +5,4 @@ ignorecase: true level: warning tokens: - will + - "'ll" diff --git a/.github/styles/Vocab/General/reject.txt b/.github/styles/Vocab/General/reject.txt index c2b33abf6daf..b94f0057f42e 100644 --- a/.github/styles/Vocab/General/reject.txt +++ b/.github/styles/Vocab/General/reject.txt @@ -1 +1,2 @@ [Pp]lease +[Cc]ongratulations diff --git a/doc/source/data/preprocessors.rst b/doc/source/data/preprocessors.rst index e53fcc937061..aca5aa0377ab 100644 --- a/doc/source/data/preprocessors.rst +++ b/doc/source/data/preprocessors.rst @@ -32,7 +32,7 @@ The :class:`Preprocessor ` class has four pu #. :meth:`transform_batch() `: Apply a transformation to a single :class:`batch ` of data. This method is typically called on online or offline inference data. #. :meth:`fit_transform() `: Syntactic sugar for calling both :meth:`fit() ` and :meth:`transform() ` on a :class:`Dataset `. 
-To show these methods in action, walk through a basic example. First, you'll set up two simple Ray ``Dataset``\s. +To show these methods in action, walk through a basic example. First, set up two simple Ray ``Dataset``\s. .. literalinclude:: doc_code/preprocessors.py :language: python From 6e2dea9b6d78d7c843f6bb601a94916290ce8a36 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 21 Aug 2023 23:14:59 -0700 Subject: [PATCH 20/21] Update files Update dashes Signed-off-by: Balaji Veeramani --- .github/styles/Google/EmDash.yml | 2 +- .github/styles/Google/EnDash.yml | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/styles/Google/EmDash.yml b/.github/styles/Google/EmDash.yml index 1befe72aa881..e7231cff5f7a 100644 --- a/.github/styles/Google/EmDash.yml +++ b/.github/styles/Google/EmDash.yml @@ -9,4 +9,4 @@ action: - remove - ' ' tokens: - - '\s[—–]\s' + - '\s-{2,3}\s' diff --git a/.github/styles/Google/EnDash.yml b/.github/styles/Google/EnDash.yml index b314dc4e98ab..0480d79eb9b8 100644 --- a/.github/styles/Google/EnDash.yml +++ b/.github/styles/Google/EnDash.yml @@ -1,5 +1,5 @@ extends: existence -message: "Use an em dash ('—') instead of '–'." +message: "Use an em dash ('---') instead of '--'." link: 'https://developers.google.com/style/dashes' nonword: true level: error @@ -7,7 +7,8 @@ action: name: edit params: - replace - - '-' - - '—' -tokens: - - '–' + - '--' + - '---' +raw: + - '(? Date: Mon, 21 Aug 2023 23:23:46 -0700 Subject: [PATCH 21/21] Update files Remove trailing newine Signed-off-by: Balaji Veeramani --- .github/styles/Google/EnDash.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/styles/Google/EnDash.yml b/.github/styles/Google/EnDash.yml index 0480d79eb9b8..e01331ffcfee 100644 --- a/.github/styles/Google/EnDash.yml +++ b/.github/styles/Google/EnDash.yml @@ -11,4 +11,3 @@ action: - '---' raw: - '(?