Skip to content

Commit 0908ddb

Browse files
shehabgamin and linhr
authored
Value Prefix Whitespace (#6)
Co-authored-by: Heran Lin <[email protected]>
1 parent d1e08c9 commit 0908ddb

File tree

3 files changed

+74
-7
lines changed

3 files changed

+74
-7
lines changed

.github/workflows/rust.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ jobs:
6565
with:
6666
crate: cargo-tarpaulin
6767
version: 0.14.2
68-
use-tool-cache: true
68+
use-tool-cache: false
6969
- name: Test
7070
run: cargo test --all-features
7171

@@ -83,7 +83,7 @@ jobs:
8383
with:
8484
crate: cargo-tarpaulin
8585
version: 0.14.2
86-
use-tool-cache: true
86+
use-tool-cache: false
8787
- name: Coverage
8888
run: cargo tarpaulin -o Lcov --output-dir ./coverage
8989
- name: Coveralls

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
# Patching SQL Parser for LakeSail
2+
3+
1. Use `dev` as the base branch when creating PRs in the fork.
4+
2. Please confirm the base repository when creating PRs. You should manually choose `lakehq/sqlparser-rs` when proposing changes to the fork.
5+
3. For patching, use a squash commit to merge the PR. This ensures that each patch appears as a single commit in the `dev` branch of the fork.
6+
4. For merging from upstream, use a merge commit to merge the PR. This ensures that the upstream history is kept in the `dev` branch of the fork.
7+
5. Please avoid mixing code changes and upstream merge in a single PR.
8+
19
# Extensible SQL Lexer and Parser for Rust
210

311
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

src/tokenizer.rs

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,9 @@ impl<'a> Tokenizer<'a> {
706706
// BigQuery uses b or B for byte string literal
707707
b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
708708
chars.next(); // consume
709-
match chars.peek() {
709+
match peeking_skip_whitespace_take_if(chars, |ch| {
710+
matches!(ch, '\'') || matches!(ch, '\"')
711+
}) {
710712
Some('\'') => {
711713
if self.dialect.supports_triple_quoted_string() {
712714
return self
@@ -745,7 +747,9 @@ impl<'a> Tokenizer<'a> {
745747
// BigQuery uses r or R for raw string literal
746748
b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
747749
chars.next(); // consume
748-
match chars.peek() {
750+
match peeking_skip_whitespace_take_if(chars, |ch| {
751+
matches!(ch, '\'') || matches!(ch, '\"')
752+
}) {
749753
Some('\'') => self
750754
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
751755
chars,
@@ -772,12 +776,19 @@ impl<'a> Tokenizer<'a> {
772776
// Redshift uses lower case n for national string literal
773777
n @ 'N' | n @ 'n' => {
774778
chars.next(); // consume, to check the next char
775-
match chars.peek() {
779+
match peeking_skip_whitespace_take_if(chars, |ch| {
780+
matches!(ch, '\'') || matches!(ch, '\"')
781+
}) {
776782
Some('\'') => {
777783
// N'...' - a <national character string literal>
778784
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
779785
Ok(Some(Token::NationalStringLiteral(s)))
780786
}
787+
Some('\"') => {
788+
// N"..." - a <national character string literal>
789+
let s = self.tokenize_single_quoted_string(chars, '\"', true)?;
790+
Ok(Some(Token::NationalStringLiteral(s)))
791+
}
781792
_ => {
782793
// regular identifier starting with an "N"
783794
let s = self.tokenize_word(n, chars);
@@ -789,7 +800,7 @@ impl<'a> Tokenizer<'a> {
789800
x @ 'e' | x @ 'E' => {
790801
let starting_loc = chars.location();
791802
chars.next(); // consume, to check the next char
792-
match chars.peek() {
803+
match peeking_skip_whitespace_take_if(chars, |ch| matches!(ch, '\'')) {
793804
Some('\'') => {
794805
let s =
795806
self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
@@ -823,12 +834,19 @@ impl<'a> Tokenizer<'a> {
823834
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
824835
x @ 'x' | x @ 'X' => {
825836
chars.next(); // consume, to check the next char
826-
match chars.peek() {
837+
match peeking_skip_whitespace_take_if(chars, |ch| {
838+
matches!(ch, '\'') || matches!(ch, '\"')
839+
}) {
827840
Some('\'') => {
828841
// X'...' - a <binary string literal>
829842
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
830843
Ok(Some(Token::HexStringLiteral(s)))
831844
}
845+
Some('\"') => {
846+
// X"..." - a <binary string literal>
847+
let s = self.tokenize_single_quoted_string(chars, '\"', true)?;
848+
Ok(Some(Token::HexStringLiteral(s)))
849+
}
832850
_ => {
833851
// regular identifier starting with an "X"
834852
let s = self.tokenize_word(x, chars);
@@ -1674,6 +1692,47 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
16741692
s
16751693
}
16761694

1695+
/// Peek ahead in a clone of `self.peekable`, skipping whitespace,
1696+
/// until `predicate` returns `true` or a non-whitespace character is encountered.
1697+
/// If a character matching the predicate is found:
1698+
/// - Advance the original iterator by the number of whitespace characters skipped
1699+
/// - Return the peeked character matching the predicate
1700+
///
1701+
/// If a non-whitespace character not matching the predicate is encountered, or EOF is reached,
1702+
/// return `self.peek()` without advancing the iterator.
1703+
///
1704+
/// Note: This function may advance the original iterator if a match is found after skipping whitespace.
1705+
fn peeking_skip_whitespace_take_if(
1706+
chars: &mut State,
1707+
mut predicate: impl FnMut(char) -> bool,
1708+
) -> Option<char> {
1709+
// Check if the next character is a match to avoid unnecessary cloning.
1710+
if let Some(&ch) = chars.peek() {
1711+
if predicate(ch) {
1712+
return Some(ch);
1713+
}
1714+
}
1715+
1716+
let mut chars_clone = chars.peekable.clone();
1717+
let mut next_count = 0;
1718+
loop {
1719+
match chars_clone.peek() {
1720+
Some(&ch) if predicate(ch) => {
1721+
// Advance the original iterator
1722+
for _ in 0..next_count {
1723+
chars.next();
1724+
}
1725+
return chars.peek().copied();
1726+
}
1727+
Some(ch) if ch.is_whitespace() || matches!(ch, ' ' | '\t' | '\n' | '\r') => {
1728+
next_count += 1;
1729+
chars_clone.next();
1730+
}
1731+
_ => return chars.peek().copied(),
1732+
}
1733+
}
1734+
}
1735+
16771736
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
16781737
Unescape::new(chars).unescape()
16791738
}

0 commit comments

Comments
 (0)