Skip to content

Commit 0908ddb

Browse files
shehabgamin and linhr
authored
Value Prefix Whitespace (#6)
Co-authored-by: Heran Lin <[email protected]>
1 parent d1e08c9 commit 0908ddb

File tree

3 files changed

+74
-7
lines changed

3 files changed

+74
-7
lines changed

.github/workflows/rust.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ jobs:
6565
with:
6666
crate: cargo-tarpaulin
6767
version: 0.14.2
68-
use-tool-cache: true
68+
use-tool-cache: false
6969
- name: Test
7070
run: cargo test --all-features
7171

@@ -83,7 +83,7 @@ jobs:
8383
with:
8484
crate: cargo-tarpaulin
8585
version: 0.14.2
86-
use-tool-cache: true
86+
use-tool-cache: false
8787
- name: Coverage
8888
run: cargo tarpaulin -o Lcov --output-dir ./coverage
8989
- name: Coveralls

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
# Patching SQL Parser for LakeSail
2+
3+
1. Use `dev` as the base branch when creating PRs in the fork.
4+
2. Please confirm the base repository when creating PRs. You should manually choose `lakehq/sqlparser-rs` when proposing changes to the fork.
5+
3. For patching, use a squash commit to merge the PR. This ensures that each patch appears as a single commit in the `dev` branch of the fork.
6+
4. For merging from upstream, use a merge commit to merge the PR. This ensures that the upstream history is kept in the `dev` branch of the fork.
7+
5. Please avoid mixing code changes and upstream merge in a single PR.
8+
19
# Extensible SQL Lexer and Parser for Rust
210

311
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

src/tokenizer.rs

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,9 @@ impl<'a> Tokenizer<'a> {
706706
// BigQuery uses b or B for byte string literal
707707
b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
708708
chars.next(); // consume
709-
match chars.peek() {
709+
match peeking_skip_whitespace_take_if(chars, |ch| {
710+
matches!(ch, '\'') || matches!(ch, '\"')
711+
}) {
710712
Some('\'') => {
711713
if self.dialect.supports_triple_quoted_string() {
712714
return self
@@ -745,7 +747,9 @@ impl<'a> Tokenizer<'a> {
745747
// BigQuery uses r or R for raw string literal
746748
b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
747749
chars.next(); // consume
748-
match chars.peek() {
750+
match peeking_skip_whitespace_take_if(chars, |ch| {
751+
matches!(ch, '\'') || matches!(ch, '\"')
752+
}) {
749753
Some('\'') => self
750754
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
751755
chars,
@@ -772,12 +776,19 @@ impl<'a> Tokenizer<'a> {
772776
// Redshift uses lower case n for national string literal
773777
n @ 'N' | n @ 'n' => {
774778
chars.next(); // consume, to check the next char
775-
match chars.peek() {
779+
match peeking_skip_whitespace_take_if(chars, |ch| {
780+
matches!(ch, '\'') || matches!(ch, '\"')
781+
}) {
776782
Some('\'') => {
777783
// N'...' - a <national character string literal>
778784
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
779785
Ok(Some(Token::NationalStringLiteral(s)))
780786
}
787+
Some('\"') => {
788+
// N"..." - a <national character string literal>
789+
let s = self.tokenize_single_quoted_string(chars, '\"', true)?;
790+
Ok(Some(Token::NationalStringLiteral(s)))
791+
}
781792
_ => {
782793
// regular identifier starting with an "N"
783794
let s = self.tokenize_word(n, chars);
@@ -789,7 +800,7 @@ impl<'a> Tokenizer<'a> {
789800
x @ 'e' | x @ 'E' => {
790801
let starting_loc = chars.location();
791802
chars.next(); // consume, to check the next char
792-
match chars.peek() {
803+
match peeking_skip_whitespace_take_if(chars, |ch| matches!(ch, '\'')) {
793804
Some('\'') => {
794805
let s =
795806
self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
@@ -823,12 +834,19 @@ impl<'a> Tokenizer<'a> {
823834
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
824835
x @ 'x' | x @ 'X' => {
825836
chars.next(); // consume, to check the next char
826-
match chars.peek() {
837+
match peeking_skip_whitespace_take_if(chars, |ch| {
838+
matches!(ch, '\'') || matches!(ch, '\"')
839+
}) {
827840
Some('\'') => {
828841
// X'...' - a <binary string literal>
829842
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
830843
Ok(Some(Token::HexStringLiteral(s)))
831844
}
845+
Some('\"') => {
846+
// X"..." - a <binary string literal>
847+
let s = self.tokenize_single_quoted_string(chars, '\"', true)?;
848+
Ok(Some(Token::HexStringLiteral(s)))
849+
}
832850
_ => {
833851
// regular identifier starting with an "X"
834852
let s = self.tokenize_word(x, chars);
@@ -1674,6 +1692,47 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
16741692
s
16751693
}
16761694

1695+
/// Peek ahead in a clone of `self.peekable`, skipping whitespace,
1696+
/// until `predicate` returns `true` or a non-whitespace character is encountered.
1697+
/// If a character matching the predicate is found:
1698+
/// - Advance the original iterator by the number of whitespace characters skipped
1699+
/// - Return the peeked character matching the predicate
1700+
///
1701+
/// If a non-whitespace character not matching the predicate is encountered, or EOF is reached,
1702+
/// return `self.peek()` without advancing the iterator.
1703+
///
1704+
/// Note: This function may advance the original iterator if a match is found after skipping whitespace.
1705+
fn peeking_skip_whitespace_take_if(
1706+
chars: &mut State,
1707+
mut predicate: impl FnMut(char) -> bool,
1708+
) -> Option<char> {
1709+
// Check if the next character is a match to avoid unnecessary cloning.
1710+
if let Some(&ch) = chars.peek() {
1711+
if predicate(ch) {
1712+
return Some(ch);
1713+
}
1714+
}
1715+
1716+
let mut chars_clone = chars.peekable.clone();
1717+
let mut next_count = 0;
1718+
loop {
1719+
match chars_clone.peek() {
1720+
Some(&ch) if predicate(ch) => {
1721+
// Advance the original iterator
1722+
for _ in 0..next_count {
1723+
chars.next();
1724+
}
1725+
return chars.peek().copied();
1726+
}
1727+
Some(ch) if ch.is_whitespace() || matches!(ch, ' ' | '\t' | '\n' | '\r') => {
1728+
next_count += 1;
1729+
chars_clone.next();
1730+
}
1731+
_ => return chars.peek().copied(),
1732+
}
1733+
}
1734+
}
1735+
16771736
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
16781737
Unescape::new(chars).unescape()
16791738
}

0 commit comments

Comments
 (0)