Skip to content

Commit

Permalink
Parse and transpile more SQL
Browse files Browse the repository at this point in the history
New features include:

- Report unknown functions when parsing.
- Handle ORDER BY "higher up" in the grammar.
- Parse GENERATE_DATE_ARRAY (but we don't transpile yet).
- Handle double-quoted strings.

Paired-with: Tom Caruso <[email protected]>
  • Loading branch information
emk committed Feb 16, 2024
1 parent 5f54517 commit 56e34d2
Show file tree
Hide file tree
Showing 9 changed files with 270 additions and 54 deletions.
50 changes: 40 additions & 10 deletions src/analyze.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,22 @@ use std::collections::HashMap;

use derive_visitor::{Drive, Visitor};

use crate::ast::{FunctionCall, SpecialDateFunctionCall, SqlProgram};
use crate::{
ast::{FunctionCall, Name, Node, NodeVec, SpecialDateFunctionCall, SqlProgram},
scope::{Scope, ScopeGet, ScopeHandle},
tokenizer::Span,
};

/// A `phf` set of functions that are known to take any number of arguments.
static KNOWN_VARARG_FUNCTIONS: phf::Set<&'static str> = phf::phf_set! {
"COALESCE", "CONCAT", "GREATEST", "LEAST",
};

/// Count all the function calls in a [`SqlProgram`].
#[derive(Debug, Default, Visitor)]
#[derive(Debug, Visitor)]
#[visitor(FunctionCall(enter), SpecialDateFunctionCall(enter))]
pub struct FunctionCallCounts {
root_scope: ScopeHandle,
counts: HashMap<String, usize>,
}

Expand All @@ -25,6 +30,18 @@ impl FunctionCallCounts {
sql_program.drive(self)
}

/// Check whether the root scope knows a function called `name` whose type
/// reports that it could be called with as many arguments as `args` holds.
///
/// A failed lookup in the root scope is treated as "not known" and yields
/// `false`. (`airty` is a misspelling of "arity"; the name is kept because
/// the visitor callbacks call it by this name.)
fn is_known_function_and_airty<T: Node>(&self, name: &str, args: &NodeVec<T>) -> bool {
    // The lookup needs a `Name`; we have no source location for it here.
    let lookup_name = Name::new(name, Span::Unknown);
    self.root_scope
        .get_function_type(&lookup_name)
        .map_or(false, |signature| {
            signature.could_be_called_with_arg_count(args.node_iter().count())
        })
}

fn record_call(&mut self, name: String) {
let count = self.counts.entry(name).or_default();
*count += 1;
Expand All @@ -48,28 +65,32 @@ impl FunctionCallCounts {
if function_call.over_clause.is_some() {
name.push_str(" OVER(..)");
}
if !self.is_known_function_and_airty(&base_name, &function_call.args) {
name.push_str(" (UNKNOWN)");
}
self.record_call(name);
}

fn enter_special_date_function_call(
&mut self,
special_date_function_call: &SpecialDateFunctionCall,
) {
let mut name = format!(
"{}(",
special_date_function_call
.function_name
.ident
.name
.to_ascii_uppercase(),
);
let base_name = special_date_function_call
.function_name
.ident
.name
.to_ascii_uppercase();
let mut name = format!("{}(", base_name);
for (i, _) in special_date_function_call.args.node_iter().enumerate() {
if i > 0 {
name.push(',');
}
name.push('_');
}
name.push_str(") (special)");
if !self.is_known_function_and_airty(&base_name, &special_date_function_call.args) {
name.push_str(" (UNKNOWN)");
}
self.record_call(name);
}

Expand All @@ -81,3 +102,12 @@ impl FunctionCallCounts {
counts
}
}

impl Default for FunctionCallCounts {
    /// Create a counter with no recorded calls whose function lookups go
    /// through `Scope::root()`.
    fn default() -> Self {
        let root_scope = Scope::root();
        let counts = HashMap::new();
        Self { root_scope, counts }
    }
}
73 changes: 52 additions & 21 deletions src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -644,22 +644,33 @@ pub struct QueryStatement {
/// [official grammar]:
/// https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#sql_syntax.
#[derive(Clone, Debug, Drive, DriveMut, Emit, EmitDefault, Spanned, ToTokens)]
pub enum QueryExpression {
SelectExpression(SelectExpression),
pub struct QueryExpression {
pub with_clause: Option<QueryExpressionWithClause>,
pub query: QueryExpressionQuery,
pub order_by: Option<OrderBy>,
pub limit: Option<Limit>,
}

/// The `WITH` clause of a `QueryExpression`.
///
/// Holds the `WITH` keyword itself plus the list of common table expressions
/// that follow it.
#[derive(Clone, Debug, Drive, DriveMut, Emit, EmitDefault, Spanned, ToTokens)]
pub struct QueryExpressionWithClause {
/// The `WITH` keyword that introduces the clause.
pub with_token: Keyword,
/// The common table expressions listed after `WITH`.
pub ctes: NodeVec<CommonTableExpression>,
}

/// The actual query portion of a `QueryExpression`.
#[derive(Clone, Debug, Drive, DriveMut, Emit, EmitDefault, Spanned, ToTokens)]
pub enum QueryExpressionQuery {
Select(SelectExpression),
Nested {
paren1: Punct,
query: Box<QueryStatement>,
paren2: Punct,
},
With {
with_token: Keyword,
ctes: NodeVec<CommonTableExpression>,
query: Box<QueryStatement>,
},
SetOperation {
left: Box<QueryExpression>,
left: Box<QueryExpressionQuery>,
set_operator: SetOperator,
right: Box<QueryExpression>,
right: Box<QueryExpressionQuery>,
},
}

Expand Down Expand Up @@ -2139,28 +2150,47 @@ peg::parser! {
}
}

pub rule query_expression() -> QueryExpression = precedence! {
// Parse a complete query expression. The parts are matched in this order:
// an optional `WITH` clause, the query itself, an optional `order_by()`
// match and an optional `limit()` match. Each part is stored in the
// corresponding field of `QueryExpression`.
pub rule query_expression() -> QueryExpression =
with_clause:query_expression_with_clause()?
query:query_expression_query()
order_by:order_by()?
limit:limit()?
{
QueryExpression {
with_clause,
query,
order_by,
limit,
}
}

// Parse the `WITH` keyword followed by a list of common table expressions
// separated by ",".
// NOTE(review): `sep_opt_trailing` presumably also accepts an optional
// trailing separator — confirm against its definition.
rule query_expression_with_clause() -> QueryExpressionWithClause
= with_token:k("WITH") ctes:sep_opt_trailing(<common_table_expression()>, ",") {
QueryExpressionWithClause {
with_token,
ctes,
}
}

pub rule query_expression_query() -> QueryExpressionQuery = precedence! {
left:(@) set_operator:set_operator() right:@ {
QueryExpression::SetOperation {
left: Box::new(left), set_operator, right: Box::new(right)
QueryExpressionQuery::SetOperation {
left: Box::new(left),
set_operator,
right: Box::new(right),
}
}
--
select_expression:select_expression() { QueryExpression::SelectExpression(select_expression) }
select_expression:select_expression() {
QueryExpressionQuery::Select(select_expression)
}
paren1:p("(") query:query_statement() paren2:p(")") {
QueryExpression::Nested {
QueryExpressionQuery::Nested {
paren1,
query: Box::new(query),
paren2,
}
}
with_token:k("WITH") ctes:sep_opt_trailing(<common_table_expression()>, ",") query:query_statement() {
QueryExpression::With {
with_token,
ctes,
query: Box::new(query),
}
}
}

rule set_operator() -> SetOperator
Expand Down Expand Up @@ -2587,6 +2617,7 @@ peg::parser! {
// Match, as a pseudo-keyword, the name of one of the date/datetime functions
// that this grammar handles through its "special date function" rules.
rule special_date_function_name() -> PseudoKeyword
= pk("DATE_DIFF") / pk("DATE_TRUNC") / pk("DATE_ADD") / pk("DATE_SUB")
/ pk("DATETIME_DIFF") / pk("DATETIME_TRUNC") / pk("DATETIME_ADD") / pk("DATETIME_SUB")
/ pk("GENERATE_DATE_ARRAY")

rule special_date_expression() -> SpecialDateExpression
= interval:interval_expression() { SpecialDateExpression::Interval(interval) }
Expand Down
2 changes: 1 addition & 1 deletion src/cmd/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ pub fn cmd_parse(files: &mut KnownFiles, opt: &ParseOpt) -> Result<()> {
Ok(sql_program) => {
ok_count += 1;
ok_line_count += row.query.lines().count();
println!("OK {}", row.id);
//println!("OK {}", row.id);
if opt.count_function_calls {
function_call_counts.visit(&sql_program);
}
Expand Down
79 changes: 64 additions & 15 deletions src/infer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,33 +225,78 @@ impl InferTypes for ast::QueryExpression {
type Scope = ScopeHandle;
type Output = TableType;

/// Infer the table type produced by this query expression.
///
/// The `WITH` clause (if any) is processed first and supplies the scope used
/// for the query itself; `ORDER BY` and `LIMIT` are checked afterwards. The
/// returned type is the one inferred for the query.
fn infer_types(&mut self, scope: &ScopeHandle) -> Result<Self::Output> {
    // The pattern names every field, so adding a field to `QueryExpression`
    // fails to compile here until it is handled.
    let ast::QueryExpression {
        with_clause,
        query,
        order_by,
        limit,
    } = self;

    // A `WITH` clause hands back the scope that the query should see.
    let scope = match with_clause {
        Some(with_clause) => with_clause.infer_types(scope)?,
        None => scope.clone(),
    };
    let (ty, column_set_scope) = query.infer_types(&scope)?;

    if let Some(order_by) = order_by {
        // Use the column set scope from the query when it gave us one;
        // otherwise build one from the inferred table type.
        let order_by_scope = match column_set_scope {
            Some(existing) => existing,
            None => ColumnSetScope::new_from_table_type(&scope, &ty),
        };
        order_by.infer_types(&order_by_scope)?;
    }
    if let Some(limit) = limit {
        limit.infer_types(&())?;
    }
    Ok(ty)
}
}

impl InferTypes for ast::QueryExpressionWithClause {
    type Scope = ScopeHandle;
    type Output = ScopeHandle;

    /// Infer types for each CTE in order, threading the scope through: the
    /// scope returned for one CTE is the scope passed to the next.
    ///
    /// Returns the scope produced by the last CTE, or a clone of `scope`
    /// when there are no CTEs. Stops at the first CTE that fails.
    fn infer_types(&mut self, scope: &ScopeHandle) -> Result<Self::Output> {
        self.ctes
            .node_iter_mut()
            .try_fold(scope.clone(), |current, cte| cte.infer_types(&current))
    }
}

impl InferTypes for ast::QueryExpressionQuery {
type Scope = ScopeHandle;

/// We return both a `TableType` and _possibly_ a `ColumnSetScope` because
/// `ORDER BY` may need a full `ColumnSetScope` to support things like
/// `ORDER BY table1.col1, table2.col2`. But the `ColumnSetScope` is easily
/// lost in more complicated cases. See the test `order_and_limit.sql` for
/// example code.
type Output = (TableType, Option<ColumnSetScope>);

fn infer_types(&mut self, scope: &ScopeHandle) -> Result<Self::Output> {
match self {
ast::QueryExpression::SelectExpression(expr) => expr.infer_types(scope),
ast::QueryExpression::Nested { query, .. } => query.infer_types(scope),
ast::QueryExpression::With { ctes, query, .. } => {
// Non-recursive CTEs, so each will create a new namespace.
let mut scope = scope.to_owned();
for cte in ctes.node_iter_mut() {
scope = cte.infer_types(&scope)?;
}
query.infer_types(&scope)
ast::QueryExpressionQuery::Select(expr) => {
let (ty, column_set_scope) = expr.infer_types(scope)?;
Ok((ty, Some(column_set_scope)))
}
ast::QueryExpressionQuery::Nested { query, .. } => {
Ok((query.infer_types(scope)?, None))
}
ast::QueryExpression::SetOperation {
ast::QueryExpressionQuery::SetOperation {
left,
set_operator,
right,
} => {
let left_ty = left.infer_types(scope)?;
let right_ty = right.infer_types(scope)?;
let (left_ty, _) = left.infer_types(scope)?;
let (right_ty, _) = right.infer_types(scope)?;
let result_ty = left_ty.common_supertype(&right_ty).ok_or_else(|| {
Error::annotated(
format!("cannot combine {} and {}", left_ty, right_ty),
set_operator.span(),
"incompatible types",
)
})?;
Ok(result_ty)
Ok((result_ty, None))
}
}
}
Expand All @@ -271,7 +316,11 @@ impl InferTypes for ast::CommonTableExpression {

impl InferTypes for ast::SelectExpression {
type Scope = ScopeHandle;
type Output = TableType;

/// We return both a `TableType` and a `ColumnSetScope` because `ORDER BY`
/// may need a full `ColumnSetScope` to support things like `ORDER BY
/// table1.col1, table2.col2`, which is allowed in certain contexts.
type Output = (TableType, ColumnSetScope);

fn infer_types(&mut self, scope: &ScopeHandle) -> Result<Self::Output> {
// In order of type inference:
Expand Down Expand Up @@ -431,7 +480,7 @@ impl InferTypes for ast::SelectExpression {
}

*ty = Some(table_type.clone());
Ok(table_type)
Ok((table_type, column_set_scope))
}
}

Expand Down
10 changes: 10 additions & 0 deletions src/scope.rs
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,16 @@ impl ColumnSetScope {
}
}

/// Build a column set scope whose column set is derived from a
/// [`TableType`], with `parent` as the parent scope.
///
/// This exists to support the more complicated rules around constructs
/// such as `(... UNION ALL ...) ORDER BY ...`.
pub fn new_from_table_type(parent: &ScopeHandle, table_type: &TableType) -> Self {
    let column_set = ColumnSet::from_table(None, table_type.clone());
    let parent = parent.clone();
    Self { parent, column_set }
}

/// Get our [`ColumnSet`].
pub fn column_set(&self) -> &ColumnSet {
&self.column_set
Expand Down
11 changes: 11 additions & 0 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -616,10 +616,16 @@ impl TokenStream {
}

/// Try to parse this stream as a [`ast::QueryExpression`].
///
/// Consumes the stream and hands the `ast::sql_program::query_expression`
/// rule to `try_into_parsed`.
// NOTE(review): `dead_code` is presumably allowed because nothing calls this
// method at the moment — confirm, and record the reason here.
#[allow(dead_code)]
pub fn try_into_query_expression(self) -> Result<ast::QueryExpression> {
self.try_into_parsed(ast::sql_program::query_expression)
}

/// Try to parse this stream as a [`ast::QueryExpressionQuery`].
///
/// Consumes the stream and hands the
/// `ast::sql_program::query_expression_query` rule to `try_into_parsed`.
pub fn try_into_query_expression_query(self) -> Result<ast::QueryExpressionQuery> {
self.try_into_parsed(ast::sql_program::query_expression_query)
}

/// Try to parse this stream as a [`ast::SelectExpression`].
pub fn try_into_select_expression(self) -> Result<ast::SelectExpression> {
self.try_into_parsed(ast::sql_program::select_expression)
Expand Down Expand Up @@ -1076,6 +1082,11 @@ peg::parser! {
let value = LiteralValue::String(s.into_iter().collect());
Literal { token, value }
} }
/ quiet! { s_and_token:t(<"\"" s:(([^ '\\' | '\"'] / escape())*) "\"" { s }>) {
let (s, token) = s_and_token;
let value = LiteralValue::String(s.into_iter().collect());
Literal { token, value }
} }
/ quiet! { s_and_token:t(<"r'" s:[^ '\'']* "'" { s }>) {
let (s, token) = s_and_token;
let value = LiteralValue::String(s.into_iter().collect());
Expand Down
Loading

0 comments on commit 56e34d2

Please sign in to comment.