From 50032f298b2815e41fa7ca1cea41562fd0e804ee Mon Sep 17 00:00:00 2001 From: psteinroe Date: Fri, 22 Sep 2023 12:26:44 +0300 Subject: [PATCH 01/16] feat: implement the non-proc macro version of resolve_tokens --- crates/parser/src/get_children_codegen.rs | 30 +++++ crates/parser/src/get_location_codegen.rs | 3 + crates/parser/src/lib.rs | 3 + crates/parser/src/resolve_tokens.rs | 141 ++++++++++++++++++++++ crates/parser/src/statement_parser.rs | 9 +- 5 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 crates/parser/src/get_children_codegen.rs create mode 100644 crates/parser/src/get_location_codegen.rs create mode 100644 crates/parser/src/resolve_tokens.rs diff --git a/crates/parser/src/get_children_codegen.rs b/crates/parser/src/get_children_codegen.rs new file mode 100644 index 00000000..13b895f2 --- /dev/null +++ b/crates/parser/src/get_children_codegen.rs @@ -0,0 +1,30 @@ +use codegen::get_children; + +get_children!(); + +#[cfg(test)] +mod tests { + use crate::get_children_codegen::get_children; + + #[test] + fn test_get_children() { + let input = "with c as (insert into contact (id) values ('id')) select * from c;"; + + let pg_query_root = match pg_query::parse(input) { + Ok(parsed) => Some( + parsed + .protobuf + .nodes() + .iter() + .find(|n| n.1 == 1) + .unwrap() + .0 + .to_enum(), + ), + Err(_) => None, + }; + + let children = get_children(&pg_query_root.unwrap(), input.to_string(), 1); + assert_eq!(children.len(), 13); + } +} diff --git a/crates/parser/src/get_location_codegen.rs b/crates/parser/src/get_location_codegen.rs new file mode 100644 index 00000000..fcc6685d --- /dev/null +++ b/crates/parser/src/get_location_codegen.rs @@ -0,0 +1,3 @@ +use codegen::get_location; + +get_location!(); diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 90b2f9a2..8be85d21 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -16,7 +16,10 @@ //! To see how these drawbacks are mitigated, see the `statement.rs` and the `source_file.rs` module. mod ast_node; +mod get_children_codegen; +mod get_location_codegen; mod parser; +mod resolve_tokens; mod sibling_token; mod source_parser; mod statement_parser; diff --git a/crates/parser/src/resolve_tokens.rs b/crates/parser/src/resolve_tokens.rs new file mode 100644 index 00000000..d2dde198 --- /dev/null +++ b/crates/parser/src/resolve_tokens.rs @@ -0,0 +1,141 @@ +use crate::get_children_codegen::ChildrenNode; +use crate::get_location_codegen::get_location; +use cstree::text::{TextRange, TextSize}; +use pg_query::{protobuf::ScanToken, NodeEnum}; + +// all tokens of a node beneath it +// get estimation for each node location from tokens +// and also node range +// +// how to handle tokens that cannot be put beneath node based on the ast? +// pass token -> if not beneath current node, apply immediately + +#[derive(Debug, Clone)] +pub struct NestedNode { + pub node: NodeEnum, + pub depth: i32, + pub path: String, + pub tokens: Vec, + pub range: TextRange, +} + +/// Turns a `Vec` into a `Vec` by adding `tokens` and `range` to each node. +/// +/// For each node, we walk all properties and search for tokens that match the property value. The +/// token that is closest to the node or a parent is used. +/// +/// The node range is the minimum start and maximum end of all tokens. +pub fn resolve_tokens( + children: &Vec, + tokens: &Vec, + text: &str, +) -> Vec { + children + .iter() + .map(|c| { + let nearest_parent_location = get_nearest_parent_location(&c, children); + let furthest_child_location = get_furthest_child_location(&c, children); + + let mut child_tokens = Vec::new(); + + let mut find_token = |property: String| { + child_tokens.push( + tokens + .iter() + .filter_map(|t| { + if get_token_text( + usize::try_from(t.start).unwrap(), + usize::try_from(t.end).unwrap(), + text, + ) != property + { + return None; + } + + if furthest_child_location.is_some() + && furthest_child_location.unwrap() < t.start as i32 + { + return None; + } + + let distance = t.start - nearest_parent_location; + if distance > 0 { + Some((distance, t)) + } else { + None + } + }) + .min_by_key(|(d, _)| d.to_owned()) + .map(|(_, t)| t) + .unwrap(), + ); + }; + + match &c.node { + NodeEnum::RangeVar(n) => { + find_token(n.relname.to_owned()); + } + _ => {} + }; + + NestedNode { + node: c.node.to_owned(), + depth: c.depth, + path: c.path.to_owned(), + tokens: child_tokens.iter().map(|t| t.token).collect(), + range: TextRange::new( + TextSize::from( + child_tokens.iter().min_by_key(|t| t.start).unwrap().start as u32, + ), + TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32), + ), + } + }) + .collect() +} + +fn get_token_text(start: usize, end: usize, text: &str) -> String { + text.chars() + .skip(start) + .take(end - start) + .collect::() +} + +fn get_furthest_child_location(c: &ChildrenNode, children: &Vec) -> Option { + children + .iter() + .filter_map(|n| { + if !n.path.starts_with(c.path.as_str()) { + return None; + } + get_location(&n.node) + }) + .max() +} + +fn get_nearest_parent_location(n: &ChildrenNode, children: &Vec) -> i32 { + // if location is set, return it + let location = get_location(&n.node); + if location.is_some() { + return location.unwrap(); + } + + // go up in the tree and check if location exists on any parent + let mut path_elements = n.path.split(".").collect::>(); + path_elements.pop(); + while path_elements.len() > 0 { + let parent_path = path_elements.join("."); + let node = children.iter().find(|c| c.path == parent_path); + if node.is_some() { + let location = get_location(&node.unwrap().node); + if location.is_some() { + return location.unwrap(); + } + } + + path_elements.pop(); + } + + // fallback to 0 + return 0; +} diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index d0f6a25c..68644702 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -1,7 +1,7 @@ use cstree::text::{TextRange, TextSize}; use logos::{Logos, Span}; -use crate::{parser::Parser, syntax_kind_codegen::SyntaxKind}; +use crate::{get_children_codegen::get_children, parser::Parser, syntax_kind_codegen::SyntaxKind}; /// A super simple lexer for sql statements. /// @@ -83,6 +83,13 @@ impl Parser { } }; + let mut pg_query_nodes = match &pg_query_root { + Some(root) => get_children(root, text.to_string(), 1) + .into_iter() + .peekable(), + None => Vec::new().into_iter().peekable(), + }; + let mut lexer = StatementToken::lexer(&text); // parse root node if no syntax errors From 06b5836254f0c74b01d3dfebf288f6efde7c966c Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 24 Sep 2023 18:51:04 +0300 Subject: [PATCH 02/16] feat: resolve tokens --- crates/parser/src/resolve_tokens.rs | 23 ++++++++--------------- crates/parser/src/statement_parser.rs | 21 +++++++++++++++------ 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/crates/parser/src/resolve_tokens.rs b/crates/parser/src/resolve_tokens.rs index d2dde198..93a8a877 100644 --- a/crates/parser/src/resolve_tokens.rs +++ b/crates/parser/src/resolve_tokens.rs @@ -3,18 +3,11 @@ use crate::get_location_codegen::get_location; use cstree::text::{TextRange, TextSize}; use pg_query::{protobuf::ScanToken, NodeEnum}; -// all tokens of a node beneath it -// get estimation for each node location from tokens -// and also node range -// -// how to handle tokens that cannot be put beneath node based on the ast? -// pass token -> if not beneath current node, apply immediately - #[derive(Debug, Clone)] pub struct NestedNode { - pub node: NodeEnum, - pub depth: i32, - pub path: String, + pub id: usize, + pub inner: ChildrenNode, + // .start property of `ScanToken` pub tokens: Vec, pub range: TextRange, } @@ -32,7 +25,8 @@ pub fn resolve_tokens( ) -> Vec { children .iter() - .map(|c| { + .enumerate() + .map(|(idx, c)| { let nearest_parent_location = get_nearest_parent_location(&c, children); let furthest_child_location = get_furthest_child_location(&c, children); @@ -79,10 +73,9 @@ pub fn resolve_tokens( }; NestedNode { - node: c.node.to_owned(), - depth: c.depth, - path: c.path.to_owned(), - tokens: child_tokens.iter().map(|t| t.token).collect(), + id: idx, + inner: c.to_owned(), + tokens: child_tokens.iter().map(|t| t.start).collect(), range: TextRange::new( TextSize::from( child_tokens.iter().min_by_key(|t| t.start).unwrap().start as u32, diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index 68644702..fde95be8 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -1,7 +1,10 @@ use cstree::text::{TextRange, TextSize}; use logos::{Logos, Span}; -use crate::{get_children_codegen::get_children, parser::Parser, syntax_kind_codegen::SyntaxKind}; +use crate::{ + get_children_codegen::get_children, parser::Parser, resolve_tokens::resolve_tokens, + syntax_kind_codegen::SyntaxKind, +}; /// A super simple lexer for sql statements. /// @@ -57,10 +60,10 @@ impl Parser { ); let mut pg_query_tokens = match pg_query::scan(text) { - Ok(scanned) => scanned.tokens.into_iter().peekable(), + Ok(scanned) => scanned.tokens, Err(e) => { self.error(e.to_string(), range); - Vec::new().into_iter().peekable() + Vec::new() } }; @@ -84,12 +87,18 @@ impl Parser { }; let mut pg_query_nodes = match &pg_query_root { - Some(root) => get_children(root, text.to_string(), 1) - .into_iter() - .peekable(), + Some(root) => resolve_tokens( + &get_children(root, text.to_string(), 1), + &pg_query_tokens, + &text, + ) + .into_iter() + .peekable(), None => Vec::new().into_iter().peekable(), }; + let mut pg_query_tokens = pg_query_tokens.iter().peekable(); + let mut lexer = StatementToken::lexer(&text); // parse root node if no syntax errors From 2b7729d47691a6e5d3ab3c3edc89c284449cb918 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 1 Oct 2023 16:31:48 +0200 Subject: [PATCH 03/16] feat: refactor and improve all over the place --- .../src/{get_children.rs => get_nodes.rs} | 12 +- crates/codegen/src/lib.rs | 8 +- ...ildren_codegen.rs => get_nodes_codegen.rs} | 12 +- crates/parser/src/lib.rs | 4 +- crates/parser/src/parser.rs | 72 +--- crates/parser/src/resolve_tokens.rs | 185 +++++---- crates/parser/src/sibling_token.rs | 31 -- crates/parser/src/source_parser.rs | 17 +- crates/parser/src/statement_parser.rs | 380 ++++++++++++++---- 9 files changed, 429 insertions(+), 292 deletions(-) rename crates/codegen/src/{get_children.rs => get_nodes.rs} (92%) rename crates/parser/src/{get_children_codegen.rs => get_nodes_codegen.rs} (67%) delete mode 100644 crates/parser/src/sibling_token.rs diff --git a/crates/codegen/src/get_children.rs b/crates/codegen/src/get_nodes.rs similarity index 92% rename from crates/codegen/src/get_children.rs rename to crates/codegen/src/get_nodes.rs index e92c5f6e..26fd5a53 100644 --- a/crates/codegen/src/get_children.rs +++ b/crates/codegen/src/get_nodes.rs @@ -2,7 +2,7 @@ use pg_query_proto_parser::{FieldType, Node, ProtoParser}; use proc_macro2::{Ident, TokenStream}; use quote::{format_ident, quote}; -pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenStream { +pub fn get_nodes_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenStream { let parser = ProtoParser::new("./libpg_query/protobuf/pg_query.proto"); let proto_file = parser.parse(); @@ -16,7 +16,7 @@ pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt use std::collections::VecDeque; #[derive(Debug, Clone)] - pub struct ChildrenNode { + pub struct Node { pub node: NodeEnum, pub depth: i32, pub path: String, @@ -24,8 +24,10 @@ pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt /// Returns all children of the node, recursively /// location is resolved manually - pub fn get_children(node: &NodeEnum, text: String, current_depth: i32) -> Vec { - let mut nodes: Vec = vec![]; + pub fn get_nodes(node: &NodeEnum, text: String, current_depth: i32) -> Vec { + let mut nodes: Vec = vec![ + Node { node: node.to_owned(), depth: current_depth, path: "0".to_string() } + ]; // Node, depth, path let mut stack: VecDeque<(NodeEnum, i32, String)> = VecDeque::from(vec![(node.to_owned(), current_depth, "0".to_string())]); @@ -37,7 +39,7 @@ pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt let path = path.clone() + "." + child_ctr.to_string().as_str(); child_ctr = child_ctr + 1; stack.push_back((c.to_owned(), current_depth, path.clone())); - nodes.push(ChildrenNode { + nodes.push(Node { node: c, depth: current_depth, path: path.clone(), diff --git a/crates/codegen/src/lib.rs b/crates/codegen/src/lib.rs index fba42ea7..bc63d4f6 100644 --- a/crates/codegen/src/lib.rs +++ b/crates/codegen/src/lib.rs @@ -1,14 +1,14 @@ -mod get_children; mod get_location; +mod get_nodes; mod syntax_kind; -use get_children::get_children_mod; use get_location::get_location_mod; +use get_nodes::get_nodes_mod; use syntax_kind::syntax_kind_mod; #[proc_macro] -pub fn get_children(item: proc_macro::TokenStream) -> proc_macro::TokenStream { - get_children_mod(item.into()).into() +pub fn get_nodes(item: proc_macro::TokenStream) -> proc_macro::TokenStream { + get_nodes_mod(item.into()).into() } #[proc_macro] diff --git a/crates/parser/src/get_children_codegen.rs b/crates/parser/src/get_nodes_codegen.rs similarity index 67% rename from crates/parser/src/get_children_codegen.rs rename to crates/parser/src/get_nodes_codegen.rs index 13b895f2..3305baab 100644 --- a/crates/parser/src/get_children_codegen.rs +++ b/crates/parser/src/get_nodes_codegen.rs @@ -1,13 +1,13 @@ -use codegen::get_children; +use codegen::get_nodes; -get_children!(); +get_nodes!(); #[cfg(test)] mod tests { - use crate::get_children_codegen::get_children; + use crate::get_nodes_codegen::get_nodes; #[test] - fn test_get_children() { + fn test_get_nodes() { let input = "with c as (insert into contact (id) values ('id')) select * from c;"; let pg_query_root = match pg_query::parse(input) { @@ -24,7 +24,7 @@ mod tests { Err(_) => None, }; - let children = get_children(&pg_query_root.unwrap(), input.to_string(), 1); - assert_eq!(children.len(), 13); + let nodes = get_nodes(&pg_query_root.unwrap(), input.to_string(), 1); + assert_eq!(nodes.len(), 14); } } diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 8be85d21..5344ff27 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -16,17 +16,15 @@ //! To see how these drawbacks are mitigated, see the `statement.rs` and the `source_file.rs` module. mod ast_node; -mod get_children_codegen; mod get_location_codegen; +mod get_nodes_codegen; mod parser; mod resolve_tokens; -mod sibling_token; mod source_parser; mod statement_parser; mod syntax_error; mod syntax_kind_codegen; mod syntax_node; -pub use crate::parser::{Parse, Parser}; pub use crate::syntax_kind_codegen::SyntaxKind; pub use crate::syntax_node::{SyntaxElement, SyntaxNode, SyntaxToken}; diff --git a/crates/parser/src/parser.rs b/crates/parser/src/parser.rs index 80219f8d..2f311ba5 100644 --- a/crates/parser/src/parser.rs +++ b/crates/parser/src/parser.rs @@ -1,6 +1,5 @@ use cstree::syntax::ResolvedNode; use cstree::{build::GreenNodeBuilder, text::TextRange}; -use log::debug; use pg_query::NodeEnum; use crate::ast_node::RawStmt; @@ -8,7 +7,7 @@ use crate::syntax_error::SyntaxError; use crate::syntax_kind_codegen::SyntaxKind; use crate::syntax_node::SyntaxNode; -/// Main parser that controls the cst building process, and collects errors and statements +/// Main parser that exposes the `cstree` api, and collects errors and statements #[derive(Debug)] pub struct Parser { /// The cst builder @@ -17,16 +16,9 @@ pub struct Parser { errors: Vec, /// The pg_query statements representing the abtract syntax tree stmts: Vec, - /// The current checkpoint depth, if any - checkpoint: Option, - /// Whether the parser is currently parsing a flat node - is_parsing_flat_node: bool, - /// Keeps track of currently open nodes - /// Latest opened is last - open_nodes: Vec<(SyntaxKind, i32)>, } -/// Result of parsing +/// Result of Building #[derive(Debug)] pub struct Parse { /// The concrete syntax tree @@ -43,72 +35,16 @@ impl Parser { inner: GreenNodeBuilder::new(), errors: Vec::new(), stmts: Vec::new(), - checkpoint: None, - is_parsing_flat_node: false, - open_nodes: Vec::new(), } } - /// close all nodes until the specified depth is reached - pub fn close_until_depth(&mut self, depth: i32) { - debug!("close until depth {}", depth); - if self.open_nodes.is_empty() || self.get_current_depth() < depth { - return; - } - loop { - if self.open_nodes.is_empty() || self.get_current_depth() < depth { - break; - } - self.finish_node(); - } - } - - fn get_current_depth(&self) -> i32 { - self.open_nodes[self.open_nodes.len() - 1].1 - } - - /// set a checkpoint at current depth - /// - /// if `is_parsing_flat_node` is true, all tokens parsed until this checkpoint is closed will be applied immediately - pub fn set_checkpoint(&mut self) { - assert!( - self.checkpoint.is_none(), - "Must close previouos checkpoint before setting new one" - ); - self.checkpoint = Some(self.get_current_depth()); - } - - /// close all nodes until checkpoint depth is reached - pub fn close_checkpoint(&mut self) { - if self.checkpoint.is_some() { - self.close_until_depth(self.checkpoint.unwrap()); - } - self.checkpoint = None; - self.is_parsing_flat_node = false; - } - - /// start a new node of `SyntaxKind` at `depth` - /// handles closing previous nodes if necessary - pub fn start_node_at(&mut self, kind: SyntaxKind, depth: i32) { - debug!("starting node at depth {} {:?}", depth, kind); - // close until target depth - self.close_until_depth(depth); - - self.open_nodes.push((kind, depth)); - debug!("start node {:?}", kind); + /// start a new node of `SyntaxKind` + pub fn start_node(&mut self, kind: SyntaxKind) { self.inner.start_node(kind); } /// finish current node pub fn finish_node(&mut self) { - debug!("finish_node"); - - let n = self.open_nodes.pop(); - if n.is_none() { - panic!("No node to finish"); - } - - debug!("finish node {:?}", n.unwrap().0); self.inner.finish_node(); } diff --git a/crates/parser/src/resolve_tokens.rs b/crates/parser/src/resolve_tokens.rs index 93a8a877..9c01b471 100644 --- a/crates/parser/src/resolve_tokens.rs +++ b/crates/parser/src/resolve_tokens.rs @@ -1,88 +1,96 @@ -use crate::get_children_codegen::ChildrenNode; +use std::{ + cmp::{max, min}, + convert::identity, +}; + use crate::get_location_codegen::get_location; +use crate::get_nodes_codegen::Node; use cstree::text::{TextRange, TextSize}; use pg_query::{protobuf::ScanToken, NodeEnum}; #[derive(Debug, Clone)] -pub struct NestedNode { - pub id: usize, - pub inner: ChildrenNode, - // .start property of `ScanToken` - pub tokens: Vec, - pub range: TextRange, +pub struct RangedNode { + pub inner: Node, + pub estimated_range: TextRange, } -/// Turns a `Vec` into a `Vec` by adding `tokens` and `range` to each node. -/// -/// For each node, we walk all properties and search for tokens that match the property value. The -/// token that is closest to the node or a parent is used. -/// -/// The node range is the minimum start and maximum end of all tokens. -pub fn resolve_tokens( - children: &Vec, - tokens: &Vec, - text: &str, -) -> Vec { - children - .iter() - .enumerate() - .map(|(idx, c)| { - let nearest_parent_location = get_nearest_parent_location(&c, children); - let furthest_child_location = get_furthest_child_location(&c, children); - - let mut child_tokens = Vec::new(); - - let mut find_token = |property: String| { - child_tokens.push( - tokens - .iter() - .filter_map(|t| { - if get_token_text( - usize::try_from(t.start).unwrap(), - usize::try_from(t.end).unwrap(), - text, - ) != property - { - return None; - } - - if furthest_child_location.is_some() - && furthest_child_location.unwrap() < t.start as i32 - { - return None; - } - - let distance = t.start - nearest_parent_location; - if distance > 0 { - Some((distance, t)) - } else { - None - } - }) - .min_by_key(|(d, _)| d.to_owned()) - .map(|(_, t)| t) - .unwrap(), - ); - }; - - match &c.node { - NodeEnum::RangeVar(n) => { - find_token(n.relname.to_owned()); - } - _ => {} - }; - - NestedNode { - id: idx, - inner: c.to_owned(), - tokens: child_tokens.iter().map(|t| t.start).collect(), - range: TextRange::new( - TextSize::from( - child_tokens.iter().min_by_key(|t| t.start).unwrap().start as u32, - ), - TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32), - ), +/// Turns a `Vec` into a `Vec` by estimating their range. +pub fn resolve_tokens(nodes: &Vec, tokens: &Vec, text: &str) -> Vec { + let mut ranged_nodes: Vec = Vec::new(); + + // we get an estimated range by searching for tokens that match the node property values + // and, if available, the `location` of the node itself + nodes.iter().for_each(|n| { + let nearest_parent_location = get_nearest_parent_location(&n, nodes); + let furthest_child_location = get_furthest_child_location(&n, nodes); + + let mut child_tokens = Vec::new(); + + let mut find_token = |property: String| { + child_tokens.push( + tokens + .iter() + .filter_map(|t| { + if get_token_text( + usize::try_from(t.start).unwrap(), + usize::try_from(t.end).unwrap(), + text, + ) != property + { + return None; + } + + if furthest_child_location.is_some() + && furthest_child_location.unwrap() < t.start as i32 + { + return None; + } + + let distance = t.start - nearest_parent_location; + if distance > 0 { + Some((distance, t)) + } else { + None + } + }) + .min_by_key(|(d, _)| d.to_owned()) + .map(|(_, t)| t) + .unwrap(), + ); + }; + + match &n.node { + NodeEnum::RangeVar(n) => { + find_token(n.relname.to_owned()); } + _ => {} + }; + + let from_locations: Vec = [ + get_location(&n.node), + Some(nearest_parent_location), + Some(child_tokens.iter().min_by_key(|t| t.start).unwrap().start), + ] + .into_iter() + .filter_map(|x| x) + .collect(); + + ranged_nodes.push(RangedNode { + inner: n.to_owned(), + estimated_range: TextRange::new( + TextSize::from(from_locations.iter().min().unwrap_or(&0).to_owned() as u32), + TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32), + ), + }); + }); + + // FIXME: this additional loop is not required if we order the nodes by path first + ranged_nodes + .iter() + .map(|n| RangedNode { + inner: n.inner.to_owned(), + // the range of a node must be larger than the range of all children nodes + estimated_range: get_largest_child_range(&n, &ranged_nodes), }) .collect() } @@ -94,7 +102,26 @@ fn get_token_text(start: usize, end: usize, text: &str) -> String { .collect::() } -fn get_furthest_child_location(c: &ChildrenNode, children: &Vec) -> Option { +fn get_largest_child_range(node: &RangedNode, nodes: &Vec) -> TextRange { + let mut start: TextSize = node.estimated_range.start().to_owned(); + let mut end: TextSize = node.estimated_range.end().to_owned(); + + nodes.iter().for_each(|n| { + if !n.inner.path.starts_with(node.inner.path.as_str()) { + return; + } + if start < n.estimated_range.start() { + start = n.estimated_range.start(); + } + if end > n.estimated_range.end() { + end = n.estimated_range.end(); + } + }); + + TextRange::new(start, end) +} + +fn get_furthest_child_location(c: &Node, children: &Vec) -> Option { children .iter() .filter_map(|n| { @@ -106,7 +133,7 @@ fn get_furthest_child_location(c: &ChildrenNode, children: &Vec) - .max() } -fn get_nearest_parent_location(n: &ChildrenNode, children: &Vec) -> i32 { +fn get_nearest_parent_location(n: &Node, children: &Vec) -> i32 { // if location is set, return it let location = get_location(&n.node); if location.is_some() { diff --git a/crates/parser/src/sibling_token.rs b/crates/parser/src/sibling_token.rs deleted file mode 100644 index 6a42dd0d..00000000 --- a/crates/parser/src/sibling_token.rs +++ /dev/null @@ -1,31 +0,0 @@ -use crate::syntax_kind_codegen::SyntaxKind; - -impl SyntaxKind { - pub fn is_opening_sibling(&self) -> bool { - match self { - SyntaxKind::Ascii40 => true, - SyntaxKind::Ascii91 => true, - SyntaxKind::Case => true, - _ => false, - } - } - pub fn is_closing_sibling(&self) -> bool { - match self { - SyntaxKind::Ascii41 => true, - SyntaxKind::Ascii93 => true, - SyntaxKind::EndP => true, - _ => false, - } - } - pub fn sibling(&self) -> Option { - match self { - SyntaxKind::Case => Some(SyntaxKind::EndP), - SyntaxKind::EndP => Some(SyntaxKind::Case), - SyntaxKind::Ascii40 => Some(SyntaxKind::Ascii41), - SyntaxKind::Ascii41 => Some(SyntaxKind::Ascii40), - SyntaxKind::Ascii91 => Some(SyntaxKind::Ascii93), - SyntaxKind::Ascii93 => Some(SyntaxKind::Ascii91), - _ => None, - } - } -} diff --git a/crates/parser/src/source_parser.rs b/crates/parser/src/source_parser.rs index b7a727ec..341d6eb9 100644 --- a/crates/parser/src/source_parser.rs +++ b/crates/parser/src/source_parser.rs @@ -75,14 +75,15 @@ fn tokens(input: &str) -> Vec { } impl Parser { - /// Parse a source - pub fn parse_source_at(&mut self, text: &str, at_offset: Option) { + fn parse_source_at(&mut self, text: &str, at_offset: Option) { let offset = at_offset.unwrap_or(0); let tokens = tokens(&text); let mut tokens_iter = tokens.iter(); - self.start_node_at(SyntaxKind::SourceFile, 0); + // open root `SourceFile` node + self.start_node(SyntaxKind::SourceFile); + while let Some(token) = tokens_iter.next() { match token.kind { SourceFileToken::Comment => { @@ -92,13 +93,15 @@ impl Parser { self.token(SyntaxKind::Newline, token.text.as_str()); } SourceFileToken::Statement => { - self.parse_statement( - token.text.as_str(), - Some(offset + u32::from(token.span.start())), - ); + // self.parse_statement( + // token.text.as_str(), + // Some(offset + u32::from(token.span.start())), + // ); } }; } + + // close root `SourceFile` node self.finish_node(); } } diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index fde95be8..ee461391 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -1,8 +1,10 @@ +use std::collections::VecDeque; + use cstree::text::{TextRange, TextSize}; -use logos::{Logos, Span}; +use logos::Logos; use crate::{ - get_children_codegen::get_children, parser::Parser, resolve_tokens::resolve_tokens, + get_nodes_codegen::get_nodes, parser::Parser, resolve_tokens::resolve_tokens, syntax_kind_codegen::SyntaxKind, }; @@ -25,7 +27,6 @@ pub enum StatementToken { impl StatementToken { /// Creates a `SyntaxKind` from a `StatementToken`. - /// can be generated. pub fn syntax_kind(&self) -> SyntaxKind { match self { StatementToken::Whitespace => SyntaxKind::Whitespace, @@ -36,30 +37,48 @@ impl StatementToken { } } +struct TokenBuffer { + tokens: VecDeque<(SyntaxKind, String)>, +} + +impl TokenBuffer { + fn new() -> Self { + Self { + tokens: VecDeque::new(), + } + } + + fn push(&mut self, kind: SyntaxKind, text: String) { + self.tokens.push_back((kind, text)); + } + + fn drain(&mut self, until: Option) -> Vec<(SyntaxKind, String)> { + if self.tokens.is_empty() { + return Vec::new(); + } + let range = match until { + Some(u) => 0..u as usize, + None => 0..self.tokens.len(), + }; + self.tokens.drain(range).collect::>() + } +} + impl Parser { - /// The main entry point for parsing a statement `text`. `at_offset` is the offset of the statement in the source file. - /// - /// On a high level, the algorithm works as follows: - /// 1. Parse the statement with pg_query.rs. If the statement contains syntax errors, the parser will report the error and continue to work without information - /// about the nodes. The result will be a flat list of tokens under the generic `Stmt` node. - /// If successful, the first node in the ordered list will be the main node of the statement, - /// and serves as a root node. - /// 2. Scan the statements for tokens with pg_query.rs. This will never fail, even if the statement contains syntax errors. - /// 3. Parse the statement with the `StatementToken` lexer. The lexer only contains the tokens - /// that are not parsed by pg_query.rs, such as whitespace. - /// 4. Define a pointer that starts at 0 and move it along the statement. - /// - first, check if the current pointer is within a pg_query token. If so, consume the - /// token. - /// - if not, consume the next token from the `StatementToken` lexer. - /// 5. Close all open nodes for that statement. - pub fn parse_statement(&mut self, text: &str, at_offset: Option) { + pub fn parse_statement_at(&mut self, text: &str, at_offset: Option) { + // 1. Collect as much information as possible from pg_query.rs and `StatementToken` lexer + + // offset of the statement in the source file. let offset = at_offset.unwrap_or(0); + + // range of the statement in the source file. let range = TextRange::new( TextSize::from(offset), TextSize::from(offset + text.len() as u32), ); - let mut pg_query_tokens = match pg_query::scan(text) { + // tokens from pg_query.rs + let pg_query_tokens = match pg_query::scan(text) { Ok(scanned) => scanned.tokens, Err(e) => { self.error(e.to_string(), range); @@ -67,8 +86,7 @@ impl Parser { } }; - // Get root node with depth 1 - // Since we are parsing only a single statement there can be only a single node at depth 1 + // root node of the statement, if no syntax errors let pg_query_root = match pg_query::parse(text) { Ok(parsed) => Some( parsed @@ -86,9 +104,11 @@ impl Parser { } }; + // ranged nodes from pg_query.rs, including the root node + // the nodes are ordered by starting range, starting with the root node let mut pg_query_nodes = match &pg_query_root { Some(root) => resolve_tokens( - &get_children(root, text.to_string(), 1), + &get_nodes(root, text.to_string(), 1), &pg_query_tokens, &text, ) @@ -99,109 +119,291 @@ impl Parser { let mut pg_query_tokens = pg_query_tokens.iter().peekable(); - let mut lexer = StatementToken::lexer(&text); + let mut statement_token_lexer = StatementToken::lexer(&text); + + // 2. Setup data structures required for the parsing algorithm + // A buffer for tokens that are not applied immediately to the cst + let mut token_buffer = TokenBuffer::new(); + // Keeps track of currently open nodes. Latest opened is last. + let mut open_nodes: Vec<(SyntaxKind, TextRange, i32)> = Vec::new(); - // parse root node if no syntax errors - if pg_query_root.is_some() { - let root_node = pg_query_root.unwrap(); - self.stmt(root_node.to_owned(), range); - self.start_node_at(SyntaxKind::new_from_pg_query_node(&root_node), 1); + // 3. Parse the statement + + // Handle root node + if pg_query_nodes.len() > 0 { + // if there are no syntax errors, use the pg_query node as the root node + let root_node = pg_query_nodes + .find(|n| n.inner.path == "0".to_string()) + .unwrap(); + // can only be at depth 1 + assert_eq!( + root_node.inner.depth, 1, + "Root node must be at depth 1, but is at depth {}", + root_node.inner.depth + ); + self.stmt(root_node.inner.node.to_owned(), range); + self.start_node(SyntaxKind::new_from_pg_query_node(&root_node.inner.node)); + open_nodes.push(( + SyntaxKind::new_from_pg_query_node(&root_node.inner.node), + range, + 1, + )); } else { // fallback to generic node as root - self.start_node_at(SyntaxKind::Stmt, 1); + self.start_node(SyntaxKind::Stmt); + open_nodes.push((SyntaxKind::Stmt, range, 1)); } - self.set_checkpoint(); // start at 0, and increment by the length of the token let mut pointer: i32 = 0; - #[derive(Debug)] - struct Token { - syntax_kind: SyntaxKind, - span: Span, - } - + // main loop that walks through the statement token by token while pointer < text.len() as i32 { // Check if the pointer is within a pg_query token let next_pg_query_token = pg_query_tokens.peek(); - let token = if next_pg_query_token.is_some() + + let token_length = if next_pg_query_token.is_some() && next_pg_query_token.unwrap().start <= pointer && pointer <= next_pg_query_token.unwrap().end { let token = pg_query_tokens.next().unwrap(); - Token { - syntax_kind: SyntaxKind::new_from_pg_query_token(&token), - span: Span { - start: token.start as usize, - end: token.end as usize, - }, + + let token_text = text + .chars() + .skip(token.start as usize) + .take((token.end as usize) - (token.start as usize)) + .collect::(); + + // a node can only start and end with a pg_query token, so we can handle them here + + // before applying the token, close any node that ends before the token starts + while open_nodes.last().is_some() + && open_nodes.last().unwrap().1.end() <= TextSize::from(token.start as u32) + { + self.finish_node(); + open_nodes.pop(); } + + // drain token buffer + for (kind, text) in token_buffer.drain(None) { + self.token(kind, text.as_str()); + } + + // apply the token + self.token(SyntaxKind::new_from_pg_query_token(token), text); + + // consume all nodes that start at or before the token ends + while pg_query_nodes.peek().is_some() + && pg_query_nodes.peek().unwrap().estimated_range.start() + <= TextSize::from(token.end as u32) + { + let node = pg_query_nodes.next().unwrap(); + self.start_node(SyntaxKind::new_from_pg_query_node(&node.inner.node)); + open_nodes.push(( + SyntaxKind::new_from_pg_query_node(&node.inner.node), + node.estimated_range, + node.inner.depth, + )); + } + + token_text.len() as i32 } else { // fallback to statement token // move statement token lexer to before pointer - while (lexer.span().end as i32) < pointer { - lexer.next(); + while (statement_token_lexer.span().end as i32) < pointer { + statement_token_lexer.next(); } - let token = lexer.next(); - if token.is_none() || (lexer.span().start as i32) != pointer { + let token = statement_token_lexer.next(); + if token.is_none() || (statement_token_lexer.span().start as i32) != pointer { // if the token is not at the pointer, we have a syntax error panic!( "Expected token for '{}' at offset {}", - lexer.slice(), - lexer.span().start + statement_token_lexer.slice(), + statement_token_lexer.span().start ); } - Token { - syntax_kind: token.unwrap().unwrap().syntax_kind(), - span: lexer.span(), - } + let token_text = statement_token_lexer.slice().to_string(); + token_buffer.push(token.unwrap().unwrap().syntax_kind(), token_text.clone()); + token_text.len() as i32 }; - self.token( - token.syntax_kind, - text.chars() - .skip(token.span.start) - .take(token.span.end - token.span.start) - .collect::() - .as_str(), - ); - - pointer = pointer + (token.span.end - token.span.start) as i32; + pointer = pointer + token_length; } - // close up nodes - self.close_checkpoint(); + while open_nodes.last().is_some() { + self.finish_node(); + open_nodes.pop(); + } } } +// impl Parser { +// /// The main entry point for parsing a statement `text`. `at_offset` is the offset of the statement in the source file. +// /// +// /// On a high level, the algorithm works as follows: +// /// 1. Parse the statement with pg_query.rs. If the statement contains syntax errors, the parser will report the error and continue to work without information +// /// about the nodes. The result will be a flat list of tokens under the generic `Stmt` node. +// /// If successful, the first node in the ordered list will be the main node of the statement, +// /// and serves as a root node. +// /// 2. Scan the statements for tokens with pg_query.rs. This will never fail, even if the statement contains syntax errors. +// /// 3. Parse the statement with the `StatementToken` lexer. The lexer only contains the tokens +// /// that are not parsed by pg_query.rs, such as whitespace. +// /// 4. Define a pointer that starts at 0 and move it along the statement. +// /// - first, check if the current pointer is within a pg_query token. If so, consume the +// /// token. +// /// - if not, consume the next token from the `StatementToken` lexer. +// /// 5. Close all open nodes for that statement. +// pub fn parse_statement(&mut self, text: &str, at_offset: Option) { +// let offset = at_offset.unwrap_or(0); +// let range = TextRange::new( +// TextSize::from(offset), +// TextSize::from(offset + text.len() as u32), +// ); +// +// let mut pg_query_tokens = match pg_query::scan(text) { +// Ok(scanned) => scanned.tokens, +// Err(e) => { +// self.error(e.to_string(), range); +// Vec::new() +// } +// }; +// +// // Get root node with depth 1 +// // Since we are parsing only a single statement there can be only a single node at depth 1 +// let pg_query_root = match pg_query::parse(text) { +// Ok(parsed) => Some( +// parsed +// .protobuf +// .nodes() +// .iter() +// .find(|n| n.1 == 1) +// .unwrap() +// .0 +// .to_enum(), +// ), +// Err(e) => { +// self.error(e.to_string(), range); +// None +// } +// }; +// +// let mut pg_query_nodes = match &pg_query_root { +// Some(root) => resolve_tokens( +// &get_nodes(root, text.to_string(), 1), +// &pg_query_tokens, +// &text, +// ) +// .into_iter() +// .peekable(), +// None => Vec::new().into_iter().peekable(), +// }; +// +// let mut pg_query_tokens = pg_query_tokens.iter().peekable(); +// +// let mut lexer = StatementToken::lexer(&text); +// +// // parse root node if no syntax errors +// if pg_query_root.is_some() { +// let root_node = pg_query_root.unwrap(); +// self.stmt(root_node.to_owned(), range); +// self.start_node_at(SyntaxKind::new_from_pg_query_node(&root_node), 1); +// } else { +// // fallback to generic node as root +// self.start_node_at(SyntaxKind::Stmt, 1); +// } +// self.set_checkpoint(); +// +// // start at 0, and increment by the length of the token +// let mut pointer: i32 = 0; +// +// #[derive(Debug)] +// struct Token { +// syntax_kind: SyntaxKind, +// span: Span, +// } +// +// while pointer < text.len() as i32 { +// // Check if the pointer is within a pg_query token +// let next_pg_query_token = pg_query_tokens.peek(); +// let token = if next_pg_query_token.is_some() +// && next_pg_query_token.unwrap().start <= pointer +// && pointer <= next_pg_query_token.unwrap().end +// { +// let token = pg_query_tokens.next().unwrap(); +// Token { +// syntax_kind: SyntaxKind::new_from_pg_query_token(&token), +// span: Span { +// start: token.start as usize, +// end: token.end as usize, +// }, +// } +// } else { +// // fallback to statement token +// +// // move statement token lexer to before pointer +// while (lexer.span().end as i32) < pointer { +// lexer.next(); +// } +// let token = lexer.next(); +// if token.is_none() || (lexer.span().start as i32) != pointer { +// // if the token is not at the pointer, we have a syntax error +// panic!( +// "Expected token for '{}' at offset {}", +// lexer.slice(), +// lexer.span().start +// ); +// } +// Token { +// syntax_kind: token.unwrap().unwrap().syntax_kind(), +// span: lexer.span(), +// } +// }; +// +// self.token( +// token.syntax_kind, +// text.chars() +// .skip(token.span.start) +// .take(token.span.end - token.span.start) +// .collect::() +// .as_str(), +// ); +// +// pointer = pointer + (token.span.end - token.span.start) as i32; +// } +// +// // close up nodes +// self.close_checkpoint(); +// } +// } + #[cfg(test)] mod tests { use std::assert_eq; use super::*; - #[test] - fn test_invalid_statement() { - let input = "select select;"; - - let mut parser = Parser::new(); - parser.parse_statement(input, None); - let parsed = parser.finish(); - - assert_eq!(parsed.cst.text(), input); - } - - #[test] - fn test_create_sql_function() { - let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text) - AS $$ SELECT $1, CAST($1 AS text) || ' is text' $$ - LANGUAGE SQL;"; - - let mut parser = Parser::new(); - parser.parse_statement(input, None); - let parsed = parser.finish(); - - assert_eq!(parsed.cst.text(), input); - } + // #[test] + // fn test_invalid_statement() { + // let input = "select select;"; + // + // let mut parser = Parser::new(); + // parser.parse_statement(input, None); + // let parsed = parser.finish(); + // + // assert_eq!(parsed.cst.text(), input); + // } + // + // #[test] + // fn test_create_sql_function() { + // let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text) + // AS $$ SELECT $1, CAST($1 AS text) || ' is text' $$ + // LANGUAGE SQL;"; + // + // let mut parser = Parser::new(); + // parser.parse_statement(input, None); + // let parsed = parser.finish(); + // + // assert_eq!(parsed.cst.text(), input); + // } } From c84f852f5eeffb1c87248c86f52b7d022e3fd7a5 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 1 Oct 2023 16:36:10 +0200 Subject: [PATCH 04/16] refactor: drop token buffer struct --- crates/parser/src/statement_parser.rs | 33 +++------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index ee461391..3687f11c 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -37,33 +37,6 @@ impl StatementToken { } } -struct TokenBuffer { - tokens: VecDeque<(SyntaxKind, String)>, -} - -impl TokenBuffer { - fn new() -> Self { - Self { - tokens: VecDeque::new(), - } - } - - fn push(&mut self, kind: SyntaxKind, text: String) { - self.tokens.push_back((kind, text)); - } - - fn drain(&mut self, until: Option) -> Vec<(SyntaxKind, String)> { - if self.tokens.is_empty() { - return Vec::new(); - } - let range = match until { - Some(u) => 0..u as usize, - None => 0..self.tokens.len(), - }; - self.tokens.drain(range).collect::>() - } -} - impl Parser { pub fn parse_statement_at(&mut self, text: &str, at_offset: Option) { // 1. Collect as much information as possible from pg_query.rs and `StatementToken` lexer @@ -123,7 +96,7 @@ impl Parser { // 2. Setup data structures required for the parsing algorithm // A buffer for tokens that are not applied immediately to the cst - let mut token_buffer = TokenBuffer::new(); + let mut token_buffer: VecDeque<(SyntaxKind, String)> = VecDeque::new(); // Keeps track of currently open nodes. Latest opened is last. let mut open_nodes: Vec<(SyntaxKind, TextRange, i32)> = Vec::new(); @@ -185,7 +158,7 @@ impl Parser { } // drain token buffer - for (kind, text) in token_buffer.drain(None) { + for (kind, text) in token_buffer.drain(0..token_buffer.len()) { self.token(kind, text.as_str()); } @@ -224,7 +197,7 @@ impl Parser { ); } let token_text = statement_token_lexer.slice().to_string(); - token_buffer.push(token.unwrap().unwrap().syntax_kind(), token_text.clone()); + token_buffer.push_back((token.unwrap().unwrap().syntax_kind(), token_text.clone())); token_text.len() as i32 }; From cf5465a1cbe71a9effec76b72410b3d364631cbc Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 1 Oct 2023 19:47:47 +0200 Subject: [PATCH 05/16] feat: estimate_node_range poc --- crates/parser/src/estimate_node_range.rs | 271 +++++++++++++++++++ crates/parser/src/lib.rs | 3 +- crates/parser/src/resolve_tokens.rs | 161 ----------- crates/parser/src/statement_parser.rs | 14 +- crates/parser/tests/statement_parser_test.rs | 2 +- 5 files changed, 282 insertions(+), 169 deletions(-) create mode 100644 crates/parser/src/estimate_node_range.rs delete mode 100644 crates/parser/src/resolve_tokens.rs diff --git a/crates/parser/src/estimate_node_range.rs b/crates/parser/src/estimate_node_range.rs new file mode 100644 index 00000000..390212e8 --- /dev/null +++ b/crates/parser/src/estimate_node_range.rs @@ -0,0 +1,271 @@ +use std::cmp::{max, min}; + +use crate::get_location_codegen::get_location; +use crate::get_nodes_codegen::Node; +use cstree::text::{TextRange, TextSize}; +use pg_query::{protobuf::ScanToken, NodeEnum}; + +#[derive(Debug, Clone)] +pub struct RangedNode { + pub inner: Node, + pub range: TextRange, +} + +/// Turns a `Vec` into a `Vec` by estimating their range. +pub fn estimate_node_range( + nodes: &mut Vec, + tokens: &Vec, + text: &str, +) -> Vec { + let mut ranged_nodes: Vec = Vec::new(); + + // ensure that all children of any given node are already processed before processing the node itself + nodes.sort_by(|a, b| b.path.cmp(&a.path)); + + // we get an estimated range by searching for tokens that match the node property values + // and, if available, the `location` of the node itself + nodes.iter().for_each(|n| { + // first, get the estimated boundaries of the node based on the `location` property of a node + let nearest_parent_location = get_nearest_parent_location(&n, nodes); + let furthest_child_location = get_furthest_child_location(&n, nodes); + + let mut child_tokens = Vec::new(); + + let mut find_token = |property: String| { + println!("find_token {}", property); + child_tokens.push( + tokens + .iter() + .filter_map(|t| { + println!("token {:#?}", t); + // make a string comparison of the text of the token and the property value + if get_token_text( + usize::try_from(t.start).unwrap(), + usize::try_from(t.end).unwrap(), + text, + ) + .to_lowercase() + != property.to_lowercase() + { + println!("token text does not match property value"); + return None; + } + + // if the furthest child location is set, and it is smaller than the start of the token, + // we can safely ignore this token, because it is not a child of the node + if furthest_child_location.is_some() + && furthest_child_location.unwrap() < t.start as i32 + { + println!("furthest child location is smaller than token start"); + return None; + } + + // if the token is before the nearest parent location, we can safely ignore it + // if not, we calculate the distance to the nearest parent location + let distance = t.start - nearest_parent_location; + if distance >= 0 { + println!("distance {} for token {:#?}", distance, t); + Some((distance, t)) + } else { + println!("distance is smaller than 0 for token {:#?}", t); + None + } + }) + // and use the token with the smallest distance to the nearest parent location + .min_by_key(|(d, _)| d.to_owned()) + .map(|(_, t)| t) + .unwrap(), + ); + }; + + match &n.node { + NodeEnum::RangeVar(n) => { + find_token(n.relname.to_owned()); + } + NodeEnum::Integer(n) => { + find_token(n.ival.to_owned().to_string()); + } + NodeEnum::AConst(n) => { + if n.isnull { + find_token("null".to_string()); + } + } + NodeEnum::ResTarget(n) => { + if n.name.len() > 0 { + find_token(n.name.to_owned()); + } + } + NodeEnum::SelectStmt(n) => { + find_token("select".to_string()); + } + _ => panic!("Node type not implemented: {:?}", n.node), + }; + + // For `from`, the location of the node itself is always correct. + // If not available, the closest estimation is the smaller value of the start of the first direct child token, + // and the start of all children ranges. If neither is available, let’s panic for now. + // The parent location as a fallback should never be required, because any node must have either children with tokens, or a token itself. + let location = get_location(&n.node); + let from = if location.is_some() { + location.unwrap() + } else { + let start_of_first_child_token = if child_tokens.len() > 0 { + Some(child_tokens.iter().min_by_key(|t| t.start).unwrap().start) + } else { + None + }; + let start_of_all_children_ranges = if ranged_nodes.len() > 0 { + Some( + ranged_nodes + .iter() + .filter(|x| x.inner.path.starts_with(n.path.as_str())) + .min_by_key(|n| n.range.start()) + .unwrap() + .range + .start(), + ) + } else { + None + }; + + if start_of_first_child_token.is_some() { + if start_of_all_children_ranges.is_some() { + min( + start_of_first_child_token.unwrap(), + u32::from(start_of_all_children_ranges.unwrap()) as i32, + ) + } else { + start_of_first_child_token.unwrap() + } + } else if start_of_all_children_ranges.is_some() { + u32::from(start_of_all_children_ranges.unwrap()) as i32 + } else { + panic!("No location or child tokens found for node {:?}", n); + } + }; + + // For `to`, it’s the larger value of the end of the last direkt child token, and the end of all children ranges. + println!("{}: {:?}", n.path, n.node); + let end_of_last_child_token = if child_tokens.len() > 0 { + Some(child_tokens.iter().max_by_key(|t| t.end).unwrap().end) + } else { + None + }; + let end_of_all_children_ranges = if ranged_nodes.len() > 0 { + Some( + ranged_nodes + .iter() + .filter(|x| x.inner.path.starts_with(n.path.as_str())) + .max_by_key(|n| n.range.end()) + .unwrap() + .range + .end(), + ) + } else { + None + }; + let to = if end_of_last_child_token.is_some() { + if end_of_all_children_ranges.is_some() { + max( + end_of_last_child_token.unwrap(), + u32::from(end_of_all_children_ranges.unwrap()) as i32, + ) + } else { + end_of_last_child_token.unwrap() + } + } else if end_of_all_children_ranges.is_some() { + u32::from(end_of_all_children_ranges.unwrap()) as i32 + } else { + panic!("No child tokens or children ranges found for node {:?}", n); + }; + + ranged_nodes.push(RangedNode { + inner: n.to_owned(), + range: TextRange::new(TextSize::from(from as u32), TextSize::from(to as u32)), + }); + }); + + ranged_nodes +} + +fn get_token_text(start: usize, end: usize, text: &str) -> String { + text.chars() + .skip(start) + .take(end - start) + .collect::() +} + +fn get_furthest_child_location(c: &Node, children: &Vec) -> Option { + children + .iter() + .filter_map(|n| { + if !n.path.starts_with(c.path.as_str()) { + return None; + } + get_location(&n.node) + }) + .max() +} + +fn get_nearest_parent_location(n: &Node, children: &Vec) -> i32 { + // if location is set, return it + let location = get_location(&n.node); + if location.is_some() { + return location.unwrap(); + } + + // go up in the tree and check if location exists on any parent + let mut path_elements = n.path.split(".").collect::>(); + path_elements.pop(); + while path_elements.len() > 0 { + let parent_path = path_elements.join("."); + let node = children.iter().find(|c| c.path == parent_path); + if node.is_some() { + let location = get_location(&node.unwrap().node); + if location.is_some() { + return location.unwrap(); + } + } + + path_elements.pop(); + } + + // fallback to 0 + return 0; +} + +#[cfg(test)] +mod tests { + use crate::estimate_node_range::estimate_node_range; + use crate::get_nodes_codegen::get_nodes; + + #[test] + fn test_estimate_node_range() { + let input = "select null"; + + let pg_query_tokens = match pg_query::scan(input) { + Ok(scanned) => scanned.tokens, + Err(_) => Vec::new(), + }; + + let pg_query_root = match pg_query::parse(input) { + Ok(parsed) => Some( + parsed + .protobuf + .nodes() + .iter() + .find(|n| n.1 == 1) + .unwrap() + .0 + .to_enum(), + ), + Err(_) => None, + }; + + let mut nodes = get_nodes(&pg_query_root.unwrap(), input.to_string(), 1); + + let ranged_nodes = estimate_node_range(&mut nodes, &pg_query_tokens, &input); + + dbg!(&ranged_nodes); + } +} diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 5344ff27..3bc10622 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -16,15 +16,16 @@ //! To see how these drawbacks are mitigated, see the `statement.rs` and the `source_file.rs` module. mod ast_node; +mod estimate_node_range; mod get_location_codegen; mod get_nodes_codegen; mod parser; -mod resolve_tokens; mod source_parser; mod statement_parser; mod syntax_error; mod syntax_kind_codegen; mod syntax_node; +pub use crate::parser::{Parse, Parser}; pub use crate::syntax_kind_codegen::SyntaxKind; pub use crate::syntax_node::{SyntaxElement, SyntaxNode, SyntaxToken}; diff --git a/crates/parser/src/resolve_tokens.rs b/crates/parser/src/resolve_tokens.rs deleted file mode 100644 index 9c01b471..00000000 --- a/crates/parser/src/resolve_tokens.rs +++ /dev/null @@ -1,161 +0,0 @@ -use std::{ - cmp::{max, min}, - convert::identity, -}; - -use crate::get_location_codegen::get_location; -use crate::get_nodes_codegen::Node; -use cstree::text::{TextRange, TextSize}; -use pg_query::{protobuf::ScanToken, NodeEnum}; - -#[derive(Debug, Clone)] -pub struct RangedNode { - pub inner: Node, - pub estimated_range: TextRange, -} - -/// Turns a `Vec` into a `Vec` by estimating their range. -pub fn resolve_tokens(nodes: &Vec, tokens: &Vec, text: &str) -> Vec { - let mut ranged_nodes: Vec = Vec::new(); - - // we get an estimated range by searching for tokens that match the node property values - // and, if available, the `location` of the node itself - nodes.iter().for_each(|n| { - let nearest_parent_location = get_nearest_parent_location(&n, nodes); - let furthest_child_location = get_furthest_child_location(&n, nodes); - - let mut child_tokens = Vec::new(); - - let mut find_token = |property: String| { - child_tokens.push( - tokens - .iter() - .filter_map(|t| { - if get_token_text( - usize::try_from(t.start).unwrap(), - usize::try_from(t.end).unwrap(), - text, - ) != property - { - return None; - } - - if furthest_child_location.is_some() - && furthest_child_location.unwrap() < t.start as i32 - { - return None; - } - - let distance = t.start - nearest_parent_location; - if distance > 0 { - Some((distance, t)) - } else { - None - } - }) - .min_by_key(|(d, _)| d.to_owned()) - .map(|(_, t)| t) - .unwrap(), - ); - }; - - match &n.node { - NodeEnum::RangeVar(n) => { - find_token(n.relname.to_owned()); - } - _ => {} - }; - - let from_locations: Vec = [ - get_location(&n.node), - Some(nearest_parent_location), - Some(child_tokens.iter().min_by_key(|t| t.start).unwrap().start), - ] - .into_iter() - .filter_map(|x| x) - .collect(); - - ranged_nodes.push(RangedNode { - inner: n.to_owned(), - estimated_range: TextRange::new( - TextSize::from(from_locations.iter().min().unwrap_or(&0).to_owned() as u32), - TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32), - ), - }); - }); - - // FIXME: this additional loop is not required if we order the nodes by path first - ranged_nodes - .iter() - .map(|n| RangedNode { - inner: n.inner.to_owned(), - // the range of a node must be larger than the range of all children nodes - estimated_range: get_largest_child_range(&n, &ranged_nodes), - }) - .collect() -} - -fn get_token_text(start: usize, end: usize, text: &str) -> String { - text.chars() - .skip(start) - .take(end - start) - .collect::() -} - -fn get_largest_child_range(node: &RangedNode, nodes: &Vec) -> TextRange { - let mut start: TextSize = node.estimated_range.start().to_owned(); - let mut end: TextSize = node.estimated_range.end().to_owned(); - - nodes.iter().for_each(|n| { - if !n.inner.path.starts_with(node.inner.path.as_str()) { - return; - } - if start < n.estimated_range.start() { - start = n.estimated_range.start(); - } - if end > n.estimated_range.end() { - end = n.estimated_range.end(); - } - }); - - TextRange::new(start, end) -} - -fn get_furthest_child_location(c: &Node, children: &Vec) -> Option { - children - .iter() - .filter_map(|n| { - if !n.path.starts_with(c.path.as_str()) { - return None; - } - get_location(&n.node) - }) - .max() -} - -fn get_nearest_parent_location(n: &Node, children: &Vec) -> i32 { - // if location is set, return it - let location = get_location(&n.node); - if location.is_some() { - return location.unwrap(); - } - - // go up in the tree and check if location exists on any parent - let mut path_elements = n.path.split(".").collect::>(); - path_elements.pop(); - while path_elements.len() > 0 { - let parent_path = path_elements.join("."); - let node = children.iter().find(|c| c.path == parent_path); - if node.is_some() { - let location = get_location(&node.unwrap().node); - if location.is_some() { - return location.unwrap(); - } - } - - path_elements.pop(); - } - - // fallback to 0 - return 0; -} diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index 3687f11c..e1888998 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -1,10 +1,12 @@ -use std::collections::VecDeque; +use std::{collections::VecDeque, iter::Peekable}; use cstree::text::{TextRange, TextSize}; use logos::Logos; use crate::{ - get_nodes_codegen::get_nodes, parser::Parser, resolve_tokens::resolve_tokens, + estimate_node_range::{estimate_node_range, RangedNode}, + get_nodes_codegen::get_nodes, + parser::Parser, syntax_kind_codegen::SyntaxKind, }; @@ -80,8 +82,8 @@ impl Parser { // ranged nodes from pg_query.rs, including the root node // the nodes are ordered by starting range, starting with the root node let mut pg_query_nodes = match &pg_query_root { - Some(root) => resolve_tokens( - &get_nodes(root, text.to_string(), 1), + Some(root) => estimate_node_range( + &mut get_nodes(root, text.to_string(), 1), &pg_query_tokens, &text, ) @@ -167,14 +169,14 @@ impl Parser { // consume all nodes that start at or before the token ends while pg_query_nodes.peek().is_some() - && pg_query_nodes.peek().unwrap().estimated_range.start() + && pg_query_nodes.peek().unwrap().range.start() <= TextSize::from(token.end as u32) { let node = pg_query_nodes.next().unwrap(); self.start_node(SyntaxKind::new_from_pg_query_node(&node.inner.node)); open_nodes.push(( SyntaxKind::new_from_pg_query_node(&node.inner.node), - node.estimated_range, + node.range, node.inner.depth, )); } diff --git a/crates/parser/tests/statement_parser_test.rs b/crates/parser/tests/statement_parser_test.rs index 4eef7303..77542df1 100644 --- a/crates/parser/tests/statement_parser_test.rs +++ b/crates/parser/tests/statement_parser_test.rs @@ -23,7 +23,7 @@ fn valid_statements() { let contents = fs::read_to_string(&path).unwrap(); let mut parser = Parser::new(); - parser.parse_statement(&contents, None); + parser.parse_statement_at(&contents, None); let parsed = parser.finish(); let mut settings = insta::Settings::clone_current(); From 88de8438b09044d24834fdf189d1c639c5d42025 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Tue, 3 Oct 2023 17:22:53 +0200 Subject: [PATCH 06/16] minor improvements to estimate_node_range poc --- crates/parser/src/estimate_node_range.rs | 164 ++++++++++++++++------- 1 file changed, 117 insertions(+), 47 deletions(-) diff --git a/crates/parser/src/estimate_node_range.rs b/crates/parser/src/estimate_node_range.rs index 390212e8..addc6408 100644 --- a/crates/parser/src/estimate_node_range.rs +++ b/crates/parser/src/estimate_node_range.rs @@ -3,7 +3,7 @@ use std::cmp::{max, min}; use crate::get_location_codegen::get_location; use crate::get_nodes_codegen::Node; use cstree::text::{TextRange, TextSize}; -use pg_query::{protobuf::ScanToken, NodeEnum}; +use pg_query::{protobuf::ScanToken, protobuf::Token, NodeEnum}; #[derive(Debug, Clone)] pub struct RangedNode { @@ -31,72 +31,109 @@ pub fn estimate_node_range( let mut child_tokens = Vec::new(); - let mut find_token = |property: String| { - println!("find_token {}", property); - child_tokens.push( - tokens - .iter() - .filter_map(|t| { - println!("token {:#?}", t); - // make a string comparison of the text of the token and the property value - if get_token_text( + #[derive(Debug)] + struct TokenProperty { + value: Option, + token: Option, + } + + impl TokenProperty { + fn from_int(value: &i32) -> TokenProperty { + TokenProperty { + value: Some(value.to_string()), + token: None, + } + } + + fn from_string(value: &String) -> TokenProperty { + assert!(value.len() > 0, "String property value has length 0"); + TokenProperty { + value: Some(value.to_owned()), + token: None, + } + } + + fn from_token(token: Token) -> TokenProperty { + TokenProperty { + value: None, + token: Some(token), + } + } + } + + let mut get_token = |property: TokenProperty| { + let token = tokens + .iter() + .filter_map(|t| { + if property.token.is_some() { + // if a token is set, we can safely ignore all tokens that are not of the same type + if t.token() != property.token.unwrap() { + return None; + } + } + // make a string comparison of the text of the token and the property value + if property.value.is_some() + && get_token_text( usize::try_from(t.start).unwrap(), usize::try_from(t.end).unwrap(), text, ) .to_lowercase() - != property.to_lowercase() - { - println!("token text does not match property value"); - return None; - } + != property.value.as_ref().unwrap().to_lowercase() + { + return None; + } - // if the furthest child location is set, and it is smaller than the start of the token, - // we can safely ignore this token, because it is not a child of the node - if furthest_child_location.is_some() - && furthest_child_location.unwrap() < t.start as i32 - { - println!("furthest child location is smaller than token start"); - return None; - } + // if the furthest child location is set, and it is smaller than the start of the token, + // we can safely ignore this token, because it is not a child of the node + if furthest_child_location.is_some() + && furthest_child_location.unwrap() < t.start as i32 + { + return None; + } - // if the token is before the nearest parent location, we can safely ignore it - // if not, we calculate the distance to the nearest parent location - let distance = t.start - nearest_parent_location; - if distance >= 0 { - println!("distance {} for token {:#?}", distance, t); - Some((distance, t)) - } else { - println!("distance is smaller than 0 for token {:#?}", t); - None - } - }) - // and use the token with the smallest distance to the nearest parent location - .min_by_key(|(d, _)| d.to_owned()) - .map(|(_, t)| t) - .unwrap(), - ); + // if the token is before the nearest parent location, we can safely ignore it + // if not, we calculate the distance to the nearest parent location + let distance = t.start - nearest_parent_location; + if distance >= 0 { + Some((distance, t)) + } else { + None + } + }) + // and use the token with the smallest distance to the nearest parent location + .min_by_key(|(d, _)| d.to_owned()) + .map(|(_, t)| t); + + if token.is_none() { + panic!( + "No matching token found for property {:?} in {:#?}", + property, tokens + ); + } + + child_tokens.push(token.unwrap()); }; match &n.node { NodeEnum::RangeVar(n) => { - find_token(n.relname.to_owned()); + get_token(TokenProperty::from_string(&n.relname)); } NodeEnum::Integer(n) => { - find_token(n.ival.to_owned().to_string()); + get_token(TokenProperty::from_int(&n.ival)); } NodeEnum::AConst(n) => { if n.isnull { - find_token("null".to_string()); + get_token(TokenProperty::from_token(Token::NullP)); } } NodeEnum::ResTarget(n) => { if n.name.len() > 0 { - find_token(n.name.to_owned()); + get_token(TokenProperty::from_string(&n.name)); } } - NodeEnum::SelectStmt(n) => { - find_token("select".to_string()); + NodeEnum::SelectStmt(_) => { + get_token(TokenProperty::from_token(Token::Select)); } _ => panic!("Node type not implemented: {:?}", n.node), }; @@ -145,7 +182,6 @@ pub fn estimate_node_range( }; // For `to`, it’s the larger value of the end of the last direkt child token, and the end of all children ranges. - println!("{}: {:?}", n.path, n.node); let end_of_last_child_token = if child_tokens.len() > 0 { Some(child_tokens.iter().max_by_key(|t| t.end).unwrap().end) } else { @@ -236,6 +272,9 @@ fn get_nearest_parent_location(n: &Node, children: &Vec) -> i32 { #[cfg(test)] mod tests { + use cstree::text::{TextRange, TextSize}; + use pg_query::NodeEnum; + use crate::estimate_node_range::estimate_node_range; use crate::get_nodes_codegen::get_nodes; @@ -266,6 +305,37 @@ mod tests { let ranged_nodes = estimate_node_range(&mut nodes, &pg_query_tokens, &input); - dbg!(&ranged_nodes); + assert!(ranged_nodes + .iter() + .find( + |n| n.range == TextRange::new(TextSize::from(0), TextSize::from(11)) + && match &n.inner.node { + NodeEnum::SelectStmt(_) => true, + _ => false, + } + ) + .is_some()); + + assert!(ranged_nodes + .iter() + .find( + |n| n.range == TextRange::new(TextSize::from(7), TextSize::from(11)) + && match &n.inner.node { + NodeEnum::ResTarget(_) => true, + _ => false, + } + ) + .is_some()); + + assert!(ranged_nodes + .iter() + .find( + |n| n.range == TextRange::new(TextSize::from(7), TextSize::from(11)) + && match &n.inner.node { + NodeEnum::AConst(_) => true, + _ => false, + } + ) + .is_some()); } } From 52a660c9c16e0673590d8654c5f5b4254c6e3b5e Mon Sep 17 00:00:00 2001 From: psteinroe Date: Tue, 3 Oct 2023 17:41:05 +0200 Subject: [PATCH 07/16] fix: statement parser --- crates/parser/src/estimate_node_range.rs | 13 +- crates/parser/src/statement_parser.rs | 170 +++-------------------- 2 files changed, 26 insertions(+), 157 deletions(-) diff --git a/crates/parser/src/estimate_node_range.rs b/crates/parser/src/estimate_node_range.rs index addc6408..fce88a89 100644 --- a/crates/parser/src/estimate_node_range.rs +++ b/crates/parser/src/estimate_node_range.rs @@ -215,10 +215,15 @@ pub fn estimate_node_range( panic!("No child tokens or children ranges found for node {:?}", n); }; - ranged_nodes.push(RangedNode { - inner: n.to_owned(), - range: TextRange::new(TextSize::from(from as u32), TextSize::from(to as u32)), - }); + // TODO: validate that prepending is enough to ensure that `ranged_nodes` is sorted by + // range.start + ranged_nodes.insert( + 0, + RangedNode { + inner: n.to_owned(), + range: TextRange::new(TextSize::from(from as u32), TextSize::from(to as u32)), + }, + ); }); ranged_nodes diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index e1888998..6d0187fa 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -164,9 +164,6 @@ impl Parser { self.token(kind, text.as_str()); } - // apply the token - self.token(SyntaxKind::new_from_pg_query_token(token), text); - // consume all nodes that start at or before the token ends while pg_query_nodes.peek().is_some() && pg_query_nodes.peek().unwrap().range.start() @@ -181,6 +178,12 @@ impl Parser { )); } + // apply the token + self.token( + SyntaxKind::new_from_pg_query_token(token), + token_text.as_str(), + ); + token_text.len() as i32 } else { // fallback to statement token @@ -213,162 +216,23 @@ impl Parser { } } -// impl Parser { -// /// The main entry point for parsing a statement `text`. `at_offset` is the offset of the statement in the source file. -// /// -// /// On a high level, the algorithm works as follows: -// /// 1. Parse the statement with pg_query.rs. If the statement contains syntax errors, the parser will report the error and continue to work without information -// /// about the nodes. The result will be a flat list of tokens under the generic `Stmt` node. -// /// If successful, the first node in the ordered list will be the main node of the statement, -// /// and serves as a root node. -// /// 2. Scan the statements for tokens with pg_query.rs. This will never fail, even if the statement contains syntax errors. -// /// 3. Parse the statement with the `StatementToken` lexer. The lexer only contains the tokens -// /// that are not parsed by pg_query.rs, such as whitespace. -// /// 4. Define a pointer that starts at 0 and move it along the statement. -// /// - first, check if the current pointer is within a pg_query token. If so, consume the -// /// token. -// /// - if not, consume the next token from the `StatementToken` lexer. -// /// 5. Close all open nodes for that statement. -// pub fn parse_statement(&mut self, text: &str, at_offset: Option) { -// let offset = at_offset.unwrap_or(0); -// let range = TextRange::new( -// TextSize::from(offset), -// TextSize::from(offset + text.len() as u32), -// ); -// -// let mut pg_query_tokens = match pg_query::scan(text) { -// Ok(scanned) => scanned.tokens, -// Err(e) => { -// self.error(e.to_string(), range); -// Vec::new() -// } -// }; -// -// // Get root node with depth 1 -// // Since we are parsing only a single statement there can be only a single node at depth 1 -// let pg_query_root = match pg_query::parse(text) { -// Ok(parsed) => Some( -// parsed -// .protobuf -// .nodes() -// .iter() -// .find(|n| n.1 == 1) -// .unwrap() -// .0 -// .to_enum(), -// ), -// Err(e) => { -// self.error(e.to_string(), range); -// None -// } -// }; -// -// let mut pg_query_nodes = match &pg_query_root { -// Some(root) => resolve_tokens( -// &get_nodes(root, text.to_string(), 1), -// &pg_query_tokens, -// &text, -// ) -// .into_iter() -// .peekable(), -// None => Vec::new().into_iter().peekable(), -// }; -// -// let mut pg_query_tokens = pg_query_tokens.iter().peekable(); -// -// let mut lexer = StatementToken::lexer(&text); -// -// // parse root node if no syntax errors -// if pg_query_root.is_some() { -// let root_node = pg_query_root.unwrap(); -// self.stmt(root_node.to_owned(), range); -// self.start_node_at(SyntaxKind::new_from_pg_query_node(&root_node), 1); -// } else { -// // fallback to generic node as root -// self.start_node_at(SyntaxKind::Stmt, 1); -// } -// self.set_checkpoint(); -// -// // start at 0, and increment by the length of the token -// let mut pointer: i32 = 0; -// -// #[derive(Debug)] -// struct Token { -// syntax_kind: SyntaxKind, -// span: Span, -// } -// -// while pointer < text.len() as i32 { -// // Check if the pointer is within a pg_query token -// let next_pg_query_token = pg_query_tokens.peek(); -// let token = if next_pg_query_token.is_some() -// && next_pg_query_token.unwrap().start <= pointer -// && pointer <= next_pg_query_token.unwrap().end -// { -// let token = pg_query_tokens.next().unwrap(); -// Token { -// syntax_kind: SyntaxKind::new_from_pg_query_token(&token), -// span: Span { -// start: token.start as usize, -// end: token.end as usize, -// }, -// } -// } else { -// // fallback to statement token -// -// // move statement token lexer to before pointer -// while (lexer.span().end as i32) < pointer { -// lexer.next(); -// } -// let token = lexer.next(); -// if token.is_none() || (lexer.span().start as i32) != pointer { -// // if the token is not at the pointer, we have a syntax error -// panic!( -// "Expected token for '{}' at offset {}", -// lexer.slice(), -// lexer.span().start -// ); -// } -// Token { -// syntax_kind: token.unwrap().unwrap().syntax_kind(), -// span: lexer.span(), -// } -// }; -// -// self.token( -// token.syntax_kind, -// text.chars() -// .skip(token.span.start) -// .take(token.span.end - token.span.start) -// .collect::() -// .as_str(), -// ); -// -// pointer = pointer + (token.span.end - token.span.start) as i32; -// } -// -// // close up nodes -// self.close_checkpoint(); -// } -// } - #[cfg(test)] mod tests { use std::assert_eq; use super::*; - // #[test] - // fn test_invalid_statement() { - // let input = "select select;"; - // - // let mut parser = Parser::new(); - // parser.parse_statement(input, None); - // let parsed = parser.finish(); - // - // assert_eq!(parsed.cst.text(), input); - // } - // + #[test] + fn test_statement() { + let input = "select null;"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + assert_eq!(parsed.cst.text(), input); + } + // #[test] // fn test_create_sql_function() { // let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text) From 789673ba5678004b67a54337f9a169ce6176c99d Mon Sep 17 00:00:00 2001 From: psteinroe Date: Tue, 3 Oct 2023 19:50:09 +0200 Subject: [PATCH 08/16] feat: codegen for get_child_tokens --- crates/codegen/src/get_child_tokens.rs | 253 ++++++++++++++++++ crates/codegen/src/lib.rs | 7 + crates/parser/src/estimate_node_range.rs | 123 +-------- crates/parser/src/get_child_tokens_codegen.rs | 3 + crates/parser/src/lib.rs | 1 + crates/parser/src/statement_parser.rs | 8 +- 6 files changed, 275 insertions(+), 120 deletions(-) create mode 100644 crates/codegen/src/get_child_tokens.rs create mode 100644 crates/parser/src/get_child_tokens_codegen.rs diff --git a/crates/codegen/src/get_child_tokens.rs b/crates/codegen/src/get_child_tokens.rs new file mode 100644 index 00000000..ce386882 --- /dev/null +++ b/crates/codegen/src/get_child_tokens.rs @@ -0,0 +1,253 @@ +use pg_query_proto_parser::{FieldType, Node, ProtoParser}; +use proc_macro2::{Ident, TokenStream}; +use quote::{format_ident, quote}; + +pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenStream { + let parser = ProtoParser::new("./libpg_query/protobuf/pg_query.proto"); + + let proto_file = parser.parse(); + + let node_identifiers = node_identifiers(&proto_file.nodes); + let node_handlers = node_handlers(&proto_file.nodes); + + quote! { + use pg_query::{protobuf::ScanToken, protobuf::Token, NodeEnum}; + + #[derive(Debug)] + struct TokenProperty { + value: Option, + token: Option, + } + + impl From for TokenProperty { + fn from(value: i32) -> TokenProperty { + TokenProperty { + value: Some(value.to_string()), + token: None, + } + } + } + + impl From for TokenProperty { + fn from(value: u32) -> TokenProperty { + TokenProperty { + value: Some(value.to_string()), + token: None, + } + } + } + + + impl From for TokenProperty { + fn from(value: i64) -> TokenProperty { + TokenProperty { + value: Some(value.to_string()), + token: None, + } + } + } + + impl From for TokenProperty { + fn from(value: u64) -> TokenProperty { + TokenProperty { + value: Some(value.to_string()), + token: None, + } + } + } + + impl From for TokenProperty { + fn from(value: f64) -> TokenProperty { + TokenProperty { + value: Some(value.to_string()), + token: None, + } + } + } + + impl From for TokenProperty { + fn from(value: bool) -> TokenProperty { + TokenProperty { + value: Some(value.to_string()), + token: None, + } + } + } + + impl From for TokenProperty { + fn from(value: String) -> TokenProperty { + assert!(value.len() > 0, "String property value has length 0"); + TokenProperty { + value: Some(value), + token: None, + } + } + } + + + impl From<&pg_query::protobuf::Integer> for TokenProperty { + fn from(node: &pg_query::protobuf::Integer) -> TokenProperty { + TokenProperty { + value: Some(node.ival.to_string()), + token: Some(Token::Iconst) + } + } + } + + impl From<&pg_query::protobuf::Boolean> for TokenProperty { + fn from(node: &pg_query::protobuf::Boolean) -> TokenProperty { + TokenProperty { + value: Some(node.boolval.to_string()), + token: match node.boolval { + true => Some(Token::TrueP), + false => Some(Token::FalseP), + } + } + } + } + + impl From for TokenProperty { + fn from(token: Token) -> TokenProperty { + TokenProperty { + value: None, + token: Some(token), + } + } + } + + fn get_token_text(start: usize, end: usize, text: &str) -> String { + text.chars() + .skip(start) + .take(end - start) + .collect::() + } + + + pub fn get_child_tokens<'tokens>(node: &NodeEnum, tokens: &'tokens Vec, text: &str, nearest_parent_location: i32, furthest_child_location: Option) -> Vec<&'tokens ScanToken> { + let mut child_tokens = Vec::new(); + + let mut get_token = |property: TokenProperty| { + let token = tokens + .iter() + .filter_map(|t| { + if property.token.is_some() { + // if a token is set, we can safely ignore all tokens that are not of the same type + if t.token() != property.token.unwrap() { + return None; + } + } + // make a string comparison of the text of the token and the property value + if property.value.is_some() + && get_token_text( + usize::try_from(t.start).unwrap(), + usize::try_from(t.end).unwrap(), + text, + ) + .to_lowercase() + != property.value.as_ref().unwrap().to_lowercase() + { + return None; + } + + // if the furthest child location is set, and it is smaller than the start of the token, + // we can safely ignore this token, because it is not a child of the node + if furthest_child_location.is_some() + && furthest_child_location.unwrap() < t.start as i32 + { + return None; + } + + // if the token is before the nearest parent location, we can safely ignore it + // if not, we calculate the distance to the nearest parent location + let distance = t.start - nearest_parent_location; + if distance >= 0 { + Some((distance, t)) + } else { + None + } + }) + // and use the token with the smallest distance to the nearest parent location + .min_by_key(|(d, _)| d.to_owned()) + .map(|(_, t)| t); + + if token.is_none() { + panic!( + "No matching token found for property {:?} in {:#?}", + property, tokens + ); + } + + child_tokens.push(token.unwrap()); + }; + + match node { + #(NodeEnum::#node_identifiers(n) => {#node_handlers}),*, + }; + + child_tokens + } + } +} + +fn node_identifiers(nodes: &[Node]) -> Vec { + nodes + .iter() + .map(|node| format_ident!("{}", &node.name)) + .collect() +} + +fn node_handlers(nodes: &[Node]) -> Vec { + nodes + .iter() + .map(|node| { + let string_property_handlers = string_property_handlers(&node); + let custom_handlers = custom_handlers(&node); + quote! { + #custom_handlers + #(#string_property_handlers)* + } + }) + .collect() +} + +fn custom_handlers(node: &Node) -> TokenStream { + match node.name.as_str() { + "SelectStmt" => quote! { + get_token(TokenProperty::from(Token::Select)); + }, + "Integer" => quote! { + get_token(TokenProperty::from(n)); + }, + "Boolean" => quote! { + get_token(TokenProperty::from(n)); + }, + "AConst" => quote! { + if n.isnull { + get_token(TokenProperty::from(Token::NullP)); + } + }, + _ => quote! {}, + } +} + +fn string_property_handlers(node: &Node) -> Vec { + node.fields + .iter() + .filter_map(|field| { + if field.repeated { + return None; + } + let field_name = format_ident!("{}", field.name.as_str()); + match field.field_type { + // just handle string values for now + FieldType::String => Some(quote! { + // most string values are never None, but an empty string + if n.#field_name.len() > 0 { + get_token(TokenProperty::from(n.#field_name.to_owned())); + } + }), + _ => None, + } + }) + .collect() +} diff --git a/crates/codegen/src/lib.rs b/crates/codegen/src/lib.rs index bc63d4f6..b91b9789 100644 --- a/crates/codegen/src/lib.rs +++ b/crates/codegen/src/lib.rs @@ -1,11 +1,18 @@ +mod get_child_tokens; mod get_location; mod get_nodes; mod syntax_kind; +use get_child_tokens::get_child_tokens_mod; use get_location::get_location_mod; use get_nodes::get_nodes_mod; use syntax_kind::syntax_kind_mod; +#[proc_macro] +pub fn get_child_tokens(item: proc_macro::TokenStream) -> proc_macro::TokenStream { + get_child_tokens_mod(item.into()).into() +} + #[proc_macro] pub fn get_nodes(item: proc_macro::TokenStream) -> proc_macro::TokenStream { get_nodes_mod(item.into()).into() diff --git a/crates/parser/src/estimate_node_range.rs b/crates/parser/src/estimate_node_range.rs index fce88a89..8d461c87 100644 --- a/crates/parser/src/estimate_node_range.rs +++ b/crates/parser/src/estimate_node_range.rs @@ -1,5 +1,6 @@ use std::cmp::{max, min}; +use crate::get_child_tokens_codegen::get_child_tokens; use crate::get_location_codegen::get_location; use crate::get_nodes_codegen::Node; use cstree::text::{TextRange, TextSize}; @@ -29,114 +30,13 @@ pub fn estimate_node_range( let nearest_parent_location = get_nearest_parent_location(&n, nodes); let furthest_child_location = get_furthest_child_location(&n, nodes); - let mut child_tokens = Vec::new(); - - #[derive(Debug)] - struct TokenProperty { - value: Option, - token: Option, - } - - impl TokenProperty { - fn from_int(value: &i32) -> TokenProperty { - TokenProperty { - value: Some(value.to_string()), - token: None, - } - } - - fn from_string(value: &String) -> TokenProperty { - assert!(value.len() > 0, "String property value has length 0"); - TokenProperty { - value: Some(value.to_owned()), - token: None, - } - } - - fn from_token(token: Token) -> TokenProperty { - TokenProperty { - value: None, - token: Some(token), - } - } - } - - let mut get_token = |property: TokenProperty| { - let token = tokens - .iter() - .filter_map(|t| { - if property.token.is_some() { - // if a token is set, we can safely ignore all tokens that are not of the same type - if t.token() != property.token.unwrap() { - return None; - } - } - // make a string comparison of the text of the token and the property value - if property.value.is_some() - && get_token_text( - usize::try_from(t.start).unwrap(), - usize::try_from(t.end).unwrap(), - text, - ) - .to_lowercase() - != property.value.as_ref().unwrap().to_lowercase() - { - return None; - } - - // if the furthest child location is set, and it is smaller than the start of the token, - // we can safely ignore this token, because it is not a child of the node - if furthest_child_location.is_some() - && furthest_child_location.unwrap() < t.start as i32 - { - return None; - } - - // if the token is before the nearest parent location, we can safely ignore it - // if not, we calculate the distance to the nearest parent location - let distance = t.start - nearest_parent_location; - if distance >= 0 { - Some((distance, t)) - } else { - None - } - }) - // and use the token with the smallest distance to the nearest parent location - .min_by_key(|(d, _)| d.to_owned()) - .map(|(_, t)| t); - - if token.is_none() { - panic!( - "No matching token found for property {:?} in {:#?}", - property, tokens - ); - } - - child_tokens.push(token.unwrap()); - }; - - match &n.node { - NodeEnum::RangeVar(n) => { - get_token(TokenProperty::from_string(&n.relname)); - } - NodeEnum::Integer(n) => { - get_token(TokenProperty::from_int(&n.ival)); - } - NodeEnum::AConst(n) => { - if n.isnull { - get_token(TokenProperty::from_token(Token::NullP)); - } - } - NodeEnum::ResTarget(n) => { - if n.name.len() > 0 { - get_token(TokenProperty::from_string(&n.name)); - } - } - NodeEnum::SelectStmt(_) => { - get_token(TokenProperty::from_token(Token::Select)); - } - _ => panic!("Node type not implemented: {:?}", n.node), - }; + let child_tokens = get_child_tokens( + &n.node, + tokens, + text, + nearest_parent_location, + furthest_child_location, + ); // For `from`, the location of the node itself is always correct. // If not available, the closest estimation is the smaller value of the start of the first direct child token, @@ -229,13 +129,6 @@ pub fn estimate_node_range( ranged_nodes } -fn get_token_text(start: usize, end: usize, text: &str) -> String { - text.chars() - .skip(start) - .take(end - start) - .collect::() -} - fn get_furthest_child_location(c: &Node, children: &Vec) -> Option { children .iter() diff --git a/crates/parser/src/get_child_tokens_codegen.rs b/crates/parser/src/get_child_tokens_codegen.rs new file mode 100644 index 00000000..22430efb --- /dev/null +++ b/crates/parser/src/get_child_tokens_codegen.rs @@ -0,0 +1,3 @@ +use codegen::get_child_tokens; + +get_child_tokens!(); diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 3bc10622..b2975eaf 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -17,6 +17,7 @@ mod ast_node; mod estimate_node_range; +mod get_child_tokens_codegen; mod get_location_codegen; mod get_nodes_codegen; mod parser; diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index 6d0187fa..9df7c50e 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -1,12 +1,10 @@ -use std::{collections::VecDeque, iter::Peekable}; +use std::collections::VecDeque; use cstree::text::{TextRange, TextSize}; use logos::Logos; use crate::{ - estimate_node_range::{estimate_node_range, RangedNode}, - get_nodes_codegen::get_nodes, - parser::Parser, + estimate_node_range::estimate_node_range, get_nodes_codegen::get_nodes, parser::Parser, syntax_kind_codegen::SyntaxKind, }; @@ -224,7 +222,7 @@ mod tests { #[test] fn test_statement() { - let input = "select null;"; + let input = "select 1;"; let mut parser = Parser::new(); parser.parse_statement_at(input, None); From 3ee8664f5dd73d8a8f910626bcd4c280164347bb Mon Sep 17 00:00:00 2001 From: psteinroe Date: Wed, 4 Oct 2023 18:18:32 +0200 Subject: [PATCH 09/16] fix: minor fixes while making tests green --- crates/codegen/src/get_child_tokens.rs | 58 ++++++++++++++++-------- crates/parser/src/estimate_node_range.rs | 58 ++++++++++++++---------- crates/parser/src/source_parser.rs | 16 +++++-- 3 files changed, 84 insertions(+), 48 deletions(-) diff --git a/crates/codegen/src/get_child_tokens.rs b/crates/codegen/src/get_child_tokens.rs index ce386882..7d33ef63 100644 --- a/crates/codegen/src/get_child_tokens.rs +++ b/crates/codegen/src/get_child_tokens.rs @@ -78,7 +78,7 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok fn from(value: String) -> TokenProperty { assert!(value.len() > 0, "String property value has length 0"); TokenProperty { - value: Some(value), + value: Some(value.to_lowercase()), token: None, } } @@ -115,11 +115,24 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok } } - fn get_token_text(start: usize, end: usize, text: &str) -> String { + fn get_token_text(token: &ScanToken ,text: &str) -> String { + let start = usize::try_from(token.start).unwrap(); + let end = usize::try_from(token.end).unwrap(); text.chars() .skip(start) .take(end - start) .collect::() + .to_lowercase() + } + + /// returns a list of aliases for a string. primarily used for data types. + /// + /// list from https://www.postgresql.org/docs/current/datatype.html + fn aliases(text: &str) -> Vec<&str> { + match text { + "integer" | "int" | "int4" => vec!["integer", "int", "int4"], + _ => vec![text], + } } @@ -136,17 +149,19 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok return None; } } + // make a string comparison of the text of the token and the property value - if property.value.is_some() - && get_token_text( - usize::try_from(t.start).unwrap(), - usize::try_from(t.end).unwrap(), - text, - ) - .to_lowercase() - != property.value.as_ref().unwrap().to_lowercase() - { - return None; + if property.value.is_some() { + let mut token_text = get_token_text(t, text); + // if token is Sconst, remove leading and trailing quotes + if t.token() == Token::Sconst { + let string_delimiter: &[char; 2] = &['\'', '$']; + token_text = token_text.trim_start_matches(string_delimiter).trim_end_matches(string_delimiter).to_string(); + } + + if !aliases(property.value.as_ref().unwrap()).contains(&token_text.as_str()) { + return None; + } } // if the furthest child location is set, and it is smaller than the start of the token, @@ -170,14 +185,16 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok .min_by_key(|(d, _)| d.to_owned()) .map(|(_, t)| t); - if token.is_none() { - panic!( - "No matching token found for property {:?} in {:#?}", - property, tokens - ); - } + // if token.is_none() { + // panic!( + // "No matching token found for property {:#?} of node {:#?} in {:#?} with tokens {:#?}", + // property, node, text, tokens + // ); + // } - child_tokens.push(token.unwrap()); + if token.is_some() { + child_tokens.push(token.unwrap()); + } }; match node { @@ -221,6 +238,9 @@ fn custom_handlers(node: &Node) -> TokenStream { "Boolean" => quote! { get_token(TokenProperty::from(n)); }, + "AStar" => quote! { + get_token(TokenProperty::from(Token::Ascii42)); + }, "AConst" => quote! { if n.isnull { get_token(TokenProperty::from(Token::NullP)); diff --git a/crates/parser/src/estimate_node_range.rs b/crates/parser/src/estimate_node_range.rs index 8d461c87..e601fab5 100644 --- a/crates/parser/src/estimate_node_range.rs +++ b/crates/parser/src/estimate_node_range.rs @@ -4,6 +4,7 @@ use crate::get_child_tokens_codegen::get_child_tokens; use crate::get_location_codegen::get_location; use crate::get_nodes_codegen::Node; use cstree::text::{TextRange, TextSize}; +use log::debug; use pg_query::{protobuf::ScanToken, protobuf::Token, NodeEnum}; #[derive(Debug, Clone)] @@ -42,20 +43,23 @@ pub fn estimate_node_range( // If not available, the closest estimation is the smaller value of the start of the first direct child token, // and the start of all children ranges. If neither is available, let’s panic for now. // The parent location as a fallback should never be required, because any node must have either children with tokens, or a token itself. + let children_ranges = ranged_nodes + .iter() + .filter(|x| x.inner.path.starts_with(n.path.as_str())) + .collect::>(); let location = get_location(&n.node); let from = if location.is_some() { - location.unwrap() + Some(location.unwrap()) } else { let start_of_first_child_token = if child_tokens.len() > 0 { Some(child_tokens.iter().min_by_key(|t| t.start).unwrap().start) } else { None }; - let start_of_all_children_ranges = if ranged_nodes.len() > 0 { + let start_of_all_children_ranges = if children_ranges.len() > 0 { Some( - ranged_nodes + children_ranges .iter() - .filter(|x| x.inner.path.starts_with(n.path.as_str())) .min_by_key(|n| n.range.start()) .unwrap() .range @@ -67,17 +71,18 @@ pub fn estimate_node_range( if start_of_first_child_token.is_some() { if start_of_all_children_ranges.is_some() { - min( + Some(min( start_of_first_child_token.unwrap(), u32::from(start_of_all_children_ranges.unwrap()) as i32, - ) + )) } else { - start_of_first_child_token.unwrap() + Some(start_of_first_child_token.unwrap()) } } else if start_of_all_children_ranges.is_some() { - u32::from(start_of_all_children_ranges.unwrap()) as i32 + Some(u32::from(start_of_all_children_ranges.unwrap()) as i32) } else { - panic!("No location or child tokens found for node {:?}", n); + debug!("No location or child tokens found for node {:?}", n); + None } }; @@ -87,11 +92,10 @@ pub fn estimate_node_range( } else { None }; - let end_of_all_children_ranges = if ranged_nodes.len() > 0 { + let end_of_all_children_ranges = if children_ranges.len() > 0 { Some( - ranged_nodes + children_ranges .iter() - .filter(|x| x.inner.path.starts_with(n.path.as_str())) .max_by_key(|n| n.range.end()) .unwrap() .range @@ -102,30 +106,34 @@ pub fn estimate_node_range( }; let to = if end_of_last_child_token.is_some() { if end_of_all_children_ranges.is_some() { - max( + Some(max( end_of_last_child_token.unwrap(), u32::from(end_of_all_children_ranges.unwrap()) as i32, - ) + )) } else { - end_of_last_child_token.unwrap() + Some(end_of_last_child_token.unwrap()) } } else if end_of_all_children_ranges.is_some() { - u32::from(end_of_all_children_ranges.unwrap()) as i32 + Some(u32::from(end_of_all_children_ranges.unwrap()) as i32) } else { - panic!("No child tokens or children ranges found for node {:?}", n); + debug!("No child tokens or children ranges found for node {:?}", n); + None }; - // TODO: validate that prepending is enough to ensure that `ranged_nodes` is sorted by - // range.start - ranged_nodes.insert( - 0, - RangedNode { + if from.is_some() && to.is_some() { + ranged_nodes.push(RangedNode { inner: n.to_owned(), - range: TextRange::new(TextSize::from(from as u32), TextSize::from(to as u32)), - }, - ); + range: TextRange::new( + TextSize::from(from.unwrap() as u32), + TextSize::from(to.unwrap() as u32), + ), + }); + } }); + // sort by start of range, and then by depth + ranged_nodes.sort_by_key(|i| (i.range.start(), i.inner.depth)); + ranged_nodes } diff --git a/crates/parser/src/source_parser.rs b/crates/parser/src/source_parser.rs index 341d6eb9..98e4c91f 100644 --- a/crates/parser/src/source_parser.rs +++ b/crates/parser/src/source_parser.rs @@ -93,10 +93,10 @@ impl Parser { self.token(SyntaxKind::Newline, token.text.as_str()); } SourceFileToken::Statement => { - // self.parse_statement( - // token.text.as_str(), - // Some(offset + u32::from(token.span.start())), - // ); + self.parse_statement_at( + token.text.as_str(), + Some(offset + u32::from(token.span.start())), + ); } }; } @@ -110,6 +110,10 @@ impl Parser { mod tests { use super::*; + fn init() { + let _ = env_logger::builder().is_test(true).try_init(); + } + #[test] fn test_source_file_lexer() { let input = "select * from contact where id = '123';\n\n-- test comment\n\nselect wrong statement;\n\nselect id,username from contact\n\nselect id,name\nfrom contact -- test inline comment\nwhere id = '123';\n\n"; @@ -145,6 +149,8 @@ mod tests { #[test] fn test_source_file_parser() { + init(); + let input = "select id, name from users where id = '1224'; select select; @@ -166,6 +172,8 @@ select 1; #[test] fn test_lexer_with_nested_statements() { + init(); + let input = "select * from test; select 123; From 16c3dcc068177fc04ef36cfde1dac0eb5af4f589 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Wed, 4 Oct 2023 18:20:18 +0200 Subject: [PATCH 10/16] chore: cleanup --- crates/codegen/src/get_child_tokens.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/crates/codegen/src/get_child_tokens.rs b/crates/codegen/src/get_child_tokens.rs index 7d33ef63..cf124f8f 100644 --- a/crates/codegen/src/get_child_tokens.rs +++ b/crates/codegen/src/get_child_tokens.rs @@ -11,6 +11,7 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok let node_handlers = node_handlers(&proto_file.nodes); quote! { + use log::{debug}; use pg_query::{protobuf::ScanToken, protobuf::Token, NodeEnum}; #[derive(Debug)] @@ -185,15 +186,13 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok .min_by_key(|(d, _)| d.to_owned()) .map(|(_, t)| t); - // if token.is_none() { - // panic!( - // "No matching token found for property {:#?} of node {:#?} in {:#?} with tokens {:#?}", - // property, node, text, tokens - // ); - // } - if token.is_some() { child_tokens.push(token.unwrap()); + } else { + debug!( + "No matching token found for property {:#?} of node {:#?} in {:#?} with tokens {:#?}", + property, node, text, tokens + ); } }; From 3e5f488e54521c3025e3c386417d0ed59ba4cc41 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Thu, 5 Oct 2023 21:09:05 +0200 Subject: [PATCH 11/16] feat: bring back sibling token logic --- crates/parser/src/lib.rs | 1 + crates/parser/src/parser.rs | 4 ++ crates/parser/src/sibling_token.rs | 42 +++++++++++ crates/parser/src/statement_parser.rs | 42 +++++++++-- .../snapshots/statements/valid/0001.snap.new | 70 +++++++++++++++++++ 5 files changed, 154 insertions(+), 5 deletions(-) create mode 100644 crates/parser/src/sibling_token.rs create mode 100644 crates/parser/tests/snapshots/statements/valid/0001.snap.new diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index b2975eaf..1bb1c53a 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -21,6 +21,7 @@ mod get_child_tokens_codegen; mod get_location_codegen; mod get_nodes_codegen; mod parser; +mod sibling_token; mod source_parser; mod statement_parser; mod syntax_error; diff --git a/crates/parser/src/parser.rs b/crates/parser/src/parser.rs index 2f311ba5..06cd9d85 100644 --- a/crates/parser/src/parser.rs +++ b/crates/parser/src/parser.rs @@ -1,5 +1,6 @@ use cstree::syntax::ResolvedNode; use cstree::{build::GreenNodeBuilder, text::TextRange}; +use log::debug; use pg_query::NodeEnum; use crate::ast_node::RawStmt; @@ -40,16 +41,19 @@ impl Parser { /// start a new node of `SyntaxKind` pub fn start_node(&mut self, kind: SyntaxKind) { + debug!("start_node: {:?}", kind); self.inner.start_node(kind); } /// finish current node pub fn finish_node(&mut self) { + debug!("finish_node"); self.inner.finish_node(); } /// applies token pub fn token(&mut self, kind: SyntaxKind, text: &str) { + debug!("token: {:?} {:?}", kind, text); self.inner.token(kind, text); } diff --git a/crates/parser/src/sibling_token.rs b/crates/parser/src/sibling_token.rs new file mode 100644 index 00000000..39ad0993 --- /dev/null +++ b/crates/parser/src/sibling_token.rs @@ -0,0 +1,42 @@ +use crate::syntax_kind_codegen::SyntaxKind; + +const SIBLINGS: [(SyntaxKind, SyntaxKind); 1] = [(SyntaxKind::Ascii40, SyntaxKind::Ascii41)]; + +impl SyntaxKind { + pub fn is_closing_sibling(self) -> bool { + SIBLINGS.iter().any(|(_, close)| *close == self) + } + + pub fn is_opening_sibling(self) -> bool { + SIBLINGS.iter().any(|(open, _)| *open == self) + } + + pub fn get_closing_sibling(self) -> SyntaxKind { + SIBLINGS + .iter() + .find_map(|(open, close)| if *open == self { Some(*close) } else { None }) + .unwrap() + } +} + +#[cfg(test)] +mod tests { + use std::assert_eq; + + use super::*; + + #[test] + fn test_siblings() { + assert_eq!(SyntaxKind::Ascii40.is_opening_sibling(), true); + assert_eq!( + SyntaxKind::Ascii40.get_closing_sibling(), + SyntaxKind::Ascii41 + ); + } + + #[test] + #[should_panic] + fn test_mismatched_siblings() { + SyntaxKind::Ascii41.get_closing_sibling(); + } +} diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index 9df7c50e..dae142e8 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -99,6 +99,8 @@ impl Parser { let mut token_buffer: VecDeque<(SyntaxKind, String)> = VecDeque::new(); // Keeps track of currently open nodes. Latest opened is last. let mut open_nodes: Vec<(SyntaxKind, TextRange, i32)> = Vec::new(); + // List of (SyntaxKind, depth) to keep track of currently open sibling tokens and their depths. Latest opened is last. + let mut open_tokens: Vec<(SyntaxKind, i32)> = Vec::new(); // 3. Parse the statement @@ -140,6 +142,7 @@ impl Parser { && pointer <= next_pg_query_token.unwrap().end { let token = pg_query_tokens.next().unwrap(); + let token_syntax_kind = SyntaxKind::new_from_pg_query_token(token); let token_text = text .chars() @@ -149,9 +152,18 @@ impl Parser { // a node can only start and end with a pg_query token, so we can handle them here + // if closing token, close nodes until depth of opening token before applying it + let target_depth = if token_syntax_kind.is_closing_sibling() { + Some(open_tokens.last().unwrap().1) + } else { + None + }; + // before applying the token, close any node that ends before the token starts while open_nodes.last().is_some() && open_nodes.last().unwrap().1.end() <= TextSize::from(token.start as u32) + && (target_depth.is_none() + || open_nodes.last().unwrap().2 > target_depth.unwrap()) { self.finish_node(); open_nodes.pop(); @@ -176,11 +188,12 @@ impl Parser { )); } - // apply the token - self.token( - SyntaxKind::new_from_pg_query_token(token), - token_text.as_str(), - ); + // apply the token to the cst + self.token(token_syntax_kind, token_text.as_str()); + // save the token as an opening sibling token, if it is one + if token_syntax_kind.is_opening_sibling() { + open_tokens.push((token_syntax_kind, open_nodes.last().unwrap().2)); + } token_text.len() as i32 } else { @@ -220,8 +233,14 @@ mod tests { use super::*; + fn init() { + let _ = env_logger::builder().is_test(true).try_init(); + } + #[test] fn test_statement() { + init(); + let input = "select 1;"; let mut parser = Parser::new(); @@ -231,6 +250,19 @@ mod tests { assert_eq!(parsed.cst.text(), input); } + #[test] + fn test_sibling_tokens() { + init(); + + let input = "SELECT city, count(*) FILTER (WHERE temp_lo < 45), max(temp_lo) FROM weather GROUP BY city;"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + assert_eq!(parsed.cst.text(), input); + } + // #[test] // fn test_create_sql_function() { // let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text) diff --git a/crates/parser/tests/snapshots/statements/valid/0001.snap.new b/crates/parser/tests/snapshots/statements/valid/0001.snap.new new file mode 100644 index 00000000..469b0b71 --- /dev/null +++ b/crates/parser/tests/snapshots/statements/valid/0001.snap.new @@ -0,0 +1,70 @@ +--- +source: crates/parser/tests/statement_parser_test.rs +assertion_line: 36 +description: "SELECT city, count(*) FILTER (WHERE temp_lo < 45), max(temp_lo)\n FROM weather\n GROUP BY city;\n" +--- +SelectStmt@0..102 + Select@0..6 "SELECT" + Whitespace@6..7 " " + ResTarget@7..11 + ColumnRef@7..11 + String@7..11 + Ident@7..11 "city" + Ascii44@11..12 "," + Whitespace@12..13 " " + ResTarget@13..49 + FuncCall@13..49 + String@13..18 + Ident@13..18 "count" + Ascii40@18..19 "(" + Ascii42@19..20 "*" + Ascii41@20..21 ")" + Whitespace@21..22 " " + Filter@22..28 "FILTER" + Whitespace@28..29 " " + Ascii40@29..30 "(" + Where@30..35 "WHERE" + Whitespace@35..36 " " + AExpr@36..48 + ColumnRef@36..43 + String@36..43 + Ident@36..43 "temp_lo" + Whitespace@43..44 " " + String@44..45 + Ascii60@44..45 "<" + Whitespace@45..46 " " + AConst@46..48 + Integer@46..48 + Iconst@46..48 "45" + Ascii41@48..49 ")" + Ascii41@49..50 ")" + Ascii44@50..51 "," + Whitespace@51..52 " " + ResTarget@52..65 + FuncCall@52..65 + String@52..55 + Ident@52..55 "max" + ColumnRef@55..64 + String@55..64 + Ascii40@55..56 "(" + Ident@56..63 "temp_lo" + Ascii41@63..64 ")" + Ascii41@64..65 ")" + Ascii41@65..66 ")" + Newline@66..67 "\n" + Whitespace@67..71 " " + From@71..75 "FROM" + Whitespace@75..76 " " + RangeVar@76..83 + Ident@76..83 "weather" + Newline@83..84 "\n" + Whitespace@84..88 " " + GroupP@88..93 "GROUP" + Whitespace@93..94 " " + By@94..96 "BY" + Whitespace@96..97 " " + ColumnRef@97..101 + String@97..101 + Ident@97..101 "city" + Ascii59@101..102 ";" + From ba7fee4cf7037372d40f814aa2d4b62b41b8b770 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sat, 7 Oct 2023 09:59:16 +0200 Subject: [PATCH 12/16] fix: minor fixes while making tests green --- ...ild_tokens.rs => get_child_token_range.rs} | 70 +++++-- crates/codegen/src/get_location.rs | 25 ++- crates/codegen/src/get_nodes.rs | 5 +- crates/codegen/src/lib.rs | 8 +- crates/parser/src/estimate_node_range.rs | 78 +++----- .../src/get_child_token_range_codegen.rs | 3 + crates/parser/src/get_child_tokens_codegen.rs | 3 - crates/parser/src/lib.rs | 2 +- crates/parser/src/sibling_token.rs | 7 + crates/parser/src/statement_parser.rs | 105 +++++++++- .../snapshots/statements/valid/0001.snap | 69 ++++--- .../snapshots/statements/valid/0001.snap.new | 70 ------- .../snapshots/statements/valid/0002.snap | 6 +- .../snapshots/statements/valid/0003.snap | 76 ++++--- .../snapshots/statements/valid/0004.snap | 103 ++++++---- .../snapshots/statements/valid/0005.snap | 22 ++- .../snapshots/statements/valid/0006.snap | 7 +- .../snapshots/statements/valid/0007.snap | 33 ++-- .../snapshots/statements/valid/0008.snap | 48 +++-- .../snapshots/statements/valid/0009.snap | 185 +++++++++++------- .../snapshots/statements/valid/0010.snap | 44 +++-- .../snapshots/statements/valid/0011.snap | 24 ++- .../snapshots/statements/valid/0012.snap.new | 61 ++++++ .../snapshots/statements/valid/0013.snap.new | 65 ++++++ .../snapshots/statements/valid/0014.snap | 114 ++++++----- .../snapshots/statements/valid/0015.snap.new | 80 ++++++++ crates/parser/tests/statement_parser_test.rs | 3 + 27 files changed, 882 insertions(+), 434 deletions(-) rename crates/codegen/src/{get_child_tokens.rs => get_child_token_range.rs} (79%) create mode 100644 crates/parser/src/get_child_token_range_codegen.rs delete mode 100644 crates/parser/src/get_child_tokens_codegen.rs delete mode 100644 crates/parser/tests/snapshots/statements/valid/0001.snap.new create mode 100644 crates/parser/tests/snapshots/statements/valid/0012.snap.new create mode 100644 crates/parser/tests/snapshots/statements/valid/0013.snap.new create mode 100644 crates/parser/tests/snapshots/statements/valid/0015.snap.new diff --git a/crates/codegen/src/get_child_tokens.rs b/crates/codegen/src/get_child_token_range.rs similarity index 79% rename from crates/codegen/src/get_child_tokens.rs rename to crates/codegen/src/get_child_token_range.rs index cf124f8f..40c9c4fe 100644 --- a/crates/codegen/src/get_child_tokens.rs +++ b/crates/codegen/src/get_child_token_range.rs @@ -2,7 +2,7 @@ use pg_query_proto_parser::{FieldType, Node, ProtoParser}; use proc_macro2::{Ident, TokenStream}; use quote::{format_ident, quote}; -pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenStream { +pub fn get_child_token_range_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenStream { let parser = ProtoParser::new("./libpg_query/protobuf/pg_query.proto"); let proto_file = parser.parse(); @@ -12,7 +12,8 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok quote! { use log::{debug}; - use pg_query::{protobuf::ScanToken, protobuf::Token, NodeEnum}; + use pg_query::{protobuf::ScanToken, protobuf::Token, NodeEnum, protobuf::SortByDir}; + use cstree::text::{TextRange, TextSize}; #[derive(Debug)] struct TokenProperty { @@ -126,18 +127,30 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok .to_lowercase() } + + /// list of aliases from https://www.postgresql.org/docs/current/datatype.html + const ALIASES: [&[&str]; 2]= [ + &["integer", "int", "int4"], + &["real", "float4"], + ]; + /// returns a list of aliases for a string. primarily used for data types. - /// - /// list from https://www.postgresql.org/docs/current/datatype.html fn aliases(text: &str) -> Vec<&str> { - match text { - "integer" | "int" | "int4" => vec!["integer", "int", "int4"], - _ => vec![text], + for alias in ALIASES { + if alias.contains(&text) { + return alias.to_vec(); + } } + return vec![text]; } + pub struct ChildTokenRange { + /// the .start of all child tokens used to estimate the range + pub child_token_indices: Vec, + pub range: Option + } - pub fn get_child_tokens<'tokens>(node: &NodeEnum, tokens: &'tokens Vec, text: &str, nearest_parent_location: i32, furthest_child_location: Option) -> Vec<&'tokens ScanToken> { + pub fn get_child_token_range(node: &NodeEnum, tokens: Vec<&ScanToken>, text: &str, nearest_parent_location: u32) -> ChildTokenRange { let mut child_tokens = Vec::new(); let mut get_token = |property: TokenProperty| { @@ -165,17 +178,9 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok } } - // if the furthest child location is set, and it is smaller than the start of the token, - // we can safely ignore this token, because it is not a child of the node - if furthest_child_location.is_some() - && furthest_child_location.unwrap() < t.start as i32 - { - return None; - } - // if the token is before the nearest parent location, we can safely ignore it // if not, we calculate the distance to the nearest parent location - let distance = t.start - nearest_parent_location; + let distance = t.start - nearest_parent_location as i32; if distance >= 0 { Some((distance, t)) } else { @@ -200,7 +205,17 @@ pub fn get_child_tokens_mod(_item: proc_macro2::TokenStream) -> proc_macro2::Tok #(NodeEnum::#node_identifiers(n) => {#node_handlers}),*, }; - child_tokens + ChildTokenRange { + child_token_indices: child_tokens.iter().map(|t| t.start).collect(), + range: if child_tokens.len() > 0 { + Some(TextRange::new( + TextSize::from(child_tokens.iter().min_by_key(|t| t.start).unwrap().start as u32), + TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32), + )) + } else { + None + } + } } } } @@ -230,6 +245,9 @@ fn custom_handlers(node: &Node) -> TokenStream { match node.name.as_str() { "SelectStmt" => quote! { get_token(TokenProperty::from(Token::Select)); + if n.distinct_clause.len() > 0 { + get_token(TokenProperty::from(Token::Distinct)); + } }, "Integer" => quote! { get_token(TokenProperty::from(n)); @@ -240,6 +258,22 @@ fn custom_handlers(node: &Node) -> TokenStream { "AStar" => quote! { get_token(TokenProperty::from(Token::Ascii42)); }, + "FuncCall" => quote! { + if n.agg_filter.is_some() { + get_token(TokenProperty::from(Token::Filter)); + } + }, + "SortBy" => quote! { + get_token(TokenProperty::from(Token::Order)); + match n.sortby_dir { + 2 => get_token(TokenProperty::from(Token::Asc)), + 3 => get_token(TokenProperty::from(Token::Desc)), + _ => {} + } + }, + "WindowDef" => quote! { + get_token(TokenProperty::from(Token::Partition)); + }, "AConst" => quote! { if n.isnull { get_token(TokenProperty::from(Token::NullP)); diff --git a/crates/codegen/src/get_location.rs b/crates/codegen/src/get_location.rs index 29c79088..7e8cf5b0 100644 --- a/crates/codegen/src/get_location.rs +++ b/crates/codegen/src/get_location.rs @@ -14,27 +14,36 @@ pub fn get_location_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt quote! { use pg_query::NodeEnum; - // Returns the location of a node - pub fn get_location(node: &NodeEnum) -> Option { + /// Returns the location of a node + pub fn get_location(node: &NodeEnum) -> Option { + let loc = get_location_internal(node); + if loc.is_some() { + u32::try_from(loc.unwrap()).ok() + } else { + None + } + } + + fn get_location_internal(node: &NodeEnum) -> Option { let location = match node { - // for some nodes, the location of the node itself is after their childrens location. + // for some nodes, the location of the node itself is after their children location. // we implement the logic for those nodes manually. // if you add one, make sure to add its name to `manual_node_names()`. NodeEnum::BoolExpr(n) => { let a = n.args.iter().min_by(|a, b| { - let loc_a = get_location(&a.node.as_ref().unwrap()); - let loc_b = get_location(&b.node.as_ref().unwrap()); + let loc_a = get_location_internal(&a.node.as_ref().unwrap()); + let loc_b = get_location_internal(&b.node.as_ref().unwrap()); loc_a.cmp(&loc_b) }); - get_location(&a.unwrap().node.as_ref().unwrap()) + get_location_internal(&a.unwrap().node.as_ref().unwrap()) }, - NodeEnum::AExpr(n) => get_location(&n.lexpr.as_ref().unwrap().node.as_ref().unwrap()), + NodeEnum::AExpr(n) => get_location_internal(&n.lexpr.as_ref().unwrap().node.as_ref().unwrap()), #(NodeEnum::#node_identifiers(n) => #location_idents),* }; if location.is_some() && location.unwrap() < 0 { None } else { - location + location } } } diff --git a/crates/codegen/src/get_nodes.rs b/crates/codegen/src/get_nodes.rs index 26fd5a53..758bf8f7 100644 --- a/crates/codegen/src/get_nodes.rs +++ b/crates/codegen/src/get_nodes.rs @@ -101,8 +101,9 @@ fn property_handlers(node: &Node) -> Vec { Some(quote! { n.#field_name .iter() - .for_each(|x| handle_child(x.node.as_ref().unwrap().to_owned())); - + .for_each(|x| if x.node.is_some() { + handle_child(x.node.as_ref().unwrap().to_owned()); + }); }) } else if field.field_type == FieldType::Node && field.is_one_of == false { if field.node_name == Some("Node".to_owned()) { diff --git a/crates/codegen/src/lib.rs b/crates/codegen/src/lib.rs index b91b9789..b935182d 100644 --- a/crates/codegen/src/lib.rs +++ b/crates/codegen/src/lib.rs @@ -1,16 +1,16 @@ -mod get_child_tokens; +mod get_child_token_range; mod get_location; mod get_nodes; mod syntax_kind; -use get_child_tokens::get_child_tokens_mod; +use get_child_token_range::get_child_token_range_mod; use get_location::get_location_mod; use get_nodes::get_nodes_mod; use syntax_kind::syntax_kind_mod; #[proc_macro] -pub fn get_child_tokens(item: proc_macro::TokenStream) -> proc_macro::TokenStream { - get_child_tokens_mod(item.into()).into() +pub fn get_child_token_range(item: proc_macro::TokenStream) -> proc_macro::TokenStream { + get_child_token_range_mod(item.into()).into() } #[proc_macro] diff --git a/crates/parser/src/estimate_node_range.rs b/crates/parser/src/estimate_node_range.rs index e601fab5..166b2723 100644 --- a/crates/parser/src/estimate_node_range.rs +++ b/crates/parser/src/estimate_node_range.rs @@ -1,6 +1,6 @@ use std::cmp::{max, min}; -use crate::get_child_tokens_codegen::get_child_tokens; +use crate::get_child_token_range_codegen::get_child_token_range; use crate::get_location_codegen::get_location; use crate::get_nodes_codegen::Node; use cstree::text::{TextRange, TextSize}; @@ -21,6 +21,8 @@ pub fn estimate_node_range( ) -> Vec { let mut ranged_nodes: Vec = Vec::new(); + let mut used_tokens: Vec = Vec::new(); + // ensure that all children of any given node are already processed before processing the node itself nodes.sort_by(|a, b| b.path.cmp(&a.path)); @@ -29,36 +31,34 @@ pub fn estimate_node_range( nodes.iter().for_each(|n| { // first, get the estimated boundaries of the node based on the `location` property of a node let nearest_parent_location = get_nearest_parent_location(&n, nodes); - let furthest_child_location = get_furthest_child_location(&n, nodes); - let child_tokens = get_child_tokens( + let child_token_range = get_child_token_range( &n.node, - tokens, + tokens + .iter() + .filter(|t| !used_tokens.contains(&t.start)) + .collect(), text, nearest_parent_location, - furthest_child_location, ); + used_tokens.extend(child_token_range.child_token_indices); + // For `from`, the location of the node itself is always correct. // If not available, the closest estimation is the smaller value of the start of the first direct child token, // and the start of all children ranges. If neither is available, let’s panic for now. // The parent location as a fallback should never be required, because any node must have either children with tokens, or a token itself. - let children_ranges = ranged_nodes + let child_node_ranges = ranged_nodes .iter() .filter(|x| x.inner.path.starts_with(n.path.as_str())) .collect::>(); let location = get_location(&n.node); let from = if location.is_some() { - Some(location.unwrap()) + Some(TextSize::from(location.unwrap())) } else { - let start_of_first_child_token = if child_tokens.len() > 0 { - Some(child_tokens.iter().min_by_key(|t| t.start).unwrap().start) - } else { - None - }; - let start_of_all_children_ranges = if children_ranges.len() > 0 { + let start_of_all_children_ranges = if child_node_ranges.len() > 0 { Some( - children_ranges + child_node_ranges .iter() .min_by_key(|n| n.range.start()) .unwrap() @@ -69,17 +69,18 @@ pub fn estimate_node_range( None }; - if start_of_first_child_token.is_some() { + if child_token_range.range.is_some() { + let start_of_first_child_token = child_token_range.range.unwrap().start(); if start_of_all_children_ranges.is_some() { Some(min( - start_of_first_child_token.unwrap(), - u32::from(start_of_all_children_ranges.unwrap()) as i32, + start_of_first_child_token, + start_of_all_children_ranges.unwrap(), )) } else { - Some(start_of_first_child_token.unwrap()) + Some(start_of_first_child_token) } } else if start_of_all_children_ranges.is_some() { - Some(u32::from(start_of_all_children_ranges.unwrap()) as i32) + Some(start_of_all_children_ranges.unwrap()) } else { debug!("No location or child tokens found for node {:?}", n); None @@ -87,14 +88,9 @@ pub fn estimate_node_range( }; // For `to`, it’s the larger value of the end of the last direkt child token, and the end of all children ranges. - let end_of_last_child_token = if child_tokens.len() > 0 { - Some(child_tokens.iter().max_by_key(|t| t.end).unwrap().end) - } else { - None - }; - let end_of_all_children_ranges = if children_ranges.len() > 0 { + let end_of_all_children_ranges = if child_node_ranges.len() > 0 { Some( - children_ranges + child_node_ranges .iter() .max_by_key(|n| n.range.end()) .unwrap() @@ -104,17 +100,18 @@ pub fn estimate_node_range( } else { None }; - let to = if end_of_last_child_token.is_some() { + let to = if child_token_range.range.is_some() { + let end_of_last_child_token = child_token_range.range.unwrap().end(); if end_of_all_children_ranges.is_some() { Some(max( - end_of_last_child_token.unwrap(), - u32::from(end_of_all_children_ranges.unwrap()) as i32, + end_of_last_child_token, + end_of_all_children_ranges.unwrap(), )) } else { - Some(end_of_last_child_token.unwrap()) + Some(end_of_last_child_token) } } else if end_of_all_children_ranges.is_some() { - Some(u32::from(end_of_all_children_ranges.unwrap()) as i32) + Some(end_of_all_children_ranges.unwrap()) } else { debug!("No child tokens or children ranges found for node {:?}", n); None @@ -123,10 +120,7 @@ pub fn estimate_node_range( if from.is_some() && to.is_some() { ranged_nodes.push(RangedNode { inner: n.to_owned(), - range: TextRange::new( - TextSize::from(from.unwrap() as u32), - TextSize::from(to.unwrap() as u32), - ), + range: TextRange::new(from.unwrap(), to.unwrap()), }); } }); @@ -137,19 +131,7 @@ pub fn estimate_node_range( ranged_nodes } -fn get_furthest_child_location(c: &Node, children: &Vec) -> Option { - children - .iter() - .filter_map(|n| { - if !n.path.starts_with(c.path.as_str()) { - return None; - } - get_location(&n.node) - }) - .max() -} - -fn get_nearest_parent_location(n: &Node, children: &Vec) -> i32 { +fn get_nearest_parent_location(n: &Node, children: &Vec) -> u32 { // if location is set, return it let location = get_location(&n.node); if location.is_some() { diff --git a/crates/parser/src/get_child_token_range_codegen.rs b/crates/parser/src/get_child_token_range_codegen.rs new file mode 100644 index 00000000..9b90d602 --- /dev/null +++ b/crates/parser/src/get_child_token_range_codegen.rs @@ -0,0 +1,3 @@ +use codegen::get_child_token_range; + +get_child_token_range!(); diff --git a/crates/parser/src/get_child_tokens_codegen.rs b/crates/parser/src/get_child_tokens_codegen.rs deleted file mode 100644 index 22430efb..00000000 --- a/crates/parser/src/get_child_tokens_codegen.rs +++ /dev/null @@ -1,3 +0,0 @@ -use codegen::get_child_tokens; - -get_child_tokens!(); diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 1bb1c53a..493b49cb 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -17,7 +17,7 @@ mod ast_node; mod estimate_node_range; -mod get_child_tokens_codegen; +mod get_child_token_range_codegen; mod get_location_codegen; mod get_nodes_codegen; mod parser; diff --git a/crates/parser/src/sibling_token.rs b/crates/parser/src/sibling_token.rs index 39ad0993..67e2ed71 100644 --- a/crates/parser/src/sibling_token.rs +++ b/crates/parser/src/sibling_token.rs @@ -17,6 +17,13 @@ impl SyntaxKind { .find_map(|(open, close)| if *open == self { Some(*close) } else { None }) .unwrap() } + + pub fn get_opening_sibling(self) -> SyntaxKind { + SIBLINGS + .iter() + .find_map(|(open, close)| if *close == self { Some(*open) } else { None }) + .unwrap() + } } #[cfg(test)] diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index dae142e8..89c3c99c 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -1,6 +1,7 @@ use std::collections::VecDeque; use cstree::text::{TextRange, TextSize}; +use log::debug; use logos::Logos; use crate::{ @@ -77,6 +78,8 @@ impl Parser { } }; + debug!("pg_query_root: {:#?}", pg_query_root); + // ranged nodes from pg_query.rs, including the root node // the nodes are ordered by starting range, starting with the root node let mut pg_query_nodes = match &pg_query_root { @@ -154,7 +157,15 @@ impl Parser { // if closing token, close nodes until depth of opening token before applying it let target_depth = if token_syntax_kind.is_closing_sibling() { - Some(open_tokens.last().unwrap().1) + let opening_token = open_tokens.pop().unwrap(); + assert_eq!( + opening_token.0.get_closing_sibling(), + token_syntax_kind, + "Opening token {:?} does not match closing token {:?}", + opening_token.0, + token_syntax_kind + ); + Some(opening_token.1) } else { None }; @@ -174,10 +185,10 @@ impl Parser { self.token(kind, text.as_str()); } - // consume all nodes that start at or before the token ends + // consume all nodes that start before the token ends while pg_query_nodes.peek().is_some() && pg_query_nodes.peek().unwrap().range.start() - <= TextSize::from(token.end as u32) + < TextSize::from(token.end as u32) { let node = pg_query_nodes.next().unwrap(); self.start_node(SyntaxKind::new_from_pg_query_node(&node.inner.node)); @@ -263,6 +274,94 @@ mod tests { assert_eq!(parsed.cst.text(), input); } + #[test] + fn test_opening_token() { + init(); + + let input = "INSERT INTO weather VALUES ('San Francisco', 46, 50, 0.25, '1994-11-27');"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + assert_eq!(parsed.cst.text(), input); + } + + #[test] + fn test_closing_token_at_last_position() { + init(); + + let input = "CREATE TABLE weather ( + city varchar(80) references cities(name), + temp_lo int +);"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + assert_eq!(parsed.cst.text(), input); + } + + #[test] + fn test_select_with_alias() { + init(); + + let input = "SELECT w1.temp_lo AS low, w1.temp_hi AS high FROM weather"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + assert_eq!(parsed.cst.text(), input); + } + + #[test] + fn test_select_distinct() { + init(); + + let input = "SELECT DISTINCT city + FROM weather + ORDER BY city;"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + assert_eq!(parsed.cst.text(), input); + } + + #[test] + fn test_order_by() { + init(); + + let input = "SELECT sum(salary) OVER w, avg(salary) OVER w + FROM empsalary + WINDOW w AS (PARTITION BY depname ORDER BY salary DESC);"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + assert_eq!(parsed.cst.text(), input); + } + + #[test] + fn test_fn_call() { + init(); + + let input = + "SELECT count(*) FILTER (WHERE i < 5) AS filtered FROM generate_series(1,10) AS s(i);"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + dbg!(&parsed.cst); + + assert_eq!(parsed.cst.text(), input); + } + // #[test] // fn test_create_sql_function() { // let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text) diff --git a/crates/parser/tests/snapshots/statements/valid/0001.snap b/crates/parser/tests/snapshots/statements/valid/0001.snap index 1c2083ab..b1bcd7cd 100644 --- a/crates/parser/tests/snapshots/statements/valid/0001.snap +++ b/crates/parser/tests/snapshots/statements/valid/0001.snap @@ -2,46 +2,65 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT city, count(*) FILTER (WHERE temp_lo < 45), max(temp_lo)\n FROM weather\n GROUP BY city;\n" --- -SelectStmt@0..100 +SelectStmt@0..99 Select@0..6 "SELECT" Whitespace@6..7 " " - Ident@7..11 "city" + ResTarget@7..11 + ColumnRef@7..11 + String@7..11 + Ident@7..11 "city" Ascii44@11..12 "," Whitespace@12..13 " " - Ident@13..18 "count" - Ascii40@18..19 "(" - Ascii42@19..20 "*" - Ascii41@20..21 ")" - Whitespace@21..22 " " - Filter@22..28 "FILTER" - Whitespace@28..29 " " - Ascii40@29..30 "(" - Where@30..35 "WHERE" - Whitespace@35..36 " " - Ident@36..43 "temp_lo" - Whitespace@43..44 " " - Ascii60@44..45 "<" - Whitespace@45..46 " " - Iconst@46..48 "45" - Ascii41@48..49 ")" + ResTarget@13..49 + FuncCall@13..49 + String@13..18 + Ident@13..18 "count" + Ascii40@18..19 "(" + Ascii42@19..20 "*" + Ascii41@20..21 ")" + Whitespace@21..22 " " + Filter@22..28 "FILTER" + Whitespace@28..29 " " + Ascii40@29..30 "(" + Where@30..35 "WHERE" + Whitespace@35..36 " " + AExpr@36..48 + ColumnRef@36..43 + String@36..43 + Ident@36..43 "temp_lo" + Whitespace@43..44 " " + String@44..45 + Ascii60@44..45 "<" + Whitespace@45..46 " " + AConst@46..48 + Integer@46..48 + Iconst@46..48 "45" + Ascii41@48..49 ")" Ascii44@49..50 "," Whitespace@50..51 " " - Ident@51..54 "max" - Ascii40@54..55 "(" - Ident@55..62 "temp_lo" - Ascii41@62..63 ")" + ResTarget@51..63 + FuncCall@51..63 + String@51..54 + Ident@51..54 "max" + Ascii40@54..55 "(" + ColumnRef@55..62 + String@55..62 + Ident@55..62 "temp_lo" + Ascii41@62..63 ")" Newline@63..64 "\n" Whitespace@64..68 " " From@68..72 "FROM" Whitespace@72..73 " " - Ident@73..80 "weather" + RangeVar@73..80 + Ident@73..80 "weather" Newline@80..81 "\n" Whitespace@81..85 " " GroupP@85..90 "GROUP" Whitespace@90..91 " " By@91..93 "BY" Whitespace@93..94 " " - Ident@94..98 "city" + ColumnRef@94..98 + String@94..98 + Ident@94..98 "city" Ascii59@98..99 ";" - Newline@99..100 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0001.snap.new b/crates/parser/tests/snapshots/statements/valid/0001.snap.new deleted file mode 100644 index 469b0b71..00000000 --- a/crates/parser/tests/snapshots/statements/valid/0001.snap.new +++ /dev/null @@ -1,70 +0,0 @@ ---- -source: crates/parser/tests/statement_parser_test.rs -assertion_line: 36 -description: "SELECT city, count(*) FILTER (WHERE temp_lo < 45), max(temp_lo)\n FROM weather\n GROUP BY city;\n" ---- -SelectStmt@0..102 - Select@0..6 "SELECT" - Whitespace@6..7 " " - ResTarget@7..11 - ColumnRef@7..11 - String@7..11 - Ident@7..11 "city" - Ascii44@11..12 "," - Whitespace@12..13 " " - ResTarget@13..49 - FuncCall@13..49 - String@13..18 - Ident@13..18 "count" - Ascii40@18..19 "(" - Ascii42@19..20 "*" - Ascii41@20..21 ")" - Whitespace@21..22 " " - Filter@22..28 "FILTER" - Whitespace@28..29 " " - Ascii40@29..30 "(" - Where@30..35 "WHERE" - Whitespace@35..36 " " - AExpr@36..48 - ColumnRef@36..43 - String@36..43 - Ident@36..43 "temp_lo" - Whitespace@43..44 " " - String@44..45 - Ascii60@44..45 "<" - Whitespace@45..46 " " - AConst@46..48 - Integer@46..48 - Iconst@46..48 "45" - Ascii41@48..49 ")" - Ascii41@49..50 ")" - Ascii44@50..51 "," - Whitespace@51..52 " " - ResTarget@52..65 - FuncCall@52..65 - String@52..55 - Ident@52..55 "max" - ColumnRef@55..64 - String@55..64 - Ascii40@55..56 "(" - Ident@56..63 "temp_lo" - Ascii41@63..64 ")" - Ascii41@64..65 ")" - Ascii41@65..66 ")" - Newline@66..67 "\n" - Whitespace@67..71 " " - From@71..75 "FROM" - Whitespace@75..76 " " - RangeVar@76..83 - Ident@76..83 "weather" - Newline@83..84 "\n" - Whitespace@84..88 " " - GroupP@88..93 "GROUP" - Whitespace@93..94 " " - By@94..96 "BY" - Whitespace@96..97 " " - ColumnRef@97..101 - String@97..101 - Ident@97..101 "city" - Ascii59@101..102 ";" - diff --git a/crates/parser/tests/snapshots/statements/valid/0002.snap b/crates/parser/tests/snapshots/statements/valid/0002.snap index 45ebc465..ec7822f2 100644 --- a/crates/parser/tests/snapshots/statements/valid/0002.snap +++ b/crates/parser/tests/snapshots/statements/valid/0002.snap @@ -2,14 +2,14 @@ source: crates/parser/tests/statement_parser_test.rs description: "COPY weather FROM '/home/user/weather.txt';\n" --- -CopyStmt@0..44 +CopyStmt@0..43 Copy@0..4 "COPY" Whitespace@4..5 " " - Ident@5..12 "weather" + RangeVar@5..12 + Ident@5..12 "weather" Whitespace@12..13 " " From@13..17 "FROM" Whitespace@17..18 " " Sconst@18..42 "'/home/user/weather.txt'" Ascii59@42..43 ";" - Newline@43..44 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0003.snap b/crates/parser/tests/snapshots/statements/valid/0003.snap index 090b3a9a..8e53c49f 100644 --- a/crates/parser/tests/snapshots/statements/valid/0003.snap +++ b/crates/parser/tests/snapshots/statements/valid/0003.snap @@ -2,55 +2,75 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE TABLE weather (\n city varchar(80) references cities(name),\n temp_lo int,\n temp_hi int,\n prcp real,\n date date\n);\n" --- -CreateStmt@0..174 +CreateStmt@0..173 Create@0..6 "CREATE" Whitespace@6..7 " " Table@7..12 "TABLE" Whitespace@12..13 " " - Ident@13..20 "weather" + RangeVar@13..20 + Ident@13..20 "weather" Whitespace@20..21 " " Ascii40@21..22 "(" Newline@22..23 "\n" Whitespace@23..31 " " - Ident@31..35 "city" - Whitespace@35..41 " " - Varchar@41..48 "varchar" - Ascii40@48..49 "(" - Iconst@49..51 "80" - Ascii41@51..52 ")" - Whitespace@52..53 " " - References@53..63 "references" - Whitespace@63..64 " " - Ident@64..70 "cities" - Ascii40@70..71 "(" - NameP@71..75 "name" - Ascii41@75..76 ")" + ColumnDef@31..76 + Ident@31..35 "city" + Whitespace@35..41 " " + TypeName@41..52 + String@41..48 + Varchar@41..48 "varchar" + Ascii40@48..49 "(" + AConst@49..51 + Integer@49..51 + Iconst@49..51 "80" + Ascii41@51..52 ")" + Whitespace@52..53 " " + Constraint@53..76 + References@53..63 "references" + Whitespace@63..64 " " + RangeVar@64..70 + Ident@64..70 "cities" + Ascii40@70..71 "(" + String@71..75 + NameP@71..75 "name" + Ascii41@75..76 ")" Ascii44@76..77 "," Newline@77..78 "\n" Whitespace@78..86 " " - Ident@86..93 "temp_lo" - Whitespace@93..96 " " - IntP@96..99 "int" + ColumnDef@86..99 + Ident@86..93 "temp_lo" + Whitespace@93..96 " " + TypeName@96..99 + String@96..99 + IntP@96..99 "int" Ascii44@99..100 "," Newline@100..101 "\n" Whitespace@101..109 " " - Ident@109..116 "temp_hi" - Whitespace@116..119 " " - IntP@119..122 "int" + ColumnDef@109..122 + Ident@109..116 "temp_hi" + Whitespace@116..119 " " + TypeName@119..122 + String@119..122 + IntP@119..122 "int" Ascii44@122..123 "," Newline@123..124 "\n" Whitespace@124..132 " " - Ident@132..136 "prcp" - Whitespace@136..142 " " - Real@142..146 "real" + ColumnDef@132..146 + Ident@132..136 "prcp" + Whitespace@136..142 " " + TypeName@142..146 + String@142..146 + Real@142..146 "real" Ascii44@146..147 "," Newline@147..148 "\n" Whitespace@148..156 " " - Ident@156..160 "date" - Whitespace@160..166 " " - Ident@166..170 "date" + ColumnDef@156..170 + Ident@156..160 "date" + Whitespace@160..166 " " + TypeName@166..170 + String@166..170 + Ident@166..170 "date" Newline@170..171 "\n" Ascii41@171..172 ")" Ascii59@172..173 ";" - Newline@173..174 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0004.snap b/crates/parser/tests/snapshots/statements/valid/0004.snap index d5e1524f..aa2ae312 100644 --- a/crates/parser/tests/snapshots/statements/valid/0004.snap +++ b/crates/parser/tests/snapshots/statements/valid/0004.snap @@ -2,51 +2,78 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE VIEW myview AS\n SELECT name, temp_lo, temp_hi, prcp, date, location\n FROM weather, cities\n WHERE city = name;\n" --- -ViewStmt@0..134 +ViewStmt@0..133 Create@0..6 "CREATE" Whitespace@6..7 " " View@7..11 "VIEW" Whitespace@11..12 " " - Ident@12..18 "myview" + RangeVar@12..18 + Ident@12..18 "myview" Whitespace@18..19 " " As@19..21 "AS" Newline@21..22 "\n" Whitespace@22..26 " " - Select@26..32 "SELECT" - Whitespace@32..33 " " - NameP@33..37 "name" - Ascii44@37..38 "," - Whitespace@38..39 " " - Ident@39..46 "temp_lo" - Ascii44@46..47 "," - Whitespace@47..48 " " - Ident@48..55 "temp_hi" - Ascii44@55..56 "," - Whitespace@56..57 " " - Ident@57..61 "prcp" - Ascii44@61..62 "," - Whitespace@62..63 " " - Ident@63..67 "date" - Ascii44@67..68 "," - Whitespace@68..69 " " - Location@69..77 "location" - Newline@77..78 "\n" - Whitespace@78..86 " " - From@86..90 "FROM" - Whitespace@90..91 " " - Ident@91..98 "weather" - Ascii44@98..99 "," - Whitespace@99..100 " " - Ident@100..106 "cities" - Newline@106..107 "\n" - Whitespace@107..115 " " - Where@115..120 "WHERE" - Whitespace@120..121 " " - Ident@121..125 "city" - Whitespace@125..126 " " - Ascii61@126..127 "=" - Whitespace@127..128 " " - NameP@128..132 "name" + SelectStmt@26..132 + Select@26..32 "SELECT" + Whitespace@32..33 " " + ResTarget@33..37 + ColumnRef@33..37 + String@33..37 + NameP@33..37 "name" + Ascii44@37..38 "," + Whitespace@38..39 " " + ResTarget@39..46 + ColumnRef@39..46 + String@39..46 + Ident@39..46 "temp_lo" + Ascii44@46..47 "," + Whitespace@47..48 " " + ResTarget@48..55 + ColumnRef@48..55 + String@48..55 + Ident@48..55 "temp_hi" + Ascii44@55..56 "," + Whitespace@56..57 " " + ResTarget@57..61 + ColumnRef@57..61 + String@57..61 + Ident@57..61 "prcp" + Ascii44@61..62 "," + Whitespace@62..63 " " + ResTarget@63..67 + ColumnRef@63..67 + String@63..67 + Ident@63..67 "date" + Ascii44@67..68 "," + Whitespace@68..69 " " + ResTarget@69..77 + ColumnRef@69..77 + String@69..77 + Location@69..77 "location" + Newline@77..78 "\n" + Whitespace@78..86 " " + From@86..90 "FROM" + Whitespace@90..91 " " + RangeVar@91..98 + Ident@91..98 "weather" + Ascii44@98..99 "," + Whitespace@99..100 " " + RangeVar@100..106 + Ident@100..106 "cities" + Newline@106..107 "\n" + Whitespace@107..115 " " + Where@115..120 "WHERE" + Whitespace@120..121 " " + AExpr@121..132 + ColumnRef@121..125 + String@121..125 + Ident@121..125 "city" + Whitespace@125..126 " " + String@126..127 + Ascii61@126..127 "=" + Whitespace@127..128 " " + ColumnRef@128..132 + String@128..132 + NameP@128..132 "name" Ascii59@132..133 ";" - Newline@133..134 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0005.snap b/crates/parser/tests/snapshots/statements/valid/0005.snap index d99908db..bb3a6ef7 100644 --- a/crates/parser/tests/snapshots/statements/valid/0005.snap +++ b/crates/parser/tests/snapshots/statements/valid/0005.snap @@ -2,20 +2,26 @@ source: crates/parser/tests/statement_parser_test.rs description: "DELETE FROM weather WHERE city = 'Hayward';\n" --- -DeleteStmt@0..44 +DeleteStmt@0..43 DeleteP@0..6 "DELETE" Whitespace@6..7 " " From@7..11 "FROM" Whitespace@11..12 " " - Ident@12..19 "weather" + RangeVar@12..19 + Ident@12..19 "weather" Whitespace@19..20 " " Where@20..25 "WHERE" Whitespace@25..26 " " - Ident@26..30 "city" - Whitespace@30..31 " " - Ascii61@31..32 "=" - Whitespace@32..33 " " - Sconst@33..42 "'Hayward'" + AExpr@26..42 + ColumnRef@26..30 + String@26..30 + Ident@26..30 "city" + Whitespace@30..31 " " + String@31..32 + Ascii61@31..32 "=" + Whitespace@32..33 " " + AConst@33..42 + String@33..42 + Sconst@33..42 "'Hayward'" Ascii59@42..43 ";" - Newline@43..44 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0006.snap b/crates/parser/tests/snapshots/statements/valid/0006.snap index 37cfd337..81fda79f 100644 --- a/crates/parser/tests/snapshots/statements/valid/0006.snap +++ b/crates/parser/tests/snapshots/statements/valid/0006.snap @@ -2,12 +2,13 @@ source: crates/parser/tests/statement_parser_test.rs description: "DROP TABLE tablename;\n" --- -DropStmt@0..22 +DropStmt@0..21 Drop@0..4 "DROP" Whitespace@4..5 " " Table@5..10 "TABLE" Whitespace@10..11 " " - Ident@11..20 "tablename" + List@11..20 + String@11..20 + Ident@11..20 "tablename" Ascii59@20..21 ";" - Newline@21..22 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0007.snap b/crates/parser/tests/snapshots/statements/valid/0007.snap index 61ccfb22..f4235d2c 100644 --- a/crates/parser/tests/snapshots/statements/valid/0007.snap +++ b/crates/parser/tests/snapshots/statements/valid/0007.snap @@ -2,35 +2,44 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE TABLE cities (\n name text,\n population real,\n elevation int -- (in ft)\n);\n\n" --- -CreateStmt@0..96 +CreateStmt@0..94 Create@0..6 "CREATE" Whitespace@6..7 " " Table@7..12 "TABLE" Whitespace@12..13 " " - Ident@13..19 "cities" + RangeVar@13..19 + Ident@13..19 "cities" Whitespace@19..20 " " Ascii40@20..21 "(" Newline@21..22 "\n" Whitespace@22..24 " " - NameP@24..28 "name" - Whitespace@28..35 " " - TextP@35..39 "text" + ColumnDef@24..39 + NameP@24..28 "name" + Whitespace@28..35 " " + TypeName@35..39 + String@35..39 + TextP@35..39 "text" Ascii44@39..40 "," Newline@40..41 "\n" Whitespace@41..43 " " - Ident@43..53 "population" - Whitespace@53..54 " " - Real@54..58 "real" + ColumnDef@43..58 + Ident@43..53 "population" + Whitespace@53..54 " " + TypeName@54..58 + String@54..58 + Real@54..58 "real" Ascii44@58..59 "," Newline@59..60 "\n" Whitespace@60..62 " " - Ident@62..71 "elevation" - Whitespace@71..73 " " - IntP@73..76 "int" + ColumnDef@62..76 + Ident@62..71 "elevation" + Whitespace@71..73 " " + TypeName@73..76 + String@73..76 + IntP@73..76 "int" Whitespace@76..81 " " SqlComment@81..91 "-- (in ft)" Newline@91..92 "\n" Ascii41@92..93 ")" Ascii59@93..94 ";" - Newline@94..96 "\n\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0008.snap b/crates/parser/tests/snapshots/statements/valid/0008.snap index a33927f9..14a3c516 100644 --- a/crates/parser/tests/snapshots/statements/valid/0008.snap +++ b/crates/parser/tests/snapshots/statements/valid/0008.snap @@ -2,41 +2,55 @@ source: crates/parser/tests/statement_parser_test.rs description: "INSERT INTO weather (date, city, temp_hi, temp_lo)\n VALUES ('1994-11-29', 'Hayward', 54, 37);\n" --- -InsertStmt@0..97 +InsertStmt@0..96 Insert@0..6 "INSERT" Whitespace@6..7 " " Into@7..11 "INTO" Whitespace@11..12 " " - Ident@12..19 "weather" + RangeVar@12..19 + Ident@12..19 "weather" Whitespace@19..20 " " Ascii40@20..21 "(" - Ident@21..25 "date" + ResTarget@21..25 + Ident@21..25 "date" Ascii44@25..26 "," Whitespace@26..27 " " - Ident@27..31 "city" + ResTarget@27..31 + Ident@27..31 "city" Ascii44@31..32 "," Whitespace@32..33 " " - Ident@33..40 "temp_hi" + ResTarget@33..40 + Ident@33..40 "temp_hi" Ascii44@40..41 "," Whitespace@41..42 " " - Ident@42..49 "temp_lo" + ResTarget@42..49 + Ident@42..49 "temp_lo" Ascii41@49..50 ")" Newline@50..51 "\n" Whitespace@51..55 " " Values@55..61 "VALUES" Whitespace@61..62 " " Ascii40@62..63 "(" - Sconst@63..75 "'1994-11-29'" - Ascii44@75..76 "," - Whitespace@76..77 " " - Sconst@77..86 "'Hayward'" - Ascii44@86..87 "," - Whitespace@87..88 " " - Iconst@88..90 "54" - Ascii44@90..91 "," - Whitespace@91..92 " " - Iconst@92..94 "37" + SelectStmt@63..94 + List@63..94 + AConst@63..75 + String@63..75 + Sconst@63..75 "'1994-11-29'" + Ascii44@75..76 "," + Whitespace@76..77 " " + AConst@77..86 + String@77..86 + Sconst@77..86 "'Hayward'" + Ascii44@86..87 "," + Whitespace@87..88 " " + AConst@88..90 + Integer@88..90 + Iconst@88..90 "54" + Ascii44@90..91 "," + Whitespace@91..92 " " + AConst@92..94 + Integer@92..94 + Iconst@92..94 "37" Ascii41@94..95 ")" Ascii59@95..96 ";" - Newline@96..97 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0009.snap b/crates/parser/tests/snapshots/statements/valid/0009.snap index bf17d60a..fe1cba70 100644 --- a/crates/parser/tests/snapshots/statements/valid/0009.snap +++ b/crates/parser/tests/snapshots/statements/valid/0009.snap @@ -2,92 +2,137 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT w1.city, w1.temp_lo AS low, w1.temp_hi AS high,\n w2.city, w2.temp_lo AS low, w2.temp_hi AS high\n FROM weather w1 JOIN weather w2\n ON w1.temp_lo < w2.temp_lo AND w1.temp_hi > w2.temp_hi;\n" --- -SelectStmt@0..209 +SelectStmt@0..208 Select@0..6 "SELECT" Whitespace@6..7 " " - Ident@7..9 "w1" - Ascii46@9..10 "." - Ident@10..14 "city" + ResTarget@7..14 + ColumnRef@7..14 + String@7..9 + Ident@7..9 "w1" + Ascii46@9..10 "." + String@10..14 + Ident@10..14 "city" Ascii44@14..15 "," Whitespace@15..16 " " - Ident@16..18 "w1" - Ascii46@18..19 "." - Ident@19..26 "temp_lo" - Whitespace@26..27 " " - As@27..29 "AS" - Whitespace@29..30 " " - Ident@30..33 "low" + ResTarget@16..33 + ColumnRef@16..26 + String@16..18 + Ident@16..18 "w1" + Ascii46@18..19 "." + String@19..26 + Ident@19..26 "temp_lo" + Whitespace@26..27 " " + As@27..29 "AS" + Whitespace@29..30 " " + Ident@30..33 "low" Ascii44@33..34 "," Whitespace@34..35 " " - Ident@35..37 "w1" - Ascii46@37..38 "." - Ident@38..45 "temp_hi" - Whitespace@45..46 " " - As@46..48 "AS" - Whitespace@48..49 " " - Ident@49..53 "high" + ResTarget@35..53 + ColumnRef@35..45 + String@35..37 + Ident@35..37 "w1" + Ascii46@37..38 "." + String@38..45 + Ident@38..45 "temp_hi" + Whitespace@45..46 " " + As@46..48 "AS" + Whitespace@48..49 " " + Ident@49..53 "high" Ascii44@53..54 "," Newline@54..55 "\n" Whitespace@55..62 " " - Ident@62..64 "w2" - Ascii46@64..65 "." - Ident@65..69 "city" + ResTarget@62..69 + ColumnRef@62..69 + String@62..64 + Ident@62..64 "w2" + Ascii46@64..65 "." + String@65..69 + Ident@65..69 "city" Ascii44@69..70 "," Whitespace@70..71 " " - Ident@71..73 "w2" - Ascii46@73..74 "." - Ident@74..81 "temp_lo" - Whitespace@81..82 " " - As@82..84 "AS" - Whitespace@84..85 " " - Ident@85..88 "low" + ResTarget@71..88 + ColumnRef@71..81 + String@71..73 + Ident@71..73 "w2" + Ascii46@73..74 "." + String@74..81 + Ident@74..81 "temp_lo" + Whitespace@81..82 " " + As@82..84 "AS" + Whitespace@84..85 " " + Ident@85..88 "low" Ascii44@88..89 "," Whitespace@89..90 " " - Ident@90..92 "w2" - Ascii46@92..93 "." - Ident@93..100 "temp_hi" - Whitespace@100..101 " " - As@101..103 "AS" - Whitespace@103..104 " " - Ident@104..108 "high" + ResTarget@90..108 + ColumnRef@90..100 + String@90..92 + Ident@90..92 "w2" + Ascii46@92..93 "." + String@93..100 + Ident@93..100 "temp_hi" + Whitespace@100..101 " " + As@101..103 "AS" + Whitespace@103..104 " " + Ident@104..108 "high" Newline@108..109 "\n" Whitespace@109..113 " " From@113..117 "FROM" Whitespace@117..118 " " - Ident@118..125 "weather" - Whitespace@125..126 " " - Ident@126..128 "w1" - Whitespace@128..129 " " - Join@129..133 "JOIN" - Whitespace@133..134 " " - Ident@134..141 "weather" - Whitespace@141..142 " " - Ident@142..144 "w2" - Newline@144..145 "\n" - Whitespace@145..153 " " - On@153..155 "ON" - Whitespace@155..156 " " - Ident@156..158 "w1" - Ascii46@158..159 "." - Ident@159..166 "temp_lo" - Whitespace@166..167 " " - Ascii60@167..168 "<" - Whitespace@168..169 " " - Ident@169..171 "w2" - Ascii46@171..172 "." - Ident@172..179 "temp_lo" - Whitespace@179..180 " " - And@180..183 "AND" - Whitespace@183..184 " " - Ident@184..186 "w1" - Ascii46@186..187 "." - Ident@187..194 "temp_hi" - Whitespace@194..195 " " - Ascii62@195..196 ">" - Whitespace@196..197 " " - Ident@197..199 "w2" - Ascii46@199..200 "." - Ident@200..207 "temp_hi" + JoinExpr@118..207 + RangeVar@118..128 + Ident@118..125 "weather" + Whitespace@125..126 " " + Alias@126..128 + Ident@126..128 "w1" + Whitespace@128..129 " " + Join@129..133 "JOIN" + Whitespace@133..134 " " + RangeVar@134..144 + Ident@134..141 "weather" + Whitespace@141..142 " " + Alias@142..144 + Ident@142..144 "w2" + Newline@144..145 "\n" + Whitespace@145..153 " " + On@153..155 "ON" + Whitespace@155..156 " " + BoolExpr@156..207 + AExpr@156..179 + ColumnRef@156..166 + String@156..158 + Ident@156..158 "w1" + Ascii46@158..159 "." + String@159..166 + Ident@159..166 "temp_lo" + Whitespace@166..167 " " + String@167..168 + Ascii60@167..168 "<" + Whitespace@168..169 " " + ColumnRef@169..179 + String@169..171 + Ident@169..171 "w2" + Ascii46@171..172 "." + String@172..179 + Ident@172..179 "temp_lo" + Whitespace@179..180 " " + And@180..183 "AND" + Whitespace@183..184 " " + AExpr@184..207 + ColumnRef@184..194 + String@184..186 + Ident@184..186 "w1" + Ascii46@186..187 "." + String@187..194 + Ident@187..194 "temp_hi" + Whitespace@194..195 " " + String@195..196 + Ascii62@195..196 ">" + Whitespace@196..197 " " + ColumnRef@197..207 + String@197..199 + Ident@197..199 "w2" + Ascii46@199..200 "." + String@200..207 + Ident@200..207 "temp_hi" Ascii59@207..208 ";" - Newline@208..209 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0010.snap b/crates/parser/tests/snapshots/statements/valid/0010.snap index 402b5c07..bef5bc54 100644 --- a/crates/parser/tests/snapshots/statements/valid/0010.snap +++ b/crates/parser/tests/snapshots/statements/valid/0010.snap @@ -2,30 +2,42 @@ source: crates/parser/tests/statement_parser_test.rs description: "INSERT INTO weather VALUES ('San Francisco', 46, 50, 0.25, '1994-11-27');\n" --- -InsertStmt@0..74 +InsertStmt@0..73 Insert@0..6 "INSERT" Whitespace@6..7 " " Into@7..11 "INTO" Whitespace@11..12 " " - Ident@12..19 "weather" + RangeVar@12..19 + Ident@12..19 "weather" Whitespace@19..20 " " Values@20..26 "VALUES" Whitespace@26..27 " " Ascii40@27..28 "(" - Sconst@28..43 "'San Francisco'" - Ascii44@43..44 "," - Whitespace@44..45 " " - Iconst@45..47 "46" - Ascii44@47..48 "," - Whitespace@48..49 " " - Iconst@49..51 "50" - Ascii44@51..52 "," - Whitespace@52..53 " " - Fconst@53..57 "0.25" - Ascii44@57..58 "," - Whitespace@58..59 " " - Sconst@59..71 "'1994-11-27'" + SelectStmt@28..71 + List@28..71 + AConst@28..43 + String@28..43 + Sconst@28..43 "'San Francisco'" + Ascii44@43..44 "," + Whitespace@44..45 " " + AConst@45..47 + Integer@45..47 + Iconst@45..47 "46" + Ascii44@47..48 "," + Whitespace@48..49 " " + AConst@49..51 + Integer@49..51 + Iconst@49..51 "50" + Ascii44@51..52 "," + Whitespace@52..53 " " + AConst@53..57 + Float@53..57 + Fconst@53..57 "0.25" + Ascii44@57..58 "," + Whitespace@58..59 " " + AConst@59..71 + String@59..71 + Sconst@59..71 "'1994-11-27'" Ascii41@71..72 ")" Ascii59@72..73 ";" - Newline@73..74 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0011.snap b/crates/parser/tests/snapshots/statements/valid/0011.snap index 48d7b674..2f2bb2d8 100644 --- a/crates/parser/tests/snapshots/statements/valid/0011.snap +++ b/crates/parser/tests/snapshots/statements/valid/0011.snap @@ -2,24 +2,30 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT DISTINCT city\n FROM weather\n ORDER BY city;\n" --- -SelectStmt@0..57 +SelectStmt@0..56 Select@0..6 "SELECT" Whitespace@6..7 " " Distinct@7..15 "DISTINCT" Whitespace@15..16 " " - Ident@16..20 "city" + ResTarget@16..20 + ColumnRef@16..20 + String@16..20 + Ident@16..20 "city" Newline@20..21 "\n" Whitespace@21..25 " " From@25..29 "FROM" Whitespace@29..30 " " - Ident@30..37 "weather" + RangeVar@30..37 + Ident@30..37 "weather" Newline@37..38 "\n" Whitespace@38..42 " " - Order@42..47 "ORDER" - Whitespace@47..48 " " - By@48..50 "BY" - Whitespace@50..51 " " - Ident@51..55 "city" + SortBy@42..55 + Order@42..47 "ORDER" + Whitespace@47..48 " " + By@48..50 "BY" + Whitespace@50..51 " " + ColumnRef@51..55 + String@51..55 + Ident@51..55 "city" Ascii59@55..56 ";" - Newline@56..57 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0012.snap.new b/crates/parser/tests/snapshots/statements/valid/0012.snap.new new file mode 100644 index 00000000..f956dd31 --- /dev/null +++ b/crates/parser/tests/snapshots/statements/valid/0012.snap.new @@ -0,0 +1,61 @@ +--- +source: crates/parser/tests/statement_parser_test.rs +assertion_line: 39 +description: "CREATE TABLE measurement_y2008m01 PARTITION OF measurement\n FOR VALUES FROM ('2008-01-01') TO ('2008-02-01')\n WITH (parallel_workers = 4)\n TABLESPACE fasttablespace;\n" +--- +CreateStmt@0..174 + Create@0..6 "CREATE" + Whitespace@6..7 " " + Table@7..12 "TABLE" + Whitespace@12..13 " " + RangeVar@13..33 + Ident@13..33 "measurement_y2008m01" + Whitespace@33..34 " " + Partition@34..43 "PARTITION" + Whitespace@43..44 " " + Of@44..46 "OF" + Whitespace@46..47 " " + RangeVar@47..58 + Ident@47..58 "measurement" + Newline@58..59 "\n" + Whitespace@59..63 " " + For@63..66 "FOR" + Whitespace@66..67 " " + Values@67..73 "VALUES" + Whitespace@73..74 " " + PartitionBoundSpec@74..111 + From@74..78 "FROM" + Whitespace@78..79 " " + Ascii40@79..80 "(" + AConst@80..92 + String@80..92 + Sconst@80..92 "'2008-01-01'" + Ascii41@92..93 ")" + Whitespace@93..94 " " + To@94..96 "TO" + Whitespace@96..97 " " + Ascii40@97..98 "(" + AConst@98..110 + String@98..110 + Sconst@98..110 "'2008-02-01'" + Ascii41@110..111 ")" + Newline@111..112 "\n" + Whitespace@112..116 " " + With@116..120 "WITH" + Whitespace@120..121 " " + Ascii40@121..122 "(" + DefElem@122..142 + Ident@122..138 "parallel_workers" + Whitespace@138..139 " " + Ascii61@139..140 "=" + Whitespace@140..141 " " + Integer@141..142 + Iconst@141..142 "4" + Ascii41@142..143 ")" + Newline@143..144 "\n" + Whitespace@144..148 " " + Tablespace@148..158 "TABLESPACE" + Whitespace@158..159 " " + Ident@159..173 "fasttablespace" + Ascii59@173..174 ";" + diff --git a/crates/parser/tests/snapshots/statements/valid/0013.snap.new b/crates/parser/tests/snapshots/statements/valid/0013.snap.new new file mode 100644 index 00000000..2d1f7551 --- /dev/null +++ b/crates/parser/tests/snapshots/statements/valid/0013.snap.new @@ -0,0 +1,65 @@ +--- +source: crates/parser/tests/statement_parser_test.rs +assertion_line: 39 +description: "UPDATE weather\n SET temp_hi = temp_hi - 2, temp_lo = temp_lo - 2\n WHERE date > '1994-11-28';\n" +--- +UpdateStmt@0..99 + Update@0..6 "UPDATE" + Whitespace@6..7 " " + RangeVar@7..14 + Ident@7..14 "weather" + Newline@14..15 "\n" + Whitespace@15..19 " " + Set@19..22 "SET" + Whitespace@22..23 " " + ResTarget@23..44 + Ident@23..30 "temp_hi" + Whitespace@30..31 " " + Ascii61@31..32 "=" + Whitespace@32..33 " " + AExpr@33..44 + ColumnRef@33..40 + String@33..40 + Ident@33..40 "temp_hi" + Whitespace@40..41 " " + String@41..42 + Ascii45@41..42 "-" + Whitespace@42..43 " " + AConst@43..44 + Integer@43..44 + Iconst@43..44 "2" + Ascii44@44..45 "," + Whitespace@45..47 " " + ResTarget@47..68 + Ident@47..54 "temp_lo" + Whitespace@54..55 " " + Ascii61@55..56 "=" + Whitespace@56..57 " " + AExpr@57..68 + ColumnRef@57..64 + String@57..64 + Ident@57..64 "temp_lo" + Whitespace@64..65 " " + String@65..66 + Ascii45@65..66 "-" + Whitespace@66..67 " " + AConst@67..68 + Integer@67..68 + Iconst@67..68 "2" + Newline@68..69 "\n" + Whitespace@69..73 " " + Where@73..78 "WHERE" + Whitespace@78..79 " " + AExpr@79..98 + ColumnRef@79..83 + String@79..83 + Ident@79..83 "date" + Whitespace@83..84 " " + String@84..85 + Ascii62@84..85 ">" + Whitespace@85..86 " " + AConst@86..98 + String@86..98 + Sconst@86..98 "'1994-11-28'" + Ascii59@98..99 ";" + diff --git a/crates/parser/tests/snapshots/statements/valid/0014.snap b/crates/parser/tests/snapshots/statements/valid/0014.snap index 80ceaf6d..490b0142 100644 --- a/crates/parser/tests/snapshots/statements/valid/0014.snap +++ b/crates/parser/tests/snapshots/statements/valid/0014.snap @@ -2,55 +2,73 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT sum(salary) OVER w, avg(salary) OVER w\n FROM empsalary\n WINDOW w AS (PARTITION BY depname ORDER BY salary DESC);\n" --- -SelectStmt@0..122 +SelectStmt@0..121 Select@0..6 "SELECT" Whitespace@6..7 " " - Ident@7..10 "sum" - Ascii40@10..11 "(" - Ident@11..17 "salary" - Ascii41@17..18 ")" - Whitespace@18..19 " " - Over@19..23 "OVER" - Whitespace@23..24 " " - Ident@24..25 "w" - Ascii44@25..26 "," - Whitespace@26..27 " " - Ident@27..30 "avg" - Ascii40@30..31 "(" - Ident@31..37 "salary" - Ascii41@37..38 ")" - Whitespace@38..39 " " - Over@39..43 "OVER" - Whitespace@43..44 " " - Ident@44..45 "w" - Newline@45..46 "\n" - Whitespace@46..48 " " - From@48..52 "FROM" - Whitespace@52..53 " " - Ident@53..62 "empsalary" - Newline@62..63 "\n" - Whitespace@63..65 " " - Window@65..71 "WINDOW" - Whitespace@71..72 " " - Ident@72..73 "w" - Whitespace@73..74 " " - As@74..76 "AS" - Whitespace@76..77 " " - Ascii40@77..78 "(" - Partition@78..87 "PARTITION" - Whitespace@87..88 " " - By@88..90 "BY" - Whitespace@90..91 " " - Ident@91..98 "depname" - Whitespace@98..99 " " - Order@99..104 "ORDER" - Whitespace@104..105 " " - By@105..107 "BY" - Whitespace@107..108 " " - Ident@108..114 "salary" - Whitespace@114..115 " " - Desc@115..119 "DESC" - Ascii41@119..120 ")" + ResTarget@7..120 + FuncCall@7..120 + String@7..10 + Ident@7..10 "sum" + Ascii40@10..11 "(" + ColumnRef@11..17 + String@11..17 + Ident@11..17 "salary" + Ascii41@17..18 ")" + Whitespace@18..19 " " + Over@19..23 "OVER" + Whitespace@23..24 " " + WindowDef@24..120 + Ident@24..25 "w" + Ascii44@25..26 "," + Whitespace@26..27 " " + ResTarget@27..120 + FuncCall@27..120 + String@27..30 + Ident@27..30 "avg" + Ascii40@30..31 "(" + ColumnRef@31..37 + String@31..37 + Ident@31..37 "salary" + Ascii41@37..38 ")" + Whitespace@38..39 " " + Over@39..43 "OVER" + Whitespace@43..44 " " + WindowDef@44..120 + Ident@44..45 "w" + Newline@45..46 "\n" + Whitespace@46..48 " " + From@48..52 "FROM" + Whitespace@52..53 " " + RangeVar@53..62 + Ident@53..62 "empsalary" + Newline@62..63 "\n" + Whitespace@63..65 " " + Window@65..71 "WINDOW" + Whitespace@71..72 " " + Ident@72..73 "w" + Whitespace@73..74 " " + As@74..76 "AS" + Whitespace@76..77 " " + WindowDef@77..120 + Ascii40@77..78 "(" + Partition@78..87 "PARTITION" + Whitespace@87..88 " " + By@88..90 "BY" + Whitespace@90..91 " " + ColumnRef@91..98 + String@91..98 + Ident@91..98 "depname" + Whitespace@98..99 " " + SortBy@99..119 + Order@99..104 "ORDER" + Whitespace@104..105 " " + By@105..107 "BY" + Whitespace@107..108 " " + ColumnRef@108..114 + String@108..114 + Ident@108..114 "salary" + Whitespace@114..115 " " + Desc@115..119 "DESC" + Ascii41@119..120 ")" Ascii59@120..121 ";" - Newline@121..122 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0015.snap.new b/crates/parser/tests/snapshots/statements/valid/0015.snap.new new file mode 100644 index 00000000..89d0aa08 --- /dev/null +++ b/crates/parser/tests/snapshots/statements/valid/0015.snap.new @@ -0,0 +1,80 @@ +--- +source: crates/parser/tests/statement_parser_test.rs +assertion_line: 39 +description: "SELECT\n count(*) AS unfiltered,\n count(*) FILTER (WHERE i < 5) AS filtered\nFROM generate_series(1,10) AS s(i);\n" +--- +SelectStmt@0..116 + Select@0..6 "SELECT" + Newline@6..7 "\n" + Whitespace@7..11 " " + ResTarget@11..33 + FuncCall@11..16 + String@11..16 + Ident@11..16 "count" + Ascii40@16..17 "(" + Ascii42@17..18 "*" + Ascii41@18..19 ")" + Whitespace@19..20 " " + As@20..22 "AS" + Whitespace@22..23 " " + Ident@23..33 "unfiltered" + Ascii44@33..34 "," + Newline@34..35 "\n" + Whitespace@35..39 " " + ResTarget@39..112 + FuncCall@39..112 + String@39..44 + Ident@39..44 "count" + Ascii40@44..45 "(" + Ascii42@45..46 "*" + Ascii41@46..47 ")" + Whitespace@47..48 " " + Filter@48..54 "FILTER" + Whitespace@54..55 " " + Ascii40@55..56 "(" + Where@56..61 "WHERE" + Whitespace@61..62 " " + RangeFunction@62..112 + Alias@62..112 + String@62..67 + AExpr@62..67 + ColumnRef@62..63 + String@62..63 + Ident@62..63 "i" + Whitespace@63..64 " " + String@64..65 + Ascii60@64..65 "<" + Whitespace@65..66 " " + AConst@66..67 + Integer@66..67 + Iconst@66..67 "5" + Ascii41@67..68 ")" + Whitespace@68..69 " " + As@69..71 "AS" + Whitespace@71..72 " " + Ident@72..80 "filtered" + Newline@80..81 "\n" + From@81..85 "FROM" + Whitespace@85..86 " " + List@86..107 + FuncCall@86..107 + String@86..101 + Ident@86..101 "generate_series" + Ascii40@101..102 "(" + AConst@102..103 + Integer@102..103 + Iconst@102..103 "1" + Ascii44@103..104 "," + AConst@104..106 + Integer@104..106 + Iconst@104..106 "10" + Ascii41@106..107 ")" + Whitespace@107..108 " " + As@108..110 "AS" + Whitespace@110..111 " " + Ident@111..112 "s" + Ascii40@112..113 "(" + Ident@113..114 "i" + Ascii41@114..115 ")" + Ascii59@115..116 ";" + diff --git a/crates/parser/tests/statement_parser_test.rs b/crates/parser/tests/statement_parser_test.rs index 77542df1..7ffbaf56 100644 --- a/crates/parser/tests/statement_parser_test.rs +++ b/crates/parser/tests/statement_parser_test.rs @@ -1,6 +1,7 @@ use std::fs; mod common; use insta; +use log::debug; use parser::Parser; const VALID_STATEMENTS_PATH: &str = "tests/data/statements/valid/"; @@ -22,6 +23,8 @@ fn valid_statements() { let contents = fs::read_to_string(&path).unwrap(); + debug!("Parsing statement: {}", test_name); + let mut parser = Parser::new(); parser.parse_statement_at(&contents, None); let parsed = parser.finish(); From f6cc9b018abb41b007dd947961f05fae2920f46b Mon Sep 17 00:00:00 2001 From: psteinroe Date: Thu, 12 Oct 2023 22:13:38 +0200 Subject: [PATCH 13/16] fix: minor fixes while making tests green --- crates/codegen/src/get_child_token_range.rs | 110 +++++++--- crates/parser/src/estimate_node_range.rs | 190 +++++++++++------- crates/parser/src/source_parser.rs | 2 +- crates/parser/src/statement_parser.rs | 42 ++++ .../snapshots/statements/valid/0012.snap | 48 +++-- .../snapshots/statements/valid/0012.snap.new | 61 ------ .../snapshots/statements/valid/0013.snap | 72 ++++--- .../snapshots/statements/valid/0013.snap.new | 65 ------ .../snapshots/statements/valid/0014.snap | 112 +++++------ .../snapshots/statements/valid/0015.snap | 107 ++++++---- .../snapshots/statements/valid/0015.snap.new | 80 -------- .../snapshots/statements/valid/0016.snap | 36 ++-- .../snapshots/statements/valid/0017.snap | 76 ++++--- .../snapshots/statements/valid/0018.snap | 34 ++-- .../snapshots/statements/valid/0019.snap | 73 ++++--- .../snapshots/statements/valid/0020.snap | 69 ++++--- .../snapshots/statements/valid/0021.snap | 59 +++--- .../snapshots/statements/valid/0022.snap | 44 ++-- .../snapshots/statements/valid/0023.snap | 121 +++++++---- .../snapshots/statements/valid/0024.snap | 76 ++++--- .../snapshots/statements/valid/0025.snap | 24 ++- .../snapshots/statements/valid/0026.snap | 34 ++-- .../snapshots/statements/valid/0027.snap | 12 +- .../snapshots/statements/valid/0028.snap | 6 +- .../snapshots/statements/valid/0029.snap | 33 +-- .../snapshots/statements/valid/0030.snap | 24 ++- .../snapshots/statements/valid/0031.snap | 115 ++++++----- .../snapshots/statements/valid/0032.snap | 11 +- .../snapshots/statements/valid/0033.snap | 62 +++--- .../snapshots/statements/valid/0034.snap | 32 ++- .../snapshots/statements/valid/0035.snap | 32 ++- .../snapshots/statements/valid/0036.snap | 46 +++-- 32 files changed, 1074 insertions(+), 834 deletions(-) delete mode 100644 crates/parser/tests/snapshots/statements/valid/0012.snap.new delete mode 100644 crates/parser/tests/snapshots/statements/valid/0013.snap.new delete mode 100644 crates/parser/tests/snapshots/statements/valid/0015.snap.new diff --git a/crates/codegen/src/get_child_token_range.rs b/crates/codegen/src/get_child_token_range.rs index 40c9c4fe..fed68c2f 100644 --- a/crates/codegen/src/get_child_token_range.rs +++ b/crates/codegen/src/get_child_token_range.rs @@ -144,17 +144,22 @@ pub fn get_child_token_range_mod(_item: proc_macro2::TokenStream) -> proc_macro2 return vec![text]; } - pub struct ChildTokenRange { - /// the .start of all child tokens used to estimate the range - pub child_token_indices: Vec, - pub range: Option + #[derive(Debug)] + pub enum ChildTokenRangeResult { + TooManyTokens, + NoTokens, + /// indices are the .start of all child tokens used to estimate the range + ChildTokenRange { used_token_indices: Vec, range: TextRange }, } - pub fn get_child_token_range(node: &NodeEnum, tokens: Vec<&ScanToken>, text: &str, nearest_parent_location: u32) -> ChildTokenRange { - let mut child_tokens = Vec::new(); + pub fn get_child_token_range(node: &NodeEnum, tokens: Vec<&ScanToken>, text: &str, nearest_parent_location: Option) -> ChildTokenRangeResult { + let mut child_tokens: Vec<&ScanToken> = Vec::new(); + + // if true, we found more than one valid token for at least one property of the node + let mut has_too_many_tokens: bool = false; let mut get_token = |property: TokenProperty| { - let token = tokens + let possible_tokens = tokens .iter() .filter_map(|t| { if property.token.is_some() { @@ -178,42 +183,59 @@ pub fn get_child_token_range_mod(_item: proc_macro2::TokenStream) -> proc_macro2 } } - // if the token is before the nearest parent location, we can safely ignore it - // if not, we calculate the distance to the nearest parent location - let distance = t.start - nearest_parent_location as i32; - if distance >= 0 { - Some((distance, t)) - } else { - None - } + Some(t) }) - // and use the token with the smallest distance to the nearest parent location - .min_by_key(|(d, _)| d.to_owned()) - .map(|(_, t)| t); + .collect::>(); - if token.is_some() { - child_tokens.push(token.unwrap()); - } else { + if possible_tokens.len() == 0 { debug!( "No matching token found for property {:#?} of node {:#?} in {:#?} with tokens {:#?}", property, node, text, tokens ); + return; + } + + if possible_tokens.len() == 1 { + debug!( + "Found token {:#?} for property {:#?} of node {:#?}", + possible_tokens[0], property, node + ); + child_tokens.push(possible_tokens[0]); + return; } + + if nearest_parent_location.is_none() { + debug!("Found {:#?} for property {:#?} and no nearest_parent_location set", possible_tokens, property); + has_too_many_tokens = true; + return; + } + + let token = possible_tokens + .iter().map(|t| ((nearest_parent_location.unwrap() as i32 - t.start), t)) + .min_by_key(|(d, _)| d.to_owned()) + .map(|(_, t)| t); + + debug!("Selected {:#?} as token closest from parent {:#?} as location {:#?}", token.unwrap(), node, nearest_parent_location); + + child_tokens.push(token.unwrap()); }; match node { #(NodeEnum::#node_identifiers(n) => {#node_handlers}),*, }; - ChildTokenRange { - child_token_indices: child_tokens.iter().map(|t| t.start).collect(), - range: if child_tokens.len() > 0 { - Some(TextRange::new( + + if has_too_many_tokens == true { + ChildTokenRangeResult::TooManyTokens + } else if child_tokens.len() == 0 { + ChildTokenRangeResult::NoTokens + } else { + ChildTokenRangeResult::ChildTokenRange { + used_token_indices: child_tokens.iter().map(|t| t.start).collect(), + range: TextRange::new( TextSize::from(child_tokens.iter().min_by_key(|t| t.start).unwrap().start as u32), TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32), - )) - } else { - None + ) } } } @@ -252,6 +274,13 @@ fn custom_handlers(node: &Node) -> TokenStream { "Integer" => quote! { get_token(TokenProperty::from(n)); }, + "WindowDef" => quote! { + if n.partition_clause.len() > 0 { + get_token(TokenProperty::from(Token::Window)); + } else { + get_token(TokenProperty::from(Token::Over)); + } + }, "Boolean" => quote! { get_token(TokenProperty::from(n)); }, @@ -263,6 +292,28 @@ fn custom_handlers(node: &Node) -> TokenStream { get_token(TokenProperty::from(Token::Filter)); } }, + "SqlvalueFunction" => quote! { + match n.op { + // 1 SvfopCurrentDate + // 2 SvfopCurrentTime + // 3 SvfopCurrentTimeN + // 4 SvfopCurrentTimestamp + // 5 SvfopCurrentTimestampN + // 6 SvfopLocaltime + // 7 SvfopLocaltimeN + // 8 SvfopLocaltimestamp + // 9 SvfopLocaltimestampN + // 10 SvfopCurrentRole + 10 => get_token(TokenProperty::from(Token::CurrentRole)), + // 11 SvfopCurrentUser + 11 => get_token(TokenProperty::from(Token::CurrentUser)), + // 12 SvfopUser + // 13 SvfopSessionUser + // 14 SvfopCurrentCatalog + // 15 SvfopCurrentSchema + _ => panic!("Unknown SqlvalueFunction {:#?}", n.op), + } + }, "SortBy" => quote! { get_token(TokenProperty::from(Token::Order)); match n.sortby_dir { @@ -271,9 +322,6 @@ fn custom_handlers(node: &Node) -> TokenStream { _ => {} } }, - "WindowDef" => quote! { - get_token(TokenProperty::from(Token::Partition)); - }, "AConst" => quote! { if n.isnull { get_token(TokenProperty::from(Token::NullP)); diff --git a/crates/parser/src/estimate_node_range.rs b/crates/parser/src/estimate_node_range.rs index 166b2723..a55ffb65 100644 --- a/crates/parser/src/estimate_node_range.rs +++ b/crates/parser/src/estimate_node_range.rs @@ -1,11 +1,10 @@ -use std::cmp::{max, min}; +use std::cmp::max; -use crate::get_child_token_range_codegen::get_child_token_range; +use crate::get_child_token_range_codegen::{get_child_token_range, ChildTokenRangeResult}; use crate::get_location_codegen::get_location; use crate::get_nodes_codegen::Node; use cstree::text::{TextRange, TextSize}; -use log::debug; -use pg_query::{protobuf::ScanToken, protobuf::Token, NodeEnum}; +use pg_query::protobuf::ScanToken; #[derive(Debug, Clone)] pub struct RangedNode { @@ -19,73 +18,110 @@ pub fn estimate_node_range( tokens: &Vec, text: &str, ) -> Vec { - let mut ranged_nodes: Vec = Vec::new(); - - let mut used_tokens: Vec = Vec::new(); - // ensure that all children of any given node are already processed before processing the node itself nodes.sort_by(|a, b| b.path.cmp(&a.path)); - // we get an estimated range by searching for tokens that match the node property values - // and, if available, the `location` of the node itself - nodes.iter().for_each(|n| { - // first, get the estimated boundaries of the node based on the `location` property of a node - let nearest_parent_location = get_nearest_parent_location(&n, nodes); + // first get ranges only from child tokens + let mut used_tokens: Vec = Vec::new(); + let mut child_token_ranges: Vec> = Vec::new(); + let mut too_many_tokens_at: Vec = Vec::new(); - let child_token_range = get_child_token_range( + nodes.iter().for_each(|n| { + match get_child_token_range( &n.node, tokens .iter() .filter(|t| !used_tokens.contains(&t.start)) .collect(), text, - nearest_parent_location, - ); + None, + ) { + ChildTokenRangeResult::TooManyTokens => { + too_many_tokens_at.push(nodes.iter().position(|x| x.path == n.path).unwrap()); + child_token_ranges.push(None); + } + ChildTokenRangeResult::ChildTokenRange { + used_token_indices, + range, + } => { + used_tokens.extend(used_token_indices); + child_token_ranges.push(Some(range)); + } + ChildTokenRangeResult::NoTokens => { + child_token_ranges.push(None); + } + }; + }); + + // second iteration using the nearest parent from the first + for idx in too_many_tokens_at { + // get the nearest parent location + let nearest_parent_start = + get_nearest_parent_start(&nodes[idx], &nodes, &child_token_ranges); + let nearest_parent_location = get_nearest_parent_location(&nodes[idx], &nodes); + + match get_child_token_range( + &nodes[idx].node, + tokens + .iter() + .filter(|t| !used_tokens.contains(&t.start)) + .collect(), + text, + Some(max(nearest_parent_start, nearest_parent_location)), + ) { + ChildTokenRangeResult::ChildTokenRange { + used_token_indices, + range, + } => { + used_tokens.extend(used_token_indices); + child_token_ranges[idx] = Some(range) + } + _ => {} + }; + } + + let mut ranged_nodes: Vec = Vec::new(); + + // we get an estimated range by searching for tokens that match the node property values + // and, if available, the `location` of the node itself + nodes.iter().enumerate().for_each(|(idx, n)| { + let child_token_range = child_token_ranges[idx]; - used_tokens.extend(child_token_range.child_token_indices); + println!("node: {:#?}, child_token_range: {:?}", n, child_token_range); - // For `from`, the location of the node itself is always correct. - // If not available, the closest estimation is the smaller value of the start of the first direct child token, - // and the start of all children ranges. If neither is available, let’s panic for now. - // The parent location as a fallback should never be required, because any node must have either children with tokens, or a token itself. let child_node_ranges = ranged_nodes .iter() .filter(|x| x.inner.path.starts_with(n.path.as_str())) .collect::>(); - let location = get_location(&n.node); - let from = if location.is_some() { - Some(TextSize::from(location.unwrap())) + + // get `from` location + let node_location = match get_location(&n.node) { + Some(l) => Some(TextSize::from(l)), + None => None, + }; + let start_of_all_children_ranges = if child_node_ranges.len() > 0 { + Some( + child_node_ranges + .iter() + .min_by_key(|n| n.range.start()) + .unwrap() + .range + .start(), + ) } else { - let start_of_all_children_ranges = if child_node_ranges.len() > 0 { - Some( - child_node_ranges - .iter() - .min_by_key(|n| n.range.start()) - .unwrap() - .range - .start(), - ) - } else { - None - }; - - if child_token_range.range.is_some() { - let start_of_first_child_token = child_token_range.range.unwrap().start(); - if start_of_all_children_ranges.is_some() { - Some(min( - start_of_first_child_token, - start_of_all_children_ranges.unwrap(), - )) - } else { - Some(start_of_first_child_token) - } - } else if start_of_all_children_ranges.is_some() { - Some(start_of_all_children_ranges.unwrap()) - } else { - debug!("No location or child tokens found for node {:?}", n); - None - } + None }; + let start_of_first_child_token = match child_token_range { + Some(r) => Some(r.start()), + None => None, + }; + + let from_locations: [Option; 3] = [ + node_location, + start_of_all_children_ranges, + start_of_first_child_token, + ]; + let from = from_locations.iter().filter(|v| v.is_some()).min(); // For `to`, it’s the larger value of the end of the last direkt child token, and the end of all children ranges. let end_of_all_children_ranges = if child_node_ranges.len() > 0 { @@ -100,27 +136,18 @@ pub fn estimate_node_range( } else { None }; - let to = if child_token_range.range.is_some() { - let end_of_last_child_token = child_token_range.range.unwrap().end(); - if end_of_all_children_ranges.is_some() { - Some(max( - end_of_last_child_token, - end_of_all_children_ranges.unwrap(), - )) - } else { - Some(end_of_last_child_token) - } - } else if end_of_all_children_ranges.is_some() { - Some(end_of_all_children_ranges.unwrap()) - } else { - debug!("No child tokens or children ranges found for node {:?}", n); - None + let end_of_last_child_token = match child_token_range { + Some(r) => Some(r.end()), + None => None, }; + let to_locations: [Option; 2] = + [end_of_all_children_ranges, end_of_last_child_token]; + let to = to_locations.iter().filter(|v| v.is_some()).max(); if from.is_some() && to.is_some() { ranged_nodes.push(RangedNode { inner: n.to_owned(), - range: TextRange::new(from.unwrap(), to.unwrap()), + range: TextRange::new(from.unwrap().unwrap(), to.unwrap().unwrap()), }); } }); @@ -131,6 +158,29 @@ pub fn estimate_node_range( ranged_nodes } +fn get_nearest_parent_start( + node: &Node, + nodes: &Vec, + child_token_ranges: &Vec>, +) -> u32 { + let mut path_elements = node.path.split(".").collect::>(); + path_elements.pop(); + while path_elements.len() > 0 { + let parent_path = path_elements.join("."); + let parent_idx = nodes.iter().position(|c| c.path == parent_path); + if parent_idx.is_some() { + if child_token_ranges[parent_idx.unwrap()].is_some() { + return u32::from(child_token_ranges[parent_idx.unwrap()].unwrap().start()); + } + } + + path_elements.pop(); + } + + // fallback to 0 + 0 +} + fn get_nearest_parent_location(n: &Node, children: &Vec) -> u32 { // if location is set, return it let location = get_location(&n.node); @@ -155,7 +205,7 @@ fn get_nearest_parent_location(n: &Node, children: &Vec) -> u32 { } // fallback to 0 - return 0; + 0 } #[cfg(test)] diff --git a/crates/parser/src/source_parser.rs b/crates/parser/src/source_parser.rs index 98e4c91f..d59acc44 100644 --- a/crates/parser/src/source_parser.rs +++ b/crates/parser/src/source_parser.rs @@ -75,7 +75,7 @@ fn tokens(input: &str) -> Vec { } impl Parser { - fn parse_source_at(&mut self, text: &str, at_offset: Option) { + pub fn parse_source_at(&mut self, text: &str, at_offset: Option) { let offset = at_offset.unwrap_or(0); let tokens = tokens(&text); diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index 89c3c99c..46e7126d 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -357,6 +357,48 @@ mod tests { parser.parse_statement_at(input, None); let parsed = parser.finish(); + assert_eq!(parsed.cst.text(), input); + } + + #[test] + fn test_window_call() { + init(); + + let input = + "SELECT sum(salary) OVER w FROM empsalary WINDOW w AS (PARTITION BY depname ORDER BY salary DESC);"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + assert_eq!(parsed.cst.text(), input); + } + + #[test] + fn test_access_priv() { + init(); + + let input = "GRANT SELECT (col1), UPDATE (col1) ON mytable TO miriam_rw;"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + + dbg!(&parsed.cst); + + assert_eq!(parsed.cst.text(), input); + } + + #[test] + fn test_create_policy() { + init(); + + let input = "CREATE POLICY account_managers ON accounts TO managers USING (manager = current_user);"; + + let mut parser = Parser::new(); + parser.parse_statement_at(input, None); + let parsed = parser.finish(); + dbg!(&parsed.cst); assert_eq!(parsed.cst.text(), input); diff --git a/crates/parser/tests/snapshots/statements/valid/0012.snap b/crates/parser/tests/snapshots/statements/valid/0012.snap index f4d77ae0..ad8b591b 100644 --- a/crates/parser/tests/snapshots/statements/valid/0012.snap +++ b/crates/parser/tests/snapshots/statements/valid/0012.snap @@ -2,45 +2,54 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE TABLE measurement_y2008m01 PARTITION OF measurement\n FOR VALUES FROM ('2008-01-01') TO ('2008-02-01')\n WITH (parallel_workers = 4)\n TABLESPACE fasttablespace;\n" --- -CreateStmt@0..175 +CreateStmt@0..174 Create@0..6 "CREATE" Whitespace@6..7 " " Table@7..12 "TABLE" Whitespace@12..13 " " - Ident@13..33 "measurement_y2008m01" + RangeVar@13..33 + Ident@13..33 "measurement_y2008m01" Whitespace@33..34 " " Partition@34..43 "PARTITION" Whitespace@43..44 " " Of@44..46 "OF" Whitespace@46..47 " " - Ident@47..58 "measurement" + RangeVar@47..58 + Ident@47..58 "measurement" Newline@58..59 "\n" Whitespace@59..63 " " For@63..66 "FOR" Whitespace@66..67 " " Values@67..73 "VALUES" Whitespace@73..74 " " - From@74..78 "FROM" - Whitespace@78..79 " " - Ascii40@79..80 "(" - Sconst@80..92 "'2008-01-01'" - Ascii41@92..93 ")" - Whitespace@93..94 " " - To@94..96 "TO" - Whitespace@96..97 " " - Ascii40@97..98 "(" - Sconst@98..110 "'2008-02-01'" - Ascii41@110..111 ")" + PartitionBoundSpec@74..111 + From@74..78 "FROM" + Whitespace@78..79 " " + Ascii40@79..80 "(" + AConst@80..92 + String@80..92 + Sconst@80..92 "'2008-01-01'" + Ascii41@92..93 ")" + Whitespace@93..94 " " + To@94..96 "TO" + Whitespace@96..97 " " + Ascii40@97..98 "(" + AConst@98..110 + String@98..110 + Sconst@98..110 "'2008-02-01'" + Ascii41@110..111 ")" Newline@111..112 "\n" Whitespace@112..116 " " With@116..120 "WITH" Whitespace@120..121 " " Ascii40@121..122 "(" - Ident@122..138 "parallel_workers" - Whitespace@138..139 " " - Ascii61@139..140 "=" - Whitespace@140..141 " " - Iconst@141..142 "4" + DefElem@122..142 + Ident@122..138 "parallel_workers" + Whitespace@138..139 " " + Ascii61@139..140 "=" + Whitespace@140..141 " " + Integer@141..142 + Iconst@141..142 "4" Ascii41@142..143 ")" Newline@143..144 "\n" Whitespace@144..148 " " @@ -48,5 +57,4 @@ CreateStmt@0..175 Whitespace@158..159 " " Ident@159..173 "fasttablespace" Ascii59@173..174 ";" - Newline@174..175 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0012.snap.new b/crates/parser/tests/snapshots/statements/valid/0012.snap.new deleted file mode 100644 index f956dd31..00000000 --- a/crates/parser/tests/snapshots/statements/valid/0012.snap.new +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: crates/parser/tests/statement_parser_test.rs -assertion_line: 39 -description: "CREATE TABLE measurement_y2008m01 PARTITION OF measurement\n FOR VALUES FROM ('2008-01-01') TO ('2008-02-01')\n WITH (parallel_workers = 4)\n TABLESPACE fasttablespace;\n" ---- -CreateStmt@0..174 - Create@0..6 "CREATE" - Whitespace@6..7 " " - Table@7..12 "TABLE" - Whitespace@12..13 " " - RangeVar@13..33 - Ident@13..33 "measurement_y2008m01" - Whitespace@33..34 " " - Partition@34..43 "PARTITION" - Whitespace@43..44 " " - Of@44..46 "OF" - Whitespace@46..47 " " - RangeVar@47..58 - Ident@47..58 "measurement" - Newline@58..59 "\n" - Whitespace@59..63 " " - For@63..66 "FOR" - Whitespace@66..67 " " - Values@67..73 "VALUES" - Whitespace@73..74 " " - PartitionBoundSpec@74..111 - From@74..78 "FROM" - Whitespace@78..79 " " - Ascii40@79..80 "(" - AConst@80..92 - String@80..92 - Sconst@80..92 "'2008-01-01'" - Ascii41@92..93 ")" - Whitespace@93..94 " " - To@94..96 "TO" - Whitespace@96..97 " " - Ascii40@97..98 "(" - AConst@98..110 - String@98..110 - Sconst@98..110 "'2008-02-01'" - Ascii41@110..111 ")" - Newline@111..112 "\n" - Whitespace@112..116 " " - With@116..120 "WITH" - Whitespace@120..121 " " - Ascii40@121..122 "(" - DefElem@122..142 - Ident@122..138 "parallel_workers" - Whitespace@138..139 " " - Ascii61@139..140 "=" - Whitespace@140..141 " " - Integer@141..142 - Iconst@141..142 "4" - Ascii41@142..143 ")" - Newline@143..144 "\n" - Whitespace@144..148 " " - Tablespace@148..158 "TABLESPACE" - Whitespace@158..159 " " - Ident@159..173 "fasttablespace" - Ascii59@173..174 ";" - diff --git a/crates/parser/tests/snapshots/statements/valid/0013.snap b/crates/parser/tests/snapshots/statements/valid/0013.snap index 643f1e95..031b67df 100644 --- a/crates/parser/tests/snapshots/statements/valid/0013.snap +++ b/crates/parser/tests/snapshots/statements/valid/0013.snap @@ -2,43 +2,63 @@ source: crates/parser/tests/statement_parser_test.rs description: "UPDATE weather\n SET temp_hi = temp_hi - 2, temp_lo = temp_lo - 2\n WHERE date > '1994-11-28';\n" --- -UpdateStmt@0..100 +UpdateStmt@0..99 Update@0..6 "UPDATE" Whitespace@6..7 " " - Ident@7..14 "weather" + RangeVar@7..14 + Ident@7..14 "weather" Newline@14..15 "\n" Whitespace@15..19 " " Set@19..22 "SET" Whitespace@22..23 " " - Ident@23..30 "temp_hi" - Whitespace@30..31 " " - Ascii61@31..32 "=" - Whitespace@32..33 " " - Ident@33..40 "temp_hi" - Whitespace@40..41 " " - Ascii45@41..42 "-" - Whitespace@42..43 " " - Iconst@43..44 "2" + ResTarget@23..44 + Ident@23..30 "temp_hi" + Whitespace@30..31 " " + Ascii61@31..32 "=" + Whitespace@32..33 " " + AExpr@33..44 + ColumnRef@33..40 + String@33..40 + Ident@33..40 "temp_hi" + Whitespace@40..41 " " + String@41..42 + Ascii45@41..42 "-" + Whitespace@42..43 " " + AConst@43..44 + Integer@43..44 + Iconst@43..44 "2" Ascii44@44..45 "," Whitespace@45..47 " " - Ident@47..54 "temp_lo" - Whitespace@54..55 " " - Ascii61@55..56 "=" - Whitespace@56..57 " " - Ident@57..64 "temp_lo" - Whitespace@64..65 " " - Ascii45@65..66 "-" - Whitespace@66..67 " " - Iconst@67..68 "2" + ResTarget@47..68 + Ident@47..54 "temp_lo" + Whitespace@54..55 " " + Ascii61@55..56 "=" + Whitespace@56..57 " " + AExpr@57..68 + ColumnRef@57..64 + String@57..64 + Ident@57..64 "temp_lo" + Whitespace@64..65 " " + String@65..66 + Ascii45@65..66 "-" + Whitespace@66..67 " " + AConst@67..68 + Integer@67..68 + Iconst@67..68 "2" Newline@68..69 "\n" Whitespace@69..73 " " Where@73..78 "WHERE" Whitespace@78..79 " " - Ident@79..83 "date" - Whitespace@83..84 " " - Ascii62@84..85 ">" - Whitespace@85..86 " " - Sconst@86..98 "'1994-11-28'" + AExpr@79..98 + ColumnRef@79..83 + String@79..83 + Ident@79..83 "date" + Whitespace@83..84 " " + String@84..85 + Ascii62@84..85 ">" + Whitespace@85..86 " " + AConst@86..98 + String@86..98 + Sconst@86..98 "'1994-11-28'" Ascii59@98..99 ";" - Newline@99..100 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0013.snap.new b/crates/parser/tests/snapshots/statements/valid/0013.snap.new deleted file mode 100644 index 2d1f7551..00000000 --- a/crates/parser/tests/snapshots/statements/valid/0013.snap.new +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: crates/parser/tests/statement_parser_test.rs -assertion_line: 39 -description: "UPDATE weather\n SET temp_hi = temp_hi - 2, temp_lo = temp_lo - 2\n WHERE date > '1994-11-28';\n" ---- -UpdateStmt@0..99 - Update@0..6 "UPDATE" - Whitespace@6..7 " " - RangeVar@7..14 - Ident@7..14 "weather" - Newline@14..15 "\n" - Whitespace@15..19 " " - Set@19..22 "SET" - Whitespace@22..23 " " - ResTarget@23..44 - Ident@23..30 "temp_hi" - Whitespace@30..31 " " - Ascii61@31..32 "=" - Whitespace@32..33 " " - AExpr@33..44 - ColumnRef@33..40 - String@33..40 - Ident@33..40 "temp_hi" - Whitespace@40..41 " " - String@41..42 - Ascii45@41..42 "-" - Whitespace@42..43 " " - AConst@43..44 - Integer@43..44 - Iconst@43..44 "2" - Ascii44@44..45 "," - Whitespace@45..47 " " - ResTarget@47..68 - Ident@47..54 "temp_lo" - Whitespace@54..55 " " - Ascii61@55..56 "=" - Whitespace@56..57 " " - AExpr@57..68 - ColumnRef@57..64 - String@57..64 - Ident@57..64 "temp_lo" - Whitespace@64..65 " " - String@65..66 - Ascii45@65..66 "-" - Whitespace@66..67 " " - AConst@67..68 - Integer@67..68 - Iconst@67..68 "2" - Newline@68..69 "\n" - Whitespace@69..73 " " - Where@73..78 "WHERE" - Whitespace@78..79 " " - AExpr@79..98 - ColumnRef@79..83 - String@79..83 - Ident@79..83 "date" - Whitespace@83..84 " " - String@84..85 - Ascii62@84..85 ">" - Whitespace@85..86 " " - AConst@86..98 - String@86..98 - Sconst@86..98 "'1994-11-28'" - Ascii59@98..99 ";" - diff --git a/crates/parser/tests/snapshots/statements/valid/0014.snap b/crates/parser/tests/snapshots/statements/valid/0014.snap index 490b0142..8d760544 100644 --- a/crates/parser/tests/snapshots/statements/valid/0014.snap +++ b/crates/parser/tests/snapshots/statements/valid/0014.snap @@ -5,8 +5,8 @@ description: "SELECT sum(salary) OVER w, avg(salary) OVER w\n FROM empsalary\n SelectStmt@0..121 Select@0..6 "SELECT" Whitespace@6..7 " " - ResTarget@7..120 - FuncCall@7..120 + ResTarget@7..25 + FuncCall@7..25 String@7..10 Ident@7..10 "sum" Ascii40@10..11 "(" @@ -15,60 +15,60 @@ SelectStmt@0..121 Ident@11..17 "salary" Ascii41@17..18 ")" Whitespace@18..19 " " - Over@19..23 "OVER" - Whitespace@23..24 " " - WindowDef@24..120 + WindowDef@19..25 + Over@19..23 "OVER" + Whitespace@23..24 " " Ident@24..25 "w" - Ascii44@25..26 "," - Whitespace@26..27 " " - ResTarget@27..120 - FuncCall@27..120 - String@27..30 - Ident@27..30 "avg" - Ascii40@30..31 "(" - ColumnRef@31..37 - String@31..37 - Ident@31..37 "salary" - Ascii41@37..38 ")" - Whitespace@38..39 " " - Over@39..43 "OVER" - Whitespace@43..44 " " - WindowDef@44..120 - Ident@44..45 "w" - Newline@45..46 "\n" - Whitespace@46..48 " " - From@48..52 "FROM" - Whitespace@52..53 " " - RangeVar@53..62 - Ident@53..62 "empsalary" - Newline@62..63 "\n" - Whitespace@63..65 " " - Window@65..71 "WINDOW" - Whitespace@71..72 " " - Ident@72..73 "w" - Whitespace@73..74 " " - As@74..76 "AS" - Whitespace@76..77 " " - WindowDef@77..120 - Ascii40@77..78 "(" - Partition@78..87 "PARTITION" - Whitespace@87..88 " " - By@88..90 "BY" - Whitespace@90..91 " " - ColumnRef@91..98 - String@91..98 - Ident@91..98 "depname" - Whitespace@98..99 " " - SortBy@99..119 - Order@99..104 "ORDER" - Whitespace@104..105 " " - By@105..107 "BY" - Whitespace@107..108 " " - ColumnRef@108..114 - String@108..114 - Ident@108..114 "salary" - Whitespace@114..115 " " - Desc@115..119 "DESC" - Ascii41@119..120 ")" + Ascii44@25..26 "," + Whitespace@26..27 " " + ResTarget@27..45 + FuncCall@27..45 + String@27..30 + Ident@27..30 "avg" + Ascii40@30..31 "(" + ColumnRef@31..37 + String@31..37 + Ident@31..37 "salary" + Ascii41@37..38 ")" + Whitespace@38..39 " " + WindowDef@39..45 + Over@39..43 "OVER" + Whitespace@43..44 " " + Ident@44..45 "w" + Newline@45..46 "\n" + Whitespace@46..48 " " + From@48..52 "FROM" + Whitespace@52..53 " " + RangeVar@53..62 + Ident@53..62 "empsalary" + Newline@62..63 "\n" + Whitespace@63..65 " " + WindowDef@65..120 + Window@65..71 "WINDOW" + Whitespace@71..72 " " + Ident@72..73 "w" + Whitespace@73..74 " " + As@74..76 "AS" + Whitespace@76..77 " " + Ascii40@77..78 "(" + Partition@78..87 "PARTITION" + Whitespace@87..88 " " + By@88..90 "BY" + Whitespace@90..91 " " + ColumnRef@91..98 + String@91..98 + Ident@91..98 "depname" + Whitespace@98..99 " " + SortBy@99..119 + Order@99..104 "ORDER" + Whitespace@104..105 " " + By@105..107 "BY" + Whitespace@107..108 " " + ColumnRef@108..114 + String@108..114 + Ident@108..114 "salary" + Whitespace@114..115 " " + Desc@115..119 "DESC" + Ascii41@119..120 ")" Ascii59@120..121 ";" diff --git a/crates/parser/tests/snapshots/statements/valid/0015.snap b/crates/parser/tests/snapshots/statements/valid/0015.snap index 0d9223c5..4f6bf7df 100644 --- a/crates/parser/tests/snapshots/statements/valid/0015.snap +++ b/crates/parser/tests/snapshots/statements/valid/0015.snap @@ -2,57 +2,78 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT\n count(*) AS unfiltered,\n count(*) FILTER (WHERE i < 5) AS filtered\nFROM generate_series(1,10) AS s(i);\n" --- -SelectStmt@0..117 +SelectStmt@0..116 Select@0..6 "SELECT" Newline@6..7 "\n" Whitespace@7..11 " " - Ident@11..16 "count" - Ascii40@16..17 "(" - Ascii42@17..18 "*" - Ascii41@18..19 ")" - Whitespace@19..20 " " - As@20..22 "AS" - Whitespace@22..23 " " - Ident@23..33 "unfiltered" + ResTarget@11..33 + FuncCall@11..16 + String@11..16 + Ident@11..16 "count" + Ascii40@16..17 "(" + Ascii42@17..18 "*" + Ascii41@18..19 ")" + Whitespace@19..20 " " + As@20..22 "AS" + Whitespace@22..23 " " + Ident@23..33 "unfiltered" Ascii44@33..34 "," Newline@34..35 "\n" Whitespace@35..39 " " - Ident@39..44 "count" - Ascii40@44..45 "(" - Ascii42@45..46 "*" - Ascii41@46..47 ")" - Whitespace@47..48 " " - Filter@48..54 "FILTER" - Whitespace@54..55 " " - Ascii40@55..56 "(" - Where@56..61 "WHERE" - Whitespace@61..62 " " - Ident@62..63 "i" - Whitespace@63..64 " " - Ascii60@64..65 "<" - Whitespace@65..66 " " - Iconst@66..67 "5" - Ascii41@67..68 ")" - Whitespace@68..69 " " - As@69..71 "AS" - Whitespace@71..72 " " - Ident@72..80 "filtered" + ResTarget@39..80 + FuncCall@39..68 + String@39..44 + Ident@39..44 "count" + Ascii40@44..45 "(" + Ascii42@45..46 "*" + Ascii41@46..47 ")" + Whitespace@47..48 " " + Filter@48..54 "FILTER" + Whitespace@54..55 " " + Ascii40@55..56 "(" + Where@56..61 "WHERE" + Whitespace@61..62 " " + AExpr@62..67 + ColumnRef@62..63 + String@62..63 + Ident@62..63 "i" + Whitespace@63..64 " " + String@64..65 + Ascii60@64..65 "<" + Whitespace@65..66 " " + AConst@66..67 + Integer@66..67 + Iconst@66..67 "5" + Ascii41@67..68 ")" + Whitespace@68..69 " " + As@69..71 "AS" + Whitespace@71..72 " " + Ident@72..80 "filtered" Newline@80..81 "\n" From@81..85 "FROM" Whitespace@85..86 " " - Ident@86..101 "generate_series" - Ascii40@101..102 "(" - Iconst@102..103 "1" - Ascii44@103..104 "," - Iconst@104..106 "10" - Ascii41@106..107 ")" - Whitespace@107..108 " " - As@108..110 "AS" - Whitespace@110..111 " " - Ident@111..112 "s" - Ascii40@112..113 "(" - Ident@113..114 "i" - Ascii41@114..115 ")" + RangeFunction@86..115 + List@86..107 + FuncCall@86..107 + String@86..101 + Ident@86..101 "generate_series" + Ascii40@101..102 "(" + AConst@102..103 + Integer@102..103 + Iconst@102..103 "1" + Ascii44@103..104 "," + AConst@104..106 + Integer@104..106 + Iconst@104..106 "10" + Ascii41@106..107 ")" + Whitespace@107..108 " " + As@108..110 "AS" + Whitespace@110..111 " " + Alias@111..115 + Ident@111..112 "s" + Ascii40@112..113 "(" + String@113..114 + Ident@113..114 "i" + Ascii41@114..115 ")" Ascii59@115..116 ";" - Newline@116..117 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0015.snap.new b/crates/parser/tests/snapshots/statements/valid/0015.snap.new deleted file mode 100644 index 89d0aa08..00000000 --- a/crates/parser/tests/snapshots/statements/valid/0015.snap.new +++ /dev/null @@ -1,80 +0,0 @@ ---- -source: crates/parser/tests/statement_parser_test.rs -assertion_line: 39 -description: "SELECT\n count(*) AS unfiltered,\n count(*) FILTER (WHERE i < 5) AS filtered\nFROM generate_series(1,10) AS s(i);\n" ---- -SelectStmt@0..116 - Select@0..6 "SELECT" - Newline@6..7 "\n" - Whitespace@7..11 " " - ResTarget@11..33 - FuncCall@11..16 - String@11..16 - Ident@11..16 "count" - Ascii40@16..17 "(" - Ascii42@17..18 "*" - Ascii41@18..19 ")" - Whitespace@19..20 " " - As@20..22 "AS" - Whitespace@22..23 " " - Ident@23..33 "unfiltered" - Ascii44@33..34 "," - Newline@34..35 "\n" - Whitespace@35..39 " " - ResTarget@39..112 - FuncCall@39..112 - String@39..44 - Ident@39..44 "count" - Ascii40@44..45 "(" - Ascii42@45..46 "*" - Ascii41@46..47 ")" - Whitespace@47..48 " " - Filter@48..54 "FILTER" - Whitespace@54..55 " " - Ascii40@55..56 "(" - Where@56..61 "WHERE" - Whitespace@61..62 " " - RangeFunction@62..112 - Alias@62..112 - String@62..67 - AExpr@62..67 - ColumnRef@62..63 - String@62..63 - Ident@62..63 "i" - Whitespace@63..64 " " - String@64..65 - Ascii60@64..65 "<" - Whitespace@65..66 " " - AConst@66..67 - Integer@66..67 - Iconst@66..67 "5" - Ascii41@67..68 ")" - Whitespace@68..69 " " - As@69..71 "AS" - Whitespace@71..72 " " - Ident@72..80 "filtered" - Newline@80..81 "\n" - From@81..85 "FROM" - Whitespace@85..86 " " - List@86..107 - FuncCall@86..107 - String@86..101 - Ident@86..101 "generate_series" - Ascii40@101..102 "(" - AConst@102..103 - Integer@102..103 - Iconst@102..103 "1" - Ascii44@103..104 "," - AConst@104..106 - Integer@104..106 - Iconst@104..106 "10" - Ascii41@106..107 ")" - Whitespace@107..108 " " - As@108..110 "AS" - Whitespace@110..111 " " - Ident@111..112 "s" - Ascii40@112..113 "(" - Ident@113..114 "i" - Ascii41@114..115 ")" - Ascii59@115..116 ";" - diff --git a/crates/parser/tests/snapshots/statements/valid/0016.snap b/crates/parser/tests/snapshots/statements/valid/0016.snap index 6306aa10..a93b47ec 100644 --- a/crates/parser/tests/snapshots/statements/valid/0016.snap +++ b/crates/parser/tests/snapshots/statements/valid/0016.snap @@ -2,26 +2,36 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT * FROM tbl WHERE a COLLATE \"C\" > 'foo';\n" --- -SelectStmt@0..47 +SelectStmt@0..46 Select@0..6 "SELECT" Whitespace@6..7 " " - Ascii42@7..8 "*" + ResTarget@7..8 + ColumnRef@7..8 + AStar@7..8 + Ascii42@7..8 "*" Whitespace@8..9 " " From@9..13 "FROM" Whitespace@13..14 " " - Ident@14..17 "tbl" + RangeVar@14..17 + Ident@14..17 "tbl" Whitespace@17..18 " " Where@18..23 "WHERE" Whitespace@23..24 " " - Ident@24..25 "a" - Whitespace@25..26 " " - Collate@26..33 "COLLATE" - Whitespace@33..34 " " - Ident@34..37 "\"C\"" - Whitespace@37..38 " " - Ascii62@38..39 ">" - Whitespace@39..40 " " - Sconst@40..45 "'foo'" + AExpr@24..45 + CollateClause@24..25 + ColumnRef@24..25 + String@24..25 + Ident@24..25 "a" + Whitespace@25..26 " " + Collate@26..33 "COLLATE" + Whitespace@33..34 " " + Ident@34..37 "\"C\"" + Whitespace@37..38 " " + String@38..39 + Ascii62@38..39 ">" + Whitespace@39..40 " " + AConst@40..45 + String@40..45 + Sconst@40..45 "'foo'" Ascii59@45..46 ";" - Newline@46..47 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0017.snap b/crates/parser/tests/snapshots/statements/valid/0017.snap index 6c3f9d52..21dc6c7d 100644 --- a/crates/parser/tests/snapshots/statements/valid/0017.snap +++ b/crates/parser/tests/snapshots/statements/valid/0017.snap @@ -2,41 +2,61 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT name, (SELECT max(pop) FROM cities WHERE cities.state = states.name)\n FROM states;\n" --- -SelectStmt@0..93 +SelectStmt@0..92 Select@0..6 "SELECT" Whitespace@6..7 " " - NameP@7..11 "name" + ResTarget@7..11 + ColumnRef@7..11 + String@7..11 + NameP@7..11 "name" Ascii44@11..12 "," Whitespace@12..13 " " - Ascii40@13..14 "(" - Select@14..20 "SELECT" - Whitespace@20..21 " " - Ident@21..24 "max" - Ascii40@24..25 "(" - Ident@25..28 "pop" - Ascii41@28..29 ")" - Whitespace@29..30 " " - From@30..34 "FROM" - Whitespace@34..35 " " - Ident@35..41 "cities" - Whitespace@41..42 " " - Where@42..47 "WHERE" - Whitespace@47..48 " " - Ident@48..54 "cities" - Ascii46@54..55 "." - Ident@55..60 "state" - Whitespace@60..61 " " - Ascii61@61..62 "=" - Whitespace@62..63 " " - Ident@63..69 "states" - Ascii46@69..70 "." - NameP@70..74 "name" - Ascii41@74..75 ")" + ResTarget@13..75 + SubLink@13..75 + Ascii40@13..14 "(" + SelectStmt@14..74 + Select@14..20 "SELECT" + Whitespace@20..21 " " + ResTarget@21..29 + FuncCall@21..29 + String@21..24 + Ident@21..24 "max" + Ascii40@24..25 "(" + ColumnRef@25..28 + String@25..28 + Ident@25..28 "pop" + Ascii41@28..29 ")" + Whitespace@29..30 " " + From@30..34 "FROM" + Whitespace@34..35 " " + RangeVar@35..41 + Ident@35..41 "cities" + Whitespace@41..42 " " + Where@42..47 "WHERE" + Whitespace@47..48 " " + AExpr@48..74 + ColumnRef@48..60 + String@48..54 + Ident@48..54 "cities" + Ascii46@54..55 "." + String@55..60 + Ident@55..60 "state" + Whitespace@60..61 " " + String@61..62 + Ascii61@61..62 "=" + Whitespace@62..63 " " + ColumnRef@63..74 + String@63..69 + Ident@63..69 "states" + Ascii46@69..70 "." + String@70..74 + NameP@70..74 "name" + Ascii41@74..75 ")" Newline@75..76 "\n" Whitespace@76..80 " " From@80..84 "FROM" Whitespace@84..85 " " - Ident@85..91 "states" + RangeVar@85..91 + Ident@85..91 "states" Ascii59@91..92 ";" - Newline@92..93 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0018.snap b/crates/parser/tests/snapshots/statements/valid/0018.snap index a93ef99c..d8152852 100644 --- a/crates/parser/tests/snapshots/statements/valid/0018.snap +++ b/crates/parser/tests/snapshots/statements/valid/0018.snap @@ -2,21 +2,31 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT ARRAY[1,2,22.7]::integer[];\n" --- -SelectStmt@0..35 +SelectStmt@0..34 Select@0..6 "SELECT" Whitespace@6..7 " " - Array@7..12 "ARRAY" - Ascii91@12..13 "[" - Iconst@13..14 "1" - Ascii44@14..15 "," - Iconst@15..16 "2" - Ascii44@16..17 "," - Fconst@17..21 "22.7" - Ascii93@21..22 "]" - Typecast@22..24 "::" - Integer@24..31 "integer" + ResTarget@7..31 + TypeCast@7..31 + AArrayExpr@7..21 + Array@7..12 "ARRAY" + Ascii91@12..13 "[" + AConst@13..14 + Integer@13..14 + Iconst@13..14 "1" + Ascii44@14..15 "," + AConst@15..16 + Integer@15..16 + Iconst@15..16 "2" + Ascii44@16..17 "," + AConst@17..21 + Float@17..21 + Fconst@17..21 "22.7" + Ascii93@21..22 "]" + Typecast@22..24 "::" + TypeName@24..31 + String@24..31 + Integer@24..31 "integer" Ascii91@31..32 "[" Ascii93@32..33 "]" Ascii59@33..34 ";" - Newline@34..35 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0019.snap b/crates/parser/tests/snapshots/statements/valid/0019.snap index 97f59c8f..ac87dd53 100644 --- a/crates/parser/tests/snapshots/statements/valid/0019.snap +++ b/crates/parser/tests/snapshots/statements/valid/0019.snap @@ -2,33 +2,52 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT CASE WHEN min(employees) > 0\n THEN avg(expenses / employees)\n END\n FROM departments;\n" --- -SelectStmt@0..112 +SelectStmt@0..111 Select@0..6 "SELECT" Whitespace@6..7 " " - Case@7..11 "CASE" - Whitespace@11..12 " " - When@12..16 "WHEN" - Whitespace@16..17 " " - Ident@17..20 "min" - Ascii40@20..21 "(" - Ident@21..30 "employees" - Ascii41@30..31 ")" - Whitespace@31..32 " " - Ascii62@32..33 ">" - Whitespace@33..34 " " - Iconst@34..35 "0" - Newline@35..36 "\n" - Whitespace@36..48 " " - Then@48..52 "THEN" - Whitespace@52..53 " " - Ident@53..56 "avg" - Ascii40@56..57 "(" - Ident@57..65 "expenses" - Whitespace@65..66 " " - Ascii47@66..67 "/" - Whitespace@67..68 " " - Ident@68..77 "employees" - Ascii41@77..78 ")" + ResTarget@7..78 + CaseExpr@7..78 + Case@7..11 "CASE" + Whitespace@11..12 " " + CaseWhen@12..78 + When@12..16 "WHEN" + Whitespace@16..17 " " + AExpr@17..35 + FuncCall@17..31 + String@17..20 + Ident@17..20 "min" + Ascii40@20..21 "(" + ColumnRef@21..30 + String@21..30 + Ident@21..30 "employees" + Ascii41@30..31 ")" + Whitespace@31..32 " " + String@32..33 + Ascii62@32..33 ">" + Whitespace@33..34 " " + AConst@34..35 + Integer@34..35 + Iconst@34..35 "0" + Newline@35..36 "\n" + Whitespace@36..48 " " + Then@48..52 "THEN" + Whitespace@52..53 " " + FuncCall@53..78 + String@53..56 + Ident@53..56 "avg" + Ascii40@56..57 "(" + AExpr@57..77 + ColumnRef@57..65 + String@57..65 + Ident@57..65 "expenses" + Whitespace@65..66 " " + String@66..67 + Ascii47@66..67 "/" + Whitespace@67..68 " " + ColumnRef@68..77 + String@68..77 + Ident@68..77 "employees" + Ascii41@77..78 ")" Newline@78..79 "\n" Whitespace@79..86 " " EndP@86..89 "END" @@ -36,7 +55,7 @@ SelectStmt@0..112 Whitespace@90..94 " " From@94..98 "FROM" Whitespace@98..99 " " - Ident@99..110 "departments" + RangeVar@99..110 + Ident@99..110 "departments" Ascii59@110..111 ";" - Newline@111..112 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0020.snap b/crates/parser/tests/snapshots/statements/valid/0020.snap index f84f0ed9..6a404472 100644 --- a/crates/parser/tests/snapshots/statements/valid/0020.snap +++ b/crates/parser/tests/snapshots/statements/valid/0020.snap @@ -2,47 +2,66 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE FUNCTION concat_lower_or_upper(a text, b text, uppercase boolean DEFAULT false)\nRETURNS text\nAS\n$$\n SELECT CASE\n WHEN $3 THEN UPPER($1 || ' ' || $2)\n ELSE LOWER($1 || ' ' || $2)\n END;\n$$\nLANGUAGE SQL IMMUTABLE STRICT;\n" --- -CreateFunctionStmt@0..246 +CreateFunctionStmt@0..245 Create@0..6 "CREATE" Whitespace@6..7 " " Function@7..15 "FUNCTION" Whitespace@15..16 " " - Ident@16..37 "concat_lower_or_upper" + String@16..37 + Ident@16..37 "concat_lower_or_upper" Ascii40@37..38 "(" - Ident@38..39 "a" - Whitespace@39..40 " " - TextP@40..44 "text" + FunctionParameter@38..44 + Ident@38..39 "a" + Whitespace@39..40 " " + TypeName@40..44 + String@40..44 + TextP@40..44 "text" Ascii44@44..45 "," Whitespace@45..46 " " - Ident@46..47 "b" - Whitespace@47..48 " " - TextP@48..52 "text" + FunctionParameter@46..52 + Ident@46..47 "b" + Whitespace@47..48 " " + TypeName@48..52 + String@48..52 + TextP@48..52 "text" Ascii44@52..53 "," Whitespace@53..54 " " - Ident@54..63 "uppercase" - Whitespace@63..64 " " - BooleanP@64..71 "boolean" - Whitespace@71..72 " " - Default@72..79 "DEFAULT" - Whitespace@79..80 " " - FalseP@80..85 "false" + FunctionParameter@54..85 + Ident@54..63 "uppercase" + Whitespace@63..64 " " + BooleanP@64..71 "boolean" + Whitespace@71..72 " " + Default@72..79 "DEFAULT" + Whitespace@79..80 " " + AConst@80..85 + Boolean@80..85 + FalseP@80..85 "false" Ascii41@85..86 ")" Newline@86..87 "\n" Returns@87..94 "RETURNS" Whitespace@94..95 " " - TextP@95..99 "text" + TypeName@95..99 + String@95..99 + TextP@95..99 "text" Newline@99..100 "\n" - As@100..102 "AS" - Newline@102..103 "\n" - Sconst@103..214 "$$\n SELECT CASE\n ..." + DefElem@100..214 + As@100..102 "AS" + Newline@102..103 "\n" + List@103..214 + String@103..214 + Sconst@103..214 "$$\n SELECT CASE\n ..." Newline@214..215 "\n" - Language@215..223 "LANGUAGE" - Whitespace@223..224 " " - SqlP@224..227 "SQL" + DefElem@215..227 + Language@215..223 "LANGUAGE" + Whitespace@223..224 " " + String@224..227 + SqlP@224..227 "SQL" Whitespace@227..228 " " - Immutable@228..237 "IMMUTABLE" + DefElem@228..237 + String@228..237 + Immutable@228..237 "IMMUTABLE" Whitespace@237..238 " " - StrictP@238..244 "STRICT" + DefElem@238..244 + StrictP@238..244 "STRICT" Ascii59@244..245 ";" - Newline@245..246 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0021.snap b/crates/parser/tests/snapshots/statements/valid/0021.snap index ec8ea421..805cf921 100644 --- a/crates/parser/tests/snapshots/statements/valid/0021.snap +++ b/crates/parser/tests/snapshots/statements/valid/0021.snap @@ -2,31 +2,42 @@ source: crates/parser/tests/statement_parser_test.rs description: "SELECT concat_lower_or_upper(a => 'Hello', b => 'World', uppercase => true);\n" --- -SelectStmt@0..77 +SelectStmt@0..76 Select@0..6 "SELECT" Whitespace@6..7 " " - Ident@7..28 "concat_lower_or_upper" - Ascii40@28..29 "(" - Ident@29..30 "a" - Whitespace@30..31 " " - EqualsGreater@31..33 "=>" - Whitespace@33..34 " " - Sconst@34..41 "'Hello'" - Ascii44@41..42 "," - Whitespace@42..43 " " - Ident@43..44 "b" - Whitespace@44..45 " " - EqualsGreater@45..47 "=>" - Whitespace@47..48 " " - Sconst@48..55 "'World'" - Ascii44@55..56 "," - Whitespace@56..57 " " - Ident@57..66 "uppercase" - Whitespace@66..67 " " - EqualsGreater@67..69 "=>" - Whitespace@69..70 " " - TrueP@70..74 "true" - Ascii41@74..75 ")" + ResTarget@7..75 + FuncCall@7..75 + String@7..28 + Ident@7..28 "concat_lower_or_upper" + Ascii40@28..29 "(" + NamedArgExpr@29..41 + Ident@29..30 "a" + Whitespace@30..31 " " + EqualsGreater@31..33 "=>" + Whitespace@33..34 " " + AConst@34..41 + String@34..41 + Sconst@34..41 "'Hello'" + Ascii44@41..42 "," + Whitespace@42..43 " " + NamedArgExpr@43..55 + Ident@43..44 "b" + Whitespace@44..45 " " + EqualsGreater@45..47 "=>" + Whitespace@47..48 " " + AConst@48..55 + String@48..55 + Sconst@48..55 "'World'" + Ascii44@55..56 "," + Whitespace@56..57 " " + NamedArgExpr@57..74 + Ident@57..66 "uppercase" + Whitespace@66..67 " " + EqualsGreater@67..69 "=>" + Whitespace@69..70 " " + AConst@70..74 + Boolean@70..74 + TrueP@70..74 "true" + Ascii41@74..75 ")" Ascii59@75..76 ";" - Newline@76..77 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0022.snap b/crates/parser/tests/snapshots/statements/valid/0022.snap index b80610e9..78671f80 100644 --- a/crates/parser/tests/snapshots/statements/valid/0022.snap +++ b/crates/parser/tests/snapshots/statements/valid/0022.snap @@ -2,37 +2,49 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE TABLE products (\n product_no integer,\n name text,\n price numeric DEFAULT 9.99\n);\n" --- -CreateStmt@0..97 +CreateStmt@0..96 Create@0..6 "CREATE" Whitespace@6..7 " " Table@7..12 "TABLE" Whitespace@12..13 " " - Ident@13..21 "products" + RangeVar@13..21 + Ident@13..21 "products" Whitespace@21..22 " " Ascii40@22..23 "(" Newline@23..24 "\n" Whitespace@24..28 " " - Ident@28..38 "product_no" - Whitespace@38..39 " " - Integer@39..46 "integer" + ColumnDef@28..46 + Ident@28..38 "product_no" + Whitespace@38..39 " " + TypeName@39..46 + String@39..46 + Integer@39..46 "integer" Ascii44@46..47 "," Newline@47..48 "\n" Whitespace@48..52 " " - NameP@52..56 "name" - Whitespace@56..57 " " - TextP@57..61 "text" + ColumnDef@52..61 + NameP@52..56 "name" + Whitespace@56..57 " " + TypeName@57..61 + String@57..61 + TextP@57..61 "text" Ascii44@61..62 "," Newline@62..63 "\n" Whitespace@63..67 " " - Ident@67..72 "price" - Whitespace@72..73 " " - Numeric@73..80 "numeric" - Whitespace@80..81 " " - Default@81..88 "DEFAULT" - Whitespace@88..89 " " - Fconst@89..93 "9.99" + ColumnDef@67..93 + Ident@67..72 "price" + Whitespace@72..73 " " + TypeName@73..80 + String@73..80 + Numeric@73..80 "numeric" + Whitespace@80..81 " " + Constraint@81..93 + Default@81..88 "DEFAULT" + Whitespace@88..89 " " + AConst@89..93 + Float@89..93 + Fconst@89..93 "9.99" Newline@93..94 "\n" Ascii41@94..95 ")" Ascii59@95..96 ";" - Newline@96..97 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0023.snap b/crates/parser/tests/snapshots/statements/valid/0023.snap index 2a8ccc2b..b9af2250 100644 --- a/crates/parser/tests/snapshots/statements/valid/0023.snap +++ b/crates/parser/tests/snapshots/statements/valid/0023.snap @@ -2,71 +2,104 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE TABLE products (\n product_no integer,\n name text,\n price numeric CHECK (price > 0),\n discounted_price numeric CHECK (discounted_price > 0),\n CHECK (price > discounted_price)\n);\n" --- -CreateStmt@0..199 +CreateStmt@0..198 Create@0..6 "CREATE" Whitespace@6..7 " " Table@7..12 "TABLE" Whitespace@12..13 " " - Ident@13..21 "products" + RangeVar@13..21 + Ident@13..21 "products" Whitespace@21..22 " " Ascii40@22..23 "(" Newline@23..24 "\n" Whitespace@24..28 " " - Ident@28..38 "product_no" - Whitespace@38..39 " " - Integer@39..46 "integer" + ColumnDef@28..46 + Ident@28..38 "product_no" + Whitespace@38..39 " " + TypeName@39..46 + String@39..46 + Integer@39..46 "integer" Ascii44@46..47 "," Newline@47..48 "\n" Whitespace@48..52 " " - NameP@52..56 "name" - Whitespace@56..57 " " - TextP@57..61 "text" + ColumnDef@52..61 + NameP@52..56 "name" + Whitespace@56..57 " " + TypeName@57..61 + String@57..61 + TextP@57..61 "text" Ascii44@61..62 "," Newline@62..63 "\n" Whitespace@63..67 " " - Ident@67..72 "price" - Whitespace@72..73 " " - Numeric@73..80 "numeric" - Whitespace@80..81 " " - Check@81..86 "CHECK" - Whitespace@86..87 " " - Ascii40@87..88 "(" - Ident@88..93 "price" - Whitespace@93..94 " " - Ascii62@94..95 ">" - Whitespace@95..96 " " - Iconst@96..97 "0" - Ascii41@97..98 ")" + ColumnDef@67..98 + Ident@67..72 "price" + Whitespace@72..73 " " + TypeName@73..80 + String@73..80 + Numeric@73..80 "numeric" + Whitespace@80..81 " " + Constraint@81..98 + Check@81..86 "CHECK" + Whitespace@86..87 " " + Ascii40@87..88 "(" + AExpr@88..97 + ColumnRef@88..93 + String@88..93 + Ident@88..93 "price" + Whitespace@93..94 " " + String@94..95 + Ascii62@94..95 ">" + Whitespace@95..96 " " + AConst@96..97 + Integer@96..97 + Iconst@96..97 "0" + Ascii41@97..98 ")" Ascii44@98..99 "," Newline@99..100 "\n" Whitespace@100..104 " " - Ident@104..120 "discounted_price" - Whitespace@120..121 " " - Numeric@121..128 "numeric" - Whitespace@128..129 " " - Check@129..134 "CHECK" - Whitespace@134..135 " " - Ascii40@135..136 "(" - Ident@136..152 "discounted_price" - Whitespace@152..153 " " - Ascii62@153..154 ">" - Whitespace@154..155 " " - Iconst@155..156 "0" - Ascii41@156..157 ")" + ColumnDef@104..157 + Ident@104..120 "discounted_price" + Whitespace@120..121 " " + TypeName@121..128 + String@121..128 + Numeric@121..128 "numeric" + Whitespace@128..129 " " + Constraint@129..157 + Check@129..134 "CHECK" + Whitespace@134..135 " " + Ascii40@135..136 "(" + AExpr@136..156 + ColumnRef@136..152 + String@136..152 + Ident@136..152 "discounted_price" + Whitespace@152..153 " " + String@153..154 + Ascii62@153..154 ">" + Whitespace@154..155 " " + AConst@155..156 + Integer@155..156 + Iconst@155..156 "0" + Ascii41@156..157 ")" Ascii44@157..158 "," Newline@158..159 "\n" Whitespace@159..163 " " - Check@163..168 "CHECK" - Whitespace@168..169 " " - Ascii40@169..170 "(" - Ident@170..175 "price" - Whitespace@175..176 " " - Ascii62@176..177 ">" - Whitespace@177..178 " " - Ident@178..194 "discounted_price" - Ascii41@194..195 ")" + Constraint@163..195 + Check@163..168 "CHECK" + Whitespace@168..169 " " + Ascii40@169..170 "(" + AExpr@170..194 + ColumnRef@170..175 + String@170..175 + Ident@170..175 "price" + Whitespace@175..176 " " + String@176..177 + Ascii62@176..177 ">" + Whitespace@177..178 " " + ColumnRef@178..194 + String@178..194 + Ident@178..194 "discounted_price" + Ascii41@194..195 ")" Newline@195..196 "\n" Ascii41@196..197 ")" Ascii59@197..198 ";" - Newline@198..199 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0024.snap b/crates/parser/tests/snapshots/statements/valid/0024.snap index e4c63589..7e844839 100644 --- a/crates/parser/tests/snapshots/statements/valid/0024.snap +++ b/crates/parser/tests/snapshots/statements/valid/0024.snap @@ -2,54 +2,70 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE TABLE order_items (\n product_no integer REFERENCES products,\n order_id integer REFERENCES orders,\n quantity integer,\n PRIMARY KEY (product_no, order_id)\n);\n" --- -CreateStmt@0..175 +CreateStmt@0..174 Create@0..6 "CREATE" Whitespace@6..7 " " Table@7..12 "TABLE" Whitespace@12..13 " " - Ident@13..24 "order_items" + RangeVar@13..24 + Ident@13..24 "order_items" Whitespace@24..25 " " Ascii40@25..26 "(" Newline@26..27 "\n" Whitespace@27..31 " " - Ident@31..41 "product_no" - Whitespace@41..42 " " - Integer@42..49 "integer" - Whitespace@49..50 " " - References@50..60 "REFERENCES" - Whitespace@60..61 " " - Ident@61..69 "products" + ColumnDef@31..69 + Ident@31..41 "product_no" + Whitespace@41..42 " " + TypeName@42..49 + String@42..49 + Integer@42..49 "integer" + Whitespace@49..50 " " + Constraint@50..69 + References@50..60 "REFERENCES" + Whitespace@60..61 " " + RangeVar@61..69 + Ident@61..69 "products" Ascii44@69..70 "," Newline@70..71 "\n" Whitespace@71..75 " " - Ident@75..83 "order_id" - Whitespace@83..84 " " - Integer@84..91 "integer" - Whitespace@91..92 " " - References@92..102 "REFERENCES" - Whitespace@102..103 " " - Ident@103..109 "orders" + ColumnDef@75..109 + Ident@75..83 "order_id" + Whitespace@83..84 " " + TypeName@84..91 + String@84..91 + Integer@84..91 "integer" + Whitespace@91..92 " " + Constraint@92..109 + References@92..102 "REFERENCES" + Whitespace@102..103 " " + RangeVar@103..109 + Ident@103..109 "orders" Ascii44@109..110 "," Newline@110..111 "\n" Whitespace@111..115 " " - Ident@115..123 "quantity" - Whitespace@123..124 " " - Integer@124..131 "integer" + ColumnDef@115..131 + Ident@115..123 "quantity" + Whitespace@123..124 " " + TypeName@124..131 + String@124..131 + Integer@124..131 "integer" Ascii44@131..132 "," Newline@132..133 "\n" Whitespace@133..137 " " - Primary@137..144 "PRIMARY" - Whitespace@144..145 " " - Key@145..148 "KEY" - Whitespace@148..149 " " - Ascii40@149..150 "(" - Ident@150..160 "product_no" - Ascii44@160..161 "," - Whitespace@161..162 " " - Ident@162..170 "order_id" - Ascii41@170..171 ")" + Constraint@137..171 + Primary@137..144 "PRIMARY" + Whitespace@144..145 " " + Key@145..148 "KEY" + Whitespace@148..149 " " + Ascii40@149..150 "(" + String@150..160 + Ident@150..160 "product_no" + Ascii44@160..161 "," + Whitespace@161..162 " " + String@162..170 + Ident@162..170 "order_id" + Ascii41@170..171 ")" Newline@171..172 "\n" Ascii41@172..173 ")" Ascii59@173..174 ";" - Newline@174..175 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0025.snap b/crates/parser/tests/snapshots/statements/valid/0025.snap index cc279acf..0b8b6f2a 100644 --- a/crates/parser/tests/snapshots/statements/valid/0025.snap +++ b/crates/parser/tests/snapshots/statements/valid/0025.snap @@ -2,24 +2,30 @@ source: crates/parser/tests/statement_parser_test.rs description: "ALTER TABLE products ADD CHECK (name <> '');\n" --- -AlterTableStmt@0..45 +AlterTableStmt@0..44 Alter@0..5 "ALTER" Whitespace@5..6 " " Table@6..11 "TABLE" Whitespace@11..12 " " - Ident@12..20 "products" + RangeVar@12..20 + Ident@12..20 "products" Whitespace@20..21 " " AddP@21..24 "ADD" Whitespace@24..25 " " - Check@25..30 "CHECK" - Whitespace@30..31 " " - Ascii40@31..32 "(" - NameP@32..36 "name" - Whitespace@36..37 " " - NotEquals@37..39 "<>" + AlterTableCmd@25..39 + Constraint@25..39 + Check@25..30 "CHECK" + Whitespace@30..31 " " + Ascii40@31..32 "(" + AExpr@32..39 + ColumnRef@32..36 + String@32..36 + NameP@32..36 "name" + Whitespace@36..37 " " + String@37..39 + NotEquals@37..39 "<>" Whitespace@39..40 " " Sconst@40..42 "''" Ascii41@42..43 ")" Ascii59@43..44 ";" - Newline@44..45 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0026.snap b/crates/parser/tests/snapshots/statements/valid/0026.snap index 730b2138..a575f9dc 100644 --- a/crates/parser/tests/snapshots/statements/valid/0026.snap +++ b/crates/parser/tests/snapshots/statements/valid/0026.snap @@ -2,27 +2,35 @@ source: crates/parser/tests/statement_parser_test.rs description: "ALTER TABLE products ALTER COLUMN price TYPE numeric(10,2);\n" --- -AlterTableStmt@0..60 +AlterTableStmt@0..59 Alter@0..5 "ALTER" Whitespace@5..6 " " Table@6..11 "TABLE" Whitespace@11..12 " " - Ident@12..20 "products" + RangeVar@12..20 + Ident@12..20 "products" Whitespace@20..21 " " Alter@21..26 "ALTER" Whitespace@26..27 " " Column@27..33 "COLUMN" Whitespace@33..34 " " - Ident@34..39 "price" - Whitespace@39..40 " " - TypeP@40..44 "TYPE" - Whitespace@44..45 " " - Numeric@45..52 "numeric" - Ascii40@52..53 "(" - Iconst@53..55 "10" - Ascii44@55..56 "," - Iconst@56..57 "2" - Ascii41@57..58 ")" + AlterTableCmd@34..58 + ColumnDef@34..58 + Ident@34..39 "price" + Whitespace@39..40 " " + TypeP@40..44 "TYPE" + Whitespace@44..45 " " + TypeName@45..58 + String@45..52 + Numeric@45..52 "numeric" + Ascii40@52..53 "(" + AConst@53..55 + Integer@53..55 + Iconst@53..55 "10" + Ascii44@55..56 "," + AConst@56..57 + Integer@56..57 + Iconst@56..57 "2" + Ascii41@57..58 ")" Ascii59@58..59 ";" - Newline@59..60 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0027.snap b/crates/parser/tests/snapshots/statements/valid/0027.snap index 2b88e432..d4354e7d 100644 --- a/crates/parser/tests/snapshots/statements/valid/0027.snap +++ b/crates/parser/tests/snapshots/statements/valid/0027.snap @@ -2,18 +2,20 @@ source: crates/parser/tests/statement_parser_test.rs description: "GRANT UPDATE ON accounts TO joe;\n" --- -GrantStmt@0..33 +GrantStmt@0..32 Grant@0..5 "GRANT" Whitespace@5..6 " " - Update@6..12 "UPDATE" + AccessPriv@6..12 + Update@6..12 "UPDATE" Whitespace@12..13 " " On@13..15 "ON" Whitespace@15..16 " " - Ident@16..24 "accounts" + RangeVar@16..24 + Ident@16..24 "accounts" Whitespace@24..25 " " To@25..27 "TO" Whitespace@27..28 " " - Ident@28..31 "joe" + RoleSpec@28..31 + Ident@28..31 "joe" Ascii59@31..32 ";" - Newline@32..33 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0028.snap b/crates/parser/tests/snapshots/statements/valid/0028.snap index c9ca0372..05a72aba 100644 --- a/crates/parser/tests/snapshots/statements/valid/0028.snap +++ b/crates/parser/tests/snapshots/statements/valid/0028.snap @@ -2,18 +2,18 @@ source: crates/parser/tests/statement_parser_test.rs description: "REVOKE ALL ON accounts FROM PUBLIC;\n" --- -GrantStmt@0..36 +GrantStmt@0..35 Revoke@0..6 "REVOKE" Whitespace@6..7 " " All@7..10 "ALL" Whitespace@10..11 " " On@11..13 "ON" Whitespace@13..14 " " - Ident@14..22 "accounts" + RangeVar@14..22 + Ident@14..22 "accounts" Whitespace@22..23 " " From@23..27 "FROM" Whitespace@27..28 " " Ident@28..34 "PUBLIC" Ascii59@34..35 ";" - Newline@35..36 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0029.snap b/crates/parser/tests/snapshots/statements/valid/0029.snap index ae280f52..4c2bcacc 100644 --- a/crates/parser/tests/snapshots/statements/valid/0029.snap +++ b/crates/parser/tests/snapshots/statements/valid/0029.snap @@ -2,29 +2,34 @@ source: crates/parser/tests/statement_parser_test.rs description: "GRANT SELECT (col1), UPDATE (col1) ON mytable TO miriam_rw;\n" --- -GrantStmt@0..60 +GrantStmt@0..59 Grant@0..5 "GRANT" Whitespace@5..6 " " - Select@6..12 "SELECT" - Whitespace@12..13 " " - Ascii40@13..14 "(" - Ident@14..18 "col1" - Ascii41@18..19 ")" + AccessPriv@6..19 + Select@6..12 "SELECT" + Whitespace@12..13 " " + Ascii40@13..14 "(" + String@14..18 + Ident@14..18 "col1" + Ascii41@18..19 ")" Ascii44@19..20 "," Whitespace@20..21 " " - Update@21..27 "UPDATE" - Whitespace@27..28 " " - Ascii40@28..29 "(" - Ident@29..33 "col1" - Ascii41@33..34 ")" + AccessPriv@21..34 + Update@21..27 "UPDATE" + Whitespace@27..28 " " + Ascii40@28..29 "(" + String@29..33 + Ident@29..33 "col1" + Ascii41@33..34 ")" Whitespace@34..35 " " On@35..37 "ON" Whitespace@37..38 " " - Ident@38..45 "mytable" + RangeVar@38..45 + Ident@38..45 "mytable" Whitespace@45..46 " " To@46..48 "TO" Whitespace@48..49 " " - Ident@49..58 "miriam_rw" + RoleSpec@49..58 + Ident@49..58 "miriam_rw" Ascii59@58..59 ";" - Newline@59..60 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0030.snap b/crates/parser/tests/snapshots/statements/valid/0030.snap index 68c40f34..e3f2bb1e 100644 --- a/crates/parser/tests/snapshots/statements/valid/0030.snap +++ b/crates/parser/tests/snapshots/statements/valid/0030.snap @@ -2,7 +2,7 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE POLICY account_managers ON accounts TO managers\n USING (manager = current_user);\n" --- -CreatePolicyStmt@0..91 +CreatePolicyStmt@0..90 Create@0..6 "CREATE" Whitespace@6..7 " " Policy@7..13 "POLICY" @@ -11,22 +11,28 @@ CreatePolicyStmt@0..91 Whitespace@30..31 " " On@31..33 "ON" Whitespace@33..34 " " - Ident@34..42 "accounts" + RangeVar@34..42 + Ident@34..42 "accounts" Whitespace@42..43 " " To@43..45 "TO" Whitespace@45..46 " " - Ident@46..54 "managers" + RoleSpec@46..54 + Ident@46..54 "managers" Newline@54..55 "\n" Whitespace@55..59 " " Using@59..64 "USING" Whitespace@64..65 " " Ascii40@65..66 "(" - Ident@66..73 "manager" - Whitespace@73..74 " " - Ascii61@74..75 "=" - Whitespace@75..76 " " - CurrentUser@76..88 "current_user" + AExpr@66..88 + ColumnRef@66..73 + String@66..73 + Ident@66..73 "manager" + Whitespace@73..74 " " + String@74..75 + Ascii61@74..75 "=" + Whitespace@75..76 " " + SqlvalueFunction@76..88 + CurrentUser@76..88 "current_user" Ascii41@88..89 ")" Ascii59@89..90 ";" - Newline@90..91 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0031.snap b/crates/parser/tests/snapshots/statements/valid/0031.snap index f3456e11..42afd22e 100644 --- a/crates/parser/tests/snapshots/statements/valid/0031.snap +++ b/crates/parser/tests/snapshots/statements/valid/0031.snap @@ -2,7 +2,7 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE POLICY user_mod ON passwd FOR UPDATE\n USING (current_user = user_name)\n WITH CHECK (\n current_user = user_name AND\n shell IN ('/bin/bash','/bin/sh','/bin/dash','/bin/zsh','/bin/tcsh')\n );\n" --- -CreatePolicyStmt@0..204 +CreatePolicyStmt@0..203 Create@0..6 "CREATE" Whitespace@6..7 " " Policy@7..13 "POLICY" @@ -11,7 +11,8 @@ CreatePolicyStmt@0..204 Whitespace@22..23 " " On@23..25 "ON" Whitespace@25..26 " " - Ident@26..32 "passwd" + RangeVar@26..32 + Ident@26..32 "passwd" Whitespace@32..33 " " For@33..36 "FOR" Whitespace@36..37 " " @@ -21,48 +22,72 @@ CreatePolicyStmt@0..204 Using@46..51 "USING" Whitespace@51..52 " " Ascii40@52..53 "(" - CurrentUser@53..65 "current_user" - Whitespace@65..66 " " - Ascii61@66..67 "=" - Whitespace@67..68 " " - Ident@68..77 "user_name" - Ascii41@77..78 ")" - Newline@78..79 "\n" - Whitespace@79..81 " " - With@81..85 "WITH" - Whitespace@85..86 " " - Check@86..91 "CHECK" - Whitespace@91..92 " " - Ascii40@92..93 "(" - Newline@93..94 "\n" - Whitespace@94..98 " " - CurrentUser@98..110 "current_user" - Whitespace@110..111 " " - Ascii61@111..112 "=" - Whitespace@112..113 " " - Ident@113..122 "user_name" - Whitespace@122..123 " " - And@123..126 "AND" - Newline@126..127 "\n" - Whitespace@127..131 " " - Ident@131..136 "shell" - Whitespace@136..137 " " - InP@137..139 "IN" - Whitespace@139..140 " " - Ascii40@140..141 "(" - Sconst@141..152 "'/bin/bash'" - Ascii44@152..153 "," - Sconst@153..162 "'/bin/sh'" - Ascii44@162..163 "," - Sconst@163..174 "'/bin/dash'" - Ascii44@174..175 "," - Sconst@175..185 "'/bin/zsh'" - Ascii44@185..186 "," - Sconst@186..197 "'/bin/tcsh'" - Ascii41@197..198 ")" - Newline@198..199 "\n" - Whitespace@199..201 " " - Ascii41@201..202 ")" + AExpr@53..202 + SqlvalueFunction@53..65 + CurrentUser@53..65 "current_user" + Whitespace@65..66 " " + BoolExpr@66..202 + AExpr@66..202 + String@66..67 + Ascii61@66..67 "=" + Whitespace@67..68 " " + ColumnRef@68..77 + String@68..77 + Ident@68..77 "user_name" + Ascii41@77..78 ")" + Newline@78..79 "\n" + Whitespace@79..81 " " + With@81..85 "WITH" + Whitespace@85..86 " " + Check@86..91 "CHECK" + Whitespace@91..92 " " + Ascii40@92..93 "(" + Newline@93..94 "\n" + Whitespace@94..98 " " + SqlvalueFunction@98..110 + CurrentUser@98..110 "current_user" + Whitespace@110..111 " " + AExpr@111..202 + String@111..112 + Ascii61@111..112 "=" + Whitespace@112..113 " " + ColumnRef@113..122 + String@113..122 + Ident@113..122 "user_name" + Whitespace@122..123 " " + And@123..126 "AND" + Newline@126..127 "\n" + Whitespace@127..131 " " + ColumnRef@131..136 + String@131..136 + Ident@131..136 "shell" + Whitespace@136..137 " " + InP@137..139 "IN" + Whitespace@139..140 " " + Ascii40@140..141 "(" + List@141..197 + AConst@141..152 + String@141..152 + Sconst@141..152 "'/bin/bash'" + Ascii44@152..153 "," + AConst@153..162 + String@153..162 + Sconst@153..162 "'/bin/sh'" + Ascii44@162..163 "," + AConst@163..174 + String@163..174 + Sconst@163..174 "'/bin/dash'" + Ascii44@174..175 "," + AConst@175..185 + String@175..185 + Sconst@175..185 "'/bin/zsh'" + Ascii44@185..186 "," + AConst@186..197 + String@186..197 + Sconst@186..197 "'/bin/tcsh'" + Ascii41@197..198 ")" + Newline@198..199 "\n" + Whitespace@199..201 " " + Ascii41@201..202 ")" Ascii59@202..203 ";" - Newline@203..204 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0032.snap b/crates/parser/tests/snapshots/statements/valid/0032.snap index d82132b6..772fbfbc 100644 --- a/crates/parser/tests/snapshots/statements/valid/0032.snap +++ b/crates/parser/tests/snapshots/statements/valid/0032.snap @@ -2,16 +2,19 @@ source: crates/parser/tests/statement_parser_test.rs description: "SET search_path TO myschema,public;\n" --- -VariableSetStmt@0..36 +VariableSetStmt@0..35 Set@0..3 "SET" Whitespace@3..4 " " Ident@4..15 "search_path" Whitespace@15..16 " " To@16..18 "TO" Whitespace@18..19 " " - Ident@19..27 "myschema" + AConst@19..27 + String@19..27 + Ident@19..27 "myschema" Ascii44@27..28 "," - Ident@28..34 "public" + AConst@28..34 + String@28..34 + Ident@28..34 "public" Ascii59@34..35 ";" - Newline@35..36 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0033.snap b/crates/parser/tests/snapshots/statements/valid/0033.snap index dbef84a2..f871cd80 100644 --- a/crates/parser/tests/snapshots/statements/valid/0033.snap +++ b/crates/parser/tests/snapshots/statements/valid/0033.snap @@ -2,19 +2,23 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE TABLE measurement (\n city_id int not null,\n logdate date not null,\n peaktemp int,\n unitsales int\n) PARTITION BY RANGE (logdate);\n" --- -CreateStmt@0..177 +CreateStmt@0..176 Create@0..6 "CREATE" Whitespace@6..7 " " Table@7..12 "TABLE" Whitespace@12..13 " " - Ident@13..24 "measurement" + RangeVar@13..24 + Ident@13..24 "measurement" Whitespace@24..25 " " Ascii40@25..26 "(" Newline@26..27 "\n" Whitespace@27..31 " " - Ident@31..38 "city_id" - Whitespace@38..47 " " - IntP@47..50 "int" + ColumnDef@31..50 + Ident@31..38 "city_id" + Whitespace@38..47 " " + TypeName@47..50 + String@47..50 + IntP@47..50 "int" Whitespace@50..51 " " Not@51..54 "not" Whitespace@54..55 " " @@ -22,9 +26,12 @@ CreateStmt@0..177 Ascii44@59..60 "," Newline@60..61 "\n" Whitespace@61..65 " " - Ident@65..72 "logdate" - Whitespace@72..81 " " - Ident@81..85 "date" + ColumnDef@65..85 + Ident@65..72 "logdate" + Whitespace@72..81 " " + TypeName@81..85 + String@81..85 + Ident@81..85 "date" Whitespace@85..86 " " Not@86..89 "not" Whitespace@89..90 " " @@ -32,27 +39,34 @@ CreateStmt@0..177 Ascii44@94..95 "," Newline@95..96 "\n" Whitespace@96..100 " " - Ident@100..108 "peaktemp" - Whitespace@108..116 " " - IntP@116..119 "int" + ColumnDef@100..119 + Ident@100..108 "peaktemp" + Whitespace@108..116 " " + TypeName@116..119 + String@116..119 + IntP@116..119 "int" Ascii44@119..120 "," Newline@120..121 "\n" Whitespace@121..125 " " - Ident@125..134 "unitsales" - Whitespace@134..141 " " - IntP@141..144 "int" + ColumnDef@125..144 + Ident@125..134 "unitsales" + Whitespace@134..141 " " + TypeName@141..144 + String@141..144 + IntP@141..144 "int" Newline@144..145 "\n" Ascii41@145..146 ")" Whitespace@146..147 " " - Partition@147..156 "PARTITION" - Whitespace@156..157 " " - By@157..159 "BY" - Whitespace@159..160 " " - Range@160..165 "RANGE" - Whitespace@165..166 " " - Ascii40@166..167 "(" - Ident@167..174 "logdate" - Ascii41@174..175 ")" + PartitionSpec@147..175 + Partition@147..156 "PARTITION" + Whitespace@156..157 " " + By@157..159 "BY" + Whitespace@159..160 " " + Range@160..165 "RANGE" + Whitespace@165..166 " " + Ascii40@166..167 "(" + PartitionElem@167..174 + Ident@167..174 "logdate" + Ascii41@174..175 ")" Ascii59@175..176 ";" - Newline@176..177 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0034.snap b/crates/parser/tests/snapshots/statements/valid/0034.snap index efede445..356bacab 100644 --- a/crates/parser/tests/snapshots/statements/valid/0034.snap +++ b/crates/parser/tests/snapshots/statements/valid/0034.snap @@ -2,24 +2,36 @@ source: crates/parser/tests/statement_parser_test.rs description: "select *,some_col from contact where id = '123 4 5';\n" --- -SelectStmt@0..53 +SelectStmt@0..52 Select@0..6 "select" Whitespace@6..7 " " - Ascii42@7..8 "*" + ResTarget@7..8 + ColumnRef@7..8 + AStar@7..8 + Ascii42@7..8 "*" Ascii44@8..9 "," - Ident@9..17 "some_col" + ResTarget@9..17 + ColumnRef@9..17 + String@9..17 + Ident@9..17 "some_col" Whitespace@17..18 " " From@18..22 "from" Whitespace@22..23 " " - Ident@23..30 "contact" + RangeVar@23..30 + Ident@23..30 "contact" Whitespace@30..31 " " Where@31..36 "where" Whitespace@36..37 " " - Ident@37..39 "id" - Whitespace@39..40 " " - Ascii61@40..41 "=" - Whitespace@41..42 " " - Sconst@42..51 "'123 4 5'" + AExpr@37..51 + ColumnRef@37..39 + String@37..39 + Ident@37..39 "id" + Whitespace@39..40 " " + String@40..41 + Ascii61@40..41 "=" + Whitespace@41..42 " " + AConst@42..51 + String@42..51 + Sconst@42..51 "'123 4 5'" Ascii59@51..52 ";" - Newline@52..53 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0035.snap b/crates/parser/tests/snapshots/statements/valid/0035.snap index efede445..356bacab 100644 --- a/crates/parser/tests/snapshots/statements/valid/0035.snap +++ b/crates/parser/tests/snapshots/statements/valid/0035.snap @@ -2,24 +2,36 @@ source: crates/parser/tests/statement_parser_test.rs description: "select *,some_col from contact where id = '123 4 5';\n" --- -SelectStmt@0..53 +SelectStmt@0..52 Select@0..6 "select" Whitespace@6..7 " " - Ascii42@7..8 "*" + ResTarget@7..8 + ColumnRef@7..8 + AStar@7..8 + Ascii42@7..8 "*" Ascii44@8..9 "," - Ident@9..17 "some_col" + ResTarget@9..17 + ColumnRef@9..17 + String@9..17 + Ident@9..17 "some_col" Whitespace@17..18 " " From@18..22 "from" Whitespace@22..23 " " - Ident@23..30 "contact" + RangeVar@23..30 + Ident@23..30 "contact" Whitespace@30..31 " " Where@31..36 "where" Whitespace@36..37 " " - Ident@37..39 "id" - Whitespace@39..40 " " - Ascii61@40..41 "=" - Whitespace@41..42 " " - Sconst@42..51 "'123 4 5'" + AExpr@37..51 + ColumnRef@37..39 + String@37..39 + Ident@37..39 "id" + Whitespace@39..40 " " + String@40..41 + Ascii61@40..41 "=" + Whitespace@41..42 " " + AConst@42..51 + String@42..51 + Sconst@42..51 "'123 4 5'" Ascii59@51..52 ";" - Newline@52..53 "\n" diff --git a/crates/parser/tests/snapshots/statements/valid/0036.snap b/crates/parser/tests/snapshots/statements/valid/0036.snap index 56b13946..2f52a5d4 100644 --- a/crates/parser/tests/snapshots/statements/valid/0036.snap +++ b/crates/parser/tests/snapshots/statements/valid/0036.snap @@ -2,41 +2,55 @@ source: crates/parser/tests/statement_parser_test.rs description: "CREATE FUNCTION dup(in int, out f1 int, out f2 text)\n AS $$ SELECT $1, CAST($1 AS text) || ' is text' $$\n LANGUAGE SQL;\n" --- -CreateFunctionStmt@0..126 +CreateFunctionStmt@0..125 Create@0..6 "CREATE" Whitespace@6..7 " " Function@7..15 "FUNCTION" Whitespace@15..16 " " - Ident@16..19 "dup" + String@16..19 + Ident@16..19 "dup" Ascii40@19..20 "(" InP@20..22 "in" Whitespace@22..23 " " - IntP@23..26 "int" + FunctionParameter@23..26 + TypeName@23..26 + String@23..26 + IntP@23..26 "int" Ascii44@26..27 "," Whitespace@27..28 " " OutP@28..31 "out" Whitespace@31..32 " " - Ident@32..34 "f1" - Whitespace@34..35 " " - IntP@35..38 "int" + FunctionParameter@32..38 + Ident@32..34 "f1" + Whitespace@34..35 " " + TypeName@35..38 + String@35..38 + IntP@35..38 "int" Ascii44@38..39 "," Whitespace@39..40 " " OutP@40..43 "out" Whitespace@43..44 " " - Ident@44..46 "f2" - Whitespace@46..47 " " - TextP@47..51 "text" + FunctionParameter@44..51 + Ident@44..46 "f2" + Whitespace@46..47 " " + TypeName@47..51 + String@47..51 + TextP@47..51 "text" Ascii41@51..52 ")" Newline@52..53 "\n" Whitespace@53..57 " " - As@57..59 "AS" - Whitespace@59..60 " " - Sconst@60..107 "$$ SELECT $1, CAST($1 ..." + DefElem@57..107 + As@57..59 "AS" + Whitespace@59..60 " " + List@60..107 + String@60..107 + Sconst@60..107 "$$ SELECT $1, CAST($1 ..." Newline@107..108 "\n" Whitespace@108..112 " " - Language@112..120 "LANGUAGE" - Whitespace@120..121 " " - SqlP@121..124 "SQL" + DefElem@112..124 + Language@112..120 "LANGUAGE" + Whitespace@120..121 " " + String@121..124 + SqlP@121..124 "SQL" Ascii59@124..125 ";" - Newline@125..126 "\n" From e65f8de52880cae5a3739b695811676f40c22264 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Thu, 12 Oct 2023 22:16:33 +0200 Subject: [PATCH 14/16] chore: cleanup --- crates/parser/src/estimate_node_range.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crates/parser/src/estimate_node_range.rs b/crates/parser/src/estimate_node_range.rs index a55ffb65..524efe96 100644 --- a/crates/parser/src/estimate_node_range.rs +++ b/crates/parser/src/estimate_node_range.rs @@ -82,13 +82,9 @@ pub fn estimate_node_range( let mut ranged_nodes: Vec = Vec::new(); - // we get an estimated range by searching for tokens that match the node property values - // and, if available, the `location` of the node itself nodes.iter().enumerate().for_each(|(idx, n)| { let child_token_range = child_token_ranges[idx]; - println!("node: {:#?}, child_token_range: {:?}", n, child_token_range); - let child_node_ranges = ranged_nodes .iter() .filter(|x| x.inner.path.starts_with(n.path.as_str())) From 4d97205f723957414be9d20e7669f8958196f954 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Fri, 13 Oct 2023 17:10:20 +0200 Subject: [PATCH 15/16] chore: add comments --- crates/parser/src/lib.rs | 8 ++++---- crates/parser/src/statement_parser.rs | 15 ++++++++++----- crates/parser/src/syntax_error.rs | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 493b49cb..2a24b634 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -3,17 +3,17 @@ //! This crate provides a parser for the Postgres SQL dialect. //! It is based in the pg_query.rs crate, which is a wrapper around the PostgreSQL query parser. //! The main `Parser` struct parses a source file and individual statements. -//! The `Parse` struct contains the resulting concrete syntax tree, syntax errors, and the abtract syntax tree, which is a list of pg_query statements and their positions. +//! The `Parse` result struct contains the resulting concrete syntax tree, syntax errors, and the abtract syntax tree, which is a list of pg_query statements and their positions. //! //! The idea is to offload the heavy lifting to the same parser that the PostgreSQL server uses, -//! and just fill in the gaps to be able to build both cst and ast from a a source file that +//! and just fill in the gaps to be able to build both cst and ast from a source file that //! potentially contains erroneous statements. //! //! The main drawbacks of the PostgreSQL query parser mitigated by this parser are: //! - it only parsed a full source text, and if there is any syntax error in a file, it will not parse anything and return an error. -//! - it does not parse whitespaces and newlines, so it is not possible to build a concrete syntax tree build a concrete syntax tree. +//! - it does not parse whitespaces and newlines, and it only returns ast nodes. The concrete syntax tree has to be reverse-engineered. //! -//! To see how these drawbacks are mitigated, see the `statement.rs` and the `source_file.rs` module. +//! To see how these drawbacks are mitigated, see the `statement_parser.rs` and the `source_parser.rs` module. mod ast_node; mod estimate_node_range; diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index 46e7126d..25d35511 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -9,10 +9,7 @@ use crate::{ syntax_kind_codegen::SyntaxKind, }; -/// A super simple lexer for sql statements. -/// -/// One weakness of pg_query.rs is that it does not parse whitespace or newlines. We use a very -/// simple lexer to fill the gaps. +/// Super simple lexer that only catches the tokens that libpg_query ignores. #[derive(Logos, Debug, PartialEq)] pub enum StatementToken { // comments and whitespaces @@ -27,7 +24,7 @@ pub enum StatementToken { } impl StatementToken { - /// Creates a `SyntaxKind` from a `StatementToken`. + /// Create a `SyntaxKind` from a `StatementToken`. pub fn syntax_kind(&self) -> SyntaxKind { match self { StatementToken::Whitespace => SyntaxKind::Whitespace, @@ -39,6 +36,13 @@ impl StatementToken { } impl Parser { + /// Parse a single statement passed in `text`. If `at_offset` is `Some`, the statement is assumed to be at that offset in the source file. + /// + /// On a high level, the parser works as follows: + /// - 1. Collect all information from pg_query.rs and `StatementToken` lexer + /// - 2. Derive as much information as possible from the collected information + /// - 3. Collect AST node and errors, if any + /// - 3. Walk the statement token by token, and reverse-engineer the concrete syntax tree pub fn parse_statement_at(&mut self, text: &str, at_offset: Option) { // 1. Collect as much information as possible from pg_query.rs and `StatementToken` lexer @@ -98,6 +102,7 @@ impl Parser { let mut statement_token_lexer = StatementToken::lexer(&text); // 2. Setup data structures required for the parsing algorithm + // A buffer for tokens that are not applied immediately to the cst let mut token_buffer: VecDeque<(SyntaxKind, String)> = VecDeque::new(); // Keeps track of currently open nodes. Latest opened is last. diff --git a/crates/parser/src/syntax_error.rs b/crates/parser/src/syntax_error.rs index d8dee689..df6d670f 100644 --- a/crates/parser/src/syntax_error.rs +++ b/crates/parser/src/syntax_error.rs @@ -2,7 +2,7 @@ use std::fmt; use cstree::text::{TextRange, TextSize}; -/// Represents the result of unsuccessful tokenization, parsing +/// Represents the result of unsuccessful tokenization, parsing, /// or tree validation. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct SyntaxError(String, TextRange); From 197f3512aecbdb2ee1cdb3db8da118f0f66005e8 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Fri, 13 Oct 2023 17:12:46 +0200 Subject: [PATCH 16/16] chore: add comments --- crates/parser/src/estimate_node_range.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/parser/src/estimate_node_range.rs b/crates/parser/src/estimate_node_range.rs index 524efe96..8c8e514b 100644 --- a/crates/parser/src/estimate_node_range.rs +++ b/crates/parser/src/estimate_node_range.rs @@ -53,9 +53,9 @@ pub fn estimate_node_range( }; }); - // second iteration using the nearest parent from the first + // second iteration using the nearest parent from the first, or the location of the nearest + // parent node for idx in too_many_tokens_at { - // get the nearest parent location let nearest_parent_start = get_nearest_parent_start(&nodes[idx], &nodes, &child_token_ranges); let nearest_parent_location = get_nearest_parent_location(&nodes[idx], &nodes); @@ -90,7 +90,7 @@ pub fn estimate_node_range( .filter(|x| x.inner.path.starts_with(n.path.as_str())) .collect::>(); - // get `from` location + // get `from` location as the smaller value of the location of the node, the start of all children nodes, and the start of the first child token let node_location = match get_location(&n.node) { Some(l) => Some(TextSize::from(l)), None => None, @@ -141,6 +141,7 @@ pub fn estimate_node_range( let to = to_locations.iter().filter(|v| v.is_some()).max(); if from.is_some() && to.is_some() { + // ignore nodes that have no range. They are not relevant for the cst. ranged_nodes.push(RangedNode { inner: n.to_owned(), range: TextRange::new(from.unwrap().unwrap(), to.unwrap().unwrap()),