From fa4dd4d06ed209755f36c6685fc4556ed0005f28 Mon Sep 17 00:00:00 2001 From: Ash Zahlen Date: Thu, 12 Jun 2025 14:57:54 -0700 Subject: [PATCH] Add support for Databricks JSON path syntax --- examples/cli.rs | 1 + src/ast/mod.rs | 22 ++++++-- src/ast/spans.rs | 4 +- src/dialect/databricks.rs | 17 +++++- src/dialect/mod.rs | 5 ++ src/parser/mod.rs | 36 ++++++++++--- tests/sqlparser_databricks.rs | 97 +++++++++++++++++++++++++++++++++++ tests/sqlparser_redshift.rs | 6 +++ tests/sqlparser_snowflake.rs | 12 +++++ 9 files changed, 188 insertions(+), 12 deletions(-) diff --git a/examples/cli.rs b/examples/cli.rs index 0252fca74..9f450c1d4 100644 --- a/examples/cli.rs +++ b/examples/cli.rs @@ -48,6 +48,7 @@ $ cargo run --example cli - [--dialectname] let dialect: Box = match std::env::args().nth(2).unwrap_or_default().as_ref() { "--ansi" => Box::new(AnsiDialect {}), + "--databricks" => Box::new(DatabricksDialect {}), "--bigquery" => Box::new(BigQueryDialect {}), "--postgres" => Box::new(PostgreSqlDialect {}), "--ms" => Box::new(MsSqlDialect {}), diff --git a/src/ast/mod.rs b/src/ast/mod.rs index b97394c10..acf409173 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -527,8 +527,16 @@ pub enum JsonPathElem { /// Accesses an object field or array element using bracket notation, /// e.g. `obj['foo']`. /// + /// Note that on Databricks this is *not* equivalent to dot notation: bracket + /// keys are matched case-sensitively, while dot-notation keys are case-insensitive. + /// /// See . Bracket { key: Expr }, + /// Accesses all elements in the given (generally array) element. Used for + /// constructs like `foo:bar[*].baz`. + /// + /// See + AllElements, } /// A JSON path. @@ -539,17 +547,22 @@ pub enum JsonPathElem { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] pub struct JsonPath { + /// True if the path should start with a colon. Some dialects (e.g. Snowflake) allow + /// `a['b']`, whereas others (e.g. 
Databricks) require the colon even in this case + /// (so `a:['b']`). + pub has_colon: bool, pub path: Vec, } impl fmt::Display for JsonPath { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.has_colon { + write!(f, ":")?; + } for (i, elem) in self.path.iter().enumerate() { match elem { JsonPathElem::Dot { key, quoted } => { - if i == 0 { - write!(f, ":")?; - } else { + if i != 0 { write!(f, ".")?; } @@ -562,6 +575,9 @@ impl fmt::Display for JsonPath { JsonPathElem::Bracket { key } => { write!(f, "[{key}]")?; } + JsonPathElem::AllElements => { + write!(f, "[*]")?; + } } } Ok(()) diff --git a/src/ast/spans.rs b/src/ast/spans.rs index 9b8c8d790..1d321d8c6 100644 --- a/src/ast/spans.rs +++ b/src/ast/spans.rs @@ -1747,7 +1747,7 @@ impl Spanned for FunctionArgumentClause { /// see Spanned impl for JsonPathElem for more information impl Spanned for JsonPath { fn span(&self) -> Span { - let JsonPath { path } = self; + let JsonPath { path, has_colon: _ } = self; union_spans(path.iter().map(|i| i.span())) } @@ -1757,11 +1757,13 @@ impl Spanned for JsonPath { /// /// Missing spans: /// - [JsonPathElem::Dot] +/// - [JsonPathElem::AllElements] impl Spanned for JsonPathElem { fn span(&self) -> Span { match self { JsonPathElem::Dot { .. } => Span::empty(), JsonPathElem::Bracket { key } => key.span(), + JsonPathElem::AllElements => Span::empty(), } } } diff --git a/src/dialect/databricks.rs b/src/dialect/databricks.rs index a3476b1b8..261133d19 100644 --- a/src/dialect/databricks.rs +++ b/src/dialect/databricks.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::dialect::Dialect; +use crate::dialect::{Dialect, Precedence}; +use crate::parser::{Parser, ParserError}; +use crate::tokenizer::Token; /// A [`Dialect`] for [Databricks SQL](https://www.databricks.com/) /// @@ -38,6 +40,19 @@ impl Dialect for DatabricksDialect { matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') } + fn get_next_precedence(&self, parser: &Parser) -> Option> { + let token = parser.peek_token(); + // : is used for JSON path access + match token.token { + Token::Colon => Some(Ok(self.prec_value(Precedence::Period))), + _ => None, + } + } + + fn supports_semi_structured_array_all_elements(&self) -> bool { + true + } + fn supports_filter_during_aggregation(&self) -> bool { true } diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index a4c899e6b..06cca2fb9 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -892,6 +892,11 @@ pub trait Dialect: Debug + Any { false } + /// Returns true if the dialect supports writing `[*]` to select all elements in a JSON array. + fn supports_semi_structured_array_all_elements(&self) -> bool { + false + } + /// Returns true if the specified keyword is reserved and cannot be /// used as an identifier without special handling like quoting. fn is_reserved_for_identifier(&self, kw: Keyword) -> bool { diff --git a/src/parser/mod.rs b/src/parser/mod.rs index b54c7b138..74c98e85c 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3632,7 +3632,8 @@ impl<'a> Parser<'a> { expr: Box::new(expr), }) } else if Token::LBracket == *tok && self.dialect.supports_partiql() - || (dialect_of!(self is SnowflakeDialect | GenericDialect) && Token::Colon == *tok) + || (dialect_of!(self is SnowflakeDialect | GenericDialect | DatabricksDialect) + && Token::Colon == *tok) { self.prev_token(); self.parse_json_access(expr) @@ -3779,21 +3780,26 @@ impl<'a> Parser<'a> { }) } + // Parser is either looking at a : or a bracket expression. 
fn parse_json_path(&mut self) -> Result { let mut path = Vec::new(); + let mut has_colon = false; loop { match self.next_token().token { Token::Colon if path.is_empty() => { - path.push(self.parse_json_path_object_key()?); + has_colon = true; + if *self.peek_token_ref() == Token::LBracket { + path.push(self.parse_json_path_bracket_element()?); + } else { + path.push(self.parse_json_path_object_key()?); + } } Token::Period if !path.is_empty() => { path.push(self.parse_json_path_object_key()?); } Token::LBracket => { - let key = self.parse_expr()?; - self.expect_token(&Token::RBracket)?; - - path.push(JsonPathElem::Bracket { key }); + self.prev_token(); + path.push(self.parse_json_path_bracket_element()?); } _ => { self.prev_token(); @@ -3803,7 +3809,23 @@ impl<'a> Parser<'a> { } debug_assert!(!path.is_empty()); - Ok(JsonPath { path }) + Ok(JsonPath { has_colon, path }) + } + + /// Parses a single bracketed element in a JSON path expression, including both brackets. + fn parse_json_path_bracket_element(&mut self) -> Result { + self.expect_token(&Token::LBracket)?; + let elem = if *self.peek_token_ref() == Token::Mul + && self.dialect.supports_semi_structured_array_all_elements() + { + self.expect_token(&Token::Mul)?; + JsonPathElem::AllElements + } else { + let key = self.parse_expr()?; + JsonPathElem::Bracket { key } + }; + self.expect_token(&Token::RBracket)?; + Ok(elem) } /// Parses the parens following the `[ NOT ] IN` operator. 
diff --git a/tests/sqlparser_databricks.rs b/tests/sqlparser_databricks.rs index 99b7eecde..b9d6b28a3 100644 --- a/tests/sqlparser_databricks.rs +++ b/tests/sqlparser_databricks.rs @@ -360,3 +360,100 @@ fn data_type_timestamp_ntz() { s => panic!("Unexpected statement: {:?}", s), } } + +#[test] +fn parse_semi_structured_data_traversal() { + // basic case + let sql = "SELECT a:b.c FROM t"; + let select = databricks().verified_only_select(sql); + assert_eq!( + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + has_colon: true, + path: vec![ + JsonPathElem::Dot { + key: "b".to_owned(), + quoted: false + }, + JsonPathElem::Dot { + key: "c".to_owned(), + quoted: false + } + ] + }, + }), + select.projection[0] + ); + + // brackets + let sql = "SELECT a:b['c'][0] FROM t"; + let select = databricks().verified_only_select(sql); + assert_eq!( + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + has_colon: true, + path: vec![ + JsonPathElem::Dot { + key: "b".to_owned(), + quoted: false + }, + JsonPathElem::Bracket { + key: Expr::value(Value::SingleQuotedString("c".to_owned())) + }, + JsonPathElem::Bracket { + key: Expr::value(number("0")) + } + ] + }, + }), + select.projection[0] + ); + + // bracket notation directly after the colon + let sql = "SELECT a:['b'].c FROM t"; + let select = databricks().verified_only_select(sql); + assert_eq!( + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + has_colon: true, + path: vec![ + JsonPathElem::Bracket { + key: Expr::value(Value::SingleQuotedString("b".to_owned())), + }, + JsonPathElem::Dot { + key: "c".to_owned(), + quoted: false + } + ] + }, + }), + select.projection[0] + ); + + // asterisk for arrays + let sql = "SELECT a:b[*].c FROM t"; + let select = databricks().verified_only_select(sql); + assert_eq!( + SelectItem::UnnamedExpr(Expr::JsonAccess { + 
value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + has_colon: true, + path: vec![ + JsonPathElem::Dot { + key: "b".to_owned(), + quoted: false + }, + JsonPathElem::AllElements, + JsonPathElem::Dot { + key: "c".to_owned(), + quoted: false + } + ] + }, + }), + select.projection[0] + ); +} diff --git a/tests/sqlparser_redshift.rs b/tests/sqlparser_redshift.rs index be2b67223..446a927f2 100644 --- a/tests/sqlparser_redshift.rs +++ b/tests/sqlparser_redshift.rs @@ -206,6 +206,7 @@ fn test_redshift_json_path() { Ident::new("c_orders") ])), path: JsonPath { + has_colon: false, path: vec![ JsonPathElem::Bracket { key: Expr::value(number("0")) @@ -229,6 +230,7 @@ fn test_redshift_json_path() { Ident::new("c_orders") ])), path: JsonPath { + has_colon: false, path: vec![ JsonPathElem::Bracket { key: Expr::value(number("0")) @@ -255,6 +257,7 @@ fn test_redshift_json_path() { Ident::new("col1") ])), path: JsonPath { + has_colon: false, path: vec![ JsonPathElem::Bracket { key: Expr::value(number("0")) @@ -281,6 +284,7 @@ fn test_redshift_json_path() { Ident::new("col1") ])), path: JsonPath { + has_colon: false, path: vec![ JsonPathElem::Bracket { key: Expr::value(number("0")) @@ -308,6 +312,7 @@ fn test_parse_json_path_from() { assert_eq!( json_path, &Some(JsonPath { + has_colon: false, path: vec![ JsonPathElem::Bracket { key: Expr::value(number("0")) @@ -332,6 +337,7 @@ fn test_parse_json_path_from() { assert_eq!( json_path, &Some(JsonPath { + has_colon: false, path: vec![ JsonPathElem::Bracket { key: Expr::value(number("0")) diff --git a/tests/sqlparser_snowflake.rs b/tests/sqlparser_snowflake.rs index 11199a625..cf7dba342 100644 --- a/tests/sqlparser_snowflake.rs +++ b/tests/sqlparser_snowflake.rs @@ -1127,6 +1127,7 @@ fn parse_semi_structured_data_traversal() { SelectItem::UnnamedExpr(Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: true, path: vec![JsonPathElem::Dot { key: "b".to_owned(), quoted: 
false @@ -1143,6 +1144,7 @@ fn parse_semi_structured_data_traversal() { SelectItem::UnnamedExpr(Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: true, path: vec![JsonPathElem::Dot { key: "my long object key name".to_owned(), quoted: true @@ -1159,6 +1161,7 @@ fn parse_semi_structured_data_traversal() { SelectItem::UnnamedExpr(Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: false, path: vec![JsonPathElem::Bracket { key: Expr::BinaryOp { left: Box::new(Expr::value(number("2"))), @@ -1181,6 +1184,7 @@ fn parse_semi_structured_data_traversal() { SelectItem::UnnamedExpr(Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: true, path: vec![JsonPathElem::Dot { key: "select".to_owned(), quoted: false @@ -1190,6 +1194,7 @@ fn parse_semi_structured_data_traversal() { SelectItem::UnnamedExpr(Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: true, path: vec![JsonPathElem::Dot { key: "from".to_owned(), quoted: false @@ -1208,6 +1213,7 @@ fn parse_semi_structured_data_traversal() { vec![SelectItem::UnnamedExpr(Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: true, path: vec![ JsonPathElem::Dot { key: "foo".to_owned(), @@ -1235,6 +1241,7 @@ fn parse_semi_structured_data_traversal() { vec![SelectItem::UnnamedExpr(Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: true, path: vec![ JsonPathElem::Dot { key: "foo".to_owned(), @@ -1261,6 +1268,7 @@ fn parse_semi_structured_data_traversal() { vec![SelectItem::UnnamedExpr(Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: false, path: vec![ JsonPathElem::Bracket { key: Expr::value(number("0")), @@ -1285,10 +1293,12 @@ fn parse_semi_structured_data_traversal() { Expr::JsonAccess { value: 
Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: false, path: vec![JsonPathElem::Bracket { key: Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("b"))), path: JsonPath { + has_colon: true, path: vec![JsonPathElem::Dot { key: "c".to_owned(), quoted: false @@ -1320,6 +1330,7 @@ fn parse_semi_structured_data_traversal() { expr: Box::new(Expr::JsonAccess { value: Box::new(Expr::Identifier(Ident::new("a"))), path: JsonPath { + has_colon: true, path: vec![JsonPathElem::Dot { key: "b".to_string(), quoted: false @@ -1328,6 +1339,7 @@ fn parse_semi_structured_data_traversal() { }) }), path: JsonPath { + has_colon: false, path: vec![JsonPathElem::Bracket { key: Expr::value(number("1")) }]