Skip to content

Support for databricks JSON path syntax #35

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ $ cargo run --example cli - [--dialectname]

let dialect: Box<dyn Dialect> = match std::env::args().nth(2).unwrap_or_default().as_ref() {
"--ansi" => Box::new(AnsiDialect {}),
"--databricks" => Box::new(DatabricksDialect {}),
"--bigquery" => Box::new(BigQueryDialect {}),
"--postgres" => Box::new(PostgreSqlDialect {}),
"--ms" => Box::new(MsSqlDialect {}),
Expand Down
22 changes: 19 additions & 3 deletions src/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -527,8 +527,16 @@ pub enum JsonPathElem {
/// Accesses an object field or array element using bracket notation,
/// e.g. `obj['foo']`.
///
/// Note that on Databricks this is *not* equivalent to dot notation:
/// bracket notation is case-sensitive, whereas dot notation is case-insensitive.
///
/// See <https://docs.snowflake.com/en/user-guide/querying-semistructured#bracket-notation>.
Bracket { key: Expr },
/// Accesses all elements in the given (generally array) element. Used for
/// constructs like `foo:bar[*].baz`.
///
/// See <https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-json-path-expression#extract-values-from-arrays>
AllElements,
}

/// A JSON path.
Expand All @@ -539,17 +547,22 @@ pub enum JsonPathElem {
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct JsonPath {
/// True if the path should start with a colon. Some dialects (e.g. Snowflake) allow
/// `a['b']`, whereas others (e.g. Databricks) require the colon even in this case
/// (so `a:['b']`).
///
/// When set, the `Display` impl writes a leading `:` before the first element.
pub has_colon: bool,
/// The path elements, in the order they appear in the source text.
pub path: Vec<JsonPathElem>,
}

impl fmt::Display for JsonPath {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.has_colon {
write!(f, ":")?;
}
for (i, elem) in self.path.iter().enumerate() {
match elem {
JsonPathElem::Dot { key, quoted } => {
if i == 0 {
write!(f, ":")?;
} else {
if i != 0 {
write!(f, ".")?;
}

Expand All @@ -562,6 +575,9 @@ impl fmt::Display for JsonPath {
JsonPathElem::Bracket { key } => {
write!(f, "[{key}]")?;
}
JsonPathElem::AllElements => {
write!(f, "[*]")?;
}
}
}
Ok(())
Expand Down
4 changes: 3 additions & 1 deletion src/ast/spans.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1747,7 +1747,7 @@ impl Spanned for FunctionArgumentClause {
/// see Spanned impl for JsonPathElem for more information
impl Spanned for JsonPath {
fn span(&self) -> Span {
let JsonPath { path } = self;
let JsonPath { path, has_colon: _ } = self;

union_spans(path.iter().map(|i| i.span()))
}
Expand All @@ -1757,11 +1757,13 @@ impl Spanned for JsonPath {
///
/// Missing spans:
/// - [JsonPathElem::Dot]
/// - [JsonPathElem::AllElements]
impl Spanned for JsonPathElem {
    fn span(&self) -> Span {
        match self {
            // Bracket access wraps an expression, so its span can be reported.
            JsonPathElem::Bracket { key } => key.span(),
            // These variants report an empty span.
            JsonPathElem::Dot { .. } | JsonPathElem::AllElements => Span::empty(),
        }
    }
}
Expand Down
17 changes: 16 additions & 1 deletion src/dialect/databricks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use crate::dialect::Dialect;
use crate::dialect::{Dialect, Precedence};
use crate::parser::{Parser, ParserError};
use crate::tokenizer::Token;

/// A [`Dialect`] for [Databricks SQL](https://www.databricks.com/)
///
Expand All @@ -38,6 +40,19 @@ impl Dialect for DatabricksDialect {
matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}

/// Gives `:` an explicit precedence, because Databricks uses it for JSON path access.
///
/// Returns `Some(Ok(..))` carrying the `Precedence::Period` value when the next token
/// is a colon, and `None` for every other token.
fn get_next_precedence(&self, parser: &Parser) -> Option<Result<u8, ParserError>> {
    if matches!(parser.peek_token().token, Token::Colon) {
        Some(Ok(self.prec_value(Precedence::Period)))
    } else {
        // NOTE(review): presumably the caller falls back to the default precedence
        // table on `None` — confirm against the trait documentation.
        None
    }
}

/// Databricks accepts `[*]` inside a JSON path to select all elements of an array.
fn supports_semi_structured_array_all_elements(&self) -> bool {
true
}

// NOTE(review): opts this dialect in to filter-during-aggregation support; the exact
// syntax this enables is defined by the trait method's documentation, not visible here.
fn supports_filter_during_aggregation(&self) -> bool {
true
}
Expand Down
5 changes: 5 additions & 0 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,11 @@ pub trait Dialect: Debug + Any {
false
}

/// Returns true if the dialect supports writing `[*]` to select all elements in a JSON array,
/// e.g. `foo:bar[*].baz`.
///
/// Defaults to `false`; dialects that accept this syntax override it.
fn supports_semi_structured_array_all_elements(&self) -> bool {
false
}

/// Returns true if the specified keyword is reserved and cannot be
/// used as an identifier without special handling like quoting.
fn is_reserved_for_identifier(&self, kw: Keyword) -> bool {
Expand Down
36 changes: 29 additions & 7 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3632,7 +3632,8 @@ impl<'a> Parser<'a> {
expr: Box::new(expr),
})
} else if Token::LBracket == *tok && self.dialect.supports_partiql()
|| (dialect_of!(self is SnowflakeDialect | GenericDialect) && Token::Colon == *tok)
|| (dialect_of!(self is SnowflakeDialect | GenericDialect | DatabricksDialect)
&& Token::Colon == *tok)
{
self.prev_token();
self.parse_json_access(expr)
Expand Down Expand Up @@ -3779,21 +3780,26 @@ impl<'a> Parser<'a> {
})
}

// Parser is either looking at a : or a bracket expression.
fn parse_json_path(&mut self) -> Result<JsonPath, ParserError> {
let mut path = Vec::new();
let mut has_colon = false;
loop {
match self.next_token().token {
Token::Colon if path.is_empty() => {
path.push(self.parse_json_path_object_key()?);
has_colon = true;
if *self.peek_token_ref() == Token::LBracket {
path.push(self.parse_json_path_bracket_element()?);
} else {
path.push(self.parse_json_path_object_key()?);
}
}
Token::Period if !path.is_empty() => {
path.push(self.parse_json_path_object_key()?);
}
Token::LBracket => {
let key = self.parse_expr()?;
self.expect_token(&Token::RBracket)?;

path.push(JsonPathElem::Bracket { key });
self.prev_token();
path.push(self.parse_json_path_bracket_element()?);
}
_ => {
self.prev_token();
Expand All @@ -3803,7 +3809,23 @@ impl<'a> Parser<'a> {
}

debug_assert!(!path.is_empty());
Ok(JsonPath { path })
Ok(JsonPath { has_colon, path })
}

/// Parses one `[...]` segment of a JSON path, consuming both the opening and the
/// closing bracket.
///
/// Yields [`JsonPathElem::AllElements`] for `[*]` when the dialect reports support for
/// it; otherwise the bracket contents are parsed as an expression and returned as
/// [`JsonPathElem::Bracket`].
///
/// # Errors
///
/// Propagates any error returned by `expect_token` or `parse_expr`.
fn parse_json_path_bracket_element(&mut self) -> Result<JsonPathElem, ParserError> {
    self.expect_token(&Token::LBracket)?;

    // `*` only means "all elements" in dialects that opt in.
    let is_wildcard = *self.peek_token_ref() == Token::Mul
        && self.dialect.supports_semi_structured_array_all_elements();

    let elem = if is_wildcard {
        self.expect_token(&Token::Mul)?;
        JsonPathElem::AllElements
    } else {
        JsonPathElem::Bracket {
            key: self.parse_expr()?,
        }
    };

    self.expect_token(&Token::RBracket)?;
    Ok(elem)
}

/// Parses the parens following the `[ NOT ] IN` operator.
Expand Down
97 changes: 97 additions & 0 deletions tests/sqlparser_databricks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -360,3 +360,100 @@ fn data_type_timestamp_ntz() {
s => panic!("Unexpected statement: {:?}", s),
}
}

// Checks that Databricks `:`-prefixed JSON paths parse into the expected `Expr::JsonAccess`
// tree for dot access, bracket access, a bracket directly after the colon, and `[*]`.
#[test]
fn parse_semi_structured_data_traversal() {
// basic case
let sql = "SELECT a:b.c FROM t";
let select = databricks().verified_only_select(sql);
assert_eq!(
SelectItem::UnnamedExpr(Expr::JsonAccess {
value: Box::new(Expr::Identifier(Ident::new("a"))),
path: JsonPath {
has_colon: true,
path: vec![
JsonPathElem::Dot {
key: "b".to_owned(),
quoted: false
},
JsonPathElem::Dot {
key: "c".to_owned(),
quoted: false
}
]
},
}),
select.projection[0]
);

// brackets
let sql = "SELECT a:b['c'][0] FROM t";
let select = databricks().verified_only_select(sql);
assert_eq!(
SelectItem::UnnamedExpr(Expr::JsonAccess {
value: Box::new(Expr::Identifier(Ident::new("a"))),
path: JsonPath {
has_colon: true,
path: vec![
JsonPathElem::Dot {
key: "b".to_owned(),
quoted: false
},
JsonPathElem::Bracket {
key: Expr::value(Value::SingleQuotedString("c".to_owned()))
},
JsonPathElem::Bracket {
key: Expr::value(number("0"))
}
]
},
}),
select.projection[0]
);

// bracket immediately after the colon
let sql = "SELECT a:['b'].c FROM t";
let select = databricks().verified_only_select(sql);
assert_eq!(
SelectItem::UnnamedExpr(Expr::JsonAccess {
value: Box::new(Expr::Identifier(Ident::new("a"))),
path: JsonPath {
has_colon: true,
path: vec![
JsonPathElem::Bracket {
key: Expr::value(Value::SingleQuotedString("b".to_owned())),
},
JsonPathElem::Dot {
key: "c".to_owned(),
quoted: false
}
]
},
}),
select.projection[0]
);

// asterisk for arrays
let sql = "SELECT a:b[*].c FROM t";
let select = databricks().verified_only_select(sql);
assert_eq!(
SelectItem::UnnamedExpr(Expr::JsonAccess {
value: Box::new(Expr::Identifier(Ident::new("a"))),
path: JsonPath {
has_colon: true,
path: vec![
JsonPathElem::Dot {
key: "b".to_owned(),
quoted: false
},
JsonPathElem::AllElements,
JsonPathElem::Dot {
key: "c".to_owned(),
quoted: false
}
]
},
}),
select.projection[0]
);
}
6 changes: 6 additions & 0 deletions tests/sqlparser_redshift.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ fn test_redshift_json_path() {
Ident::new("c_orders")
])),
path: JsonPath {
has_colon: false,
path: vec![
JsonPathElem::Bracket {
key: Expr::value(number("0"))
Expand All @@ -229,6 +230,7 @@ fn test_redshift_json_path() {
Ident::new("c_orders")
])),
path: JsonPath {
has_colon: false,
path: vec![
JsonPathElem::Bracket {
key: Expr::value(number("0"))
Expand All @@ -255,6 +257,7 @@ fn test_redshift_json_path() {
Ident::new("col1")
])),
path: JsonPath {
has_colon: false,
path: vec![
JsonPathElem::Bracket {
key: Expr::value(number("0"))
Expand All @@ -281,6 +284,7 @@ fn test_redshift_json_path() {
Ident::new("col1")
])),
path: JsonPath {
has_colon: false,
path: vec![
JsonPathElem::Bracket {
key: Expr::value(number("0"))
Expand Down Expand Up @@ -308,6 +312,7 @@ fn test_parse_json_path_from() {
assert_eq!(
json_path,
&Some(JsonPath {
has_colon: false,
path: vec![
JsonPathElem::Bracket {
key: Expr::value(number("0"))
Expand All @@ -332,6 +337,7 @@ fn test_parse_json_path_from() {
assert_eq!(
json_path,
&Some(JsonPath {
has_colon: false,
path: vec![
JsonPathElem::Bracket {
key: Expr::value(number("0"))
Expand Down
Loading