diff --git a/src/ast/operator.rs b/src/ast/operator.rs index 66a35fee5..aaf3622fc 100644 --- a/src/ast/operator.rs +++ b/src/ast/operator.rs @@ -321,6 +321,15 @@ pub enum BinaryOperator { /// `~=` Same as? (PostgreSQL/Redshift geometric operator) /// See TildeEq, + /// Doris Match operator + /// See <https://doris.apache.org/docs/table-design/index/inverted-index> + /// e.g. `a MATCH_* 'keyword1_xxxxxxx'` + MatchAll, + MatchAny, + MatchPhrase, + MatchPhrasePrefix, + MatchRegexp, + MatchPhraseEdge, } impl fmt::Display for BinaryOperator { @@ -394,6 +403,12 @@ impl fmt::Display for BinaryOperator { BinaryOperator::QuestionDoublePipe => f.write_str("?||"), BinaryOperator::At => f.write_str("@"), BinaryOperator::TildeEq => f.write_str("~="), + BinaryOperator::MatchAll => f.write_str("MATCH_ALL"), + BinaryOperator::MatchAny => f.write_str("MATCH_ANY"), + BinaryOperator::MatchPhrase => f.write_str("MATCH_PHRASE"), + BinaryOperator::MatchPhrasePrefix => f.write_str("MATCH_PHRASE_PREFIX"), + BinaryOperator::MatchRegexp => f.write_str("MATCH_REGEXP"), + BinaryOperator::MatchPhraseEdge => f.write_str("MATCH_PHRASE_EDGE"), } } } diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 1c32bc513..c2cb1982e 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -580,6 +580,14 @@ pub trait Dialect: Debug + Any { Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)), Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)), Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)), + // Define Doris Match_* Operators: the same as LIKE precedence + Token::Word(w) if w.keyword == Keyword::MATCH_ALL => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::MATCH_ANY => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::MATCH_PHRASE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::MATCH_PHRASE_PREFIX => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::MATCH_REGEXP => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::MATCH_PHRASE_EDGE => Ok(p!(Like)), + // End Doris Match_* Operators 
Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(p!(Between)), Token::Word(w) if w.keyword == Keyword::DIV => Ok(p!(MulDivModOp)), Token::Period => Ok(p!(Period)), diff --git a/src/keywords.rs b/src/keywords.rs index a6854f073..8bf098557 100644 --- a/src/keywords.rs +++ b/src/keywords.rs @@ -514,8 +514,14 @@ define_keywords!( MATCH, MATCHED, MATCHES, + MATCH_ALL, + MATCH_ANY, MATCH_CONDITION, + MATCH_PHRASE, + MATCH_PHRASE_EDGE, + MATCH_PHRASE_PREFIX, MATCH_RECOGNIZE, + MATCH_REGEXP, MATERIALIZE, MATERIALIZED, MAX, diff --git a/src/parser/mod.rs b/src/parser/mod.rs index f234fcc07..14eb4dba4 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3227,6 +3227,21 @@ impl<'a> Parser<'a> { self.expect_token(&Token::RParen)?; Some(BinaryOperator::PGCustomBinaryOperator(idents)) } + // Doris match operators + Keyword::MATCH_ALL => dialect_is!(dialect is MySqlDialect | GenericDialect) + .then_some(BinaryOperator::MatchAll), + Keyword::MATCH_ANY => dialect_is!(dialect is MySqlDialect | GenericDialect) + .then_some(BinaryOperator::MatchAny), + Keyword::MATCH_PHRASE => dialect_is!(dialect is MySqlDialect | GenericDialect) + .then_some(BinaryOperator::MatchPhrase), + Keyword::MATCH_PHRASE_PREFIX => { + dialect_is!(dialect is MySqlDialect | GenericDialect) + .then_some(BinaryOperator::MatchPhrasePrefix) + } + Keyword::MATCH_PHRASE_EDGE => dialect_is!(dialect is MySqlDialect | GenericDialect) + .then_some(BinaryOperator::MatchPhraseEdge), + Keyword::MATCH_REGEXP => dialect_is!(dialect is MySqlDialect | GenericDialect) + .then_some(BinaryOperator::MatchRegexp), _ => None, }, _ => None, @@ -15423,4 +15438,67 @@ mod tests { assert!(Parser::parse_sql(&MySqlDialect {}, sql).is_err()); } + + #[test] + fn test_doris_match_operators() { + let dialect = &MySqlDialect {}; + + // Copy from https://doris.apache.org/docs/table-design/index/inverted-index + let test_cases = [ + // 1.1 + "SELECT * FROM table_name WHERE content MATCH_ANY 'keyword1';", + // 1.2 + "SELECT * FROM 
table_name WHERE content MATCH_ANY 'keyword1 keyword2';", + // 1.3 + "SELECT * FROM table_name WHERE content MATCH_ALL 'keyword1 keyword2';", + // 2.1 + "SELECT * FROM table_name WHERE content MATCH_PHRASE 'keyword1 keyword2';", + // 2.2 + "SELECT * FROM table_name WHERE content MATCH_PHRASE 'keyword1 keyword2 ~3';", + "SELECT * FROM table_name WHERE content MATCH_PHRASE 'keyword1 keyword2 ~3+';", + // 2.3 + "SELECT * FROM table_name WHERE content MATCH_PHRASE_PREFIX 'keyword1 keyword2';", + // 2.4 + "SELECT * FROM table_name WHERE content MATCH_PHRASE_PREFIX 'keyword1';", + // 2.5 + "SELECT * FROM table_name WHERE content MATCH_REGEXP 'key*';", + ]; + + for sql in test_cases { + assert!(Parser::parse_sql(dialect, sql).is_ok()); + } + } + + #[test] + fn test_doris_match_precedence() { + let dialect = &MySqlDialect {}; + // Test sql with and, or, equal, like, between ... and operator + let sql = "SELECT + id, + title, + content, + score * 2 AS weighted_score + FROM + documents + WHERE + content MATCH_ALL 'important concept theory' + OR ( + author = 'Smith' AND content MATCH_ALL 'methodology approach' + ) + AND ( + (references > 10 AND citations MATCH_ALL 'credible source') + OR (importance = 'high' AND content MATCH_ALL 'breakthrough discovery') + ) + AND ( + (keywords LIKE '%analysis%' OR keywords MATCH_PHRASE 'evaluation') + AND (abstract MATCH_ALL 'systematic review' OR conclusion LIKE '%finding%') + ) + AND publication_date BETWEEN '2020-01-01' AND '2023-12-31' + ORDER BY + weighted_score DESC, + publication_date DESC + LIMIT 50"; + + assert!(Parser::parse_sql(dialect, sql).is_ok()); + } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index bc0f0efeb..8856c2bc9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -136,6 +136,18 @@ pub enum Token { DuckIntDiv, /// Modulo Operator `%` Mod, + /// MATCH_ALL + MatchAll, + /// MATCH_ANY + MatchAny, + /// MATCH_PHRASE + MatchPhrase, + /// MATCH_REGEXP + MatchRegexp, + /// MATCH_PHRASE_PREFIX + MatchPhrasePrefix, + 
/// MATCH_PHRASE_EDGE + MatchPhraseEdge, /// String concatenation `||` StringConcat, /// Left parenthesis `(` @@ -382,6 +394,12 @@ impl fmt::Display for Token { Token::QuestionAnd => write!(f, "?&"), Token::QuestionPipe => write!(f, "?|"), Token::CustomBinaryOperator(s) => f.write_str(s), + Token::MatchAll => write!(f, "MATCH_ALL"), + Token::MatchAny => write!(f, "MATCH_ANY"), + Token::MatchPhrase => write!(f, "MATCH_PHRASE"), + Token::MatchRegexp => write!(f, "MATCH_REGEXP"), + Token::MatchPhraseEdge => write!(f, "MATCH_PHRASE_EDGE"), + Token::MatchPhrasePrefix => write!(f, "MATCH_PHRASE_PREFIX"), } } } @@ -3942,4 +3960,42 @@ mod tests { ], ); } + + #[test] + fn test_doris_match_phrase_operator() { + let dialect = MySqlDialect {}; + + for symbol in [ + "MATCH_ALL", + "MATCH_ANY", + "MATCH_PHRASE", + "MATCH_REGEXP", + "MATCH_PHRASE_PREFIX", + "MATCH_PHRASE_EDGE", + ] { + let sql = format!( + "SELECT * FROM table_name WHERE content {} 'keyword1 keyword2 ~3'", + symbol + ); + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + let expected = vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Mul, + Token::Whitespace(Whitespace::Space), + Token::make_keyword("FROM"), + Token::Whitespace(Whitespace::Space), + Token::make_word("table_name", None), + Token::Whitespace(Whitespace::Space), + Token::make_keyword("WHERE"), + Token::Whitespace(Whitespace::Space), + Token::make_word("content", None), + Token::Whitespace(Whitespace::Space), + Token::make_keyword(symbol), + Token::Whitespace(Whitespace::Space), + Token::SingleQuotedString("keyword1 keyword2 ~3".into()), + ]; + compare(expected, tokens); + } + } } diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs index 15f79b4c2..f1f41d593 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -3418,3 +3418,24 @@ fn parse_cast_integers() { .run_parser_method("CAST(foo AS UNSIGNED INTEGER(3))", |p| p.parse_expr()) .expect_err("CAST doesn't allow 
display width"); } + +#[test] +fn parse_extend_match_operator() { + mysql().verified_expr("foo LIKE 'apple' OR foo MATCH_ALL 'bar'"); + mysql().verified_expr("foo LIKE 'apple' OR foo MATCH_ALL 'bar' OR foo LIKE 'cherry'"); + mysql().verified_expr("foo MATCH_ALL 'bar' OR foo LIKE 'apple' OR foo LIKE 'cherry'"); + mysql().verified_expr("foo MATCH_ALL 'bar' OR foo LIKE 'apple' OR foo MATCH_ALL 'cherry'"); + mysql().verified_expr("foo MATCH_ALL 'bar' OR foo MATCH_ALL 'apple' OR foo MATCH_ALL 'cherry'"); + + mysql().verified_stmt("SELECT * FROM table_name WHERE foo MATCH_ALL 'bar'"); + mysql().verified_stmt( + "SELECT * FROM table_name WHERE foo MATCH_ALL 'bar' AND foo MATCH_ALL 'apple'", + ); + mysql().verified_stmt( + "SELECT * FROM table_name WHERE foo MATCH_ALL 'bar' OR foo MATCH_ALL 'apple'", + ); + mysql().verified_stmt("SELECT * FROM table_name WHERE foo LIKE 'apple' OR foo MATCH_ALL 'bar'"); + mysql().verified_stmt( + "SELECT * FROM table_name WHERE foo MATCH_ALL 'bar' AND foo MATCH_ALL 'apple' OR a BETWEEN 1 AND 2", + ); +}