Skip to content

Commit 656d907

Browse files
committed
feat: support doris match_* operator --story=121938564
1 parent 6c58e2d commit 656d907

File tree

6 files changed

+184
-0
lines changed

6 files changed

+184
-0
lines changed

src/ast/operator.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,15 @@ pub enum BinaryOperator {
321321
/// `~=` Same as? (PostgreSQL/Redshift geometric operator)
322322
/// See <https://www.postgresql.org/docs/9.5/functions-geometry.html>
323323
TildeEq,
324+
/// Doris `MATCH_*` operators
325+
/// See <https://doris.apache.org/zh-cn/docs/develop/sql-reference/operators/match/>
326+
/// e.g. `a MATCH_* 'keyword1_xxxxxxx'`
327+
MatchAll,
328+
MatchAny,
329+
MatchPhrase,
330+
MatchPhrasePrefix,
331+
MatchRegexp,
332+
MatchPhraseEdge,
324333
}
325334

326335
impl fmt::Display for BinaryOperator {
@@ -394,6 +403,12 @@ impl fmt::Display for BinaryOperator {
394403
BinaryOperator::QuestionDoublePipe => f.write_str("?||"),
395404
BinaryOperator::At => f.write_str("@"),
396405
BinaryOperator::TildeEq => f.write_str("~="),
406+
BinaryOperator::MatchAll => f.write_str("MATCH_ALL"),
407+
BinaryOperator::MatchAny => f.write_str("MATCH_ANY"),
408+
BinaryOperator::MatchPhrase => f.write_str("MATCH_PHRASE"),
409+
BinaryOperator::MatchPhrasePrefix => f.write_str("MATCH_PHRASE_PREFIX"),
410+
BinaryOperator::MatchRegexp => f.write_str("MATCH_REGEXP"),
411+
BinaryOperator::MatchPhraseEdge => f.write_str("MATCH_PHRASE_EDGE"),
397412
}
398413
}
399414
}

src/dialect/mod.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,14 @@ pub trait Dialect: Debug + Any {
580580
Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)),
581581
Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)),
582582
Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)),
583+
// Doris MATCH_* operators: same precedence as LIKE
584+
Token::Word(w) if w.keyword == Keyword::MATCH_ALL => Ok(p!(Like)),
585+
Token::Word(w) if w.keyword == Keyword::MATCH_ANY => Ok(p!(Like)),
586+
Token::Word(w) if w.keyword == Keyword::MATCH_PHRASE => Ok(p!(Like)),
587+
Token::Word(w) if w.keyword == Keyword::MATCH_PHRASE_PREFIX => Ok(p!(Like)),
588+
Token::Word(w) if w.keyword == Keyword::MATCH_REGEXP => Ok(p!(Like)),
589+
Token::Word(w) if w.keyword == Keyword::MATCH_PHRASE_EDGE => Ok(p!(Like)),
590+
// End Doris Match_* Operators
583591
Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(p!(Between)),
584592
Token::Word(w) if w.keyword == Keyword::DIV => Ok(p!(MulDivModOp)),
585593
Token::Period => Ok(p!(Period)),

src/keywords.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,8 +514,14 @@ define_keywords!(
514514
MATCH,
515515
MATCHED,
516516
MATCHES,
517+
MATCH_ALL,
518+
MATCH_ANY,
517519
MATCH_CONDITION,
520+
MATCH_PHRASE,
521+
MATCH_PHRASE_EDGE,
522+
MATCH_PHRASE_PREFIX,
518523
MATCH_RECOGNIZE,
524+
MATCH_REGEXP,
519525
MATERIALIZE,
520526
MATERIALIZED,
521527
MAX,

src/parser/mod.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3227,6 +3227,21 @@ impl<'a> Parser<'a> {
32273227
self.expect_token(&Token::RParen)?;
32283228
Some(BinaryOperator::PGCustomBinaryOperator(idents))
32293229
}
3230+
// Doris match operators
3231+
Keyword::MATCH_ALL => dialect_is!(dialect is MySqlDialect | GenericDialect)
3232+
.then_some(BinaryOperator::MatchAll),
3233+
Keyword::MATCH_ANY => dialect_is!(dialect is MySqlDialect | GenericDialect)
3234+
.then_some(BinaryOperator::MatchAny),
3235+
Keyword::MATCH_PHRASE => dialect_is!(dialect is MySqlDialect | GenericDialect)
3236+
.then_some(BinaryOperator::MatchPhrase),
3237+
Keyword::MATCH_PHRASE_PREFIX => {
3238+
dialect_is!(dialect is MySqlDialect | GenericDialect)
3239+
.then_some(BinaryOperator::MatchPhrasePrefix)
3240+
}
3241+
Keyword::MATCH_PHRASE_EDGE => dialect_is!(dialect is MySqlDialect | GenericDialect)
3242+
.then_some(BinaryOperator::MatchPhraseEdge),
3243+
Keyword::MATCH_REGEXP => dialect_is!(dialect is MySqlDialect | GenericDialect)
3244+
.then_some(BinaryOperator::MatchRegexp),
32303245
_ => None,
32313246
},
32323247
_ => None,
@@ -15423,4 +15438,67 @@ mod tests {
1542315438

1542415439
assert!(Parser::parse_sql(&MySqlDialect {}, sql).is_err());
1542515440
}
15441+
15442+
/// Checks that each sample Doris `MATCH_*` query below is accepted by
/// `Parser::parse_sql` under `MySqlDialect`.
// NOTE(review): none of the cases below uses MATCH_PHRASE_EDGE — consider adding one.
#[test]
15443+
fn test_doris_match_operators() {
15444+
let dialect = &MySqlDialect {};
15445+
15446+
// Sample queries copied from https://doris.apache.org/docs/table-design/index/inverted-index
15447+
let test_cases = [
15448+
// 1.1
15449+
"SELECT * FROM table_name WHERE content MATCH_ANY 'keyword1';",
15450+
// 1.2
15451+
"SELECT * FROM table_name WHERE content MATCH_ANY 'keyword1 keyword2';",
15452+
// 1.3
15453+
"SELECT * FROM table_name WHERE content MATCH_ALL 'keyword1 keyword2';",
15454+
// 2.1
15455+
"SELECT * FROM table_name WHERE content MATCH_PHRASE 'keyword1 keyword2';",
15456+
// 2.2
15457+
"SELECT * FROM table_name WHERE content MATCH_PHRASE 'keyword1 keyword2 ~3';",
15458+
"SELECT * FROM table_name WHERE content MATCH_PHRASE 'keyword1 keyword2 ~3+';",
15459+
// 2.3
15460+
"SELECT * FROM table_name WHERE content MATCH_PHRASE_PREFIX 'keyword1 keyword2';",
15461+
// 2.4
15462+
"SELECT * FROM table_name WHERE content MATCH_PHRASE_PREFIX 'keyword1';",
15463+
// 2.5
15464+
"SELECT * FROM table_name WHERE content MATCH_REGEXP 'key*';",
15465+
];
15466+
15467+
// Only successful parsing is asserted; the resulting AST is not inspected.
for sql in test_cases {
15468+
assert!(Parser::parse_sql(dialect, sql).is_ok());
15469+
}
15470+
}
15471+
15472+
/// Checks that a larger query combining `MATCH_ALL` / `MATCH_PHRASE` with
/// `AND`, `OR`, `=`, `>`, `LIKE`, `BETWEEN ... AND` and parenthesised groups
/// parses without error under `MySqlDialect`.
// NOTE(review): this only asserts `is_ok()`; the grouping of the parsed
// expression is not inspected, so operator precedence itself is not verified.
#[test]
15473+
fn test_doris_match_precedence() {
15474+
let dialect = &MySqlDialect {};
15475+
// SQL mixing MATCH_* with AND, OR, =, LIKE and BETWEEN ... AND operators
15476+
let sql = "SELECT
15477+
id,
15478+
title,
15479+
content,
15480+
score * 2 AS weighted_score
15481+
FROM
15482+
documents
15483+
WHERE
15484+
content MATCH_ALL 'important concept theory'
15485+
OR (
15486+
author = 'Smith' AND content MATCH_ALL 'methodology approach'
15487+
)
15488+
AND (
15489+
(references > 10 AND citations MATCH_ALL 'credible source')
15490+
OR (importance = 'high' AND content MATCH_ALL 'breakthrough discovery')
15491+
)
15492+
AND (
15493+
(keywords LIKE '%analysis%' OR keywords MATCH_PHRASE 'evaluation')
15494+
AND (abstract MATCH_ALL 'systematic review' OR conclusion LIKE '%finding%')
15495+
)
15496+
AND publication_date BETWEEN '2020-01-01' AND '2023-12-31'
15497+
ORDER BY
15498+
weighted_score DESC,
15499+
publication_date DESC
15500+
LIMIT 50";
15501+
15502+
assert!(Parser::parse_sql(dialect, sql).is_ok());
15503+
}
1542615504
}

src/tokenizer.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,18 @@ pub enum Token {
136136
DuckIntDiv,
137137
/// Modulo Operator `%`
138138
Mod,
139+
/// MATCH_ALL
140+
MatchAll,
141+
/// MATCH_ANY
142+
MatchAny,
143+
/// MATCH_PHRASE
144+
MatchPhrase,
145+
/// MATCH_REGEXP
146+
MatchRegexp,
147+
/// MATCH_PHRASE_PREFIX
148+
MatchPhrasePrefix,
149+
/// MATCH_PHRASE_EDGE
150+
MatchPhraseEdge,
139151
/// String concatenation `||`
140152
StringConcat,
141153
/// Left parenthesis `(`
@@ -382,6 +394,12 @@ impl fmt::Display for Token {
382394
Token::QuestionAnd => write!(f, "?&"),
383395
Token::QuestionPipe => write!(f, "?|"),
384396
Token::CustomBinaryOperator(s) => f.write_str(s),
397+
Token::MatchAll => write!(f, "MATCH_ALL"),
398+
Token::MatchAny => write!(f, "MATCH_ANY"),
399+
Token::MatchPhrase => write!(f, "MATCH_PHRASE"),
400+
Token::MatchRegexp => write!(f, "MATCH_REGEXP"),
401+
Token::MatchPhraseEdge => write!(f, "MATCH_PHRASE_EDGE"),
402+
Token::MatchPhrasePrefix => write!(f, "MATCH_PHRASE_PREFIX"),
385403
}
386404
}
387405
}
@@ -3942,4 +3960,42 @@ mod tests {
39423960
],
39433961
);
39443962
}
3963+
3964+
/// Tokenizes one query per Doris `MATCH_*` operator (all six, not only the
/// `MATCH_PHRASE*` ones) under `MySqlDialect` and compares the token stream
/// with the expected sequence, in which the operator appears as a keyword
/// token built with `Token::make_keyword`.
#[test]
3965+
fn test_doris_match_phrase_operator() {
3966+
let dialect = MySqlDialect {};
3967+
3968+
for symbol in [
3969+
"MATCH_ALL",
3970+
"MATCH_ANY",
3971+
"MATCH_PHRASE",
3972+
"MATCH_REGEXP",
3973+
"MATCH_PHRASE_PREFIX",
3974+
"MATCH_PHRASE_EDGE",
3975+
] {
3976+
// Same query for every operator; only the operator keyword varies.
let sql = format!(
3977+
"SELECT * FROM table_name WHERE content {} 'keyword1 keyword2 ~3'",
3978+
symbol
3979+
);
3980+
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3981+
let expected = vec![
3982+
Token::make_keyword("SELECT"),
3983+
Token::Whitespace(Whitespace::Space),
3984+
Token::Mul,
3985+
Token::Whitespace(Whitespace::Space),
3986+
Token::make_keyword("FROM"),
3987+
Token::Whitespace(Whitespace::Space),
3988+
Token::make_word("table_name", None),
3989+
Token::Whitespace(Whitespace::Space),
3990+
Token::make_keyword("WHERE"),
3991+
Token::Whitespace(Whitespace::Space),
3992+
Token::make_word("content", None),
3993+
Token::Whitespace(Whitespace::Space),
3994+
// The operator is expected as a keyword token here.
Token::make_keyword(symbol),
3995+
Token::Whitespace(Whitespace::Space),
3996+
Token::SingleQuotedString("keyword1 keyword2 ~3".into()),
3997+
];
3998+
compare(expected, tokens);
3999+
}
4000+
}
39454001
}

tests/sqlparser_mysql.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3418,3 +3418,24 @@ fn parse_cast_integers() {
34183418
.run_parser_method("CAST(foo AS UNSIGNED INTEGER(3))", |p| p.parse_expr())
34193419
.expect_err("CAST doesn't allow display width");
34203420
}
3421+
3422+
/// Runs `verified_expr` / `verified_stmt` on MySQL-dialect inputs that combine
/// `MATCH_ALL` with `LIKE`, `AND`, `OR` and `BETWEEN`.
// NOTE(review): presumably the `verified_*` helpers check a parse/serialize
// round trip — confirm against the test utilities.
// NOTE(review): only MATCH_ALL is exercised here; the other MATCH_* operators
// are not covered by this test.
#[test]
3423+
fn parse_extend_match_operator() {
3424+
// Expression-level cases: MATCH_ALL chained with LIKE through OR.
mysql().verified_expr("foo LIKE 'apple' OR foo MATCH_ALL 'bar'");
3425+
mysql().verified_expr("foo LIKE 'apple' OR foo MATCH_ALL 'bar' OR foo LIKE 'cherry'");
3426+
mysql().verified_expr("foo MATCH_ALL 'bar' OR foo LIKE 'apple' OR foo LIKE 'cherry'");
3427+
mysql().verified_expr("foo MATCH_ALL 'bar' OR foo LIKE 'apple' OR foo MATCH_ALL 'cherry'");
3428+
mysql().verified_expr("foo MATCH_ALL 'bar' OR foo MATCH_ALL 'apple' OR foo MATCH_ALL 'cherry'");
3429+
3430+
// Statement-level cases: MATCH_ALL inside a WHERE clause.
mysql().verified_stmt("SELECT * FROM table_name WHERE foo MATCH_ALL 'bar'");
3431+
mysql().verified_stmt(
3432+
"SELECT * FROM table_name WHERE foo MATCH_ALL 'bar' AND foo MATCH_ALL 'apple'",
3433+
);
3434+
mysql().verified_stmt(
3435+
"SELECT * FROM table_name WHERE foo MATCH_ALL 'bar' OR foo MATCH_ALL 'apple'",
3436+
);
3437+
mysql().verified_stmt("SELECT * FROM table_name WHERE foo LIKE 'apple' OR foo MATCH_ALL 'bar'");
3438+
mysql().verified_stmt(
3439+
"SELECT * FROM table_name WHERE foo MATCH_ALL 'bar' AND foo MATCH_ALL 'apple' OR a BETWEEN 1 AND 2",
3440+
);
3441+
}

0 commit comments

Comments
 (0)