Skip to content

Commit b814637

Browse files
authored
Fixing the string tokenization (#199)
* Fixing the string tokenization * Adding more test cases and fixing a bug in the parser: the parser can now understand escaped characters in strings
1 parent 689aef1 commit b814637

File tree

3 files changed

+184
-105
lines changed

3 files changed

+184
-105
lines changed

src/PHPCR/Util/QOM/Sql2Scanner.php

Lines changed: 60 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,6 @@ class Sql2Scanner
2626
*/
2727
protected $tokens;
2828

29-
/**
30-
* Delimiters between tokens.
31-
*
32-
* @var array
33-
*/
34-
protected $delimiters;
35-
3629
/**
3730
* Parsing position in the SQL string.
3831
*
@@ -68,16 +61,6 @@ public function lookupNextToken($offset = 0)
6861
return '';
6962
}
7063

71-
/**
72-
* Get the delimiter that separated the two previous tokens.
73-
*
74-
* @return string
75-
*/
76-
public function getPreviousDelimiter()
77-
{
78-
return isset($this->delimiters[$this->curpos - 1]) ? $this->delimiters[$this->curpos - 1] : ' ';
79-
}
80-
8164
/**
8265
* Get the next token and remove it from the queue.
8366
* Return an empty string when there are no more tokens.
@@ -116,12 +99,12 @@ public function expectToken($token, $case_insensitive = true)
11699
* Expect the next tokens to be the one given in the array of tokens and
117100
* throws an exception if it's not the case.
118101
*
119-
* @see expectToken
120-
*
121102
* @param array $tokens
122103
* @param bool $case_insensitive
123104
*
124105
* @throws InvalidQueryException
106+
*
107+
* @see expectToken
125108
*/
126109
public function expectTokens($tokens, $case_insensitive = true)
127110
{
@@ -151,7 +134,7 @@ public function tokenIs($token, $value, $case_insensitive = true)
151134
}
152135

153136
/**
154-
* Scan a SQL2 string a extract the tokens.
137+
* Scan a SQL2 string and extract the tokens.
155138
*
156139
* @param string $sql2
157140
*
@@ -160,49 +143,72 @@ public function tokenIs($token, $value, $case_insensitive = true)
160143
protected function scan($sql2)
161144
{
162145
$tokens = [];
163-
$token = strtok($sql2, " \n\t");
164-
while ($token !== false) {
165-
$this->tokenize($tokens, $token);
166-
$token = strtok(" \n\t");
146+
$currentToken = '';
147+
$tokenEndChars = ['.', ',', '(', ')', '='];
148+
149+
$stringStartCharacter = false;
150+
$isEscaped = false;
151+
$escapedQuotesCount = 0;
152+
foreach (\str_split($sql2) as $index => $character) {
153+
if (!$stringStartCharacter && in_array($character, [' ', "\t", "\n"], true)) {
154+
if ($currentToken !== '') {
155+
$tokens[] = $currentToken;
156+
}
157+
$currentToken = '';
158+
continue;
159+
}
160+
if (!$stringStartCharacter && in_array($character, $tokenEndChars, true)) {
161+
if ($currentToken !== '') {
162+
$tokens[] = $currentToken;
163+
}
164+
$tokens[] = $character;
165+
$currentToken = '';
166+
continue;
167+
}
168+
$currentToken .= $character;
169+
170+
if (!$isEscaped && in_array($character, ['"', "'"], true)) {
171+
// Check whether the next character is also a ' to handle SQL-style doubled quotes ('')
172+
// This does not check whether the number of quotes is even
173+
$nextCharacter = $this->getCharacterAtIndex($sql2, $index + 1);
174+
if ($character === "'" && $nextCharacter === "'") {
175+
$isEscaped = true;
176+
$escapedQuotesCount++;
177+
continue;
178+
}
179+
// The escaped quotes are not paired up, e.g. "I'''m cool" would be a parsing error
180+
if ($escapedQuotesCount % 2 == 1 && $stringStartCharacter !== "'") {
181+
throw new InvalidQueryException("Syntax error: Number of single quotes to be even: $currentToken");
182+
}
183+
if ($character === $stringStartCharacter) {
184+
// reached the end of the string
185+
$stringStartCharacter = false;
186+
$tokens[] = $currentToken;
187+
$currentToken = '';
188+
} elseif (!$stringStartCharacter) {
189+
// If there is no start character already we have found the beginning of a new string
190+
$stringStartCharacter = $character;
191+
}
192+
}
193+
$isEscaped = $character === '\\';
167194
}
168-
169-
$regexpTokens = [];
170-
foreach ($tokens as $token) {
171-
$regexpTokens[] = preg_quote($token, '/');
195+
if ($currentToken !== '') {
196+
$tokens[] = $currentToken;
172197
}
173198

174-
$regexp = '/^'.implode('([ \t\n]*)', $regexpTokens).'$/';
175-
preg_match($regexp, $sql2, $this->delimiters);
176-
$this->delimiters[0] = '';
199+
if ($stringStartCharacter) {
200+
throw new InvalidQueryException("Syntax error: unterminated quoted string $currentToken in '$sql2'");
201+
}
177202

178203
return $tokens;
179204
}
180205

181-
/**
182-
* Tokenize a string returned by strtok to split the string at '.', ',', '(', '='
183-
* and ')' characters.
184-
*
185-
* @param array $tokens
186-
* @param string $token
187-
*/
188-
protected function tokenize(&$tokens, $token)
206+
private function getCharacterAtIndex($string, $index)
189207
{
190-
$buffer = '';
191-
for ($i = 0; $i < strlen($token); $i++) {
192-
$char = trim(substr($token, $i, 1));
193-
if (in_array($char, ['.', ',', '(', ')', '='])) {
194-
if ($buffer !== '') {
195-
$tokens[] = $buffer;
196-
$buffer = '';
197-
}
198-
$tokens[] = $char;
199-
} else {
200-
$buffer .= $char;
201-
}
208+
if ($index < strlen($string)) {
209+
return $string[$index];
202210
}
203211

204-
if ($buffer !== '') {
205-
$tokens[] = $buffer;
206-
}
212+
return '';
207213
}
208214
}

src/PHPCR/Util/QOM/Sql2ToQomQueryConverter.php

Lines changed: 8 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -756,27 +756,13 @@ protected function parseCastLiteral($token)
756756
$this->scanner->expectToken('(');
757757
$token = $this->scanner->fetchNextToken();
758758

759-
$quoteString = false;
760-
if (substr($token, 0, 1) === '\'') {
761-
$quoteString = "'";
762-
} elseif (substr($token, 0, 1) === '"') {
763-
$quoteString = '"';
764-
}
759+
$quoteString = in_array($token[0], ['\'', '"'], true);
765760

766761
if ($quoteString) {
767-
while (substr($token, -1) !== $quoteString) {
768-
$nextToken = $this->scanner->fetchNextToken();
769-
if ('' === $nextToken) {
770-
break;
771-
}
772-
$token .= $nextToken;
773-
}
774-
775-
if (substr($token, -1) !== $quoteString) {
776-
throw new InvalidQueryException("Syntax error: unterminated quoted string '$token' in '{$this->sql2}'");
777-
}
762+
$quotesUsed = $token[0];
778763
$token = substr($token, 1, -1);
779-
$token = str_replace('\\'.$quoteString, $quoteString, $token);
764+
// Un-escaping quotes
765+
$token = str_replace('\\'.$quotesUsed, $quotesUsed, $token);
780766
}
781767

782768
$this->scanner->expectToken('AS');
@@ -813,28 +799,13 @@ protected function parseLiteralValue()
813799
return $this->parseCastLiteral($token);
814800
}
815801

816-
$quoteString = false;
817-
if (substr($token, 0, 1) === '\'') {
818-
$quoteString = "'";
819-
} elseif (substr($token, 0, 1) === '"') {
820-
$quoteString = '"';
821-
}
802+
$quoteString = in_array($token[0], ['"', "'"], true);
822803

823804
if ($quoteString) {
824-
while (substr($token, -1) !== $quoteString) {
825-
$nextToken = $this->scanner->fetchNextToken();
826-
if ('' === $nextToken) {
827-
break;
828-
}
829-
$token .= $this->scanner->getPreviousDelimiter();
830-
$token .= $nextToken;
831-
}
832-
833-
if (substr($token, -1) !== $quoteString) {
834-
throw new InvalidQueryException("Syntax error: unterminated quoted string $token in '{$this->sql2}'");
835-
}
805+
$quotesUsed = $token[0];
836806
$token = substr($token, 1, -1);
837-
$token = str_replace('\\'.$quoteString, $quoteString, $token);
807+
// Unescape quotes
808+
$token = str_replace('\\'.$quotesUsed, $quotesUsed, $token);
838809
$token = str_replace("''", "'", $token);
839810
if (preg_match('/^\d{4}-\d{2}-\d{2}( \d{2}:\d{2}:\d+)?$/', $token)) {
840811
if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $token)) {

tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php

Lines changed: 116 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
namespace PHPCR\Tests\Util\QOM;
44

5+
use PHPCR\Query\InvalidQueryException;
56
use PHPCR\Util\QOM\Sql2Scanner;
67
use PHPUnit\Framework\TestCase;
78

@@ -21,27 +22,128 @@ public function testToken()
2122
'page',
2223
];
2324

24-
while ($token = $scanner->fetchNextToken()) {
25-
$this->assertEquals(array_shift($expected), $token);
26-
}
25+
$this->expectTokensFromScanner($scanner, $expected);
2726
}
2827

29-
public function testDelimiter()
28+
/**
29+
* @dataProvider dataTestStringTokenization
30+
*/
31+
public function testStringTokenization()
3032
{
31-
$scanner = new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page');
33+
$scanner = new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello world"');
3234
$expected = [
33-
'',
34-
' ',
35-
'',
36-
'',
37-
' ',
38-
' ',
39-
' ',
40-
' ',
35+
'SELECT',
36+
'page',
37+
'.',
38+
'*',
39+
'FROM',
40+
'[nt:unstructured]',
41+
'AS',
42+
'page',
43+
'WHERE',
44+
'name',
45+
'=',
46+
'"Hello world"',
4147
];
4248

49+
$this->expectTokensFromScanner($scanner, $expected);
50+
}
51+
52+
public function dataTestStringTokenization()
53+
{
54+
$multilineQuery = <<<'SQL'
55+
SELECT page.*
56+
FROM [nt:unstructured] AS page
57+
WHERE name ="Hello world"
58+
SQL;
59+
60+
return [
61+
'single line query' => ['SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello world"'],
62+
'multi line query' => [$multilineQuery],
63+
];
64+
}
65+
66+
public function testEscapingStrings()
67+
{
68+
$sql = <<<SQL
69+
SELECT page.* FROM [nt:unstructured] AS page WHERE page.quotes = "\"'"
70+
SQL;
71+
$scanner = new Sql2Scanner($sql);
72+
$expected = [
73+
'SELECT',
74+
'page',
75+
'.',
76+
'*',
77+
'FROM',
78+
'[nt:unstructured]',
79+
'AS',
80+
'page',
81+
'WHERE',
82+
'page',
83+
'.',
84+
'quotes',
85+
'=',
86+
'"\"\'"',
87+
];
88+
89+
$this->expectTokensFromScanner($scanner, $expected);
90+
}
91+
92+
public function testSQLEscapedStrings()
93+
{
94+
$sql = "WHERE page.name = 'Hello, it''s me.'";
95+
96+
$scanner = new Sql2Scanner($sql);
97+
$expected = [
98+
'WHERE',
99+
'page',
100+
'.',
101+
'name',
102+
'=',
103+
"'Hello, it''s me.'",
104+
];
105+
106+
$this->expectTokensFromScanner($scanner, $expected);
107+
}
108+
109+
public function testSQLEscapedStrings2()
110+
{
111+
$sql = "WHERE page.name = 'Hello, it''' AND";
112+
113+
$scanner = new Sql2Scanner($sql);
114+
$expected = [
115+
'WHERE',
116+
'page',
117+
'.',
118+
'name',
119+
'=',
120+
"'Hello, it'''",
121+
'AND',
122+
];
123+
124+
$this->expectTokensFromScanner($scanner, $expected);
125+
}
126+
127+
public function testThrowingErrorOnUnclosedString()
128+
{
129+
$this->expectException(InvalidQueryException::class);
130+
new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello ');
131+
}
132+
133+
/**
134+
* Function to assert that the tokens the scanner finds match the expected output
135+
* and the entire expected output is consumed.
136+
*
137+
* @param Sql2Scanner $scanner
138+
* @param array<string> $expected
139+
*/
140+
private function expectTokensFromScanner(Sql2Scanner $scanner, array $expected)
141+
{
142+
$actualTokens = [];
43143
while ($token = $scanner->fetchNextToken()) {
44-
$this->assertEquals(array_shift($expected), $scanner->getPreviousDelimiter());
144+
$actualTokens[] = $token;
45145
}
146+
147+
$this->assertEquals($expected, $actualTokens);
46148
}
47149
}

0 commit comments

Comments
 (0)