Skip to content

Fixing the string tokenization #199

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 29, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 38 additions & 59 deletions src/PHPCR/Util/QOM/Sql2Scanner.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,6 @@ class Sql2Scanner
*/
protected $tokens;

/**
* Delimiters between tokens.
*
* @var array
*/
protected $delimiters;

/**
* Parsing position in the SQL string.
*
Expand Down Expand Up @@ -68,16 +61,6 @@ public function lookupNextToken($offset = 0)
return '';
}

/**
* Get the delimiter that separated the two previous tokens.
*
* @return string
*/
public function getPreviousDelimiter()
{
return isset($this->delimiters[$this->curpos - 1]) ? $this->delimiters[$this->curpos - 1] : ' ';
}

/**
* Get the next token and remove it from the queue.
* Return an empty string when there are no more tokens.
Expand Down Expand Up @@ -116,12 +99,12 @@ public function expectToken($token, $case_insensitive = true)
* Expect the next tokens to be the one given in the array of tokens and
* throws an exception if it's not the case.
*
* @see expectToken
*
* @param array $tokens
* @param bool $case_insensitive
*
* @throws InvalidQueryException
*
* @see expectToken
*/
public function expectTokens($tokens, $case_insensitive = true)
{
Expand Down Expand Up @@ -151,7 +134,7 @@ public function tokenIs($token, $value, $case_insensitive = true)
}

/**
* Scan a SQL2 string a extract the tokens.
* Scan a SQL2 string and extract the tokens.
*
* @param string $sql2
*
Expand All @@ -160,49 +143,45 @@ public function tokenIs($token, $value, $case_insensitive = true)
protected function scan($sql2)
{
$tokens = [];
$token = strtok($sql2, " \n\t");
while ($token !== false) {
$this->tokenize($tokens, $token);
$token = strtok(" \n\t");
}

$regexpTokens = [];
foreach ($tokens as $token) {
$regexpTokens[] = preg_quote($token, '/');
}

$regexp = '/^'.implode('([ \t\n]*)', $regexpTokens).'$/';
preg_match($regexp, $sql2, $this->delimiters);
$this->delimiters[0] = '';

return $tokens;
}

/**
* Tokenize a string returned by strtok to split the string at '.', ',', '(', '='
* and ')' characters.
*
* @param array $tokens
* @param string $token
*/
protected function tokenize(&$tokens, $token)
{
$buffer = '';
for ($i = 0; $i < strlen($token); $i++) {
$char = trim(substr($token, $i, 1));
if (in_array($char, ['.', ',', '(', ')', '='])) {
if ($buffer !== '') {
$tokens[] = $buffer;
$buffer = '';
$currentToken = '';
$tokenEndChars = ['.', ',', '(', ')', '='];
$isString = false;
foreach (\str_split($sql2) as $character) {
if (!$isString && in_array($character, [' ', "\t", "\n"], true)) {
if ($currentToken !== '') {
$tokens[] = $currentToken;
}
$tokens[] = $char;
} else {
$buffer .= $char;
$currentToken = '';
continue;
}
if (!$isString && in_array($character, $tokenEndChars, true)) {
if ($currentToken !== '') {
$tokens[] = $currentToken;
}
$tokens[] = $character;
$currentToken = '';
continue;
}
$currentToken .= $character;
if (in_array($character, ['"', "'"], true)) {
if ($isString) {
// reached the end of the string
$isString = false;
$tokens[] = $currentToken;
$currentToken = '';
} else {
$isString = true;
}
}
}
if ($currentToken !== '') {
$tokens[] = $currentToken;
}

if ($buffer !== '') {
$tokens[] = $buffer;
if ($isString) {
throw new InvalidQueryException("Syntax error: unterminated quoted string $currentToken in '$sql2'");
}

return $tokens;
}
}
45 changes: 8 additions & 37 deletions src/PHPCR/Util/QOM/Sql2ToQomQueryConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -756,27 +756,13 @@ protected function parseCastLiteral($token)
$this->scanner->expectToken('(');
$token = $this->scanner->fetchNextToken();

$quoteString = false;
if (substr($token, 0, 1) === '\'') {
$quoteString = "'";
} elseif (substr($token, 0, 1) === '"') {
$quoteString = '"';
}
$quoteString = in_array($token[0], ['\'', '"'], true);

if ($quoteString) {
while (substr($token, -1) !== $quoteString) {
$nextToken = $this->scanner->fetchNextToken();
if ('' === $nextToken) {
break;
}
$token .= $nextToken;
}

if (substr($token, -1) !== $quoteString) {
throw new InvalidQueryException("Syntax error: unterminated quoted string '$token' in '{$this->sql2}'");
}
$quotesUsed = $token[0];
$token = substr($token, 1, -1);
$token = str_replace('\\'.$quoteString, $quoteString, $token);
// Un-escaping quotes
$token = str_replace('\\'.$quotesUsed, $quotesUsed, $token);
}

$this->scanner->expectToken('AS');
Expand Down Expand Up @@ -813,28 +799,13 @@ protected function parseLiteralValue()
return $this->parseCastLiteral($token);
}

$quoteString = false;
if (substr($token, 0, 1) === '\'') {
$quoteString = "'";
} elseif (substr($token, 0, 1) === '"') {
$quoteString = '"';
}
$quoteString = in_array($token[0], ['"', "'"], true);

if ($quoteString) {
while (substr($token, -1) !== $quoteString) {
$nextToken = $this->scanner->fetchNextToken();
if ('' === $nextToken) {
break;
}
$token .= $this->scanner->getPreviousDelimiter();
$token .= $nextToken;
}

if (substr($token, -1) !== $quoteString) {
throw new InvalidQueryException("Syntax error: unterminated quoted string $token in '{$this->sql2}'");
}
$quotesUsed = $token[0];
$token = substr($token, 1, -1);
$token = str_replace('\\'.$quoteString, $quoteString, $token);
// Unescape quotes
$token = str_replace('\\'.$quotesUsed, $quotesUsed, $token);
$token = str_replace("''", "'", $token);
if (preg_match('/^\d{4}-\d{2}-\d{2}( \d{2}:\d{2}:\d+)?$/', $token)) {
if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $token)) {
Expand Down
62 changes: 51 additions & 11 deletions tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace PHPCR\Tests\Util\QOM;

use PHPCR\Query\InvalidQueryException;
use PHPCR\Util\QOM\Sql2Scanner;
use PHPUnit\Framework\TestCase;

Expand All @@ -24,24 +25,63 @@ public function testToken()
while ($token = $scanner->fetchNextToken()) {
$this->assertEquals(array_shift($expected), $token);
}
$this->assertCount(0, $expected);
}

public function testDelimiter()
public function testStringTokenization()
{
$scanner = new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page');
$scanner = new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello world"');
$expected = [
'SELECT',
'page',
'.',
'*',
'FROM',
'[nt:unstructured]',
'AS',
'page',
'WHERE',
'name',
'=',
'"Hello world"',
];

while ($token = $scanner->fetchNextToken()) {
$this->assertEquals(array_shift($expected), $token);
}
$this->assertCount(0, $expected);
}

public function testStringTokenizationWithNewLines()
{
$scanner = new Sql2Scanner(<<<'SQL'
SELECT page.*
FROM [nt:unstructured] AS page WHERE name ="Hello world"
SQL);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems to not work with PHP 7.1

which made me realize we should also build with 7.2 and 7.3 - i added that to master.

while i don't mind dropping old PHP versions for good reasons, i would prefer here to use the old nowdocs syntax for this test.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean removing the quotes around the SQL in the HEREdoc string? I did that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reading https://www.php.net/manual/en/language.types.string.php#language.types.string.syntax.nowdoc i think the problem is the )

i think you best define the query as a string like in the test above. SQL; must stand on its own line without any other code and no indention.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be fixed.

$expected = [
'',
' ',
'',
'',
' ',
' ',
' ',
' ',
'SELECT',
'page',
'.',
'*',
'FROM',
'[nt:unstructured]',
'AS',
'page',
'WHERE',
'name',
'=',
'"Hello world"',
];

while ($token = $scanner->fetchNextToken()) {
$this->assertEquals(array_shift($expected), $scanner->getPreviousDelimiter());
$this->assertEquals(array_shift($expected), $token);
}
$this->assertCount(0, $expected);
}

public function testThrowingErrorOnUnclosedString()
{
$this->expectException(InvalidQueryException::class);
new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello ');
}
}