Skip to content

Commit 006ebcf

Browse files
authored
feat(jmespath): add lexer component (#2214)
* feat(jmespath): add lexer component * refactor: reduce cognitive complexity
1 parent 76c4cfd commit 006ebcf

File tree

1 file changed

+368
-0
lines changed

1 file changed

+368
-0
lines changed

packages/jmespath/src/Lexer.ts

Lines changed: 368 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,368 @@
1+
import {
2+
SIMPLE_TOKENS,
3+
START_IDENTIFIER,
4+
VALID_IDENTIFIER,
5+
VALID_NUMBER,
6+
WHITESPACE,
7+
} from './constants.js';
8+
import { EmptyExpressionError, LexerError } from './errors.js';
9+
import type { Token } from './types.js';
10+
11+
/**
 * A lexer for JMESPath expressions.
 *
 * The lexer walks an expression one Unicode code point at a time and turns it
 * into a stream of tokens that always ends with an `eof` token.
 *
 * Every token carries a `start` (inclusive) and `end` (exclusive) offset. Both
 * are indexes into the code-point array of the expression, so a character
 * outside the Basic Multilingual Plane counts as a single position.
 *
 * @example
 * ```ts
 * const lexer = new Lexer();
 * for (const token of lexer.tokenize('foo.bar')) {
 *   // { type: 'unquoted_identifier', value: 'foo', start: 0, end: 3 } ...
 * }
 * ```
 */
class Lexer {
  /** Index into `#chars` of the character currently being examined. */
  #position!: number;
  /** The expression split into Unicode code points. */
  #chars!: string[];
  /** The character at `#position`, or `''` once the input is exhausted. */
  #current!: string;
  /** Number of code points in the expression. */
  #length!: number;

  /**
   * Tokenize a JMESPath expression.
   *
   * This method is a generator that lazily yields one token at a time and
   * finishes with an `eof` token. The cursor lives on the instance, so a
   * generator must be fully consumed (or abandoned) before `tokenize` is
   * called again on the same instance.
   *
   * @param expression The JMESPath expression to tokenize.
   * @throws {EmptyExpressionError} When `expression` is not a string.
   * @throws {LexerError} When an unknown or malformed token is encountered.
   */
  public *tokenize(expression: string): Generator<Token> {
    this.#initializeForExpression(expression);
    while (this.#current !== '') {
      const simpleTokenType = SIMPLE_TOKENS.get(this.#current);
      if (simpleTokenType !== undefined) {
        yield {
          type: simpleTokenType,
          value: this.#current,
          start: this.#position,
          end: this.#position + 1,
        };

        this.#next();
      } else if (START_IDENTIFIER.has(this.#current)) {
        yield this.#consumeIdentifier();
      } else if (WHITESPACE.has(this.#current)) {
        // Whitespace only separates tokens, it never produces one.
        this.#next();
      } else if (this.#current === '[') {
        yield this.#consumeSquareBracket();
      } else if (this.#current === `'`) {
        yield this.#consumeRawStringLiteral();
      } else if (this.#current === '`') {
        yield this.#consumeLiteral();
      } else if (VALID_NUMBER.has(this.#current)) {
        const start = this.#position;
        const buff = this.#consumeNumber();
        yield {
          type: 'number',
          value: Number.parseInt(buff, 10),
          start,
          end: start + buff.length,
        };
      } else if (this.#current === '-') {
        yield this.#consumeNegativeNumber();
      } else if (this.#current === '"') {
        yield this.#consumeQuotedIdentifier();
      } else if (['<', '>', '!', '=', '|', '&'].includes(this.#current)) {
        yield this.#consumeComparatorSigns(
          this.#current as '<' | '>' | '!' | '=' | '|' | '&'
        );
      } else {
        throw new LexerError(this.#position, this.#current);
      }
    }
    yield { type: 'eof', value: '', start: this.#length, end: this.#length };
  }

  /**
   * Consume a comparator or logical sign.
   *
   * Dispatches on the current character to produce either the one-character
   * token or, when followed by the matching second character, the
   * two-character token (e.g. `<` vs `<=`, `|` vs `||`).
   *
   * @param current The current character
   * @throws {LexerError} When a single `=` is not followed by another `=`.
   */
  #consumeComparatorSigns(
    current: '<' | '>' | '!' | '=' | '|' | '&'
  ): Token {
    switch (current) {
      case '<':
        return this.#matchOrElse('=', 'lte', 'lt');
      case '>':
        return this.#matchOrElse('=', 'gte', 'gt');
      case '!':
        return this.#matchOrElse('=', 'ne', 'not');
      case '|':
        return this.#matchOrElse('|', 'or', 'pipe');
      case '&':
        return this.#matchOrElse('&', 'and', 'expref');
      default:
        // Only '=' is left: it is valid solely as part of '=='.
        return this.#consumeEqualSign();
    }
  }

  /**
   * Consume an equal sign.
   *
   * A single `=` is not a valid token, so the next character must also be an
   * equal sign; the pair is returned as an `eq` token.
   *
   * @throws {LexerError} When the `=` is not immediately followed by `=`.
   */
  #consumeEqualSign(): Token {
    // Remember where the token begins before the cursor moves: `#position`
    // does not advance past the last character, so it cannot be used to
    // reconstruct the start reliably afterwards.
    const start = this.#position;
    if (this.#next() !== '=') {
      throw new LexerError(start, '=');
    }
    this.#next();

    return {
      type: 'eq',
      value: '==',
      start,
      end: start + 2,
    };
  }

  /**
   * Consume an unquoted identifier.
   *
   * Called when the current character can start an identifier. Advances the
   * lexer while the characters are valid identifier characters and returns
   * the accumulated text as an `unquoted_identifier` token.
   */
  #consumeIdentifier(): Token {
    const start = this.#position;
    let buff = this.#current;
    while (VALID_IDENTIFIER.has(this.#next())) {
      buff += this.#current;
    }

    return {
      type: 'unquoted_identifier',
      value: buff,
      start,
      end: start + buff.length,
    };
  }

  /**
   * Consume a negative number.
   *
   * Called when the current character is a minus sign. The sign must be
   * followed by at least one digit.
   *
   * @throws {LexerError} When the minus sign is not followed by a digit.
   */
  #consumeNegativeNumber(): Token {
    const start = this.#position;
    // The buffer starts with the '-' itself, so a length of 1 means no digits.
    const buff = this.#consumeNumber();
    if (buff.length <= 1) {
      throw new LexerError(start, 'Unknown token after "-"');
    }

    return {
      type: 'number',
      value: Number.parseInt(buff, 10),
      start,
      end: start + buff.length,
    };
  }

  /**
   * Consume the raw text of a number.
   *
   * Starts with the current character (a digit or a leading `-`) and keeps
   * appending characters for as long as they are valid number characters.
   *
   * @returns The raw, unparsed text of the number.
   */
  #consumeNumber(): string {
    let buff = this.#current;
    while (VALID_NUMBER.has(this.#next())) {
      buff += this.#current;
    }

    return buff;
  }

  /**
   * Consume an opening square bracket.
   *
   * Returns a `flatten` token for `[]`, a `filter` token for `[?`, and a
   * plain `lbracket` token otherwise.
   */
  #consumeSquareBracket(): Token {
    const start = this.#position;
    const nextChar = this.#next();
    if (nextChar === ']') {
      this.#next();

      return { type: 'flatten', value: '[]', start, end: start + 2 };
    }
    if (nextChar === '?') {
      this.#next();

      return { type: 'filter', value: '[?', start, end: start + 2 };
    }

    return { type: 'lbracket', value: '[', start, end: start + 1 };
  }

  /**
   * Initializes the lexer for the given expression.
   *
   * We use a separate method for this instead of the constructor
   * because we want to be able to reuse the same lexer instance
   * and also because we want to be able to expose a public API
   * for tokenizing expressions like `new Lexer().tokenize(expression)`.
   *
   * @param expression The JMESPath expression to tokenize.
   * @throws {EmptyExpressionError} When `expression` is not a string.
   */
  #initializeForExpression(expression: string): void {
    if (typeof expression !== 'string') {
      throw new EmptyExpressionError();
    }

    this.#position = 0;
    // Split by code point so surrogate pairs are never cut in half.
    this.#chars = Array.from(expression);
    // Measure in the same unit `#position` uses; `expression.length` counts
    // UTF-16 code units and disagrees with `#chars` for astral characters.
    this.#length = this.#chars.length;
    // An empty expression has no first character: start out exhausted.
    this.#current = this.#chars[0] ?? '';
  }

  /**
   * Advance the lexer to the next character in the expression.
   *
   * Once the last character has been consumed `#current` becomes `''` and
   * `#position` stays on the last index.
   *
   * @returns The new current character, or `''` at the end of the input.
   */
  #next(): string {
    if (this.#position < this.#length - 1) {
      this.#position += 1;
      this.#current = this.#chars[this.#position];
    } else {
      this.#current = '';
    }

    return this.#current;
  }

  /**
   * Offset just past the last consumed character.
   *
   * `#position` is sticky at the end of the input, so when the input is
   * exhausted the end offset is the length of the expression rather than
   * `#position`.
   */
  #consumedEnd(): number {
    return this.#current === '' ? this.#length : this.#position;
  }

  /**
   * Consume until the given delimiter is reached allowing
   * for escaping of the delimiter with a backslash (`\`).
   *
   * The opening delimiter (the current character) and the closing delimiter
   * are both skipped; backslashes are kept in the returned text so callers
   * decide how escapes are interpreted.
   *
   * @param delimiter The delimiter to consume until.
   * @returns The text between the delimiters, escapes included.
   * @throws {LexerError} When the input ends before the closing delimiter.
   */
  #consumeUntil(delimiter: string): string {
    const start = this.#position;
    let buff = '';
    this.#next();
    while (this.#current !== delimiter) {
      if (this.#current === '\\') {
        // Keep the backslash and take the escaped character verbatim, so an
        // escaped delimiter does not terminate the loop.
        buff += '\\';
        this.#next();
      }
      if (this.#current === '') {
        // We've reached the end of the expression (EOF) before
        // we found the delimiter. This is an error.
        throw new LexerError(start, this.#chars.slice(start).join(''));
      }
      buff += this.#current;
      this.#next();
    }
    // Skip the closing delimiter
    this.#next();

    return buff;
  }

  /**
   * Process a literal.
   *
   * A literal is a JSON value that is enclosed in backticks. Backticks inside
   * the value must be escaped with a backslash.
   *
   * @throws {LexerError} When the literal is unterminated or is not valid JSON.
   */
  #consumeLiteral(): Token {
    const start = this.#position;
    // Unescape every escaped backtick; a string pattern passed to `replace`
    // would only touch the first occurrence.
    const lexeme = this.#consumeUntil('`').replace(/\\`/g, '`');
    try {
      const parsedJson = JSON.parse(lexeme);

      return {
        type: 'literal',
        value: parsedJson,
        start,
        end: this.#consumedEnd(),
      };
    } catch {
      throw new LexerError(start, lexeme);
    }
  }

  /**
   * Process a quoted identifier.
   *
   * A quoted identifier is a string that is enclosed in double quotes and
   * follows JSON string escaping rules.
   *
   * @throws {LexerError} When the identifier is unterminated or is not a valid
   * JSON string.
   */
  #consumeQuotedIdentifier(): Token {
    const start = this.#position;
    const lexeme = `"${this.#consumeUntil('"')}"`;
    try {
      return {
        type: 'quoted_identifier',
        value: JSON.parse(lexeme),
        start,
        end: this.#consumedEnd(),
      };
    } catch {
      // Surface invalid escapes as a lexing problem instead of leaking the
      // raw SyntaxError thrown by JSON.parse.
      throw new LexerError(start, lexeme);
    }
  }

  /**
   * Process a raw string literal.
   *
   * A raw string literal is a string that is enclosed in single quotes. The
   * only escape that is interpreted is `\'`, which yields a single quote.
   *
   * @throws {LexerError} When the raw string is unterminated.
   */
  #consumeRawStringLiteral(): Token {
    const start = this.#position;
    // Unescape every escaped single quote, not just the first one.
    const lexeme = this.#consumeUntil(`'`).replace(/\\'/g, "'");

    return {
      type: 'literal',
      value: lexeme,
      start,
      end: this.#consumedEnd(),
    };
  }

  /**
   * Match the expected character and return the corresponding token type.
   *
   * Looks one character ahead: when it equals `expected` both characters are
   * consumed as a two-character token, otherwise only the current character
   * is consumed as a one-character token.
   *
   * @param expected The expected character
   * @param matchType The token type to return if the expected character is found
   * @param elseType The token type to return if the expected character is not found
   */
  #matchOrElse(
    expected: string,
    matchType: Token['type'],
    elseType: Token['type']
  ): Token {
    const start = this.#position;
    const current = this.#current;
    const nextChar = this.#next();
    if (nextChar === expected) {
      this.#next();

      return {
        type: matchType,
        value: current + nextChar,
        start,
        end: start + 2,
      };
    }

    return {
      type: elseType,
      value: current,
      start,
      end: start + 1,
    };
  }
}
367+
368+
export { Lexer };

0 commit comments

Comments
 (0)