// UNPKG - 16.2 kB, JavaScript (raw view)
import { syntaxError } from "../error/syntaxError.mjs";
import { Token } from "./ast.mjs";
import { TokenKind } from "./tokenKind.mjs";
import { dedentBlockStringValue } from "./blockString.mjs";
/**
 * Given a Source object, creates a Lexer for that source.
 * A Lexer is a stateful stream generator: every time it is advanced, it
 * returns the next token in the Source. Assuming the source lexes, the final
 * Token emitted by the lexer will be of kind EOF, after which the lexer will
 * repeatedly return the same EOF token whenever called.
 */
export var Lexer = /*#__PURE__*/function () {
  /**
   * @param source - The Source to lex; its `body` string is read by readToken.
   */
  function Lexer(source) {
    // Both token slots start on a synthetic start-of-file token.
    const startOfFileToken = new Token(TokenKind.SOF, 0, 0, 0, 0, null);

    this.source = source;
    // The previously focused non-ignored token.
    this.lastToken = startOfFileToken;
    // The currently focused non-ignored token.
    this.token = startOfFileToken;
    // The (1-indexed) line containing the current token.
    this.line = 1;
    // The character offset at which the current line begins.
    this.lineStart = 0;
  }

  const proto = Lexer.prototype;

  /**
   * Advances the token stream to the next non-ignored token.
   *
   * @returns The token that is now current.
   */
  proto.advance = function advance() {
    this.lastToken = this.token;
    const nextToken = this.lookahead();
    this.token = nextToken;
    return nextToken;
  };

  /**
   * Looks ahead and returns the next non-ignored token, but does not change
   * `token` / `lastToken`. Tokens read here are cached on `token.next`, so a
   * later lookahead or advance does not lex them again.
   *
   * @returns The next non-comment token, or the current token if it is EOF.
   */
  proto.lookahead = function lookahead() {
    let token = this.token;

    if (token.kind === TokenKind.EOF) {
      return token;
    }

    do {
      // Note: next is only mutable during parsing.
      let following = token.next;

      if (following === null || following === undefined) {
        following = readToken(this, token);
        token.next = following;
      }

      token = following;
    } while (token.kind === TokenKind.COMMENT);

    return token;
  };

  return Lexer;
}();
/**
 * Reports whether the given token kind is one of the punctuator kinds.
 *
 * @param kind - A token kind value (compared against TokenKind members).
 * @returns {boolean} True for punctuator kinds, false for anything else.
 * @internal
 */
export function isPunctuatorTokenKind(kind) {
  switch (kind) {
    case TokenKind.BANG:
    case TokenKind.DOLLAR:
    case TokenKind.AMP:
    case TokenKind.PAREN_L:
    case TokenKind.PAREN_R:
    case TokenKind.SPREAD:
    case TokenKind.COLON:
    case TokenKind.EQUALS:
    case TokenKind.AT:
    case TokenKind.BRACKET_L:
    case TokenKind.BRACKET_R:
    case TokenKind.BRACE_L:
    case TokenKind.PIPE:
    case TokenKind.BRACE_R:
      return true;

    default:
      return false;
  }
}
80
/**
 * Formats a character code for use in an error message.
 *
 * @param code - A UTF-16 code unit as returned by String.prototype.charCodeAt,
 *   or NaN when the read was past the end of the source.
 * @returns {string} TokenKind.EOF for NaN, a JSON-quoted character for codes
 *   below 0x7F, otherwise a quoted upper-case `\uXXXX` escape.
 */
function printCharCode(code) {
  // NaN/undefined represents access beyond the end of the file.
  // The coercing global isNaN is intentional so undefined is covered too.
  if (isNaN(code)) {
    return TokenKind.EOF;
  }

  // Trust JSON for ASCII.
  if (code < 0x007f) {
    return JSON.stringify(String.fromCharCode(code));
  }

  // Otherwise print the escaped form, zero-padded to four hex digits.
  const hex = ('00' + code.toString(16).toUpperCase()).slice(-4);
  return `"\\u${hex}"`;
}
/**
 * Maps the char code of a single-character punctuator to its token kind.
 * Returns undefined for every other code (including `.`, which is only a
 * punctuator as part of the three-character spread `...`).
 */
function punctuatorKind(code) {
  switch (code) {
    case 33: // !
      return TokenKind.BANG;
    case 36: // $
      return TokenKind.DOLLAR;
    case 38: // &
      return TokenKind.AMP;
    case 40: // (
      return TokenKind.PAREN_L;
    case 41: // )
      return TokenKind.PAREN_R;
    case 58: // :
      return TokenKind.COLON;
    case 61: // =
      return TokenKind.EQUALS;
    case 64: // @
      return TokenKind.AT;
    case 91: // [
      return TokenKind.BRACKET_L;
    case 93: // ]
      return TokenKind.BRACKET_R;
    case 123: // {
      return TokenKind.BRACE_L;
    case 124: // |
      return TokenKind.PIPE;
    case 125: // }
      return TokenKind.BRACE_R;
    default:
      return undefined;
  }
}

/**
 * Gets the next token from the source starting at the end of `prev`.
 *
 * This skips over ignored characters (BOM, tab, space, comma, line
 * terminators) until it finds the next lexable token, then lexes punctuators
 * immediately or calls the appropriate helper function for more complicated
 * tokens. Line terminators update `lexer.line` and `lexer.lineStart`.
 *
 * @param lexer - The Lexer whose source is read and whose line bookkeeping
 *   is updated.
 * @param prev - The previously read token; scanning starts at `prev.end`.
 * @returns The next token (comments included), or an EOF token at the end of
 *   the body.
 * @throws A syntax error for a character that cannot start any token.
 */
function readToken(lexer, prev) {
  const source = lexer.source;
  const body = source.body;
  const bodyLength = body.length;
  let pos = prev.end;

  while (pos < bodyLength) {
    const code = body.charCodeAt(pos);

    // <BOM>, \t, <space> and , are ignored.
    if (code === 0xfeff || code === 9 || code === 32 || code === 44) {
      ++pos;
      continue;
    }

    // \n, \r and \r\n each count as a single line terminator.
    if (code === 10 || code === 13) {
      const isCrLf = code === 13 && body.charCodeAt(pos + 1) === 10;
      pos += isCrLf ? 2 : 1;
      ++lexer.line;
      lexer.lineStart = pos;
      continue;
    }

    const line = lexer.line;
    const col = 1 + pos - lexer.lineStart;

    if (code === 35) {
      // #
      return readComment(source, pos, line, col, prev);
    }

    if (code === 34) {
      // " - three in a row open a block string.
      if (body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34) {
        return readBlockString(source, pos, line, col, prev, lexer);
      }

      return readString(source, pos, line, col, prev);
    }

    if (code === 45 || (code >= 48 && code <= 57)) {
      // - or 0-9
      return readNumber(source, pos, code, line, col, prev);
    }

    if (isNameStart(code)) {
      // _ A-Z a-z
      return readName(source, pos, line, col, prev);
    }

    const kind = punctuatorKind(code);

    if (kind !== undefined) {
      return new Token(kind, pos, pos + 1, line, col, prev);
    }

    if (code === 46 && body.charCodeAt(pos + 1) === 46 && body.charCodeAt(pos + 2) === 46) {
      // ...
      return new Token(TokenKind.SPREAD, pos, pos + 3, line, col, prev);
    }

    throw syntaxError(source, pos, unexpectedCharacterMessage(code));
  }

  const eofLine = lexer.line;
  const eofCol = 1 + pos - lexer.lineStart;
  return new Token(TokenKind.EOF, bodyLength, bodyLength, eofLine, eofCol, prev);
}
/**
 * Builds the error message for a character that cannot start any token.
 *
 * @param code - The char code of the offending character.
 * @returns {string} A message that distinguishes disallowed control
 *   characters, a stray single quote, and every other unexpected character.
 */
function unexpectedCharacterMessage(code) {
  // Control characters other than \t, \n and \r may not appear in a source.
  const isInvalidControl =
    code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d;

  if (isInvalidControl) {
    return `Cannot contain the invalid character ${printCharCode(code)}.`;
  }

  if (code === 39) {
    // '
    return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
  }

  return `Cannot parse the unexpected character ${printCharCode(code)}.`;
}
/**
 * Reads a comment token from the source file.
 *
 * #[\u0009\u0020-\uFFFF]*
 *
 * @param source - Source whose `body` is scanned.
 * @param start - Offset of the `#` character.
 * @param line - Line number recorded on the token.
 * @param col - Column number recorded on the token.
 * @param prev - The previous token.
 * @returns A COMMENT token spanning from `#` up to (not including) the first
 *   control character other than tab, or the end of the body. Its value is
 *   the comment text without the leading `#`.
 */
function readComment(source, start, line, col, prev) {
  const body = source.body;
  let position = start + 1;

  while (position < body.length) {
    const code = body.charCodeAt(position);

    // SourceCharacter but not LineTerminator: stop at any control character
    // except tab.
    if (code <= 0x001f && code !== 0x0009) {
      break;
    }

    ++position;
  }

  return new Token(TokenKind.COMMENT, start, position, line, col, prev, body.slice(start + 1, position));
}
/**
 * Reads a number token from the source file, either a float
 * or an int depending on whether a decimal point or exponent appears.
 *
 * Int:   -?(0|[1-9][0-9]*)
 * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
 *
 * @param source - Source whose `body` is scanned.
 * @param start - Offset of the first character of the number.
 * @param firstCode - Char code at `start` (`-` or a digit).
 * @param line - Line number recorded on the token.
 * @param col - Column number recorded on the token.
 * @param prev - The previous token.
 * @returns An INT or FLOAT token whose value is the raw number text.
 * @throws A syntax error for a leading zero followed by a digit, a missing
 *   digit, or a number directly followed by `.` or a name-start character.
 */
function readNumber(source, start, firstCode, line, col, prev) {
  const body = source.body;
  let code = firstCode;
  let position = start;
  let isFloat = false;

  // Moves one character forward and reloads `code`.
  const step = () => {
    code = body.charCodeAt(++position);
  };

  // Consumes one or more digits at `position` and reloads `code`.
  const digits = () => {
    position = readDigits(source, position, code);
    code = body.charCodeAt(position);
  };

  if (code === 45) {
    // -
    step();
  }

  if (code === 48) {
    // 0 - must not be followed by another digit.
    step();

    if (code >= 48 && code <= 57) {
      throw syntaxError(source, position, `Invalid number, unexpected digit after 0: ${printCharCode(code)}.`);
    }
  } else {
    digits();
  }

  if (code === 46) {
    // .
    isFloat = true;
    step();
    digits();
  }

  if (code === 69 || code === 101) {
    // E e
    isFloat = true;
    step();

    if (code === 43 || code === 45) {
      // + -
      step();
    }

    digits();
  }

  // Numbers cannot be followed by . or NameStart
  if (code === 46 || isNameStart(code)) {
    throw syntaxError(source, position, `Invalid number, expected digit but got: ${printCharCode(code)}.`);
  }

  const kind = isFloat ? TokenKind.FLOAT : TokenKind.INT;
  return new Token(kind, start, position, line, col, prev, body.slice(start, position));
}
/**
 * Returns the new position in the source after reading one or more digits.
 *
 * @param source - Source whose `body` is scanned.
 * @param start - Offset of the first expected digit.
 * @param firstCode - Char code at `start`.
 * @returns {number} Offset of the first character after the digit run.
 * @throws A syntax error if `firstCode` is not a digit (0-9).
 */
function readDigits(source, start, firstCode) {
  const isDigit = (code) => code >= 48 && code <= 57; // 0 - 9

  if (!isDigit(firstCode)) {
    throw syntaxError(source, start, `Invalid number, expected digit but got: ${printCharCode(firstCode)}.`);
  }

  const body = source.body;
  let position = start + 1;

  while (isDigit(body.charCodeAt(position))) {
    ++position;
  }

  return position;
}
/**
 * Returns the character produced by a single-character escape (the character
 * following a backslash), or undefined if `code` is not one of ["\/bfnrt].
 */
function escapedCharacter(code) {
  switch (code) {
    case 34:
      return '"';
    case 47:
      return '/';
    case 92:
      return '\\';
    case 98:
      return '\b';
    case 102:
      return '\f';
    case 110:
      return '\n';
    case 114:
      return '\r';
    case 116:
      return '\t';
    default:
      return undefined;
  }
}

/**
 * Reads a string token from the source file.
 *
 * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
 *
 * @param source - Source whose `body` is scanned.
 * @param start - Offset of the opening quote.
 * @param line - Line number recorded on the token.
 * @param col - Column number recorded on the token.
 * @param prev - The previous token.
 * @returns A STRING token whose value has all escape sequences decoded.
 * @throws A syntax error for an invalid control character, an invalid escape
 *   sequence, or a string not closed before a line terminator or the end of
 *   the body.
 */
function readString(source, start, line, col, prev) {
  const body = source.body;
  const bodyLength = body.length;
  let position = start + 1;
  // Start of the run of literal characters not yet appended to `value`.
  let chunkStart = position;
  let value = '';

  while (position < bodyLength) {
    const code = body.charCodeAt(position);

    // A LineTerminator ends the scan; the string is unterminated.
    if (code === 0x000a || code === 0x000d) {
      break;
    }

    // Closing Quote (")
    if (code === 34) {
      value += body.slice(chunkStart, position);
      return new Token(TokenKind.STRING, start, position + 1, line, col, prev, value);
    }

    // SourceCharacter
    if (code < 0x0020 && code !== 0x0009) {
      throw syntaxError(source, position, `Invalid character within String: ${printCharCode(code)}.`);
    }

    ++position;

    if (code !== 92) {
      continue;
    }

    // \ - flush the literal chunk, then decode the escape. `position` now
    // points at the character following the backslash.
    value += body.slice(chunkStart, position - 1);
    const escapeCode = body.charCodeAt(position);

    if (escapeCode === 117) {
      // uXXXX
      const charCode = uniCharCode(
        body.charCodeAt(position + 1),
        body.charCodeAt(position + 2),
        body.charCodeAt(position + 3),
        body.charCodeAt(position + 4),
      );

      if (charCode < 0) {
        const invalidSequence = body.slice(position + 1, position + 5);
        throw syntaxError(source, position, `Invalid character escape sequence: \\u${invalidSequence}.`);
      }

      value += String.fromCharCode(charCode);
      position += 4;
    } else {
      const replacement = escapedCharacter(escapeCode);

      if (replacement === undefined) {
        // A backslash at the very end of the body yields NaN here;
        // String.fromCharCode(NaN) would put a NUL character in the message,
        // so report just the backslash in that case.
        const shown = isNaN(escapeCode) ? '' : String.fromCharCode(escapeCode);
        throw syntaxError(source, position, `Invalid character escape sequence: \\${shown}.`);
      }

      value += replacement;
    }

    ++position;
    chunkStart = position;
  }

  throw syntaxError(source, position, 'Unterminated string.');
}
/**
 * Reads a block string token from the source file.
 *
 * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
 *
 * Line terminators inside the block string advance `lexer.line` and
 * `lexer.lineStart`, so tokens after it report correct positions.
 *
 * @param source - Source whose `body` is scanned.
 * @param start - Offset of the opening triple-quote.
 * @param line - Line number recorded on the token.
 * @param col - Column number recorded on the token.
 * @param prev - The previous token.
 * @param lexer - The Lexer whose line bookkeeping is updated.
 * @returns A BLOCK_STRING token whose value is the raw text (with `\"""`
 *   unescaped) passed through dedentBlockStringValue.
 * @throws A syntax error for an invalid control character or a block string
 *   that is not closed before the end of the body.
 */
function readBlockString(source, start, line, col, prev, lexer) {
  const body = source.body;
  const bodyLength = body.length;
  let position = start + 3;
  // Start of the run of raw characters not yet appended to `rawValue`.
  let chunkStart = position;
  let rawValue = '';

  // True when three consecutive double quotes begin at `index`.
  const tripleQuoteAt = (index) =>
    body.charCodeAt(index) === 34 && body.charCodeAt(index + 1) === 34 && body.charCodeAt(index + 2) === 34;

  while (position < bodyLength) {
    const code = body.charCodeAt(position);

    // Closing Triple-Quote (""")
    if (code === 34 && tripleQuoteAt(position)) {
      rawValue += body.slice(chunkStart, position);
      return new Token(TokenKind.BLOCK_STRING, start, position + 3, line, col, prev, dedentBlockStringValue(rawValue));
    }

    // SourceCharacter
    if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) {
      throw syntaxError(source, position, `Invalid character within String: ${printCharCode(code)}.`);
    }

    if (code === 10 || code === 13) {
      // \n, \r and \r\n each count as a single new line.
      const isCrLf = code === 13 && body.charCodeAt(position + 1) === 10;
      position += isCrLf ? 2 : 1;
      ++lexer.line;
      lexer.lineStart = position;
    } else if (code === 92 && tripleQuoteAt(position + 1)) {
      // Escape Triple-Quote (\""")
      rawValue += body.slice(chunkStart, position) + '"""';
      position += 4;
      chunkStart = position;
    } else {
      ++position;
    }
  }

  throw syntaxError(source, position, 'Unterminated string.');
}
/**
 * Converts the char codes of four hexadecimal digits to the integer that
 * they represent. For example, the char codes of '0','0','0','f' give 15,
 * and those of '0','0','f','f' give 255.
 *
 * Returns a negative number on error, if any char code was not a hex digit.
 *
 * This is implemented by noting that char2hex() returns -1 on error,
 * which means the result of ORing the char2hex() will also be negative.
 *
 * @param a - Char code of the most significant hex digit.
 * @param b - Char code of the second hex digit.
 * @param c - Char code of the third hex digit.
 * @param d - Char code of the least significant hex digit.
 * @returns {number} The 16-bit value, or a negative number on error.
 */
function uniCharCode(a, b, c, d) {
  const high = (char2hex(a) << 12) | (char2hex(b) << 8);
  const low = (char2hex(c) << 4) | char2hex(d);
  return high | low;
}
/**
 * Converts the char code of a hex digit to its integer value.
 * The code of '0' becomes 0, '9' becomes 9,
 * 'A' becomes 10, 'F' becomes 15,
 * 'a' becomes 10, 'f' becomes 15.
 *
 * @param a - A char code (NaN is treated as invalid).
 * @returns {number} The digit value 0-15, or -1 on error.
 */
function char2hex(a) {
  if (a >= 48 && a <= 57) {
    return a - 48; // 0-9
  }

  if (a >= 65 && a <= 70) {
    return a - 55; // A-F
  }

  if (a >= 97 && a <= 102) {
    return a - 87; // a-f
  }

  return -1;
}
/**
 * Reads an alphanumeric + underscore name from the source.
 *
 * [_A-Za-z][_0-9A-Za-z]*
 *
 * @param source - Source whose `body` is scanned.
 * @param start - Offset of the first character of the name (the caller has
 *   already checked that it is a valid name start).
 * @param line - Line number recorded on the token.
 * @param col - Column number recorded on the token.
 * @param prev - The previous token.
 * @returns A NAME token whose value is the name text.
 */
function readName(source, start, line, col, prev) {
  const body = source.body;
  const bodyLength = body.length;

  const isNameContinue = (code) =>
    code === 95 || // _
    (code >= 48 && code <= 57) || // 0-9
    (code >= 65 && code <= 90) || // A-Z
    (code >= 97 && code <= 122); // a-z

  let position = start + 1;

  while (position < bodyLength && isNameContinue(body.charCodeAt(position))) {
    ++position;
  }

  return new Token(TokenKind.NAME, start, position, line, col, prev, body.slice(start, position));
}
// _ A-Z a-z
672
673
/**
 * Reports whether a char code may begin a name: `_`, `A`-`Z` or `a`-`z`.
 *
 * @param code - A char code (NaN yields false).
 * @returns {boolean} True if the code is a valid first character of a name.
 */
function isNameStart(code) {
  if (code === 95) {
    // _
    return true;
  }

  const isUpper = code >= 65 && code <= 90; // A-Z
  const isLower = code >= 97 && code <= 122; // a-z
  return isUpper || isLower;
}