UNPKG

15.6 kBJavaScriptView Raw
1import defineToJSON from '../jsutils/defineToJSON';
2import { syntaxError } from '../error/syntaxError';
3import { dedentBlockStringValue } from './blockString';
4import { TokenKind } from './tokenKind';
5/**
6 * Given a Source object, this returns a Lexer for that source.
7 * A Lexer is a stateful stream generator in that every time
8 * it is advanced, it returns the next token in the Source. Assuming the
9 * source lexes, the final Token emitted by the lexer will be of kind
10 * EOF, after which the lexer will repeatedly return the same EOF token
11 * whenever called.
12 */
13
/**
 * Creates the lexer state object for `source`.
 *
 * Both `token` and `lastToken` start out as the same start-of-file token,
 * `line` starts at 1 and `lineStart` at 0.
 *
 * @param {Object} source - Source object; stored as-is on the lexer.
 * @param {Object} options - Lexer options; stored as-is on the lexer.
 * @returns {Object} The lexer, exposing `advance` and `lookahead` methods.
 */
export function createLexer(source, options) {
  var sofToken = new Tok(TokenKind.SOF, 0, 0, 0, 0, null);

  return {
    source: source,
    options: options,
    lastToken: sofToken,
    token: sofToken,
    line: 1,
    lineStart: 0,
    advance: advanceLexer,
    lookahead: lookahead
  };
}
28
/**
 * Lexer method: moves the lexer forward by one token.
 *
 * The current token is remembered as `lastToken`, and the token returned by
 * `this.lookahead()` becomes the new current token.
 *
 * @returns {Object} The new current token.
 */
function advanceLexer() {
  this.lastToken = this.token;

  var upcoming = this.lookahead();
  this.token = upcoming;
  return upcoming;
}
34
/**
 * Lexer method: returns the next non-comment token after the current one
 * without changing `this.token`.
 *
 * Tokens are linked through `next`; a token is only lexed (via readToken)
 * when its predecessor has no `next` yet, so repeated calls reuse the chain.
 * If the current token is already EOF, that same token is returned.
 *
 * @returns {Object} The next non-comment token, or the current EOF token.
 */
function lookahead() {
  var cursor = this.token;

  if (cursor.kind === TokenKind.EOF) {
    return cursor;
  }

  do {
    // `next` is filled in lazily the first time it is needed.
    if (!cursor.next) {
      cursor.next = readToken(this, cursor);
    }

    cursor = cursor.next;
  } while (cursor.kind === TokenKind.COMMENT);

  return cursor;
}
47/**
48 * The return type of createLexer.
49 */
50
51
52// @internal
/**
 * Tells whether a token's kind is one of the punctuator kinds.
 *
 * @param {Object} token - Token whose `kind` is inspected.
 * @returns {boolean} True when `token.kind` is a punctuator kind.
 */
export function isPunctuatorToken(token) {
  switch (token.kind) {
    case TokenKind.BANG:
    case TokenKind.DOLLAR:
    case TokenKind.AMP:
    case TokenKind.PAREN_L:
    case TokenKind.PAREN_R:
    case TokenKind.SPREAD:
    case TokenKind.COLON:
    case TokenKind.EQUALS:
    case TokenKind.AT:
    case TokenKind.BRACKET_L:
    case TokenKind.BRACKET_R:
    case TokenKind.BRACE_L:
    case TokenKind.PIPE:
    case TokenKind.BRACE_R:
      return true;

    default:
      return false;
  }
}
57/**
58 * Helper function for constructing the Token object.
59 */
60
/**
 * Token constructor (call with `new`).
 *
 * @param {string} kind - Token kind.
 * @param {number} start - Index in the source body where the token starts.
 * @param {number} end - Index just past the token's last character.
 * @param {number} line - Line number recorded for the token.
 * @param {number} column - Column number recorded for the token.
 * @param {Object|null} prev - Previous token in the chain.
 * @param {string} [value] - Token text; left undefined when not supplied.
 */
function Tok(kind, start, end, line, column, prev, value) {
  // What was lexed.
  this.kind = kind;

  // Where it was found.
  this.start = start;
  this.end = end;
  this.line = line;
  this.column = column;

  // Its text, if any.
  this.value = value;

  // Links to neighbouring tokens; `next` is filled in later by the lexer.
  this.prev = prev;
  this.next = null;
} // Print a simplified form when appearing in JSON/util.inspect.
71
72
// Registers a reduced representation for Tok instances: only kind, value,
// line and column are reported, in that order.
defineToJSON(Tok, function () {
  var summary = {};
  summary.kind = this.kind;
  summary.value = this.value;
  summary.line = this.line;
  summary.column = this.column;
  return summary;
});
81
/**
 * Renders a character code for use in error messages.
 *
 * @param {number} code - UTF-16 code unit; NaN/undefined stands for a read
 *   beyond the end of the file.
 * @returns {string} TokenKind.EOF for NaN, the JSON-quoted character for
 *   codes below 0x7F, otherwise a quoted `\uXXXX` escape.
 */
function printCharCode(code) {
  // NaN/undefined represents access beyond the end of the file.
  if (isNaN(code)) {
    return TokenKind.EOF;
  }

  // Trust JSON for ASCII.
  if (code < 0x007f) {
    return JSON.stringify(String.fromCharCode(code));
  }

  // Otherwise print the escaped form, keeping the last four hex digits.
  var hex = code.toString(16).toUpperCase();
  return '"\\u' + ('00' + hex).slice(-4) + '"';
}
89/**
90 * Gets the next token from the source starting at the given position.
91 *
92 * This skips over whitespace until it finds the next lexable token, then lexes
93 * punctuators immediately or calls the appropriate helper function for more
94 * complicated tokens.
95 */
96
97
/**
 * Maps the character code of a single-character punctuator to its token
 * kind. Returns null for every other code (including `.`, which only forms
 * a token as part of `...`).
 */
function singleCharPunctuatorKind(code) {
  switch (code) {
    case 33:
      return TokenKind.BANG; // !

    case 36:
      return TokenKind.DOLLAR; // $

    case 38:
      return TokenKind.AMP; // &

    case 40:
      return TokenKind.PAREN_L; // (

    case 41:
      return TokenKind.PAREN_R; // )

    case 58:
      return TokenKind.COLON; // :

    case 61:
      return TokenKind.EQUALS; // =

    case 64:
      return TokenKind.AT; // @

    case 91:
      return TokenKind.BRACKET_L; // [

    case 93:
      return TokenKind.BRACKET_R; // ]

    case 123:
      return TokenKind.BRACE_L; // {

    case 124:
      return TokenKind.PIPE; // |

    case 125:
      return TokenKind.BRACE_R; // }

    default:
      return null;
  }
}

/**
 * Lexes the token that follows `prev` in the lexer's source.
 *
 * Leading whitespace is skipped first (which may update `lexer.line` and
 * `lexer.lineStart`). Punctuators are produced directly; names, numbers,
 * comments, strings and block strings are delegated to their readers.
 *
 * @param {Object} lexer - Lexer state (`source`, `line`, `lineStart`).
 * @param {Object} prev - Previously lexed token; lexing resumes at `prev.end`.
 * @returns {Object} The next token (an EOF token once the body is exhausted).
 * @throws A syntax error when the next character cannot start any token.
 */
function readToken(lexer, prev) {
  var source = lexer.source;
  var body = source.body;
  var bodyLength = body.length;

  // Skipping whitespace must happen before line/lineStart are read below,
  // because it updates both on the lexer.
  var pos = positionAfterWhitespace(body, prev.end, lexer);
  var line = lexer.line;
  var col = 1 + pos - lexer.lineStart;

  if (pos >= bodyLength) {
    return new Tok(TokenKind.EOF, bodyLength, bodyLength, line, col, prev);
  }

  var code = body.charCodeAt(pos);

  // A-Z _ a-z
  if (code === 95 || code >= 65 && code <= 90 || code >= 97 && code <= 122) {
    return readName(source, pos, line, col, prev);
  }

  // - 0-9
  if (code === 45 || code >= 48 && code <= 57) {
    return readNumber(source, pos, code, line, col, prev);
  }

  // #
  if (code === 35) {
    return readComment(source, pos, line, col, prev);
  }

  // " (three in a row open a block string)
  if (code === 34) {
    var opensBlockString = body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34;
    return opensBlockString ? readBlockString(source, pos, line, col, prev, lexer) : readString(source, pos, line, col, prev);
  }

  // . (only valid as part of "...")
  if (code === 46) {
    if (body.charCodeAt(pos + 1) === 46 && body.charCodeAt(pos + 2) === 46) {
      return new Tok(TokenKind.SPREAD, pos, pos + 3, line, col, prev);
    }
  } else {
    var punctuatorKind = singleCharPunctuatorKind(code);

    if (punctuatorKind !== null) {
      return new Tok(punctuatorKind, pos, pos + 1, line, col, prev);
    }
  }

  throw syntaxError(source, pos, unexpectedCharacterMessage(code));
}
258/**
259 * Report a message that an unexpected character was encountered.
260 */
261
262
/**
 * Builds the error message for a character that cannot start a token.
 *
 * @param {number} code - Character code of the offending character.
 * @returns {string} A message tailored to control characters, to the single
 *   quote, or to any other unexpected character.
 */
function unexpectedCharacterMessage(code) {
  // Tab, line feed and carriage return are the only control characters that
  // do not get the "invalid character" wording.
  var isTabOrLineBreak = code === 0x0009 || code === 0x000a || code === 0x000d;

  if (code < 0x0020 && !isTabOrLineBreak) {
    return 'Cannot contain the invalid character ' + printCharCode(code) + '.';
  }

  // '
  if (code === 39) {
    return 'Unexpected single quote character (\'), did you mean to use a double quote (")?';
  }

  return 'Cannot parse the unexpected character ' + printCharCode(code) + '.';
}
275/**
276 * Reads from body starting at startPosition until it finds a non-whitespace
277 * character, then returns the position of that character for lexing.
278 */
279
280
/**
 * Finds the index of the first character at or after `startPosition` that
 * is not ignored: tab, space, comma, BOM and line terminators are skipped.
 *
 * Each line terminator (LF, CR, or CR LF counted once) increments
 * `lexer.line` and sets `lexer.lineStart` to the index just after it.
 *
 * @param {string} body - Source text.
 * @param {number} startPosition - Index at which scanning starts.
 * @param {Object} lexer - Lexer whose `line` / `lineStart` are updated.
 * @returns {number} Index of the next significant character, or a value
 *   >= body.length when none remains.
 */
function positionAfterWhitespace(body, startPosition, lexer) {
  var limit = body.length;
  var cursor = startPosition;

  while (cursor < limit) {
    var ch = body.charCodeAt(cursor);

    if (ch === 10 || ch === 13) {
      // A carriage return immediately followed by a line feed is one break.
      var isCrLf = ch === 13 && body.charCodeAt(cursor + 1) === 10;
      cursor += isCrLf ? 2 : 1;
      ++lexer.line;
      lexer.lineStart = cursor;
      continue;
    }

    // Anything other than tab | space | comma | BOM ends the scan.
    if (ch !== 9 && ch !== 32 && ch !== 44 && ch !== 0xfeff) {
      break;
    }

    ++cursor;
  }

  return cursor;
}
312/**
313 * Reads a comment token from the source file.
314 *
315 * #[\u0009\u0020-\uFFFF]*
316 */
317
318
/**
 * Lexes a comment that starts with the `#` at `start`.
 *
 * The comment extends until the end of the body or the first character with
 * a code of 0x1F or below other than tab. The token value is the text after
 * the `#`.
 *
 * @param {Object} source - Source object providing `body`.
 * @param {number} start - Index of the `#`.
 * @param {number} line - Line recorded on the token.
 * @param {number} col - Column recorded on the token.
 * @param {Object} prev - Previous token.
 * @returns {Object} A COMMENT token.
 */
function readComment(source, start, line, col, prev) {
  var body = source.body;
  var end = start + 1;

  for (;;) {
    var code = body.charCodeAt(end);
    // SourceCharacter but not LineTerminator keeps the comment going.
    var continuesComment = !isNaN(code) && (code > 0x001f || code === 0x0009);

    if (!continuesComment) {
      break;
    }

    ++end;
  }

  var text = body.slice(start + 1, end);
  return new Tok(TokenKind.COMMENT, start, end, line, col, prev, text);
}
331/**
332 * Reads a number token from the source file, either a float
333 * or an int depending on whether a decimal point appears.
334 *
335 * Int: -?(0|[1-9][0-9]*)
336 * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
337 */
338
339
/**
 * Lexes an Int or Float token starting at `start`.
 *
 * Int:   -?(0|[1-9][0-9]*)
 * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
 *
 * @param {Object} source - Source object providing `body`.
 * @param {number} start - Index of the first character of the number.
 * @param {number} firstCode - Character code found at `start`.
 * @param {number} line - Line recorded on the token.
 * @param {number} col - Column recorded on the token.
 * @param {Object} prev - Previous token.
 * @returns {Object} A FLOAT token when a fraction or exponent is present,
 *   otherwise an INT token; the value is the raw number text.
 * @throws A syntax error for a digit after a leading 0, a missing digit, or
 *   a number directly followed by `.`, `E` or `e`.
 */
function readNumber(source, start, firstCode, line, col, prev) {
  var body = source.body;
  var position = start;
  var code = firstCode;
  var hasFraction = false;
  var hasExponent = false;

  // Optional leading minus sign.
  if (code === 45) {
    position += 1;
    code = body.charCodeAt(position);
  }

  // Integer part: either a lone 0 or a run of digits.
  if (code !== 48) {
    position = readDigits(source, position, code);
    code = body.charCodeAt(position);
  } else {
    position += 1;
    code = body.charCodeAt(position);

    if (code >= 48 && code <= 57) {
      throw syntaxError(source, position, 'Invalid number, unexpected digit after 0: ' + printCharCode(code) + '.');
    }
  }

  // Fractional part: "." followed by at least one digit.
  if (code === 46) {
    hasFraction = true;
    position += 1;
    code = body.charCodeAt(position);
    position = readDigits(source, position, code);
    code = body.charCodeAt(position);
  }

  // Exponent part: E or e, optional sign, at least one digit.
  if (code === 69 || code === 101) {
    hasExponent = true;
    position += 1;
    code = body.charCodeAt(position);

    if (code === 43 || code === 45) {
      position += 1;
      code = body.charCodeAt(position);
    }

    position = readDigits(source, position, code);
    code = body.charCodeAt(position);
  }

  // Numbers cannot be followed by . or e
  if (code === 46 || code === 69 || code === 101) {
    throw syntaxError(source, position, 'Invalid number, expected digit but got: ' + printCharCode(code) + '.');
  }

  var kind = hasFraction || hasExponent ? TokenKind.FLOAT : TokenKind.INT;
  return new Tok(kind, start, position, line, col, prev, body.slice(start, position));
}
392/**
393 * Returns the new position in the source after reading digits.
394 */
395
396
/**
 * Consumes a run of decimal digits that begins at `start`.
 *
 * @param {Object} source - Source object providing `body`.
 * @param {number} start - Index of the first expected digit.
 * @param {number} firstCode - Character code found at `start`.
 * @returns {number} Index of the first character after the digits.
 * @throws A syntax error when `firstCode` is not a digit (0-9).
 */
function readDigits(source, start, firstCode) {
  var body = source.body;
  var startsWithDigit = firstCode >= 48 && firstCode <= 57;

  if (!startsWithDigit) {
    throw syntaxError(source, start, 'Invalid number, expected digit but got: ' + printCharCode(firstCode) + '.');
  }

  // The first digit is already known; scan forward past the rest.
  var position = start + 1;
  var code = body.charCodeAt(position);

  while (code >= 48 && code <= 57) {
    position += 1;
    code = body.charCodeAt(position);
  }

  return position;
}
414/**
415 * Reads a string token from the source file.
416 *
417 * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
418 */
419
420
/**
 * Lexes a double-quoted string whose opening quote is at `start`.
 *
 * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
 *
 * Unescaped text is copied in chunks; each escape sequence is decoded and
 * appended between chunks.
 *
 * @param {Object} source - Source object providing `body`.
 * @param {number} start - Index of the opening quote.
 * @param {number} line - Line recorded on the token.
 * @param {number} col - Column recorded on the token.
 * @param {Object} prev - Previous token.
 * @returns {Object} A STRING token whose value is the decoded text.
 * @throws A syntax error on an invalid character, an invalid escape
 *   sequence, or when the string is not closed before a line terminator or
 *   the end of the body.
 */
function readString(source, start, line, col, prev) {
  var body = source.body;
  var limit = body.length;
  var position = start + 1;
  var chunkStart = position;
  var value = '';

  while (position < limit) {
    var code = body.charCodeAt(position);

    // A line terminator ends the scan; the string is then unterminated.
    if (isNaN(code) || code === 0x000a || code === 0x000d) {
      break;
    }

    // Closing Quote (")
    if (code === 34) {
      value += body.slice(chunkStart, position);
      return new Tok(TokenKind.STRING, start, position + 1, line, col, prev, value);
    }

    // SourceCharacter
    if (code < 0x0020 && code !== 0x0009) {
      throw syntaxError(source, position, 'Invalid character within String: ' + printCharCode(code) + '.');
    }

    // Ordinary character: it stays part of the current chunk.
    if (code !== 92) {
      position += 1;
      continue;
    }

    // Backslash: flush the chunk collected so far, then decode the escape.
    value += body.slice(chunkStart, position);
    position += 1; // now at the character following the backslash

    var escapeCode = body.charCodeAt(position);

    if (escapeCode === 117) {
      // uXXXX
      var charCode = uniCharCode(body.charCodeAt(position + 1), body.charCodeAt(position + 2), body.charCodeAt(position + 3), body.charCodeAt(position + 4));

      if (charCode < 0) {
        var invalidSequence = body.slice(position + 1, position + 5);
        throw syntaxError(source, position, 'Invalid character escape sequence: \\u' + invalidSequence + '.');
      }

      value += String.fromCharCode(charCode);
      position += 4;
    } else {
      var decoded;

      switch (escapeCode) {
        case 34:
          decoded = '"';
          break;

        case 47:
          decoded = '/';
          break;

        case 92:
          decoded = '\\';
          break;

        case 98:
          decoded = '\b';
          break;

        case 102:
          decoded = '\f';
          break;

        case 110:
          decoded = '\n';
          break;

        case 114:
          decoded = '\r';
          break;

        case 116:
          decoded = '\t';
          break;

        default:
          throw syntaxError(source, position, 'Invalid character escape sequence: \\' + String.fromCharCode(escapeCode) + '.');
      }

      value += decoded;
    }

    // Step past the escape's last character and start a new chunk there.
    position += 1;
    chunkStart = position;
  }

  throw syntaxError(source, position, 'Unterminated string.');
}
507/**
508 * Reads a block string token from the source file.
509 *
510 * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
511 */
512
513
/**
 * Lexes a block string whose opening `"""` starts at `start`.
 *
 * """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
 *
 * Line terminators inside the block string update `lexer.line` and
 * `lexer.lineStart`. The only escape recognised is `\"""`, which yields a
 * literal `"""`. The collected raw text is passed through
 * dedentBlockStringValue to produce the token value.
 *
 * @param {Object} source - Source object providing `body`.
 * @param {number} start - Index of the first opening quote.
 * @param {number} line - Line recorded on the token.
 * @param {number} col - Column recorded on the token.
 * @param {Object} prev - Previous token.
 * @param {Object} lexer - Lexer whose `line` / `lineStart` are updated.
 * @returns {Object} A BLOCK_STRING token.
 * @throws A syntax error on an invalid character or when the closing `"""`
 *   is missing.
 */
function readBlockString(source, start, line, col, prev, lexer) {
  var body = source.body;
  var limit = body.length;
  var position = start + 3;
  var chunkStart = position;
  var rawValue = '';

  while (position < limit) {
    var code = body.charCodeAt(position);

    if (isNaN(code)) {
      break;
    }

    // Closing Triple-Quote (""")
    if (code === 34 && body.charCodeAt(position + 1) === 34 && body.charCodeAt(position + 2) === 34) {
      rawValue += body.slice(chunkStart, position);
      return new Tok(TokenKind.BLOCK_STRING, start, position + 3, line, col, prev, dedentBlockStringValue(rawValue));
    }

    // SourceCharacter
    if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) {
      throw syntaxError(source, position, 'Invalid character within String: ' + printCharCode(code) + '.');
    }

    if (code === 10 || code === 13) {
      // A carriage return immediately followed by a line feed is one break.
      var isCrLf = code === 13 && body.charCodeAt(position + 1) === 10;
      position += isCrLf ? 2 : 1;
      ++lexer.line;
      lexer.lineStart = position;
      continue;
    }

    // Escape Triple-Quote (\""")
    var isEscapedTripleQuote = code === 92 && body.charCodeAt(position + 1) === 34 && body.charCodeAt(position + 2) === 34 && body.charCodeAt(position + 3) === 34;

    if (isEscapedTripleQuote) {
      // Keep the text so far, drop the backslash, and start a fresh chunk.
      rawValue += body.slice(chunkStart, position) + '"""';
      position += 4;
      chunkStart = position;
      continue;
    }

    ++position;
  }

  throw syntaxError(source, position, 'Unterminated string.');
}
560/**
561 * Converts four hexadecimal chars to the integer that the
562 * string represents. For example, uniCharCode('0','0','0','f')
563 * will return 15, and uniCharCode('0','0','f','f') returns 255.
564 *
565 * Returns a negative number on error, if a char was invalid.
566 *
567 * This is implemented by noting that char2hex() returns -1 on error,
568 * which means the result of ORing the char2hex() will also be negative.
569 */
570
571
/**
 * Combines the character codes of four hexadecimal digits into the 16-bit
 * value they spell, most significant digit first.
 *
 * Each argument is converted with char2hex, which yields -1 for a non-hex
 * character; ORing in such a value makes the overall result negative.
 *
 * @param {number} a - Character code of the first (highest) hex digit.
 * @param {number} b - Character code of the second hex digit.
 * @param {number} c - Character code of the third hex digit.
 * @param {number} d - Character code of the fourth (lowest) hex digit.
 * @returns {number} The combined value, or a negative number on error.
 */
function uniCharCode(a, b, c, d) {
  var nibble3 = char2hex(a) << 12;
  var nibble2 = char2hex(b) << 8;
  var nibble1 = char2hex(c) << 4;
  var nibble0 = char2hex(d);
  return nibble3 | nibble2 | nibble1 | nibble0;
}
575/**
576 * Converts a hex character to its integer value.
577 * '0' becomes 0, '9' becomes 9
578 * 'A' becomes 10, 'F' becomes 15
579 * 'a' becomes 10, 'f' becomes 15
580 *
581 * Returns -1 on error.
582 */
583
584
/**
 * Converts the character code of one hexadecimal digit to its numeric value.
 *
 * @param {number} a - Character code to convert.
 * @returns {number} 0-15 for 0-9, A-F or a-f; -1 for anything else.
 */
function char2hex(a) {
  // 0-9
  if (a >= 48 && a <= 57) {
    return a - 48;
  }

  // A-F
  if (a >= 65 && a <= 70) {
    return a - 55;
  }

  // a-f
  if (a >= 97 && a <= 102) {
    return a - 87;
  }

  return -1;
}
591/**
592 * Reads an alphanumeric + underscore name from the source.
593 *
594 * [_A-Za-z][_0-9A-Za-z]*
595 */
596
597
/**
 * Lexes a name token starting at `start`.
 *
 * [_A-Za-z][_0-9A-Za-z]*
 *
 * The character at `start` is taken as already validated; scanning begins
 * with the character after it.
 *
 * @param {Object} source - Source object providing `body`.
 * @param {number} start - Index of the first character of the name.
 * @param {number} line - Line recorded on the token.
 * @param {number} col - Column recorded on the token.
 * @param {Object} prev - Previous token.
 * @returns {Object} A NAME token whose value is the name text.
 */
function readName(source, start, line, col, prev) {
  var body = source.body;
  var limit = body.length;
  var end = start + 1;

  while (end !== limit) {
    var code = body.charCodeAt(end);

    if (isNaN(code)) {
      break;
    }

    var isNameContinue = code === 95 || // _
    code >= 48 && code <= 57 || // 0-9
    code >= 65 && code <= 90 || // A-Z
    code >= 97 && code <= 122; // a-z

    if (!isNameContinue) {
      break;
    }

    ++end;
  }

  return new Tok(TokenKind.NAME, start, end, line, col, prev, body.slice(start, end));
}