// @flow strict import defineToJSON from '../jsutils/defineToJSON'; import { syntaxError } from '../error/syntaxError'; import { type Token } from './ast'; import { type Source } from './source'; import { dedentBlockStringValue } from './blockString'; import { type TokenKindEnum, TokenKind } from './tokenKind'; /** * Given a Source object, this returns a Lexer for that source. * A Lexer is a stateful stream generator in that every time * it is advanced, it returns the next token in the Source. Assuming the * source lexes, the final Token emitted by the lexer will be of kind * EOF, after which the lexer will repeatedly return the same EOF token * whenever called. */ export function createLexer( source: Source, options: TOptions, ): Lexer { const startOfFileToken = new Tok(TokenKind.SOF, 0, 0, 0, 0, null); const lexer: Lexer = { source, options, lastToken: startOfFileToken, token: startOfFileToken, line: 1, lineStart: 0, advance: advanceLexer, lookahead, }; return lexer; } function advanceLexer() { this.lastToken = this.token; const token = (this.token = this.lookahead()); return token; } function lookahead() { let token = this.token; if (token.kind !== TokenKind.EOF) { do { // Note: next is only mutable during parsing, so we cast to allow this. token = token.next || ((token: any).next = readToken(this, token)); } while (token.kind === TokenKind.COMMENT); } return token; } /** * The return type of createLexer. */ export type Lexer = { source: Source, options: TOptions, /** * The previously focused non-ignored token. */ lastToken: Token, /** * The currently focused non-ignored token. */ token: Token, /** * The (1-indexed) line containing the current token. */ line: number, /** * The character offset at which the current line begins. */ lineStart: number, /** * Advances the token stream to the next non-ignored token. */ advance(): Token, /** * Looks ahead and returns the next non-ignored token, but does not change * the Lexer's state. */ lookahead(): Token, ... }; // @internal export function isPunctuatorToken(token: Token) { const kind = token.kind; return ( kind === TokenKind.BANG || kind === TokenKind.DOLLAR || kind === TokenKind.AMP || kind === TokenKind.PAREN_L || kind === TokenKind.PAREN_R || kind === TokenKind.SPREAD || kind === TokenKind.COLON || kind === TokenKind.EQUALS || kind === TokenKind.AT || kind === TokenKind.BRACKET_L || kind === TokenKind.BRACKET_R || kind === TokenKind.BRACE_L || kind === TokenKind.PIPE || kind === TokenKind.BRACE_R ); } /** * Helper function for constructing the Token object. */ function Tok( kind: TokenKindEnum, start: number, end: number, line: number, column: number, prev: Token | null, value?: string, ) { this.kind = kind; this.start = start; this.end = end; this.line = line; this.column = column; this.value = value; this.prev = prev; this.next = null; } // Print a simplified form when appearing in JSON/util.inspect. defineToJSON(Tok, function() { return { kind: this.kind, value: this.value, line: this.line, column: this.column, }; }); function printCharCode(code) { return ( // NaN/undefined represents access beyond the end of the file. isNaN(code) ? TokenKind.EOF : // Trust JSON for ASCII. code < 0x007f ? JSON.stringify(String.fromCharCode(code)) : // Otherwise print the escaped form. `"\\u${('00' + code.toString(16).toUpperCase()).slice(-4)}"` ); } /** * Gets the next token from the source starting at the given position. * * This skips over whitespace until it finds the next lexable token, then lexes * punctuators immediately or calls the appropriate helper function for more * complicated tokens. */ function readToken(lexer: Lexer, prev: Token): Token { const source = lexer.source; const body = source.body; const bodyLength = body.length; const pos = positionAfterWhitespace(body, prev.end, lexer); const line = lexer.line; const col = 1 + pos - lexer.lineStart; if (pos >= bodyLength) { return new Tok(TokenKind.EOF, bodyLength, bodyLength, line, col, prev); } const code = body.charCodeAt(pos); // SourceCharacter switch (code) { // ! case 33: return new Tok(TokenKind.BANG, pos, pos + 1, line, col, prev); // # case 35: return readComment(source, pos, line, col, prev); // $ case 36: return new Tok(TokenKind.DOLLAR, pos, pos + 1, line, col, prev); // & case 38: return new Tok(TokenKind.AMP, pos, pos + 1, line, col, prev); // ( case 40: return new Tok(TokenKind.PAREN_L, pos, pos + 1, line, col, prev); // ) case 41: return new Tok(TokenKind.PAREN_R, pos, pos + 1, line, col, prev); // . case 46: if (body.charCodeAt(pos + 1) === 46 && body.charCodeAt(pos + 2) === 46) { return new Tok(TokenKind.SPREAD, pos, pos + 3, line, col, prev); } break; // : case 58: return new Tok(TokenKind.COLON, pos, pos + 1, line, col, prev); // = case 61: return new Tok(TokenKind.EQUALS, pos, pos + 1, line, col, prev); // @ case 64: return new Tok(TokenKind.AT, pos, pos + 1, line, col, prev); // [ case 91: return new Tok(TokenKind.BRACKET_L, pos, pos + 1, line, col, prev); // ] case 93: return new Tok(TokenKind.BRACKET_R, pos, pos + 1, line, col, prev); // { case 123: return new Tok(TokenKind.BRACE_L, pos, pos + 1, line, col, prev); // | case 124: return new Tok(TokenKind.PIPE, pos, pos + 1, line, col, prev); // } case 125: return new Tok(TokenKind.BRACE_R, pos, pos + 1, line, col, prev); // A-Z _ a-z case 65: case 66: case 67: case 68: case 69: case 70: case 71: case 72: case 73: case 74: case 75: case 76: case 77: case 78: case 79: case 80: case 81: case 82: case 83: case 84: case 85: case 86: case 87: case 88: case 89: case 90: case 95: case 97: case 98: case 99: case 100: case 101: case 102: case 103: case 104: case 105: case 106: case 107: case 108: case 109: case 110: case 111: case 112: case 113: case 114: case 115: case 116: case 117: case 118: case 119: case 120: case 121: case 122: return readName(source, pos, line, col, prev); // - 0-9 case 45: case 48: case 49: case 50: case 51: case 52: case 53: case 54: case 55: case 56: case 57: return readNumber(source, pos, code, line, col, prev); // " case 34: if (body.charCodeAt(pos + 1) === 34 && body.charCodeAt(pos + 2) === 34) { return readBlockString(source, pos, line, col, prev, lexer); } return readString(source, pos, line, col, prev); } throw syntaxError(source, pos, unexpectedCharacterMessage(code)); } /** * Report a message that an unexpected character was encountered. */ function unexpectedCharacterMessage(code) { if (code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d) { return `Cannot contain the invalid character ${printCharCode(code)}.`; } if (code === 39) { // ' return 'Unexpected single quote character (\'), did you mean to use a double quote (")?'; } return `Cannot parse the unexpected character ${printCharCode(code)}.`; } /** * Reads from body starting at startPosition until it finds a non-whitespace * character, then returns the position of that character for lexing. */ function positionAfterWhitespace( body: string, startPosition: number, lexer: Lexer, ): number { const bodyLength = body.length; let position = startPosition; while (position < bodyLength) { const code = body.charCodeAt(position); // tab | space | comma | BOM if (code === 9 || code === 32 || code === 44 || code === 0xfeff) { ++position; } else if (code === 10) { // new line ++position; ++lexer.line; lexer.lineStart = position; } else if (code === 13) { // carriage return if (body.charCodeAt(position + 1) === 10) { position += 2; } else { ++position; } ++lexer.line; lexer.lineStart = position; } else { break; } } return position; } /** * Reads a comment token from the source file. * * #[\u0009\u0020-\uFFFF]* */ function readComment(source, start, line, col, prev): Token { const body = source.body; let code; let position = start; do { code = body.charCodeAt(++position); } while ( !isNaN(code) && // SourceCharacter but not LineTerminator (code > 0x001f || code === 0x0009) ); return new Tok( TokenKind.COMMENT, start, position, line, col, prev, body.slice(start + 1, position), ); } /** * Reads a number token from the source file, either a float * or an int depending on whether a decimal point appears. * * Int: -?(0|[1-9][0-9]*) * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)? */ function readNumber(source, start, firstCode, line, col, prev): Token { const body = source.body; let code = firstCode; let position = start; let isFloat = false; if (code === 45) { // - code = body.charCodeAt(++position); } if (code === 48) { // 0 code = body.charCodeAt(++position); if (code >= 48 && code <= 57) { throw syntaxError( source, position, `Invalid number, unexpected digit after 0: ${printCharCode(code)}.`, ); } } else { position = readDigits(source, position, code); code = body.charCodeAt(position); } if (code === 46) { // . isFloat = true; code = body.charCodeAt(++position); position = readDigits(source, position, code); code = body.charCodeAt(position); } if (code === 69 || code === 101) { // E e isFloat = true; code = body.charCodeAt(++position); if (code === 43 || code === 45) { // + - code = body.charCodeAt(++position); } position = readDigits(source, position, code); } return new Tok( isFloat ? TokenKind.FLOAT : TokenKind.INT, start, position, line, col, prev, body.slice(start, position), ); } /** * Returns the new position in the source after reading digits. */ function readDigits(source, start, firstCode) { const body = source.body; let position = start; let code = firstCode; if (code >= 48 && code <= 57) { // 0 - 9 do { code = body.charCodeAt(++position); } while (code >= 48 && code <= 57); // 0 - 9 return position; } throw syntaxError( source, position, `Invalid number, expected digit but got: ${printCharCode(code)}.`, ); } /** * Reads a string token from the source file. * * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*" */ function readString(source, start, line, col, prev): Token { const body = source.body; let position = start + 1; let chunkStart = position; let code = 0; let value = ''; while ( position < body.length && !isNaN((code = body.charCodeAt(position))) && // not LineTerminator code !== 0x000a && code !== 0x000d ) { // Closing Quote (") if (code === 34) { value += body.slice(chunkStart, position); return new Tok( TokenKind.STRING, start, position + 1, line, col, prev, value, ); } // SourceCharacter if (code < 0x0020 && code !== 0x0009) { throw syntaxError( source, position, `Invalid character within String: ${printCharCode(code)}.`, ); } ++position; if (code === 92) { // \ value += body.slice(chunkStart, position - 1); code = body.charCodeAt(position); switch (code) { case 34: value += '"'; break; case 47: value += '/'; break; case 92: value += '\\'; break; case 98: value += '\b'; break; case 102: value += '\f'; break; case 110: value += '\n'; break; case 114: value += '\r'; break; case 116: value += '\t'; break; case 117: { // uXXXX const charCode = uniCharCode( body.charCodeAt(position + 1), body.charCodeAt(position + 2), body.charCodeAt(position + 3), body.charCodeAt(position + 4), ); if (charCode < 0) { const invalidSequence = body.slice(position + 1, position + 5); throw syntaxError( source, position, `Invalid character escape sequence: \\u${invalidSequence}.`, ); } value += String.fromCharCode(charCode); position += 4; break; } default: throw syntaxError( source, position, `Invalid character escape sequence: \\${String.fromCharCode( code, )}.`, ); } ++position; chunkStart = position; } } throw syntaxError(source, position, 'Unterminated string.'); } /** * Reads a block string token from the source file. * * """("?"?(\\"""|\\(?!=""")|[^"\\]))*""" */ function readBlockString(source, start, line, col, prev, lexer): Token { const body = source.body; let position = start + 3; let chunkStart = position; let code = 0; let rawValue = ''; while (position < body.length && !isNaN((code = body.charCodeAt(position)))) { // Closing Triple-Quote (""") if ( code === 34 && body.charCodeAt(position + 1) === 34 && body.charCodeAt(position + 2) === 34 ) { rawValue += body.slice(chunkStart, position); return new Tok( TokenKind.BLOCK_STRING, start, position + 3, line, col, prev, dedentBlockStringValue(rawValue), ); } // SourceCharacter if ( code < 0x0020 && code !== 0x0009 && code !== 0x000a && code !== 0x000d ) { throw syntaxError( source, position, `Invalid character within String: ${printCharCode(code)}.`, ); } if (code === 10) { // new line ++position; ++lexer.line; lexer.lineStart = position; } else if (code === 13) { // carriage return if (body.charCodeAt(position + 1) === 10) { position += 2; } else { ++position; } ++lexer.line; lexer.lineStart = position; } else if ( // Escape Triple-Quote (\""") code === 92 && body.charCodeAt(position + 1) === 34 && body.charCodeAt(position + 2) === 34 && body.charCodeAt(position + 3) === 34 ) { rawValue += body.slice(chunkStart, position) + '"""'; position += 4; chunkStart = position; } else { ++position; } } throw syntaxError(source, position, 'Unterminated string.'); } /** * Converts four hexadecimal chars to the integer that the * string represents. For example, uniCharCode('0','0','0','f') * will return 15, and uniCharCode('0','0','f','f') returns 255. * * Returns a negative number on error, if a char was invalid. * * This is implemented by noting that char2hex() returns -1 on error, * which means the result of ORing the char2hex() will also be negative. */ function uniCharCode(a, b, c, d) { return ( (char2hex(a) << 12) | (char2hex(b) << 8) | (char2hex(c) << 4) | char2hex(d) ); } /** * Converts a hex character to its integer value. * '0' becomes 0, '9' becomes 9 * 'A' becomes 10, 'F' becomes 15 * 'a' becomes 10, 'f' becomes 15 * * Returns -1 on error. */ function char2hex(a) { return a >= 48 && a <= 57 ? a - 48 // 0-9 : a >= 65 && a <= 70 ? a - 55 // A-F : a >= 97 && a <= 102 ? a - 87 // a-f : -1; } /** * Reads an alphanumeric + underscore name from the source. * * [_A-Za-z][_0-9A-Za-z]* */ function readName(source, start, line, col, prev): Token { const body = source.body; const bodyLength = body.length; let position = start + 1; let code = 0; while ( position !== bodyLength && !isNaN((code = body.charCodeAt(position))) && (code === 95 || // _ (code >= 48 && code <= 57) || // 0-9 (code >= 65 && code <= 90) || // A-Z (code >= 97 && code <= 122)) // a-z ) { ++position; } return new Tok( TokenKind.NAME, start, position, line, col, prev, body.slice(start, position), ); }