UNPKG

9.32 kBJavaScriptView Raw
1/**
2 * @license
3 * Copyright 2018 Google LLC
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18'use strict';
19
20// A simple lexer for Postgres SQL.
21//
22// https://www.postgresql.org/docs/9.0/static/sql-syntax-lexical.html
23//
24// -- line chars line comment
25// /* block */ block comment. may nest: /* /* */ still in comment */
26//
27// "..." identifier literal
28// U&"..." identifier literal with unicode escapes
29// UESCAPE symbol may follow U& string to override \ as escape character
30//
31// '...' string literal
32// E'...' supports C-style escape sequences
33// U&'...' string literal with unicode escapes
34// UESCAPE symbol ditto
35// B'...' binary literal
36// X'...' hex literal
37//
38// $$...$$ string literal with no escaping convention
39// $foo$...$foo$ string literal where "foo" may be any run of identifier chars
40
41
42// eslint-disable-next-line no-use-before-define
43exports.makeLexer = makeLexer;
44
45
// Finds the first token, outside of any comment or quoted run, that opens
// a comment, a quoted identifier, or a string literal.
// The alternatives are tried in the order listed at each position.
const TOP_LEVEL_DELIMITER = new RegExp(
  [
    // Line comment start.
    '--',
    // Block comment start.
    '/[*]',
    // Dollar-quoted (unescaped) string start.
    // The tag has the form of an unquoted identifier without embedded '$'.
    // TODO: should allow non-ascii identifiers. Might need to normalize.
    '[$](?:[a-zA-Z_][a-zA-Z_0-9]*)?[$]',
    // Quoted identifier start, optionally with the U& prefix.
    '(?:[Uu]&)?"',
    // Quoted string start, optionally with a U&, E, B or X prefix.
    '(?:[Uu]&|[EeBbXx])?\'',
  ].join('|'));
59
// Body of a `--` comment: everything up to, but not including, the next
// carriage return or line feed.  Always matches (possibly the empty string).
const LINE_COMMENT_BODY = /^[^\r\n]*/;

// Either token that changes block comment nesting: `*/` or `/*`.
const BLOCK_COMMENT_TOKEN = /[*][/]|[/][*]/;

// The *_STRING_BODY patterns are applied to the text that follows an opening
// quote (or to a later chunk of the same quoted run).  Each always matches;
// match[0] is the part of the body consumed and match[1] is the closing
// quote when one was found.  A doubled quote never closes the run.
//
// In the ESC_ variants a backslash consumes the character after it, whatever
// that character is.  `[\s\S]` is used instead of `.` because `.` does not
// match line terminators, which made a backslash followed by a newline look
// like an incomplete escape sequence.  A backslash that is the very last
// character still matches nothing, so callers can report it as incomplete.
const ESC_DQ_STRING_BODY = /^(?:[^"\\]|""|\\[\s\S])*(")?/;
const ESC_SQ_STRING_BODY = /^(?:[^'\\]|''|\\[\s\S])*(')?/;

// Variants in which backslash has no special meaning.
const SIMPLE_DQ_STRING_BODY = /^(?:[^"]|"")*(")?/;
const SIMPLE_SQ_STRING_BODY = /^(?:[^']|'')*(')?/;

// What may follow a closed e'...' string: optional whitespace, then
// optionally (group 1) a comment start or the quote that continues the string.
const ESC_STRING_CONTINUATION = /^[\t\n\r ]*([/][*]|--|')?/;

// Maps a lower-cased opening delimiter to the pattern for its body.
// Prototype-less so that lookups only ever see the keys listed here.
const STRING_BODIES = {
  __proto__: null,
  '"': SIMPLE_DQ_STRING_BODY,
  'u&"': ESC_DQ_STRING_BODY,
  '\'': SIMPLE_SQ_STRING_BODY,
  'b\'': SIMPLE_SQ_STRING_BODY,
  'e\'': ESC_SQ_STRING_BODY,
  'u&\'': ESC_SQ_STRING_BODY,
  'x\'': SIMPLE_SQ_STRING_BODY,
};
82
// Handlers keyed by the final character of the delimiter that is currently
// open.  Each handler receives (delimiter, chunk), where chunk is the text
// that follows the delimiter, and returns a pair
//   [ delimiter still open after consuming (or null), unconsumed text ].
const LAST_DELIMITER_CHARACTER_TO_HANDLER = {
  // delimiter is '--'.
  '-': (delimiter, chunk) => {
    const [ commentText ] = LINE_COMMENT_BODY.exec(chunk);
    const rest = chunk.substring(commentText.length);
    if (!rest) {
      // No line break in this chunk, so the comment never ended.
      throw new Error(`Unterminated line comment: --${ chunk }`);
    }
    return [ null, rest ];
  },
  // delimiter is '/*'.
  '*': (delimiter, chunk) => {
    let nesting = delimiter.length / 2;
    let rest = chunk;
    while (rest) {
      const token = BLOCK_COMMENT_TOKEN.exec(rest);
      if (token === null) {
        break;
      }
      // Both tokens are two characters long.
      rest = rest.substring(token.index + 2);
      if (token[0] === '/*') {
        nesting += 1;
        continue;
      }
      // Saw '*/'.
      nesting -= 1;
      if (nesting === 0) {
        break;
      }
    }
    if (nesting !== 0) {
      throw new Error(`Unterminated block comment: /*${ chunk }`);
    }
    return [ null, rest ];

    // TODO: Do we need to take into account nested "--".
    // soc.if.usp.br/manual/postgresql-doc-7.4/html/plpgsql-structure.html says
    // "double dash comments can be enclosed into a block comment and
    // a double dash can hide the block comment delimiters /* and */."
  },
  // Double-quoted identifiers, with or without the u& prefix.
  '"': (delimiter, chunk) => {
    const [ body, closingQuote ] = STRING_BODIES[delimiter].exec(chunk);
    const rest = chunk.substring(body.length);
    if (closingQuote) {
      return [ null, rest ];
    }
    if (body) {
      // Consumed something but found no closing quote: still inside.
      return [ delimiter, rest ];
    }
    throw new Error(`Incomplete escape sequence in ${ delimiter } delimited string at \`${ chunk }\``);
  },
  // Single-quoted strings, with or without a prefix.
  '\'': (delimiter, chunk) => {
    const [ body, closingQuote ] = STRING_BODIES[delimiter].exec(chunk);
    const rest = chunk.substring(body.length);
    if (closingQuote) {
      // 4.1.2.2. String Constants with C-style Escapes
      // (When continuing an escape string constant across lines,
      // write E only before the first opening quote.)
      // A closed e'...' string therefore moves to the 'e' state, which
      // watches for such a continuation.
      const isEscapeString = delimiter === 'e\'' || delimiter === 'E\'';
      return [ isEscapeString ? 'e' : null, rest ];
    }
    if (body) {
      return [ delimiter, rest ];
    }
    throw new Error(`Incomplete escape sequence in ${ delimiter } delimited string at \`${ chunk }\``);
  },
  // Dollar-quoted strings; delimiter is the whole opening tag, e.g. '$foo$'.
  '$': (delimiter, chunk) => {
    // TODO: should this match be case insensitive? $x$...$X$
    const closeAt = chunk.indexOf(delimiter);
    if (closeAt >= 0) {
      return [ null, chunk.substring(closeAt + delimiter.length) ];
    }
    const lastDollar = chunk.lastIndexOf('$');
    if (lastDollar >= 0) {
      const tail = chunk.substring(lastDollar);
      if (delimiter.startsWith(tail)) {
        // The chunk ends with a proper prefix of the closing tag, so text
        // appended after it could complete the tag: merge hazard.
        throw new Error(`merge hazard '${ tail }' at end of ${ delimiter } delimited string`);
      }
    }
    return [ delimiter, '' ];
  },
  // Special handler to detect e'...' continuations.  Entered after an
  // e'...' string closes; see the single-quote handler above.
  'e': (delimiter, chunk) => {
    let rest = chunk;
    while (rest) {
      const [ consumed, opener ] = ESC_STRING_CONTINUATION.exec(rest);
      if (!consumed) {
        // Neither whitespace, a comment, nor a quote: no continuation.
        return [ null, rest ];
      }
      rest = rest.substring(consumed.length);
      if (opener === '\'') {
        return [ 'e\'', rest ];
      }
      // Otherwise opener is a comment start or absent; skip any comment
      // using the comment handlers and then keep looking.
      let pending = opener;
      while (rest && pending) {
        const handler = LAST_DELIMITER_CHARACTER_TO_HANDLER[pending[pending.length - 1]];
        [ pending, rest ] = handler(pending, rest);
      }
    }
    return [ delimiter, rest ];
  },
};
190
/**
 * Wraps a function so that once it has thrown, every later call fails
 * with the same message without invoking the wrapped function again.
 *
 * @param {Function} fun the function to wrap.
 * @returns {Function} a function that forwards its arguments to fun and
 *     returns fun's result.  The first exception from fun is rethrown
 *     as-is; later calls throw a new Error carrying that exception's message.
 */
function replayError(fun) {
  // Message of the first failure, or null while fun has never thrown.
  let firstFailure = null;
  return (...callArgs) => {
    if (firstFailure === null) {
      try {
        return fun(...callArgs);
      } catch (failure) {
        firstFailure = `${ failure.message }`;
        throw failure;
      }
    }
    throw new Error(firstFailure);
  };
}
205
/**
 * Creates a stateful lexer that is fed the literal chunks of a SQL
 * template one at a time.
 *
 * The returned function takes a chunk and returns the delimiter that is
 * still open at the end of that chunk, or null when none is.  Passing
 * null signals the end of input: it throws if a delimiter other than the
 * 'e' continuation state is still open and otherwise returns the current
 * state.  Once any call has thrown, later calls throw the same message.
 *
 * @returns {Function} the lexer function described above.
 */
function makeLexer() {
  // Delimiter currently open, or null at the top level.
  let delimiter = null;
  // True while the text seen since the start of the most recent chunk that
  // contained a line break has been only whitespace and comments.
  let continuationAmbiguity = false;
  // Zero-based index of the chunk being processed.
  let chunkIndex = -1;

  // Continues inside the open delimiter; returns the unconsumed text.
  function resumeInsideDelimiter(text) {
    const lastChar = delimiter[delimiter.length - 1];
    const insideComment = lastChar === '*' || lastChar === '-';
    if (!insideComment) {
      continuationAmbiguity = false;
    }
    const handler = LAST_DELIMITER_CHARACTER_TO_HANDLER[lastChar];
    const [ stillOpen, rest ] = handler(delimiter, text);
    delimiter = stillOpen;
    return rest;
  }

  // Looks for the next delimiter at the top level; returns the text after it.
  function scanTopLevel(text) {
    const found = TOP_LEVEL_DELIMITER.exec(text);
    if (continuationAmbiguity) {
      const skipped = text.substring(0, found ? found.index : text.length);
      if (/[^\t\n\r ]/.test(skipped)) {
        continuationAmbiguity = false;
      }
    }
    if (!found) {
      return '';
    }
    const [ opener ] = found;
    // Dollar-quote tags keep their case.  Empirically,
    // postgres=# SELECT $foo$bar$Foo$;
    // postgres$# $foo$;
    //  ?column?
    // -----------
    //  bar$Foo$;+
    delimiter = (opener[0] === '$') ? opener : opener.toLowerCase();
    return text.substring(found.index + delimiter.length);
  }

  function consumeFromLeft(text) {
    return delimiter ? resumeInsideDelimiter(text) : scanTopLevel(text);
  }

  function lexer(chunk) {
    if (chunk === null) {
      const unclosed = delimiter && delimiter !== 'e';
      if (unclosed) {
        throw new Error(`Unclosed quoted string: ${ delimiter }`);
      }
      return delimiter;
    }

    chunkIndex += 1;

    if (continuationAmbiguity && chunkIndex > 1) {
      // If any chunk besides the last contains a newline and
      // does not contain any non-whitespace or comment content,
      // then we have a continuation ambiguity.
      //
      // For example, given
      //     pg`SELECT ${ x }
      //        ${ y }`
      // we would have to know how ${ x } was escaped to determine how to
      // escape ${ y } because of a string continuation corner-case.
      //
      // From https://www.postgresql.org/docs/9.0/static/sql-syntax-lexical.html
      // "Two string constants that are only separated by whitespace with at
      // least one newline are concatenated and effectively treated as if the
      // string had been written as one constant."
      //
      // "When continuing an escape string constant across lines, write E only
      // before the first opening quote."
      //
      // So deciding whether to wrap y using e'...' needs knowledge of x.
      throw new Error(
        // eslint-disable-next-line no-template-curly-in-string
        'Potential for ambiguous string continuation at `${ chunk }`.' +
        ' If you need string continuation start with an e\'...\' string.');
    }

    let pending = `${ chunk }`;
    continuationAmbiguity = /[\n\r]/.test(chunk);
    while (pending) {
      pending = consumeFromLeft(pending);
    }
    return delimiter;
  }

  return replayError(lexer);
}
293
294module.exports.makeLexer = makeLexer;