UNPKG

18.3 kBPlain TextView Raw
1/*---------------------------------------------------------------------------------------------
2 * Copyright (c) Microsoft Corporation. All rights reserved.
3 * Licensed under the MIT License. See License.txt in the project root for license information.
4 *--------------------------------------------------------------------------------------------*/
5
6/*
7 * This module only exports 'compile' which compiles a JSON language definition
8 * into a typed and checked ILexer definition.
9 */
10
11import * as monarchCommon from './common';
12import { IMonarchLanguage, IMonarchLanguageBracket } from './types';
13
14/*
15 * Type helpers
16 *
17 * Note: this is just for sanity checks on the JSON description which is
18 * helpful for the programmer. No checks are done anymore once the lexer is
19 * already 'compiled and checked'.
20 *
21 */
22
/**
 * Tells whether `obj` is an array whose every element passes `elemType`.
 *
 * @param elemType predicate applied to each element in turn
 * @param obj the value to inspect
 * @returns `false` for anything that is not an array or as soon as one element
 * is rejected, `true` otherwise (which includes the empty array)
 */
function isArrayOf(elemType: (x: any) => boolean, obj: any): boolean {
	// Array.isArray already rejects null, undefined and every other falsy value.
	if (!Array.isArray(obj)) {
		return false;
	}
	let index = 0;
	while (index < obj.length) {
		if (!elemType(obj[index])) {
			return false;
		}
		index++;
	}
	return true;
}
37
/**
 * Reads a boolean setting defensively.
 *
 * @param prop the raw value found in the language definition
 * @param defValue the value to fall back to
 * @returns `prop` when it is a genuine boolean, `defValue` in every other case
 */
function bool(prop: any, defValue: boolean): boolean {
	return typeof prop === 'boolean' ? prop : defValue;
}
44
/**
 * Reads a string setting defensively.
 *
 * @param prop the raw value found in the language definition
 * @param defValue the value to fall back to
 * @returns `prop` when it is a string (the empty string included), `defValue`
 * in every other case
 */
function string(prop: any, defValue: string): string {
	return typeof prop === 'string' ? prop : defValue;
}
51
52
/**
 * Builds a lookup table that maps every entry of `array` to `true`.
 *
 * The table is created without a prototype: it carries no inherited members
 * such as 'constructor' or 'toString', and names like '__proto__' or
 * 'hasOwnProperty' are stored as ordinary keys.
 *
 * @param array the words to register
 * @returns the lookup table
 */
function arrayToHash(array: string[]): { [name: string]: true } {
	const result: { [name: string]: true } = Object.create(null);
	for (const e of array) {
		result[e] = true;
	}
	return result;
}


/**
 * Creates a predicate that tells whether a word belongs to `arr`.
 *
 * @param arr the accepted words
 * @param caseInsensitive when true, both the accepted words and the tested
 * word are lower-cased before they are compared
 * @returns the membership test
 */
function createKeywordMatcher(arr: string[], caseInsensitive: boolean = false): (str: string) => boolean {
	if (caseInsensitive) {
		arr = arr.map(function (x) { return x.toLowerCase(); });
	}
	const hash = arrayToHash(arr);
	// The table has no prototype, so a plain lookup is enough. Calling
	// hash.hasOwnProperty(...) would not work here (and used to throw whenever
	// 'hasOwnProperty' itself was one of the keywords).
	if (caseInsensitive) {
		return function (word) {
			return hash[word.toLowerCase()] === true;
		};
	} else {
		return function (word) {
			return hash[word] === true;
		};
	}
}
77
78
79// Lexer helpers
80
/**
 * Turns a pattern string into a RegExp, using the 'i' flag when the lexer has
 * 'ignoreCase' set.
 *
 * Every `@name` reference in the pattern is replaced by the lexer attribute of
 * that name (a string, or the source of a RegExp) wrapped in a non-capturing
 * group. Expansion is repeated for as long as the pattern still contains an
 * '@', up to 5 passes, so that attributes may refer to other attributes.
 *
 * @param lexer the lexer whose attributes can be referenced
 * @param str the pattern to compile
 * @returns the compiled regular expression
 * @throws when a referenced attribute is missing or is neither a string nor a RegExp
 */
function compileRegExp(lexer: monarchCommon.ILexerMin, str: string): RegExp {
	for (let pass = 0; str.indexOf('@') >= 0 && pass < 5; pass++) {
		// the pattern as it stood at the start of this pass; used in error messages
		const current = str;
		str = current.replace(/@(\w+)/g, function (s, attr?) {
			const value = lexer[attr];
			let sub = '';
			if (typeof value === 'string') {
				sub = value;
			} else if (value instanceof RegExp) {
				sub = value.source;
			} else if (value === undefined) {
				throw monarchCommon.createError(lexer, 'language definition does not contain attribute \'' + attr + '\', used at: ' + current);
			} else {
				throw monarchCommon.createError(lexer, 'attribute reference \'' + attr + '\' must be a string, used at: ' + current);
			}
			if (monarchCommon.empty(sub)) {
				return '';
			}
			return '(?:' + sub + ')';
		});
	}

	const flags = lexer.ignoreCase ? 'i' : '';
	return new RegExp(str, flags);
}
108
109/**
110 * Compiles guard functions for case matches.
111 * This compiles 'cases' attributes into efficient match functions.
112 *
113 */
/**
 * Picks the string a guard is tested against.
 *
 * @param id the matched text
 * @param matches the match array of the rule's regular expression
 * @param state the current state name
 * @param num negative: use `id`; below `matches.length`: use `matches[num]`;
 * 100 and above: use entry `num - 100` of the list made of the full state name
 * followed by its '.'-separated parts
 * @returns the selected string, or `null` when `num` selects nothing
 */
function selectScrutinee(id: string, matches: string[], state: string, num: number): string | null {
	if (num < 0) {
		return id;
	}
	if (num < matches.length) {
		return matches[num];
	}
	if (num < 100) {
		return null;
	}
	const stateParts = [state, ...state.split('.')];
	const stateIndex = num - 100;
	return stateIndex < stateParts.length ? stateParts[stateIndex] : null;
}
131
/**
 * Compiles one key of a 'cases' object into a branch with a test function.
 *
 * The key has the shape `[scrutinee][operator]pattern`:
 *  - scrutinee: `$#` (the matched text, which is also the default), `$n` (entry
 *    n of the match array) or `$Sn` (entry n of the list made of the full state
 *    name followed by its '.'-separated parts);
 *  - operator: `~` / `!~` (regular expression), `@` / `!@` (membership in a
 *    lexer attribute that is an array of strings), `==` / `!=` (string equality).
 * A pattern that is a bare word is compared with `==`, an empty pattern is
 * compared with `!=` against the empty string, and anything else without an
 * operator is treated as a regular expression.
 *
 * @param lexer the lexer being compiled
 * @param ruleName name of the enclosing rule, used in error messages
 * @param tkey the case key to compile
 * @param val the compiled action to use when the guard succeeds
 * @returns the branch `{ name, value, test }`
 * @throws when an `@` pattern names an attribute that is missing or is not an array of strings
 */
function createGuard(lexer: monarchCommon.ILexerMin, ruleName: string, tkey: string, val: monarchCommon.FuzzyAction): monarchCommon.IBranch {
	// get the scrutinee and pattern
	let scrut = -1; // -1: $# (the matched text), 0-99: $n, 100+n: $Sn
	let oppat = tkey;
	let matches = tkey.match(/^\$(([sS]?)(\d\d?)|#)(.*)$/);
	if (matches) {
		if (matches[3]) { // if digits
			scrut = parseInt(matches[3], 10);
			if (matches[2]) {
				scrut = scrut + 100; // if [sS] present
			}
		}
		oppat = matches[4];
	}
	// get operator
	let op = '~';
	let pat = oppat;
	if (!oppat || oppat.length === 0) {
		op = '!=';
		pat = '';
	}
	else if (/^\w*$/.test(pat)) { // just a word
		op = '==';
	}
	else {
		matches = oppat.match(/^(@|!@|~|!~|==|!=)(.*)$/);
		if (matches) {
			op = matches[1];
			pat = matches[2];
		}
	}

	// every operator has a negated twin that starts with '!'
	const negated = (op.charAt(0) === '!');

	// set the tester function
	let tester: (s: string, id: string, matches: string[], state: string, eos: boolean) => boolean;

	// special case a regexp that matches just words
	if ((op === '~' || op === '!~') && /^(\w|\|)*$/.test(pat)) {
		const inWords = createKeywordMatcher(pat.split('|'), lexer.ignoreCase);
		tester = function (s) { return inWords(s) !== negated; };
	}
	else if (op === '@' || op === '!@') {
		const words = lexer[pat];
		if (!words) {
			throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' is not defined, in rule: ' + ruleName);
		}
		if (!(isArrayOf(function (elem) { return (typeof (elem) === 'string'); }, words))) {
			throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' must be an array of strings, in rule: ' + ruleName);
		}
		const inWords = createKeywordMatcher(words, lexer.ignoreCase);
		tester = function (s) { return inWords(s) !== negated; };
	}
	else if (op === '~' || op === '!~') {
		if (pat.indexOf('$') < 0) {
			// precompile regular expression
			const re = compileRegExp(lexer, '^' + pat + '$');
			tester = function (s) { return re.test(s) !== negated; };
		}
		else {
			// the pattern refers to match/state values, so it can only be
			// compiled once those are known
			tester = function (s, id, matches, state) {
				const re = compileRegExp(lexer, '^' + monarchCommon.substituteMatches(lexer, pat, id, matches, state) + '$');
				// honour '!~' here as well; the result used to be returned un-negated
				return re.test(s) !== negated;
			};
		}
	}
	else { // if (op==='==' || op==='!=') {
		const patx = monarchCommon.fixCase(lexer, pat);
		if (pat.indexOf('$') < 0) {
			tester = function (s) { return (s === patx) !== negated; };
		}
		else {
			tester = function (s, id, matches, state) {
				const patexp = monarchCommon.substituteMatches(lexer, patx, id, matches, state);
				return (s === patexp) !== negated;
			};
		}
	}

	// return the branch object
	if (scrut === -1) {
		return {
			name: tkey, value: val, test: function (id, matches, state, eos) {
				return tester(id, id, matches, state, eos);
			}
		};
	}
	else {
		return {
			name: tkey, value: val, test: function (id, matches, state, eos) {
				const scrutinee = selectScrutinee(id, matches, state, scrut);
				// a scrutinee that selects nothing is tested as the empty string
				return tester(!scrutinee ? '' : scrutinee, id, matches, state, eos);
			}
		};
	}
}
227
/**
 * Compiles an action: validates it and turns it into the form used by the lexer.
 *
 * Accepted shapes:
 *  - a falsy value, compiled to `{ token: '' }`;
 *  - a string, returned as is;
 *  - an object with a 'token' attribute, of which only the known, correctly
 *    typed attributes are copied;
 *  - an array of actions, compiled to `{ group: [...] }`;
 *  - an object with a 'cases' attribute, compiled to `{ test }` where `test`
 *    returns the value of the first case whose guard succeeds, or the lexer's
 *    default token when none does.
 *
 * This is called only during compilation but if the lexer definition
 * contains user functions as actions (which is usually not allowed), then this
 * may be called during lexing. It is important therefore to compile common cases efficiently
 *
 * @param lexer the lexer being compiled; `usesEmbedded` is set on it when a
 * 'nextEmbedded' attribute is found
 * @param ruleName name of the enclosing rule, used in error messages
 * @param action the raw action from the language definition
 * @returns the compiled action
 * @throws when the action has an unsupported shape or an invalid attribute
 */
function compileAction(lexer: monarchCommon.ILexerMin, ruleName: string, action: any): monarchCommon.FuzzyAction {
	if (!action) {
		return { token: '' };
	}

	if (typeof action === 'string') {
		return action; // { token: action };
	}

	if (action.token || action.token === '') {
		if (typeof action.token !== 'string') {
			throw monarchCommon.createError(lexer, 'a \'token\' attribute must be of type string, in rule: ' + ruleName);
		}

		// only copy specific typed fields (only happens once during compile Lexer)
		const compiled: monarchCommon.IAction = { token: action.token };
		if (action.token.indexOf('$') >= 0) {
			compiled.tokenSubst = true;
		}

		if (typeof action.bracket === 'string') {
			if (action.bracket === '@open') {
				compiled.bracket = monarchCommon.MonarchBracket.Open;
			} else if (action.bracket === '@close') {
				compiled.bracket = monarchCommon.MonarchBracket.Close;
			} else {
				throw monarchCommon.createError(lexer, 'a \'bracket\' attribute must be either \'@open\' or \'@close\', in rule: ' + ruleName);
			}
		}

		if (action.next) {
			if (typeof action.next !== 'string') {
				throw monarchCommon.createError(lexer, 'the next state must be a string value in rule: ' + ruleName);
			}
			let target: string = action.next;
			const isStackCommand = /^(@pop|@push|@popall)$/.test(target);
			if (!isStackCommand) {
				if (target.charAt(0) === '@') {
					target = target.substring(1); // peel off starting @ sign
				}
				// without dollar substitution the state name is final, so it can be checked now
				if (target.indexOf('$') < 0 && !monarchCommon.stateExists(lexer, monarchCommon.substituteMatches(lexer, target, '', [], ''))) {
					throw monarchCommon.createError(lexer, 'the next state \'' + action.next + '\' is not defined in rule: ' + ruleName);
				}
			}
			compiled.next = target;
		}

		if (typeof action.goBack === 'number') {
			compiled.goBack = action.goBack;
		}
		if (typeof action.switchTo === 'string') {
			compiled.switchTo = action.switchTo;
		}
		if (typeof action.log === 'string') {
			compiled.log = action.log;
		}
		if (typeof action._push === 'string') {
			compiled._push = action._push;
		}
		if (typeof action._pop === 'string') {
			compiled._pop = action._pop;
		}
		if (typeof action.mark === 'string') {
			compiled.mark = action.mark;
		}
		if (typeof action.fn === 'string') {
			compiled.fn = action.fn;
		}
		if (typeof action.nextEmbedded === 'string') {
			compiled.nextEmbedded = action.nextEmbedded;
			lexer.usesEmbedded = true;
		}
		return compiled;
	}

	if (Array.isArray(action)) {
		const group: monarchCommon.FuzzyAction[] = [];
		for (let i = 0, len = action.length; i < len; i++) {
			group[i] = compileAction(lexer, ruleName, action[i]);
		}
		return { group: group };
	}

	if (action.cases) {
		// one branch (guard + compiled value) per case, in declaration order
		const branches: monarchCommon.IBranch[] = [];
		for (const caseKey in action.cases) {
			if (!action.cases.hasOwnProperty(caseKey)) {
				continue;
			}
			const caseValue = compileAction(lexer, ruleName, action.cases[caseKey]);

			if (caseKey === '@default' || caseKey === '@' || caseKey === '') {
				// unconditional branch
				branches.push({ test: undefined, value: caseValue, name: caseKey });
			}
			else if (caseKey === '@eos') {
				branches.push({ test: function (id, matches, state, eos) { return eos; }, value: caseValue, name: caseKey });
			}
			else {
				branches.push(createGuard(lexer, ruleName, caseKey, caseValue)); // call separate function to avoid local variable capture
			}
		}

		// the fallback is read once, at compile time
		const fallback = lexer.defaultToken;
		return {
			test: function (id, matches, state, eos) {
				for (let i = 0; i < branches.length; i++) {
					const branch = branches[i];
					if (!branch.test || branch.test(id, matches, state, eos)) {
						return branch.value;
					}
				}
				return fallback;
			}
		};
	}

	throw monarchCommon.createError(lexer, 'an action must be a string, an object with a \'token\' or \'cases\' attribute, or an array of actions; in rule: ' + ruleName);
}
357
/**
 * A single compiled matching rule: a regular expression plus the action to
 * run when it matches.
 */
class Rule implements monarchCommon.IRule {
	public regex: RegExp = new RegExp('');
	public action: monarchCommon.FuzzyAction = { token: '' };
	public matchOnlyAtLineStart: boolean = false;
	public name: string = '';
	// counters, all starting at zero; nothing in this class updates them
	public stats: any;
	// the bare bracket character when the rule's pattern is exactly one escaped bracket
	public string?: string;

	/**
	 * @param name the rule name; setRegex later appends the pattern to it
	 */
	constructor(name: string) {
		this.name = name;
		this.stats = { time: 0, count: 0, hits: 0 };
	}

	/**
	 * Compiles the rule's pattern.
	 *
	 * A leading '^' is stripped and recorded in `matchOnlyAtLineStart`; the rest
	 * is wrapped as `^(?:...)` before it is compiled. The pattern source is
	 * appended to the rule name.
	 *
	 * @param lexer the lexer used to expand attribute references
	 * @param re the pattern, as a string or a RegExp
	 * @throws when `re` is neither a string nor a RegExp
	 */
	public setRegex(lexer: monarchCommon.ILexerMin, re: string | RegExp): void {
		let source: string;
		if (re instanceof RegExp) {
			source = re.source;
		}
		else if (typeof re === 'string') {
			source = re;
		}
		else {
			throw monarchCommon.createError(lexer, 'rules must start with a match string or regular expression: ' + this.name);
		}

		// a pattern that is just one escaped bracket is also kept as a literal character
		const isEscapedBracket = source.length === 2 && source[0] === '\\' && (/[\{\}\(\)\[\]]/).test(source[1]);
		if (isEscapedBracket) {
			this.string = source[1];
		}

		const anchored = source.length > 0 && source[0] === '^';
		this.matchOnlyAtLineStart = anchored;
		this.name = this.name + ': ' + source;
		const body = anchored ? source.substr(1) : source;
		this.regex = compileRegExp(lexer, '^(?:' + body + ')');
	}

	/**
	 * Compiles and stores the rule's action.
	 *
	 * @param lexer the lexer being compiled
	 * @param act the raw action from the language definition
	 */
	public setAction(lexer: monarchCommon.ILexerMin, act: monarchCommon.IAction) {
		this.action = compileAction(lexer, this.name, act);
	}
}
400
/**
 * Compiles a JSON language description into a lexer in which all regular
 * expressions and case matches are compiled and all 'include' rules are expanded.
 * The bracket definitions are compiled as well, defaults are supplied, and the
 * description is sanity checked along the way.
 *
 * Note that `json` doubles as the minimal lexer handed to the rule compiler:
 * 'languageId', 'ignoreCase', 'noThrow', 'usesEmbedded', 'stateNames' and
 * 'defaultToken' are written onto it, and so is the default 'brackets' list
 * when the description has none.
 *
 * @param languageId the language identifier; also used for the default token postfix
 * @param json the language description
 * @returns the compiled lexer, with 'noThrow' switched on
 * @throws when the description is not an object or fails one of the sanity checks
 */
export function compile(languageId: string, json: IMonarchLanguage): monarchCommon.ILexer {
	if (!json || typeof (json) !== 'object') {
		throw new Error('Monarch: expecting a language definition object');
	}

	// Create our lexer
	const lexer: monarchCommon.ILexer = <monarchCommon.ILexer>{};
	lexer.languageId = languageId;
	lexer.noThrow = false; // raise exceptions during compilation
	lexer.maxStack = 100;

	// Set standard fields: be defensive about types
	lexer.start = (typeof json.start === 'string' ? json.start : null);
	lexer.ignoreCase = bool(json.ignoreCase, false);

	lexer.tokenPostfix = string(json.tokenPostfix, '.' + lexer.languageId);
	lexer.defaultToken = string(json.defaultToken, 'source');

	lexer.usesEmbedded = false; // becomes true if we find a nextEmbedded action

	// For calling compileAction later on
	const lexerMin: monarchCommon.ILexerMin = <any>json;
	lexerMin.languageId = languageId;
	lexerMin.ignoreCase = lexer.ignoreCase;
	lexerMin.noThrow = lexer.noThrow;
	lexerMin.usesEmbedded = lexer.usesEmbedded;
	lexerMin.stateNames = json.tokenizer;
	lexerMin.defaultToken = lexer.defaultToken;


	// Compile an array of rules into newrules where RegExp objects are created.
	// 'includeStack' lists the states whose rules are currently being expanded,
	// so that an include cycle is reported instead of recursing forever.
	function addRules(state: string, newrules: monarchCommon.IRule[], rules: any[], includeStack: string[] = []) {
		for (const rule of rules) {

			let include = rule.include;
			if (include) {
				if (typeof (include) !== 'string') {
					throw monarchCommon.createError(lexer, 'an \'include\' attribute must be a string at: ' + state);
				}
				if (include[0] === '@') {
					include = include.substr(1); // peel off starting @
				}
				if (!json.tokenizer[include]) {
					throw monarchCommon.createError(lexer, 'include target \'' + include + '\' is not defined at: ' + state);
				}
				if (includeStack.indexOf(include) >= 0) {
					throw monarchCommon.createError(lexer, 'include target \'' + include + '\' is included cyclically at: ' + state);
				}
				addRules(state + '.' + include, newrules, json.tokenizer[include], includeStack.concat(include));
			}
			else {
				const newrule = new Rule(state);

				// Set up new rule attributes
				if (Array.isArray(rule) && rule.length >= 1 && rule.length <= 3) {
					newrule.setRegex(lexerMin, rule[0]);
					if (rule.length >= 3) {
						const ruleAction = rule[1];
						if (typeof (ruleAction) === 'string') {
							newrule.setAction(lexerMin, { token: ruleAction, next: rule[2] });
						}
						else if (Array.isArray(ruleAction)) {
							// compileAction turns an array into a group and does not read
							// a 'next' attribute from it, so the array is passed unchanged
							newrule.setAction(lexerMin, <any>ruleAction);
						}
						else if (ruleAction && typeof (ruleAction) === 'object') {
							// attach the next state to a copy: the caller's definition is left untouched
							newrule.setAction(lexerMin, Object.assign({}, ruleAction, { next: rule[2] }));
						}
						else {
							throw monarchCommon.createError(lexer, 'a next state as the last element of a rule can only be given if the action is either an object or a string, at: ' + state);
						}
					}
					else {
						newrule.setAction(lexerMin, rule[1]);
					}
				}
				else {
					if (!rule.regex) {
						throw monarchCommon.createError(lexer, 'a rule must either be an array, or an object with a \'regex\' or \'include\' field at: ' + state);
					}
					if (rule.name) {
						if (typeof rule.name === 'string') {
							newrule.name = rule.name;
						}
					}
					newrule.setRegex(lexerMin, rule.regex);
					// setRegex derives the flag from a leading '^', so an explicit
					// request has to be applied afterwards to take effect
					if (bool(rule.matchOnlyAtLineStart, false)) {
						newrule.matchOnlyAtLineStart = true;
					}
					newrule.setAction(lexerMin, rule.action);
				}

				newrules.push(newrule);
			}
		}
	}

	// compile the tokenizer rules
	if (!json.tokenizer || typeof (json.tokenizer) !== 'object') {
		throw monarchCommon.createError(lexer, 'a language definition must define the \'tokenizer\' attribute as an object');
	}

	// NOTE(review): the state table is an array used as a dictionary, so a state
	// named 'length' cannot be stored in it; kept as is because other modules
	// read this table - confirm before changing the container.
	lexer.tokenizer = <any>[];
	for (const key in json.tokenizer) {
		if (json.tokenizer.hasOwnProperty(key)) {
			// without an explicit start state, the first state defined is used
			if (!lexer.start) {
				lexer.start = key;
			}

			const rules = json.tokenizer[key];
			lexer.tokenizer[key] = new Array();
			addRules('tokenizer.' + key, lexer.tokenizer[key], rules, [key]);
		}
	}
	lexer.usesEmbedded = lexerMin.usesEmbedded; // can be set during compileAction

	// Set simple brackets
	if (json.brackets) {
		if (!(Array.isArray(<any>json.brackets))) {
			throw monarchCommon.createError(lexer, 'the \'brackets\' attribute must be defined as an array');
		}
	}
	else {
		json.brackets = [
			{ open: '{', close: '}', token: 'delimiter.curly' },
			{ open: '[', close: ']', token: 'delimiter.square' },
			{ open: '(', close: ')', token: 'delimiter.parenthesis' },
			{ open: '<', close: '>', token: 'delimiter.angle' }];
	}
	const brackets: IMonarchLanguageBracket[] = [];
	for (const el of json.brackets) {
		let desc: any = el;
		if (desc && Array.isArray(desc) && desc.length === 3) {
			desc = { token: desc[2], open: desc[0], close: desc[1] };
		}
		// check the shape first, so that a malformed (or missing) entry is
		// reported as such rather than as a pair of equal brackets
		if (!desc || typeof desc.open !== 'string' || typeof desc.token !== 'string' || typeof desc.close !== 'string') {
			throw monarchCommon.createError(lexer, 'every element in the \'brackets\' array must be a \'{open,close,token}\' object or array');
		}
		if (desc.open === desc.close) {
			throw monarchCommon.createError(lexer, 'open and close brackets in a \'brackets\' attribute must be different: ' + desc.open +
				'\n hint: use the \'bracket\' attribute if matching on equal brackets is required.');
		}
		brackets.push({
			token: desc.token + lexer.tokenPostfix,
			open: monarchCommon.fixCase(lexer, desc.open),
			close: monarchCommon.fixCase(lexer, desc.close)
		});
	}
	lexer.brackets = brackets;

	// Disable throw so the syntax highlighter goes, no matter what
	lexer.noThrow = true;
	return lexer;
}