UNPKG

12.7 kBJavaScriptView Raw
1"use strict";
/**
 * Module-interop helper: exposes property `key` of `source` on `target`,
 * under the name `alias` (falls back to `key` when `alias` is undefined).
 *
 * An already-present `this.__createBinding` is reused when there is one.
 * Otherwise the implementation is picked once, when this statement runs:
 *  - with `Object.create` available, an enumerable getter is defined, so the
 *    binding always reflects the current value of `source[key]`;
 *  - without it, the value is simply copied at call time.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Fallback: one-off snapshot of the value.
        return function (target, source, key, alias) {
            var exportedName = alias === undefined ? key : alias;
            target[exportedName] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var exportedName = alias === undefined ? key : alias;
        var descriptor = {
            enumerable: true,
            get: function () {
                return source[key];
            }
        };
        Object.defineProperty(target, exportedName, descriptor);
    };
}());
/**
 * Module-interop helper: stores `value` as the `default` member of `target`.
 *
 * An already-present `this.__setModuleDefault` is reused when there is one.
 * With `Object.create` available the member is created through
 * `Object.defineProperty` as an enumerable data property; otherwise it is
 * written with a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            var descriptor = { enumerable: true, value: value };
            Object.defineProperty(target, "default", descriptor);
        };
    }
    return function (target, value) {
        target["default"] = value;
    };
}());
/**
 * Module-interop helper: produces a namespace-style object for `mod`.
 *
 * - A truthy `mod` flagged with `__esModule` is returned untouched.
 * - Otherwise a new object is built: every own enumerable key of `mod`
 *   except "default" is bound onto it via `__createBinding`, and `mod`
 *   itself is installed as its default via `__setModuleDefault`.
 *
 * An already-present `this.__importStar` is reused when there is one.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var key in mod) {
            if (key === "default") {
                continue;
            }
            // Skip anything inherited through the prototype chain.
            if (!Object.prototype.hasOwnProperty.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
21Object.defineProperty(exports, "__esModule", { value: true });
22exports.tokenizer = void 0;
23const util = __importStar(require("./util"));
24const types_1 = require("./types");
25const sets = __importStar(require("./sets"));
26/**
27 * Tokenizes a regular expression (that is currently a string)
28 * @param {string} regexpStr String of regular expression to be tokenized
29 *
30 * @returns {Root}
31 */
32exports.tokenizer = (regexpStr) => {
33 let i = 0, c;
34 let start = { type: types_1.types.ROOT, stack: [] };
35 // Keep track of last clause/group and stack.
36 let lastGroup = start;
37 let last = start.stack;
38 let groupStack = [];
39 let referenceQueue = [];
40 let groupCount = 0;
41 const repeatErr = (col) => {
42 throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Nothing to repeat at column ${col - 1}`);
43 };
44 // Decode a few escaped characters.
45 let str = util.strToChars(regexpStr);
46 // Iterate through each character in string.
47 while (i < str.length) {
48 switch (c = str[i++]) {
49 // Handle escaped characters, inclues a few sets.
50 case '\\':
51 switch (c = str[i++]) {
52 case 'b':
53 last.push({ type: types_1.types.POSITION, value: 'b' });
54 break;
55 case 'B':
56 last.push({ type: types_1.types.POSITION, value: 'B' });
57 break;
58 case 'w':
59 last.push(sets.words());
60 break;
61 case 'W':
62 last.push(sets.notWords());
63 break;
64 case 'd':
65 last.push(sets.ints());
66 break;
67 case 'D':
68 last.push(sets.notInts());
69 break;
70 case 's':
71 last.push(sets.whitespace());
72 break;
73 case 'S':
74 last.push(sets.notWhitespace());
75 break;
76 default:
77 // Check if c is integer.
78 // In which case it's a reference.
79 if (/\d/.test(c)) {
80 let digits = c;
81 while (/\d/.test(str[i])) {
82 digits += str[i++];
83 }
84 let value = parseInt(digits, 10);
85 const reference = { type: types_1.types.REFERENCE, value };
86 last.push(reference);
87 referenceQueue.push({ reference, stack: last, index: last.length - 1 });
88 // Escaped character.
89 }
90 else {
91 last.push({ type: types_1.types.CHAR, value: c.charCodeAt(0) });
92 }
93 }
94 break;
95 // Positionals.
96 case '^':
97 last.push({ type: types_1.types.POSITION, value: '^' });
98 break;
99 case '$':
100 last.push({ type: types_1.types.POSITION, value: '$' });
101 break;
102 // Handle custom sets.
103 case '[': {
104 // Check if this class is 'anti' i.e. [^abc].
105 let not;
106 if (str[i] === '^') {
107 not = true;
108 i++;
109 }
110 else {
111 not = false;
112 }
113 // Get all the characters in class.
114 let classTokens = util.tokenizeClass(str.slice(i), regexpStr);
115 // Increase index by length of class.
116 i += classTokens[1];
117 last.push({
118 type: types_1.types.SET,
119 set: classTokens[0],
120 not,
121 });
122 break;
123 }
124 // Class of any character except \n.
125 case '.':
126 last.push(sets.anyChar());
127 break;
128 // Push group onto stack.
129 case '(': {
130 // Create group.
131 let group = {
132 type: types_1.types.GROUP,
133 stack: [],
134 remember: true,
135 };
136 // If if this is a special kind of group.
137 if (str[i] === '?') {
138 c = str[i + 1];
139 i += 2;
140 // Match if followed by.
141 if (c === '=') {
142 group.followedBy = true;
143 // Match if not followed by.
144 }
145 else if (c === '!') {
146 group.notFollowedBy = true;
147 }
148 else if (c !== ':') {
149 throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Invalid group, character '${c}'` +
150 ` after '?' at column ${i - 1}`);
151 }
152 group.remember = false;
153 }
154 else {
155 groupCount += 1;
156 }
157 // Insert subgroup into current group stack.
158 last.push(group);
159 // Remember the current group for when the group closes.
160 groupStack.push(lastGroup);
161 // Make this new group the current group.
162 lastGroup = group;
163 last = group.stack;
164 break;
165 }
166 // Pop group out of stack.
167 case ')':
168 if (groupStack.length === 0) {
169 throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unmatched ) at column ${i - 1}`);
170 }
171 lastGroup = groupStack.pop();
172 // Check if this group has a PIPE.
173 // To get back the correct last stack.
174 last = lastGroup.options ?
175 lastGroup.options[lastGroup.options.length - 1] :
176 lastGroup.stack;
177 break;
178 // Use pipe character to give more choices.
179 case '|': {
180 // Create array where options are if this is the first PIPE
181 // in this clause.
182 if (!lastGroup.options) {
183 lastGroup.options = [lastGroup.stack];
184 delete lastGroup.stack;
185 }
186 // Create a new stack and add to options for rest of clause.
187 let stack = [];
188 lastGroup.options.push(stack);
189 last = stack;
190 break;
191 }
192 // Repetition.
193 // For every repetition, remove last element from last stack
194 // then insert back a RANGE object.
195 // This design is chosen because there could be more than
196 // one repetition symbols in a regex i.e. `a?+{2,3}`.
197 case '{': {
198 let rs = /^(\d+)(,(\d+)?)?\}/.exec(str.slice(i)), min, max;
199 if (rs !== null) {
200 if (last.length === 0) {
201 repeatErr(i);
202 }
203 min = parseInt(rs[1], 10);
204 max = rs[2] ? rs[3] ? parseInt(rs[3], 10) : Infinity : min;
205 i += rs[0].length;
206 last.push({
207 type: types_1.types.REPETITION,
208 min,
209 max,
210 value: last.pop(),
211 });
212 }
213 else {
214 last.push({
215 type: types_1.types.CHAR,
216 value: 123,
217 });
218 }
219 break;
220 }
221 case '?':
222 if (last.length === 0) {
223 repeatErr(i);
224 }
225 last.push({
226 type: types_1.types.REPETITION,
227 min: 0,
228 max: 1,
229 value: last.pop(),
230 });
231 break;
232 case '+':
233 if (last.length === 0) {
234 repeatErr(i);
235 }
236 last.push({
237 type: types_1.types.REPETITION,
238 min: 1,
239 max: Infinity,
240 value: last.pop(),
241 });
242 break;
243 case '*':
244 if (last.length === 0) {
245 repeatErr(i);
246 }
247 last.push({
248 type: types_1.types.REPETITION,
249 min: 0,
250 max: Infinity,
251 value: last.pop(),
252 });
253 break;
254 // Default is a character that is not `\[](){}?+*^$`.
255 default:
256 last.push({
257 type: types_1.types.CHAR,
258 value: c.charCodeAt(0),
259 });
260 }
261 }
262 // Check if any groups have not been closed.
263 if (groupStack.length !== 0) {
264 throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unterminated group`);
265 }
266 updateReferences(referenceQueue, groupCount);
267 return start;
268};
/**
 * This is a side effecting function that changes references to chars
 * if there are not enough capturing groups to reference.
 *
 * A queued numeric escape whose number exceeds `groupCount` cannot be a
 * back-reference, so its token is retyped in place to CHAR:
 *  - digits that are all 0-7 are read as one octal character code
 *    (`\12` -> code 10);
 *  - a leading 8 or 9 is that literal digit character;
 *  - otherwise the 0-7 digits before the first 8/9 form an octal character
 *    code;
 *  - in the last two cases every remaining digit becomes its own literal
 *    CHAR token, inserted right after the entry's recorded index.
 *
 * See: https://github.com/fent/ret.js/pull/39#issuecomment-1006475703
 * See: https://github.com/fent/ret.js/issues/38
 * @param {{reference: Object, stack: Object[], index: number}[]} referenceQueue
 *   One entry per numeric escape: the token, the stack it was pushed onto
 *   and its position in that stack at push time. The array itself is not
 *   modified; the tokens and stacks it points to are.
 * @param {number} groupCount Number of capturing groups in the pattern
 * @returns {void}
 */
function updateReferences(referenceQueue, groupCount) {
    // Note: We go through the queue in reverse order so
    // that index we use is correct even if we have to add
    // multiple tokens to one stack
    for (let q = referenceQueue.length - 1; q >= 0; q -= 1) {
        const elem = referenceQueue[q];
        if (groupCount >= elem.reference.value) {
            // Enough groups exist: this is a genuine back-reference.
            continue;
        }
        // If there is nothing to reference then turn this into a char token
        elem.reference.type = types_1.types.CHAR;
        const valueString = elem.reference.value.toString();
        // search() gives -1 when there is no 8/9, so there is no open-ended
        // scan that could run past the end of the string.
        const firstNonOctal = valueString.search(/[89]/);
        if (firstNonOctal === -1) {
            // Purely octal escape: the digits are the character code, base 8.
            elem.reference.value = parseInt(valueString, 8);
            continue;
        }
        // If the number is not octal then we need to create multiple tokens
        // https://github.com/fent/ret.js/pull/39#issuecomment-1008229226
        let consumed = firstNonOctal;
        if (consumed === 0) {
            // Handling case when escaped number starts with 8 or 9
            elem.reference.value = valueString.charCodeAt(0);
            consumed = 1;
        }
        else {
            // If the escaped number does not start with 8 or 9, then all
            // 0-7 digits before the first 8/9 form the first character code
            // see: https://github.com/fent/ret.js/pull/39#discussion_r780747085
            elem.reference.value = parseInt(valueString.slice(0, consumed), 8);
        }
        if (valueString.length > consumed) {
            const literalDigits = [];
            for (const char of valueString.slice(consumed)) {
                literalDigits.push({
                    type: types_1.types.CHAR,
                    value: char.charCodeAt(0),
                });
            }
            // Insert directly behind the recorded position, keeping the
            // tokens that already follow it in place.
            elem.stack.splice(elem.index + 1, 0, ...literalDigits);
        }
    }
}
319//# sourceMappingURL=tokenizer.js.map
\No newline at end of file