1 | ;
|
/**
 * TypeScript emit helper: exposes property `key` of `source` on `target`
 * under the name `alias` (or `key` itself when `alias` is undefined).
 *
 * A helper already present on `this` is reused. Otherwise, when
 * `Object.create` exists the binding is an enumerable getter that re-reads
 * `source[key]` on every access; without it the current value is copied once.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (Object.create) {
        return function (target, source, key, alias) {
            var exportedName = alias === undefined ? key : alias;
            Object.defineProperty(target, exportedName, {
                enumerable: true,
                get: function () { return source[key]; }
            });
        };
    }
    return function (target, source, key, alias) {
        var exportedName = alias === undefined ? key : alias;
        target[exportedName] = source[key];
    };
})();
|
/**
 * TypeScript emit helper: stores `value` as the `default` member of
 * `namespace`.
 *
 * A helper already present on `this` is reused. Otherwise, when
 * `Object.create` exists the member is defined as an enumerable data
 * property via `Object.defineProperty`; without it a plain assignment is used.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (namespace, value) {
            Object.defineProperty(namespace, "default", { enumerable: true, value: value });
        };
    }
    return function (namespace, value) {
        namespace["default"] = value;
    };
})();
|
/**
 * TypeScript emit helper backing `import * as ns from "..."`.
 *
 * A module flagged with `__esModule` is returned untouched. Any other value
 * is wrapped in a fresh object: every own enumerable key except "default"
 * is bound through `__createBinding`, and the original value itself becomes
 * the wrapper's `default` member through `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var key in mod) {
            // Skip the default slot and anything inherited from the prototype chain.
            if (key === "default" || !Object.prototype.hasOwnProperty.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
|
21 | Object.defineProperty(exports, "__esModule", { value: true });
|
22 | exports.tokenizer = void 0;
|
23 | const util = __importStar(require("./util"));
|
24 | const types_1 = require("./types");
|
25 | const sets = __importStar(require("./sets"));
|
26 | /**
|
27 | * Tokenizes a regular expression (that is currently a string)
|
28 | * @param {string} regexpStr String of regular expression to be tokenized
|
29 | *
|
30 | * @returns {Root}
|
31 | */
|
32 | exports.tokenizer = (regexpStr) => {
|
33 | let i = 0, c;
|
34 | let start = { type: types_1.types.ROOT, stack: [] };
|
35 | // Keep track of last clause/group and stack.
|
36 | let lastGroup = start;
|
37 | let last = start.stack;
|
38 | let groupStack = [];
|
39 | let referenceQueue = [];
|
40 | let groupCount = 0;
|
41 | const repeatErr = (col) => {
|
42 | throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Nothing to repeat at column ${col - 1}`);
|
43 | };
|
44 | // Decode a few escaped characters.
|
45 | let str = util.strToChars(regexpStr);
|
46 | // Iterate through each character in string.
|
47 | while (i < str.length) {
|
48 | switch (c = str[i++]) {
|
49 | // Handle escaped characters, inclues a few sets.
|
50 | case '\\':
|
51 | switch (c = str[i++]) {
|
52 | case 'b':
|
53 | last.push({ type: types_1.types.POSITION, value: 'b' });
|
54 | break;
|
55 | case 'B':
|
56 | last.push({ type: types_1.types.POSITION, value: 'B' });
|
57 | break;
|
58 | case 'w':
|
59 | last.push(sets.words());
|
60 | break;
|
61 | case 'W':
|
62 | last.push(sets.notWords());
|
63 | break;
|
64 | case 'd':
|
65 | last.push(sets.ints());
|
66 | break;
|
67 | case 'D':
|
68 | last.push(sets.notInts());
|
69 | break;
|
70 | case 's':
|
71 | last.push(sets.whitespace());
|
72 | break;
|
73 | case 'S':
|
74 | last.push(sets.notWhitespace());
|
75 | break;
|
76 | default:
|
77 | // Check if c is integer.
|
78 | // In which case it's a reference.
|
79 | if (/\d/.test(c)) {
|
80 | let digits = c;
|
81 | while (/\d/.test(str[i])) {
|
82 | digits += str[i++];
|
83 | }
|
84 | let value = parseInt(digits, 10);
|
85 | const reference = { type: types_1.types.REFERENCE, value };
|
86 | last.push(reference);
|
87 | referenceQueue.push({ reference, stack: last, index: last.length - 1 });
|
88 | // Escaped character.
|
89 | }
|
90 | else {
|
91 | last.push({ type: types_1.types.CHAR, value: c.charCodeAt(0) });
|
92 | }
|
93 | }
|
94 | break;
|
95 | // Positionals.
|
96 | case '^':
|
97 | last.push({ type: types_1.types.POSITION, value: '^' });
|
98 | break;
|
99 | case '$':
|
100 | last.push({ type: types_1.types.POSITION, value: '$' });
|
101 | break;
|
102 | // Handle custom sets.
|
103 | case '[': {
|
104 | // Check if this class is 'anti' i.e. [^abc].
|
105 | let not;
|
106 | if (str[i] === '^') {
|
107 | not = true;
|
108 | i++;
|
109 | }
|
110 | else {
|
111 | not = false;
|
112 | }
|
113 | // Get all the characters in class.
|
114 | let classTokens = util.tokenizeClass(str.slice(i), regexpStr);
|
115 | // Increase index by length of class.
|
116 | i += classTokens[1];
|
117 | last.push({
|
118 | type: types_1.types.SET,
|
119 | set: classTokens[0],
|
120 | not,
|
121 | });
|
122 | break;
|
123 | }
|
124 | // Class of any character except \n.
|
125 | case '.':
|
126 | last.push(sets.anyChar());
|
127 | break;
|
128 | // Push group onto stack.
|
129 | case '(': {
|
130 | // Create group.
|
131 | let group = {
|
132 | type: types_1.types.GROUP,
|
133 | stack: [],
|
134 | remember: true,
|
135 | };
|
136 | // If if this is a special kind of group.
|
137 | if (str[i] === '?') {
|
138 | c = str[i + 1];
|
139 | i += 2;
|
140 | // Match if followed by.
|
141 | if (c === '=') {
|
142 | group.followedBy = true;
|
143 | // Match if not followed by.
|
144 | }
|
145 | else if (c === '!') {
|
146 | group.notFollowedBy = true;
|
147 | }
|
148 | else if (c !== ':') {
|
149 | throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Invalid group, character '${c}'` +
|
150 | ` after '?' at column ${i - 1}`);
|
151 | }
|
152 | group.remember = false;
|
153 | }
|
154 | else {
|
155 | groupCount += 1;
|
156 | }
|
157 | // Insert subgroup into current group stack.
|
158 | last.push(group);
|
159 | // Remember the current group for when the group closes.
|
160 | groupStack.push(lastGroup);
|
161 | // Make this new group the current group.
|
162 | lastGroup = group;
|
163 | last = group.stack;
|
164 | break;
|
165 | }
|
166 | // Pop group out of stack.
|
167 | case ')':
|
168 | if (groupStack.length === 0) {
|
169 | throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unmatched ) at column ${i - 1}`);
|
170 | }
|
171 | lastGroup = groupStack.pop();
|
172 | // Check if this group has a PIPE.
|
173 | // To get back the correct last stack.
|
174 | last = lastGroup.options ?
|
175 | lastGroup.options[lastGroup.options.length - 1] :
|
176 | lastGroup.stack;
|
177 | break;
|
178 | // Use pipe character to give more choices.
|
179 | case '|': {
|
180 | // Create array where options are if this is the first PIPE
|
181 | // in this clause.
|
182 | if (!lastGroup.options) {
|
183 | lastGroup.options = [lastGroup.stack];
|
184 | delete lastGroup.stack;
|
185 | }
|
186 | // Create a new stack and add to options for rest of clause.
|
187 | let stack = [];
|
188 | lastGroup.options.push(stack);
|
189 | last = stack;
|
190 | break;
|
191 | }
|
192 | // Repetition.
|
193 | // For every repetition, remove last element from last stack
|
194 | // then insert back a RANGE object.
|
195 | // This design is chosen because there could be more than
|
196 | // one repetition symbols in a regex i.e. `a?+{2,3}`.
|
197 | case '{': {
|
198 | let rs = /^(\d+)(,(\d+)?)?\}/.exec(str.slice(i)), min, max;
|
199 | if (rs !== null) {
|
200 | if (last.length === 0) {
|
201 | repeatErr(i);
|
202 | }
|
203 | min = parseInt(rs[1], 10);
|
204 | max = rs[2] ? rs[3] ? parseInt(rs[3], 10) : Infinity : min;
|
205 | i += rs[0].length;
|
206 | last.push({
|
207 | type: types_1.types.REPETITION,
|
208 | min,
|
209 | max,
|
210 | value: last.pop(),
|
211 | });
|
212 | }
|
213 | else {
|
214 | last.push({
|
215 | type: types_1.types.CHAR,
|
216 | value: 123,
|
217 | });
|
218 | }
|
219 | break;
|
220 | }
|
221 | case '?':
|
222 | if (last.length === 0) {
|
223 | repeatErr(i);
|
224 | }
|
225 | last.push({
|
226 | type: types_1.types.REPETITION,
|
227 | min: 0,
|
228 | max: 1,
|
229 | value: last.pop(),
|
230 | });
|
231 | break;
|
232 | case '+':
|
233 | if (last.length === 0) {
|
234 | repeatErr(i);
|
235 | }
|
236 | last.push({
|
237 | type: types_1.types.REPETITION,
|
238 | min: 1,
|
239 | max: Infinity,
|
240 | value: last.pop(),
|
241 | });
|
242 | break;
|
243 | case '*':
|
244 | if (last.length === 0) {
|
245 | repeatErr(i);
|
246 | }
|
247 | last.push({
|
248 | type: types_1.types.REPETITION,
|
249 | min: 0,
|
250 | max: Infinity,
|
251 | value: last.pop(),
|
252 | });
|
253 | break;
|
254 | // Default is a character that is not `\[](){}?+*^$`.
|
255 | default:
|
256 | last.push({
|
257 | type: types_1.types.CHAR,
|
258 | value: c.charCodeAt(0),
|
259 | });
|
260 | }
|
261 | }
|
262 | // Check if any groups have not been closed.
|
263 | if (groupStack.length !== 0) {
|
264 | throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unterminated group`);
|
265 | }
|
266 | updateReferences(referenceQueue, groupCount);
|
267 | return start;
|
268 | };
|
/**
 * This is a side effecting function that changes references to chars
 * if there are not enough capturing groups to reference
 * See: https://github.com/fent/ret.js/pull/39#issuecomment-1006475703
 * See: https://github.com/fent/ret.js/issues/38
 *
 * A numeric escape larger than the number of capturing groups is read as an
 * octal character code instead of a back reference. Digits 8 and 9 cannot be
 * part of an octal number, so the first 8/9 and every digit after it become
 * separate literal digit characters inserted right after the rewritten token.
 *
 * Both the queued tokens and the stacks they live in are mutated in place,
 * and `referenceQueue` itself is reversed in place.
 *
 * @param {{reference: {type: *, value: number}, stack: Array, index: number}[]} referenceQueue
 *   One entry per numeric escape: the token, the stack holding it and the
 *   token's position in that stack.
 * @param {number} groupCount Number of capturing groups in the pattern
 * @returns {void}
 */
function updateReferences(referenceQueue, groupCount) {
    // Note: We go through the queue in reverse order so
    // that index we use is correct even if we have to add
    // multiple tokens to one stack
    for (const elem of referenceQueue.reverse()) {
        if (groupCount >= elem.reference.value) {
            // Enough groups exist: this is a genuine back reference.
            continue;
        }
        // If there is nothing to reference then turn this into a char token
        elem.reference.type = types_1.types.CHAR;
        const valueString = elem.reference.value.toString();
        if (/^[0-7]+$/.test(valueString)) {
            // Every digit is octal: the whole number is one character code.
            elem.reference.value = parseInt(valueString, 8);
            continue;
        }
        // If the number is not octal then we need to create multiple tokens
        // https://github.com/fent/ret.js/pull/39#issuecomment-1008229226
        let i = 0;
        while (valueString[i] !== '8' && valueString[i] !== '9') {
            i += 1;
        }
        if (i === 0) {
            // Handling case when escaped number starts with 8 or 9
            elem.reference.value = valueString.charCodeAt(0);
            i += 1;
        }
        else {
            // If the escaped number does not start with 8 or 9, then all
            // 0-7 digits before the first 8/9 form the first character code
            // see: https://github.com/fent/ret.js/pull/39#discussion_r780747085
            elem.reference.value = parseInt(valueString.slice(0, i), 8);
        }
        if (valueString.length > i) {
            // Splice the remaining digits in as literal characters directly
            // after this token, keeping whatever followed it in the stack.
            const tail = elem.stack.splice(elem.index + 1);
            for (const char of valueString.slice(i)) {
                elem.stack.push({
                    type: types_1.types.CHAR,
                    value: char.charCodeAt(0),
                });
            }
            elem.stack.push(...tail);
        }
    }
}
|
319 | //# sourceMappingURL=tokenizer.js.map |
\ | No newline at end of file |