UNPKG

ret/dist/tokenizer.js

Version:

12.7 kB · JavaScript · View Raw

"use strict";
/**
 * Module-interop helper (looks compiler-generated): exposes property `k` of
 * `m` on `o` under the name `k2`. An already-installed `this.__createBinding`
 * is reused when present.
 *
 * @param {object} o Target object that receives the property.
 * @param {object} m Source object the property is read from.
 * @param {string} k Property name on `m`.
 * @param {string} [k2] Property name on `o`; defaults to `k`.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    // Getter keeps the binding live: later changes to m[k] are visible through o[k2].
    Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    // Fallback when Object.create is unavailable: plain one-time copy.
    o[k2] = m[k];
}));
/**
 * Module-interop helper (looks compiler-generated): stores `v` as the
 * enumerable `default` property of `o`. An already-installed
 * `this.__setModuleDefault` is reused when present.
 *
 * @param {object} o Namespace object being built.
 * @param {*} v Value to expose as `o.default`.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    // Fallback when Object.create is unavailable.
    o["default"] = v;
});
/**
 * Module-interop helper (looks compiler-generated): builds a namespace object
 * for a required module. An already-installed `this.__importStar` is reused
 * when present.
 *
 * @param {*} mod The value returned by `require`.
 * @returns {*} `mod` itself when it is flagged `__esModule`; otherwise a new
 *   object carrying a binding for every own property of `mod` except
 *   "default", plus `default` set to `mod`.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) return mod;
    var result = {};
    // `!= null` skips both null and undefined; inherited keys are filtered out.
    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
    __setModuleDefault(result, mod);
    return result;
};
// Flag the export object as a transpiled ES module and pre-declare the export
// before any dependency is loaded.
Object.defineProperty(exports, "__esModule", { value: true });
exports.tokenizer = void 0;
// Sibling modules: `util` supplies strToChars/tokenizeClass, `types_1` the
// token type enum, and `sets` the predefined character-class factories.
const util = __importStar(require("./util"));
const types_1 = require("./types");
const sets = __importStar(require("./sets"));
26/**
* Tokenizes a regular expression (that is currently a string)
* @param {string} regexpStr String of regular expression to be tokenized
*
* @returns {Root}
*/
32exports.tokenizer = (regexpStr) => {
  let i = 0, c;
  let start = { type: types_1.types.ROOT, stack: [] };
  // Keep track of last clause/group and stack.
  let lastGroup = start;
  let last = start.stack;
  let groupStack = [];
  let referenceQueue = [];
  let groupCount = 0;
  const repeatErr = (col) => {
      throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Nothing to repeat at column ${col - 1}`);
  };
  // Decode a few escaped characters.
  let str = util.strToChars(regexpStr);
  // Iterate through each character in string.
  while (i < str.length) {
      switch (c = str[i++]) {
          // Handle escaped characters, inclues a few sets.
          case '\\':
              switch (c = str[i++]) {
                  case 'b':
                      last.push({ type: types_1.types.POSITION, value: 'b' });
                      break;
                  case 'B':
                      last.push({ type: types_1.types.POSITION, value: 'B' });
                      break;
                  case 'w':
                      last.push(sets.words());
                      break;
                  case 'W':
                      last.push(sets.notWords());
                      break;
                  case 'd':
                      last.push(sets.ints());
                      break;
                  case 'D':
                      last.push(sets.notInts());
                      break;
                  case 's':
                      last.push(sets.whitespace());
                      break;
                  case 'S':
                      last.push(sets.notWhitespace());
                      break;
                  default:
                      // Check if c is integer.
                      // In which case it's a reference.
                      if (/\d/.test(c)) {
                          let digits = c;
                          while (/\d/.test(str[i])) {
                              digits += str[i++];
                          }
                          let value = parseInt(digits, 10);
                          const reference = { type: types_1.types.REFERENCE, value };
                          last.push(reference);
                          referenceQueue.push({ reference, stack: last, index: last.length - 1 });
                          // Escaped character.
                      }
                      else {
                          last.push({ type: types_1.types.CHAR, value: c.charCodeAt(0) });
                      }
              }
              break;
          // Positionals.
          case '^':
              last.push({ type: types_1.types.POSITION, value: '^' });
              break;
          case '$':
              last.push({ type: types_1.types.POSITION, value: '$' });
              break;
          // Handle custom sets.
          case '[': {
              // Check if this class is 'anti' i.e. [^abc].
              let not;
              if (str[i] === '^') {
                  not = true;
                  i++;
              }
              else {
                  not = false;
              }
              // Get all the characters in class.
              let classTokens = util.tokenizeClass(str.slice(i), regexpStr);
              // Increase index by length of class.
              i += classTokens[1];
              last.push({
                  type: types_1.types.SET,
                  set: classTokens[0],
                  not,
              });
              break;
          }
          // Class of any character except \n.
          case '.':
              last.push(sets.anyChar());
              break;
          // Push group onto stack.
          case '(': {
              // Create group.
              let group = {
                  type: types_1.types.GROUP,
                  stack: [],
                  remember: true,
              };
              // If if this is a special kind of group.
              if (str[i] === '?') {
                  c = str[i + 1];
                  i += 2;
                  // Match if followed by.
                  if (c === '=') {
                      group.followedBy = true;
                      // Match if not followed by.
                  }
                  else if (c === '!') {
                      group.notFollowedBy = true;
                  }
                  else if (c !== ':') {
                      throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Invalid group, character '${c}'` +
                          ` after '?' at column ${i - 1}`);
                  }
                  group.remember = false;
              }
              else {
                  groupCount += 1;
              }
              // Insert subgroup into current group stack.
              last.push(group);
              // Remember the current group for when the group closes.
              groupStack.push(lastGroup);
              // Make this new group the current group.
              lastGroup = group;
              last = group.stack;
              break;
          }
          // Pop group out of stack.
          case ')':
              if (groupStack.length === 0) {
                  throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unmatched ) at column ${i - 1}`);
              }
              lastGroup = groupStack.pop();
              // Check if this group has a PIPE.
              // To get back the correct last stack.
              last = lastGroup.options ?
                  lastGroup.options[lastGroup.options.length - 1] :
                  lastGroup.stack;
              break;
          // Use pipe character to give more choices.
          case '|': {
              // Create array where options are if this is the first PIPE
              // in this clause.
              if (!lastGroup.options) {
                  lastGroup.options = [lastGroup.stack];
                  delete lastGroup.stack;
              }
              // Create a new stack and add to options for rest of clause.
              let stack = [];
              lastGroup.options.push(stack);
              last = stack;
              break;
          }
          // Repetition.
          // For every repetition, remove last element from last stack
          // then insert back a RANGE object.
          // This design is chosen because there could be more than
          // one repetition symbols in a regex i.e. `a?+{2,3}`.
          case '{': {
              let rs = /^(\d+)(,(\d+)?)?\}/.exec(str.slice(i)), min, max;
              if (rs !== null) {
                  if (last.length === 0) {
                      repeatErr(i);
                  }
                  min = parseInt(rs[1], 10);
                  max = rs[2] ? rs[3] ? parseInt(rs[3], 10) : Infinity : min;
                  i += rs[0].length;
                  last.push({
                      type: types_1.types.REPETITION,
                      min,
                      max,
                      value: last.pop(),
                  });
              }
              else {
                  last.push({
                      type: types_1.types.CHAR,
                      value: 123,
                  });
              }
              break;
          }
          case '?':
              if (last.length === 0) {
                  repeatErr(i);
              }
              last.push({
                  type: types_1.types.REPETITION,
                  min: 0,
                  max: 1,
                  value: last.pop(),
              });
              break;
          case '+':
              if (last.length === 0) {
                  repeatErr(i);
              }
              last.push({
                  type: types_1.types.REPETITION,
                  min: 1,
                  max: Infinity,
                  value: last.pop(),
              });
              break;
          case '*':
              if (last.length === 0) {
                  repeatErr(i);
              }
              last.push({
                  type: types_1.types.REPETITION,
                  min: 0,
                  max: Infinity,
                  value: last.pop(),
              });
              break;
          // Default is a character that is not `\[](){}?+*^$`.
          default:
              last.push({
                  type: types_1.types.CHAR,
                  value: c.charCodeAt(0),
              });
      }
  }
  // Check if any groups have not been closed.
  if (groupStack.length !== 0) {
      throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unterminated group`);
  }
  updateReferences(referenceQueue, groupCount);
  return start;
268};
/**
 * This is a side effecting function that changes references to chars
 * if there are not enough capturing groups to reference
 * See: https://github.com/fent/ret.js/pull/39#issuecomment-1006475703
 * See: https://github.com/fent/ret.js/issues/38
 *
 * A numeric escape that cannot be a backreference is re-read the way
 * non-unicode JavaScript patterns read it:
 *   - the longest legacy octal prefix (up to three digits when the first digit
 *     is 0-3, up to two when it is 4-7) becomes one char with that octal code;
 *   - a leading 8 or 9 is just the literal digit;
 *   - every remaining digit becomes its own literal char token, inserted
 *     directly after the rewritten token.
 * Examples with no capturing groups: `\12` -> char 10, `\18` -> char 1 then
 * "8", `\777` -> char 63 then "7".
 *
 * The queue array is reversed in place and the queued token objects and their
 * stacks are mutated.
 *
 * @param {{reference: (Reference | Char), stack: Array, index: number}[]} referenceQueue
 *   Queued numeric escapes with the stack that holds each one and its index
 *   in that stack.
 * @param {number} groupCount Number of capturing groups in the pattern
 * @returns {void}
 */
function updateReferences(referenceQueue, groupCount) {
    // Longest run of digits that forms a legacy octal escape.
    const legacyOctal = /^(?:[0-3][0-7]{0,2}|[4-7][0-7]?)/;
    // Note: We go through the queue in reverse order so
    // that index we use is correct even if we have to add
    // multiple tokens to one stack
    for (const elem of referenceQueue.reverse()) {
        const { reference, stack, index } = elem;
        if (groupCount >= reference.value) {
            // Enough groups exist: it stays a genuine backreference.
            continue;
        }
        // If there is nothing to reference then turn this into a char token
        reference.type = types_1.types.CHAR;
        const digits = reference.value.toString();
        const octal = legacyOctal.exec(digits);
        let consumed;
        if (octal === null) {
            // Handling case when escaped number starts with 8 or 9
            reference.value = digits.charCodeAt(0);
            consumed = 1;
        }
        else {
            // Radix 8: the digits are an octal char code, not a decimal one.
            reference.value = parseInt(octal[0], 8);
            consumed = octal[0].length;
        }
        const rest = digits.slice(consumed);
        if (rest.length > 0) {
            const literals = [];
            for (const char of rest) {
                literals.push({
                    type: types_1.types.CHAR,
                    value: char.charCodeAt(0),
                });
            }
            // Insert right behind the rewritten token, keeping later tokens in order.
            stack.splice(index + 1, 0, ...literals);
        }
    }
}
//# sourceMappingURL=tokenizer.js.map
\No newline at end of file

1	`"use strict";`
2	`var __createBinding = (this && this.__createBinding) \|\| (Object.create ? (function(o, m, k, k2) {`
3	`if (k2 === undefined) k2 = k;`
4	`Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });`
5	`}) : (function(o, m, k, k2) {`
6	`if (k2 === undefined) k2 = k;`
7	`o[k2] = m[k];`
8	`}));`
9	`var __setModuleDefault = (this && this.__setModuleDefault) \|\| (Object.create ? (function(o, v) {`
10	`Object.defineProperty(o, "default", { enumerable: true, value: v });`
11	`}) : function(o, v) {`
12	`o["default"] = v;`
13	`});`
14	`var __importStar = (this && this.__importStar) \|\| function (mod) {`
15	`if (mod && mod.__esModule) return mod;`
16	`var result = {};`
17	`if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);`
18	`__setModuleDefault(result, mod);`
19	`return result;`
20	`};`
21	`Object.defineProperty(exports, "__esModule", { value: true });`
22	`exports.tokenizer = void 0;`
23	`const util = __importStar(require("./util"));`
24	`const types_1 = require("./types");`
25	`const sets = __importStar(require("./sets"));`
26	`/**`
27	`* Tokenizes a regular expression (that is currently a string)`
28	`* @param {string} regexpStr String of regular expression to be tokenized`
29	`*`
30	`* @returns {Root}`
31	`*/`
32	`exports.tokenizer = (regexpStr) => {`
33	`let i = 0, c;`
34	`let start = { type: types_1.types.ROOT, stack: [] };`
35	`// Keep track of last clause/group and stack.`
36	`let lastGroup = start;`
37	`let last = start.stack;`
38	`let groupStack = [];`
39	`let referenceQueue = [];`
40	`let groupCount = 0;`
41	`const repeatErr = (col) => {`
42	throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Nothing to repeat at column ${col - 1}`);
43	`};`
44	`// Decode a few escaped characters.`
45	`let str = util.strToChars(regexpStr);`
46	`// Iterate through each character in string.`
47	`while (i < str.length) {`
48	`switch (c = str[i++]) {`
49	`// Handle escaped characters, includes a few sets.`
50	`case '\\':`
51	`switch (c = str[i++]) {`
52	`case 'b':`
53	`last.push({ type: types_1.types.POSITION, value: 'b' });`
54	`break;`
55	`case 'B':`
56	`last.push({ type: types_1.types.POSITION, value: 'B' });`
57	`break;`
58	`case 'w':`
59	`last.push(sets.words());`
60	`break;`
61	`case 'W':`
62	`last.push(sets.notWords());`
63	`break;`
64	`case 'd':`
65	`last.push(sets.ints());`
66	`break;`
67	`case 'D':`
68	`last.push(sets.notInts());`
69	`break;`
70	`case 's':`
71	`last.push(sets.whitespace());`
72	`break;`
73	`case 'S':`
74	`last.push(sets.notWhitespace());`
75	`break;`
76	`default:`
77	`// Check if c is integer.`
78	`// In which case it's a reference.`
79	`if (/\d/.test(c)) {`
80	`let digits = c;`
81	`while (/\d/.test(str[i])) {`
82	`digits += str[i++];`
83	`}`
84	`let value = parseInt(digits, 10);`
85	`const reference = { type: types_1.types.REFERENCE, value };`
86	`last.push(reference);`
87	`referenceQueue.push({ reference, stack: last, index: last.length - 1 });`
88	`// Escaped character.`
89	`}`
90	`else {`
91	`last.push({ type: types_1.types.CHAR, value: c.charCodeAt(0) });`
92	`}`
93	`}`
94	`break;`
95	`// Positionals.`
96	`case '^':`
97	`last.push({ type: types_1.types.POSITION, value: '^' });`
98	`break;`
99	`case '$':`
100	`last.push({ type: types_1.types.POSITION, value: '$' });`
101	`break;`
102	`// Handle custom sets.`
103	`case '[': {`
104	`// Check if this class is 'anti' i.e. [^abc].`
105	`let not;`
106	`if (str[i] === '^') {`
107	`not = true;`
108	`i++;`
109	`}`
110	`else {`
111	`not = false;`
112	`}`
113	`// Get all the characters in class.`
114	`let classTokens = util.tokenizeClass(str.slice(i), regexpStr);`
115	`// Increase index by length of class.`
116	`i += classTokens[1];`
117	`last.push({`
118	`type: types_1.types.SET,`
119	`set: classTokens[0],`
120	`not,`
121	`});`
122	`break;`
123	`}`
124	`// Class of any character except \n.`
125	`case '.':`
126	`last.push(sets.anyChar());`
127	`break;`
128	`// Push group onto stack.`
129	`case '(': {`
130	`// Create group.`
131	`let group = {`
132	`type: types_1.types.GROUP,`
133	`stack: [],`
134	`remember: true,`
135	`};`
136	`// Check if this is a special kind of group.`
137	`if (str[i] === '?') {`
138	`c = str[i + 1];`
139	`i += 2;`
140	`// Match if followed by.`
141	`if (c === '=') {`
142	`group.followedBy = true;`
143	`// Match if not followed by.`
144	`}`
145	`else if (c === '!') {`
146	`group.notFollowedBy = true;`
147	`}`
148	`else if (c !== ':') {`
149	throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Invalid group, character '${c}'` +
150	` after '?' at column ${i - 1}`);
151	`}`
152	`group.remember = false;`
153	`}`
154	`else {`
155	`groupCount += 1;`
156	`}`
157	`// Insert subgroup into current group stack.`
158	`last.push(group);`
159	`// Remember the current group for when the group closes.`
160	`groupStack.push(lastGroup);`
161	`// Make this new group the current group.`
162	`lastGroup = group;`
163	`last = group.stack;`
164	`break;`
165	`}`
166	`// Pop group out of stack.`
167	`case ')':`
168	`if (groupStack.length === 0) {`
169	throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unmatched ) at column ${i - 1}`);
170	`}`
171	`lastGroup = groupStack.pop();`
172	`// Check if this group has a PIPE.`
173	`// To get back the correct last stack.`
174	`last = lastGroup.options ?`
175	`lastGroup.options[lastGroup.options.length - 1] :`
176	`lastGroup.stack;`
177	`break;`
178	`// Use pipe character to give more choices.`
179	`case '\|': {`
180	`// Create array where options are if this is the first PIPE`
181	`// in this clause.`
182	`if (!lastGroup.options) {`
183	`lastGroup.options = [lastGroup.stack];`
184	`delete lastGroup.stack;`
185	`}`
186	`// Create a new stack and add to options for rest of clause.`
187	`let stack = [];`
188	`lastGroup.options.push(stack);`
189	`last = stack;`
190	`break;`
191	`}`
192	`// Repetition.`
193	`// For every repetition, remove last element from last stack`
194	`// then insert back a RANGE object.`
195	`// This design is chosen because there could be more than`
196	// one repetition symbols in a regex i.e. `a?+{2,3}`.
197	`case '{': {`
198	`let rs = /^(\d+)(,(\d+)?)?\}/.exec(str.slice(i)), min, max;`
199	`if (rs !== null) {`
200	`if (last.length === 0) {`
201	`repeatErr(i);`
202	`}`
203	`min = parseInt(rs[1], 10);`
204	`max = rs[2] ? rs[3] ? parseInt(rs[3], 10) : Infinity : min;`
205	`i += rs[0].length;`
206	`last.push({`
207	`type: types_1.types.REPETITION,`
208	`min,`
209	`max,`
210	`value: last.pop(),`
211	`});`
212	`}`
213	`else {`
214	`last.push({`
215	`type: types_1.types.CHAR,`
216	`value: 123,`
217	`});`
218	`}`
219	`break;`
220	`}`
221	`case '?':`
222	`if (last.length === 0) {`
223	`repeatErr(i);`
224	`}`
225	`last.push({`
226	`type: types_1.types.REPETITION,`
227	`min: 0,`
228	`max: 1,`
229	`value: last.pop(),`
230	`});`
231	`break;`
232	`case '+':`
233	`if (last.length === 0) {`
234	`repeatErr(i);`
235	`}`
236	`last.push({`
237	`type: types_1.types.REPETITION,`
238	`min: 1,`
239	`max: Infinity,`
240	`value: last.pop(),`
241	`});`
242	`break;`
243	`case '*':`
244	`if (last.length === 0) {`
245	`repeatErr(i);`
246	`}`
247	`last.push({`
248	`type: types_1.types.REPETITION,`
249	`min: 0,`
250	`max: Infinity,`
251	`value: last.pop(),`
252	`});`
253	`break;`
254	// Default is a character that is not `\[](){}?+*^$`.
255	`default:`
256	`last.push({`
257	`type: types_1.types.CHAR,`
258	`value: c.charCodeAt(0),`
259	`});`
260	`}`
261	`}`
262	`// Check if any groups have not been closed.`
263	`if (groupStack.length !== 0) {`
264	throw new SyntaxError(`Invalid regular expression: /${regexpStr}/: Unterminated group`);
265	`}`
266	`updateReferences(referenceQueue, groupCount);`
267	`return start;`
268	`};`
269	`/**`
270	`* This is a side effecting function that changes references to chars`
271	`* if there are not enough capturing groups to reference`
272	`* See: https://github.com/fent/ret.js/pull/39#issuecomment-1006475703`
273	`* See: https://github.com/fent/ret.js/issues/38`
274	`* @param {(Reference \| Char)[]} referenceQueue`
275	`* @param {number} groupCount`
276	`* @returns {void}`
277	`*/`
278	`function updateReferences(referenceQueue, groupCount) {`
279	`// Note: We go through the queue in reverse order so`
280	`// that index we use is correct even if we have to add`
281	`// multiple tokens to one stack`
282	`for (const elem of referenceQueue.reverse()) {`
283	`if (groupCount < elem.reference.value) {`
284	`// If there is nothing to reference then turn this into a char token`
285	`elem.reference.type = types_1.types.CHAR;`
286	`const valueString = elem.reference.value.toString();`
287	`// If the number is not octal then we need to create multiple tokens`
288	`// https://github.com/fent/ret.js/pull/39#issuecomment-1008229226`
289	`if (!/^[0-7]+$/.test(valueString)) {`
290	`let i = 0;`
291	`while (valueString[i] !== '8' && valueString[i] !== '9') {`
292	`i += 1;`
293	`}`
294	`if (i === 0) {`
295	`// Handling case when escaped number starts with 8 or 9`
296	`elem.reference.value = valueString.charCodeAt(0);`
297	`i += 1;`
298	`}`
299	`else {`
300	`// If the escaped number does not start with 8 or 9, then all`
301	`// 0-7 digits before the first 8/9 form the first character code`
302	`// see: https://github.com/fent/ret.js/pull/39#discussion_r780747085`
303	`elem.reference.value = parseInt(valueString.slice(0, i), 10);`
304	`}`
305	`if (valueString.length > i) {`
306	`const tail = elem.stack.splice(elem.index + 1);`
307	`for (const char of valueString.slice(i)) {`
308	`elem.stack.push({`
309	`type: types_1.types.CHAR,`
310	`value: char.charCodeAt(0),`
311	`});`
312	`}`
313	`elem.stack.push(...tail);`
314	`}`
315	`}`
316	`}`
317	`}`
318	`}`
319	`//# sourceMappingURL=tokenizer.js.map`
\	No newline at end of file