1 | 'use strict';
|
2 |
|
3 | const generate = require('regjsgen').generate;
|
4 | const parse = require('regjsparser').parse;
|
5 | const regenerate = require('regenerate');
|
6 | const unicodeMatchProperty = require('unicode-match-property-ecmascript');
|
7 | const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
|
8 | const iuMappings = require('./data/iu-mappings.js');
|
9 | const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
|
10 |
|
11 |
|
12 |
|
// Every Unicode code point, U+0000 through U+10FFFF.
const UNICODE_SET = regenerate().addRange(0x000000, 0x10FFFF);

// The Basic Multilingual Plane only, U+0000 through U+FFFF.
const BMP_SET = regenerate().addRange(0x0000, 0xFFFF);

// The set `getUnicodeDotSet` hands out for `.` when dotAll is off: every code
// point except the four line terminators listed below.
const DOT_SET_UNICODE = UNICODE_SET.clone().remove(
	0x000A, // LINE FEED
	0x000D, // CARRIAGE RETURN
	0x2028, // LINE SEPARATOR
	0x2029 // PARAGRAPH SEPARATOR
);
|
28 |
|
/**
 * Looks up the precomputed set for a character class escape.
 *
 * Three lookup tables exist on `ESCAPE_SETS`: `REGULAR` (no `u` flag),
 * `UNICODE` (`u` flag) and `UNICODE_IGNORE_CASE` (`u` and `i` flags). Note
 * that `ignoreCase` only matters when `unicode` is truthy.
 *
 * @param character Key to look up; callers pass the parsed item's `value`.
 * @param unicode Truthy when the pattern uses the `u` flag.
 * @param ignoreCase Truthy when the pattern uses the `i` flag.
 * @returns Whatever the selected table's `get` yields for `character`.
 */
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
	let tableName = 'REGULAR';
	if (unicode) {
		tableName = ignoreCase ? 'UNICODE_IGNORE_CASE' : 'UNICODE';
	}
	return ESCAPE_SETS[tableName].get(character);
};
|
38 |
|
/**
 * Picks the set that `.` stands for in Unicode mode.
 *
 * The shared module-level set is returned as-is (not a clone), so callers
 * must not mutate the result.
 *
 * @param dotAll Truthy when `.` should also match line terminators.
 * @returns `UNICODE_SET` when `dotAll` is truthy, `DOT_SET_UNICODE` otherwise.
 */
const getUnicodeDotSet = (dotAll) => {
	if (dotAll) {
		return UNICODE_SET;
	}
	return DOT_SET_UNICODE;
};
|
42 |
|
/**
 * Loads the data module for a Unicode property from the
 * `regenerate-unicode-properties` package.
 *
 * With a value, the module at `<property>/<value>.js` is loaded; without one,
 * `property` is treated as a binary property and `Binary_Property/<property>.js`
 * is loaded instead.
 *
 * @param property Property name used to build the module path.
 * @param [value] Property value; omit (or pass a falsy value) for a binary
 *   property lookup.
 * @returns The export of the matching data module.
 * @throws {Error} When the data module cannot be loaded.
 */
const getUnicodePropertyValueSet = (property, value) => {
	const path = value ?
		`${ property }/${ value }` :
		`Binary_Property/${ property }`;
	try {
		return require(`regenerate-unicode-properties/${ path }.js`);
	} catch (exception) {
		// A binary-property lookup has no value to report; quoting it would
		// produce the misleading "value `undefined`" in the message.
		const message = value ?
			`Failed to recognize value \`${ value }\` for property ` +
				`\`${ property }\`.` :
			`Failed to recognize property \`${ property }\`.`;
		throw new Error(message);
	}
};
|
56 |
|
/**
 * Resolves a property escape that has no `=` in it.
 *
 * The text is first tried as a `General_Category` value. If any step of that
 * attempt throws, the failure is deliberately discarded and the text is
 * treated as a binary property name instead; errors from this second attempt
 * propagate to the caller.
 *
 * @param value The text of the escape, e.g. the `Lu` in `\p{Lu}`.
 * @returns The result of `getUnicodePropertyValueSet` for the resolved name.
 */
const handleLoneUnicodePropertyNameOrValue = (value) => {
	const generalCategory = 'General_Category';
	let result;
	try {
		const matchedCategory = unicodeMatchPropertyValue(generalCategory, value);
		result = getUnicodePropertyValueSet(generalCategory, matchedCategory);
	} catch (exception) {
		// Not a General_Category value: fall back to a binary property.
		const binaryProperty = unicodeMatchProperty(value);
		result = getUnicodePropertyValueSet(binaryProperty);
	}
	return result;
};
|
70 |
|
/**
 * Builds the set matched by a Unicode property escape.
 *
 * `value` is either a lone name (resolved by
 * `handleLoneUnicodePropertyNameOrValue`) or a `property=value` pair, in which
 * case both halves are canonicalised before the set is looked up. Only the
 * first two `=`-separated parts are used.
 *
 * The result is always a new set object (a clone, or the complement built
 * from a clone of `UNICODE_SET`), never the object returned by the lookup.
 *
 * @param value Text of the escape, e.g. `Lu` or `Script=Greek`.
 * @param isNegative Truthy for the negated form; the complement within
 *   `UNICODE_SET` is returned.
 * @returns The new set.
 */
const getUnicodePropertyEscapeSet = (value, isNegative) => {
	const parts = value.split('=');
	const firstPart = parts[0];
	let set;
	if (parts.length === 1) {
		set = handleLoneUnicodePropertyNameOrValue(firstPart);
	} else {
		// Canonicalise both the property name and its value first.
		const property = unicodeMatchProperty(firstPart);
		const propertyValue = unicodeMatchPropertyValue(property, parts[1]);
		set = getUnicodePropertyValueSet(property, propertyValue);
	}
	if (isNegative) {
		return UNICODE_SET.clone().remove(set);
	}
	return set.clone();
};
|
88 |
|
// Walks the code points from `min` up to `max` (inclusive) and adds to this
// set whatever `caseFold` maps each of them to. Code points without a mapping
// contribute nothing. `min` itself is always visited, even when it is greater
// than `max`. Returns the set to allow chaining.
regenerate.prototype.iuAddRange = function(min, max) {
	let codePoint = min;
	do {
		const mapped = caseFold(codePoint);
		if (mapped) {
			this.add(mapped);
		}
		codePoint++;
	} while (codePoint <= max);
	return this;
};
|
101 |
|
/**
 * Overwrites `item` with the parse tree of `pattern`.
 *
 * `pattern` is parsed with the `u` flag only when `config.useUnicodeFlag` is
 * set. A result whose type is `characterClass`, `group` or `value` is used
 * directly; any other result is first wrapped via `wrap`. The chosen node's
 * properties are then copied onto `item` with `Object.assign`, so properties
 * of `item` that the new node does not define are left in place.
 *
 * @param item Node to overwrite in place.
 * @param pattern Regular expression source to parse.
 */
const update = (item, pattern) => {
	const parsed = parse(pattern, config.useUnicodeFlag ? 'u' : '');
	const isSingleTerm =
		parsed.type === 'characterClass' ||
		parsed.type === 'group' ||
		parsed.type === 'value';
	// Anything else is put inside a group before it replaces `item`.
	const replacement = isSingleTerm ? parsed : wrap(parsed, pattern);
	Object.assign(item, replacement);
};
|
116 |
|
/**
 * Puts a parse tree inside a `group` node with behavior `ignore`, whose raw
 * text is `pattern` enclosed in `(?:` … `)`.
 *
 * @param tree Node that becomes the only element of the group's body.
 * @param pattern Source text that `tree` was parsed from.
 * @returns The new group node.
 */
const wrap = (tree, pattern) => ({
	'type': 'group',
	'behavior': 'ignore',
	'body': [tree],
	'raw': `(?:${ pattern })`
});
|
126 |
|
/**
 * Looks up the entry stored for a code point in `iuMappings`.
 *
 * @param codePoint Code point to look up.
 * @returns The stored entry when it is truthy, otherwise `false`.
 */
const caseFold = (codePoint) => {
	const mapping = iuMappings.get(codePoint);
	if (mapping) {
		return mapping;
	}
	return false;
};
|
130 |
|
/**
 * Rewrites a character class node in place.
 *
 * Every member of the class is collected into one set, which is then
 * serialised with `regenerateOptions` and written back over the node through
 * `update`.
 *
 * Negated classes are emitted in one of two forms:
 * - In Unicode mode without a native `u` flag (and without `i`), the
 *   complement of the set within `UNICODE_SET` is emitted. The alternative,
 *   `(?!set)[\s\S]`, consumes a single UTF-16 code unit and would therefore
 *   split an astral symbol in two.
 * - Otherwise `(?!set)[\s\S]` is emitted.
 *
 * @param characterClassItem Parsed `characterClass` node; mutated.
 * @param regenerateOptions Options forwarded to the set's `toString`.
 * @returns The same `characterClassItem`.
 * @throws {Error} On a member type this function does not know.
 */
const processCharacterClass = (characterClassItem, regenerateOptions) => {
	const set = regenerate();
	// Extra case-folded code points are added only when the `i` and `u` flags
	// are both set and the output will not carry a `u` flag of its own.
	const shouldFold =
		config.ignoreCase && config.unicode && !config.useUnicodeFlag;
	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value': {
				set.add(item.codePoint);
				if (shouldFold) {
					const folded = caseFold(item.codePoint);
					if (folded) {
						set.add(folded);
					}
				}
				break;
			}
			case 'characterClassRange': {
				const min = item.min.codePoint;
				const max = item.max.codePoint;
				set.addRange(min, max);
				if (shouldFold) {
					set.iuAddRange(min, max);
				}
				break;
			}
			case 'characterClassEscape':
				set.add(getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				));
				break;
			case 'unicodePropertyEscape':
				set.add(getUnicodePropertyEscapeSet(item.value, item.negative));
				break;
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* istanbul ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}
	}
	if (!characterClassItem.negative) {
		update(characterClassItem, set.toString(regenerateOptions));
	} else if (config.unicode && !config.useUnicodeFlag && !config.ignoreCase) {
		// Complement the set so an astral symbol is matched as a whole rather
		// than one surrogate at a time.
		update(
			characterClassItem,
			UNICODE_SET.clone().remove(set).toString(regenerateOptions)
		);
	} else {
		// NOTE(review): with both `i` and `u` but no native `u` flag this form
		// still consumes one code unit per match. A plain complement is not
		// used there because case-insensitive matching of its members could
		// re-admit excluded characters — needs a dedicated fix.
		update(
			characterClassItem,
			`(?!${ set.toString(regenerateOptions) })[\\s\\S]`
		);
	}
	return characterClassItem;
};
|
176 |
|
/**
 * Converts a named back-reference node into a numbered one, in place: the
 * group number is stored in `matchIndex` and the `name` property is removed.
 *
 * @param item Reference node to mutate.
 * @param index Number of the group the reference points at.
 */
const updateNamedReference = (item, index) => {
	item.matchIndex = index;
	delete item.name;
};
|
181 |
|
/**
 * Verifies that no named reference is left without a group.
 *
 * @param groups Bookkeeping object; only its `unmatchedReferences` map is read.
 * @throws {Error} Listing every key still present in `unmatchedReferences`.
 */
const assertNoUnmatchedReferences = (groups) => {
	const pendingNames = Object.keys(groups.unmatchedReferences);
	if (pendingNames.length === 0) {
		return;
	}
	throw new Error(`Unknown group names: ${pendingNames}`);
};
|
188 |
|
/**
 * Recursively rewrites one node of the parse tree according to `config`.
 *
 * Nodes are rewritten in place (mostly through `update`); container nodes
 * (`group`, `alternative`, `disjunction`, `quantifier`) have every element of
 * their `body` processed in turn.
 *
 * Named groups and named references are only touched when
 * `config.namedGroup` is set. With the option off they are passed through
 * unchanged, so references are not recorded as unmatched either.
 *
 * @param item Node to process.
 * @param regenerateOptions Options forwarded to every set's `toString`.
 * @param groups Bookkeeping shared across the whole pattern: `lastIndex`
 *   (count of `normal` groups seen so far), `names` (name → group number),
 *   `unmatchedReferences` (name → reference nodes seen before their group)
 *   and an optional `onNamedGroup` callback.
 * @returns The processed node.
 * @throws {Error} On a duplicate group name or an unknown node type.
 */
const processTerm = (item, regenerateOptions, groups) => {
	switch (item.type) {
		case 'dot':
			if (config.useDotAllFlag) {
				// The dot is left untouched when `useDotAllFlag` is set.
				break;
			} else if (config.unicode) {
				update(
					item,
					getUnicodeDotSet(config.dotAll).toString(regenerateOptions)
				);
			} else if (config.dotAll) {
				update(item, '[\\s\\S]');
			}
			break;
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape':
			if (config.unicodePropertyEscape) {
				update(
					item,
					getUnicodePropertyEscapeSet(item.value, item.negative)
						.toString(regenerateOptions)
				);
			}
			break;
		case 'characterClassEscape':
			update(
				item,
				getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				).toString(regenerateOptions)
			);
			break;
		case 'group':
			if (item.behavior === 'normal') {
				groups.lastIndex++;
			}
			if (item.name && config.namedGroup) {
				const name = item.name.value;

				if (groups.names[name]) {
					throw new Error(
						`Multiple groups with the same name (${ name }) are not allowed.`
					);
				}

				const index = groups.lastIndex;
				delete item.name;

				groups.names[name] = index;
				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}

				// Resolve references that appeared before this group.
				if (groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name].forEach(reference => {
						updateNamedReference(reference, index);
					});
					delete groups.unmatchedReferences[name];
				}
			}
			// falls through: a group's body is processed like any container's.
		case 'alternative':
		case 'disjunction':
		case 'quantifier':
			item.body = item.body.map(term => {
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'value': {
			const codePoint = item.codePoint;
			const set = regenerate(codePoint);
			if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
				const folded = caseFold(codePoint);
				if (folded) {
					set.add(folded);
				}
			}
			update(item, set.toString(regenerateOptions));
			break;
		}
		case 'reference':
			// Group names are only registered when `namedGroup` is enabled, so
			// without it every named reference would be reported as unknown.
			if (item.name && config.namedGroup) {
				const name = item.name.value;
				const index = groups.names[name];
				if (index) {
					updateNamedReference(item, index);
					break;
				}

				if (!groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name] = [];
				}
				// Remember it until the group with this name shows up.
				groups.unmatchedReferences[name].push(item);
			}
			break;
		case 'anchor':
		case 'empty':
			// Nothing to rewrite.
			break;
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};
|
302 |
|
// Settings for the pattern currently being rewritten. `rewritePattern`
// overwrites every field at the start of each call; the helpers in this file
// read them from here.
const config = {
	// Set from the pattern's flags (`i`, `u`, and `s` when `dotAllFlag` is on).
	ignoreCase: false,
	unicode: false,
	dotAll: false,
	// Copied from the caller's options.
	useDotAllFlag: false,
	useUnicodeFlag: false,
	unicodePropertyEscape: false,
	namedGroup: false
};
|
/**
 * Rewrites a regular expression source according to `flags` and `options`.
 *
 * The shared `config` object is overwritten on every call, the pattern is
 * parsed, the tree is processed by `processTerm`, unmatched named references
 * are rejected, and the tree is turned back into source text.
 *
 * @param pattern Regular expression source.
 * @param [flags] Flags of the expression; `i`, `u` and `s` are inspected.
 * @param [options] Optional settings read here: `dotAllFlag`,
 *   `useDotAllFlag`, `useUnicodeFlag`, `unicodePropertyEscape`, `namedGroup`,
 *   `lookbehind` and the `onNamedGroup` callback.
 * @returns The rewritten source, as produced by `generate`.
 * @throws {Error} When `dotAllFlag` and `useDotAllFlag` are both set.
 */
const rewritePattern = (pattern, flags, options) => {
	// Both helpers hand back the raw short-circuit result, so a missing
	// `flags` or `options` argument yields that falsy value itself.
	const hasFlag = (flag) => flags && flags.includes(flag);
	const option = (name) => options && options[name];

	const supportDotAllFlag = option('dotAllFlag');

	config.unicode = hasFlag('u');
	config.ignoreCase = hasFlag('i');
	config.dotAll = supportDotAllFlag && hasFlag('s');
	config.namedGroup = option('namedGroup');
	config.useDotAllFlag = option('useDotAllFlag');
	config.useUnicodeFlag = option('useUnicodeFlag');
	config.unicodePropertyEscape = option('unicodePropertyEscape');

	if (supportDotAllFlag && config.useDotAllFlag) {
		throw new Error('`useDotAllFlag` and `dotAllFlag` cannot both be true!');
	}

	const regjsparserFeatures = {
		'unicodePropertyEscape': config.unicode,
		'namedGroups': true,
		'lookbehind': option('lookbehind')
	};
	const regenerateOptions = {
		'hasUnicodeFlag': config.useUnicodeFlag,
		'bmpOnly': !config.unicode
	};
	const groups = {
		'onNamedGroup': option('onNamedGroup'),
		'lastIndex': 0,
		'names': Object.create(null),
		'unmatchedReferences': Object.create(null)
	};

	const tree = parse(pattern, flags, regjsparserFeatures);
	// `processTerm` mutates both `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);
	assertNoUnmatchedReferences(groups);
	return generate(tree);
};
|
345 |
|
346 | module.exports = rewritePattern;
|