// Source listing as served by UNPKG (10.1 kB, JavaScript).
'use strict';

const generate = require('regjsgen').generate;
const parse = require('regjsparser').parse;
const regenerate = require('regenerate');
const unicodeMatchProperty = require('unicode-match-property-ecmascript');
const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
const iuMappings = require('./data/iu-mappings.js');
const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
10
// Prepare a Regenerate set containing all code points, used for negative
// character classes (if any).
const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);
// Without the `u` flag, the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0, 0xFFFF);

// Prepare a Regenerate set containing all code points that are supposed to be
// matched by `/./u`. https://mths.be/es6#sec-atom
const DOT_SET_UNICODE = UNICODE_SET.clone() // all Unicode code points
	.remove(
		// minus `LineTerminator`s (https://mths.be/es6#sec-line-terminators):
		0x000A, // Line Feed <LF>
		0x000D, // Carriage Return <CR>
		0x2028, // Line Separator <LS>
		0x2029  // Paragraph Separator <PS>
	);
28
// Looks up the precomputed set for a character class escape in `ESCAPE_SETS`.
//
// `character` is the key passed to `.get()` on the selected table.
// `unicode` selects the Unicode tables instead of `REGULAR`.
// `ignoreCase` only matters together with `unicode`, where it selects
// `UNICODE_IGNORE_CASE` instead of `UNICODE`.
//
// Returns whatever the table holds for `character` (`undefined` if absent).
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
	if (unicode) {
		if (ignoreCase) {
			return ESCAPE_SETS.UNICODE_IGNORE_CASE.get(character);
		}
		return ESCAPE_SETS.UNICODE.get(character);
	}
	return ESCAPE_SETS.REGULAR.get(character);
};
38
// Picks the set used to expand `.` in Unicode mode: every code point when
// `dotAll` is truthy, otherwise every code point except line terminators.
// The shared module-level set is returned as-is (not cloned).
const getUnicodeDotSet = (dotAll) => {
	return dotAll ? UNICODE_SET : DOT_SET_UNICODE;
};
42
43const getUnicodePropertyValueSet = (property, value) => {
44 const path = value ?
45 `${ property }/${ value }` :
46 `Binary_Property/${ property }`;
47 try {
48 return require(`regenerate-unicode-properties/${ path }.js`);
49 } catch (exception) {
50 throw new Error(
51 `Failed to recognize value \`${ value }\` for property ` +
52 `\`${ property }\`.`
53 );
54 }
55};
56
// Resolves a property escape that has no `=`, e.g. the `Lu` in `\p{Lu}`.
//
// The name is first tried as a `General_Category` value; if that fails for
// any reason it is tried as a binary property name.
//
// Returns the set loaded by `getUnicodePropertyValueSet`.
// Throws whatever `unicodeMatchProperty` or `getUnicodePropertyValueSet`
// throws when the binary-property attempt fails as well.
const handleLoneUnicodePropertyNameOrValue = (value) => {
	// It could be a `General_Category` value or a binary property.
	// Note: `unicodeMatchPropertyValue` throws on invalid values.
	try {
		const property = 'General_Category';
		const category = unicodeMatchPropertyValue(property, value);
		return getUnicodePropertyValueSet(property, category);
	} catch (exception) {
		// Deliberately ignored: fall through to the binary property lookup.
	}
	// It’s not a `General_Category` value, so check if it’s a binary
	// property. Note: `unicodeMatchProperty` throws on invalid properties.
	const property = unicodeMatchProperty(value);
	return getUnicodePropertyValueSet(property);
};
70
// Builds the set for a Unicode property escape.
//
// `value` is the text of the escape: either a lone name (`Lu`, `Alphabetic`)
// or a `Property=Value` pair. `isNegative` asks for the complement.
//
// Returns a fresh set the caller may mutate: a clone of the property set, or
// a clone of `UNICODE_SET` with the property set removed when `isNegative`.
// Throws whatever the lookup helpers throw for unknown names or values.
const getUnicodePropertyEscapeSet = (value, isNegative) => {
	const parts = value.split('=');
	const firstPart = parts[0];
	let set;
	if (parts.length === 1) {
		set = handleLoneUnicodePropertyNameOrValue(firstPart);
	} else {
		// The pattern consists of two parts, i.e. `Property=Value`.
		const property = unicodeMatchProperty(firstPart);
		const propertyValue = unicodeMatchPropertyValue(property, parts[1]);
		set = getUnicodePropertyValueSet(property, propertyValue);
	}
	if (isNegative) {
		return UNICODE_SET.clone().remove(set);
	}
	// Clone so callers never mutate the shared data module.
	return set.clone();
};
88
// Given a range of code points, add any case-folded code points in that range
// to a set. Installed on `regenerate.prototype` so it can be called on any
// set; returns the set for chaining.
regenerate.prototype.iuAddRange = function(min, max) {
	let codePoint = min;
	// `do…while` keeps the original contract: `min` is always examined, even
	// if a caller passes `min > max`.
	do {
		const folded = caseFold(codePoint);
		if (folded) {
			this.add(folded);
		}
	} while (++codePoint <= max);
	return this;
};
101
// Replaces the AST node `item` in place with the parse tree of `pattern`.
//
// `pattern` is parsed with the `u` flag only when `config.useUnicodeFlag` is
// set. Trees of type `characterClass`, `group` or `value` are copied onto
// `item` directly; any other tree is first wrapped in a non-capturing group.
// Properties already on `item` that the new tree does not define are kept,
// because the copy is done with `Object.assign`.
const update = (item, pattern) => {
	let tree = parse(pattern, config.useUnicodeFlag ? 'u' : '');
	switch (tree.type) {
		case 'characterClass':
		case 'group':
		case 'value':
			// No wrapping needed.
			break;
		default:
			// Wrap the pattern in a non-capturing group.
			tree = wrap(tree, pattern);
	}
	Object.assign(item, tree);
};
116
// Builds a non-capturing group node (`behavior: 'ignore'`) whose only child
// is `tree` and whose `raw` text is `pattern` wrapped in `(?:…)`.
const wrap = (tree, pattern) => {
	// Wrap the pattern in a non-capturing group.
	return {
		'type': 'group',
		'behavior': 'ignore',
		'body': [tree],
		'raw': `(?:${ pattern })`
	};
};
126
// Returns the entry `iuMappings` holds for `codePoint`, or `false` when there
// is none (or the entry is falsy).
const caseFold = (codePoint) => {
	return iuMappings.get(codePoint) || false;
};
130
// Rewrites a `characterClass` node in place.
//
// Every member of the class is collected into one Regenerate set, the set is
// serialized with `regenerateOptions`, and the node is replaced through
// `update`. A negated class becomes a negative lookahead on the serialized
// set followed by `[\s\S]`.
//
// Returns the same (mutated) `characterClassItem`.
// Throws an `Error` on an unknown member type.
const processCharacterClass = (characterClassItem, regenerateOptions) => {
	const set = regenerate();
	// Case-folded counterparts are only added by hand for `iu` patterns whose
	// output will not carry the `u` flag.
	const shouldCaseFold =
		config.ignoreCase && config.unicode && !config.useUnicodeFlag;
	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value': {
				set.add(item.codePoint);
				if (shouldCaseFold) {
					const folded = caseFold(item.codePoint);
					if (folded) {
						set.add(folded);
					}
				}
				break;
			}
			case 'characterClassRange': {
				const min = item.min.codePoint;
				const max = item.max.codePoint;
				set.addRange(min, max);
				if (shouldCaseFold) {
					set.iuAddRange(min, max);
				}
				break;
			}
			case 'characterClassEscape':
				set.add(getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				));
				break;
			case 'unicodePropertyEscape':
				// NOTE(review): unlike the top-level handling in `processTerm`,
				// this is not gated on `config.unicodePropertyEscape` — confirm
				// that is intended.
				set.add(getUnicodePropertyEscapeSet(item.value, item.negative));
				break;
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* istanbul ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}
	}
	if (characterClassItem.negative) {
		update(characterClassItem, `(?!${set.toString(regenerateOptions)})[\\s\\S]`);
	} else {
		update(characterClassItem, set.toString(regenerateOptions));
	}
	return characterClassItem;
};
176
// Turns a named back-reference node into a numbered one, in place: the
// `name` property is removed and `matchIndex` is set to `index`.
const updateNamedReference = (item, index) => {
	delete item.name;
	item.matchIndex = index;
};
181
// Throws an `Error` listing every key still present in
// `groups.unmatchedReferences`, i.e. named references for which no group of
// that name was ever seen. Does nothing when there are none.
const assertNoUnmatchedReferences = (groups) => {
	const unmatchedReferencesNames = Object.keys(groups.unmatchedReferences);
	if (unmatchedReferencesNames.length > 0) {
		// The array is interpolated as-is, so names come out comma-separated.
		throw new Error(`Unknown group names: ${unmatchedReferencesNames}`);
	}
};
188
// Recursively rewrites one AST node according to the module-level `config`.
//
// `item` is the node to process; it is mutated in place.
// `regenerateOptions` is passed to `toString()` of every generated set.
// `groups` carries the capture-group bookkeeping (`lastIndex`, `names`,
// `unmatchedReferences`, optional `onNamedGroup` callback) and is mutated.
//
// Returns the (mutated) node.
// Throws an `Error` on a duplicate group name or an unknown node type.
const processTerm = (item, regenerateOptions, groups) => {
	switch (item.type) {
		case 'dot':
			if (config.useDotAllFlag) {
				// The output keeps relying on the `s` flag; leave `.` alone.
				break;
			} else if (config.unicode) {
				update(
					item,
					getUnicodeDotSet(config.dotAll).toString(regenerateOptions)
				);
			} else if (config.dotAll) {
				// TODO: consider changing this at the regenerate level.
				update(item, '[\\s\\S]');
			}
			break;
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape':
			if (config.unicodePropertyEscape) {
				update(
					item,
					getUnicodePropertyEscapeSet(item.value, item.negative)
						.toString(regenerateOptions)
				);
			}
			break;
		case 'characterClassEscape':
			update(
				item,
				getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				).toString(regenerateOptions)
			);
			break;
		case 'group':
			// Only capturing groups consume an index.
			if (item.behavior === 'normal') {
				groups.lastIndex++;
			}
			if (item.name && config.namedGroup) {
				const name = item.name.value;

				if (groups.names[name]) {
					throw new Error(
						`Multiple groups with the same name (${ name }) are not allowed.`
					);
				}

				const index = groups.lastIndex;
				// Dropping the name turns this into a plain numbered group.
				delete item.name;

				groups.names[name] = index;
				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}

				// Resolve references that appeared before this group.
				if (groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name].forEach(reference => {
						updateNamedReference(reference, index);
					});
					delete groups.unmatchedReferences[name];
				}
			}
			/* falls through */
		case 'alternative':
		case 'disjunction':
		case 'quantifier':
			item.body = item.body.map(term => {
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'value': {
			const codePoint = item.codePoint;
			const set = regenerate(codePoint);
			if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
				const folded = caseFold(codePoint);
				if (folded) {
					set.add(folded);
				}
			}
			update(item, set.toString(regenerateOptions));
			break;
		}
		case 'reference':
			if (item.name) {
				const name = item.name.value;
				const index = groups.names[name];
				if (index) {
					updateNamedReference(item, index);
					break;
				}

				if (!groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name] = [];
				}
				// Keep track of references used before the corresponding group.
				groups.unmatchedReferences[name].push(item);
			}
			break;
		case 'anchor':
		case 'empty':
			// Nothing to do here.
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};
302
// Module-level settings shared by the processing functions in this file.
// `rewritePattern` overwrites every field at the start of each call.
const config = {
	'ignoreCase': false,
	'unicode': false,
	'dotAll': false,
	'useDotAllFlag': false,
	'useUnicodeFlag': false,
	'unicodePropertyEscape': false,
	'namedGroup': false
};
// Rewrites a regular expression pattern and returns the generated source.
//
// `pattern` is the pattern source. `flags` is the flag string; `i`, `s` and
// `u` are inspected. `options` may set `dotAllFlag`, `useDotAllFlag`,
// `useUnicodeFlag`, `unicodePropertyEscape`, `namedGroup`, `lookbehind` and
// an `onNamedGroup(name, index)` callback. `flags` and `options` may be
// omitted.
//
// Throws an `Error` when `dotAllFlag` and `useDotAllFlag` are both set, when
// a named reference has no matching group, or when a helper throws.
//
// Note: this overwrites the module-level `config`.
const rewritePattern = (pattern, flags, options) => {
	const hasFlag = (flag) => Boolean(flags && flags.includes(flag));
	const hasOption = (name) => Boolean(options && options[name]);

	const supportDotAllFlag = hasOption('dotAllFlag');
	const useDotAllFlag = hasOption('useDotAllFlag');
	// Validate before touching the shared `config`.
	if (supportDotAllFlag && useDotAllFlag) {
		throw new Error('`useDotAllFlag` and `dotAllFlag` cannot both be true!');
	}

	config.unicode = hasFlag('u');
	config.ignoreCase = hasFlag('i');
	config.dotAll = supportDotAllFlag && hasFlag('s');
	config.namedGroup = hasOption('namedGroup');
	config.useDotAllFlag = useDotAllFlag;
	config.useUnicodeFlag = hasOption('useUnicodeFlag');
	config.unicodePropertyEscape = hasOption('unicodePropertyEscape');

	const regjsparserFeatures = {
		'unicodePropertyEscape': config.unicode,
		'namedGroups': true,
		'lookbehind': hasOption('lookbehind')
	};
	const regenerateOptions = {
		'hasUnicodeFlag': config.useUnicodeFlag,
		'bmpOnly': !config.unicode
	};
	const groups = {
		'onNamedGroup': options && options.onNamedGroup,
		'lastIndex': 0,
		'names': Object.create(null), // { [name]: index }
		'unmatchedReferences': Object.create(null) // { [name]: Array<reference> }
	};
	const tree = parse(pattern, flags, regjsparserFeatures);
	// Note: `processTerm` mutates `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);
	assertNoUnmatchedReferences(groups);
	return generate(tree);
};
345
// The module's only export is the `rewritePattern` function.
module.exports = rewritePattern;