UNPKG

18.5 kBJavaScriptView Raw
1/**
2 * @typedef {import('hast').Comment} Comment
3 * @typedef {import('hast').Doctype} Doctype
4 * @typedef {import('hast').Element} Element
5 * @typedef {import('hast').ElementContent} ElementContent
6 * @typedef {import('hast').Nodes} Nodes
7 * @typedef {import('hast').Properties} Properties
8 * @typedef {import('hast').Root} Root
9 * @typedef {import('hast').RootContent} RootContent
10 * @typedef {import('hast').Text} Text
11 */
12
13/**
14 * @typedef {[string, ...Array<Exclude<Properties[keyof Properties], Array<any>> | RegExp>] | string} PropertyDefinition
15 * Definition for a property.
16 *
17 * @typedef Schema
18 * Schema that defines what nodes and properties are allowed.
19 *
20 * The default schema is `defaultSchema`, which follows how GitHub cleans.
21 * If any top-level key is missing in the given schema, the corresponding
22 * value of the default schema is used.
23 *
24 * To extend the standard schema with a few changes, clone `defaultSchema`
25 * like so:
26 *
27 * ```js
28 * import deepmerge from 'deepmerge'
29 * import {h} from 'hastscript'
30 * import {defaultSchema, sanitize} from 'hast-util-sanitize'
31 *
32 * // This allows `className` on all elements.
33 * const schema = deepmerge(defaultSchema, {attributes: {'*': ['className']}})
34 *
35 * const tree = sanitize(h('div', {className: ['foo']}), schema)
36 *
37 * // `tree` still has `className`.
38 * console.log(tree)
39 * // {
40 * // type: 'element',
41 * // tagName: 'div',
42 * // properties: {className: ['foo']},
43 * // children: []
44 * // }
45 * ```
46 * @property {boolean | null | undefined} [allowComments=false]
47 * Whether to allow comment nodes (default: `false`).
48 *
49 * For example:
50 *
51 * ```js
52 * allowComments: true
53 * ```
54 * @property {boolean | null | undefined} [allowDoctypes=false]
55 * Whether to allow doctype nodes (default: `false`).
56 *
57 * For example:
58 *
59 * ```js
60 * allowDoctypes: true
61 * ```
62 * @property {Record<string, Array<string>> | null | undefined} [ancestors]
63 * Map of tag names to a list of tag names which are required ancestors
64 * (default: `defaultSchema.ancestors`).
65 *
66 * Elements with these tag names will be ignored if they occur outside of one
67 * of their allowed parents.
68 *
69 * For example:
70 *
71 * ```js
72 * ancestors: {
73 * tbody: ['table'],
74 * // …
75 * tr: ['table']
76 * }
77 * ```
78 * @property {Record<string, Array<PropertyDefinition>> | null | undefined} [attributes]
79 * Map of tag names to allowed property names (default:
80 * `defaultSchema.attributes`).
81 *
82 * The special key `'*'` as a tag name defines property names allowed on all
83 * elements.
84 *
85 * The special value `'data*'` as a property name can be used to allow all
86 * `data` properties.
87 *
88 * For example:
89 *
90 * ```js
91 * attributes: {
92 * a: ['ariaDescribedBy', 'ariaLabel', 'ariaLabelledBy', …, 'href'],
93 * // …
94 * '*': [
95 * 'abbr',
96 * 'accept',
97 * 'acceptCharset',
98 * // …
99 * 'vAlign',
100 * 'value',
101 * 'width'
102 * ]
103 * }
104 * ```
105 *
106 * Instead of a single string in the array, which allows any property value
107 * for the field, you can use an array to allow several values.
108 * For example, `input: ['type']` allows `type` set to any value on `input`s.
109 * But `input: [['type', 'checkbox', 'radio']]` allows `type` when set to
110 * `'checkbox'` or `'radio'`.
111 *
112 * You can use regexes, so for example `span: [['className', /^hljs-/]]`
113 * allows any class that starts with `hljs-` on `span`s.
114 *
115 * When comma- or space-separated values are used (such as `className`), each
116 * value is checked individually.
117 * For example, to allow certain classes on `span`s for syntax highlighting,
118 * use `span: [['className', 'number', 'operator', 'token']]`.
119 * This will allow `'number'`, `'operator'`, and `'token'` classes, but drop
120 * others.
121 * @property {Array<string> | null | undefined} [clobber]
122 * List of property names that clobber (default: `defaultSchema.clobber`).
123 *
124 * For example:
125 *
126 * ```js
127 * clobber: ['ariaDescribedBy', 'ariaLabelledBy', 'id', 'name']
128 * ```
129 * @property {string | null | undefined} [clobberPrefix]
130 * Prefix to use before clobbering properties (default:
131 * `defaultSchema.clobberPrefix`).
132 *
133 * For example:
134 *
135 * ```js
136 * clobberPrefix: 'user-content-'
137 * ```
138 * @property {Record<string, Array<string> | null | undefined> | null | undefined} [protocols]
139 * Map of *property names* to allowed protocols (default:
140 * `defaultSchema.protocols`).
141 *
142 * This defines URLs that are always allowed to have local URLs (relative to
143 * the current website, such as `this`, `#this`, `/this`, or `?this`), and
144 * only allowed to have remote URLs (such as `https://example.com`) if they
145 * use a known protocol.
146 *
147 * For example:
148 *
149 * ```js
150 * protocols: {
151 * cite: ['http', 'https'],
152 * // …
153 * src: ['http', 'https']
154 * }
155 * ```
156 * @property {Record<string, Record<string, Properties[keyof Properties]>> | null | undefined} [required]
157 * Map of tag names to required property names with a default value
158 * (default: `defaultSchema.required`).
159 *
160 * This defines properties that must be set.
161 * If a field does not exist (after the element was made safe), these will be
162 * added with the given value.
163 *
164 * For example:
165 *
166 * ```js
167 * required: {
168 * input: {disabled: true, type: 'checkbox'}
169 * }
170 * ```
171 *
172 * > 👉 **Note**: properties are first checked based on `schema.attributes`,
173 * > then on `schema.required`.
174 * > That means properties could be removed by `attributes` and then added
175 * > again with `required`.
176 * @property {Array<string> | null | undefined} [strip]
177 * List of tag names to strip from the tree (default: `defaultSchema.strip`).
178 *
179 * By default, unsafe elements (those not in `schema.tagNames`) are replaced
180 * by what they contain.
181 * This option can drop their contents.
182 *
183 * For example:
184 *
185 * ```js
186 * strip: ['script']
187 * ```
188 * @property {Array<string> | null | undefined} [tagNames]
189 * List of allowed tag names (default: `defaultSchema.tagNames`).
190 *
191 * For example:
192 *
193 * ```js
194 * tagNames: [
195 * 'a',
196 * 'b',
197 * // …
198 * 'ul',
199 * 'var'
200 * ]
201 * ```
202 *
203 * @typedef State
204 * Info passed around.
205 * @property {Readonly<Schema>} schema
206 * Schema.
207 * @property {Array<string>} stack
208 * Tag names of ancestors.
209 */
210
211import structuredClone from '@ungap/structured-clone'
212import {position} from 'unist-util-position'
213import {defaultSchema} from './schema.js'
214
// Shared reference to the built-in own-property check, so it can be applied
// (with `.call`) to untrusted objects that may shadow `hasOwnProperty`.
const own = Object.prototype.hasOwnProperty
216
/**
 * Sanitize a tree.
 *
 * The given tree is not modified: a new tree is built from the parts of
 * `node` that the schema allows.
 *
 * @param {Readonly<Nodes>} node
 *   Unsafe tree.
 * @param {Readonly<Schema> | null | undefined} [options]
 *   Configuration (default: `defaultSchema`); top-level keys that are given
 *   replace the corresponding keys of `defaultSchema`.
 * @returns {Nodes}
 *   New, safe tree.
 */
export function sanitize(node, options) {
  /** @type {State} */
  const state = {
    schema: options ? {...defaultSchema, ...options} : defaultSchema,
    stack: []
  }
  const result = transform(state, node)

  // Nothing survived: hand back an empty root.
  if (!result) {
    return {type: 'root', children: []}
  }

  // A single safe node is returned as is.
  if (!Array.isArray(result)) {
    return result
  }

  // An unsafe element was replaced by its content: unwrap a lone child,
  // otherwise wrap the list in a fresh root.
  if (result.length === 1) {
    return result[0]
  }

  return {type: 'root', children: result}
}
252
/**
 * Sanitize `node` by dispatching on its `type`.
 *
 * @param {State} state
 *   Info passed around.
 * @param {Readonly<unknown>} node
 *   Unsafe node.
 * @returns {Array<ElementContent> | Nodes | undefined}
 *   Safe result; `undefined` when `node` is not an object or has a type that
 *   is not handled here.
 */
function transform(state, node) {
  if (!node || typeof node !== 'object') {
    return undefined
  }

  const unsafe = /** @type {Record<string, Readonly<unknown>>} */ (node)
  const kind = typeof unsafe.type === 'string' ? unsafe.type : ''

  if (kind === 'comment') return comment(state, unsafe)
  if (kind === 'doctype') return doctype(state, unsafe)
  if (kind === 'element') return element(state, unsafe)
  if (kind === 'root') return root(state, unsafe)
  if (kind === 'text') return text(state, unsafe)

  // Any other node type is dropped.
  return undefined
}
293
/**
 * Make a safe comment.
 *
 * @param {State} state
 *   Info passed around.
 * @param {Readonly<Record<string, Readonly<unknown>>>} unsafe
 *   Unsafe comment-like value.
 * @returns {Comment | undefined}
 *   Safe comment (only when `schema.allowComments` is on).
 */
function comment(state, unsafe) {
  if (!state.schema.allowComments) {
    return undefined
  }

  const raw = typeof unsafe.value === 'string' ? unsafe.value : ''
  // Cut the value at the first `-->`, so the comment cannot be closed early.
  // See <https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments>
  const end = raw.indexOf('-->')

  /** @type {Comment} */
  const node = {type: 'comment', value: end < 0 ? raw : raw.slice(0, end)}

  patch(node, unsafe)

  return node
}
319
/**
 * Make a safe doctype.
 *
 * @param {State} state
 *   Info passed around.
 * @param {Readonly<Record<string, Readonly<unknown>>>} unsafe
 *   Unsafe doctype-like value.
 * @returns {Doctype | undefined}
 *   Safe doctype (only when `schema.allowDoctypes` is on).
 */
function doctype(state, unsafe) {
  if (!state.schema.allowDoctypes) {
    return undefined
  }

  /** @type {Doctype} */
  const node = {type: 'doctype'}

  patch(node, unsafe)

  return node
}
340
/**
 * Make a safe element.
 *
 * @param {State} state
 *   Info passed around.
 * @param {Readonly<Record<string, Readonly<unknown>>>} unsafe
 *   Unsafe element-like value.
 * @returns {Array<ElementContent> | Element | undefined}
 *   Safe element; or, when the element itself is not allowed, its safe
 *   content (if its tag name is not in `schema.strip`) or nothing.
 */
function element(state, unsafe) {
  const {schema, stack} = state
  const tagName = typeof unsafe.tagName === 'string' ? unsafe.tagName : ''

  // Children and properties are cleaned while this element is on the stack;
  // afterwards the stack holds only its ancestors again.
  stack.push(tagName)

  const content = /** @type {Array<ElementContent>} */ (
    children(state, unsafe.children)
  )
  const props = properties(state, unsafe.properties)

  stack.pop()

  let allowed =
    tagName.length > 0 &&
    tagName !== '*' &&
    (!schema.tagNames || schema.tagNames.includes(tagName))

  // Some nodes can break out of their context if they don’t have a certain
  // ancestor: such elements are only allowed inside one of the listed ones.
  if (allowed && schema.ancestors && own.call(schema.ancestors, tagName)) {
    const candidates = schema.ancestors[tagName]

    allowed = false

    for (let index = 0; index < candidates.length; index++) {
      if (stack.includes(candidates[index])) {
        allowed = true
        break
      }
    }
  }

  if (!allowed) {
    if (schema.strip && !schema.strip.includes(tagName)) {
      return content
    }

    return undefined
  }

  /** @type {Element} */
  const node = {
    type: 'element',
    tagName,
    properties: props,
    children: content
  }

  patch(node, unsafe)

  return node
}
406
/**
 * Make a safe root.
 *
 * @param {State} state
 *   Info passed around.
 * @param {Readonly<Record<string, Readonly<unknown>>>} unsafe
 *   Unsafe root-like value.
 * @returns {Root}
 *   Safe root, holding the safe versions of the children.
 */
function root(state, unsafe) {
  /** @type {Root} */
  const node = {
    type: 'root',
    children: /** @type {Array<RootContent>} */ (
      children(state, unsafe.children)
    )
  }

  patch(node, unsafe)

  return node
}
429
/**
 * Make a safe text.
 *
 * @param {State} _
 *   Info passed around (unused).
 * @param {Readonly<Record<string, Readonly<unknown>>>} unsafe
 *   Unsafe text-like value.
 * @returns {Text}
 *   Safe text; a non-string `value` becomes the empty string.
 */
function text(_, unsafe) {
  /** @type {Text} */
  const node = {
    type: 'text',
    value: typeof unsafe.value === 'string' ? unsafe.value : ''
  }

  patch(node, unsafe)

  return node
}
449
/**
 * Make children safe.
 *
 * @param {State} state
 *   Info passed around.
 * @param {Readonly<unknown>} children
 *   Unsafe value.
 * @returns {Array<Nodes>}
 *   Safe children (empty when `children` is not an array).
 */
function children(state, children) {
  /** @type {Array<Nodes>} */
  const safe = []

  if (!Array.isArray(children)) {
    return safe
  }

  for (const child of /** @type {Array<Readonly<unknown>>} */ (children)) {
    const replacement = transform(state, child)

    // Dropped nodes yield nothing.
    if (!replacement) continue

    if (Array.isArray(replacement)) {
      // A list is spliced in where the child was.
      safe.push(...replacement)
    } else {
      safe.push(replacement)
    }
  }

  return safe
}
483
/**
 * Make element properties safe.
 *
 * Works on the element whose tag name is currently on top of `state.stack`.
 *
 * @param {State} state
 *   Info passed around.
 * @param {Readonly<unknown>} properties
 *   Unsafe value.
 * @returns {Properties}
 *   Safe value.
 */
function properties(state, properties) {
  const {attributes, required} = state.schema
  const tagName = state.stack[state.stack.length - 1]
  // Definitions for this tag name, and those for every element (`'*'`).
  const forTag =
    attributes && own.call(attributes, tagName)
      ? attributes[tagName]
      : undefined
  const forAll =
    attributes && own.call(attributes, '*') ? attributes['*'] : undefined
  const unsafeProperties =
    /** @type {Readonly<Record<string, Readonly<unknown>>>} */ (
      properties && typeof properties === 'object' ? properties : {}
    )
  /** @type {Properties} */
  const result = {}

  for (const key in unsafeProperties) {
    if (!own.call(unsafeProperties, key)) continue

    // Tag-specific definitions win over the ones for all elements.
    /** @type {Readonly<PropertyDefinition> | '' | null | undefined} */
    const definition =
      (forTag && findDefinition(forTag, key)) ||
      (forAll && findDefinition(forAll, key))

    // Properties without a definition are not allowed.
    if (!definition) continue

    const unsafe = unsafeProperties[key]
    const safe = Array.isArray(unsafe)
      ? propertyValues(state, definition, key, unsafe)
      : propertyValue(state, definition, key, unsafe)

    if (safe !== undefined && safe !== null) {
      result[key] = safe
    }
  }

  // Add required properties that are (now) missing, with their given value.
  if (required && own.call(required, tagName)) {
    const fallbacks = required[tagName]

    for (const key in fallbacks) {
      if (own.call(fallbacks, key) && !own.call(result, key)) {
        result[key] = fallbacks[key]
      }
    }
  }

  return result
}
545
/**
 * Sanitize a property value which is a list.
 *
 * @param {State} state
 *   Info passed around.
 * @param {Readonly<PropertyDefinition>} definition
 *   Definition.
 * @param {string} key
 *   Field name.
 * @param {Readonly<Array<Readonly<unknown>>>} values
 *   Unsafe value (but an array).
 * @returns {Array<number | string>}
 *   Safe value: the items that are allowed and are a number or a string.
 */
function propertyValues(state, definition, key, values) {
  /** @type {Array<number | string>} */
  const safe = []

  for (const item of values) {
    const cleaned = propertyValue(state, definition, key, item)

    // Booleans (and dropped items) have no place in a list.
    if (typeof cleaned === 'string' || typeof cleaned === 'number') {
      safe.push(cleaned)
    }
  }

  return safe
}
575
/**
 * Sanitize a property value.
 *
 * @param {State} state
 *   Info passed around.
 * @param {Readonly<PropertyDefinition>} definition
 *   Definition.
 * @param {string} key
 *   Field name.
 * @param {Readonly<unknown>} value
 *   Unsafe value (but not an array).
 * @returns {boolean | number | string | undefined}
 *   Safe value; `undefined` when the value is not a boolean, number, or
 *   string, uses a protocol that is not allowed, or is not in the allow list
 *   of `definition`.
 */
function propertyValue(state, definition, key, value) {
  if (
    typeof value !== 'boolean' &&
    typeof value !== 'number' &&
    typeof value !== 'string'
  ) {
    return
  }

  if (!safeProtocol(state, key, value)) {
    return
  }

  // Just a string, or only one item in an array, means all values are OK.
  // More than one item means an allow list.
  if (typeof definition === 'object' && definition.length > 1) {
    let ok = false
    let index = 0 // Ignore `key`, which is the first item.

    while (++index < definition.length) {
      const allowed = definition[index]

      // Expression.
      if (allowed && typeof allowed === 'object' && 'flags' in allowed) {
        // Global and sticky expressions remember where the previous match
        // ended (`lastIndex`) and continue from there, which would make the
        // outcome depend on earlier, unrelated values.
        // Always start matching at the beginning.
        if (allowed.global || allowed.sticky) {
          allowed.lastIndex = 0
        }

        if (allowed.test(String(value))) {
          ok = true
          break
        }
      }
      // Primitive.
      else if (allowed === value) {
        ok = true
        break
      }
    }

    if (!ok) return
  }

  // Prefix values of clobbering properties (when both the list and the prefix
  // are configured).
  return state.schema.clobber &&
    state.schema.clobberPrefix &&
    state.schema.clobber.includes(key)
    ? state.schema.clobberPrefix + value
    : value
}
635
/**
 * Check whether `value` is a safe URL.
 *
 * @param {State} state
 *   Info passed around.
 * @param {string} key
 *   Field name.
 * @param {Readonly<unknown>} value
 *   Unsafe value.
 * @returns {boolean}
 *   Whether it’s a safe value: `true` when no protocols are configured for
 *   `key`, when the value has no protocol, or when its protocol is listed.
 */
function safeProtocol(state, key, value) {
  const all = state.schema.protocols
  const protocols = all && own.call(all, key) ? all[key] : undefined

  // No protocols defined? Then everything is fine.
  if (!protocols || protocols.length === 0) {
    return true
  }

  const url = String(value)
  const colon = url.indexOf(':')

  // No colon: no protocol.
  if (colon < 0) {
    return true
  }

  // If the first colon is after a `?`, `#`, or `/`, it’s not a protocol.
  for (const delimiter of ['/', '?', '#']) {
    const position = url.indexOf(delimiter)

    if (position > -1 && colon > position) {
      return true
    }
  }

  // What’s before the colon must exactly be one of the allowed protocols.
  for (let index = 0; index < protocols.length; index++) {
    const protocol = protocols[index]

    if (
      colon === protocol.length &&
      url.slice(0, protocol.length) === protocol
    ) {
      return true
    }
  }

  return false
}
690
/**
 * Add data and position.
 *
 * @param {Nodes} node
 *   Node to patch safe data and position on.
 * @param {Readonly<Record<string, Readonly<unknown>>>} unsafe
 *   Unsafe node-like value.
 * @returns {undefined}
 *   Nothing.
 */
function patch(node, unsafe) {
  const place = position(
    // @ts-expect-error: looks like a node.
    unsafe
  )
  const {data} = unsafe

  // Copy `data` instead of sharing it with the unsafe tree.
  if (data) {
    node.data = structuredClone(data)
  }

  if (place) {
    node.position = place
  }
}
713
/**
 * Find the definition for a property in a list of definitions.
 *
 * An entry whose name is exactly `key` wins; otherwise, for keys that start
 * with `data` (case-insensitive) followed by at least one more character,
 * the last `'data*'` entry is used.
 *
 * @param {Readonly<Array<PropertyDefinition>>} definitions
 *   Definitions to search.
 * @param {string} key
 *   Field name.
 * @returns {Readonly<PropertyDefinition> | undefined}
 *   Matching definition, if any.
 */
function findDefinition(definitions, key) {
  /** @type {PropertyDefinition | undefined} */
  let wildcard

  for (let index = 0; index < definitions.length; index++) {
    const definition = definitions[index]
    const name = typeof definition === 'string' ? definition : definition[0]

    if (name === key) {
      return definition
    }

    if (name === 'data*') {
      wildcard = definition
    }
  }

  const isData = key.length > 4 && key.slice(0, 4).toLowerCase() === 'data'

  return isData ? wildcard : undefined
}