UNPKG

4.14 kBJavaScriptView Raw
1/**
2 * @license Copyright (c) 2003-2024, CKSource Holding sp. z o.o. All rights reserved.
3 * For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license
4 */
5/**
6 * @module paste-from-office/filters/parse
7 */
8/* globals DOMParser */
9import { DomConverter, ViewDocument } from 'ckeditor5/src/engine.js';
10import { normalizeSpacing, normalizeSpacerunSpans } from './space.js';
/**
 * Parses an HTML string (as pasted from an office application) and returns the contents of its
 * `<body>` together with the stylesheets found in `<style>` elements.
 *
 * @param htmlString HTML string to be parsed.
 * @param stylesProcessor Styles processor forwarded to the view document used for the DOM -> view conversion.
 * @returns An object with `body` (the view produced from the document body), `bodyString` (the body `innerHTML`
 * captured before the conversion), `styles` (collected style sheets) and `stylesString` (their joined source text).
 */
export function parseHtml(htmlString, stylesProcessor) {
	const domParser = new DOMParser();

	const sanitizedHtml = htmlString
		// Strip Word-specific conditional comment openers, otherwise the parser drops what they wrap.
		.replace(/<!--\[if gte vml 1]>/g, '')
		// Drop MS Windows specific <o:SmartTagType> tags (with or without attributes) from the <head>.
		// See https://github.com/ckeditor/ckeditor5/issues/15333.
		.replace(/<o:SmartTagType(?:\s+[^\s>=]+(?:="[^"]*")?)*\s*\/?>/gi, '');

	// Build a native Document out of the cleaned-up and spacing-normalized markup.
	const htmlDocument = domParser.parseFromString(
		normalizeSpacing(cleanContentAfterBody(sanitizedHtml)),
		'text/html'
	);

	normalizeSpacerunSpans(htmlDocument);

	// The order below matters: the view conversion moves nodes out of the body,
	// so the string snapshot has to be taken first.
	const bodyString = htmlDocument.body.innerHTML;
	const body = documentToView(htmlDocument, stylesProcessor);
	const { styles, stylesString } = extractStyles(htmlDocument);

	return { body, bodyString, styles, stylesString };
}
/**
 * Converts the body of a native `Document` into a view structure, skipping comment nodes.
 *
 * Note that all child nodes are moved out of `htmlDocument.body` in the process, so the body
 * is empty once this function returns.
 *
 * @param htmlDocument Native `Document` object to be transformed.
 * @param stylesProcessor Styles processor passed to the `ViewDocument` created for the conversion.
 * @returns The result of `DomConverter#domToView()` called on a fragment holding the former body children.
 */
function documentToView(htmlDocument, stylesProcessor) {
	const converter = new DomConverter(
		new ViewDocument(stylesProcessor),
		{ renderingMode: 'data' }
	);
	const domFragment = htmlDocument.createDocumentFragment();
	const bodyElement = htmlDocument.body;

	// Appending a node to the fragment detaches it from the body, so the body eventually runs out of children.
	while (bodyElement.firstChild) {
		domFragment.appendChild(bodyElement.firstChild);
	}

	return converter.domToView(domFragment, { skipComments: true });
}
/**
 * Collects the style sheets of all `<style>` elements of the given document, both as sheet objects
 * and as a single string. Elements without a sheet, or whose sheet has no CSS rules, are ignored.
 *
 * @param htmlDocument Native `Document` object from which styles will be extracted.
 * @returns An object with `styles` (array of the elements' `sheet` objects) and `stylesString`
 * (the `innerHTML` of the same elements joined with a single space).
 */
function extractStyles(htmlDocument) {
	const styleElements = Array.from(htmlDocument.getElementsByTagName('style'));

	// Keep only the elements whose sheet actually carries at least one rule.
	const populatedElements = styleElements.filter(element => {
		const sheet = element.sheet;

		return Boolean(sheet && sheet.cssRules && sheet.cssRules.length);
	});

	return {
		styles: populatedElements.map(element => element.sheet),
		stylesString: populatedElements.map(element => element.innerHTML).join(' ')
	};
}
/**
 * Drops whatever sits between the first closing `</body>` tag and the closing `</html>` tag that follows it:
 *
 * ```html
 * <html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>
 * ```
 *
 * Some browsers (Edge) append stray content after the `body` tag when pasting from Word.
 *
 * A string without `</body>` is returned untouched. When no `</html>` follows `</body>`,
 * everything after `</body>` is removed.
 *
 * @param htmlString The HTML string to be cleaned.
 * @returns The HTML string with leftover content removed.
 */
function cleanContentAfterBody(htmlString) {
	const bodyEndTag = '</body>';
	const htmlEndTag = '</html>';

	const bodyEndTagStart = htmlString.indexOf(bodyEndTag);

	if (bodyEndTagStart === -1) {
		return htmlString;
	}

	// Everything up to and including `</body>` is always preserved.
	const keptLength = bodyEndTagStart + bodyEndTag.length;
	const leadingPart = htmlString.slice(0, keptLength);

	const htmlEndTagStart = htmlString.indexOf(htmlEndTag, keptLength);

	if (htmlEndTagStart === -1) {
		return leadingPart;
	}

	return leadingPart + htmlString.slice(htmlEndTagStart);
}