// Retrieved from UNPKG (4.72 kB, JavaScript).
/**
 * @license Copyright (c) 2003-2022, CKSource Holding sp. z o.o. All rights reserved.
 * For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license
 */

/**
 * @module paste-from-office/filters/parse
 */

/* globals DOMParser */
11
12import { DomConverter, ViewDocument } from 'ckeditor5/src/engine';
13
14import { normalizeSpacing, normalizeSpacerunSpans } from './space';
15
/**
 * Parses provided HTML extracting contents of `<body>` and `<style>` tags.
 *
 * The input is first stripped of Word-specific conditional comment openers and of any content placed between
 * `</body>` and `</html>`, then spacing is normalized before the string is handed to the native `DOMParser`.
 *
 * @param {String} htmlString HTML string to be parsed.
 * @param {module:engine/view/stylesmap~StylesProcessor} stylesProcessor
 * @returns {Object} result
 * @returns {module:engine/view/documentfragment~DocumentFragment} result.body Parsed body
 * content as a traversable structure.
 * @returns {String} result.bodyString Entire body content as a string.
 * @returns {Array.<CSSStyleSheet>} result.styles Array of native `CSSStyleSheet` objects, each representing
 * separate `style` tag from the source HTML.
 * @returns {String} result.stylesString All `style` tags contents combined in the order of occurrence into one string.
 */
export function parseHtml( htmlString, stylesProcessor ) {
	const domParser = new DOMParser();

	// Remove Word specific "if comments" so content inside is not omitted by the parser.
	const htmlWithoutIfComments = htmlString.replace( /<!--\[if gte vml 1]>/g, '' );

	const normalizedHtml = normalizeSpacing( cleanContentAfterBody( htmlWithoutIfComments ) );

	// Parse the normalized HTML as a native Document object.
	const htmlDocument = domParser.parseFromString( normalizedHtml, 'text/html' );

	normalizeSpacerunSpans( htmlDocument );

	// Get `innerHTML` first as transforming to View modifies the source document
	// (the body's child nodes are moved out of it).
	const bodyString = htmlDocument.body.innerHTML;

	// Transform document.body to View.
	const bodyView = documentToView( htmlDocument, stylesProcessor );

	// Extract stylesheets. This intentionally runs after the body was converted, as in the original flow.
	const stylesObject = extractStyles( htmlDocument );

	return {
		body: bodyView,
		bodyString,
		styles: stylesObject.styles,
		stylesString: stylesObject.stylesString
	};
}
58
// Transforms native `Document` object into {@link module:engine/view/documentfragment~DocumentFragment}. Comments are skipped.
//
// Note: every child node of `htmlDocument.body` is moved into a temporary native fragment, so the body
// of the passed document is empty once this function returns.
//
// @param {Document} htmlDocument Native `Document` object to be transformed.
// @param {module:engine/view/stylesmap~StylesProcessor} stylesProcessor
// @returns {module:engine/view/documentfragment~DocumentFragment}
function documentToView( htmlDocument, stylesProcessor ) {
	const viewDocument = new ViewDocument( stylesProcessor );
	const domConverter = new DomConverter( viewDocument, { renderingMode: 'data' } );
	const fragment = htmlDocument.createDocumentFragment();
	const nodes = htmlDocument.body.childNodes;

	// This loop relies on `childNodes` being a live list: appending the first node to the fragment
	// removes it from the body, so `nodes[ 0 ]` always points at the next node still to be moved.
	while ( nodes.length > 0 ) {
		fragment.appendChild( nodes[ 0 ] );
	}

	return domConverter.domToView( fragment, { skipComments: true } );
}
76
// Extracts both `CSSStyleSheet` and string representation from all `style` elements available in a provided `htmlDocument`.
//
// A `style` element is taken into account only when it exposes a sheet with at least one CSS rule;
// elements without a sheet or with an empty rule list are skipped in both returned values.
//
// @param {Document} htmlDocument Native `Document` object from which styles will be extracted.
// @returns {Object} result
// @returns {Array.<CSSStyleSheet>} result.styles Array of native `CSSStyleSheet` object, each representing
// separate `style` tag from the source object.
// @returns {String} result.stylesString All `style` tags contents combined in the order of occurrence as one string.
function extractStyles( htmlDocument ) {
	const styles = [];
	const stylesContents = [];

	// Copy the collection into a plain array so it can be iterated with `for...of`.
	const styleTags = Array.from( htmlDocument.getElementsByTagName( 'style' ) );

	for ( const style of styleTags ) {
		const sheet = style.sheet;

		// Skip tags that did not produce a usable stylesheet.
		if ( !sheet || !sheet.cssRules || !sheet.cssRules.length ) {
			continue;
		}

		styles.push( sheet );
		stylesContents.push( style.innerHTML );
	}

	return {
		styles,
		stylesString: stylesContents.join( ' ' )
	};
}
101
// Removes leftover content from between closing </body> and closing </html> tag:
//
//		<html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>
//
// This function is used as specific browsers (Edge) add some random content after `body` tag when pasting from Word.
//
// The first `</body>` occurrence is used as the cut point. When no `</body>` is present the string is returned
// unchanged; when `</body>` is present but no `</html>` follows it, everything after `</body>` is dropped.
//
// @param {String} htmlString The HTML string to be cleaned.
// @returns {String} The HTML string with leftover content removed.
function cleanContentAfterBody( htmlString ) {
	const bodyCloseTag = '</body>';
	const htmlCloseTag = '</html>';

	const bodyCloseIndex = htmlString.indexOf( bodyCloseTag );

	if ( bodyCloseIndex < 0 ) {
		return htmlString;
	}

	// Offset of the first character right after the closing `</body>` tag.
	const bodyEnd = bodyCloseIndex + bodyCloseTag.length;

	// Look for `</html>` only after `</body>`, so an earlier occurrence is never picked up.
	const htmlCloseIndex = htmlString.indexOf( htmlCloseTag, bodyEnd );

	return htmlString.substring( 0, bodyEnd ) +
		( htmlCloseIndex >= 0 ? htmlString.substring( htmlCloseIndex ) : '' );
}