1 | /**
|
2 | * @license Copyright (c) 2003-2022, CKSource Holding sp. z o.o. All rights reserved.
|
3 | * For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license
|
4 | */
|
5 |
|
6 | /**
|
7 | * @module paste-from-office/filters/parse
|
8 | */
|
9 |
|
10 | /* globals DOMParser */
|
11 |
|
12 | import { DomConverter, ViewDocument } from 'ckeditor5/src/engine';
|
13 |
|
14 | import { normalizeSpacing, normalizeSpacerunSpans } from './space';
|
15 |
|
/**
 * Parses the given HTML string and pulls out the contents of its `<body>` and `<style>` tags.
 *
 * @param {String} htmlString HTML string to be parsed.
 * @param {module:engine/view/stylesmap~StylesProcessor} stylesProcessor
 * @returns {Object} result
 * @returns {module:engine/view/documentfragment~DocumentFragment} result.body Parsed body
 * content as a traversable structure.
 * @returns {String} result.bodyString Entire body content as a string.
 * @returns {Array.<CSSStyleSheet>} result.styles Array of native `CSSStyleSheet` objects, each representing
 * a separate `style` tag from the source HTML.
 * @returns {String} result.stylesString Contents of all `style` tags combined, in the order of occurrence, into one string.
 */
export function parseHtml( htmlString, stylesProcessor ) {
	const parser = new DOMParser();

	// Word wraps some content in "if comments"; dropping the opening marker keeps the parser
	// from treating that content as a comment and omitting it.
	const withoutIfComments = htmlString.replace( /<!--\[if gte vml 1]>/g, '' );

	// Discard whatever follows `</body>` and normalize spacing before handing the string to the native parser.
	const preparedHtml = normalizeSpacing( cleanContentAfterBody( withoutIfComments ) );
	const htmlDocument = parser.parseFromString( preparedHtml, 'text/html' );

	normalizeSpacerunSpans( htmlDocument );

	// The body must be serialized before the view conversion, because the conversion
	// moves the body's child nodes out of the source document.
	const bodyString = htmlDocument.body.innerHTML;
	const body = documentToView( htmlDocument, stylesProcessor );
	const { styles, stylesString } = extractStyles( htmlDocument );

	return { body, bodyString, styles, stylesString };
}
|
58 |
|
// Converts the body of a native `Document` into a {@link module:engine/view/documentfragment~DocumentFragment}.
// Comment nodes are skipped during the conversion.
//
// Note that the body's child nodes are moved out of `htmlDocument` in the process.
//
// @param {Document} htmlDocument Native `Document` object to be transformed.
// @param {module:engine/view/stylesmap~StylesProcessor} stylesProcessor
// @returns {module:engine/view/documentfragment~DocumentFragment}
function documentToView( htmlDocument, stylesProcessor ) {
	const converter = new DomConverter( new ViewDocument( stylesProcessor ), { renderingMode: 'data' } );
	const domFragment = htmlDocument.createDocumentFragment();
	const { body } = htmlDocument;

	// Appending a node to the fragment detaches it from the body,
	// so `firstChild` advances on its own until the body is empty.
	while ( body.firstChild ) {
		domFragment.appendChild( body.firstChild );
	}

	return converter.domToView( domFragment, { skipComments: true } );
}
|
76 |
|
// Collects the `CSSStyleSheet` objects and the raw text of every `style` element in `htmlDocument`
// whose sheet contains at least one CSS rule. Elements without a sheet or with an empty rule list are ignored.
//
// @param {Document} htmlDocument Native `Document` object from which styles will be extracted.
// @returns {Object} result
// @returns {Array.<CSSStyleSheet>} result.styles Array of native `CSSStyleSheet` objects, each representing
// a separate `style` tag from the source object.
// @returns {String} result.stylesString Contents of the collected `style` tags joined with a single space,
// in the order of occurrence.
function extractStyles( htmlDocument ) {
	const styleElements = Array.from( htmlDocument.getElementsByTagName( 'style' ) );

	// Keep only the elements whose stylesheet actually holds rules.
	const elementsWithRules = styleElements.filter(
		( { sheet } ) => sheet && sheet.cssRules && sheet.cssRules.length
	);

	return {
		styles: elementsWithRules.map( element => element.sheet ),
		stylesString: elementsWithRules.map( element => element.innerHTML ).join( ' ' )
	};
}
|
101 |
|
// Drops any content found between the closing `</body>` tag and the closing `</html>` tag:
//
//		<html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>
//
// Needed because specific browsers (Edge) add some random content after the `body` tag when pasting from Word.
//
// A string without `</body>` is returned untouched. When no `</html>` follows `</body>`,
// everything after `</body>` is dropped.
//
// @param {String} htmlString The HTML string to be cleaned.
// @returns {String} The HTML string with leftover content removed.
function cleanContentAfterBody( htmlString ) {
	const bodyEndTag = '</body>';
	const bodyEndTagStart = htmlString.indexOf( bodyEndTag );

	if ( bodyEndTagStart === -1 ) {
		return htmlString;
	}

	// Position right after the first `</body>`; the search for `</html>` starts from here.
	const afterBody = bodyEndTagStart + bodyEndTag.length;
	const upToBodyEnd = htmlString.slice( 0, afterBody );
	const htmlEndTagStart = htmlString.indexOf( '</html>', afterBody );

	if ( htmlEndTagStart === -1 ) {
		return upToBodyEnd;
	}

	return upToBodyEnd + htmlString.slice( htmlEndTagStart );
}
|