UNPKG

@ckeditor/ckeditor5-paste-from-office/src/filters/parse.js

Version:

4.72 kB | JavaScript | View Raw

1/**
2 * @license Copyright (c) 2003-2022, CKSource Holding sp. z o.o. All rights reserved.
3 * For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license
4 */
5
6/**
7 * @module paste-from-office/filters/parse
8 */
9
10/* globals DOMParser */
11
12import { DomConverter, ViewDocument } from 'ckeditor5/src/engine';
13
14import { normalizeSpacing, normalizeSpacerunSpans } from './space';
15
/**
 * Converts an HTML string into the pieces used by the paste-from-office filters: the `<body>` content
 * (as a view fragment and as a raw string) and the stylesheets found in `<style>` elements.
 *
 * @param {String} htmlString HTML string to be parsed.
 * @param {module:engine/view/stylesmap~StylesProcessor} stylesProcessor Styles processor passed on to the
 * view document created for the conversion.
 * @returns {Object} result
 * @returns {module:engine/view/documentfragment~DocumentFragment} result.body Body content converted
 * to a traversable view structure.
 * @returns {String} result.bodyString `innerHTML` of the parsed body, captured before the conversion to view.
 * @returns {Array.<CSSStyleSheet>} result.styles Native `CSSStyleSheet` objects, one per collected
 * `style` element of the source HTML.
 * @returns {String} result.stylesString Contents of the collected `style` elements, joined in the order
 * of occurrence into one string.
 */
export function parseHtml( htmlString, stylesProcessor ) {
	const parser = new DOMParser();

	// Strip the opening marker of Word's conditional "if comments"; otherwise the parser would treat
	// the wrapped content as a comment and omit it.
	const withoutIfComments = htmlString.replace( /<!--\[if gte vml 1]>/g, '' );
	const preparedHtml = normalizeSpacing( cleanContentAfterBody( withoutIfComments ) );

	// Build a native `Document` out of the prepared markup.
	const parsedDocument = parser.parseFromString( preparedHtml, 'text/html' );

	normalizeSpacerunSpans( parsedDocument );

	// The order of the three steps below matters: converting to view moves the body children out of
	// the document, so the raw body string has to be read first, and styles are collected last.
	const bodyString = parsedDocument.body.innerHTML;
	const body = documentToView( parsedDocument, stylesProcessor );
	const { styles, stylesString } = extractStyles( parsedDocument );

	return { body, bodyString, styles, stylesString };
}
58
// Converts the content of the native document's `<body>` into
// {@link module:engine/view/documentfragment~DocumentFragment}. Comments are skipped.
//
// Note that the body's child nodes are moved out of `htmlDocument` in the process.
//
// @param {Document} htmlDocument Native `Document` object to be transformed.
// @param {module:engine/view/stylesmap~StylesProcessor} stylesProcessor
// @returns {module:engine/view/documentfragment~DocumentFragment}
function documentToView( htmlDocument, stylesProcessor ) {
	const converter = new DomConverter( new ViewDocument( stylesProcessor ), { renderingMode: 'data' } );
	const domFragment = htmlDocument.createDocumentFragment();
	const { body } = htmlDocument;

	// Appending a node to the fragment detaches it from the body, so this loop drains the body
	// child by child, keeping the document order.
	while ( body.firstChild ) {
		domFragment.appendChild( body.firstChild );
	}

	return converter.domToView( domFragment, { skipComments: true } );
}
76
// Collects the stylesheets of all `style` elements found in `htmlDocument`, both as native `CSSStyleSheet`
// objects and as text. Elements without a sheet, or whose sheet has no CSS rules, are left out.
//
// @param {Document} htmlDocument Native `Document` object from which styles will be extracted.
// @returns {Object} result
// @returns {Array.<CSSStyleSheet>} result.styles Native `CSSStyleSheet` objects, one per collected `style` element.
// @returns {String} result.stylesString `innerHTML` of the collected `style` elements, joined with a single
// space in the order of occurrence.
function extractStyles( htmlDocument ) {
	const elementsWithRules = Array.from( htmlDocument.getElementsByTagName( 'style' ) )
		.filter( element => element.sheet && element.sheet.cssRules && element.sheet.cssRules.length );

	return {
		styles: elementsWithRules.map( element => element.sheet ),
		stylesString: elementsWithRules.map( element => element.innerHTML ).join( ' ' )
	};
}
101
// Removes leftover content from between closing </body> and closing </html> tag:
//
// 		<html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>
//
// This function is used as specific browsers (Edge) add some random content after `body` tag when pasting from Word.
//
// Closing tags are matched the way an HTML parser recognizes them: case-insensitively and with optional
// whitespace before `>` (so `</BODY>` and `</body >` are handled too). When there is no closing `</body>`
// the input is returned untouched; when there is no closing `</html>` after it, everything following
// `</body>` is dropped.
//
// @param {String} htmlString The HTML string to be cleaned.
// @returns {String} The HTML string with leftover content removed.
function cleanContentAfterBody( htmlString ) {
	const bodyCloseMatch = /<\/body\s*>/i.exec( htmlString );

	if ( !bodyCloseMatch ) {
		return htmlString;
	}

	// Index of the first character following the first closing body tag.
	const bodyEndIndex = bodyCloseMatch.index + bodyCloseMatch[ 0 ].length;
	const afterBody = htmlString.substring( bodyEndIndex );

	// Look for the closing html tag only behind the body end, the same as the body tag was searched.
	const htmlCloseOffset = afterBody.search( /<\/html\s*>/i );

	return htmlString.substring( 0, bodyEndIndex ) +
		( htmlCloseOffset >= 0 ? afterBody.substring( htmlCloseOffset ) : '' );
}

1	`/**`
2	`* @license Copyright (c) 2003-2022, CKSource Holding sp. z o.o. All rights reserved.`
3	`* For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license`
4	`*/`
5
6	`/**`
7	`* @module paste-from-office/filters/parse`
8	`*/`
9
10	`/* globals DOMParser */`
11
12	`import { DomConverter, ViewDocument } from 'ckeditor5/src/engine';`
13
14	`import { normalizeSpacing, normalizeSpacerunSpans } from './space';`
15
/**
 * Converts an HTML string into the pieces used by the paste-from-office filters: the `<body>` content
 * (as a view fragment and as a raw string) and the stylesheets found in `<style>` elements.
 *
 * @param {String} htmlString HTML string to be parsed.
 * @param {module:engine/view/stylesmap~StylesProcessor} stylesProcessor Styles processor passed on to the
 * view document created for the conversion.
 * @returns {Object} result
 * @returns {module:engine/view/documentfragment~DocumentFragment} result.body Body content converted
 * to a traversable view structure.
 * @returns {String} result.bodyString `innerHTML` of the parsed body, captured before the conversion to view.
 * @returns {Array.<CSSStyleSheet>} result.styles Native `CSSStyleSheet` objects, one per collected
 * `style` element of the source HTML.
 * @returns {String} result.stylesString Contents of the collected `style` elements, joined in the order
 * of occurrence into one string.
 */
export function parseHtml( htmlString, stylesProcessor ) {
	const parser = new DOMParser();

	// Strip the opening marker of Word's conditional "if comments"; otherwise the parser would treat
	// the wrapped content as a comment and omit it.
	const withoutIfComments = htmlString.replace( /<!--\[if gte vml 1]>/g, '' );
	const preparedHtml = normalizeSpacing( cleanContentAfterBody( withoutIfComments ) );

	// Build a native `Document` out of the prepared markup.
	const parsedDocument = parser.parseFromString( preparedHtml, 'text/html' );

	normalizeSpacerunSpans( parsedDocument );

	// The order of the three steps below matters: converting to view moves the body children out of
	// the document, so the raw body string has to be read first, and styles are collected last.
	const bodyString = parsedDocument.body.innerHTML;
	const body = documentToView( parsedDocument, stylesProcessor );
	const { styles, stylesString } = extractStyles( parsedDocument );

	return { body, bodyString, styles, stylesString };
}
58
// Converts the content of the native document's `<body>` into
// {@link module:engine/view/documentfragment~DocumentFragment}. Comments are skipped.
//
// Note that the body's child nodes are moved out of `htmlDocument` in the process.
//
// @param {Document} htmlDocument Native `Document` object to be transformed.
// @param {module:engine/view/stylesmap~StylesProcessor} stylesProcessor
// @returns {module:engine/view/documentfragment~DocumentFragment}
function documentToView( htmlDocument, stylesProcessor ) {
	const converter = new DomConverter( new ViewDocument( stylesProcessor ), { renderingMode: 'data' } );
	const domFragment = htmlDocument.createDocumentFragment();
	const { body } = htmlDocument;

	// Appending a node to the fragment detaches it from the body, so this loop drains the body
	// child by child, keeping the document order.
	while ( body.firstChild ) {
		domFragment.appendChild( body.firstChild );
	}

	return converter.domToView( domFragment, { skipComments: true } );
}
76
// Collects the stylesheets of all `style` elements found in `htmlDocument`, both as native `CSSStyleSheet`
// objects and as text. Elements without a sheet, or whose sheet has no CSS rules, are left out.
//
// @param {Document} htmlDocument Native `Document` object from which styles will be extracted.
// @returns {Object} result
// @returns {Array.<CSSStyleSheet>} result.styles Native `CSSStyleSheet` objects, one per collected `style` element.
// @returns {String} result.stylesString `innerHTML` of the collected `style` elements, joined with a single
// space in the order of occurrence.
function extractStyles( htmlDocument ) {
	const elementsWithRules = Array.from( htmlDocument.getElementsByTagName( 'style' ) )
		.filter( element => element.sheet && element.sheet.cssRules && element.sheet.cssRules.length );

	return {
		styles: elementsWithRules.map( element => element.sheet ),
		stylesString: elementsWithRules.map( element => element.innerHTML ).join( ' ' )
	};
}
101
// Removes leftover content from between closing </body> and closing </html> tag:
//
// 		<html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>
//
// This function is used as specific browsers (Edge) add some random content after `body` tag when pasting from Word.
//
// Closing tags are matched the way an HTML parser recognizes them: case-insensitively and with optional
// whitespace before `>` (so `</BODY>` and `</body >` are handled too). When there is no closing `</body>`
// the input is returned untouched; when there is no closing `</html>` after it, everything following
// `</body>` is dropped.
//
// @param {String} htmlString The HTML string to be cleaned.
// @returns {String} The HTML string with leftover content removed.
function cleanContentAfterBody( htmlString ) {
	const bodyCloseMatch = /<\/body\s*>/i.exec( htmlString );

	if ( !bodyCloseMatch ) {
		return htmlString;
	}

	// Index of the first character following the first closing body tag.
	const bodyEndIndex = bodyCloseMatch.index + bodyCloseMatch[ 0 ].length;
	const afterBody = htmlString.substring( bodyEndIndex );

	// Look for the closing html tag only behind the body end, the same as the body tag was searched.
	const htmlCloseOffset = afterBody.search( /<\/html\s*>/i );

	return htmlString.substring( 0, bodyEndIndex ) +
		( htmlCloseOffset >= 0 ? afterBody.substring( htmlCloseOffset ) : '' );
}