UNPKG

@ckeditor/ckeditor5-paste-from-office/src/filters/parse.js

Version:

4.14 kBJavaScriptView Raw

/**
* @license Copyright (c) 2003-2024, CKSource Holding sp. z o.o. All rights reserved.
* For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license
*/
/**
* @module paste-from-office/filters/parse
*/
/* globals DOMParser */
import { DomConverter, ViewDocument } from 'ckeditor5/src/engine.js';
import { normalizeSpacing, normalizeSpacerunSpans } from './space.js';
/**
 * Parses the provided HTML extracting contents of `<body>` and `<style>` tags.
 *
 * @param htmlString HTML string to be parsed.
 * @param stylesProcessor Forwarded unchanged to `documentToView()`, which passes it to the `ViewDocument` constructor.
 * @returns An object with four properties:
 * `body` (the result of `documentToView()`), `bodyString` (the `innerHTML` of the parsed `<body>`),
 * `styles` and `stylesString` (both produced by `extractStyles()`).
 */
export function parseHtml(htmlString, stylesProcessor) {
  const domParser = new DOMParser();

  // Remove Word specific "if comments" so content inside is not omitted by the parser.
  const withoutVmlComments = htmlString.replace(/<!--\[if gte vml 1]>/g, '');

  // Clean the <head> section of MS Windows specific tags. See https://github.com/ckeditor/ckeditor5/issues/15333.
  // The regular expression matches the <o:SmartTagType> tag with optional attributes (with or without values).
  const withoutSmartTags = withoutVmlComments.replace(/<o:SmartTagType(?:\s+[^\s>=]+(?:="[^"]*")?)*\s*\/?>/gi, '');

  // Drop anything between </body> and </html> first, then run the spacing normalization from './space.js'.
  const normalizedHtml = normalizeSpacing(cleanContentAfterBody(withoutSmartTags));

  // Parse the cleaned string as a native Document object.
  const htmlDocument = domParser.parseFromString(normalizedHtml, 'text/html');

  normalizeSpacerunSpans(htmlDocument);

  // Get `innerHTML` first as transforming to View modifies the source document.
  const bodyString = htmlDocument.body.innerHTML;

  // Transform document.body to View.
  const bodyView = documentToView(htmlDocument, stylesProcessor);

  // Extract stylesheets.
  const stylesObject = extractStyles(htmlDocument);

  return {
    body: bodyView,
    bodyString,
    styles: stylesObject.styles,
    stylesString: stylesObject.stylesString
  };
}
/**
 * Transforms native `Document` object into {@link module:engine/view/documentfragment~DocumentFragment}. Comments are skipped.
 *
 * Note that this empties `htmlDocument.body`: every child node is moved into a temporary DOM fragment
 * before the conversion.
 *
 * @param htmlDocument Native `Document` object to be transformed.
 * @param stylesProcessor Passed as the only argument to the `ViewDocument` constructor.
 * @returns The value returned by `DomConverter#domToView()` for the fragment holding the former `<body>` children.
 */
function documentToView(htmlDocument, stylesProcessor) {
  const viewDocument = new ViewDocument(stylesProcessor);
  const domConverter = new DomConverter(viewDocument, { renderingMode: 'data' });
  const fragment = htmlDocument.createDocumentFragment();
  const nodes = htmlDocument.body.childNodes;

  // `appendChild()` moves a node out of its current parent, so the `childNodes` list shrinks
  // on every iteration and `nodes[0]` is always the next node still left in <body>.
  while (nodes.length > 0) {
    fragment.appendChild(nodes[0]);
  }

  return domConverter.domToView(fragment, { skipComments: true });
}
/**
 * Extracts both `CSSStyleSheet` and string representation from all `style` elements available in a provided `htmlDocument`.
 *
 * Only `style` elements whose `sheet` exists and holds at least one CSS rule are taken into account;
 * the remaining ones are ignored in both outputs.
 *
 * @param htmlDocument Native `Document` object from which styles will be extracted.
 * @returns An object with `styles` (array of the `sheet` objects, in document order) and
 * `stylesString` (the `innerHTML` of the same elements joined with a single space).
 */
function extractStyles(htmlDocument) {
  const styleElements = Array.from(htmlDocument.getElementsByTagName('style'));

  // Skip elements without a parsed stylesheet or with an empty rule list.
  const elementsWithRules = styleElements.filter(
    style => style.sheet && style.sheet.cssRules && style.sheet.cssRules.length
  );

  return {
    styles: elementsWithRules.map(style => style.sheet),
    stylesString: elementsWithRules.map(style => style.innerHTML).join(' ')
  };
}
/**
 * Removes leftover content from between closing </body> and closing </html> tag:
 *
 * ```html
 * <html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>
 * ```
 *
 * This function is used as specific browsers (Edge) add some random content after `body` tag when pasting from Word.
 *
 * The closing tags are matched case-insensitively and may contain whitespace before `>`
 * (e.g. `</BODY>` or `</body >`). The first closing `body` tag in the string is used.
 * When there is no closing `body` tag, the input is returned untouched. When there is no closing `html` tag
 * after it, everything following the closing `body` tag is dropped.
 *
 * @param htmlString The HTML string to be cleaned.
 * @returns The HTML string with leftover content removed.
 */
function cleanContentAfterBody(htmlString) {
  const bodyCloseMatch = /<\/body\s*>/i.exec(htmlString);

  if (!bodyCloseMatch) {
    return htmlString;
  }

  const bodyEndIndex = bodyCloseMatch.index + bodyCloseMatch[0].length;
  const afterBody = htmlString.slice(bodyEndIndex);

  // Look for the closing `html` tag only in the part that follows the closing `body` tag.
  const htmlCloseOffset = afterBody.search(/<\/html\s*>/i);

  return htmlString.slice(0, bodyEndIndex) +
    (htmlCloseOffset >= 0 ? afterBody.slice(htmlCloseOffset) : '');
}

1	`/**`
2	`* @license Copyright (c) 2003-2024, CKSource Holding sp. z o.o. All rights reserved.`
3	`* For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license`
4	`*/`
5	`/**`
6	`* @module paste-from-office/filters/parse`
7	`*/`
8	`/* globals DOMParser */`
9	`import { DomConverter, ViewDocument } from 'ckeditor5/src/engine.js';`
10	`import { normalizeSpacing, normalizeSpacerunSpans } from './space.js';`
11	`/**`
12	* Parses the provided HTML extracting contents of `<body>` and `<style>` tags.
13	`*`
14	`* @param htmlString HTML string to be parsed.`
15	`*/`
16	`export function parseHtml(htmlString, stylesProcessor) {`
17	`const domParser = new DOMParser();`
18	`// Remove Word specific "if comments" so content inside is not omitted by the parser.`
19	`htmlString = htmlString.replace(/<!--\[if gte vml 1]>/g, '');`
20	`// Clean the <head> section of MS Windows specific tags. See https://github.com/ckeditor/ckeditor5/issues/15333.`
21	`// The regular expression matches the <o:SmartTagType> tag with optional attributes (with or without values).`
22	`htmlString = htmlString.replace(/<o:SmartTagType(?:\s+[^\s>=]+(?:="[^"]*")?)*\s*\/?>/gi, '');`
23	`const normalizedHtml = normalizeSpacing(cleanContentAfterBody(htmlString));`
24	`// Parse htmlString as native Document object.`
25	`const htmlDocument = domParser.parseFromString(normalizedHtml, 'text/html');`
26	`normalizeSpacerunSpans(htmlDocument);`
27	// Get `innerHTML` first as transforming to View modifies the source document.
28	`const bodyString = htmlDocument.body.innerHTML;`
29	`// Transform document.body to View.`
30	`const bodyView = documentToView(htmlDocument, stylesProcessor);`
31	`// Extract stylesheets.`
32	`const stylesObject = extractStyles(htmlDocument);`
33	`return {`
34	`body: bodyView,`
35	`bodyString,`
36	`styles: stylesObject.styles,`
37	`stylesString: stylesObject.stylesString`
38	`};`
39	`}`
40	`/**`
41	* Transforms native `Document` object into {@link module:engine/view/documentfragment~DocumentFragment}. Comments are skipped.
42	`*`
43	* @param htmlDocument Native `Document` object to be transformed.
44	`*/`
45	`function documentToView(htmlDocument, stylesProcessor) {`
46	`const viewDocument = new ViewDocument(stylesProcessor);`
47	`const domConverter = new DomConverter(viewDocument, { renderingMode: 'data' });`
48	`const fragment = htmlDocument.createDocumentFragment();`
49	`const nodes = htmlDocument.body.childNodes;`
50	`while (nodes.length > 0) {`
51	`fragment.appendChild(nodes[0]);`
52	`}`
53	`return domConverter.domToView(fragment, { skipComments: true });`
54	`}`
55	`/**`
56	* Extracts both `CSSStyleSheet` and string representation from all `style` elements available in a provided `htmlDocument`.
57	`*`
58	* @param htmlDocument Native `Document` object from which styles will be extracted.
59	`*/`
60	`function extractStyles(htmlDocument) {`
61	`const styles = [];`
62	`const stylesString = [];`
63	`const styleTags = Array.from(htmlDocument.getElementsByTagName('style'));`
64	`for (const style of styleTags) {`
65	`if (style.sheet && style.sheet.cssRules && style.sheet.cssRules.length) {`
66	`styles.push(style.sheet);`
67	`stylesString.push(style.innerHTML);`
68	`}`
69	`}`
70	`return {`
71	`styles,`
72	`stylesString: stylesString.join(' ')`
73	`};`
74	`}`
75	`/**`
76	`* Removes leftover content from between closing </body> and closing </html> tag:`
77	`*`
78	* ```html
79	`* <html><body><p>Foo Bar</p></body><span>Fo</span></html> -> <html><body><p>Foo Bar</p></body></html>`
80	* ```
81	`*`
82	* This function is used as specific browsers (Edge) add some random content after `body` tag when pasting from Word.
83	`* @param htmlString The HTML string to be cleaned.`
84	`* @returns The HTML string with leftover content removed.`
85	`*/`
86	`function cleanContentAfterBody(htmlString) {`
87	`const bodyCloseTag = '</body>';`
88	`const htmlCloseTag = '</html>';`
89	`const bodyCloseIndex = htmlString.indexOf(bodyCloseTag);`
90	`if (bodyCloseIndex < 0) {`
91	`return htmlString;`
92	`}`
93	`const htmlCloseIndex = htmlString.indexOf(htmlCloseTag, bodyCloseIndex + bodyCloseTag.length);`
94	`return htmlString.substring(0, bodyCloseIndex + bodyCloseTag.length) +`
95	`(htmlCloseIndex >= 0 ? htmlString.substring(htmlCloseIndex) : '');`
96	`}`