UNPKG

16.3 kBJavaScriptView Raw
1/**
2 * @license Copyright (c) 2003-2022, CKSource Holding sp. z o.o. All rights reserved.
3 * For licensing, see LICENSE.md or https://ckeditor.com/legal/ckeditor-oss-license
4 */
5
6/**
7 * @module paste-from-office/filters/list
8 */
9
10import { Matcher, UpcastWriter } from 'ckeditor5/src/engine';
11
/**
 * Rebuilds semantic HTML lists out of the list-like block elements produced by Word.
 *
 * Word does not output list containers. Each list item is a block element that carries
 * a special `mso-list` style, for example:
 *
 *		<p class=MsoListParagraphCxSpFirst style='mso-list:l1 level1 lfo1'>...</p>	// Paragraph based list.
 *		<h1 style='mso-list:l0 level1 lfo1'>...</h1>					// Heading 1 based list.
 *
 * Every such element is renamed to `<li>` and appended to a list element that is created
 * (or looked up among the ancestors) for it. The passed document fragment is modified in place.
 *
 * @param {module:engine/view/documentfragment~DocumentFragment} documentFragment The view structure to be transformed.
 * @param {String} stylesString Styles from which list-like elements styling will be extracted.
 */
export function transformListItemLikeElementsIntoLists( documentFragment, stylesString ) {
	if ( !documentFragment.childCount ) {
		return;
	}

	const writer = new UpcastWriter( documentFragment.document );
	const items = findAllItemLikeElements( documentFragment, writer );

	if ( !items.length ) {
		return;
	}

	// The list element receiving the next item and the indentation level tracked for it.
	let targetList = null;
	let targetDepth = 1;

	for ( let index = 0; index < items.length; index++ ) {
		const item = items[ index ];
		const precedingItem = items[ index - 1 ];

		const startsNewList = isNewListNeeded( precedingItem, item );
		// When a new list starts, the indentation is measured against the first level instead of the preceding item.
		const indentChange = getIndentationDifference( startsNewList ? null : precedingItem, item );

		if ( startsNewList ) {
			targetList = null;
			targetDepth = 1;
		}

		if ( !targetList || indentChange !== 0 ) {
			const listStyle = detectListStyle( item, stylesString );

			if ( !targetList ) {
				// No list to continue - create one right after the item-like element.
				targetList = insertNewEmptyList( listStyle, item.element, writer );
			} else if ( item.indent > targetDepth ) {
				// Deeper item - the nested list goes into the last item of the current list, after its last child.
				const lastItem = targetList.getChild( targetList.childCount - 1 );
				const anchor = lastItem.getChild( lastItem.childCount - 1 );

				targetList = insertNewEmptyList( listStyle, anchor, writer );
				targetDepth += 1;
			} else if ( item.indent < targetDepth ) {
				// Shallower item - climb back to the ancestor list matching the indentation.
				targetList = findParentListAtLevel( targetList, targetDepth - item.indent );
				targetDepth = parseInt( item.indent );
			}

			// Make sure the list that is about to receive the item has the element name detected for that item.
			if ( item.indent <= targetDepth && !targetList.is( 'element', listStyle.type ) ) {
				targetList = writer.rename( listStyle.type, targetList );
			}
		}

		writer.appendChild( transformElementIntoListItem( item.element, writer ), targetList );
	}
}
78
/**
 * Walks the given document fragment and unwraps a paragraph that is the first child of a list item,
 * so the content of that paragraph ends up directly inside the `<li>` element.
 *
 * @param {module:engine/view/documentfragment~DocumentFragment} documentFragment
 * @param {module:engine/view/upcastwriter~UpcastWriter} writer
 */
export function unwrapParagraphInListItem( documentFragment, writer ) {
	for ( const step of writer.createRangeIn( documentFragment ) ) {
		const node = step.item;

		if ( !node.is( 'element', 'li' ) ) {
			continue;
		}

		// Google Docs allows for single paragraph inside LI.
		const leadingChild = node.getChild( 0 );

		if ( leadingChild && leadingChild.is( 'element', 'p' ) ) {
			writer.unwrapElement( leadingChild );
		}
	}
}
99
// Finds all list-like elements in a given document fragment.
//
// An element is considered list-like when it is a paragraph (`<p>`) or a heading (`<h1>`, `<h2>`, ...)
// that has the `mso-list` style set to any value.
//
// @param {module:engine/view/documentfragment~DocumentFragment} documentFragment Document fragment
// in which to look for list-like nodes.
// @param {module:engine/view/upcastwriter~UpcastWriter} writer
// @returns {Array.<Object>} Array of found list-like items, in the order in which they were found.
// Each item is an object containing:
//
//		* {module:engine/src/view/element~Element} element List-like element.
//		* {String} id List item id parsed from `mso-list` style (see `getListItemData()` function).
//		* {String} order List item creation order parsed from `mso-list` style (see `getListItemData()` function).
//		* {String} indent List item indentation level parsed from `mso-list` style (see `getListItemData()` function).
//
// The `id`, `order` and `indent` properties are `undefined` when `getListItemData()` could not parse the style.
function findAllItemLikeElements( documentFragment, writer ) {
	const range = writer.createRangeIn( documentFragment );

	// Matcher for finding list-like elements.
	//
	// The alternation must be grouped so that both anchors apply to both branches. An ungrouped
	// `/^p|h\d+$/` reads as "starts with `p`" OR "ends with `h<digits>`" and would also accept
	// element names such as `pre`.
	const itemLikeElementsMatcher = new Matcher( {
		name: /^(?:p|h\d+)$/,
		styles: {
			'mso-list': /.*/
		}
	} );

	const itemLikeElements = [];

	for ( const value of range ) {
		// Each element is visited when it is entered, so it is collected exactly once.
		if ( value.type === 'elementStart' && itemLikeElementsMatcher.match( value.item ) ) {
			const itemData = getListItemData( value.item );

			itemLikeElements.push( {
				element: value.item,
				id: itemData.id,
				order: itemData.order,
				indent: itemData.indent
			} );
		}
	}

	return itemLikeElements;
}
139
// Reads the style of a list item from the provided CSS.
//
// Word keeps the list definitions in the stylesheet. For an item with the `mso-list:l1 level1 lfo1` style
// the corresponding rule is declared with a selector like:
//
//		@list l1:level1 { ... }
//
// The rule may contain the `mso-level-number-format` property which describes the numbering/bullet style.
// When the property is missing, the default `decimal` numbering is assumed.
//
// The stylesheet is processed as a plain string because `mso-level-number-format` is an invalid CSS property
// and would be removed during CSS parsing.
//
// @param {Object} listLikeItem List-like item for which list style will be searched for. Usually
// a result of `findAllItemLikeElements()` function.
// @param {String} stylesString CSS stylesheet.
// @returns {Object} result
// @returns {String} result.type List type, could be `ul` or `ol`.
// @returns {Number|null} result.startIndex List start index read from `mso-level-start-at`. It is looked up
// only when the number format is different than `bullet` and it is `null` when not found.
// @returns {String|null} result.style List style, for example: `decimal`, `lower-roman`, etc. It is extracted
// directly from Word stylesheet and adjusted to represent proper values for the CSS `list-style-type` property.
// If it cannot be adjusted, the `null` value is returned.
function detectListStyle( listLikeItem, stylesString ) {
	const levelRuleRegexp = new RegExp( `@list l${ listLikeItem.id }:level${ listLikeItem.indent }\\s*({[^}]*)`, 'gi' );
	const numberFormatRegexp = /mso-level-number-format:([^;]{0,100});/gi;
	const startAtRegexp = /mso-level-start-at:\s{0,100}([0-9]{0,10})\s{0,100};/gi;

	// Defaults: a decimal, ordered list without an explicit start index.
	let numberFormat = 'decimal';
	let type = 'ol';
	let startIndex = null;

	const levelRuleMatch = levelRuleRegexp.exec( stylesString );
	const levelRuleBody = levelRuleMatch && levelRuleMatch[ 1 ];

	if ( levelRuleBody ) {
		const numberFormatMatch = numberFormatRegexp.exec( levelRuleBody );

		if ( numberFormatMatch && numberFormatMatch[ 1 ] ) {
			numberFormat = numberFormatMatch[ 1 ].trim();

			const isUnordered = numberFormat === 'bullet' || numberFormat === 'image';

			type = isUnordered ? 'ul' : 'ol';
		}

		// Styles for the numbered lists are always defined in the Word CSS stylesheet.
		// Unordered lists MAY contain a value for the Word CSS definition `mso-level-text` but sometimes
		// this tag is missing, so it cannot be relied on. The list style value is predicted
		// based on the list style marker element instead.
		if ( numberFormat === 'bullet' ) {
			numberFormat = findBulletedListStyle( listLikeItem.element ) || numberFormat;
		} else {
			const startAtMatch = startAtRegexp.exec( levelRuleBody );

			if ( startAtMatch && startAtMatch[ 1 ] ) {
				startIndex = parseInt( startAtMatch[ 1 ] );
			}
		}
	}

	const style = mapListStyleDefinition( numberFormat );

	return { type, startIndex, style };
}
206
// Guesses the `list-style-type` value of a bulleted list from the character used as the list marker.
//
// @param {module:engine/view/element~Element} element
// @returns {String|null} `circle`, `disc` or `square`. `null` when the marker was not found or is not recognized.
function findBulletedListStyle( element ) {
	const markerNode = findListMarkerNode( element );

	if ( !markerNode ) {
		return null;
	}

	switch ( markerNode._data ) {
		case 'o':
			return 'circle';
		case '·':
			return 'disc';
		// Word returns '§' instead of '■' for the square list style.
		case '§':
			return 'square';
		default:
			return null;
	}
}
232
// Tries to find a text node that represents the marker element (list-style-type).
//
// The marker is looked for in the first `<span>` child that has any content: it is either the first child
// of that span or the first child of the element that the span starts with.
//
// @param {module:engine/view/element~Element} element
// @returns {module:engine/view/text~Text|null} The node found at the marker position or `null` if there is none.
function findListMarkerNode( element ) {
	const firstChild = element.getChild( 0 );

	// An element without any content cannot contain the marker.
	if ( !firstChild ) {
		return null;
	}

	// If the first child is a text node, it is the data for the element.
	// The list-style marker is not present here.
	if ( firstChild.is( '$text' ) ) {
		return null;
	}

	for ( const childNode of element.getChildren() ) {
		// The list-style marker will be inside the `<span>` element. Let's ignore all non-span elements.
		// It may happen that the `<a>` element is added as the first child. Most probably, it's an anchor element.
		if ( !childNode.is( 'element', 'span' ) ) {
			continue;
		}

		const textNodeOrElement = childNode.getChild( 0 );

		// An empty `<span>` has nothing to offer. Keep looking in the following siblings instead of failing.
		if ( !textNodeOrElement ) {
			continue;
		}

		// If already found the marker element, use it.
		if ( textNodeOrElement.is( '$text' ) ) {
			return textNodeOrElement;
		}

		// The marker may be wrapped in one more element. Normalize a missing child to `null`.
		return textNodeOrElement.getChild( 0 ) || null;
	}

	return null;
}
261
// Parses the `list-style-type` value extracted directly from the Word CSS stylesheet and returns proper CSS definition.
//
// Word specific names are translated to their CSS counterparts (for example `alpha-upper` becomes `upper-alpha`),
// the `circle`, `disc` and `square` values are passed through, and every other value results in `null`.
//
// @param {String|null} value
// @returns {String|null}
function mapListStyleDefinition( value ) {
	// The contract allows `null`. Anything that is not a string cannot be mapped.
	if ( typeof value !== 'string' ) {
		return null;
	}

	// Prefix check: any value that begins with `arabic-leading-zero` is mapped to the same CSS keyword.
	if ( value.startsWith( 'arabic-leading-zero' ) ) {
		return 'decimal-leading-zero';
	}

	switch ( value ) {
		case 'alpha-upper':
			return 'upper-alpha';
		case 'alpha-lower':
			return 'lower-alpha';
		case 'roman-upper':
			return 'upper-roman';
		case 'roman-lower':
			return 'lower-roman';
		case 'circle':
		case 'disc':
		case 'square':
			return value;
		default:
			return null;
	}
}
288
// Creates an empty list element and inserts it into the parent of the given node, right after that node.
//
// @param {Object} listStyle List style object which determines the type of newly created list.
// Usually a result of `detectListStyle()` function.
// @param {module:engine/view/element~Element} element Element after which list is inserted.
// @param {module:engine/view/upcastwriter~UpcastWriter} writer
// @returns {module:engine/view/element~Element} Newly created list element.
function insertNewEmptyList( listStyle, element, writer ) {
	const container = element.parent;
	const newList = writer.createElement( listStyle.type );
	const indexAfterElement = container.getChildIndex( element ) + 1;

	writer.insertChild( indexAfterElement, newList, container );

	const { style, startIndex } = listStyle;

	// We do not support modifying the marker for a particular list item.
	// The `list-style-type` property is set directly on the list container.
	if ( style ) {
		writer.setStyle( 'list-style-type', style, newList );
	}

	// The `start` attribute is added only for a start index greater than 1.
	if ( startIndex && startIndex > 1 ) {
		writer.setAttribute( 'start', startIndex, newList );
	}

	return newList;
}
316
// Turns a given element into a semantic list item: strips the numbering/bullet spans from it
// and renames it to `<li>`. The view structure to which the element belongs is modified.
//
// @param {module:engine/view/element~Element} element Element which will be transformed into a list item.
// @param {module:engine/view/upcastwriter~UpcastWriter} writer
// @returns {module:engine/view/element~Element} The element returned by the rename operation. Use it
// instead of the element that was passed in.
function transformElementIntoListItem( element, writer ) {
	// The marker must be gone before the element becomes a list item.
	removeBulletElement( element, writer );

	const listItem = writer.rename( 'li', element );

	return listItem;
}
329
// Reads list item information out of the Word specific style of a list-like element:
//
//		`style="mso-list:l1 level1 lfo1"`
//
// where:
//
//		* `l1` is a list id (however it does not mean this is a continuous list - see #43),
//		* `level1` is a list item indentation level,
//		* `lfo1` is a list insertion order in a document.
//
// The values are returned as strings of digits, exactly as captured from the style. An empty object
// is returned when the style is missing or when any of the three parts cannot be found.
//
// @param {module:engine/view/element~Element} element Element from which style data is extracted.
// @returns {Object} result
// @returns {String} result.id Parent list id.
// @returns {String} result.order List item creation order.
// @returns {String} result.indent List item indentation level.
function getListItemData( element ) {
	const msoList = element.getStyle( 'mso-list' );

	if ( !msoList ) {
		return {};
	}

	const idMatch = msoList.match( /(^|\s{1,100})l(\d+)/i );
	const orderMatch = msoList.match( /\s{0,100}lfo(\d+)/i );
	const indentMatch = msoList.match( /\s{0,100}level(\d+)/i );

	// All three parts are required.
	if ( !idMatch || !orderMatch || !indentMatch ) {
		return {};
	}

	return {
		// The first group of the id pattern captures the leading boundary, the digits are in the second one.
		id: idMatch[ 2 ],
		order: orderMatch[ 1 ],
		indent: indentMatch[ 1 ]
	};
}
363
// Removes from a given element every `<span>` marked with the `mso-list:Ignore` style, that is
// the spans holding the list numbering/bullets.
//
// @param {module:engine/view/element~Element} element
// @param {module:engine/view/upcastwriter~UpcastWriter} writer
function removeBulletElement( element, writer ) {
	// Matcher for finding `span` elements holding lists numbering/bullets.
	const bulletMatcher = new Matcher( {
		name: 'span',
		styles: {
			'mso-list': 'Ignore'
		}
	} );

	for ( const step of writer.createRangeIn( element ) ) {
		// Elements are checked only when they are entered.
		if ( step.type !== 'elementStart' ) {
			continue;
		}

		const candidate = step.item;

		if ( bulletMatcher.match( candidate ) ) {
			writer.remove( candidate );
		}
	}
}
385
// Decides whether the current item has to start a new list instead of continuing the list of the previous item.
// The decision is based on `item.id` (extracted from the `mso-list` style, see #getListItemData)
// and on the previous sibling of the current item.
//
// However, it's quite easy to change the `id` attribute for nested lists in Word, which would break the list
// feature while pasting. For that reason the `indent` values are compared too: when the ids differ but the current
// item is indented exactly one level deeper than the previous one, it is treated as the beginning of a nested list.
// See: https://github.com/ckeditor/ckeditor5/issues/7805.
//
// @param {Object} previousItem
// @param {Object} currentItem
// @returns {Boolean} `true` when a new list should be started for the current item.
function isNewListNeeded( previousItem, currentItem ) {
	// The very first item always opens a list.
	if ( !previousItem ) {
		return true;
	}

	if ( previousItem.id !== currentItem.id ) {
		// See: https://github.com/ckeditor/ckeditor5/issues/7805.
		//
		// * List item 1.
		//     - Nested list item 1.
		const isNestedOneLevelDeeper = currentItem.indent - previousItem.indent === 1;

		return !isNestedOneLevelDeeper;
	}

	const precedingNode = currentItem.element.previousSibling;

	// Even with the same id the list does not have to be continuous (#43):
	// the item continues a list only when it directly follows a list element.
	return !precedingNode || !isList( precedingNode );
}
423
// Checks whether a given node is a list element (`<ol>` or `<ul>`).
//
// @param {module:engine/view/node~Node} element
// @returns {Boolean}
function isList( element ) {
	const isOrderedList = element.is( 'element', 'ol' );

	if ( isOrderedList ) {
		return isOrderedList;
	}

	return element.is( 'element', 'ul' );
}
427
// Computes how much deeper (positive result) or shallower (negative result) the current list item is
// compared to the previous one. The `indent` values come from the `mso-list` style, see #getListItemData.
// Without the previous item, the current item is compared to the first indentation level.
//
// @param {Object} previousItem
// @param {Object} currentItem
// @returns {Number}
function getIndentationDifference( previousItem, currentItem ) {
	const referenceIndent = previousItem ? previousItem.indent : 1;

	return currentItem.indent - referenceIndent;
}
437
// Climbs the ancestors of a given list element, starting from its parent, counting the list elements (ul/ol)
// passed on the way. The ancestor at which the count becomes equal to the requested difference is returned.
//
// @param {module:engine/view/element~Element} listElement List element from which to start looking for a parent list.
// @param {Number} indentationDifference Indentation difference between lists.
// @returns {module:engine/view/element~Element|null} Found list element with indentation level lower by a given value
// or `null` when the ancestors run out first.
function findParentListAtLevel( listElement, indentationDifference ) {
	let listsPassed = 0;

	for ( const ancestor of listElement.getAncestors( { parentFirst: true } ) ) {
		if ( [ 'ul', 'ol' ].includes( ancestor.name ) ) {
			listsPassed++;
		}

		if ( listsPassed === indentationDifference ) {
			return ancestor;
		}
	}

	return null;
}