UNPKG

20.1 kBJavaScriptView Raw
1/**
2 * @license
3 * Copyright 2021 Google LLC
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * https://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16import { applyWrapStyle, parseFromString } from './dom.js';
17import { Parser } from './parser.js';
18import { win } from './win.js';
// Assertions in this module go through `console.assert`.
const { assert } = console;
// U+200B ZERO WIDTH SPACE: first as a one-character string, then as its
// UTF-16 code unit.
const ZWSP = '\u200B';
const ZWSP_CODEPOINT = ZWSP.charCodeAt(0);
// Numeric `nodeType` values for element and text nodes. A browser exposes the
// same numbers as `Node.ELEMENT_NODE` and `Node.TEXT_NODE`; they are spelled
// out here so that the module also works in Node.js environments.
const NodeType = { ELEMENT_NODE: 1, TEXT_NODE: 3 };
/**
 * The ways an element can take part in paragraph collection.
 */
const DomAction = {
  // An inline content, becomes a part of a paragraph.
  Inline: 0,
  // A nested paragraph.
  Block: 1,
  // Skip the content. The content before and after are connected.
  Skip: 2,
  // A forced break. The content before and after become paragraphs.
  Break: 3,
  // The content provides context, but it's not breakable.
  NoBreak: 4,
  // Force a break opportunity.
  BreakOpportunity: 5,
};
/**
 * Maps an upper-case element name to its {@link DomAction}, following
 * {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.
 * Element names that are not listed here are resolved by
 * {@link actionForElement}.
 */
const domActions = (() => {
  // Local aliases keep the table below compact.
  const { Skip, Break, NoBreak, BreakOpportunity } = DomAction;
  return {
    // Hidden elements
    // https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements
    AREA: Skip,
    BASE: Skip,
    BASEFONT: Skip,
    DATALIST: Skip,
    HEAD: Skip,
    LINK: Skip,
    META: Skip,
    NOEMBED: Skip,
    NOFRAMES: Skip,
    PARAM: Skip,
    RP: Skip,
    SCRIPT: Skip,
    STYLE: Skip,
    TEMPLATE: Skip,
    TITLE: Skip,
    NOSCRIPT: Skip,

    // Flow content
    // https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3
    HR: Break,
    // Disable if `white-space: pre`.
    LISTING: Skip,
    PLAINTEXT: Skip,
    PRE: Skip,
    XMP: Skip,

    // Phrasing content
    // https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
    BR: Break,
    RT: Skip,
    WBR: BreakOpportunity,

    // Form controls
    // https://html.spec.whatwg.org/multipage/rendering.html#form-controls
    INPUT: Skip,
    SELECT: Skip,
    BUTTON: Skip,
    TEXTAREA: Skip,

    // Other elements where the phrase-based line breaking should be disabled.
    // https://github.com/google/budoux/blob/main/budoux/skip_nodes.json
    ABBR: Skip,
    CODE: Skip,
    IFRAME: Skip,
    TIME: Skip,
    VAR: Skip,

    // Deprecated, but supported in all browsers.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr
    NOBR: NoBreak,
  };
})();
// Upper-case element names that are treated as blocks when no computed
// `display` value is available. The groups keep the section headings of the
// original list.
const defaultBlockElements = new Set(
  [].concat(
    // 15.3.2 The page
    ['HTML', 'BODY'],
    // 15.3.3 Flow content
    ['ADDRESS', 'BLOCKQUOTE', 'CENTER', 'DIALOG', 'DIV', 'FIGURE', 'FIGCAPTION'],
    ['FOOTER', 'FORM', 'HEADER', 'LEGEND', 'LISTING', 'MAIN', 'P'],
    // 15.3.6 Sections and headings
    ['ARTICLE', 'ASIDE', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6'],
    ['HGROUP', 'NAV', 'SECTION'],
    // 15.3.7 Lists
    ['DIR', 'DD', 'DL', 'DT', 'MENU', 'OL', 'UL', 'LI'],
    // 15.3.8 Tables
    ['TABLE', 'CAPTION', 'COL', 'TR', 'TD', 'TH'],
    // 15.3.12 The fieldset and legend elements
    ['FIELDSET'],
    // 15.5.4 The details and summary elements
    ['DETAILS', 'SUMMARY'],
    // 15.5.12 The marquee element
    ['MARQUEE']
  )
);
// Numeric `nodeType` values under short key names. They carry the same
// numbers as `NodeType` above, and as `Node.ELEMENT_NODE` / `Node.TEXT_NODE`
// in a browser, but are defined here for Node.js environments.
const NODETYPE = { ELEMENT: 1, TEXT: 3 };
/**
 * Determine the action for an element.
 *
 * The lookup order is:
 * 1. the static {@link domActions} table, keyed by `element.nodeName`;
 * 2. the computed style, when `win.getComputedStyle` is a function:
 *    `white-space` of `nowrap` or `pre` gives `NoBreak`, otherwise a non-empty
 *    `display` gives `Inline` for `inline` and `Block` for anything else;
 * 3. the built-in {@link defaultBlockElements} list.
 *
 * @param element An element to determine the action for.
 * @return The {@link DomAction} for the element.
 */
function actionForElement(element) {
  const nodeName = element.nodeName;
  // Consult own keys only. A plain `domActions[nodeName]` would also find
  // members inherited from `Object.prototype`, so an element whose name is,
  // for example, `constructor` would yield a function instead of an action.
  if (Object.prototype.hasOwnProperty.call(domActions, nodeName)) {
    return domActions[nodeName];
  }
  if (typeof win.getComputedStyle === 'function') {
    const style = win.getComputedStyle(element);
    switch (style.whiteSpace) {
      case 'nowrap':
      case 'pre':
        return DomAction.NoBreak;
    }
    const display = style.display;
    if (display) {
      return display === 'inline' ? DomAction.Inline : DomAction.Block;
    }
    // `display` is an empty string if the element is not connected.
  }
  // Use the built-in rules if the `display` property is empty, or if
  // `getComputedStyle` is missing (e.g., jsdom.)
  return defaultBlockElements.has(nodeName)
    ? DomAction.Block
    : DomAction.Inline;
}
/**
 * One entry of a {@link Paragraph}: either a {@link Text} node or a plain
 * {@link string}.
 *
 * A string entry contributes its characters to the paragraph text, but it
 * has no DOM node behind it and therefore cannot be split.
 */
class NodeOrText {
  /**
   * @param nodeOrText The wrapped `Text` node or string.
   */
  constructor(nodeOrText) {
    this.chunks = [];
    this.hasBreakOpportunityAfter = false;
    this.nodeOrText = nodeOrText;
  }

  /** Whether the wrapped value is a plain string. */
  get isString() {
    return typeof this.nodeOrText === 'string';
  }

  /** Whether this entry can be split; only non-string entries can. */
  get canSplit() {
    return !this.isString;
  }

  /** The string itself, or the `nodeValue` of the wrapped node. */
  get text() {
    const wrapped = this.nodeOrText;
    if (this.isString) {
      return wrapped;
    }
    return wrapped.nodeValue;
  }

  /** Length of {@link text}, or 0 when there is no text. */
  get length() {
    const text = this.text;
    const size = text == null ? undefined : text.length;
    return size == null ? 0 : size;
  }

  /**
   * Split the {@link Text} in the same way as the {@link chunks}.
   * Joining all {@link chunks} must be equal to {@link text}.
   *
   * Nothing happens when there are fewer than two chunks.
   *
   * @param separator A string to insert at each chunk boundary, or a node
   * that is deep-cloned into each boundary.
   */
  split(separator) {
    const pieces = this.chunks;
    assert(pieces.length === 0 || pieces.join('') === this.text);
    if (pieces.length < 2) {
      return;
    }
    assert(this.canSplit);
    const textNode = this.nodeOrText;

    // String separator: rewrite the text of the existing node in place.
    if (typeof separator === 'string') {
      textNode.nodeValue = pieces.join(separator);
      return;
    }

    // Node separator: replace the node with one `Text` node per non-empty
    // chunk and a clone of the separator at every boundary between chunks.
    const doc = textNode.ownerDocument;
    const replacements = [];
    pieces.forEach((piece, index) => {
      if (index > 0) {
        replacements.push(separator.cloneNode(true));
      }
      if (piece) {
        replacements.push(doc.createTextNode(piece));
      }
    });
    textNode.replaceWith(...replacements);
  }
}

/** Exposes {@link NodeOrText} under an exported name. */
export class NodeOrTextForTesting extends NodeOrText {
}
/**
 * A "paragraph": a run of {@link NodeOrText} entries delimited by block
 * boundaries or forced breaks.
 *
 * A CSS
 * {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}
 * is usually one paragraph, but forced breaks such as `<br>` can divide it
 * into several.
 */
class Paragraph {
  /**
   * @param element The element this paragraph is associated with.
   */
  constructor(element) {
    this.nodes = [];
    this.element = element;
  }

  /** @return Whether the paragraph has no nodes. */
  isEmpty() {
    return this.nodes.length === 0;
  }

  /** The concatenated text of all nodes. */
  get text() {
    return Array.from(this.nodes, entry => entry.text).join('');
  }

  /** The last node, or `undefined` when the paragraph is empty. */
  get lastNode() {
    const count = this.nodes.length;
    return count ? this.nodes[count - 1] : undefined;
  }

  /** Flags the last node, if any, as followed by a break opportunity. */
  setHasBreakOpportunityAfter() {
    const tail = this.lastNode;
    if (!tail) {
      return;
    }
    tail.hasBreakOpportunityAfter = true;
  }

  /**
   * @return Indices of forced break opportunities in the source.
   * They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
   */
  getForcedOpportunities() {
    const indices = [];
    let offset = 0; // Start of the current node within the paragraph text.
    for (const entry of this.nodes) {
      if (entry.canSplit) {
        // Record the position right after each ZWSP in a splittable node.
        const content = entry.text || '';
        for (let pos = 0; pos < content.length; pos += 1) {
          if (content.charCodeAt(pos) === ZWSP_CODEPOINT) {
            indices.push(offset + pos + 1);
          }
        }
      }
      offset += entry.length;
      if (entry.hasBreakOpportunityAfter) {
        indices.push(offset);
      }
    }
    return indices;
  }

  /**
   * @return Filtered {@param boundaries} by excluding
   * {@link getForcedOpportunities} if it's not empty.
   * Otherwise {@param boundaries} itself.
   */
  excludeForcedOpportunities(boundaries) {
    const forced = this.getForcedOpportunities();
    if (forced.length === 0) {
      return boundaries;
    }
    const forcedSet = new Set(forced);
    return boundaries.filter(index => !forcedSet.has(index));
  }
}

/** Exposes {@link Paragraph} under an exported name. */
export class ParagraphForTesting extends Paragraph {
}
/**
 * Adds HTML processing support to a BudouX {@link Parser}.
 *
 * The processor walks an element tree, groups text nodes into
 * {@link Paragraph}s, asks the parser for phrase boundaries of each
 * paragraph's text, and splits the text nodes at those boundaries using
 * {@link separator}.
 */
export class HTMLProcessor {
  /**
   * @param parser A BudouX {@link Parser} to compute semantic line breaks.
   * @param options Optional settings. `className`, when given, is added to
   * processed elements instead of applying the wrap style and marks them as
   * already processed. `separator`, when given, replaces the default ZWSP.
   */
  constructor(parser, options) {
    /** See {@link HTMLProcessorOptions.separator}. */
    this.separator = ZWSP;
    this.parser_ = parser;
    if (options !== undefined) {
      if (options.className !== undefined) {
        this.className = options.className;
      }
      if (options.separator !== undefined) {
        this.separator = options.separator;
      }
    }
  }

  /**
   * Checks if the given element has a text node in its children.
   *
   * @param ele An element to be checked.
   * @return Whether the element has a child text node.
   */
  static hasChildTextNode(ele) {
    for (const child of ele.childNodes) {
      if (child.nodeType === NodeType.TEXT_NODE) {
        return true;
      }
    }
    return false;
  }

  /**
   * Applies markups for semantic line breaks to the given HTML element.
   *
   * It breaks descendant nodes into paragraphs,
   * and applies the BudouX to each paragraph.
   * @param element The input element.
   */
  applyToElement(element) {
    // Each paragraph is processed as soon as it is yielded; `getBlocks`
    // resets the node list of a paragraph it yields at a forced break.
    for (const block of this.getBlocks(element)) {
      assert(!block.isEmpty());
      this.applyToParagraph(block);
    }
  }

  /**
   * Find paragraphs from a given HTML element.
   * @param element The root element to find paragraphs.
   * @param parent The parent {@link Paragraph} if any.
   * @return A list of {@link Paragraph}s.
   */
  *getBlocks(element, parent) {
    assert(element.nodeType === NodeType.ELEMENT_NODE);
    // Skip if it was once applied to this element.
    if (this.className && element.classList.contains(this.className)) {
      return;
    }
    const action = actionForElement(element);
    if (action === DomAction.Skip) {
      return;
    }
    if (action === DomAction.Break) {
      // A forced break ends the current paragraph: emit what was collected
      // so far, then start collecting afresh in the same `Paragraph`.
      if (parent && !parent.isEmpty()) {
        parent.setHasBreakOpportunityAfter();
        yield parent;
        parent.nodes = [];
      }
      assert(!element.firstChild);
      return;
    }
    if (action === DomAction.BreakOpportunity) {
      if (parent) {
        parent.setHasBreakOpportunityAfter();
      }
      return;
    }
    // Determine if this element creates a new inline formatting context, or if
    // this element belongs to the parent inline formatting context.
    assert(action === DomAction.Block ||
      action === DomAction.Inline ||
      action === DomAction.NoBreak);
    const isNewBlock = !parent || action === DomAction.Block;
    const block = isNewBlock ? new Paragraph(element) : parent;
    // Collect all text nodes in this inline formatting context, while searching
    // descendant elements recursively.
    for (const child of element.childNodes) {
      switch (child.nodeType) {
        case NodeType.ELEMENT_NODE:
          yield* this.getBlocks(child, block);
          break;
        case NodeType.TEXT_NODE:
          if (action === DomAction.NoBreak) {
            // Record only the string: it gives the parser context, but a
            // string entry cannot be split.
            const text = child.nodeValue;
            if (text) {
              block.nodes.push(new NodeOrText(text));
            }
            break;
          }
          block.nodes.push(new NodeOrText(child));
          break;
      }
    }
    // Apply if this is an inline formatting context.
    if (isNewBlock && !block.isEmpty()) {
      yield block;
    }
  }

  /**
   * Apply the BudouX to the given {@link Paragraph}.
   *
   * Nothing is changed when the paragraph has no splittable node, when its
   * text is whitespace-only, or when the parser reports no boundary.
   * @param paragraph The {@link Paragraph} to apply.
   */
  applyToParagraph(paragraph) {
    assert(paragraph.nodes.length > 0);
    if (!paragraph.nodes.some(node => node.canSplit)) {
      return;
    }
    const text = paragraph.text;
    // No changes if whitespace-only.
    if (/^\s*$/.test(text)) {
      return;
    }
    // Compute the phrase boundaries.
    const boundaries = this.parser_.parseBoundaries(text);
    // No changes if single phrase.
    if (boundaries.length === 0) {
      return;
    }
    // The boundaries should be between 1 and `text.length - 1` in the
    // ascending order.
    assert(boundaries[0] > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    assert(boundaries[boundaries.length - 1] < text.length);
    // Add a sentinel to help iterating. `excludeForcedOpportunities` may
    // return `boundaries` itself, so build a new array with `concat` rather
    // than pushing onto the array that belongs to the parser.
    const adjustedBoundaries = paragraph
      .excludeForcedOpportunities(boundaries)
      .concat(text.length + 1);
    this.splitNodes(paragraph.nodes, adjustedBoundaries);
    this.applyBlockStyle(paragraph.element);
  }

  /**
   * Split {@link NodeOrText} at the specified boundaries.
   * @param nodes A list of {@link NodeOrText}.
   * @param boundaries A list of indices of the text to split at, in
   * ascending order and terminated by a sentinel greater than the total
   * text length.
   */
  splitNodes(nodes, boundaries) {
    assert(boundaries.length > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
    // The last boundary must be a sentinel.
    assert(boundaries[boundaries.length - 1] > textLen);
    // Distribute `boundaries` to `node.chunks`.
    let boundaryIndex = 0;
    let boundary = boundaries[0];
    assert(boundary > 0);
    let nodeStart = 0; // the start index of the `nodeText` in the whole text.
    let lastNode = null; // the preceding splittable node, if any.
    for (const node of nodes) {
      assert(boundary >= nodeStart);
      assert(node.chunks.length === 0);
      const nodeText = node.text;
      if (!nodeText) {
        continue;
      }
      const nodeLength = nodeText.length;
      const nodeEnd = nodeStart + nodeLength;
      assert(!lastNode || lastNode.canSplit);
      if (!node.canSplit) {
        // If there's a boundary between nodes and `lastNode.canSplit`, add a
        // boundary to the end of the `lastNode`.
        if (lastNode && boundary === nodeStart) {
          if (lastNode.chunks.length === 0) {
            const lastText = lastNode.text;
            lastNode.chunks.push(lastText == null ? '' : lastText);
          }
          lastNode.chunks.push('');
        }
        // Boundaries inside an unsplittable node are dropped.
        while (boundary < nodeEnd) {
          boundary = boundaries[++boundaryIndex];
        }
        lastNode = null;
        nodeStart = nodeEnd;
        continue;
      }
      // Check if the next boundary is in this `node`.
      lastNode = node;
      if (boundary >= nodeEnd) {
        nodeStart = nodeEnd;
        continue;
      }
      // Compute the boundary indices in the `node`.
      const chunks = node.chunks;
      let chunkStartInNode = 0;
      while (boundary < nodeEnd) {
        const boundaryInNode = boundary - nodeStart;
        assert(boundaryInNode >= chunkStartInNode);
        chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
        chunkStartInNode = boundaryInNode;
        boundary = boundaries[++boundaryIndex];
      }
      // Add the rest of the `nodeText`.
      assert(chunkStartInNode < nodeLength);
      chunks.push(nodeText.slice(chunkStartInNode));
      nodeStart = nodeEnd;
    }
    // Check if all nodes and boundaries are consumed.
    assert(nodeStart === textLen);
    assert(boundaryIndex < boundaries.length);
    assert(boundaries[boundaryIndex] >= textLen);
    // `node.chunks` are finalized. Split them.
    for (const node of nodes) {
      node.split(this.separator);
    }
  }

  /**
   * Applies the block style to the given element.
   *
   * When `className` is set, the class is added and nothing else is done;
   * otherwise `applyWrapStyle` is called.
   * @param element The element to apply the block style.
   */
  applyBlockStyle(element) {
    if (this.className) {
      element.classList.add(this.className);
      return;
    }
    applyWrapStyle(element);
  }
}
/**
 * BudouX {@link Parser} with HTML processing support.
 *
 * It owns an {@link HTMLProcessor} that uses this parser to compute the
 * phrase boundaries.
 */
export class HTMLProcessingParser extends Parser {
  /**
   * @param model The model, passed to the {@link Parser} constructor.
   * @param htmlProcessorOptions Options passed to the {@link HTMLProcessor}.
   * Defaults to using ZWSP as the separator.
   */
  constructor(model, htmlProcessorOptions = {
    separator: ZWSP,
  }) {
    super(model);
    this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
  }

  /**
   * @deprecated Use `applyToElement` instead. `applyElement` will be removed
   * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
   *
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyElement(parentElement) {
    console.warn('`applyElement` is deprecated. Please use `applyToElement` instead. ' +
      '`applyElement` will be removed in v0.7.0.');
    this.applyToElement(parentElement);
  }

  /**
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyToElement(parentElement) {
    this.htmlProcessor.applyToElement(parentElement);
  }

  /**
   * Translates the given HTML string to another HTML string with markups
   * for semantic line breaks.
   *
   * If the parsed body has a text node among its direct children, all of its
   * children are first moved into a new `<span>` wrapper. Every element
   * child of the body is then processed; other node types are left as they
   * are.
   * @param html An input html string.
   * @return The translated HTML string.
   */
  translateHTMLString(html) {
    if (html === '') {
      return html;
    }
    const doc = parseFromString(html);
    const body = doc.body;
    if (HTMLProcessor.hasChildTextNode(body)) {
      const wrapper = doc.createElement('span');
      wrapper.append(...body.childNodes);
      body.append(wrapper);
    }
    // Process each element child, not just `childNodes[0]`: the first child
    // can be missing (empty body) or a non-element such as a comment, and
    // there can be several top-level elements. Iterate over a snapshot so
    // that the loop does not depend on the live `childNodes` list.
    for (const child of Array.from(body.childNodes)) {
      if (child.nodeType === NodeType.ELEMENT_NODE) {
        this.applyToElement(child);
      }
    }
    return body.innerHTML;
  }
}
573//# sourceMappingURL=html_processor.js.map
\No newline at end of file