UNPKG

19.4 kBPlain TextView Raw
1/**
2 * @license
3 * Copyright 2021 Google LLC
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * https://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17import {applyWrapStyle, parseFromString} from './dom.js';
18import {Parser} from './parser.js';
19import {win} from './win.js';
20
// `console.assert` only logs when its condition is falsy; it never throws, so
// the assertions in this file are diagnostics rather than hard failures.
const assert = console.assert;

const ZWSP_CODEPOINT = 0x200b; // U+200B ZERO WIDTH SPACE
// The same character as a one-character string.
const ZWSP = String.fromCharCode(ZWSP_CODEPOINT);

// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same here for Node.js environments.
const NodeType = {
  ELEMENT_NODE: 1,
  TEXT_NODE: 3,
};
32
/**
 * How an element takes part in paragraph collection.
 *
 * The values are returned by `actionForElement` and drive the branching in
 * `HTMLProcessor.getBlocks`.
 */
const DomAction = {
  Inline: 0, // An inline content, becomes a part of a paragraph.
  Block: 1, // A nested paragraph.
  Skip: 2, // Skip the content. The content before and after are connected.
  Break: 3, // A forced break. The content before and after become paragraphs.
  NoBreak: 4, // The content provides context, but it's not breakable.
  BreakOpportunity: 5, // Force a break opportunity.
} as const;
// Union of the numeric values above; it shares its name with the constant
// object so that `DomAction` can be used both as a value and as a type.
type DomAction = (typeof DomAction)[keyof typeof DomAction];
42
/**
 * Determines the action from an element name, as defined in
 * {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.
 * See also {@link actionForElement}.
 *
 * The table is indexed with `Element.nodeName`, hence the upper-case keys.
 * Elements that are not listed here are classified by {@link actionForElement}
 * from their computed style or from the built-in block element list.
 */
const domActions: {[name: string]: DomAction} = {
  // Hidden elements
  // https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements
  AREA: DomAction.Skip,
  BASE: DomAction.Skip,
  BASEFONT: DomAction.Skip,
  DATALIST: DomAction.Skip,
  HEAD: DomAction.Skip,
  LINK: DomAction.Skip,
  META: DomAction.Skip,
  NOEMBED: DomAction.Skip,
  NOFRAMES: DomAction.Skip,
  PARAM: DomAction.Skip,
  RP: DomAction.Skip,
  SCRIPT: DomAction.Skip,
  STYLE: DomAction.Skip,
  TEMPLATE: DomAction.Skip,
  TITLE: DomAction.Skip,
  NOSCRIPT: DomAction.Skip,

  // Flow content
  // https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3
  HR: DomAction.Break,
  // Disable if `white-space: pre`.
  LISTING: DomAction.Skip,
  PLAINTEXT: DomAction.Skip,
  PRE: DomAction.Skip,
  XMP: DomAction.Skip,

  // Phrasing content
  // https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
  BR: DomAction.Break,
  RT: DomAction.Skip,
  WBR: DomAction.BreakOpportunity,

  // Form controls
  // https://html.spec.whatwg.org/multipage/rendering.html#form-controls
  INPUT: DomAction.Skip,
  SELECT: DomAction.Skip,
  BUTTON: DomAction.Skip,
  TEXTAREA: DomAction.Skip,

  // Other elements where the phrase-based line breaking should be disabled.
  // https://github.com/google/budoux/blob/main/budoux/skip_nodes.json
  ABBR: DomAction.Skip,
  CODE: DomAction.Skip,
  IFRAME: DomAction.Skip,
  TIME: DomAction.Skip,
  VAR: DomAction.Skip,

  // Deprecated, but supported in all browsers.
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr
  NOBR: DomAction.NoBreak,
};
102
/**
 * Element names treated as block-level by {@link actionForElement} when the
 * computed `display` value is not available.
 *
 * The numbered headings presumably refer to sections of the HTML Standard's
 * "Rendering" chapter (TODO confirm against the current spec numbering).
 */
const defaultBlockElements = new Set([
  // 15.3.2 The page
  'HTML',
  'BODY',
  // 15.3.3 Flow content
  'ADDRESS',
  'BLOCKQUOTE',
  'CENTER',
  'DIALOG',
  'DIV',
  'FIGURE',
  'FIGCAPTION',
  'FOOTER',
  'FORM',
  'HEADER',
  'LEGEND',
  // NOTE: `LISTING` is also listed in `domActions` as `Skip`, which
  // `actionForElement` consults first, so this entry is never reached there.
  'LISTING',
  'MAIN',
  'P',
  // 15.3.6 Sections and headings
  'ARTICLE',
  'ASIDE',
  'H1',
  'H2',
  'H3',
  'H4',
  'H5',
  'H6',
  'HGROUP',
  'NAV',
  'SECTION',
  // 15.3.7 Lists
  'DIR',
  'DD',
  'DL',
  'DT',
  'MENU',
  'OL',
  'UL',
  'LI',
  // 15.3.8 Tables
  'TABLE',
  'CAPTION',
  'COL',
  'TR',
  'TD',
  'TH',
  // 15.3.12 The fieldset and legend elements
  'FIELDSET',
  // 15.5.4 The details and summary elements
  'DETAILS',
  'SUMMARY',
  // 15.5.12 The marquee element
  'MARQUEE',
]);
158
/**
 * Short-named alias of {@link NodeType}.
 *
 * The values are derived from `NodeType` instead of being repeated as
 * literals, so the two constants cannot drift apart. Prefer `NodeType` in new
 * code.
 */
const NODETYPE = {
  ELEMENT: NodeType.ELEMENT_NODE,
  TEXT: NodeType.TEXT_NODE,
};
165
/**
 * Determine the action for an element.
 *
 * The lookup order is:
 * 1. the explicit {@link domActions} table, keyed by `nodeName`;
 * 2. the computed style (`white-space`, then `display`), when
 *    `getComputedStyle` is available and reports a `display` value;
 * 3. the built-in {@link defaultBlockElements} list.
 *
 * @param element An element to determine the action for.
 * @return The {@link DomAction} for the element.
 */
function actionForElement(element: Element): DomAction {
  const nodeName = element.nodeName;
  // Consult own entries only. A plain property read would also resolve names
  // inherited from `Object.prototype` (e.g. an element named `constructor` or
  // `toString`) and return a function instead of a `DomAction`.
  const action = Object.prototype.hasOwnProperty.call(domActions, nodeName)
    ? domActions[nodeName]
    : undefined;
  if (action !== undefined) return action;

  if (typeof win.getComputedStyle === 'function') {
    const style = win.getComputedStyle(element);
    switch (style.whiteSpace) {
      case 'nowrap':
      case 'pre':
        return DomAction.NoBreak;
    }

    const display = style.display;
    if (display)
      return display === 'inline' ? DomAction.Inline : DomAction.Block;
    // `display` is an empty string if the element is not connected.
  }

  // Use the built-in rules if the `display` property is empty, or if
  // `getComputedStyle` is missing (e.g., jsdom.)
  return defaultBlockElements.has(nodeName)
    ? DomAction.Block
    : DomAction.Inline;
}
196
/**
 * A single entry of a {@link Paragraph}: either a DOM {@link Text} node or a
 * plain string.
 *
 * A plain string contributes its characters as context for the parser, but it
 * is never split.
 */
class NodeOrText {
  nodeOrText: Text | string;
  chunks: string[] = [];
  hasBreakOpportunityAfter = false;

  constructor(nodeOrText: Text | string) {
    this.nodeOrText = nodeOrText;
  }

  /** Whether this entry wraps a plain string. */
  get isString(): boolean {
    return typeof this.nodeOrText === 'string';
  }

  /** Whether this entry may be split; only `Text` nodes can. */
  get canSplit(): boolean {
    return !this.isString;
  }

  /** The string itself, or the `nodeValue` of the wrapped `Text` node. */
  get text(): string | null {
    if (this.isString) return this.nodeOrText as string;
    return (this.nodeOrText as Text).nodeValue;
  }

  /** Length of {@link text}; `0` when there is no text. */
  get length(): number {
    const value = this.text;
    return value ? value.length : 0;
  }

  /**
   * Splits the wrapped {@link Text} at the boundaries recorded in
   * {@link chunks}. The concatenation of all chunks is expected to equal
   * {@link text}. Nothing happens when there are fewer than two chunks.
   *
   * @param separator A string to insert between chunks, or a node whose deep
   *     clone is inserted between chunks.
   */
  split(separator: string | Node) {
    const pieces = this.chunks;
    assert(pieces.length === 0 || pieces.join('') === this.text);
    if (pieces.length <= 1) return;
    assert(this.canSplit);
    const textNode = this.nodeOrText as Text;

    if (typeof separator === 'string') {
      // A string separator is written into the text itself.
      textNode.nodeValue = pieces.join(separator);
      return;
    }

    // A node separator: replace the text node with one `Text` per non-empty
    // chunk and a clone of the separator at every chunk boundary.
    const doc = textNode.ownerDocument;
    const replacements: Node[] = [];
    for (let i = 0; i < pieces.length; ++i) {
      if (i > 0) replacements.push(separator.cloneNode(true));
      const piece = pieces[i];
      if (piece) replacements.push(doc.createTextNode(piece));
    }
    textNode.replaceWith(...replacements);
  }
}
export class NodeOrTextForTesting extends NodeOrText {}
262
/**
 * A "paragraph": a run of content delimited by block boundaries or forced
 * breaks.
 *
 * A CSS
 * {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}
 * usually maps to one paragraph, but forced breaks such as `<br>` divide it
 * into several.
 */
class Paragraph {
  element: HTMLElement;
  nodes: NodeOrText[] = [];

  constructor(element: HTMLElement) {
    this.element = element;
  }

  /** Whether no node has been collected. */
  isEmpty(): boolean {
    return !this.nodes.length;
  }

  /** The concatenated text of all nodes; missing text counts as empty. */
  get text(): string {
    let joined = '';
    for (const node of this.nodes) joined += node.text ?? '';
    return joined;
  }

  /** The most recently collected node, or `undefined` if there is none. */
  get lastNode(): NodeOrText | undefined {
    const count = this.nodes.length;
    return count > 0 ? this.nodes[count - 1] : undefined;
  }

  /** Flags the last node, if any, as followed by a break opportunity. */
  setHasBreakOpportunityAfter() {
    const tail = this.lastNode;
    if (!tail) return;
    tail.hasBreakOpportunityAfter = true;
  }

  /**
   * @return Indices of forced break opportunities in the source.
   * They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
   */
  getForcedOpportunities(): number[] {
    const zwsp = String.fromCharCode(ZWSP_CODEPOINT);
    const found: number[] = [];
    let offset = 0; // start of the current node within the paragraph text
    for (const node of this.nodes) {
      // Only splittable nodes are scanned for U+200B.
      const text = node.canSplit ? node.text : null;
      if (text) {
        let at = text.indexOf(zwsp);
        while (at >= 0) {
          // The opportunity is right after the zero width space.
          found.push(offset + at + 1);
          at = text.indexOf(zwsp, at + 1);
        }
      }
      offset += node.length;
      if (node.hasBreakOpportunityAfter) found.push(offset);
    }
    return found;
  }

  /**
   * @param boundaries Candidate boundary indices.
   * @return `boundaries` itself when {@link getForcedOpportunities} is empty;
   *     otherwise a new array without the forced opportunities.
   */
  excludeForcedOpportunities(boundaries: number[]): number[] {
    const forced = new Set<number>(this.getForcedOpportunities());
    if (forced.size === 0) return boundaries;
    return boundaries.filter(index => !forced.has(index));
  }
}
export class ParagraphForTesting extends Paragraph {}
333
/**
 * Options for {@link HTMLProcessor}.
 *
 * Both members are optional; a member left `undefined` keeps the
 * corresponding default of {@link HTMLProcessor}.
 */
export interface HTMLProcessorOptions {
  /**
   * This class name is added to the containing block when BudouX is applied.
   * The containing block should have the following CSS properties to make it
   * work.
   * `{ word-break: keep-all; overflow-wrap: anywhere; }`
   *
   * When falsy, an inline style is set instead.
   */
  className?: string;
  /**
   * The separator to insert at each semantic boundary.
   *
   * When it's a {@link Node}, a clone of the {@link Node} will be inserted.
   *
   * The default value is U+200B ZERO WIDTH SPACE.
   */
  separator?: string | Node;
}
355
/**
 * Adds HTML processing support to a BudouX {@link Parser}.
 *
 * The processor walks an element tree, groups text nodes into
 * {@link Paragraph}s, asks the parser for phrase boundaries in each paragraph
 * and inserts the configured separator at those boundaries.
 */
export class HTMLProcessor {
  private parser_: Parser;
  /** See {@link HTMLProcessorOptions.className}. */
  className?: string;
  /** See {@link HTMLProcessorOptions.separator}. */
  separator: string | Node = ZWSP;

  /**
   * @param parser A BudouX {@link Parser} to compute semantic line breaks.
   * @param options Optional settings; `undefined` members keep the defaults.
   */
  constructor(parser: Parser, options?: HTMLProcessorOptions) {
    this.parser_ = parser;
    if (options !== undefined) {
      if (options.className !== undefined) this.className = options.className;
      if (options.separator !== undefined) this.separator = options.separator;
    }
  }

  /**
   * Checks if the given element has a text node in its children.
   *
   * Only direct children are inspected.
   *
   * @param ele An element to be checked.
   * @return Whether the element has a child text node.
   */
  static hasChildTextNode(ele: HTMLElement) {
    for (const child of ele.childNodes) {
      if (child.nodeType === NODETYPE.TEXT) return true;
    }
    return false;
  }

  /**
   * Applies markups for semantic line breaks to the given HTML element.
   *
   * It breaks descendant nodes into paragraphs,
   * and applies the BudouX to each paragraph.
   * @param element The input element.
   */
  applyToElement(element: HTMLElement) {
    // `getBlocks` is a lazy generator: each paragraph is processed (and the
    // DOM possibly modified) before the traversal continues.
    for (const block of this.getBlocks(element)) {
      assert(!block.isEmpty());
      this.applyToParagraph(block);
    }
  }

  /**
   * Find paragraphs from a given HTML element.
   * @param element The root element to find paragraphs.
   * @param parent The parent {@link Paragraph} if any.
   * @return A list of {@link Paragraph}s.
   */
  *getBlocks(
    element: HTMLElement,
    parent?: Paragraph
  ): IterableIterator<Paragraph> {
    assert(element.nodeType === NodeType.ELEMENT_NODE);

    // Skip if it was once applied to this element.
    if (this.className && element.classList.contains(this.className)) return;

    const action = actionForElement(element);
    if (action === DomAction.Skip) return;
    if (action === DomAction.Break) {
      if (parent && !parent.isEmpty()) {
        parent.setHasBreakOpportunityAfter();
        yield parent;
        // Start collecting the content after the forced break afresh.
        parent.nodes = [];
      }
      assert(!element.firstChild);
      return;
    }
    if (action === DomAction.BreakOpportunity) {
      if (parent) parent.setHasBreakOpportunityAfter();
      return;
    }

    // Determine if this element creates a new inline formatting context, or if
    // this element belongs to the parent inline formatting context.
    assert(
      action === DomAction.Block ||
        action === DomAction.Inline ||
        action === DomAction.NoBreak
    );
    const isNewBlock = !parent || action === DomAction.Block;
    const block = isNewBlock ? new Paragraph(element) : parent;

    // Collect all text nodes in this inline formatting context, while searching
    // descendant elements recursively.
    //
    // Iterate over a snapshot of the children. A paragraph yielded in the
    // middle of this loop (e.g. at a `<br>`) is processed by the consumer
    // before the loop resumes, and a `Node` separator replaces earlier text
    // nodes with several nodes. On the live `childNodes` that shifts the
    // remaining children and makes the loop visit the inserted nodes again.
    for (const child of Array.from(element.childNodes)) {
      switch (child.nodeType) {
        case NodeType.ELEMENT_NODE:
          yield* this.getBlocks(child as HTMLElement, block);
          break;
        case NodeType.TEXT_NODE:
          if (action === DomAction.NoBreak) {
            // Keep the text as parser context only; a string can't be split.
            const text = child.nodeValue;
            if (text) {
              block.nodes.push(new NodeOrText(text));
            }
            break;
          }
          block.nodes.push(new NodeOrText(child as Text));
          break;
      }
    }

    // Apply if this is an inline formatting context.
    if (isNewBlock && !block.isEmpty()) yield block;
  }

  /**
   * Apply the BudouX to the given {@link Paragraph}.
   *
   * Nothing is changed when the paragraph has no splittable node, when its
   * text is whitespace-only, or when the parser reports no boundary.
   * @param paragraph The {@link Paragraph} to apply.
   */
  applyToParagraph(paragraph: Paragraph): void {
    assert(paragraph.nodes.length > 0);
    if (!paragraph.nodes.some(node => node.canSplit)) return;
    const text = paragraph.text;
    // No changes if whitespace-only.
    if (/^\s*$/.test(text)) return;

    // Compute the phrase boundaries.
    const boundaries = this.parser_.parseBoundaries(text);
    // No changes if single phrase.
    if (boundaries.length <= 0) return;
    // The boundaries should be between 1 and `text.length - 1` in the
    // ascending order.
    assert(boundaries[0] > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    assert(boundaries[boundaries.length - 1] < text.length);

    // Build a new array with a sentinel appended to help iterating.
    // `excludeForcedOpportunities` may return `boundaries` itself, so copying
    // keeps the array returned by the parser untouched.
    const adjustedBoundaries = [
      ...paragraph.excludeForcedOpportunities(boundaries),
      text.length + 1,
    ];

    this.splitNodes(paragraph.nodes, adjustedBoundaries);
    this.applyBlockStyle(paragraph.element);
  }

  /**
   * Split {@link NodeOrText} at the specified boundaries.
   * @param nodes A list of {@link NodeOrText}.
   * @param boundaries A list of indices of the text to split at, in ascending
   *     order. The last item must be a sentinel greater than the total text
   *     length.
   */
  splitNodes(nodes: NodeOrText[], boundaries: number[]): void {
    assert(boundaries.length > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
    // The last boundary must be a sentinel.
    assert(boundaries[boundaries.length - 1] > textLen);

    // Distribute `boundaries` to `node.chunks`.
    let boundaryIndex = 0;
    let boundary = boundaries[0];
    assert(boundary > 0);
    let nodeStart = 0; // the start index of the `nodeText` in the whole text.
    let lastNode: NodeOrText | null = null;
    for (const node of nodes) {
      assert(boundary >= nodeStart);
      assert(node.chunks.length === 0);
      const nodeText = node.text;
      if (!nodeText) continue;
      const nodeLength = nodeText.length;
      const nodeEnd = nodeStart + nodeLength;
      assert(!lastNode || lastNode.canSplit);
      if (!node.canSplit) {
        // If there's a boundary between nodes and `lastNode.canSplit`, add a
        // boundary to the end of the `lastNode`.
        if (lastNode && boundary === nodeStart) {
          if (lastNode.chunks.length === 0)
            lastNode.chunks.push(lastNode.text ?? '');
          lastNode.chunks.push('');
        }
        // Boundaries inside an unsplittable node are dropped.
        while (boundary < nodeEnd) {
          boundary = boundaries[++boundaryIndex];
        }
        lastNode = null;
        nodeStart = nodeEnd;
        continue;
      }

      // Check if the next boundary is in this `node`.
      lastNode = node;
      if (boundary >= nodeEnd) {
        nodeStart = nodeEnd;
        continue;
      }

      // Compute the boundary indices in the `node`.
      const chunks = node.chunks;
      let chunkStartInNode = 0;
      while (boundary < nodeEnd) {
        const boundaryInNode = boundary - nodeStart;
        assert(boundaryInNode >= chunkStartInNode);
        chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
        chunkStartInNode = boundaryInNode;
        boundary = boundaries[++boundaryIndex];
      }
      // Add the rest of the `nodeText`.
      assert(chunkStartInNode < nodeLength);
      chunks.push(nodeText.slice(chunkStartInNode));

      nodeStart = nodeEnd;
    }
    // Check if all nodes and boundaries are consumed.
    assert(nodeStart === textLen);
    assert(boundaryIndex < boundaries.length);
    assert(boundaries[boundaryIndex] >= textLen);

    // `node.chunks` are finalized. Split them.
    for (const node of nodes) {
      node.split(this.separator);
    }
  }

  /**
   * Applies the block style to the given element.
   *
   * Adds {@link className} when it is set; otherwise sets the inline style
   * through `applyWrapStyle`.
   * @param element The element to apply the block style.
   */
  applyBlockStyle(element: HTMLElement): void {
    if (this.className) {
      element.classList.add(this.className);
      return;
    }
    applyWrapStyle(element);
  }
}
588
/**
 * BudouX {@link Parser} with HTML processing support.
 *
 * It owns an {@link HTMLProcessor} that uses this parser to compute the
 * boundaries.
 */
export class HTMLProcessingParser extends Parser {
  htmlProcessor: HTMLProcessor;

  /**
   * @param model The model passed through to {@link Parser}.
   * @param htmlProcessorOptions Options for the internal
   *     {@link HTMLProcessor}. Defaults to a U+200B separator.
   */
  constructor(
    model: {[key: string]: {[key: string]: number}},
    htmlProcessorOptions: HTMLProcessorOptions = {
      separator: ZWSP,
    }
  ) {
    super(model);
    this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
  }

  /**
   * @deprecated Use `applyToElement` instead. `applyElement` will be removed
   * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
   *
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyElement(parentElement: HTMLElement) {
    console.warn(
      '`applyElement` is deprecated. Please use `applyToElement` instead. ' +
        '`applyElement` will be removed in v0.7.0.'
    );
    this.applyToElement(parentElement);
  }

  /**
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyToElement(parentElement: HTMLElement) {
    this.htmlProcessor.applyToElement(parentElement);
  }

  /**
   * Translates the given HTML string to another HTML string with markups
   * for semantic line breaks.
   *
   * When the parsed body has a text node among its direct children, all
   * children are wrapped in a `<span>` first. Only the first child of the body
   * is then processed.
   * @param html An input html string.
   * @return The translated HTML string.
   */
  translateHTMLString(html: string) {
    if (html === '') return html;
    const doc = parseFromString(html);
    if (HTMLProcessor.hasChildTextNode(doc.body)) {
      const wrapper = doc.createElement('span');
      wrapper.append(...doc.body.childNodes);
      doc.body.append(wrapper);
    }
    // The body can be empty (e.g. the input consists only of elements that
    // the HTML parser places outside the body) or can start with a
    // non-element node such as a comment. There is nothing to process then.
    const root = doc.body.firstChild;
    if (root && root.nodeType === NodeType.ELEMENT_NODE) {
      this.applyToElement(root as HTMLElement);
    }
    return doc.body.innerHTML;
  }
}