"use strict";
/**
 * @license
 * Copyright 2021 Google LLC
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.HTMLProcessingParser = exports.HTMLProcessor = exports.ParagraphForTesting = exports.NodeOrTextForTesting = void 0;
const dom_js_1 = require("./dom.js");
const parser_js_1 = require("./parser.js");
const win_js_1 = require("./win.js");
// Assertions in this file go through `console.assert`, which reports a failed
// condition on the console instead of throwing.
const assert = console.assert;
// U+200B ZERO WIDTH SPACE, as a UTF-16 code unit and as a one-character string.
const ZWSP_CODEPOINT = 0x200b;
const ZWSP = String.fromCharCode(ZWSP_CODEPOINT);
// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same here for Node.js environments.
const NodeType = {
    ELEMENT_NODE: 1,
    TEXT_NODE: 3,
};
// The ways an element can take part in paragraph collection.
const DomAction = {
    Inline: 0, // An inline content, becomes a part of a paragraph.
    Block: 1, // A nested paragraph.
    Skip: 2, // Skip the content. The content before and after are connected.
    Break: 3, // A forced break. The content before and after become paragraphs.
    NoBreak: 4, // The content provides context, but it's not breakable.
    BreakOpportunity: 5, // Force a break opportunity.
};
/**
 * Determines the action from an element name, as defined in
 * {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.
 * See also {@link actionForElement}.
 *
 * Keys are upper-case element names; values are {@link DomAction} members.
 */
const domActions = {
    // Hidden elements
    // https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements
    AREA: DomAction.Skip,
    BASE: DomAction.Skip,
    BASEFONT: DomAction.Skip,
    DATALIST: DomAction.Skip,
    HEAD: DomAction.Skip,
    LINK: DomAction.Skip,
    META: DomAction.Skip,
    NOEMBED: DomAction.Skip,
    NOFRAMES: DomAction.Skip,
    PARAM: DomAction.Skip,
    RP: DomAction.Skip,
    SCRIPT: DomAction.Skip,
    STYLE: DomAction.Skip,
    TEMPLATE: DomAction.Skip,
    TITLE: DomAction.Skip,
    NOSCRIPT: DomAction.Skip,
    // Flow content
    // https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3
    HR: DomAction.Break,
    // Disable if `white-space: pre`.
    LISTING: DomAction.Skip,
    PLAINTEXT: DomAction.Skip,
    PRE: DomAction.Skip,
    XMP: DomAction.Skip,
    // Phrasing content
    // https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
    BR: DomAction.Break,
    RT: DomAction.Skip,
    WBR: DomAction.BreakOpportunity,
    // Form controls
    // https://html.spec.whatwg.org/multipage/rendering.html#form-controls
    INPUT: DomAction.Skip,
    SELECT: DomAction.Skip,
    BUTTON: DomAction.Skip,
    TEXTAREA: DomAction.Skip,
    // Other elements where the phrase-based line breaking should be disabled.
    // https://github.com/google/budoux/blob/main/budoux/skip_nodes.json
    ABBR: DomAction.Skip,
    CODE: DomAction.Skip,
    IFRAME: DomAction.Skip,
    TIME: DomAction.Skip,
    VAR: DomAction.Skip,
    // Deprecated, but supported in all browsers.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr
    NOBR: DomAction.NoBreak,
};
// Element names treated as block-level when the computed `display` is not
// available. Section numbers refer to the HTML Standard's "Rendering" chapter.
const defaultBlockElements = new Set([
    // 15.3.2 The page
    'HTML',
    'BODY',
    // 15.3.3 Flow content
    'ADDRESS',
    'BLOCKQUOTE',
    'CENTER',
    'DIALOG',
    'DIV',
    'FIGURE',
    'FIGCAPTION',
    'FOOTER',
    'FORM',
    'HEADER',
    'LEGEND',
    'LISTING',
    'MAIN',
    'P',
    // 15.3.6 Sections and headings
    'ARTICLE',
    'ASIDE',
    'H1',
    'H2',
    'H3',
    'H4',
    'H5',
    'H6',
    'HGROUP',
    'NAV',
    'SECTION',
    // 15.3.7 Lists
    'DIR',
    'DD',
    'DL',
    'DT',
    'MENU',
    'OL',
    'UL',
    'LI',
    // 15.3.8 Tables
    'TABLE',
    'CAPTION',
    'COL',
    'TR',
    'TD',
    'TH',
    // 15.3.12 The fieldset and legend elements
    'FIELDSET',
    // 15.5.4 The details and summary elements
    'DETAILS',
    'SUMMARY',
    // 15.5.12 The marquee element
    'MARQUEE',
]);
// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same here for Node.js environments.
const NODETYPE = {
    ELEMENT: 1,
    TEXT: 3,
};
/**
 * Determine the action for an element.
 *
 * The lookup order is: the {@link domActions} table by node name, then the
 * computed `white-space` and `display` styles when `getComputedStyle` is
 * available, then the built-in {@link defaultBlockElements} list.
 *
 * @param element An element to determine the action for.
 * @return The {@link domActions} for the element.
 */
function actionForElement(element) {
    const nodeName = element.nodeName;
    // Consult own properties only, so that a node name that happens to match
    // an inherited `Object.prototype` member (e.g. `constructor`) is not
    // mistaken for an action.
    if (Object.prototype.hasOwnProperty.call(domActions, nodeName)) {
        const action = domActions[nodeName];
        if (action !== undefined) {
            return action;
        }
    }
    if (typeof win_js_1.win.getComputedStyle === 'function') {
        const style = win_js_1.win.getComputedStyle(element);
        switch (style.whiteSpace) {
            case 'nowrap':
            case 'pre':
                return DomAction.NoBreak;
        }
        const display = style.display;
        if (display) {
            return display === 'inline' ? DomAction.Inline : DomAction.Block;
        }
        // `display` is an empty string if the element is not connected.
    }
    // Use the built-in rules if the `display` property is empty, or if
    // `getComputedStyle` is missing (e.g., jsdom.)
    return defaultBlockElements.has(nodeName)
        ? DomAction.Block
        : DomAction.Inline;
}
/**
 * Represents a node in {@link Paragraph}.
 *
 * It wraps a {@link Text} or a {@link string}.
 *
 * A {@link string} provides the context for the parser, but it can't be split.
 */
class NodeOrText {
    /**
     * @param nodeOrText The wrapped value: a string, or a node whose
     *     `nodeValue` holds the text.
     */
    constructor(nodeOrText) {
        // The pieces this node should be split into; filled in by the caller
        // before `split` is invoked.
        this.chunks = [];
        this.hasBreakOpportunityAfter = false;
        this.nodeOrText = nodeOrText;
    }
    /** Whether the wrapped value is a plain string. */
    get isString() {
        return typeof this.nodeOrText === 'string';
    }
    /** Whether this entry can be split; strings cannot. */
    get canSplit() {
        return !this.isString;
    }
    /** The string itself, or the wrapped node's `nodeValue`. */
    get text() {
        return this.isString
            ? this.nodeOrText
            : this.nodeOrText.nodeValue;
    }
    /** The length of {@link text}, or 0 when there is no text. */
    get length() {
        const text = this.text;
        return text === null || text === undefined ? 0 : text.length;
    }
    /**
     * Split the {@link Text} in the same way as the {@link chunks}.
     * Joining all {@link chunks} must be equal to {@link text}.
     *
     * Does nothing when there are fewer than two chunks.
     *
     * @param separator A string to insert at each chunk boundary, or a node
     *     that is cloned (deeply) for each boundary.
     */
    split(separator) {
        const chunks = this.chunks;
        assert(chunks.length === 0 || chunks.join('') === this.text);
        if (chunks.length <= 1) {
            return;
        }
        assert(this.canSplit);
        const node = this.nodeOrText;
        if (typeof separator === 'string') {
            // If the `separator` is a string, insert it at each boundary.
            node.nodeValue = chunks.join(separator);
            return;
        }
        // Otherwise create a `Text` node for each non-empty chunk, with a clone
        // of the separator node between consecutive chunks, and replace the
        // `node` with them.
        const document = node.ownerDocument;
        const nodes = [];
        chunks.forEach((chunk, index) => {
            if (index > 0) {
                nodes.push(separator.cloneNode(true));
            }
            // An empty chunk contributes a boundary but no text node.
            if (chunk) {
                nodes.push(document.createTextNode(chunk));
            }
        });
        node.replaceWith(...nodes);
    }
}
// A subclass of `NodeOrText` that adds nothing, exported as
// `NodeOrTextForTesting`.
class NodeOrTextForTesting extends NodeOrText {
}
exports.NodeOrTextForTesting = NodeOrTextForTesting;
/**
 * Represents a "paragraph", broken by block boundaries or forced breaks.
 *
 * A CSS
 * {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}
 * is usually a "paragraph", but it can be broken into multiple paragraphs by
 * forced breaks such as `<br>`.
 */
class Paragraph {
    /**
     * @param element The element this paragraph is associated with.
     */
    constructor(element) {
        // The `NodeOrText` entries collected for this paragraph, in order.
        this.nodes = [];
        this.element = element;
    }
    /** @return Whether no nodes have been collected. */
    isEmpty() {
        return this.nodes.length === 0;
    }
    /** The concatenation of the text of all {@link nodes}. */
    get text() {
        return this.nodes.map(node => node.text).join('');
    }
    /** The last entry of {@link nodes}, or `undefined` if there is none. */
    get lastNode() {
        const count = this.nodes.length;
        return count ? this.nodes[count - 1] : undefined;
    }
    /** Marks the last node, if any, as followed by a break opportunity. */
    setHasBreakOpportunityAfter() {
        const lastNode = this.lastNode;
        if (lastNode) {
            lastNode.hasBreakOpportunityAfter = true;
        }
    }
    /**
     * @return Indices of forced break opportunities in the source.
     * They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
     */
    getForcedOpportunities() {
        const opportunities = [];
        // Offset of the current node's first character in the paragraph text.
        let len = 0;
        for (const node of this.nodes) {
            if (node.canSplit) {
                const text = node.text;
                if (text) {
                    for (let i = 0; i < text.length; ++i) {
                        if (text.charCodeAt(i) === ZWSP_CODEPOINT) {
                            // The opportunity is after the ZWSP character.
                            opportunities.push(len + i + 1);
                        }
                    }
                }
            }
            len += node.length;
            if (node.hasBreakOpportunityAfter) {
                opportunities.push(len);
            }
        }
        return opportunities;
    }
    /**
     * @return Filtered {@param boundaries} by excluding
     * {@link getForcedOpportunities} if it's not empty.
     * Otherwise {@param boundaries}.
     */
    excludeForcedOpportunities(boundaries) {
        const forcedOpportunities = this.getForcedOpportunities();
        if (!forcedOpportunities.length) {
            return boundaries;
        }
        const forced = new Set(forcedOpportunities);
        return boundaries.filter(i => !forced.has(i));
    }
}
// A subclass of `Paragraph` that adds nothing, exported as
// `ParagraphForTesting`.
class ParagraphForTesting extends Paragraph {
}
exports.ParagraphForTesting = ParagraphForTesting;
/**
 * Adds HTML processing support to a BudouX {@link Parser}.
 */
class HTMLProcessor {
    /**
     * @param parser A BudouX {@link Parser} to compute semantic line breaks.
     * @param options Optional settings. `className` and `separator` are
     *     copied to this instance when they are defined.
     */
    constructor(parser, options) {
        /** See {@link HTMLProcessorOptions.separator}. */
        this.separator = ZWSP;
        this.parser_ = parser;
        if (options !== undefined) {
            if (options.className !== undefined) {
                this.className = options.className;
            }
            if (options.separator !== undefined) {
                this.separator = options.separator;
            }
        }
    }
    /**
     * Checks if the given element has a text node in its children.
     *
     * @param ele An element to be checked.
     * @return Whether the element has a child text node.
     */
    static hasChildTextNode(ele) {
        for (const child of ele.childNodes) {
            if (child.nodeType === NODETYPE.TEXT) {
                return true;
            }
        }
        return false;
    }
    /**
     * Applies markups for semantic line breaks to the given HTML element.
     *
     * It breaks descendant nodes into paragraphs,
     * and applies the BudouX to each paragraph.
     * @param element The input element.
     */
    applyToElement(element) {
        for (const block of this.getBlocks(element)) {
            assert(!block.isEmpty());
            this.applyToParagraph(block);
        }
    }
    /**
     * Find paragraphs from a given HTML element.
     *
     * This is a generator. A paragraph interrupted by a forced break is
     * yielded and then has its `nodes` reset, so the same {@link Paragraph}
     * object may be yielded more than once; consume each value before
     * advancing the iterator.
     *
     * @param element The root element to find paragraphs.
     * @param parent The parent {@link Paragraph} if any.
     * @return A list of {@link Paragraph}s.
     */
    *getBlocks(element, parent) {
        assert(element.nodeType === NodeType.ELEMENT_NODE);
        // Skip if it was once applied to this element.
        if (this.className && element.classList.contains(this.className)) {
            return;
        }
        const action = actionForElement(element);
        if (action === DomAction.Skip) {
            return;
        }
        if (action === DomAction.Break) {
            if (parent && !parent.isEmpty()) {
                // Flush what was collected so far as its own paragraph, then
                // start collecting afresh in the same object.
                parent.setHasBreakOpportunityAfter();
                yield parent;
                parent.nodes = [];
            }
            assert(!element.firstChild);
            return;
        }
        if (action === DomAction.BreakOpportunity) {
            if (parent) {
                parent.setHasBreakOpportunityAfter();
            }
            return;
        }
        // Determine if this element creates a new inline formatting context, or if
        // this element belongs to the parent inline formatting context.
        assert(action === DomAction.Block ||
            action === DomAction.Inline ||
            action === DomAction.NoBreak);
        const isNewBlock = !parent || action === DomAction.Block;
        const block = isNewBlock ? new Paragraph(element) : parent;
        // Collect all text nodes in this inline formatting context, while searching
        // descendant elements recursively.
        for (const child of element.childNodes) {
            switch (child.nodeType) {
                case NodeType.ELEMENT_NODE:
                    yield* this.getBlocks(child, block);
                    break;
                case NodeType.TEXT_NODE:
                    if (action === DomAction.NoBreak) {
                        // Record the text as a plain string: it gives the parser
                        // context but can't be split.
                        const text = child.nodeValue;
                        if (text) {
                            block.nodes.push(new NodeOrText(text));
                        }
                        break;
                    }
                    block.nodes.push(new NodeOrText(child));
                    break;
            }
        }
        // Apply if this is an inline formatting context.
        if (isNewBlock && !block.isEmpty()) {
            yield block;
        }
    }
    /**
     * Apply the BudouX to the given {@link Paragraph}.
     *
     * Nothing is changed when the paragraph has no splittable node, when its
     * text is whitespace-only, or when the parser reports no boundaries.
     *
     * @param paragraph The {@link Paragraph} to apply.
     */
    applyToParagraph(paragraph) {
        assert(paragraph.nodes.length > 0);
        if (!paragraph.nodes.some(node => node.canSplit)) {
            return;
        }
        const text = paragraph.text;
        // No changes if whitespace-only.
        if (/^\s*$/.test(text)) {
            return;
        }
        // Compute the phrase boundaries.
        const boundaries = this.parser_.parseBoundaries(text);
        // No changes if single phrase.
        if (boundaries.length <= 0) {
            return;
        }
        // The boundaries should be between 1 and `text.length - 1` in the
        // ascending order.
        assert(boundaries[0] > 0);
        assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
        assert(boundaries[boundaries.length - 1] < text.length);
        // Add a sentinel to help iterating. Build a new array for it:
        // `excludeForcedOpportunities` may hand back `boundaries` itself, and
        // the parser's result should not be modified.
        const adjustedBoundaries = [
            ...paragraph.excludeForcedOpportunities(boundaries),
            text.length + 1,
        ];
        this.splitNodes(paragraph.nodes, adjustedBoundaries);
        this.applyBlockStyle(paragraph.element);
    }
    /**
     * Split {@link NodeOrText} at the specified boundaries.
     *
     * The boundaries are first distributed to each node's `chunks`, then every
     * node is split with this processor's `separator`.
     *
     * @param nodes A list of {@link NodeOrText}.
     * @param boundaries A list of indices of the text to split at, in
     *     ascending order. The last entry must be a sentinel greater than the
     *     total text length.
     */
    splitNodes(nodes, boundaries) {
        assert(boundaries.length > 0);
        assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
        const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
        // The last boundary must be a sentinel.
        assert(boundaries[boundaries.length - 1] > textLen);
        // Distribute `boundaries` to `node.chunks`.
        let boundaryIndex = 0;
        let boundary = boundaries[0];
        assert(boundary > 0);
        let nodeStart = 0; // the start index of the `nodeText` in the whole text.
        let lastNode = null;
        for (const node of nodes) {
            assert(boundary >= nodeStart);
            assert(node.chunks.length === 0);
            const nodeText = node.text;
            if (!nodeText) {
                continue;
            }
            const nodeLength = nodeText.length;
            const nodeEnd = nodeStart + nodeLength;
            assert(!lastNode || lastNode.canSplit);
            if (!node.canSplit) {
                // If there's a boundary between nodes and `lastNode.canSplit`, add a
                // boundary to the end of the `lastNode`.
                if (lastNode && boundary === nodeStart) {
                    if (lastNode.chunks.length === 0) {
                        const lastText = lastNode.text;
                        lastNode.chunks.push(lastText !== null && lastText !== undefined ? lastText : '');
                    }
                    lastNode.chunks.push('');
                }
                // Boundaries inside a non-splittable node are dropped.
                while (boundary < nodeEnd) {
                    boundary = boundaries[++boundaryIndex];
                }
                lastNode = null;
                nodeStart = nodeEnd;
                continue;
            }
            // Check if the next boundary is in this `node`.
            lastNode = node;
            if (boundary >= nodeEnd) {
                nodeStart = nodeEnd;
                continue;
            }
            // Compute the boundary indices in the `node`.
            const chunks = node.chunks;
            let chunkStartInNode = 0;
            while (boundary < nodeEnd) {
                const boundaryInNode = boundary - nodeStart;
                assert(boundaryInNode >= chunkStartInNode);
                chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
                chunkStartInNode = boundaryInNode;
                boundary = boundaries[++boundaryIndex];
            }
            // Add the rest of the `nodeText`.
            assert(chunkStartInNode < nodeLength);
            chunks.push(nodeText.slice(chunkStartInNode));
            nodeStart = nodeEnd;
        }
        // Check if all nodes and boundaries are consumed.
        assert(nodeStart === textLen);
        assert(boundaryIndex < boundaries.length);
        assert(boundaries[boundaryIndex] >= textLen);
        // `node.chunks` are finalized. Split them.
        for (const node of nodes) {
            node.split(this.separator);
        }
    }
    /**
     * Applies the block style to the given element.
     *
     * When a `className` is configured it is added to the element; otherwise
     * `applyWrapStyle` from `./dom.js` is called on it.
     *
     * @param element The element to apply the block style.
     */
    applyBlockStyle(element) {
        if (this.className) {
            element.classList.add(this.className);
            return;
        }
        (0, dom_js_1.applyWrapStyle)(element);
    }
}
exports.HTMLProcessor = HTMLProcessor;
/**
 * BudouX {@link Parser} with HTML processing support.
 */
class HTMLProcessingParser extends parser_js_1.Parser {
    /**
     * @param model The model, passed to the {@link Parser} constructor.
     * @param htmlProcessorOptions Options for the internal
     *     {@link HTMLProcessor}. Defaults to using the zero width space as the
     *     separator.
     */
    constructor(model, htmlProcessorOptions = {
        separator: ZWSP,
    }) {
        super(model);
        this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
    }
    /**
     * @deprecated Use `applyToElement` instead. `applyElement` will be removed
     * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
     *
     * Applies markups for semantic line breaks to the given HTML element.
     * @param parentElement The input element.
     */
    applyElement(parentElement) {
        console.warn('`applyElement` is deprecated. Please use `applyToElement` instead. ' +
            '`applyElement` will be removed in v0.7.0.');
        this.applyToElement(parentElement);
    }
    /**
     * Applies markups for semantic line breaks to the given HTML element.
     * @param parentElement The input element.
     */
    applyToElement(parentElement) {
        this.htmlProcessor.applyToElement(parentElement);
    }
    /**
     * Translates the given HTML string to another HTML string with markups
     * for semantic line breaks.
     *
     * If the parsed body has a text node among its direct children, all of
     * the body's children are first wrapped in a single `<span>`. Every
     * element child of the body is then processed; non-element children
     * (and an empty body) are left untouched.
     *
     * @param html An input html string.
     * @return The translated HTML string.
     */
    translateHTMLString(html) {
        if (html === '') {
            return html;
        }
        const doc = (0, dom_js_1.parseFromString)(html);
        if (HTMLProcessor.hasChildTextNode(doc.body)) {
            const wrapper = doc.createElement('span');
            wrapper.append(...doc.body.childNodes);
            doc.body.append(wrapper);
        }
        // Iterate over a snapshot, and only hand elements to `applyToElement`:
        // the first child may be missing or may not be an element.
        for (const child of Array.from(doc.body.childNodes)) {
            if (child.nodeType === NodeType.ELEMENT_NODE) {
                this.applyToElement(child);
            }
        }
        return doc.body.innerHTML;
    }
}
exports.HTMLProcessingParser = HTMLProcessingParser;
//# sourceMappingURL=html_processor.js.map