UNPKG

20.5 kBJavaScriptView Raw
1"use strict";
/**
 * @license
 * Copyright 2021 Google LLC
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17Object.defineProperty(exports, "__esModule", { value: true });
18exports.HTMLProcessingParser = exports.HTMLProcessor = exports.ParagraphForTesting = exports.NodeOrTextForTesting = void 0;
19const dom_js_1 = require("./dom.js");
20const parser_js_1 = require("./parser.js");
21const win_js_1 = require("./win.js");
// Module-wide assertion helper, taken from the global `console` object.
const { assert } = console;
// `Node.ELEMENT_NODE` and `Node.TEXT_NODE` exist in a browser context, but the
// same numeric values are declared here so the module also works in Node.js.
const NodeType = {
    ELEMENT_NODE: 1,
    TEXT_NODE: 3,
};
// U+200B ZERO WIDTH SPACE, as a code point and as a one-character string.
const ZWSP_CODEPOINT = 0x200b;
const ZWSP = String.fromCodePoint(ZWSP_CODEPOINT);
/**
 * The ways an element can take part in paragraph collection.
 * Each name is mapped to its position in the list below (0, 1, 2, ...).
 */
const DomAction = [
    'Inline', // An inline content, becomes a part of a paragraph.
    'Block', // A nested paragraph.
    'Skip', // Skip the content. The content before and after are connected.
    'Break', // A forced break. The content before and after become paragraphs.
    'NoBreak', // The content provides context, but it's not breakable.
    'BreakOpportunity', // Force a break opportunity.
].reduce((actions, name, ordinal) => {
    actions[name] = ordinal;
    return actions;
}, {});
/**
 * Maps an element name to its fixed {@link DomAction}, following
 * {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.
 * Elements that are not listed here are classified by {@link actionForElement}.
 *
 * The table is filled group by group; each group pairs one action with the
 * element names that use it.
 */
const domActions = {};
for (const [action, nodeNames] of [
    // Hidden elements
    // https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements
    [DomAction.Skip, [
        'AREA', 'BASE', 'BASEFONT', 'DATALIST', 'HEAD', 'LINK', 'META',
        'NOEMBED', 'NOFRAMES', 'PARAM', 'RP', 'SCRIPT', 'STYLE', 'TEMPLATE',
        'TITLE', 'NOSCRIPT',
    ]],
    // Flow content
    // https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3
    [DomAction.Break, ['HR']],
    // Disable if `white-space: pre`.
    [DomAction.Skip, ['LISTING', 'PLAINTEXT', 'PRE', 'XMP']],
    // Phrasing content
    // https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
    [DomAction.Break, ['BR']],
    [DomAction.Skip, ['RT']],
    [DomAction.BreakOpportunity, ['WBR']],
    // Form controls
    // https://html.spec.whatwg.org/multipage/rendering.html#form-controls
    [DomAction.Skip, ['INPUT', 'SELECT', 'BUTTON', 'TEXTAREA']],
    // Other elements where the phrase-based line breaking should be disabled.
    // https://github.com/google/budoux/blob/main/budoux/skip_nodes.json
    [DomAction.Skip, ['ABBR', 'CODE', 'IFRAME', 'TIME', 'VAR']],
    // Deprecated, but supported in all browsers.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr
    [DomAction.NoBreak, ['NOBR']],
]) {
    for (const nodeName of nodeNames) {
        domActions[nodeName] = action;
    }
}
/**
 * Element names treated as block-level by the built-in rules.
 * The groups follow the section headings quoted in the comments below.
 */
const defaultBlockElements = new Set([].concat(
    // 15.3.2 The page
    ['HTML', 'BODY'],
    // 15.3.3 Flow content
    [
        'ADDRESS', 'BLOCKQUOTE', 'CENTER', 'DIALOG', 'DIV', 'FIGURE',
        'FIGCAPTION', 'FOOTER', 'FORM', 'HEADER', 'LEGEND', 'LISTING',
        'MAIN', 'P',
    ],
    // 15.3.6 Sections and headings
    [
        'ARTICLE', 'ASIDE', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HGROUP',
        'NAV', 'SECTION',
    ],
    // 15.3.7 Lists
    ['DIR', 'DD', 'DL', 'DT', 'MENU', 'OL', 'UL', 'LI'],
    // 15.3.8 Tables
    ['TABLE', 'CAPTION', 'COL', 'TR', 'TD', 'TH'],
    // 15.3.12 The fieldset and legend elements
    ['FIELDSET'],
    // 15.5.4 The details and summary elements
    ['DETAILS', 'SUMMARY'],
    // 15.5.12 The marquee element
    ['MARQUEE']));
// Node type codes with the same values as `Node.ELEMENT_NODE` and
// `Node.TEXT_NODE`; declared locally because the `Node` global is only
// available in a browser context, not in Node.js.
const NODETYPE = { ELEMENT: 1, TEXT: 3 };
/**
 * Determine the action for an element.
 *
 * The lookup order is:
 * 1. the fixed per-element table {@link domActions};
 * 2. the computed `white-space` and `display` styles, when
 *    `getComputedStyle` is available and `display` is not empty;
 * 3. the built-in {@link defaultBlockElements} list.
 *
 * @param element An element to determine the action for.
 * @return The {@link DomAction} for the element.
 */
function actionForElement(element) {
    const nodeName = element.nodeName;
    // Only consult the table's own keys. `domActions` is a plain object, so a
    // bare `domActions[nodeName]` would also resolve members inherited from
    // `Object.prototype` (e.g. an XML element named `constructor` or
    // `toString`) and return a function instead of an action.
    if (Object.prototype.hasOwnProperty.call(domActions, nodeName))
        return domActions[nodeName];
    if (typeof win_js_1.win.getComputedStyle === 'function') {
        const style = win_js_1.win.getComputedStyle(element);
        // Text that cannot wrap still provides context, but must not be split.
        switch (style.whiteSpace) {
            case 'nowrap':
            case 'pre':
                return DomAction.NoBreak;
        }
        const display = style.display;
        if (display)
            return display === 'inline' ? DomAction.Inline : DomAction.Block;
        // `display` is an empty string if the element is not connected.
    }
    // Use the built-in rules if the `display` property is empty, or if
    // `getComputedStyle` is missing (e.g., jsdom.)
    return defaultBlockElements.has(nodeName)
        ? DomAction.Block
        : DomAction.Inline;
}
/**
 * One entry of a {@link Paragraph}: either a DOM {@link Text} node or a plain
 * {@link string}.
 *
 * A plain string only contributes context for the parser; it is never split.
 */
class NodeOrText {
    constructor(nodeOrText) {
        this.chunks = [];
        this.hasBreakOpportunityAfter = false;
        this.nodeOrText = nodeOrText;
    }
    /** Whether the wrapped value is a plain string. */
    get isString() {
        return typeof this.nodeOrText === 'string';
    }
    /** Only wrapped nodes can be split; plain strings cannot. */
    get canSplit() {
        return !this.isString;
    }
    /** The string itself, or the `nodeValue` of the wrapped node. */
    get text() {
        if (this.isString)
            return this.nodeOrText;
        return this.nodeOrText.nodeValue;
    }
    /** Length of {@link text}, or 0 when there is no text. */
    get length() {
        const text = this.text;
        if (text === null || text === undefined)
            return 0;
        const textLength = text.length;
        return textLength === null || textLength === undefined ? 0 : textLength;
    }
    /**
     * Split the wrapped node at the {@link chunks} boundaries.
     * Joining all {@link chunks} must be equal to {@link text}.
     *
     * @param separator A string to insert into the node's value at each
     * boundary, or a node whose deep clones are placed between the chunks.
     */
    split(separator) {
        const { chunks } = this;
        assert(chunks.length === 0 || chunks.join('') === this.text);
        // Zero or one chunk means there is no boundary inside this node.
        if (chunks.length <= 1)
            return;
        assert(this.canSplit);
        const target = this.nodeOrText;
        if (typeof separator === 'string') {
            // String separator: rewrite the node's value in place.
            target.nodeValue = chunks.join(separator);
            return;
        }
        // Node separator: replace the node with one `Text` node per non-empty
        // chunk, with a clone of the separator between consecutive chunks.
        const ownerDocument = target.ownerDocument;
        const textNodes = chunks.map(chunk => chunk ? ownerDocument.createTextNode(chunk) : null);
        const replacements = [];
        textNodes.forEach((textNode, index) => {
            if (index > 0)
                replacements.push(separator.cloneNode(true));
            if (textNode)
                replacements.push(textNode);
        });
        target.replaceWith(...replacements);
    }
}
/** Exported subclass of {@link NodeOrText} that adds no members of its own. */
class NodeOrTextForTesting extends NodeOrText {
}
exports.NodeOrTextForTesting = NodeOrTextForTesting;
/**
 * A "paragraph": a run of content delimited by block boundaries or forced
 * breaks.
 *
 * A CSS
 * {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}
 * is usually one paragraph, but forced breaks such as `<br>` can divide it
 * into several.
 */
class Paragraph {
    constructor(element) {
        this.nodes = [];
        this.element = element;
    }
    /** @return Whether no node has been collected. */
    isEmpty() {
        return !this.nodes.length;
    }
    /** The concatenated text of all nodes. */
    get text() {
        const parts = [];
        for (const node of this.nodes) {
            parts.push(node.text);
        }
        return parts.join('');
    }
    /** The most recently collected node, or `undefined` if there is none. */
    get lastNode() {
        const count = this.nodes.length;
        return count > 0 ? this.nodes[count - 1] : undefined;
    }
    /** Flags the last node, if any, as followed by a break opportunity. */
    setHasBreakOpportunityAfter() {
        const tail = this.lastNode;
        if (!tail)
            return;
        tail.hasBreakOpportunityAfter = true;
    }
    /**
     * @return Indices of forced break opportunities in the source.
     * They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
     */
    getForcedOpportunities() {
        const found = [];
        // Offset of the current node within the paragraph text.
        let offset = 0;
        for (const node of this.nodes) {
            // Only splittable nodes are scanned for ZWSP characters.
            const text = node.canSplit ? node.text : null;
            if (text) {
                for (let index = 0; index < text.length; index += 1) {
                    // The opportunity is just after the ZWSP character.
                    if (text.charCodeAt(index) === ZWSP_CODEPOINT)
                        found.push(offset + index + 1);
                }
            }
            offset += node.length;
            if (node.hasBreakOpportunityAfter)
                found.push(offset);
        }
        return found;
    }
    /**
     * @return {@param boundaries} without the indices reported by
     * {@link getForcedOpportunities}. When there are no forced opportunities,
     * the {@param boundaries} array itself is returned.
     */
    excludeForcedOpportunities(boundaries) {
        const forced = this.getForcedOpportunities();
        if (forced.length === 0)
            return boundaries;
        const forcedSet = new Set(forced);
        return boundaries.filter(boundary => !forcedSet.has(boundary));
    }
}
/** Exported subclass of {@link Paragraph} that adds no members of its own. */
class ParagraphForTesting extends Paragraph {
}
exports.ParagraphForTesting = ParagraphForTesting;
/**
 * Adds HTML processing support to a BudouX {@link Parser}.
 */
class HTMLProcessor {
    /**
     * @param parser A BudouX {@link Parser} to compute semantic line breaks.
     * @param options Optional settings. `className`, when given, is added to
     * processed elements instead of applying the wrap style, and marks
     * elements to skip on later runs. `separator`, when given, replaces the
     * default ZERO WIDTH SPACE separator; it may be a string or a node.
     */
    constructor(parser, options) {
        /** See {@link HTMLProcessorOptions.separator}. */
        this.separator = ZWSP;
        this.parser_ = parser;
        if (options !== undefined) {
            if (options.className !== undefined)
                this.className = options.className;
            if (options.separator !== undefined)
                this.separator = options.separator;
        }
    }
    /**
     * Checks if the given element has a text node in its children.
     *
     * @param ele An element to be checked.
     * @return Whether the element has a child text node.
     */
    static hasChildTextNode(ele) {
        for (const child of ele.childNodes) {
            if (child.nodeType === NodeType.TEXT_NODE)
                return true;
        }
        return false;
    }
    /**
     * Applies markups for semantic line breaks to the given HTML element.
     *
     * It breaks descendant nodes into paragraphs,
     * and applies the BudouX to each paragraph.
     * @param element The input element.
     */
    applyToElement(element) {
        // Each paragraph is processed as soon as it is yielded; `getBlocks`
        // may reuse the same `Paragraph` object after a forced break.
        for (const block of this.getBlocks(element)) {
            assert(!block.isEmpty());
            this.applyToParagraph(block);
        }
    }
    /**
     * Find paragraphs from a given HTML element.
     *
     * This is a generator. At a forced break it yields the current parent
     * paragraph and then clears that paragraph's `nodes` for reuse, so each
     * yielded paragraph should be consumed before requesting the next one.
     *
     * @param element The root element to find paragraphs.
     * @param parent The parent {@link Paragraph} if any.
     * @return A list of {@link Paragraph}s.
     */
    *getBlocks(element, parent) {
        assert(element.nodeType === NodeType.ELEMENT_NODE);
        // Skip if it was once applied to this element.
        if (this.className && element.classList.contains(this.className))
            return;
        const action = actionForElement(element);
        if (action === DomAction.Skip)
            return;
        if (action === DomAction.Break) {
            // Flush what has been collected so far as its own paragraph.
            if (parent && !parent.isEmpty()) {
                parent.setHasBreakOpportunityAfter();
                yield parent;
                parent.nodes = [];
            }
            assert(!element.firstChild);
            return;
        }
        if (action === DomAction.BreakOpportunity) {
            if (parent)
                parent.setHasBreakOpportunityAfter();
            return;
        }
        // Determine if this element creates a new inline formatting context, or if
        // this element belongs to the parent inline formatting context.
        assert(action === DomAction.Block ||
            action === DomAction.Inline ||
            action === DomAction.NoBreak);
        const isNewBlock = !parent || action === DomAction.Block;
        const block = isNewBlock ? new Paragraph(element) : parent;
        // Collect all text nodes in this inline formatting context, while searching
        // descendant elements recursively.
        //
        // Iterate over a snapshot: `childNodes` is a live list, and the consumer
        // of a yielded paragraph may replace already-visited text nodes of this
        // element with several nodes (see `NodeOrText.split`). Iterating the
        // live list would then revisit the inserted nodes and the break element.
        for (const child of Array.from(element.childNodes)) {
            switch (child.nodeType) {
                case NodeType.ELEMENT_NODE:
                    for (const childBlock of this.getBlocks(child, block))
                        yield childBlock;
                    break;
                case NodeType.TEXT_NODE:
                    if (action === DomAction.NoBreak) {
                        // Keep only the string, so that it gives context to the
                        // parser but can't be split.
                        const text = child.nodeValue;
                        if (text) {
                            block.nodes.push(new NodeOrText(text));
                        }
                        break;
                    }
                    block.nodes.push(new NodeOrText(child));
                    break;
            }
        }
        // Apply if this is an inline formatting context.
        if (isNewBlock && !block.isEmpty())
            yield block;
    }
    /**
     * Apply the BudouX to the given {@link Paragraph}.
     *
     * Nothing is changed when the paragraph has no splittable node, is
     * whitespace-only, or the parser reports no boundary.
     * @param paragraph The {@link Paragraph} to apply.
     */
    applyToParagraph(paragraph) {
        assert(paragraph.nodes.length > 0);
        if (!paragraph.nodes.some(node => node.canSplit))
            return;
        const text = paragraph.text;
        // No changes if whitespace-only.
        if (/^\s*$/.test(text))
            return;
        // Compute the phrase boundaries.
        const boundaries = this.parser_.parseBoundaries(text);
        // No changes if single phrase.
        if (boundaries.length <= 0)
            return;
        // The boundaries should be between 1 and `text.length - 1` in the
        // ascending order.
        assert(boundaries[0] > 0);
        assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
        assert(boundaries[boundaries.length - 1] < text.length);
        // Add a sentinel to help iterating. Build a new array rather than
        // pushing, because `excludeForcedOpportunities` may return the
        // parser's own array.
        const adjustedBoundaries = [
            ...paragraph.excludeForcedOpportunities(boundaries),
            text.length + 1,
        ];
        this.splitNodes(paragraph.nodes, adjustedBoundaries);
        this.applyBlockStyle(paragraph.element);
    }
    /**
     * Split {@link NodeOrText} at the specified boundaries.
     * @param nodes A list of {@link NodeOrText}.
     * @param boundaries A list of indices of the text to split at, in
     * ascending order. The last entry must be a sentinel greater than the
     * total text length.
     */
    splitNodes(nodes, boundaries) {
        assert(boundaries.length > 0);
        assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
        const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
        // The last boundary must be a sentinel.
        assert(boundaries[boundaries.length - 1] > textLen);
        // Distribute `boundaries` to `node.chunks`.
        let boundary_index = 0;
        let boundary = boundaries[0];
        assert(boundary > 0);
        let nodeStart = 0; // the start index of the `nodeText` in the whole text.
        let lastNode = null; // the previous node, only while it is splittable.
        for (const node of nodes) {
            assert(boundary >= nodeStart);
            assert(node.chunks.length === 0);
            const nodeText = node.text;
            if (!nodeText)
                continue;
            const nodeLength = nodeText.length;
            const nodeEnd = nodeStart + nodeLength;
            assert(!lastNode || lastNode.canSplit);
            if (!node.canSplit) {
                // If there's a boundary between nodes and `lastNode.canSplit`, add a
                // boundary to the end of the `lastNode`.
                if (lastNode && boundary === nodeStart) {
                    if (lastNode.chunks.length === 0) {
                        const lastText = lastNode.text;
                        lastNode.chunks.push(lastText !== null && lastText !== undefined ? lastText : '');
                    }
                    lastNode.chunks.push('');
                }
                // Boundaries inside an unsplittable node are dropped.
                while (boundary < nodeEnd) {
                    boundary = boundaries[++boundary_index];
                }
                lastNode = null;
                nodeStart = nodeEnd;
                continue;
            }
            // Check if the next boundary is in this `node`.
            lastNode = node;
            if (boundary >= nodeEnd) {
                nodeStart = nodeEnd;
                continue;
            }
            // Compute the boundary indices in the `node`.
            const chunks = node.chunks;
            let chunkStartInNode = 0;
            while (boundary < nodeEnd) {
                const boundaryInNode = boundary - nodeStart;
                assert(boundaryInNode >= chunkStartInNode);
                // A boundary at the very start of the node produces an empty
                // first chunk, i.e. a separator before the node's text.
                chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
                chunkStartInNode = boundaryInNode;
                boundary = boundaries[++boundary_index];
            }
            // Add the rest of the `nodeText`.
            assert(chunkStartInNode < nodeLength);
            chunks.push(nodeText.slice(chunkStartInNode));
            nodeStart = nodeEnd;
        }
        // Check if all nodes and boundaries are consumed.
        assert(nodeStart === textLen);
        assert(boundary_index < boundaries.length);
        assert(boundaries[boundary_index] >= textLen);
        // `node.chunks` are finalized. Split them.
        for (const node of nodes) {
            node.split(this.separator);
        }
    }
    /**
     * Applies the block style to the given element.
     *
     * If a `className` was configured, only that class is added; otherwise
     * the wrap style from `dom.js` is applied.
     * @param element The element to apply the block style.
     */
    applyBlockStyle(element) {
        if (this.className) {
            element.classList.add(this.className);
            return;
        }
        (0, dom_js_1.applyWrapStyle)(element);
    }
}
exports.HTMLProcessor = HTMLProcessor;
/**
 * BudouX {@link Parser} with HTML processing support.
 */
class HTMLProcessingParser extends parser_js_1.Parser {
    /**
     * @param model The model passed on to the base {@link Parser}.
     * @param htmlProcessorOptions Options for the internal
     * {@link HTMLProcessor}. Defaults to a ZERO WIDTH SPACE separator.
     */
    constructor(model, htmlProcessorOptions = {
        separator: ZWSP,
    }) {
        super(model);
        this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
    }
    /**
     * @deprecated Use `applyToElement` instead. `applyElement` will be removed
     * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
     *
     * Applies markups for semantic line breaks to the given HTML element.
     * @param parentElement The input element.
     */
    applyElement(parentElement) {
        console.warn('`applyElement` is deprecated. Please use `applyToElement` instead. ' +
            '`applyElement` will be removed in v0.7.0.');
        this.applyToElement(parentElement);
    }
    /**
     * Applies markups for semantic line breaks to the given HTML element.
     * @param parentElement The input element.
     */
    applyToElement(parentElement) {
        this.htmlProcessor.applyToElement(parentElement);
    }
    /**
     * Translates the given HTML string to another HTML string with markups
     * for semantic line breaks.
     *
     * If the parsed body has a text node as a direct child, all of its
     * children are first wrapped in a single `<span>`. Every element child of
     * the body is then processed.
     * @param html An input html string.
     * @return The translated HTML string.
     */
    translateHTMLString(html) {
        if (html === '')
            return html;
        const doc = (0, dom_js_1.parseFromString)(html);
        if (HTMLProcessor.hasChildTextNode(doc.body)) {
            const wrapper = doc.createElement('span');
            wrapper.append(...doc.body.childNodes);
            doc.body.append(wrapper);
        }
        // Process every top-level element, not only the first child: the input
        // may have several sibling roots, start with a comment, or leave the
        // body empty. Non-element nodes are skipped because `applyToElement`
        // expects an element.
        for (const child of Array.from(doc.body.childNodes)) {
            if (child.nodeType === NodeType.ELEMENT_NODE)
                this.applyToElement(child);
        }
        return doc.body.innerHTML;
    }
}
exports.HTMLProcessingParser = HTMLProcessingParser;
580//# sourceMappingURL=html_processor.js.map
\No newline at end of file