UNPKG

budoux/src/html_processor.ts

Version:

19.4 kBPlain TextView Raw

1/**
* @license
* Copyright 2021 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
16
17import {applyWrapStyle, parseFromString} from './dom.js';
18import {Parser} from './parser.js';
19import {win} from './win.js';
20
21const assert = console.assert;
22
/** Code point of U+200B ZERO WIDTH SPACE. */
const ZWSP_CODEPOINT = 0x200b;
/** U+200B ZERO WIDTH SPACE as a one-character string. */
const ZWSP = String.fromCodePoint(ZWSP_CODEPOINT);
25
// Numeric node-type codes, equal to `Node.ELEMENT_NODE` and `Node.TEXT_NODE`.
// A browser provides them on `Node`; they are spelled out here so that the
// same code also works in Node.js environments.
const NodeType = {ELEMENT_NODE: 1, TEXT_NODE: 3};
32
/**
 * The ways an element can take part in paragraph building.
 */
const DomAction = {
  /** An inline content, becomes a part of a paragraph. */
  Inline: 0,
  /** A nested paragraph. */
  Block: 1,
  /** Skip the content. The content before and after are connected. */
  Skip: 2,
  /** A forced break. The content before and after become paragraphs. */
  Break: 3,
  /** The content provides context, but it's not breakable. */
  NoBreak: 4,
  /** Force a break opportunity. */
  BreakOpportunity: 5,
} as const;
/** Union of the numeric values of the `DomAction` table. */
type DomAction = (typeof DomAction)[keyof typeof DomAction];
42
/**
 * Maps an element name to its fixed {@link DomAction}, following
 * {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.
 * Elements that are not listed are resolved by {@link actionForElement}.
 */
const domActions: {[name: string]: DomAction} = (() => {
  const table: {[name: string]: DomAction} = {};
  const register = (action: DomAction, ...names: string[]): void => {
    for (const name of names) table[name] = action;
  };

  // Hidden elements
  // https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements
  register(
    DomAction.Skip,
    'AREA',
    'BASE',
    'BASEFONT',
    'DATALIST',
    'HEAD',
    'LINK',
    'META',
    'NOEMBED',
    'NOFRAMES',
    'PARAM',
    'RP',
    'SCRIPT',
    'STYLE',
    'TEMPLATE',
    'TITLE',
    'NOSCRIPT'
  );

  // Flow content
  // https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3
  register(DomAction.Break, 'HR');
  // Disable if `white-space: pre`.
  register(DomAction.Skip, 'LISTING', 'PLAINTEXT', 'PRE', 'XMP');

  // Phrasing content
  // https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
  register(DomAction.Break, 'BR');
  register(DomAction.Skip, 'RT');
  register(DomAction.BreakOpportunity, 'WBR');

  // Form controls
  // https://html.spec.whatwg.org/multipage/rendering.html#form-controls
  register(DomAction.Skip, 'INPUT', 'SELECT', 'BUTTON', 'TEXTAREA');

  // Other elements where the phrase-based line breaking should be disabled.
  // https://github.com/google/budoux/blob/main/budoux/skip_nodes.json
  register(DomAction.Skip, 'ABBR', 'CODE', 'IFRAME', 'TIME', 'VAR');

  // Deprecated, but supported in all browsers.
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr
  register(DomAction.NoBreak, 'NOBR');

  return table;
})();
102
// Names of the elements treated as blocks when no computed `display` value
// is available. Each string holds the space-separated names of one group.
const defaultBlockElements = new Set(
  [
    // 15.3.2 The page
    'HTML BODY',
    // 15.3.3 Flow content
    'ADDRESS BLOCKQUOTE CENTER DIALOG DIV FIGURE FIGCAPTION FOOTER FORM',
    'HEADER LEGEND LISTING MAIN P',
    // 15.3.6 Sections and headings
    'ARTICLE ASIDE H1 H2 H3 H4 H5 H6 HGROUP NAV SECTION',
    // 15.3.7 Lists
    'DIR DD DL DT MENU OL UL LI',
    // 15.3.8 Tables
    'TABLE CAPTION COL TR TD TH',
    // 15.3.12 The fieldset and legend elements
    'FIELDSET',
    // 15.5.4 The details and summary elements
    'DETAILS SUMMARY',
    // 15.5.12 The marquee element
    'MARQUEE',
  ].flatMap(group => group.split(' '))
);
158
// Node-type codes read by `HTMLProcessor.hasChildTextNode`. The numbers equal
// `Node.ELEMENT_NODE` and `Node.TEXT_NODE`; they are defined locally so that
// the code also works in Node.js environments.
const NODETYPE = {ELEMENT: 1, TEXT: 3};
165
/**
 * Determine the action for an element.
 *
 * The element name is looked up in {@link domActions} first. If it is not
 * listed there, the computed style decides: `white-space: nowrap` or `pre`
 * gives `NoBreak`, otherwise a non-empty `display` gives `Inline` for
 * `inline` and `Block` for everything else. When no computed `display` is
 * available, the built-in {@link defaultBlockElements} list is used.
 *
 * @param element An element to determine the action for.
 * @return The {@link DomAction} for the element.
 */
function actionForElement(element: Element): DomAction {
  const nodeName = element.nodeName;
  // Only own entries of the table count. A plain indexed read would also
  // find members inherited from `Object.prototype`, so a case-preserved node
  // name such as `toString` or `constructor` would yield a function instead
  // of a `DomAction`.
  const action = Object.prototype.hasOwnProperty.call(domActions, nodeName)
    ? domActions[nodeName]
    : undefined;
  if (action !== undefined) return action;

  if (typeof win.getComputedStyle === 'function') {
    const style = win.getComputedStyle(element);
    switch (style.whiteSpace) {
      case 'nowrap':
      case 'pre':
        return DomAction.NoBreak;
    }

    const display = style.display;
    if (display) {
      return display === 'inline' ? DomAction.Inline : DomAction.Block;
    }
    // `display` is an empty string if the element is not connected.
  }

  // Use the built-in rules if the `display` property is empty, or if
  // `getComputedStyle` is missing (e.g., jsdom.)
  return defaultBlockElements.has(nodeName)
    ? DomAction.Block
    : DomAction.Inline;
}
196
/**
 * One entry of a {@link Paragraph}: either a {@link Text} node or a plain
 * string.
 *
 * Only a {@link Text} entry can be split. A string entry contributes its
 * characters as context for the parser, but it is never modified.
 */
class NodeOrText {
  nodeOrText: Text | string;
  chunks: string[] = [];
  hasBreakOpportunityAfter = false;

  constructor(nodeOrText: Text | string) {
    this.nodeOrText = nodeOrText;
  }

  /** Whether this entry wraps a plain string. */
  get isString(): boolean {
    return typeof this.nodeOrText === 'string';
  }

  /** Whether this entry wraps a {@link Text} and therefore can be split. */
  get canSplit(): boolean {
    return !this.isString;
  }

  /** The string itself, or the `nodeValue` of the wrapped {@link Text}. */
  get text(): string | null {
    const wrapped = this.nodeOrText;
    return typeof wrapped === 'string' ? wrapped : wrapped.nodeValue;
  }

  /** Length of {@link text}; `0` when there is no text. */
  get length(): number {
    const value = this.text;
    return value == null ? 0 : value.length;
  }

  /**
   * Split the {@link Text} in the same way as the {@link chunks}.
   * Joining all {@link chunks} must be equal to {@link text}.
   *
   * A string separator is written into the text at every chunk boundary. A
   * {@link Node} separator is deep-cloned between the chunks, and the
   * original {@link Text} is replaced by the resulting node sequence.
   */
  split(separator: string | Node) {
    const pieces = this.chunks;
    assert(pieces.length === 0 || pieces.join('') === this.text);
    // Zero or one chunk means there is no boundary to mark in this entry.
    if (pieces.length <= 1) return;
    assert(this.canSplit);
    const textNode = this.nodeOrText as Text;

    if (typeof separator !== 'string') {
      // One `Text` per non-empty chunk, with a clone of the separator in
      // front of every chunk but the first. An empty chunk still gets its
      // separator; it only produces no `Text`.
      const doc = textNode.ownerDocument;
      const replacements: Node[] = [];
      for (let i = 0; i < pieces.length; ++i) {
        if (i > 0) replacements.push(separator.cloneNode(true));
        const piece = pieces[i];
        if (piece) replacements.push(doc.createTextNode(piece));
      }
      textNode.replaceWith(...replacements);
      return;
    }

    textNode.nodeValue = pieces.join(separator);
  }
}
export class NodeOrTextForTesting extends NodeOrText {}
262
/**
 * A "paragraph": a run of content delimited by block boundaries or forced
 * breaks.
 *
 * A CSS
 * {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}
 * is usually one paragraph, but forced breaks such as `<br>` can cut it into
 * several.
 */
class Paragraph {
  element: HTMLElement;
  nodes: NodeOrText[] = [];

  constructor(element: HTMLElement) {
    this.element = element;
  }

  /** Whether no node has been collected. */
  isEmpty(): boolean {
    return !this.nodes.length;
  }

  /** The texts of all nodes, concatenated in order. */
  get text(): string {
    let joined = '';
    for (const node of this.nodes) {
      // A node without text contributes nothing.
      joined += node.text ?? '';
    }
    return joined;
  }

  /** The most recently collected node, or `undefined` when empty. */
  get lastNode(): NodeOrText | undefined {
    return this.nodes[this.nodes.length - 1];
  }

  /** Marks a break opportunity after the last node, if there is one. */
  setHasBreakOpportunityAfter() {
    const tail = this.lastNode;
    if (!tail) return;
    tail.hasBreakOpportunityAfter = true;
  }

  /**
   * @return Indices of forced break opportunities in the source.
   * They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
   */
  getForcedOpportunities(): number[] {
    const zwsp = String.fromCharCode(ZWSP_CODEPOINT);
    const found: number[] = [];
    let offset = 0; // index of the current node's first character
    for (const node of this.nodes) {
      const text = node.canSplit ? node.text : null;
      if (text) {
        // The opportunity is right after each ZERO WIDTH SPACE.
        for (
          let at = text.indexOf(zwsp);
          at >= 0;
          at = text.indexOf(zwsp, at + 1)
        ) {
          found.push(offset + at + 1);
        }
      }
      offset += node.length;
      if (node.hasBreakOpportunityAfter) found.push(offset);
    }
    return found;
  }

  /**
   * @param boundaries Indices of boundaries in the paragraph text.
   * @return `boundaries` without the indices reported by
   * {@link getForcedOpportunities}. When there are no forced opportunities,
   * the given array itself is returned.
   */
  excludeForcedOpportunities(boundaries: number[]): number[] {
    const forced = this.getForcedOpportunities();
    if (forced.length === 0) return boundaries;
    const forcedSet = new Set<number>(forced);
    return boundaries.filter(index => !forcedSet.has(index));
  }
}
export class ParagraphForTesting extends Paragraph {}
333
/**
 * Settings accepted by the {@link HTMLProcessor} constructor.
 */
export interface HTMLProcessorOptions {
  /**
   * Class name to add to the containing block once the BudouX is applied.
   * For the result to work, the containing block should be styled with
   * `{ word-break: keep-all; overflow-wrap: anywhere; }`
   *
   * When falsy, an inline style is set instead.
   */
  className?: string;
  /**
   * What to insert at each semantics boundary.
   *
   * A string is inserted as is. When it's a {@link Node}, a clone of the
   * {@link Node} will be inserted.
   *
   * The default value is U+200B ZERO WIDTH SPACE.
   */
  separator?: string | Node;
}
355
/**
 * Adds HTML processing support to a BudouX {@link Parser}.
 *
 * The processor walks an element tree, groups its text nodes into
 * {@link Paragraph}s, asks the parser for the phrase boundaries of each
 * paragraph text, and inserts the configured separator at those boundaries.
 */
export class HTMLProcessor {
  private parser_: Parser;
  /** See {@link HTMLProcessorOptions.className}. */
  className?: string;
  /** See {@link HTMLProcessorOptions.separator}. */
  separator: string | Node = ZWSP;

  /**
   * @param parser A BudouX {@link Parser} to compute semantic line breaks.
   * @param options Optional settings; properties left `undefined` keep their
   * defaults.
   */
  constructor(parser: Parser, options?: HTMLProcessorOptions) {
    this.parser_ = parser;
    if (options !== undefined) {
      if (options.className !== undefined) this.className = options.className;
      if (options.separator !== undefined) this.separator = options.separator;
    }
  }

  /**
   * Checks if the given element has a text node in its children.
   *
   * @param ele An element to be checked.
   * @return Whether the element has a child text node.
   */
  static hasChildTextNode(ele: HTMLElement) {
    for (const child of ele.childNodes) {
      if (child.nodeType === NODETYPE.TEXT) return true;
    }
    return false;
  }

  /**
   * Applies markups for semantic line breaks to the given HTML element.
   *
   * It breaks descendant nodes into paragraphs,
   * and applies the BudouX to each paragraph.
   * Each paragraph is processed as soon as {@link getBlocks} yields it,
   * because a forced break reuses and then clears the yielded object.
   * @param element The input element.
   */
  applyToElement(element: HTMLElement) {
    for (const block of this.getBlocks(element)) {
      assert(!block.isEmpty());
      this.applyToParagraph(block);
    }
  }

  /**
   * Find paragraphs from a given HTML element.
   *
   * A paragraph yielded for a forced break (`Break` action) is the parent
   * paragraph itself; its `nodes` are cleared right after the consumer
   * resumes the generator, so it must be consumed before advancing.
   * @param element The root element to find paragraphs.
   * @param parent The parent {@link Paragraph} if any.
   * @return A list of {@link Paragraph}s.
   */
  *getBlocks(
    element: HTMLElement,
    parent?: Paragraph
  ): IterableIterator<Paragraph> {
    assert(element.nodeType === NodeType.ELEMENT_NODE);

    // Skip if it was once applied to this element.
    if (this.className && element.classList.contains(this.className)) return;

    const action = actionForElement(element);
    if (action === DomAction.Skip) return;
    if (action === DomAction.Break) {
      if (parent && !parent.isEmpty()) {
        parent.setHasBreakOpportunityAfter();
        yield parent;
        parent.nodes = [];
      }
      assert(!element.firstChild);
      return;
    }
    if (action === DomAction.BreakOpportunity) {
      if (parent) parent.setHasBreakOpportunityAfter();
      return;
    }

    // Determine if this element creates a new inline formatting context, or if
    // this element belongs to the parent inline formatting context.
    assert(
      action === DomAction.Block ||
        action === DomAction.Inline ||
        action === DomAction.NoBreak
    );
    const isNewBlock = !parent || action === DomAction.Block;
    const block = isNewBlock ? new Paragraph(element) : parent;

    // Collect all text nodes in this inline formatting context, while searching
    // descendant elements recursively.
    //
    // Iterate over a snapshot of the children. A forced break yields the
    // paragraph in the middle of this loop and the consumer splits it at
    // once; with a `Node` separator that replaces already visited `Text`
    // nodes by several nodes. Iterating the live `childNodes` would then
    // visit the inserted pieces again and process them a second time.
    for (const child of Array.from(element.childNodes)) {
      switch (child.nodeType) {
        case NodeType.ELEMENT_NODE:
          for (const childBlock of this.getBlocks(child as HTMLElement, block))
            yield childBlock;
          break;
        case NodeType.TEXT_NODE:
          if (action === DomAction.NoBreak) {
            // Keep the text as parser context only; a string can't be split.
            const text = child.nodeValue;
            if (text) {
              block.nodes.push(new NodeOrText(text));
            }
            break;
          }
          block.nodes.push(new NodeOrText(child as Text));
          break;
      }
    }

    // Apply if this is an inline formatting context.
    if (isNewBlock && !block.isEmpty()) yield block;
  }

  /**
   * Apply the BudouX to the given {@link Paragraph}.
   *
   * Nothing changes when the paragraph has no splittable node, is
   * whitespace-only, or the parser reports no boundary.
   * @param paragraph The {@link Paragraph} to apply.
   */
  applyToParagraph(paragraph: Paragraph): void {
    assert(paragraph.nodes.length > 0);
    if (!paragraph.nodes.some(node => node.canSplit)) return;
    const text = paragraph.text;
    // No changes if whitespace-only.
    if (/^\s*$/.test(text)) return;

    // Compute the phrase boundaries.
    const boundaries = this.parser_.parseBoundaries(text);
    // No changes if single phrase.
    if (boundaries.length <= 0) return;
    // The boundaries should be between 1 and `text.length - 1` in the
    // ascending order.
    assert(boundaries[0] > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    assert(boundaries[boundaries.length - 1] < text.length);

    // Boundaries that already have a break opportunity in the source need no
    // separator.
    const adjustedBoundaries = paragraph.excludeForcedOpportunities(boundaries);

    // Add a sentinel to help iterating.
    adjustedBoundaries.push(text.length + 1);

    this.splitNodes(paragraph.nodes, adjustedBoundaries);
    this.applyBlockStyle(paragraph.element);
  }

  /**
   * Split {@link NodeOrText} at the specified boundaries.
   * @param nodes A list of {@link NodeOrText}.
   * @param boundaries A list of indices of the text to split at, in the
   * ascending order. The last one must be a sentinel greater than the total
   * text length.
   */
  splitNodes(nodes: NodeOrText[], boundaries: number[]): void {
    assert(boundaries.length > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
    // The last boundary must be a sentinel.
    assert(boundaries[boundaries.length - 1] > textLen);

    // Distribute `boundaries` to `node.chunks`.
    let boundary_index = 0;
    let boundary = boundaries[0];
    assert(boundary > 0);
    let nodeStart = 0; // the start index of the `nodeText` in the whole text.
    let lastNode: NodeOrText | null = null;
    for (const node of nodes) {
      assert(boundary >= nodeStart);
      assert(node.chunks.length === 0);
      const nodeText = node.text;
      if (!nodeText) continue;
      const nodeLength = nodeText.length;
      const nodeEnd = nodeStart + nodeLength;
      assert(!lastNode || lastNode.canSplit);
      if (!node.canSplit) {
        // If there's a boundary between nodes and `lastNode.canSplit`, add a
        // boundary to the end of the `lastNode`.
        if (lastNode && boundary === nodeStart) {
          if (lastNode.chunks.length === 0)
            lastNode.chunks.push(lastNode.text ?? '');
          lastNode.chunks.push('');
        }
        // Boundaries inside an unsplittable node are dropped.
        while (boundary < nodeEnd) {
          boundary = boundaries[++boundary_index];
        }
        lastNode = null;
        nodeStart = nodeEnd;
        continue;
      }

      // Check if the next boundary is in this `node`.
      lastNode = node;
      if (boundary >= nodeEnd) {
        nodeStart = nodeEnd;
        continue;
      }

      // Compute the boundary indices in the `node`.
      const chunks = node.chunks;
      let chunkStartInNode = 0;
      while (boundary < nodeEnd) {
        const boundaryInNode = boundary - nodeStart;
        assert(boundaryInNode >= chunkStartInNode);
        chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
        chunkStartInNode = boundaryInNode;
        boundary = boundaries[++boundary_index];
      }
      // Add the rest of the `nodeText`.
      assert(chunkStartInNode < nodeLength);
      chunks.push(nodeText.slice(chunkStartInNode));

      nodeStart = nodeEnd;
    }
    // Check if all nodes and boundaries are consumed.
    assert(nodeStart === textLen);
    assert(boundary_index < boundaries.length);
    assert(boundaries[boundary_index] >= textLen);

    // `node.chunks` are finalized. Split them.
    for (const node of nodes) {
      node.split(this.separator);
    }
  }

  /**
   * Applies the block style to the given element.
   *
   * Adds {@link className} when it is set; otherwise calls `applyWrapStyle`
   * on the element.
   * @param element The element to apply the block style.
   */
  applyBlockStyle(element: HTMLElement): void {
    if (this.className) {
      element.classList.add(this.className);
      return;
    }
    applyWrapStyle(element);
  }
}
588
/**
 * BudouX {@link Parser} with HTML processing support.
 */
export class HTMLProcessingParser extends Parser {
  htmlProcessor: HTMLProcessor;

  /**
   * @param model The model passed on to the {@link Parser} constructor.
   * @param htmlProcessorOptions Options for the internal
   * {@link HTMLProcessor}. Defaults to a U+200B ZERO WIDTH SPACE separator.
   */
  constructor(
    model: {[key: string]: {[key: string]: number}},
    htmlProcessorOptions: HTMLProcessorOptions = {
      separator: ZWSP,
    }
  ) {
    super(model);
    this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
  }

  /**
   * @deprecated Use `applyToElement` instead. `applyElement` will be removed
   * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
   *
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyElement(parentElement: HTMLElement) {
    console.warn(
      '`applyElement` is deprecated. Please use `applyToElement` instead. ' +
        '`applyElement` will be removed in v0.7.0.'
    );
    this.applyToElement(parentElement);
  }

  /**
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyToElement(parentElement: HTMLElement) {
    this.htmlProcessor.applyToElement(parentElement);
  }

  /**
   * Translates the given HTML string to another HTML string with markups
   * for semantic line breaks.
   *
   * When the parsed body has a text node among its direct children, all of
   * its children are first wrapped in a single `<span>`, which then appears
   * in the result. Every element child of the body is processed.
   * @param html An input html string.
   * @return The translated HTML string.
   */
  translateHTMLString(html: string) {
    if (html === '') return html;
    const doc = parseFromString(html);
    if (HTMLProcessor.hasChildTextNode(doc.body)) {
      const wrapper = doc.createElement('span');
      wrapper.append(...doc.body.childNodes);
      doc.body.append(wrapper);
    }
    // Process every root element, not only the first child node: the input
    // may have several root elements, may start with a non-element node such
    // as a comment, or may leave the body without any child at all.
    // `children` is copied first so the loop does not depend on a live
    // collection.
    for (const child of Array.from(doc.body.children)) {
      this.applyToElement(child as HTMLElement);
    }
    return doc.body.innerHTML;
  }
}

1	`/**`
2	`* @license`
3	`* Copyright 2021 Google LLC`
4	`* Licensed under the Apache License, Version 2.0 (the "License");`
5	`* you may not use this file except in compliance with the License.`
6	`* You may obtain a copy of the License at`
7	`*`
8	`* https://www.apache.org/licenses/LICENSE-2.0`
9	`*`
10	`* Unless required by applicable law or agreed to in writing, software`
11	`* distributed under the License is distributed on an "AS IS" BASIS,`
12	`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
13	`* See the License for the specific language governing permissions and`
14	`* limitations under the License.`
15	`*/`
16
17	`import {applyWrapStyle, parseFromString} from './dom.js';`
18	`import {Parser} from './parser.js';`
19	`import {win} from './win.js';`
20
21	`const assert = console.assert;`
22
23	`const ZWSP_CODEPOINT = 0x200b; // U+200B ZERO WIDTH SPACE`
24	`const ZWSP = String.fromCharCode(ZWSP_CODEPOINT);`
25
26	// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
27	`// but we define the same here for Node.js environments.`
28	`const NodeType = {`
29	`ELEMENT_NODE: 1,`
30	`TEXT_NODE: 3,`
31	`};`
32
33	`const DomAction = {`
34	`Inline: 0, // An inline content, becomes a part of a paragraph.`
35	`Block: 1, // A nested paragraph.`
36	`Skip: 2, // Skip the content. The content before and after are connected.`
37	`Break: 3, // A forced break. The content before and after become paragraphs.`
38	`NoBreak: 4, // The content provides context, but it's not breakable.`
39	`BreakOpportunity: 5, // Force a break opportunity.`
40	`} as const;`
41	`type DomAction = (typeof DomAction)[keyof typeof DomAction];`
42
43	`/**`
44	`* Determines the action from an element name, as defined in`
45	`* {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.`
46	`* See also {@link actionForElement}.`
47	`*/`
48	`const domActions: {[name: string]: DomAction} = {`
49	`// Hidden elements`
50	`// https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements`
51	`AREA: DomAction.Skip,`
52	`BASE: DomAction.Skip,`
53	`BASEFONT: DomAction.Skip,`
54	`DATALIST: DomAction.Skip,`
55	`HEAD: DomAction.Skip,`
56	`LINK: DomAction.Skip,`
57	`META: DomAction.Skip,`
58	`NOEMBED: DomAction.Skip,`
59	`NOFRAMES: DomAction.Skip,`
60	`PARAM: DomAction.Skip,`
61	`RP: DomAction.Skip,`
62	`SCRIPT: DomAction.Skip,`
63	`STYLE: DomAction.Skip,`
64	`TEMPLATE: DomAction.Skip,`
65	`TITLE: DomAction.Skip,`
66	`NOSCRIPT: DomAction.Skip,`
67
68	`// Flow content`
69	`// https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3`
70	`HR: DomAction.Break,`
71	// Disable if `white-space: pre`.
72	`LISTING: DomAction.Skip,`
73	`PLAINTEXT: DomAction.Skip,`
74	`PRE: DomAction.Skip,`
75	`XMP: DomAction.Skip,`
76
77	`// Phrasing content`
78	`// https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3`
79	`BR: DomAction.Break,`
80	`RT: DomAction.Skip,`
81	`WBR: DomAction.BreakOpportunity,`
82
83	`// Form controls`
84	`// https://html.spec.whatwg.org/multipage/rendering.html#form-controls`
85	`INPUT: DomAction.Skip,`
86	`SELECT: DomAction.Skip,`
87	`BUTTON: DomAction.Skip,`
88	`TEXTAREA: DomAction.Skip,`
89
90	`// Other elements where the phrase-based line breaking should be disabled.`
91	`// https://github.com/google/budoux/blob/main/budoux/skip_nodes.json`
92	`ABBR: DomAction.Skip,`
93	`CODE: DomAction.Skip,`
94	`IFRAME: DomAction.Skip,`
95	`TIME: DomAction.Skip,`
96	`VAR: DomAction.Skip,`
97
98	`// Deprecated, but supported in all browsers.`
99	`// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr`
100	`NOBR: DomAction.NoBreak,`
101	`};`
102
103	`const defaultBlockElements = new Set([`
104	`// 15.3.2 The page`
105	`'HTML',`
106	`'BODY',`
107	`// 15.3.3 Flow content`
108	`'ADDRESS',`
109	`'BLOCKQUOTE',`
110	`'CENTER',`
111	`'DIALOG',`
112	`'DIV',`
113	`'FIGURE',`
114	`'FIGCAPTION',`
115	`'FOOTER',`
116	`'FORM',`
117	`'HEADER',`
118	`'LEGEND',`
119	`'LISTING',`
120	`'MAIN',`
121	`'P',`
122	`// 15.3.6 Sections and headings`
123	`'ARTICLE',`
124	`'ASIDE',`
125	`'H1',`
126	`'H2',`
127	`'H3',`
128	`'H4',`
129	`'H5',`
130	`'H6',`
131	`'HGROUP',`
132	`'NAV',`
133	`'SECTION',`
134	`// 15.3.7 Lists`
135	`'DIR',`
136	`'DD',`
137	`'DL',`
138	`'DT',`
139	`'MENU',`
140	`'OL',`
141	`'UL',`
142	`'LI',`
143	`// 15.3.8 Tables`
144	`'TABLE',`
145	`'CAPTION',`
146	`'COL',`
147	`'TR',`
148	`'TD',`
149	`'TH',`
150	`// 15.3.12 The fieldset and legend elements`
151	`'FIELDSET',`
152	`// 15.5.4 The details and summary elements`
153	`'DETAILS',`
154	`'SUMMARY',`
155	`// 15.5.12 The marquee element`
156	`'MARQUEE',`
157	`]);`
158
159	// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
160	`// but we define the same here for Node.js environments.`
161	`const NODETYPE = {`
162	`ELEMENT: 1,`
163	`TEXT: 3,`
164	`};`
165
166	`/**`
167	`* Determine the action for an element.`
168	`* @param element An element to determine the action for.`
169	`* @return The {@link domActions} for the element.`
170	`*/`
171	`function actionForElement(element: Element): DomAction {`
172	`const nodeName = element.nodeName;`
173	`const action = domActions[nodeName];`
174	`if (action !== undefined) return action;`
175
176	`if (typeof win.getComputedStyle === 'function') {`
177	`const style = win.getComputedStyle(element);`
178	`switch (style.whiteSpace) {`
179	`case 'nowrap':`
180	`case 'pre':`
181	`return DomAction.NoBreak;`
182	`}`
183
184	`const display = style.display;`
185	`if (display)`
186	`return display === 'inline' ? DomAction.Inline : DomAction.Block;`
187	// `display` is an empty string if the element is not connected.
188	`}`
189
190	// Use the built-in rules if the `display` property is empty, or if
191	// `getComputedStyle` is missing (e.g., jsdom.)
192	`return defaultBlockElements.has(nodeName)`
193	`? DomAction.Block`
194	`: DomAction.Inline;`
195	`}`
196
197	`/**`
198	`* Represents a node in {@link Paragraph}.`
199	`*`
200	`* It wraps a {@link Text} or a {@link string}.`
201	`*`
202	`* A {@link string} provides the context for the parser, but it can't be split.`
203	`*/`
204	`class NodeOrText {`
205	`nodeOrText: Text \| string;`
206	`chunks: string[] = [];`
207	`hasBreakOpportunityAfter = false;`
208
209	`constructor(nodeOrText: Text \| string) {`
210	`this.nodeOrText = nodeOrText;`
211	`}`
212
213	`get isString(): boolean {`
214	`return typeof this.nodeOrText === 'string';`
215	`}`
216	`get canSplit(): boolean {`
217	`return !this.isString;`
218	`}`
219	`get text(): string \| null {`
220	`return this.isString`
221	`? (this.nodeOrText as string)`
222	`: (this.nodeOrText as Text).nodeValue;`
223	`}`
224	`get length(): number {`
225	`return this.text?.length ?? 0;`
226	`}`
227
228	`/**`
229	`* Split the {@link Text} in the same way as the {@link chunks}.`
230	`* Joining all {@link chunks} must be equal to {@link text}.`
231	`*/`
232	`split(separator: string \| Node) {`
233	`const chunks = this.chunks;`
234	`assert(chunks.length === 0 \|\| chunks.join('') === this.text);`
235	`if (chunks.length <= 1) return;`
236	`assert(this.canSplit);`
237	`const node = this.nodeOrText as Text;`
238	`if (typeof separator === 'string') {`
239	// If the `separator` is a string, insert it at each boundary.
240	`node.nodeValue = chunks.join(separator);`
241	`return;`
242	`}`
243
244	// Otherwise create a `Text` node for each chunk, with the separator node
245	// between them, and replace the `node` with them.
246	`const document = node.ownerDocument;`
247	`let nodes = [];`
248	`for (const chunk of chunks) {`
249	`if (chunk) nodes.push(document.createTextNode(chunk));`
250	`// Add a separator between chunks. To simplify the logic, add a separator`
251	`// after each chunk, then remove the last one.`
252	// To avoid `cloneNode` for the temporary one that is going to be removed,
253	// add `null` as a marker, then replace them with `cloneNode` later.
254	`nodes.push(null);`
255	`}`
256	`nodes.pop();`
257	`nodes = nodes.map(n => (n ? n : separator.cloneNode(true)));`
258	`node.replaceWith(...nodes);`
259	`}`
260	`}`
261	`export class NodeOrTextForTesting extends NodeOrText {}`
262
263	`/**`
264	`* Represents a "paragraph", broken by block boundaries or forced breaks.`
265	`*`
266	`* A CSS`
267	`* {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}`
268	`* is usually a "paragraph", but it can be broken into multiple paragraphs by`
269	* forced breaks such as `<br>`.
270	`*/`
271	`class Paragraph {`
272	`element: HTMLElement;`
273	`nodes: NodeOrText[] = [];`
274
275	`constructor(element: HTMLElement) {`
276	`this.element = element;`
277	`}`
278
279	`isEmpty(): boolean {`
280	`return this.nodes.length === 0;`
281	`}`
282	`get text(): string {`
283	`return this.nodes.map(node => node.text).join('');`
284	`}`
285
286	`get lastNode(): NodeOrText \| undefined {`
287	`return this.nodes.length ? this.nodes[this.nodes.length - 1] : undefined;`
288	`}`
289	`setHasBreakOpportunityAfter() {`
290	`const lastNode = this.lastNode;`
291	`if (lastNode) lastNode.hasBreakOpportunityAfter = true;`
292	`}`
293
294	`/**`
295	`* @return Indices of forced break opportunities in the source.`
296	* They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
297	`*/`
298	`getForcedOpportunities(): number[] {`
299	`const opportunities: number[] = [];`
300	`let len = 0;`
301	`for (const node of this.nodes) {`
302	`if (node.canSplit) {`
303	`const text = node.text;`
304	`if (text) {`
305	`for (let i = 0; i < text.length; ++i) {`
306	`if (text.charCodeAt(i) === ZWSP_CODEPOINT) {`
307	`opportunities.push(len + i + 1);`
308	`}`
309	`}`
310	`}`
311	`}`
312	`len += node.length;`
313	`if (node.hasBreakOpportunityAfter) {`
314	`opportunities.push(len);`
315	`}`
316	`}`
317	`return opportunities;`
318	`}`
319
320	`/**`
321	`* @return Filtered {@param boundaries} by excluding`
322	`* {@link getForcedOpportunities} if it's not empty.`
323	`* Otherwise {@param boundaries}.`
324	`*/`
325	`excludeForcedOpportunities(boundaries: number[]): number[] {`
326	`const forcedOpportunities = this.getForcedOpportunities();`
327	`if (!forcedOpportunities.length) return boundaries;`
328	`const set = new Set<number>(forcedOpportunities);`
329	`return boundaries.filter(i => !set.has(i));`
330	`}`
331	`}`
332	`export class ParagraphForTesting extends Paragraph {}`
333
334	`/**`
335	`* Options for {@link HTMLProcessor}.`
336	`*/`
337	`export interface HTMLProcessorOptions {`
338	`/**`
339	`* This class name is added to the containing block when the BudouX is applied.`
340	`* The containing block should have following CSS properties to make it work.`
341	* `{ word-break: keep-all; overflow-wrap: anywhere; }`
342	`*`
343	`* When falsy, an inline style is set instead.`
344	`*/`
345	`className?: string;`
346	`/**`
347	`* The separator to insert at each semantics boundary.`
348	`*`
349	`* When it's a {@link Node}, a clone of the {@link Node} will be inserted.`
350	`*`
351	`* The default value is U+200B ZERO WIDTH SPACE.`
352	`*/`
353	`separator?: string \| Node;`
354	`}`
355
356	`/**`
357	`* Adds HTML processing support to a BudouX {@link Parser}.`
358	`*/`
359	`export class HTMLProcessor {`
360	`private parser_: Parser;`
361	`/** See {@link HTMLProcessorOptions.className}. */`
362	`className?: string;`
363	`/** See {@link HTMLProcessorOptions.separator}. */`
364	`separator: string \| Node = ZWSP;`
365
366	`/**`
367	`* @param parser A BudouX {@link Parser} to compute semantic line breaks.`
368	`*/`
369	`constructor(parser: Parser, options?: HTMLProcessorOptions) {`
370	`this.parser_ = parser;`
371	`if (options !== undefined) {`
372	`if (options.className !== undefined) this.className = options.className;`
373	`if (options.separator !== undefined) this.separator = options.separator;`
374	`}`
375	`}`
376
377	`/**`
378	`* Checks if the given element has a text node in its children.`
379	`*`
380	`* @param ele An element to be checked.`
381	`* @return Whether the element has a child text node.`
382	`*/`
383	`static hasChildTextNode(ele: HTMLElement) {`
384	`for (const child of ele.childNodes) {`
385	`if (child.nodeType === NODETYPE.TEXT) return true;`
386	`}`
387	`return false;`
388	`}`
389
390	`/**`
391	`* Applies markups for semantic line breaks to the given HTML element.`
392	`*`
393	`* It breaks descendant nodes into paragraphs,`
394	`* and applies the BudouX to each paragraph.`
395	`* @param element The input element.`
396	`*/`
397	`applyToElement(element: HTMLElement) {`
398	`for (const block of this.getBlocks(element)) {`
399	`assert(!block.isEmpty());`
400	`this.applyToParagraph(block);`
401	`}`
402	`}`
403
/**
 * Finds paragraphs under a given HTML element.
 *
 * This is a generator; paragraphs are produced lazily while the subtree is
 * walked. When a forced-break element is met, the non-empty `parent` is
 * yielded first and its node list is reset once the consumer resumes the
 * generator.
 * @param element The root element to find paragraphs.
 * @param parent The parent {@link Paragraph} if any.
 * @return An iterator over the {@link Paragraph}s found.
 */
*getBlocks(
  element: HTMLElement,
  parent?: Paragraph
): IterableIterator<Paragraph> {
  assert(element.nodeType === NodeType.ELEMENT_NODE);

  // An element that already carries the class was processed before.
  const appliedClass = this.className;
  if (appliedClass && element.classList.contains(appliedClass)) return;

  // Actions that end the traversal of this element right away.
  const action = actionForElement(element);
  switch (action) {
    case DomAction.Skip:
      return;

    case DomAction.Break:
      // Flush what the parent has collected so far, then start over.
      if (parent && !parent.isEmpty()) {
        parent.setHasBreakOpportunityAfter();
        yield parent;
        parent.nodes = [];
      }
      assert(!element.firstChild);
      return;

    case DomAction.BreakOpportunity:
      if (parent) parent.setHasBreakOpportunityAfter();
      return;
  }

  // What remains either opens a new inline formatting context or joins the
  // one owned by `parent`.
  assert(
    action === DomAction.Block ||
      action === DomAction.Inline ||
      action === DomAction.NoBreak
  );
  const startsNewBlock = !parent || action === DomAction.Block;
  const block = startsNewBlock ? new Paragraph(element) : parent;

  // Gather the text nodes of this inline formatting context, descending
  // into child elements recursively.
  const isNoBreak = action === DomAction.NoBreak;
  for (const child of element.childNodes) {
    const childType = child.nodeType;
    if (childType === NodeType.ELEMENT_NODE) {
      yield* this.getBlocks(child as HTMLElement, block);
    } else if (childType === NodeType.TEXT_NODE) {
      if (isNoBreak) {
        // Record only the string value instead of the `Text` node itself.
        const text = child.nodeValue;
        if (text) block.nodes.push(new NodeOrText(text));
      } else {
        block.nodes.push(new NodeOrText(child as Text));
      }
    }
  }

  // Only the owner of the inline formatting context emits it.
  if (startsNewBlock && !block.isEmpty()) yield block;
}
469
/**
 * Applies the BudouX to the given {@link Paragraph}.
 *
 * Nothing is changed when the paragraph has no splittable node, when its
 * text is whitespace-only, or when the parser reports no boundary.
 * @param paragraph The {@link Paragraph} to apply.
 */
applyToParagraph(paragraph: Paragraph): void {
  assert(paragraph.nodes.length > 0);
  const hasSplittableNode = paragraph.nodes.some(node => node.canSplit);
  if (!hasSplittableNode) return;

  const paragraphText = paragraph.text;
  // Whitespace-only text is left untouched.
  const isBlank = /^\s*$/.test(paragraphText);
  if (isBlank) return;

  // Ask the parser for the phrase boundaries; none means a single phrase.
  const phraseBoundaries = this.parser_.parseBoundaries(paragraphText);
  if (phraseBoundaries.length === 0) return;

  // Boundaries are expected to lie strictly inside the text and to be in
  // the ascending order.
  assert(phraseBoundaries[0] > 0);
  assert(
    phraseBoundaries.every((x, i) => i === 0 || x > phraseBoundaries[i - 1])
  );
  assert(phraseBoundaries[phraseBoundaries.length - 1] < paragraphText.length);

  // Let the paragraph adjust the list (see
  // `Paragraph.excludeForcedOpportunities`), then terminate it with a
  // sentinel that lies past the end of the text.
  const splitPoints = paragraph.excludeForcedOpportunities(phraseBoundaries);
  splitPoints.push(paragraphText.length + 1);

  this.splitNodes(paragraph.nodes, splitPoints);
  this.applyBlockStyle(paragraph.element);
}
499
/**
 * Split {@link NodeOrText} at the specified boundaries.
 *
 * `boundaries` are indices into the concatenation of the texts of `nodes`.
 * They are first distributed to `node.chunks` of each node, and then every
 * node is asked to split itself with `this.separator`.
 *
 * Note that `assert` in this file is `console.assert`, so a violated
 * precondition is reported on the console rather than thrown.
 * @param nodes A list of {@link NodeOrText}.
 * @param boundaries A list of indices of the text to split at, in the
 *     ascending order. The last item must be a sentinel that is greater
 *     than the total text length.
 */
splitNodes(nodes: NodeOrText[], boundaries: number[]): void {
  assert(boundaries.length > 0);
  assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
  const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
  // The last boundary must be a sentinel. Being larger than any `nodeEnd`,
  // it is what stops the `while (boundary < nodeEnd)` loops below.
  assert(boundaries[boundaries.length - 1] > textLen);

  // Distribute `boundaries` to `node.chunks`.
  let boundary_index = 0;
  let boundary = boundaries[0];
  assert(boundary > 0);
  let nodeStart = 0; // the start index of the `nodeText` in the whole text.
  // The most recent splittable node with text, or `null` right after a
  // non-splittable node.
  let lastNode: NodeOrText | null = null;
  for (const node of nodes) {
    assert(boundary >= nodeStart);
    assert(node.chunks.length === 0);
    const nodeText = node.text;
    // Nodes without text do not advance `nodeStart` nor change `lastNode`.
    if (!nodeText) continue;
    const nodeLength = nodeText.length;
    const nodeEnd = nodeStart + nodeLength;
    assert(!lastNode || lastNode.canSplit);
    if (!node.canSplit) {
      // If there's a boundary between nodes and `lastNode.canSplit`, add a
      // boundary to the end of the `lastNode`.
      if (lastNode && boundary === nodeStart) {
        // Keep the whole text as the first chunk if `lastNode` had no
        // boundary of its own, then append an empty trailing chunk.
        if (lastNode.chunks.length === 0)
          lastNode.chunks.push(lastNode.text ?? '');
        lastNode.chunks.push('');
      }
      // Boundaries before the end of a non-splittable node are dropped.
      while (boundary < nodeEnd) {
        boundary = boundaries[++boundary_index];
      }
      lastNode = null;
      nodeStart = nodeEnd;
      continue;
    }

    // Check if the next boundary is in this `node`.
    lastNode = node;
    // A boundary exactly at `nodeEnd` is left for the following node.
    if (boundary >= nodeEnd) {
      nodeStart = nodeEnd;
      continue;
    }

    // Compute the boundary indices in the `node`.
    const chunks = node.chunks;
    let chunkStartInNode = 0;
    while (boundary < nodeEnd) {
      // Convert the index in the whole text to an index in this node.
      const boundaryInNode = boundary - nodeStart;
      assert(boundaryInNode >= chunkStartInNode);
      chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
      chunkStartInNode = boundaryInNode;
      boundary = boundaries[++boundary_index];
    }
    // Add the rest of the `nodeText`.
    assert(chunkStartInNode < nodeLength);
    chunks.push(nodeText.slice(chunkStartInNode));

    nodeStart = nodeEnd;
  }
  // Check if all nodes and boundaries are consumed.
  assert(nodeStart === textLen);
  assert(boundary_index < boundaries.length);
  assert(boundaries[boundary_index] >= textLen);

  // `node.chunks` are finalized. Split them.
  for (const node of nodes) {
    node.split(this.separator);
  }
}
575
/**
 * Applies the block style to the given element.
 *
 * When `className` is set, the class is added to the element; otherwise
 * the element is passed to `applyWrapStyle`.
 * @param element The element to apply the block style.
 */
applyBlockStyle(element: HTMLElement): void {
  const className = this.className;
  if (!className) {
    applyWrapStyle(element);
  } else {
    element.classList.add(className);
  }
}
587	`}`
588
/**
 * BudouX {@link Parser} with HTML processing support.
 *
 * The HTML work is delegated to an {@link HTMLProcessor} that uses this
 * parser to compute the boundaries.
 */
export class HTMLProcessingParser extends Parser {
  /** The processor that performs the DOM manipulation for this parser. */
  htmlProcessor: HTMLProcessor;

  /**
   * @param model The model passed on to the {@link Parser} constructor.
   * @param htmlProcessorOptions Options for the underlying
   *     {@link HTMLProcessor}. Defaults to using ZWSP as the separator.
   */
  constructor(
    model: {[key: string]: {[key: string]: number}},
    htmlProcessorOptions: HTMLProcessorOptions = {separator: ZWSP}
  ) {
    super(model);
    this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
  }

  /**
   * @deprecated Use `applyToElement` instead. `applyElement` will be removed
   * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
   *
   * Logs a deprecation warning and then forwards to `applyToElement`.
   * @param parentElement The input element.
   */
  applyElement(parentElement: HTMLElement) {
    const deprecationNotice =
      '`applyElement` is deprecated. Please use `applyToElement` instead. ' +
      '`applyElement` will be removed in v0.7.0.';
    console.warn(deprecationNotice);
    this.applyToElement(parentElement);
  }

  /**
   * Applies markups for semantic line breaks to the given HTML element by
   * delegating to the {@link HTMLProcessor}.
   * @param parentElement The input element.
   */
  applyToElement(parentElement: HTMLElement) {
    this.htmlProcessor.applyToElement(parentElement);
  }

  /**
   * Translates the given HTML string to another HTML string with markups
   * for semantic line breaks.
   *
   * When the parsed body has a text node as a direct child, all of its
   * children are first moved into a new `<span>`. Only the first child of
   * the body is processed.
   * @param html An input html string.
   * @return The translated HTML string. An empty input is returned as is.
   */
  translateHTMLString(html: string) {
    if (html === '') return html;

    const parsed = parseFromString(html);
    const body = parsed.body;
    if (HTMLProcessor.hasChildTextNode(body)) {
      // Give the loose text a single element to live in.
      const span = parsed.createElement('span');
      span.append(...body.childNodes);
      body.append(span);
    }

    const root = body.childNodes[0] as HTMLElement;
    this.applyToElement(root);
    return body.innerHTML;
  }
}