UNPKG

budoux/module/html_processor.js

Version:

20.1 kBJavaScriptView Raw

/**
* @license
* Copyright 2021 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
16import { applyWrapStyle, parseFromString } from './dom.js';
17import { Parser } from './parser.js';
18import { win } from './win.js';
// Lightweight assertion helper. `console.assert` reports a failed condition
// on the console and does not throw, so the checks in this file are
// diagnostics only and never change control flow.
const assert = console.assert;
/** UTF-16 code unit of U+200B ZERO WIDTH SPACE. */
const ZWSP_CODEPOINT = 0x200b;
/** U+200B ZERO WIDTH SPACE as a one-character string. */
const ZWSP = String.fromCharCode(ZWSP_CODEPOINT);
// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same values here for Node.js environments.
const NodeType = {
  ELEMENT_NODE: 1,
  TEXT_NODE: 3,
};
/**
 * How an element participates in paragraph building.
 * The values are returned by `actionForElement` and dispatched on in
 * `HTMLProcessor.getBlocks`.
 */
const DomAction = {
  Inline: 0, // An inline content, becomes a part of a paragraph.
  Block: 1, // A nested paragraph.
  Skip: 2, // Skip the content. The content before and after are connected.
  Break: 3, // A forced break. The content before and after become paragraphs.
  NoBreak: 4, // The content provides context, but it's not breakable.
  BreakOpportunity: 5, // Force a break opportunity.
};
/**
 * Determines the action from an element name, as defined in
 * {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.
 * Keys are upper-case element names; elements that are not listed here are
 * classified by {@link actionForElement}.
 */
const domActions = {
  // Hidden elements
  // https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements
  AREA: DomAction.Skip,
  BASE: DomAction.Skip,
  BASEFONT: DomAction.Skip,
  DATALIST: DomAction.Skip,
  HEAD: DomAction.Skip,
  LINK: DomAction.Skip,
  META: DomAction.Skip,
  NOEMBED: DomAction.Skip,
  NOFRAMES: DomAction.Skip,
  PARAM: DomAction.Skip,
  RP: DomAction.Skip,
  SCRIPT: DomAction.Skip,
  STYLE: DomAction.Skip,
  TEMPLATE: DomAction.Skip,
  TITLE: DomAction.Skip,
  NOSCRIPT: DomAction.Skip,
  // Flow content
  // https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3
  HR: DomAction.Break,
  // Disable if `white-space: pre`.
  LISTING: DomAction.Skip,
  PLAINTEXT: DomAction.Skip,
  PRE: DomAction.Skip,
  XMP: DomAction.Skip,
  // Phrasing content
  // https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
  BR: DomAction.Break,
  RT: DomAction.Skip,
  WBR: DomAction.BreakOpportunity,
  // Form controls
  // https://html.spec.whatwg.org/multipage/rendering.html#form-controls
  INPUT: DomAction.Skip,
  SELECT: DomAction.Skip,
  BUTTON: DomAction.Skip,
  TEXTAREA: DomAction.Skip,
  // Other elements where the phrase-based line breaking should be disabled.
  // https://github.com/google/budoux/blob/main/budoux/skip_nodes.json
  ABBR: DomAction.Skip,
  CODE: DomAction.Skip,
  IFRAME: DomAction.Skip,
  TIME: DomAction.Skip,
  VAR: DomAction.Skip,
  // Deprecated, but supported in all browsers.
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr
  NOBR: DomAction.NoBreak,
};
/**
 * Upper-case names of elements treated as blocks when no computed `display`
 * value is available (see the fallback at the end of `actionForElement`).
 * The section numbers in the comments are kept from the original listing.
 */
const defaultBlockElements = new Set([
  // 15.3.2 The page
  'HTML',
  'BODY',
  // 15.3.3 Flow content
  'ADDRESS',
  'BLOCKQUOTE',
  'CENTER',
  'DIALOG',
  'DIV',
  'FIGURE',
  'FIGCAPTION',
  'FOOTER',
  'FORM',
  'HEADER',
  'LEGEND',
  'LISTING',
  'MAIN',
  'P',
  // 15.3.6 Sections and headings
  'ARTICLE',
  'ASIDE',
  'H1',
  'H2',
  'H3',
  'H4',
  'H5',
  'H6',
  'HGROUP',
  'NAV',
  'SECTION',
  // 15.3.7 Lists
  'DIR',
  'DD',
  'DL',
  'DT',
  'MENU',
  'OL',
  'UL',
  'LI',
  // 15.3.8 Tables
  'TABLE',
  'CAPTION',
  'COL',
  'TR',
  'TD',
  'TH',
  // 15.3.12 The fieldset and legend elements
  'FIELDSET',
  // 15.5.4 The details and summary elements
  'DETAILS',
  'SUMMARY',
  // 15.5.12 The marquee element
  'MARQUEE',
]);
// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same values here for Node.js environments.
// NOTE(review): this carries the same values as `NodeType` above under
// different key names; the two tables could be merged.
const NODETYPE = {
  ELEMENT: 1,
  TEXT: 3,
};
/**
 * Determine the action for an element.
 *
 * Resolution order:
 *  1. a fixed entry in {@link domActions}, keyed by `element.nodeName`;
 *  2. the computed style, when `win.getComputedStyle` is a function:
 *     `white-space: nowrap | pre` gives `NoBreak`, otherwise a non-empty
 *     `display` gives `Inline` for `'inline'` and `Block` for anything else;
 *  3. the built-in {@link defaultBlockElements} list.
 *
 * @param element An element to determine the action for.
 * @return The {@link DomAction} value for the element.
 */
function actionForElement(element) {
  const nodeName = element.nodeName;
  // Only own keys count. A plain `domActions[nodeName]` lookup would also find
  // members inherited from `Object.prototype` (e.g. for an element whose name
  // is `constructor` or `toString`) and return a function as the "action".
  if (Object.prototype.hasOwnProperty.call(domActions, nodeName)) {
    const action = domActions[nodeName];
    if (action !== undefined) return action;
  }
  if (typeof win.getComputedStyle === 'function') {
    const style = win.getComputedStyle(element);
    if (style.whiteSpace === 'nowrap' || style.whiteSpace === 'pre') {
      return DomAction.NoBreak;
    }
    const display = style.display;
    // `display` is an empty string if the element is not connected; in that
    // case fall through to the built-in rules below.
    if (display) {
      return display === 'inline' ? DomAction.Inline : DomAction.Block;
    }
  }
  // Use the built-in rules if the `display` property is empty, or if
  // `getComputedStyle` is missing (e.g., jsdom.)
  return defaultBlockElements.has(nodeName)
    ? DomAction.Block
    : DomAction.Inline;
}
/**
 * Represents a node in {@link Paragraph}.
 *
 * It wraps a {@link Text} or a {@link string}.
 *
 * A {@link string} provides the context for the parser, but it can't be split.
 */
class NodeOrText {
  /**
   * @param nodeOrText Either a text node (an object exposing `nodeValue`) or
   *     a plain string.
   */
  constructor(nodeOrText) {
    // Pieces of `text` to split into; filled in by `HTMLProcessor.splitNodes`.
    this.chunks = [];
    // Set when a forced break opportunity follows this node.
    this.hasBreakOpportunityAfter = false;
    this.nodeOrText = nodeOrText;
  }
  /** Whether the wrapped value is a plain string. */
  get isString() {
    return typeof this.nodeOrText === 'string';
  }
  /** Only wrapped nodes can be split; strings are context only. */
  get canSplit() {
    return !this.isString;
  }
  /** The string itself, or the wrapped node's `nodeValue` (may be `null`). */
  get text() {
    return this.isString ? this.nodeOrText : this.nodeOrText.nodeValue;
  }
  /** Length of {@link text}, or 0 when the text is `null` or `undefined`. */
  get length() {
    const text = this.text;
    return text === null || text === undefined ? 0 : text.length;
  }
  /**
   * Split the {@link Text} in the same way as the {@link chunks}.
   * Joining all {@link chunks} must be equal to {@link text}.
   *
   * Does nothing when there are fewer than two chunks.
   *
   * @param separator A string to insert at each boundary inside the existing
   *     text node, or a node that is cloned (deeply) for each boundary.
   */
  split(separator) {
    const chunks = this.chunks;
    assert(chunks.length === 0 || chunks.join('') === this.text);
    if (chunks.length <= 1) return;
    assert(this.canSplit);
    const node = this.nodeOrText;
    if (typeof separator === 'string') {
      // If the `separator` is a string, insert it at each boundary.
      node.nodeValue = chunks.join(separator);
      return;
    }
    // Otherwise create a `Text` node for each non-empty chunk, with a clone of
    // the separator node between consecutive chunks, and replace the `node`
    // with them. An empty chunk produces no text node but still gets its
    // separator, so a trailing empty chunk ends the list with a separator.
    const ownerDocument = node.ownerDocument;
    const lastIndex = chunks.length - 1;
    const replacements = [];
    chunks.forEach((chunk, index) => {
      if (chunk) replacements.push(ownerDocument.createTextNode(chunk));
      if (index < lastIndex) replacements.push(separator.cloneNode(true));
    });
    node.replaceWith(...replacements);
  }
}
/** Exported alias of the module-private {@link NodeOrText}; adds no members. */
export class NodeOrTextForTesting extends NodeOrText {
}
/**
 * Represents a "paragraph", broken by block boundaries or forced breaks.
 *
 * A CSS
 * {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}
 * is usually a "paragraph", but it can be broken into multiple paragraphs by
 * forced breaks such as `<br>`.
 */
class Paragraph {
  /**
   * @param element The element this paragraph was created for.
   */
  constructor(element) {
    // The `NodeOrText` items making up this paragraph, in document order.
    this.nodes = [];
    this.element = element;
  }
  /** @return Whether no node has been collected. */
  isEmpty() {
    return this.nodes.length === 0;
  }
  /** The concatenated text of all nodes. */
  get text() {
    return this.nodes.map(node => node.text).join('');
  }
  /** The most recently added node, or `undefined` when empty. */
  get lastNode() {
    const count = this.nodes.length;
    return count ? this.nodes[count - 1] : undefined;
  }
  /** Flags the last node as followed by a break opportunity; no-op if empty. */
  setHasBreakOpportunityAfter() {
    const lastNode = this.lastNode;
    if (lastNode) lastNode.hasBreakOpportunityAfter = true;
  }
  /**
   * @return Indices of forced break opportunities in the source.
   * They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
   * Indices are offsets into {@link text}; for a ZERO WIDTH SPACE the index
   * is the position right after the character.
   */
  getForcedOpportunities() {
    const opportunities = [];
    let offset = 0; // start of the current node within the paragraph text
    for (const node of this.nodes) {
      // Only splittable nodes are scanned for ZERO WIDTH SPACE.
      const text = node.canSplit ? node.text : null;
      if (text) {
        for (let i = 0; i < text.length; ++i) {
          if (text.charCodeAt(i) === ZWSP_CODEPOINT) {
            opportunities.push(offset + i + 1);
          }
        }
      }
      offset += node.length;
      if (node.hasBreakOpportunityAfter) opportunities.push(offset);
    }
    return opportunities;
  }
  /**
   * @return Filtered {@param boundaries} by excluding
   * {@link getForcedOpportunities} if it's not empty.
   * Otherwise {@param boundaries} itself (the same array, not a copy).
   */
  excludeForcedOpportunities(boundaries) {
    const forcedOpportunities = this.getForcedOpportunities();
    if (forcedOpportunities.length === 0) return boundaries;
    const forced = new Set(forcedOpportunities);
    return boundaries.filter(index => !forced.has(index));
  }
}
/** Exported alias of the module-private {@link Paragraph}; adds no members. */
export class ParagraphForTesting extends Paragraph {
}
/**
 * Adds HTML processing support to a BudouX {@link Parser}.
 */
export class HTMLProcessor {
  /**
   * @param parser A BudouX {@link Parser} to compute semantic line breaks.
   * @param options Optional settings. `className` and `separator` are copied
   *     when they are not `undefined`; `separator` defaults to U+200B.
   */
  constructor(parser, options) {
    /** See {@link HTMLProcessorOptions.separator}. */
    this.separator = ZWSP;
    this.parser_ = parser;
    if (options !== undefined) {
      if (options.className !== undefined) this.className = options.className;
      if (options.separator !== undefined) this.separator = options.separator;
    }
  }
  /**
   * Checks if the given element has a text node in its children.
   *
   * @param ele An element to be checked.
   * @return Whether the element has a child text node.
   */
  static hasChildTextNode(ele) {
    for (const child of ele.childNodes) {
      if (child.nodeType === NodeType.TEXT_NODE) return true;
    }
    return false;
  }
  /**
   * Applies markups for semantic line breaks to the given HTML element.
   *
   * It breaks descendant nodes into paragraphs,
   * and applies the BudouX to each paragraph.
   * @param element The input element.
   */
  applyToElement(element) {
    for (const block of this.getBlocks(element)) {
      assert(!block.isEmpty());
      this.applyToParagraph(block);
    }
  }
  /**
   * Find paragraphs from a given HTML element.
   *
   * This is a generator. A paragraph yielded because of a forced break has
   * its `nodes` reset right after the consumer resumes the generator, so each
   * yielded paragraph must be processed before the next one is requested.
   *
   * @param element The root element to find paragraphs.
   * @param parent The parent {@link Paragraph} if any.
   * @return A list of {@link Paragraph}s.
   */
  *getBlocks(element, parent) {
    assert(element.nodeType === NodeType.ELEMENT_NODE);
    // Skip if it was once applied to this element.
    if (this.className && element.classList.contains(this.className)) return;
    const action = actionForElement(element);
    if (action === DomAction.Skip) return;
    if (action === DomAction.Break) {
      // A forced break ends the paragraph collected so far.
      if (parent && !parent.isEmpty()) {
        parent.setHasBreakOpportunityAfter();
        yield parent;
        parent.nodes = [];
      }
      assert(!element.firstChild);
      return;
    }
    if (action === DomAction.BreakOpportunity) {
      if (parent) parent.setHasBreakOpportunityAfter();
      return;
    }
    // Determine if this element creates a new inline formatting context, or if
    // this element belongs to the parent inline formatting context.
    assert(action === DomAction.Block ||
      action === DomAction.Inline ||
      action === DomAction.NoBreak);
    const isNewBlock = !parent || action === DomAction.Block;
    const block = isNewBlock ? new Paragraph(element) : parent;
    // Collect all text nodes in this inline formatting context, while searching
    // descendant elements recursively.
    for (const child of element.childNodes) {
      if (child.nodeType === NodeType.ELEMENT_NODE) {
        yield* this.getBlocks(child, block);
      } else if (child.nodeType === NodeType.TEXT_NODE) {
        if (action === DomAction.NoBreak) {
          // Keep the text as a plain string: it gives the parser context but
          // can never be split. Empty text is dropped.
          const text = child.nodeValue;
          if (text) block.nodes.push(new NodeOrText(text));
        } else {
          block.nodes.push(new NodeOrText(child));
        }
      }
    }
    // Apply if this is an inline formatting context.
    if (isNewBlock && !block.isEmpty()) yield block;
  }
  /**
   * Apply the BudouX to the given {@link Paragraph}.
   *
   * Nothing is changed when the paragraph has no splittable node, when its
   * text is whitespace-only, or when the parser reports no boundary.
   *
   * @param paragraph The {@link Paragraph} to apply.
   */
  applyToParagraph(paragraph) {
    assert(paragraph.nodes.length > 0);
    if (!paragraph.nodes.some(node => node.canSplit)) return;
    const text = paragraph.text;
    // No changes if whitespace-only.
    if (/^\s*$/.test(text)) return;
    // Compute the phrase boundaries.
    const boundaries = this.parser_.parseBoundaries(text);
    // No changes if single phrase.
    if (boundaries.length === 0) return;
    // The boundaries should be between 1 and `text.length - 1` in the
    // ascending order.
    assert(boundaries[0] > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    assert(boundaries[boundaries.length - 1] < text.length);
    // `excludeForcedOpportunities` may hand back the parser's own array, so
    // build a new one instead of pushing the sentinel onto it.
    const adjustedBoundaries = [
      ...paragraph.excludeForcedOpportunities(boundaries),
      text.length + 1, // A sentinel to help iterating.
    ];
    this.splitNodes(paragraph.nodes, adjustedBoundaries);
    this.applyBlockStyle(paragraph.element);
  }
  /**
   * Split {@link NodeOrText} at the specified boundaries.
   * @param nodes A list of {@link NodeOrText}.
   * @param boundaries A list of indices of the text to split at, in strictly
   *     ascending order, whose last item is a sentinel greater than the total
   *     text length.
   */
  splitNodes(nodes, boundaries) {
    assert(boundaries.length > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
    // The last boundary must be a sentinel.
    assert(boundaries[boundaries.length - 1] > textLen);
    // Distribute `boundaries` to `node.chunks`.
    let boundaryIndex = 0;
    let boundary = boundaries[0];
    assert(boundary > 0);
    let nodeStart = 0; // the start index of the `nodeText` in the whole text.
    let lastNode = null; // the previous node, when it is splittable.
    for (const node of nodes) {
      assert(boundary >= nodeStart);
      assert(node.chunks.length === 0);
      const nodeText = node.text;
      if (!nodeText) continue;
      const nodeLength = nodeText.length;
      const nodeEnd = nodeStart + nodeLength;
      assert(!lastNode || lastNode.canSplit);
      if (!node.canSplit) {
        // If there's a boundary between nodes and `lastNode.canSplit`, add a
        // boundary to the end of the `lastNode`.
        if (lastNode && boundary === nodeStart) {
          if (lastNode.chunks.length === 0) {
            const lastText = lastNode.text;
            lastNode.chunks.push(
              lastText === null || lastText === undefined ? '' : lastText);
          }
          lastNode.chunks.push('');
        }
        // Boundaries inside an unsplittable node are dropped.
        while (boundary < nodeEnd) {
          boundary = boundaries[++boundaryIndex];
        }
        lastNode = null;
        nodeStart = nodeEnd;
        continue;
      }
      // Check if the next boundary is in this `node`.
      lastNode = node;
      if (boundary >= nodeEnd) {
        nodeStart = nodeEnd;
        continue;
      }
      // Compute the boundary indices in the `node`.
      const chunks = node.chunks;
      let chunkStartInNode = 0;
      while (boundary < nodeEnd) {
        const boundaryInNode = boundary - nodeStart;
        assert(boundaryInNode >= chunkStartInNode);
        chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
        chunkStartInNode = boundaryInNode;
        boundary = boundaries[++boundaryIndex];
      }
      // Add the rest of the `nodeText`.
      assert(chunkStartInNode < nodeLength);
      chunks.push(nodeText.slice(chunkStartInNode));
      nodeStart = nodeEnd;
    }
    // Check if all nodes and boundaries are consumed.
    assert(nodeStart === textLen);
    assert(boundaryIndex < boundaries.length);
    assert(boundaries[boundaryIndex] >= textLen);
    // `node.chunks` are finalized. Split them.
    for (const node of nodes) {
      node.split(this.separator);
    }
  }
  /**
   * Applies the block style to the given element.
   *
   * When `className` is set, the class is added and nothing else is done;
   * otherwise the element is passed to `applyWrapStyle`.
   *
   * @param element The element to apply the block style.
   */
  applyBlockStyle(element) {
    if (this.className) {
      element.classList.add(this.className);
      return;
    }
    applyWrapStyle(element);
  }
}
/**
 * BudouX {@link Parser} with HTML processing support.
 */
export class HTMLProcessingParser extends Parser {
  /**
   * @param model The model, passed through to the {@link Parser} constructor.
   * @param htmlProcessorOptions Options for the internal
   *     {@link HTMLProcessor}. Defaults to using U+200B as the separator.
   */
  constructor(model, htmlProcessorOptions = {
    separator: ZWSP,
  }) {
    super(model);
    this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
  }
  /**
   * @deprecated Use `applyToElement` instead. `applyElement` will be removed
   * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
   *
   * Applies markups for semantic line breaks to the given HTML element.
   * Logs a deprecation warning on every call.
   * @param parentElement The input element.
   */
  applyElement(parentElement) {
    console.warn('`applyElement` is deprecated. Please use `applyToElement` instead. ' +
      '`applyElement` will be removed in v0.7.0.');
    this.applyToElement(parentElement);
  }
  /**
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyToElement(parentElement) {
    this.htmlProcessor.applyToElement(parentElement);
  }
  /**
   * Translates the given HTML string to another HTML string with markups
   * for semantic line breaks.
   *
   * An empty string is returned unchanged. If the parsed body has a text node
   * among its direct children, all of its children are first moved into a
   * new `<span>` wrapper.
   *
   * @param html An input html string.
   * @return The translated HTML string.
   */
  translateHTMLString(html) {
    if (html === '') return html;
    const doc = parseFromString(html);
    if (HTMLProcessor.hasChildTextNode(doc.body)) {
      const wrapper = doc.createElement('span');
      wrapper.append(...doc.body.childNodes);
      doc.body.append(wrapper);
    }
    // The body can be empty, or can start with a non-element node such as a
    // comment; only an element may be handed to `applyToElement`.
    // NOTE(review): only the first child is processed, so when the input is
    // several top-level elements with no top-level text, the later siblings
    // are left untouched - confirm this is intended.
    const root = doc.body.childNodes[0];
    if (root && root.nodeType === NodeType.ELEMENT_NODE) {
      this.applyToElement(root);
    }
    return doc.body.innerHTML;
  }
}
//# sourceMappingURL=html_processor.js.map

1	`/**`
2	`* @license`
3	`* Copyright 2021 Google LLC`
4	`* Licensed under the Apache License, Version 2.0 (the "License");`
5	`* you may not use this file except in compliance with the License.`
6	`* You may obtain a copy of the License at`
7	`*`
8	`* https://www.apache.org/licenses/LICENSE-2.0`
9	`*`
10	`* Unless required by applicable law or agreed to in writing, software`
11	`* distributed under the License is distributed on an "AS IS" BASIS,`
12	`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
13	`* See the License for the specific language governing permissions and`
14	`* limitations under the License.`
15	`*/`
16	`import { applyWrapStyle, parseFromString } from './dom.js';`
17	`import { Parser } from './parser.js';`
18	`import { win } from './win.js';`
19	`const assert = console.assert;`
20	`const ZWSP_CODEPOINT = 0x200b; // U+200B ZERO WIDTH SPACE`
21	`const ZWSP = String.fromCharCode(ZWSP_CODEPOINT);`
22	// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
23	`// but we define the same here for Node.js environments.`
24	`const NodeType = {`
25	`ELEMENT_NODE: 1,`
26	`TEXT_NODE: 3,`
27	`};`
28	`const DomAction = {`
29	`Inline: 0, // An inline content, becomes a part of a paragraph.`
30	`Block: 1, // A nested paragraph.`
31	`Skip: 2, // Skip the content. The content before and after are connected.`
32	`Break: 3, // A forced break. The content before and after become paragraphs.`
33	`NoBreak: 4, // The content provides context, but it's not breakable.`
34	`BreakOpportunity: 5, // Force a break opportunity.`
35	`};`
36	`/**`
37	`* Determines the action from an element name, as defined in`
38	`* {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.`
39	`* See also {@link actionForElement}.`
40	`*/`
41	`const domActions = {`
42	`// Hidden elements`
43	`// https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements`
44	`AREA: DomAction.Skip,`
45	`BASE: DomAction.Skip,`
46	`BASEFONT: DomAction.Skip,`
47	`DATALIST: DomAction.Skip,`
48	`HEAD: DomAction.Skip,`
49	`LINK: DomAction.Skip,`
50	`META: DomAction.Skip,`
51	`NOEMBED: DomAction.Skip,`
52	`NOFRAMES: DomAction.Skip,`
53	`PARAM: DomAction.Skip,`
54	`RP: DomAction.Skip,`
55	`SCRIPT: DomAction.Skip,`
56	`STYLE: DomAction.Skip,`
57	`TEMPLATE: DomAction.Skip,`
58	`TITLE: DomAction.Skip,`
59	`NOSCRIPT: DomAction.Skip,`
60	`// Flow content`
61	`// https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3`
62	`HR: DomAction.Break,`
63	// Disable if `white-space: pre`.
64	`LISTING: DomAction.Skip,`
65	`PLAINTEXT: DomAction.Skip,`
66	`PRE: DomAction.Skip,`
67	`XMP: DomAction.Skip,`
68	`// Phrasing content`
69	`// https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3`
70	`BR: DomAction.Break,`
71	`RT: DomAction.Skip,`
72	`WBR: DomAction.BreakOpportunity,`
73	`// Form controls`
74	`// https://html.spec.whatwg.org/multipage/rendering.html#form-controls`
75	`INPUT: DomAction.Skip,`
76	`SELECT: DomAction.Skip,`
77	`BUTTON: DomAction.Skip,`
78	`TEXTAREA: DomAction.Skip,`
79	`// Other elements where the phrase-based line breaking should be disabled.`
80	`// https://github.com/google/budoux/blob/main/budoux/skip_nodes.json`
81	`ABBR: DomAction.Skip,`
82	`CODE: DomAction.Skip,`
83	`IFRAME: DomAction.Skip,`
84	`TIME: DomAction.Skip,`
85	`VAR: DomAction.Skip,`
86	`// Deprecated, but supported in all browsers.`
87	`// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr`
88	`NOBR: DomAction.NoBreak,`
89	`};`
90	`const defaultBlockElements = new Set([`
91	`// 15.3.2 The page`
92	`'HTML',`
93	`'BODY',`
94	`// 15.3.3 Flow content`
95	`'ADDRESS',`
96	`'BLOCKQUOTE',`
97	`'CENTER',`
98	`'DIALOG',`
99	`'DIV',`
100	`'FIGURE',`
101	`'FIGCAPTION',`
102	`'FOOTER',`
103	`'FORM',`
104	`'HEADER',`
105	`'LEGEND',`
106	`'LISTING',`
107	`'MAIN',`
108	`'P',`
109	`// 15.3.6 Sections and headings`
110	`'ARTICLE',`
111	`'ASIDE',`
112	`'H1',`
113	`'H2',`
114	`'H3',`
115	`'H4',`
116	`'H5',`
117	`'H6',`
118	`'HGROUP',`
119	`'NAV',`
120	`'SECTION',`
121	`// 15.3.7 Lists`
122	`'DIR',`
123	`'DD',`
124	`'DL',`
125	`'DT',`
126	`'MENU',`
127	`'OL',`
128	`'UL',`
129	`'LI',`
130	`// 15.3.8 Tables`
131	`'TABLE',`
132	`'CAPTION',`
133	`'COL',`
134	`'TR',`
135	`'TD',`
136	`'TH',`
137	`// 15.3.12 The fieldset and legend elements`
138	`'FIELDSET',`
139	`// 15.5.4 The details and summary elements`
140	`'DETAILS',`
141	`'SUMMARY',`
142	`// 15.5.12 The marquee element`
143	`'MARQUEE',`
144	`]);`
145	// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
146	`// but we define the same here for Node.js environments.`
147	`const NODETYPE = {`
148	`ELEMENT: 1,`
149	`TEXT: 3,`
150	`};`
151	`/**`
152	`* Determine the action for an element.`
153	`* @param element An element to determine the action for.`
154	`* @return The {@link domActions} for the element.`
155	`*/`
156	`function actionForElement(element) {`
157	`const nodeName = element.nodeName;`
158	`const action = domActions[nodeName];`
159	`if (action !== undefined)`
160	`return action;`
161	`if (typeof win.getComputedStyle === 'function') {`
162	`const style = win.getComputedStyle(element);`
163	`switch (style.whiteSpace) {`
164	`case 'nowrap':`
165	`case 'pre':`
166	`return DomAction.NoBreak;`
167	`}`
168	`const display = style.display;`
169	`if (display)`
170	`return display === 'inline' ? DomAction.Inline : DomAction.Block;`
171	// `display` is an empty string if the element is not connected.
172	`}`
173	// Use the built-in rules if the `display` property is empty, or if
174	// `getComputedStyle` is missing (e.g., jsdom.)
175	`return defaultBlockElements.has(nodeName)`
176	`? DomAction.Block`
177	`: DomAction.Inline;`
178	`}`
179	`/**`
180	`* Represents a node in {@link Paragraph}.`
181	`*`
182	`* It wraps a {@link Text} or a {@link string}.`
183	`*`
184	`* A {@link string} provides the context for the parser, but it can't be split.`
185	`*/`
186	`class NodeOrText {`
187	`constructor(nodeOrText) {`
188	`this.chunks = [];`
189	`this.hasBreakOpportunityAfter = false;`
190	`this.nodeOrText = nodeOrText;`
191	`}`
192	`get isString() {`
193	`return typeof this.nodeOrText === 'string';`
194	`}`
195	`get canSplit() {`
196	`return !this.isString;`
197	`}`
198	`get text() {`
199	`return this.isString`
200	`? this.nodeOrText`
201	`: this.nodeOrText.nodeValue;`
202	`}`
203	`get length() {`
204	`var _a, _b;`
205	`return (_b = (_a = this.text) === null \|\| _a === void 0 ? void 0 : _a.length) !== null && _b !== void 0 ? _b : 0;`
206	`}`
207	`/**`
208	`* Split the {@link Text} in the same way as the {@link chunks}.`
209	`* Joining all {@link chunks} must be equal to {@link text}.`
210	`*/`
211	`split(separator) {`
212	`const chunks = this.chunks;`
213	`assert(chunks.length === 0 \|\| chunks.join('') === this.text);`
214	`if (chunks.length <= 1)`
215	`return;`
216	`assert(this.canSplit);`
217	`const node = this.nodeOrText;`
218	`if (typeof separator === 'string') {`
219	// If the `separator` is a string, insert it at each boundary.
220	`node.nodeValue = chunks.join(separator);`
221	`return;`
222	`}`
223	// Otherwise create a `Text` node for each chunk, with the separator node
224	// between them, and replace the `node` with them.
225	`const document = node.ownerDocument;`
226	`let nodes = [];`
227	`for (const chunk of chunks) {`
228	`if (chunk)`
229	`nodes.push(document.createTextNode(chunk));`
230	`// Add a separator between chunks. To simplify the logic, add a separator`
231	`// after each chunk, then remove the last one.`
232	// To avoid `cloneNode` for the temporary one that is going to be removed,
233	// add `null` as a marker, then replace them with `cloneNode` later.
234	`nodes.push(null);`
235	`}`
236	`nodes.pop();`
237	`nodes = nodes.map(n => (n ? n : separator.cloneNode(true)));`
238	`node.replaceWith(...nodes);`
239	`}`
240	`}`
241	`export class NodeOrTextForTesting extends NodeOrText {`
242	`}`
243	`/**`
244	`* Represents a "paragraph", broken by block boundaries or forced breaks.`
245	`*`
246	`* A CSS`
247	`* {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}`
248	`* is usually a "paragraph", but it can be broken into multiple paragraphs by`
249	* forced breaks such as `<br>`.
250	`*/`
251	`class Paragraph {`
252	`constructor(element) {`
253	`this.nodes = [];`
254	`this.element = element;`
255	`}`
256	`isEmpty() {`
257	`return this.nodes.length === 0;`
258	`}`
259	`get text() {`
260	`return this.nodes.map(node => node.text).join('');`
261	`}`
262	`get lastNode() {`
263	`return this.nodes.length ? this.nodes[this.nodes.length - 1] : undefined;`
264	`}`
265	`setHasBreakOpportunityAfter() {`
266	`const lastNode = this.lastNode;`
267	`if (lastNode)`
268	`lastNode.hasBreakOpportunityAfter = true;`
269	`}`
270	`/**`
271	`* @return Indices of forced break opportunities in the source.`
272	* They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
273	`*/`
274	`getForcedOpportunities() {`
275	`const opportunities = [];`
276	`let len = 0;`
277	`for (const node of this.nodes) {`
278	`if (node.canSplit) {`
279	`const text = node.text;`
280	`if (text) {`
281	`for (let i = 0; i < text.length; ++i) {`
282	`if (text.charCodeAt(i) === ZWSP_CODEPOINT) {`
283	`opportunities.push(len + i + 1);`
284	`}`
285	`}`
286	`}`
287	`}`
288	`len += node.length;`
289	`if (node.hasBreakOpportunityAfter) {`
290	`opportunities.push(len);`
291	`}`
292	`}`
293	`return opportunities;`
294	`}`
295	`/**`
296	`* @return Filtered {@param boundaries} by excluding`
297	`* {@link getForcedOpportunities} if it's not empty.`
298	`* Otherwise {@param boundaries}.`
299	`*/`
300	`excludeForcedOpportunities(boundaries) {`
301	`const forcedOpportunities = this.getForcedOpportunities();`
302	`if (!forcedOpportunities.length)`
303	`return boundaries;`
304	`const set = new Set(forcedOpportunities);`
305	`return boundaries.filter(i => !set.has(i));`
306	`}`
307	`}`
308	`export class ParagraphForTesting extends Paragraph {`
309	`}`
310	`/**`
311	`* Adds HTML processing support to a BudouX {@link Parser}.`
312	`*/`
313	`export class HTMLProcessor {`
314	`/**`
315	`* @param parser A BudouX {@link Parser} to compute semantic line breaks.`
316	`*/`
317	`constructor(parser, options) {`
318	`/** See {@link HTMLProcessorOptions.separator}. */`
319	`this.separator = ZWSP;`
320	`this.parser_ = parser;`
321	`if (options !== undefined) {`
322	`if (options.className !== undefined)`
323	`this.className = options.className;`
324	`if (options.separator !== undefined)`
325	`this.separator = options.separator;`
326	`}`
327	`}`
328	`/**`
329	`* Checks if the given element has a text node in its children.`
330	`*`
331	`* @param ele An element to be checked.`
332	`* @return Whether the element has a child text node.`
333	`*/`
334	`static hasChildTextNode(ele) {`
335	`for (const child of ele.childNodes) {`
336	`if (child.nodeType === NODETYPE.TEXT)`
337	`return true;`
338	`}`
339	`return false;`
340	`}`
341	`/**`
342	`* Applies markups for semantic line breaks to the given HTML element.`
343	`*`
344	`* It breaks descendant nodes into paragraphs,`
345	`* and applies the BudouX to each paragraph.`
346	`* @param element The input element.`
347	`*/`
348	`applyToElement(element) {`
349	`for (const block of this.getBlocks(element)) {`
350	`assert(!block.isEmpty());`
351	`this.applyToParagraph(block);`
352	`}`
353	`}`
354	`/**`
355	`* Find paragraphs from a given HTML element.`
356	`* @param element The root element to find paragraphs.`
357	`* @param parent The parent {@link Paragraph} if any.`
358	`* @return A list of {@link Paragraph}s.`
359	`*/`
360	`*getBlocks(element, parent) {`
361	`assert(element.nodeType === NodeType.ELEMENT_NODE);`
362	`// Skip if it was once applied to this element.`
363	`if (this.className && element.classList.contains(this.className))`
364	`return;`
365	`const action = actionForElement(element);`
366	`if (action === DomAction.Skip)`
367	`return;`
368	`if (action === DomAction.Break) {`
369	`if (parent && !parent.isEmpty()) {`
370	`parent.setHasBreakOpportunityAfter();`
371	`yield parent;`
372	`parent.nodes = [];`
373	`}`
374	`assert(!element.firstChild);`
375	`return;`
376	`}`
377	`if (action === DomAction.BreakOpportunity) {`
378	`if (parent)`
379	`parent.setHasBreakOpportunityAfter();`
380	`return;`
381	`}`
382	`// Determine if this element creates a new inline formatting context, or if`
383	`// this element belongs to the parent inline formatting context.`
384	`assert(action === DomAction.Block \|\|`
385	`action === DomAction.Inline \|\|`
386	`action === DomAction.NoBreak);`
387	`const isNewBlock = !parent \|\| action === DomAction.Block;`
388	`const block = isNewBlock ? new Paragraph(element) : parent;`
389	`// Collect all text nodes in this inline formatting context, while searching`
390	`// descendant elements recursively.`
391	`for (const child of element.childNodes) {`
392	`switch (child.nodeType) {`
393	`case NodeType.ELEMENT_NODE:`
394	`for (const childBlock of this.getBlocks(child, block))`
395	`yield childBlock;`
396	`break;`
397	`case NodeType.TEXT_NODE:`
398	`if (action === DomAction.NoBreak) {`
399	`const text = child.nodeValue;`
400	`if (text) {`
401	`block.nodes.push(new NodeOrText(text));`
402	`}`
403	`break;`
404	`}`
405	`block.nodes.push(new NodeOrText(child));`
406	`break;`
407	`}`
408	`}`
409	`// Apply if this is an inline formatting context.`
410	`if (isNewBlock && !block.isEmpty())`
411	`yield block;`
412	`}`
413	`/**`
414	`* Apply the BudouX to the given {@link Paragraph}.`
415	`* @param paragraph The {@link Paragraph} to apply.`
416	`*/`
417	`applyToParagraph(paragraph) {`
418	`assert(paragraph.nodes.length > 0);`
419	`if (!paragraph.nodes.some(node => node.canSplit))`
420	`return;`
421	`const text = paragraph.text;`
422	`// No changes if whitespace-only.`
423	`if (/^\s*$/.test(text))`
424	`return;`
425	`// Compute the phrase boundaries.`
426	`const boundaries = this.parser_.parseBoundaries(text);`
427	`// No changes if single phrase.`
428	`if (boundaries.length <= 0)`
429	`return;`
430	// The boundaries should be between 1 and `text.length - 1` in the
431	`// ascending order.`
432	`assert(boundaries[0] > 0);`
433	`assert(boundaries.every((x, i) => i === 0 \|\| x > boundaries[i - 1]));`
434	`assert(boundaries[boundaries.length - 1] < text.length);`
435	`const adjustedBoundaries = paragraph.excludeForcedOpportunities(boundaries);`
436	`// Add a sentinel to help iterating.`
437	`adjustedBoundaries.push(text.length + 1);`
438	`this.splitNodes(paragraph.nodes, adjustedBoundaries);`
439	`this.applyBlockStyle(paragraph.element);`
440	`}`
441	`/**`
442	`* Split {@link NodeOrText} at the specified boundaries.`
443	`* @param nodes A list of {@link NodeOrText}.`
444	`* @param boundaries A list of indices of the text to split at.`
445	`*/`
446	`splitNodes(nodes, boundaries) {`
447	`var _a;`
448	`assert(boundaries.length > 0);`
449	`assert(boundaries.every((x, i) => i === 0 \|\| x > boundaries[i - 1]));`
450	`const textLen = nodes.reduce((sum, node) => sum + node.length, 0);`
451	`// The last boundary must be a sentinel.`
452	`assert(boundaries[boundaries.length - 1] > textLen);`
453	// Distribute `boundaries` to `node.chunks`.
454	`let boundary_index = 0;`
455	`let boundary = boundaries[0];`
456	`assert(boundary > 0);`
457	let nodeStart = 0; // the start index of the `nodeText` in the whole text.
458	`let lastNode = null;`
459	`for (const node of nodes) {`
460	`assert(boundary >= nodeStart);`
461	`assert(node.chunks.length === 0);`
462	`const nodeText = node.text;`
463	`if (!nodeText)`
464	`continue;`
465	`const nodeLength = nodeText.length;`
466	`const nodeEnd = nodeStart + nodeLength;`
467	`assert(!lastNode \|\| lastNode.canSplit);`
468	`if (!node.canSplit) {`
469	// If there's a boundary between nodes and `lastNode.canSplit`, add a
470	// boundary to the end of the `lastNode`.
471	`if (lastNode && boundary === nodeStart) {`
472	`if (lastNode.chunks.length === 0)`
473	`lastNode.chunks.push((_a = lastNode.text) !== null && _a !== void 0 ? _a : '');`
474	`lastNode.chunks.push('');`
475	`}`
476	`while (boundary < nodeEnd) {`
477	`boundary = boundaries[++boundary_index];`
478	`}`
479	`lastNode = null;`
480	`nodeStart = nodeEnd;`
481	`continue;`
482	`}`
483	// Check if the next boundary is in this `node`.
484	`lastNode = node;`
485	`if (boundary >= nodeEnd) {`
486	`nodeStart = nodeEnd;`
487	`continue;`
488	`}`
489	// Compute the boundary indices in the `node`.
490	`const chunks = node.chunks;`
491	`let chunkStartInNode = 0;`
492	`while (boundary < nodeEnd) {`
493	`const boundaryInNode = boundary - nodeStart;`
494	`assert(boundaryInNode >= chunkStartInNode);`
495	`chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));`
496	`chunkStartInNode = boundaryInNode;`
497	`boundary = boundaries[++boundary_index];`
498	`}`
499	// Add the rest of the `nodeText`.
500	`assert(chunkStartInNode < nodeLength);`
501	`chunks.push(nodeText.slice(chunkStartInNode));`
502	`nodeStart = nodeEnd;`
503	`}`
504	`// Check if all nodes and boundaries are consumed.`
505	`assert(nodeStart === textLen);`
506	`assert(boundary_index < boundaries.length);`
507	`assert(boundaries[boundary_index] >= textLen);`
508	// `node.chunks` are finalized. Split them.
509	`for (const node of nodes) {`
510	`node.split(this.separator);`
511	`}`
512	`}`
513	`/**`
514	`* Applies the block style to the given element.`
515	`* @param element The element to apply the block style.`
516	`*/`
517	`applyBlockStyle(element) {`
518	`if (this.className) {`
519	`element.classList.add(this.className);`
520	`return;`
521	`}`
522	`applyWrapStyle(element);`
523	`}`
524	`}`
/**
 * BudouX {@link Parser} with HTML processing support.
 */
export class HTMLProcessingParser extends Parser {
    /**
     * @param model The model, passed through to the {@link Parser}
     *     constructor.
     * @param htmlProcessorOptions Options handed to the internal
     *     {@link HTMLProcessor}. Defaults to using ZWSP as the separator.
     */
    constructor(model, htmlProcessorOptions = {
        separator: ZWSP,
    }) {
        super(model);
        this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
    }
    /**
     * @deprecated Use `applyToElement` instead. `applyElement` will be removed
     * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
     *
     * Applies markups for semantic line breaks to the given HTML element.
     * @param parentElement The input element.
     */
    applyElement(parentElement) {
        console.warn('`applyElement` is deprecated. Please use `applyToElement` instead. ' +
            '`applyElement` will be removed in v0.7.0.');
        this.applyToElement(parentElement);
    }
    /**
     * Applies markups for semantic line breaks to the given HTML element.
     * @param parentElement The input element.
     */
    applyToElement(parentElement) {
        this.htmlProcessor.applyToElement(parentElement);
    }
    /**
     * Translates the given HTML string to another HTML string with markups
     * for semantic line breaks.
     *
     * Only the first child of the parsed body is processed. When the body has
     * a direct text child, all of its children are first moved into a single
     * wrapping `span`, which then becomes that first child.
     * @param html An input html string.
     * @return The translated HTML string.
     */
    translateHTMLString(html) {
        if (html === '')
            return html;
        const doc = parseFromString(html);
        if (HTMLProcessor.hasChildTextNode(doc.body)) {
            const wrapper = doc.createElement('span');
            wrapper.append(...doc.body.childNodes);
            doc.body.append(wrapper);
        }
        const root = doc.body.childNodes[0];
        // The parsed body can end up with no children at all (presumably for
        // inputs such as whitespace-only or comment-only markup; this depends
        // on `parseFromString`). Passing `undefined` on would throw inside the
        // HTML processor, so there is simply nothing to apply in that case.
        if (root)
            this.applyToElement(root);
        return doc.body.innerHTML;
    }
}
573	`//# sourceMappingURL=html_processor.js.map`
\	No newline at end of file