UNPKG

budoux/dist/html_processor.js

Version:

20.5 kBJavaScriptView Raw

"use strict";
/**
 * @license
 * Copyright 2021 Google LLC
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare every export; the real values are assigned next to each class.
exports.HTMLProcessingParser = exports.HTMLProcessor = exports.ParagraphForTesting = exports.NodeOrTextForTesting = void 0;
const dom_js_1 = require("./dom.js");
const parser_js_1 = require("./parser.js");
const win_js_1 = require("./win.js");
// Assertions in this module go through `console.assert`.
const assert = console.assert;
const ZWSP_CODEPOINT = 0x200b; // U+200B ZERO WIDTH SPACE
const ZWSP = String.fromCharCode(ZWSP_CODEPOINT);
// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same here for Node.js environments.
const NodeType = Object.freeze({
  ELEMENT_NODE: 1,
  TEXT_NODE: 3,
});
/**
 * The kinds of handling an element can receive while paragraphs are collected.
 */
const DomAction = Object.freeze({
  Inline: 0, // An inline content, becomes a part of a paragraph.
  Block: 1, // A nested paragraph.
  Skip: 2, // Skip the content. The content before and after are connected.
  Break: 3, // A forced break. The content before and after become paragraphs.
  NoBreak: 4, // The content provides context, but it's not breakable.
  BreakOpportunity: 5, // Force a break opportunity.
});
/**
 * Determines the action from an element name, as defined in
 * {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.
 * See also {@link actionForElement}.
 *
 * The table has a `null` prototype so that a lookup by an arbitrary node name
 * (e.g. `constructor`) can never resolve to an inherited `Object.prototype`
 * member; unknown names always yield `undefined`.
 */
const domActions = Object.freeze({
  __proto__: null,
  // Hidden elements
  // https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements
  AREA: DomAction.Skip,
  BASE: DomAction.Skip,
  BASEFONT: DomAction.Skip,
  DATALIST: DomAction.Skip,
  HEAD: DomAction.Skip,
  LINK: DomAction.Skip,
  META: DomAction.Skip,
  NOEMBED: DomAction.Skip,
  NOFRAMES: DomAction.Skip,
  PARAM: DomAction.Skip,
  RP: DomAction.Skip,
  SCRIPT: DomAction.Skip,
  STYLE: DomAction.Skip,
  TEMPLATE: DomAction.Skip,
  TITLE: DomAction.Skip,
  NOSCRIPT: DomAction.Skip,
  // Flow content
  // https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3
  HR: DomAction.Break,
  // Disable if `white-space: pre`.
  LISTING: DomAction.Skip,
  PLAINTEXT: DomAction.Skip,
  PRE: DomAction.Skip,
  XMP: DomAction.Skip,
  // Phrasing content
  // https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
  BR: DomAction.Break,
  RT: DomAction.Skip,
  WBR: DomAction.BreakOpportunity,
  // Form controls
  // https://html.spec.whatwg.org/multipage/rendering.html#form-controls
  INPUT: DomAction.Skip,
  SELECT: DomAction.Skip,
  BUTTON: DomAction.Skip,
  TEXTAREA: DomAction.Skip,
  // Other elements where the phrase-based line breaking should be disabled.
  // https://github.com/google/budoux/blob/main/budoux/skip_nodes.json
  ABBR: DomAction.Skip,
  CODE: DomAction.Skip,
  IFRAME: DomAction.Skip,
  TIME: DomAction.Skip,
  VAR: DomAction.Skip,
  // Deprecated, but supported in all browsers.
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr
  NOBR: DomAction.NoBreak,
});
/**
 * Element names treated as block-level when no computed `display` value is
 * available; see {@link actionForElement}. The section numbers refer to the
 * HTML Standard's "Rendering" chapter.
 */
const defaultBlockElements = new Set([
  // 15.3.2 The page
  'HTML',
  'BODY',
  // 15.3.3 Flow content
  'ADDRESS',
  'BLOCKQUOTE',
  'CENTER',
  'DIALOG',
  'DIV',
  'FIGURE',
  'FIGCAPTION',
  'FOOTER',
  'FORM',
  'HEADER',
  'LEGEND',
  'LISTING',
  'MAIN',
  'P',
  // 15.3.6 Sections and headings
  'ARTICLE',
  'ASIDE',
  'H1',
  'H2',
  'H3',
  'H4',
  'H5',
  'H6',
  'HGROUP',
  'NAV',
  'SECTION',
  // 15.3.7 Lists
  'DIR',
  'DD',
  'DL',
  'DT',
  'MENU',
  'OL',
  'UL',
  'LI',
  // 15.3.8 Tables
  'TABLE',
  'CAPTION',
  'COL',
  'TR',
  'TD',
  'TH',
  // 15.3.12 The fieldset and legend elements
  'FIELDSET',
  // 15.5.4 The details and summary elements
  'DETAILS',
  'SUMMARY',
  // 15.5.12 The marquee element
  'MARQUEE',
]);
// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
// but we define the same here for Node.js environments.
// NOTE: these are the same numeric values as `NodeType`, under shorter keys.
const NODETYPE = Object.freeze({
  ELEMENT: 1,
  TEXT: 3,
});
/**
 * Determine the action for an element.
 *
 * The element name is looked up in {@link domActions} first. Otherwise the
 * computed style is consulted when `getComputedStyle` is available, and the
 * built-in {@link defaultBlockElements} list is the last resort.
 *
 * @param element An element to determine the action for.
 * @return The {@link domActions} for the element.
 */
function actionForElement(element) {
  const nodeName = element.nodeName;
  // Only honor the table's own entries. A plain property read would also
  // return inherited `Object.prototype` members for names such as
  // `constructor`, which are not valid actions.
  if (Object.prototype.hasOwnProperty.call(domActions, nodeName)) {
    return domActions[nodeName];
  }
  if (typeof win_js_1.win.getComputedStyle === 'function') {
    const style = win_js_1.win.getComputedStyle(element);
    switch (style.whiteSpace) {
      case 'nowrap':
      case 'pre':
        return DomAction.NoBreak;
    }
    const display = style.display;
    if (display) {
      return display === 'inline' ? DomAction.Inline : DomAction.Block;
    }
    // `display` is an empty string if the element is not connected.
  }
  // Use the built-in rules if the `display` property is empty, or if
  // `getComputedStyle` is missing (e.g., jsdom.)
  return defaultBlockElements.has(nodeName)
    ? DomAction.Block
    : DomAction.Inline;
}
/**
 * Represents a node in {@link Paragraph}.
 *
 * It wraps a {@link Text} or a {@link string}.
 *
 * A {@link string} provides the context for the parser, but it can't be split.
 */
class NodeOrText {
  /**
   * @param nodeOrText The wrapped value: a string, or an object exposing
   *     `nodeValue` (a `Text` node).
   */
  constructor(nodeOrText) {
    /** Pieces of {@link text} to split into; empty until boundaries are assigned. */
    this.chunks = [];
    /** Whether a forced break opportunity directly follows this node. */
    this.hasBreakOpportunityAfter = false;
    this.nodeOrText = nodeOrText;
  }
  /** Whether the wrapped value is a plain string. */
  get isString() {
    return typeof this.nodeOrText === 'string';
  }
  /** Only wrapped nodes can be split; strings are context only. */
  get canSplit() {
    return !this.isString;
  }
  /** The string itself, or the wrapped node's `nodeValue`. */
  get text() {
    return this.isString
      ? this.nodeOrText
      : this.nodeOrText.nodeValue;
  }
  /** Length of {@link text}, or 0 when the text is `null`/`undefined`. */
  get length() {
    const text = this.text;
    return text == null ? 0 : text.length;
  }
  /**
   * Split the {@link Text} in the same way as the {@link chunks}.
   * Joining all {@link chunks} must be equal to {@link text}.
   *
   * Nothing happens when there are fewer than two chunks.
   *
   * @param separator A string to insert at each boundary, or a node that is
   *     cloned and inserted between the chunks.
   */
  split(separator) {
    const chunks = this.chunks;
    assert(chunks.length === 0 || chunks.join('') === this.text);
    if (chunks.length <= 1) {
      return;
    }
    assert(this.canSplit);
    const node = this.nodeOrText;
    if (typeof separator === 'string') {
      // If the `separator` is a string, insert it at each boundary.
      node.nodeValue = chunks.join(separator);
      return;
    }
    // Otherwise create a `Text` node for each chunk, with the separator node
    // between them, and replace the `node` with them.
    const document = node.ownerDocument;
    let nodes = [];
    for (const chunk of chunks) {
      // Empty chunks mark a boundary at the very start or end of the text;
      // they contribute a separator but no text node.
      if (chunk) {
        nodes.push(document.createTextNode(chunk));
      }
      // Add a separator between chunks. To simplify the logic, add a separator
      // after each chunk, then remove the last one.
      // To avoid `cloneNode` for the temporary one that is going to be removed,
      // add `null` as a marker, then replace them with `cloneNode` later.
      nodes.push(null);
    }
    nodes.pop();
    nodes = nodes.map(n => (n ? n : separator.cloneNode(true)));
    node.replaceWith(...nodes);
  }
}
/** Subclass of {@link NodeOrText} that is exported under a testing name. */
class NodeOrTextForTesting extends NodeOrText {
}
exports.NodeOrTextForTesting = NodeOrTextForTesting;
/**
 * Represents a "paragraph", broken by block boundaries or forced breaks.
 *
 * A CSS
 * {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}
 * is usually a "paragraph", but it can be broken into multiple paragraphs by
 * forced breaks such as `<br>`.
 */
class Paragraph {
  /**
   * @param element The element this paragraph was started for.
   */
  constructor(element) {
    /** The {@link NodeOrText} items collected for this paragraph, in order. */
    this.nodes = [];
    this.element = element;
  }
  /** @return Whether no nodes have been collected. */
  isEmpty() {
    return this.nodes.length === 0;
  }
  /** The concatenated text of all nodes. */
  get text() {
    return this.nodes.map(node => node.text).join('');
  }
  /** The most recently added node, or `undefined` if there is none. */
  get lastNode() {
    return this.nodes.length ? this.nodes[this.nodes.length - 1] : undefined;
  }
  /** Marks a break opportunity after the last node; no-op when empty. */
  setHasBreakOpportunityAfter() {
    const lastNode = this.lastNode;
    if (lastNode) {
      lastNode.hasBreakOpportunityAfter = true;
    }
  }
  /**
   * @return Indices of forced break opportunities in the source.
   * They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
   */
  getForcedOpportunities() {
    const opportunities = [];
    // Offset of the current node's first character within the paragraph text.
    let len = 0;
    for (const node of this.nodes) {
      if (node.canSplit) {
        const text = node.text;
        if (text) {
          for (let i = 0; i < text.length; ++i) {
            if (text.charCodeAt(i) === ZWSP_CODEPOINT) {
              // The opportunity is the position right after the ZWSP.
              opportunities.push(len + i + 1);
            }
          }
        }
      }
      len += node.length;
      if (node.hasBreakOpportunityAfter) {
        opportunities.push(len);
      }
    }
    return opportunities;
  }
  /**
   * @return Filtered {@param boundaries} by excluding
   * {@link getForcedOpportunities} if it's not empty.
   * Otherwise {@param boundaries}.
   */
  excludeForcedOpportunities(boundaries) {
    const forcedOpportunities = this.getForcedOpportunities();
    if (!forcedOpportunities.length) {
      return boundaries;
    }
    const set = new Set(forcedOpportunities);
    return boundaries.filter(i => !set.has(i));
  }
}
/** Subclass of {@link Paragraph} that is exported under a testing name. */
class ParagraphForTesting extends Paragraph {
}
exports.ParagraphForTesting = ParagraphForTesting;
/**
 * Adds HTML processing support to a BudouX {@link Parser}.
 */
class HTMLProcessor {
  /**
   * @param parser A BudouX {@link Parser} to compute semantic line breaks.
   * @param options Optional settings. `className` and `separator` are copied
   *     onto the instance when they are defined.
   */
  constructor(parser, options) {
    /** See {@link HTMLProcessorOptions.separator}. */
    this.separator = ZWSP;
    this.parser_ = parser;
    if (options !== undefined) {
      if (options.className !== undefined) {
        this.className = options.className;
      }
      if (options.separator !== undefined) {
        this.separator = options.separator;
      }
    }
  }
  /**
   * Checks if the given element has a text node in its children.
   *
   * @param ele An element to be checked.
   * @return Whether the element has a child text node.
   */
  static hasChildTextNode(ele) {
    for (const child of ele.childNodes) {
      if (child.nodeType === NodeType.TEXT_NODE) {
        return true;
      }
    }
    return false;
  }
  /**
   * Applies markups for semantic line breaks to the given HTML element.
   *
   * It breaks descendant nodes into paragraphs,
   * and applies the BudouX to each paragraph.
   * @param element The input element.
   */
  applyToElement(element) {
    // Each paragraph must be consumed as soon as it is yielded: `getBlocks`
    // reuses and clears a paragraph after yielding it at a forced break.
    for (const block of this.getBlocks(element)) {
      assert(!block.isEmpty());
      this.applyToParagraph(block);
    }
  }
  /**
   * Find paragraphs from a given HTML element.
   * @param element The root element to find paragraphs.
   * @param parent The parent {@link Paragraph} if any.
   * @return A list of {@link Paragraph}s.
   */
  *getBlocks(element, parent) {
    assert(element.nodeType === NodeType.ELEMENT_NODE);
    // Skip if it was once applied to this element.
    if (this.className && element.classList.contains(this.className)) {
      return;
    }
    const action = actionForElement(element);
    if (action === DomAction.Skip) {
      return;
    }
    if (action === DomAction.Break) {
      if (parent && !parent.isEmpty()) {
        parent.setHasBreakOpportunityAfter();
        yield parent;
        // Start over: content after the break forms the next paragraph.
        parent.nodes = [];
      }
      assert(!element.firstChild);
      return;
    }
    if (action === DomAction.BreakOpportunity) {
      if (parent) {
        parent.setHasBreakOpportunityAfter();
      }
      return;
    }
    // Determine if this element creates a new inline formatting context, or if
    // this element belongs to the parent inline formatting context.
    assert(action === DomAction.Block ||
      action === DomAction.Inline ||
      action === DomAction.NoBreak);
    const isNewBlock = !parent || action === DomAction.Block;
    const block = isNewBlock ? new Paragraph(element) : parent;
    // Collect all text nodes in this inline formatting context, while searching
    // descendant elements recursively.
    for (const child of element.childNodes) {
      switch (child.nodeType) {
        case NodeType.ELEMENT_NODE:
          for (const childBlock of this.getBlocks(child, block)) {
            yield childBlock;
          }
          break;
        case NodeType.TEXT_NODE:
          if (action === DomAction.NoBreak) {
            // Record the text as a plain string so that it contributes
            // context but is never split.
            const text = child.nodeValue;
            if (text) {
              block.nodes.push(new NodeOrText(text));
            }
            break;
          }
          block.nodes.push(new NodeOrText(child));
          break;
      }
    }
    // Apply if this is an inline formatting context.
    if (isNewBlock && !block.isEmpty()) {
      yield block;
    }
  }
  /**
   * Apply the BudouX to the given {@link Paragraph}.
   * @param paragraph The {@link Paragraph} to apply.
   */
  applyToParagraph(paragraph) {
    assert(paragraph.nodes.length > 0);
    if (!paragraph.nodes.some(node => node.canSplit)) {
      return;
    }
    const text = paragraph.text;
    // No changes if whitespace-only.
    if (/^\s*$/.test(text)) {
      return;
    }
    // Compute the phrase boundaries.
    const boundaries = this.parser_.parseBoundaries(text);
    // No changes if single phrase.
    if (boundaries.length <= 0) {
      return;
    }
    // The boundaries should be between 1 and `text.length - 1` in the
    // ascending order.
    assert(boundaries[0] > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    assert(boundaries[boundaries.length - 1] < text.length);
    // `excludeForcedOpportunities` may hand back the very array the parser
    // returned; copy it so the sentinel below never leaks into that array.
    const adjustedBoundaries = paragraph
      .excludeForcedOpportunities(boundaries)
      .slice();
    // Add a sentinel to help iterating.
    adjustedBoundaries.push(text.length + 1);
    this.splitNodes(paragraph.nodes, adjustedBoundaries);
    this.applyBlockStyle(paragraph.element);
  }
  /**
   * Split {@link NodeOrText} at the specified boundaries.
   * @param nodes A list of {@link NodeOrText}.
   * @param boundaries A list of indices of the text to split at. It must be
   *     ascending and end with a sentinel greater than the total text length.
   */
  splitNodes(nodes, boundaries) {
    assert(boundaries.length > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
    // The last boundary must be a sentinel.
    assert(boundaries[boundaries.length - 1] > textLen);
    // Distribute `boundaries` to `node.chunks`.
    let boundaryIndex = 0;
    let boundary = boundaries[0];
    assert(boundary > 0);
    let nodeStart = 0; // the start index of the `nodeText` in the whole text.
    let lastNode = null;
    for (const node of nodes) {
      assert(boundary >= nodeStart);
      assert(node.chunks.length === 0);
      const nodeText = node.text;
      if (!nodeText) {
        continue;
      }
      const nodeLength = nodeText.length;
      const nodeEnd = nodeStart + nodeLength;
      assert(!lastNode || lastNode.canSplit);
      if (!node.canSplit) {
        // If there's a boundary between nodes and `lastNode.canSplit`, add a
        // boundary to the end of the `lastNode`.
        if (lastNode && boundary === nodeStart) {
          if (lastNode.chunks.length === 0) {
            const lastText = lastNode.text;
            lastNode.chunks.push(lastText == null ? '' : lastText);
          }
          lastNode.chunks.push('');
        }
        // Boundaries inside a non-splittable node are dropped.
        while (boundary < nodeEnd) {
          boundary = boundaries[++boundaryIndex];
        }
        lastNode = null;
        nodeStart = nodeEnd;
        continue;
      }
      // Check if the next boundary is in this `node`.
      lastNode = node;
      if (boundary >= nodeEnd) {
        nodeStart = nodeEnd;
        continue;
      }
      // Compute the boundary indices in the `node`.
      const chunks = node.chunks;
      let chunkStartInNode = 0;
      while (boundary < nodeEnd) {
        const boundaryInNode = boundary - nodeStart;
        assert(boundaryInNode >= chunkStartInNode);
        chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
        chunkStartInNode = boundaryInNode;
        boundary = boundaries[++boundaryIndex];
      }
      // Add the rest of the `nodeText`.
      assert(chunkStartInNode < nodeLength);
      chunks.push(nodeText.slice(chunkStartInNode));
      nodeStart = nodeEnd;
    }
    // Check if all nodes and boundaries are consumed.
    assert(nodeStart === textLen);
    assert(boundaryIndex < boundaries.length);
    assert(boundaries[boundaryIndex] >= textLen);
    // `node.chunks` are finalized. Split them.
    for (const node of nodes) {
      node.split(this.separator);
    }
  }
  /**
   * Applies the block style to the given element.
   *
   * When a `className` is configured it is added to the element; otherwise
   * the wrap style from `dom.js` is applied.
   * @param element The element to apply the block style.
   */
  applyBlockStyle(element) {
    if (this.className) {
      element.classList.add(this.className);
      return;
    }
    (0, dom_js_1.applyWrapStyle)(element);
  }
}
exports.HTMLProcessor = HTMLProcessor;
/**
 * BudouX {@link Parser} with HTML processing support.
 */
class HTMLProcessingParser extends parser_js_1.Parser {
  /**
   * @param model The model passed through to the base {@link Parser}.
   * @param htmlProcessorOptions Options forwarded to {@link HTMLProcessor}.
   *     Defaults to using a zero width space as the separator.
   */
  constructor(model, htmlProcessorOptions = {
    separator: ZWSP,
  }) {
    super(model);
    this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
  }
  /**
   * @deprecated Use `applyToElement` instead. `applyElement` will be removed
   * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
   *
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyElement(parentElement) {
    console.warn('`applyElement` is deprecated. Please use `applyToElement` instead. ' +
      '`applyElement` will be removed in v0.7.0.');
    this.applyToElement(parentElement);
  }
  /**
   * Applies markups for semantic line breaks to the given HTML element.
   * @param parentElement The input element.
   */
  applyToElement(parentElement) {
    this.htmlProcessor.applyToElement(parentElement);
  }
  /**
   * Translates the given HTML string to another HTML string with markups
   * for semantic line breaks.
   *
   * Every element that is a direct child of the parsed body is processed.
   * Input without any top-level element (for example, only a comment)
   * is returned as serialized by the parser, without changes.
   *
   * @param html An input html string.
   * @return The translated HTML string.
   */
  translateHTMLString(html) {
    if (html === '') {
      return html;
    }
    const doc = (0, dom_js_1.parseFromString)(html);
    if (HTMLProcessor.hasChildTextNode(doc.body)) {
      // Top-level text needs an element of its own so that the paragraph it
      // forms is rooted below `body`, inside the returned `innerHTML`.
      const wrapper = doc.createElement('span');
      wrapper.append(...doc.body.childNodes);
      doc.body.append(wrapper);
    }
    // Process every top-level element, not only the first child: the first
    // child may be a non-element node (e.g. a comment), the body may be empty,
    // and sibling elements after the first one need processing as well.
    // Iterate over a snapshot so the traversal is independent of DOM edits.
    for (const child of Array.from(doc.body.childNodes)) {
      if (child.nodeType === NodeType.ELEMENT_NODE) {
        this.applyToElement(child);
      }
    }
    return doc.body.innerHTML;
  }
}
exports.HTMLProcessingParser = HTMLProcessingParser;
//# sourceMappingURL=html_processor.js.map
\No newline at end of file

1	`"use strict";`
2	`/**`
3	`* @license`
4	`* Copyright 2021 Google LLC`
5	`* Licensed under the Apache License, Version 2.0 (the "License");`
6	`* you may not use this file except in compliance with the License.`
7	`* You may obtain a copy of the License at`
8	`*`
9	`* https://www.apache.org/licenses/LICENSE-2.0`
10	`*`
11	`* Unless required by applicable law or agreed to in writing, software`
12	`* distributed under the License is distributed on an "AS IS" BASIS,`
13	`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
14	`* See the License for the specific language governing permissions and`
15	`* limitations under the License.`
16	`*/`
17	`Object.defineProperty(exports, "__esModule", { value: true });`
18	`exports.HTMLProcessingParser = exports.HTMLProcessor = exports.ParagraphForTesting = exports.NodeOrTextForTesting = void 0;`
19	`const dom_js_1 = require("./dom.js");`
20	`const parser_js_1 = require("./parser.js");`
21	`const win_js_1 = require("./win.js");`
22	`const assert = console.assert;`
23	`const ZWSP_CODEPOINT = 0x200b; // U+200B ZERO WIDTH SPACE`
24	`const ZWSP = String.fromCharCode(ZWSP_CODEPOINT);`
25	// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
26	`// but we define the same here for Node.js environments.`
27	`const NodeType = {`
28	`ELEMENT_NODE: 1,`
29	`TEXT_NODE: 3,`
30	`};`
31	`const DomAction = {`
32	`Inline: 0, // An inline content, becomes a part of a paragraph.`
33	`Block: 1, // A nested paragraph.`
34	`Skip: 2, // Skip the content. The content before and after are connected.`
35	`Break: 3, // A forced break. The content before and after become paragraphs.`
36	`NoBreak: 4, // The content provides context, but it's not breakable.`
37	`BreakOpportunity: 5, // Force a break opportunity.`
38	`};`
39	`/**`
40	`* Determines the action from an element name, as defined in`
41	`* {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.`
42	`* See also {@link actionForElement}.`
43	`*/`
44	`const domActions = {`
45	`// Hidden elements`
46	`// https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements`
47	`AREA: DomAction.Skip,`
48	`BASE: DomAction.Skip,`
49	`BASEFONT: DomAction.Skip,`
50	`DATALIST: DomAction.Skip,`
51	`HEAD: DomAction.Skip,`
52	`LINK: DomAction.Skip,`
53	`META: DomAction.Skip,`
54	`NOEMBED: DomAction.Skip,`
55	`NOFRAMES: DomAction.Skip,`
56	`PARAM: DomAction.Skip,`
57	`RP: DomAction.Skip,`
58	`SCRIPT: DomAction.Skip,`
59	`STYLE: DomAction.Skip,`
60	`TEMPLATE: DomAction.Skip,`
61	`TITLE: DomAction.Skip,`
62	`NOSCRIPT: DomAction.Skip,`
63	`// Flow content`
64	`// https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3`
65	`HR: DomAction.Break,`
66	// Disable if `white-space: pre`.
67	`LISTING: DomAction.Skip,`
68	`PLAINTEXT: DomAction.Skip,`
69	`PRE: DomAction.Skip,`
70	`XMP: DomAction.Skip,`
71	`// Phrasing content`
72	`// https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3`
73	`BR: DomAction.Break,`
74	`RT: DomAction.Skip,`
75	`WBR: DomAction.BreakOpportunity,`
76	`// Form controls`
77	`// https://html.spec.whatwg.org/multipage/rendering.html#form-controls`
78	`INPUT: DomAction.Skip,`
79	`SELECT: DomAction.Skip,`
80	`BUTTON: DomAction.Skip,`
81	`TEXTAREA: DomAction.Skip,`
82	`// Other elements where the phrase-based line breaking should be disabled.`
83	`// https://github.com/google/budoux/blob/main/budoux/skip_nodes.json`
84	`ABBR: DomAction.Skip,`
85	`CODE: DomAction.Skip,`
86	`IFRAME: DomAction.Skip,`
87	`TIME: DomAction.Skip,`
88	`VAR: DomAction.Skip,`
89	`// Deprecated, but supported in all browsers.`
90	`// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr`
91	`NOBR: DomAction.NoBreak,`
92	`};`
93	`const defaultBlockElements = new Set([`
94	`// 15.3.2 The page`
95	`'HTML',`
96	`'BODY',`
97	`// 15.3.3 Flow content`
98	`'ADDRESS',`
99	`'BLOCKQUOTE',`
100	`'CENTER',`
101	`'DIALOG',`
102	`'DIV',`
103	`'FIGURE',`
104	`'FIGCAPTION',`
105	`'FOOTER',`
106	`'FORM',`
107	`'HEADER',`
108	`'LEGEND',`
109	`'LISTING',`
110	`'MAIN',`
111	`'P',`
112	`// 15.3.6 Sections and headings`
113	`'ARTICLE',`
114	`'ASIDE',`
115	`'H1',`
116	`'H2',`
117	`'H3',`
118	`'H4',`
119	`'H5',`
120	`'H6',`
121	`'HGROUP',`
122	`'NAV',`
123	`'SECTION',`
124	`// 15.3.7 Lists`
125	`'DIR',`
126	`'DD',`
127	`'DL',`
128	`'DT',`
129	`'MENU',`
130	`'OL',`
131	`'UL',`
132	`'LI',`
133	`// 15.3.8 Tables`
134	`'TABLE',`
135	`'CAPTION',`
136	`'COL',`
137	`'TR',`
138	`'TD',`
139	`'TH',`
140	`// 15.3.12 The fieldset and legend elements`
141	`'FIELDSET',`
142	`// 15.5.4 The details and summary elements`
143	`'DETAILS',`
144	`'SUMMARY',`
145	`// 15.5.12 The marquee element`
146	`'MARQUEE',`
147	`]);`
148	// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context,
149	`// but we define the same here for Node.js environments.`
150	`const NODETYPE = {`
151	`ELEMENT: 1,`
152	`TEXT: 3,`
153	`};`
154	`/**`
155	`* Determine the action for an element.`
156	`* @param element An element to determine the action for.`
157	`* @return The {@link domActions} for the element.`
158	`*/`
159	`function actionForElement(element) {`
160	`const nodeName = element.nodeName;`
161	`const action = domActions[nodeName];`
162	`if (action !== undefined)`
163	`return action;`
164	`if (typeof win_js_1.win.getComputedStyle === 'function') {`
165	`const style = win_js_1.win.getComputedStyle(element);`
166	`switch (style.whiteSpace) {`
167	`case 'nowrap':`
168	`case 'pre':`
169	`return DomAction.NoBreak;`
170	`}`
171	`const display = style.display;`
172	`if (display)`
173	`return display === 'inline' ? DomAction.Inline : DomAction.Block;`
174	// `display` is an empty string if the element is not connected.
175	`}`
176	// Use the built-in rules if the `display` property is empty, or if
177	// `getComputedStyle` is missing (e.g., jsdom.)
178	`return defaultBlockElements.has(nodeName)`
179	`? DomAction.Block`
180	`: DomAction.Inline;`
181	`}`
182	`/**`
183	`* Represents a node in {@link Paragraph}.`
184	`*`
185	`* It wraps a {@link Text} or a {@link string}.`
186	`*`
187	`* A {@link string} provides the context for the parser, but it can't be split.`
188	`*/`
189	`class NodeOrText {`
190	`constructor(nodeOrText) {`
191	`this.chunks = [];`
192	`this.hasBreakOpportunityAfter = false;`
193	`this.nodeOrText = nodeOrText;`
194	`}`
195	`get isString() {`
196	`return typeof this.nodeOrText === 'string';`
197	`}`
198	`get canSplit() {`
199	`return !this.isString;`
200	`}`
201	`get text() {`
202	`return this.isString`
203	`? this.nodeOrText`
204	`: this.nodeOrText.nodeValue;`
205	`}`
206	`get length() {`
207	`var _a, _b;`
208	`return (_b = (_a = this.text) === null \|\| _a === void 0 ? void 0 : _a.length) !== null && _b !== void 0 ? _b : 0;`
209	`}`
210	`/**`
211	`* Split the {@link Text} in the same way as the {@link chunks}.`
212	`* Joining all {@link chunks} must be equal to {@link text}.`
213	`*/`
214	`split(separator) {`
215	`const chunks = this.chunks;`
216	`assert(chunks.length === 0 \|\| chunks.join('') === this.text);`
217	`if (chunks.length <= 1)`
218	`return;`
219	`assert(this.canSplit);`
220	`const node = this.nodeOrText;`
221	`if (typeof separator === 'string') {`
222	// If the `separator` is a string, insert it at each boundary.
223	`node.nodeValue = chunks.join(separator);`
224	`return;`
225	`}`
226	// Otherwise create a `Text` node for each chunk, with the separator node
227	// between them, and replace the `node` with them.
228	`const document = node.ownerDocument;`
229	`let nodes = [];`
230	`for (const chunk of chunks) {`
231	`if (chunk)`
232	`nodes.push(document.createTextNode(chunk));`
233	`// Add a separator between chunks. To simplify the logic, add a separator`
234	`// after each chunk, then remove the last one.`
235	// To avoid `cloneNode` for the temporary one that is going to be removed,
236	// add `null` as a marker, then replace them with `cloneNode` later.
237	`nodes.push(null);`
238	`}`
239	`nodes.pop();`
240	`nodes = nodes.map(n => (n ? n : separator.cloneNode(true)));`
241	`node.replaceWith(...nodes);`
242	`}`
243	`}`
244	`class NodeOrTextForTesting extends NodeOrText {`
245	`}`
246	`exports.NodeOrTextForTesting = NodeOrTextForTesting;`
247	`/**`
248	`* Represents a "paragraph", broken by block boundaries or forced breaks.`
249	`*`
250	`* A CSS`
251	`* {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}`
252	`* is usually a "paragraph", but it can be broken into multiple paragraphs by`
253	* forced breaks such as `<br>`.
254	`*/`
255	`class Paragraph {`
256	`constructor(element) {`
257	`this.nodes = [];`
258	`this.element = element;`
259	`}`
260	`isEmpty() {`
261	`return this.nodes.length === 0;`
262	`}`
263	`get text() {`
264	`return this.nodes.map(node => node.text).join('');`
265	`}`
266	`get lastNode() {`
267	`return this.nodes.length ? this.nodes[this.nodes.length - 1] : undefined;`
268	`}`
269	`setHasBreakOpportunityAfter() {`
270	`const lastNode = this.lastNode;`
271	`if (lastNode)`
272	`lastNode.hasBreakOpportunityAfter = true;`
273	`}`
274	`/**`
275	`* @return Indices of forced break opportunities in the source.`
276	* They can be created by `<wbr>` tag or `&ZeroWidthSpace;`.
277	`*/`
278	`getForcedOpportunities() {`
279	`const opportunities = [];`
280	`let len = 0;`
281	`for (const node of this.nodes) {`
282	`if (node.canSplit) {`
283	`const text = node.text;`
284	`if (text) {`
285	`for (let i = 0; i < text.length; ++i) {`
286	`if (text.charCodeAt(i) === ZWSP_CODEPOINT) {`
287	`opportunities.push(len + i + 1);`
288	`}`
289	`}`
290	`}`
291	`}`
292	`len += node.length;`
293	`if (node.hasBreakOpportunityAfter) {`
294	`opportunities.push(len);`
295	`}`
296	`}`
297	`return opportunities;`
298	`}`
299	`/**`
300	`* @return Filtered {@param boundaries} by excluding`
301	`* {@link getForcedOpportunities} if it's not empty.`
302	`* Otherwise {@param boundaries}.`
303	`*/`
304	`excludeForcedOpportunities(boundaries) {`
305	`const forcedOpportunities = this.getForcedOpportunities();`
306	`if (!forcedOpportunities.length)`
307	`return boundaries;`
308	`const set = new Set(forcedOpportunities);`
309	`return boundaries.filter(i => !set.has(i));`
310	`}`
311	`}`
312	`class ParagraphForTesting extends Paragraph {`
313	`}`
314	`exports.ParagraphForTesting = ParagraphForTesting;`
315	`/**`
316	`* Adds HTML processing support to a BudouX {@link Parser}.`
317	`*/`
class HTMLProcessor {
    /**
     * @param parser A BudouX {@link Parser} to compute semantic line breaks.
     * @param options Optional settings. `className` and `separator` are each
     *     copied onto the instance only when they are not `undefined`.
     */
    constructor(parser, options) {
        /** See {@link HTMLProcessorOptions.separator}. Defaults to U+200B. */
        this.separator = ZWSP;
        this.parser_ = parser;
        if (options !== undefined) {
            if (options.className !== undefined)
                this.className = options.className;
            if (options.separator !== undefined)
                this.separator = options.separator;
        }
    }
    /**
     * Checks if the given element has a text node in its children.
     *
     * Only direct children are inspected; descendants further down are not.
     *
     * @param ele An element to be checked.
     * @return Whether the element has a child text node.
     */
    static hasChildTextNode(ele) {
        for (const child of ele.childNodes) {
            // Use the file-level `NodeType` table, the same one `getBlocks`
            // relies on, so this also works outside a browser context.
            if (child.nodeType === NodeType.TEXT_NODE)
                return true;
        }
        return false;
    }
    /**
     * Applies markups for semantic line breaks to the given HTML element.
     *
     * It breaks descendant nodes into paragraphs,
     * and applies the BudouX to each paragraph.
     * @param element The input element.
     */
    applyToElement(element) {
        // Each paragraph is processed as soon as it is yielded, i.e. before
        // the generator resumes and possibly resets the paragraph's nodes.
        for (const block of this.getBlocks(element)) {
            assert(!block.isEmpty());
            this.applyToParagraph(block);
        }
    }
    /**
     * Find paragraphs from a given HTML element.
     *
     * This is a generator: paragraphs are yielded one at a time while the
     * descendants of `element` are visited recursively.
     * @param element The root element to find paragraphs.
     * @param parent The parent {@link Paragraph} if any.
     * @return A list of {@link Paragraph}s.
     */
    *getBlocks(element, parent) {
        assert(element.nodeType === NodeType.ELEMENT_NODE);
        // Skip if it was once applied to this element.
        if (this.className && element.classList.contains(this.className))
            return;
        const action = actionForElement(element);
        if (action === DomAction.Skip)
            return;
        if (action === DomAction.Break) {
            if (parent && !parent.isEmpty()) {
                parent.setHasBreakOpportunityAfter();
                // Hand the content collected so far to the consumer, then
                // clear it so that the content after the forced break is
                // collected separately in the same `parent` object.
                yield parent;
                parent.nodes = [];
            }
            // A forced-break element is expected to have no children.
            assert(!element.firstChild);
            return;
        }
        if (action === DomAction.BreakOpportunity) {
            if (parent)
                parent.setHasBreakOpportunityAfter();
            return;
        }
        // Determine if this element creates a new inline formatting context, or if
        // this element belongs to the parent inline formatting context.
        assert(action === DomAction.Block ||
            action === DomAction.Inline ||
            action === DomAction.NoBreak);
        const isNewBlock = !parent || action === DomAction.Block;
        const block = isNewBlock ? new Paragraph(element) : parent;
        // Collect all text nodes in this inline formatting context, while searching
        // descendant elements recursively.
        for (const child of element.childNodes) {
            switch (child.nodeType) {
                case NodeType.ELEMENT_NODE:
                    for (const childBlock of this.getBlocks(child, block))
                        yield childBlock;
                    break;
                case NodeType.TEXT_NODE:
                    if (action === DomAction.NoBreak) {
                        // Record only the string value, not the node itself;
                        // empty values are dropped.
                        const text = child.nodeValue;
                        if (text) {
                            block.nodes.push(new NodeOrText(text));
                        }
                        break;
                    }
                    block.nodes.push(new NodeOrText(child));
                    break;
            }
        }
        // Apply if this is an inline formatting context.
        if (isNewBlock && !block.isEmpty())
            yield block;
    }
    /**
     * Apply the BudouX to the given {@link Paragraph}.
     *
     * Returns without changes when no node can be split, when the text is
     * whitespace-only, or when the parser reports no boundaries.
     * @param paragraph The {@link Paragraph} to apply.
     */
    applyToParagraph(paragraph) {
        assert(paragraph.nodes.length > 0);
        if (!paragraph.nodes.some(node => node.canSplit))
            return;
        const text = paragraph.text;
        // No changes if whitespace-only.
        if (/^\s*$/.test(text))
            return;
        // Compute the phrase boundaries.
        const boundaries = this.parser_.parseBoundaries(text);
        // No changes if single phrase.
        if (boundaries.length <= 0)
            return;
        // The boundaries should be between 1 and `text.length - 1` in the
        // ascending order.
        assert(boundaries[0] > 0);
        assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
        assert(boundaries[boundaries.length - 1] < text.length);
        // NOTE(review): `excludeForcedOpportunities` is defined on `Paragraph`
        // outside this class; its result is used as the list to split at.
        const adjustedBoundaries = paragraph.excludeForcedOpportunities(boundaries);
        // Add a sentinel to help iterating.
        adjustedBoundaries.push(text.length + 1);
        this.splitNodes(paragraph.nodes, adjustedBoundaries);
        this.applyBlockStyle(paragraph.element);
    }
    /**
     * Split {@link NodeOrText} at the specified boundaries.
     *
     * The boundaries are first distributed into each node's `chunks`, then
     * every node is asked to split itself with `this.separator`.
     * @param nodes A list of {@link NodeOrText}.
     * @param boundaries A list of indices of the text to split at, in strictly
     *     ascending order, whose last entry is a sentinel greater than the
     *     total text length.
     */
    splitNodes(nodes, boundaries) {
        var _a;
        assert(boundaries.length > 0);
        assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
        const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
        // The last boundary must be a sentinel.
        assert(boundaries[boundaries.length - 1] > textLen);
        // Distribute `boundaries` to `node.chunks`.
        let boundary_index = 0;
        let boundary = boundaries[0];
        assert(boundary > 0);
        let nodeStart = 0; // the start index of the `nodeText` in the whole text.
        let lastNode = null;
        for (const node of nodes) {
            assert(boundary >= nodeStart);
            assert(node.chunks.length === 0);
            const nodeText = node.text;
            if (!nodeText)
                continue;
            const nodeLength = nodeText.length;
            const nodeEnd = nodeStart + nodeLength;
            assert(!lastNode || lastNode.canSplit);
            if (!node.canSplit) {
                // If there's a boundary between nodes and `lastNode.canSplit`, add a
                // boundary to the end of the `lastNode`.
                if (lastNode && boundary === nodeStart) {
                    if (lastNode.chunks.length === 0)
                        lastNode.chunks.push((_a = lastNode.text) !== null && _a !== void 0 ? _a : '');
                    lastNode.chunks.push('');
                }
                // Boundaries that fall inside an unsplittable node are skipped.
                while (boundary < nodeEnd) {
                    boundary = boundaries[++boundary_index];
                }
                lastNode = null;
                nodeStart = nodeEnd;
                continue;
            }
            // Check if the next boundary is in this `node`.
            lastNode = node;
            if (boundary >= nodeEnd) {
                nodeStart = nodeEnd;
                continue;
            }
            // Compute the boundary indices in the `node`.
            const chunks = node.chunks;
            let chunkStartInNode = 0;
            while (boundary < nodeEnd) {
                const boundaryInNode = boundary - nodeStart;
                assert(boundaryInNode >= chunkStartInNode);
                chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
                chunkStartInNode = boundaryInNode;
                boundary = boundaries[++boundary_index];
            }
            // Add the rest of the `nodeText`.
            assert(chunkStartInNode < nodeLength);
            chunks.push(nodeText.slice(chunkStartInNode));
            nodeStart = nodeEnd;
        }
        // Check if all nodes and boundaries are consumed.
        assert(nodeStart === textLen);
        assert(boundary_index < boundaries.length);
        assert(boundaries[boundary_index] >= textLen);
        // `node.chunks` are finalized. Split them.
        for (const node of nodes) {
            node.split(this.separator);
        }
    }
    /**
     * Applies the block style to the given element.
     *
     * When `this.className` is set, that class is added to the element;
     * otherwise the wrap style from `dom.js` is applied.
     * @param element The element to apply the block style.
     */
    applyBlockStyle(element) {
        if (this.className) {
            element.classList.add(this.className);
            return;
        }
        (0, dom_js_1.applyWrapStyle)(element);
    }
}
530	`exports.HTMLProcessor = HTMLProcessor;`
/**
 * A BudouX {@link Parser} that additionally knows how to process HTML,
 * by delegating to an internal {@link HTMLProcessor}.
 */
class HTMLProcessingParser extends parser_js_1.Parser {
    /**
     * @param model The model handed to the base {@link Parser}.
     * @param htmlProcessorOptions Options forwarded to the internal
     *     {@link HTMLProcessor}. Defaults to using ZWSP as the separator.
     */
    constructor(model, htmlProcessorOptions = {
        separator: ZWSP,
    }) {
        super(model);
        // The processor calls back into this parser to compute boundaries.
        this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
    }
    /**
     * @deprecated Use `applyToElement` instead. `applyElement` will be removed
     * in v0.7.0 to align the function name with `HTMLProcessor`'s API.
     *
     * Logs a deprecation warning, then behaves like `applyToElement`.
     * @param parentElement The input element.
     */
    applyElement(parentElement) {
        const deprecationNotice = '`applyElement` is deprecated. Please use `applyToElement` instead. ' +
            '`applyElement` will be removed in v0.7.0.';
        console.warn(deprecationNotice);
        this.applyToElement(parentElement);
    }
    /**
     * Marks up the given HTML element for semantic line breaks.
     * @param parentElement The input element.
     */
    applyToElement(parentElement) {
        const processor = this.htmlProcessor;
        processor.applyToElement(parentElement);
    }
    /**
     * Converts an HTML string into an HTML string carrying markups
     * for semantic line breaks.
     * @param html An input html string.
     * @return The translated HTML string; an empty input is returned as is.
     */
    translateHTMLString(html) {
        if (html === '') {
            return html;
        }
        const parsed = (0, dom_js_1.parseFromString)(html);
        const bodyHasBareText = HTMLProcessor.hasChildTextNode(parsed.body);
        if (bodyHasBareText) {
            // Move everything under a single <span> so that the first child
            // of the body covers the whole input.
            const span = parsed.createElement('span');
            span.append(...parsed.body.childNodes);
            parsed.body.append(span);
        }
        // Only the first child of the body is processed.
        const target = parsed.body.childNodes[0];
        this.applyToElement(target);
        return parsed.body.innerHTML;
    }
}
579	`exports.HTMLProcessingParser = HTMLProcessingParser;`
580	`//# sourceMappingURL=html_processor.js.map`
\	No newline at end of file