1 |
|
2 |
|
3 |
|
4 |
|
5 |
|
6 |
|
7 |
|
8 |
|
9 |
|
10 |
|
11 |
|
12 |
|
13 |
|
14 |
|
15 |
|
16 |
|
17 | import {applyWrapStyle, parseFromString} from './dom.js';
|
18 | import {Parser} from './parser.js';
|
19 | import {win} from './win.js';
|
20 |
|
// Module-wide assertion helper. `console.assert` reports a failed condition
// on the console rather than throwing, so checks below never abort processing.
const assert = console.assert;
|
22 |
|
/** Code point of U+200B ZERO WIDTH SPACE. */
const ZWSP_CODEPOINT = 0x200b;

/** U+200B ZERO WIDTH SPACE as a one-character string. */
const ZWSP = String.fromCodePoint(ZWSP_CODEPOINT);
|
25 |
|
26 |
|
27 |
|
/**
 * Numeric `nodeType` values this module distinguishes while walking the DOM.
 * They are spelled out as literals here instead of being read from a DOM
 * global.
 */
const NodeType = {
  TEXT_NODE: 3,
  ELEMENT_NODE: 1,
};
|
32 |
|
/**
 * How an element takes part in paragraph collection (see
 * `HTMLProcessor.getBlocks`).
 */
const DomAction = {
  /** Content is collected into the paragraph of the enclosing element. */
  Inline: 0,
  /** The element starts a paragraph of its own. */
  Block: 1,
  /** The element and everything inside it are ignored. */
  Skip: 2,
  /** Forced break: the paragraph collected so far is emitted and restarted. */
  Break: 3,
  /** Text is collected as plain strings, which are never split. */
  NoBreak: 4,
  /** Marks a break opportunity after the last collected node. */
  BreakOpportunity: 5,
} as const;

/** Union of the numeric values of the `DomAction` object. */
type DomAction = (typeof DomAction)[keyof typeof DomAction];
|
42 |
|
43 |
|
44 |
|
45 |
|
46 |
|
47 |
|
/**
 * Fixed action per element `nodeName`. Names listed here are decided without
 * consulting computed style; any other name is resolved by
 * `actionForElement`.
 */
const domActions: {[name: string]: DomAction} = {
  // Metadata, fallback and other elements whose content is not collected.
  AREA: DomAction.Skip,
  BASE: DomAction.Skip,
  BASEFONT: DomAction.Skip,
  DATALIST: DomAction.Skip,
  HEAD: DomAction.Skip,
  LINK: DomAction.Skip,
  META: DomAction.Skip,
  NOEMBED: DomAction.Skip,
  NOFRAMES: DomAction.Skip,
  PARAM: DomAction.Skip,
  RP: DomAction.Skip,
  SCRIPT: DomAction.Skip,
  STYLE: DomAction.Skip,
  TEMPLATE: DomAction.Skip,
  TITLE: DomAction.Skip,
  NOSCRIPT: DomAction.Skip,

  // A horizontal rule separates the content before and after it.
  HR: DomAction.Break,

  // Preformatted-text elements are left untouched.
  LISTING: DomAction.Skip,
  PLAINTEXT: DomAction.Skip,
  PRE: DomAction.Skip,
  XMP: DomAction.Skip,

  // Elements with line-breaking behavior of their own.
  BR: DomAction.Break,
  RT: DomAction.Skip,
  WBR: DomAction.BreakOpportunity,

  // Form controls.
  INPUT: DomAction.Skip,
  SELECT: DomAction.Skip,
  BUTTON: DomAction.Skip,
  TEXTAREA: DomAction.Skip,

  // Other elements whose content is not processed.
  ABBR: DomAction.Skip,
  CODE: DomAction.Skip,
  IFRAME: DomAction.Skip,
  TIME: DomAction.Skip,
  VAR: DomAction.Skip,

  // Text inside <nobr> is collected but never split.
  NOBR: DomAction.NoBreak,
};
|
102 |
|
/**
 * Element names treated as block-level by `actionForElement` when the
 * computed `display` value cannot be used.
 */
const defaultBlockElements = new Set([
  // Document roots.
  'HTML', 'BODY',
  // Flow containers.
  'ADDRESS', 'BLOCKQUOTE', 'CENTER', 'DIALOG', 'DIV', 'FIGURE', 'FIGCAPTION',
  'FOOTER', 'FORM', 'HEADER', 'LEGEND', 'LISTING', 'MAIN', 'P',
  // Sections and headings.
  'ARTICLE', 'ASIDE', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HGROUP', 'NAV',
  'SECTION',
  // Lists.
  'DIR', 'DD', 'DL', 'DT', 'MENU', 'OL', 'UL', 'LI',
  // Tables.
  'TABLE', 'CAPTION', 'COL', 'TR', 'TD', 'TH',
  // Form grouping.
  'FIELDSET',
  // Disclosure widgets.
  'DETAILS', 'SUMMARY',
  // Legacy.
  'MARQUEE',
]);
|
158 |
|
159 |
|
160 |
|
/**
 * Numeric `nodeType` values under short names; read by
 * `HTMLProcessor.hasChildTextNode`.
 */
const NODETYPE = {
  TEXT: 3,
  ELEMENT: 1,
};
|
165 |
|
166 |
|
167 |
|
168 |
|
169 |
|
170 |
|
/**
 * Decides how an element takes part in paragraph collection.
 *
 * Resolution order:
 * 1. a fixed entry in `domActions` for the element's `nodeName`;
 * 2. the computed style, when `win.getComputedStyle` is available:
 *    `white-space` of `nowrap`/`pre` gives `NoBreak`, otherwise a non-empty
 *    `display` gives `Inline` for `'inline'` and `Block` for anything else;
 * 3. the built-in `defaultBlockElements` list.
 *
 * @param element The element to classify.
 * @returns The action to take for `element`.
 */
function actionForElement(element: Element): DomAction {
  const nodeName = element.nodeName;

  // Only honor names that are defined on the table itself. A bare
  // `domActions[nodeName]` would also find members inherited from
  // `Object.prototype` (e.g. `constructor`, a possible `nodeName` for
  // case-preserving elements such as SVG/XML ones) and return a function as
  // if it were a `DomAction`.
  const action = Object.prototype.hasOwnProperty.call(domActions, nodeName)
    ? domActions[nodeName]
    : undefined;
  if (action !== undefined) return action;

  if (typeof win.getComputedStyle === 'function') {
    const style = win.getComputedStyle(element);
    switch (style.whiteSpace) {
      case 'nowrap':
      case 'pre':
        return DomAction.NoBreak;
    }

    // An empty `display` carries no information; fall through to the
    // built-in list in that case.
    const display = style.display;
    if (display)
      return display === 'inline' ? DomAction.Inline : DomAction.Block;
  }

  // No usable computed style: use the built-in block element list.
  return defaultBlockElements.has(nodeName)
    ? DomAction.Block
    : DomAction.Inline;
}
|
196 |
|
197 |
|
198 |
|
199 |
|
200 |
|
201 |
|
202 |
|
203 |
|
/**
 * One piece of a paragraph's text: either a DOM `Text` node, which can be
 * split in place, or a plain string, which only contributes its text.
 */
class NodeOrText {
  /** The wrapped `Text` node or plain string. */
  nodeOrText: Text | string;
  /** Pieces the text is to be cut into; empty until boundaries are assigned. */
  chunks: string[] = [];
  /** Whether a break opportunity already exists right after this piece. */
  hasBreakOpportunityAfter = false;

  constructor(nodeOrText: Text | string) {
    this.nodeOrText = nodeOrText;
  }

  /** True when this wraps a plain string rather than a `Text` node. */
  get isString(): boolean {
    return typeof this.nodeOrText === 'string';
  }

  /** Only `Text` nodes can be split. */
  get canSplit(): boolean {
    return !this.isString;
  }

  /** The string itself, or the `nodeValue` of the wrapped `Text` node. */
  get text(): string | null {
    if (this.isString) return this.nodeOrText as string;
    return (this.nodeOrText as Text).nodeValue;
  }

  /** Length of `text`; 0 when there is no text. */
  get length(): number {
    const value = this.text;
    return value == null ? 0 : value.length;
  }

  /**
   * Applies `chunks` to the wrapped `Text` node. Does nothing when there are
   * fewer than two chunks.
   *
   * @param separator A string joined between the chunks inside the existing
   *     node, or a node that is cloned between per-chunk `Text` nodes which
   *     then replace the existing node.
   */
  split(separator: string | Node) {
    const pieces = this.chunks;
    assert(pieces.length === 0 || pieces.join('') === this.text);
    if (pieces.length <= 1) return;
    assert(this.canSplit);
    const textNode = this.nodeOrText as Text;

    if (typeof separator !== 'string') {
      // One deep clone of the separator between every pair of adjacent chunks;
      // an empty chunk produces no `Text` node but still keeps its separators.
      const doc = textNode.ownerDocument;
      const replacements: Node[] = [];
      for (let i = 0; i < pieces.length; ++i) {
        if (i > 0) replacements.push(separator.cloneNode(true));
        const piece = pieces[i];
        if (piece) replacements.push(doc.createTextNode(piece));
      }
      textNode.replaceWith(...replacements);
      return;
    }

    // String separator: rewrite the node's text in place.
    textNode.nodeValue = pieces.join(separator);
  }
}
|
/**
 * Exported subclass of the module-private `NodeOrText`; adds nothing of its
 * own. As the name indicates, it exists so the class can be reached from
 * outside this module for testing.
 */
export class NodeOrTextForTesting extends NodeOrText {}
|
262 |
|
263 |
|
264 |
|
265 |
|
266 |
|
267 |
|
268 |
|
269 |
|
270 |
|
/**
 * A run of text pieces that is analyzed as one unit, together with the
 * element it was collected for.
 */
class Paragraph {
  /** Element this paragraph was created for. */
  element: HTMLElement;
  /** Text pieces of the paragraph, in collection order. */
  nodes: NodeOrText[] = [];

  constructor(element: HTMLElement) {
    this.element = element;
  }

  /** True when no text piece has been collected. */
  isEmpty(): boolean {
    return this.nodes.length === 0;
  }

  /** Concatenated text of all pieces; a piece without text contributes ''. */
  get text(): string {
    let joined = '';
    for (const piece of this.nodes) joined += piece.text ?? '';
    return joined;
  }

  /** The most recently collected piece, or `undefined` when empty. */
  get lastNode(): NodeOrText | undefined {
    const count = this.nodes.length;
    return count > 0 ? this.nodes[count - 1] : undefined;
  }

  /** Flags a break opportunity after the last piece, if there is one. */
  setHasBreakOpportunityAfter() {
    const tail = this.lastNode;
    if (tail) tail.hasBreakOpportunityAfter = true;
  }

  /**
   * Lists the offsets, in `text`, at which a break opportunity already exists:
   * right after every U+200B found in a splittable piece, and at the end of
   * every piece flagged with `hasBreakOpportunityAfter`.
   *
   * @returns Offsets in ascending order; a position may appear twice.
   */
  getForcedOpportunities(): number[] {
    const found: number[] = [];
    let offset = 0;
    for (const piece of this.nodes) {
      const content = piece.canSplit ? piece.text : null;
      if (content) {
        for (let i = 0; i < content.length; ++i) {
          if (content.charCodeAt(i) !== ZWSP_CODEPOINT) continue;
          // The opportunity lies after the ZWSP, hence the `+ 1`.
          found.push(offset + i + 1);
        }
      }
      offset += piece.length;
      if (piece.hasBreakOpportunityAfter) found.push(offset);
    }
    return found;
  }

  /**
   * Drops from `boundaries` every offset at which a break opportunity already
   * exists.
   *
   * @param boundaries Candidate offsets into `text`.
   * @returns `boundaries` itself when nothing has to be excluded, otherwise a
   *     new filtered array.
   */
  excludeForcedOpportunities(boundaries: number[]): number[] {
    const forced = this.getForcedOpportunities();
    if (forced.length === 0) return boundaries;
    const forcedSet = new Set<number>(forced);
    return boundaries.filter(offset => !forcedSet.has(offset));
  }
}
|
/**
 * Exported subclass of the module-private `Paragraph`; adds nothing of its
 * own. As the name indicates, it exists so the class can be reached from
 * outside this module for testing.
 */
export class ParagraphForTesting extends Paragraph {}
|
333 |
|
334 |
|
335 |
|
336 |
|
/** Options accepted by the `HTMLProcessor` constructor. */
export interface HTMLProcessorOptions {
  /**
   * What to put at each boundary inside a text node. A string is inserted
   * into the node's text; a `Node` is deep-cloned between the resulting
   * pieces. When omitted, `HTMLProcessor` keeps its default of U+200B ZERO
   * WIDTH SPACE.
   */
  separator?: string | Node;

  /**
   * Class name added to the element of every paragraph that was processed.
   * Elements that already carry this class are not processed again. When
   * omitted (or empty), `applyWrapStyle` is called on the element instead.
   */
  className?: string;
}
|
355 |
|
356 |
|
357 |
|
358 |
|
/**
 * Walks a DOM tree, groups its text into paragraphs, asks a `Parser` for the
 * boundaries inside each paragraph and inserts a separator at every boundary.
 */
export class HTMLProcessor {
  /** Parser that supplies the boundaries for a paragraph's text. */
  private parser_: Parser;

  /**
   * Class added to processed paragraph elements; elements already carrying it
   * are skipped. When unset, `applyWrapStyle` is used instead.
   */
  className?: string;

  /** Inserted at every boundary; defaults to U+200B ZERO WIDTH SPACE. */
  separator: string | Node = ZWSP;

  /**
   * @param parser Parser used to find boundaries.
   * @param options Optional overrides for `className` and `separator`; only
   *     properties that are not `undefined` are applied.
   */
  constructor(parser: Parser, options?: HTMLProcessorOptions) {
    this.parser_ = parser;
    if (options !== undefined) {
      if (options.className !== undefined) this.className = options.className;
      if (options.separator !== undefined) this.separator = options.separator;
    }
  }

  /**
   * Tells whether an element has a text node as a direct child.
   *
   * @param ele The element whose direct children are inspected.
   * @returns `true` if at least one direct child is a text node.
   */
  static hasChildTextNode(ele: HTMLElement) {
    for (const child of ele.childNodes) {
      if (child.nodeType === NODETYPE.TEXT) return true;
    }
    return false;
  }

  /**
   * Processes every paragraph found under `element`.
   *
   * @param element Root of the subtree to process; it is modified in place.
   */
  applyToElement(element: HTMLElement) {
    for (const block of this.getBlocks(element)) {
      assert(!block.isEmpty());
      this.applyToParagraph(block);
    }
  }

  /**
   * Yields the paragraphs found under `element`, depth first.
   *
   * The caller is expected to process each paragraph as it is yielded: on a
   * forced break the enclosing paragraph object is yielded and then emptied
   * for reuse, so its `nodes` are only valid until the generator is resumed.
   *
   * @param element Element to collect from.
   * @param parent Paragraph of the enclosing element, if any; inline content
   *     is appended to it.
   */
  *getBlocks(
    element: HTMLElement,
    parent?: Paragraph
  ): IterableIterator<Paragraph> {
    assert(element.nodeType === NodeType.ELEMENT_NODE);

    // Skip elements that were already processed (they carry the class).
    if (this.className && element.classList.contains(this.className)) return;

    const action = actionForElement(element);
    if (action === DomAction.Skip) return;
    if (action === DomAction.Break) {
      // Forced break: emit what was collected so far and start over.
      if (parent && !parent.isEmpty()) {
        parent.setHasBreakOpportunityAfter();
        yield parent;
        parent.nodes = [];
      }
      assert(!element.firstChild);
      return;
    }
    if (action === DomAction.BreakOpportunity) {
      if (parent) parent.setHasBreakOpportunityAfter();
      return;
    }

    assert(
      action === DomAction.Block ||
        action === DomAction.Inline ||
        action === DomAction.NoBreak
    );
    // A block element, or a root without a parent paragraph, starts a new
    // paragraph; anything else contributes to the enclosing one.
    const isNewBlock = !parent || action === DomAction.Block;
    const block = isNewBlock ? new Paragraph(element) : parent;

    // Iterate over a snapshot. `childNodes` is live, and the consumer may
    // replace already-collected text nodes with several nodes while this
    // generator is suspended at a `yield`; walking the live list would then
    // revisit (and re-split) nodes whose index has shifted.
    for (const child of Array.from(element.childNodes)) {
      if (child.nodeType === NodeType.ELEMENT_NODE) {
        yield* this.getBlocks(child as HTMLElement, block);
      } else if (child.nodeType === NodeType.TEXT_NODE) {
        if (action === DomAction.NoBreak) {
          // Keep the text for context only: a plain string is never split.
          const text = child.nodeValue;
          if (text) block.nodes.push(new NodeOrText(text));
        } else {
          block.nodes.push(new NodeOrText(child as Text));
        }
      }
    }

    // A paragraph created here is emitted here; inherited ones are emitted by
    // the call that created them.
    if (isNewBlock && !block.isEmpty()) yield block;
  }

  /**
   * Finds the boundaries of one paragraph and applies them to its nodes.
   * Nothing happens when no node can be split, when the text is empty or
   * whitespace only, or when the parser returns no boundary.
   *
   * @param paragraph The paragraph to process; must not be empty.
   */
  applyToParagraph(paragraph: Paragraph): void {
    assert(paragraph.nodes.length > 0);
    if (!paragraph.nodes.some(node => node.canSplit)) return;
    const text = paragraph.text;
    if (/^\s*$/.test(text)) return;

    const boundaries = this.parser_.parseBoundaries(text);
    if (boundaries.length <= 0) return;

    // Boundaries are expected strictly ascending and strictly inside the text.
    assert(boundaries[0] > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    assert(boundaries[boundaries.length - 1] < text.length);

    // Build a fresh array: `excludeForcedOpportunities` may hand back the
    // parser's own array, which must not be modified. The extra entry past
    // the end of the text is the sentinel `splitNodes` requires.
    const adjustedBoundaries = [
      ...paragraph.excludeForcedOpportunities(boundaries),
      text.length + 1,
    ];

    this.splitNodes(paragraph.nodes, adjustedBoundaries);
    this.applyBlockStyle(paragraph.element);
  }

  /**
   * Distributes paragraph-level boundaries over the nodes and splits each
   * node accordingly.
   *
   * @param nodes The pieces of the paragraph, in order.
   * @param boundaries Strictly ascending offsets into the concatenated text;
   *     the last entry must be greater than the total text length.
   */
  splitNodes(nodes: NodeOrText[], boundaries: number[]): void {
    assert(boundaries.length > 0);
    assert(boundaries.every((x, i) => i === 0 || x > boundaries[i - 1]));
    const textLen = nodes.reduce((sum, node) => sum + node.length, 0);
    // The sentinel guarantees the index below never runs off the array.
    assert(boundaries[boundaries.length - 1] > textLen);

    let boundaryIndex = 0;
    let boundary = boundaries[0];
    assert(boundary > 0);
    let nodeStart = 0; // Offset of the current node in the paragraph text.
    let lastNode: NodeOrText | null = null;
    for (const node of nodes) {
      assert(boundary >= nodeStart);
      assert(node.chunks.length === 0);
      const nodeText = node.text;
      if (!nodeText) continue;
      const nodeLength = nodeText.length;
      const nodeEnd = nodeStart + nodeLength;
      assert(!lastNode || lastNode.canSplit);
      if (!node.canSplit) {
        // A boundary exactly at the start of an unsplittable node is realized
        // as an empty trailing chunk of the preceding splittable node, which
        // puts a separator at its end.
        if (lastNode && boundary === nodeStart) {
          if (lastNode.chunks.length === 0)
            lastNode.chunks.push(lastNode.text ?? '');
          lastNode.chunks.push('');
        }
        // Boundaries inside an unsplittable node are dropped.
        while (boundary < nodeEnd) {
          boundary = boundaries[++boundaryIndex];
        }
        lastNode = null;
        nodeStart = nodeEnd;
        continue;
      }

      lastNode = node;
      // No boundary falls inside this node; a boundary exactly at its end is
      // handled while processing the next node.
      if (boundary >= nodeEnd) {
        nodeStart = nodeEnd;
        continue;
      }

      // Cut the node's text at every boundary that falls inside it.
      const chunks = node.chunks;
      let chunkStartInNode = 0;
      while (boundary < nodeEnd) {
        const boundaryInNode = boundary - nodeStart;
        assert(boundaryInNode >= chunkStartInNode);
        chunks.push(nodeText.slice(chunkStartInNode, boundaryInNode));
        chunkStartInNode = boundaryInNode;
        boundary = boundaries[++boundaryIndex];
      }
      // The remainder after the last boundary is the final chunk.
      assert(chunkStartInNode < nodeLength);
      chunks.push(nodeText.slice(chunkStartInNode));

      nodeStart = nodeEnd;
    }

    // All text was consumed and only the sentinel (or a boundary at the very
    // end) is left.
    assert(nodeStart === textLen);
    assert(boundaryIndex < boundaries.length);
    assert(boundaries[boundaryIndex] >= textLen);

    for (const node of nodes) {
      node.split(this.separator);
    }
  }

  /**
   * Marks an element as processed: adds `className` when one is configured,
   * otherwise calls `applyWrapStyle` on it.
   *
   * @param element The paragraph element.
   */
  applyBlockStyle(element: HTMLElement): void {
    if (this.className) {
      element.classList.add(this.className);
      return;
    }
    applyWrapStyle(element);
  }
}
|
588 |
|
589 |
|
590 |
|
591 |
|
/**
 * A `Parser` that can also apply its boundaries to DOM elements and HTML
 * strings through an `HTMLProcessor`.
 */
export class HTMLProcessingParser extends Parser {
  /** Processor bound to this parser. */
  htmlProcessor: HTMLProcessor;

  /**
   * @param model Model passed to the `Parser` constructor.
   * @param htmlProcessorOptions Options for the internal `HTMLProcessor`;
   *     defaults to using U+200B ZERO WIDTH SPACE as the separator.
   */
  constructor(
    model: {[key: string]: {[key: string]: number}},
    htmlProcessorOptions: HTMLProcessorOptions = {
      separator: ZWSP,
    }
  ) {
    super(model);
    this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
  }

  /**
   * Logs a deprecation warning and forwards to `applyToElement`.
   *
   * @param parentElement Root of the subtree to process.
   * @deprecated Use `applyToElement` instead.
   */
  applyElement(parentElement: HTMLElement) {
    console.warn(
      '`applyElement` is deprecated. Please use `applyToElement` instead. ' +
        '`applyElement` will be removed in v0.7.0.'
    );
    this.applyToElement(parentElement);
  }

  /**
   * Processes the subtree under `parentElement` in place.
   *
   * @param parentElement Root of the subtree to process.
   */
  applyToElement(parentElement: HTMLElement) {
    this.htmlProcessor.applyToElement(parentElement);
  }

  /**
   * Processes an HTML string and returns the resulting HTML.
   *
   * @param html Source HTML; an empty string is returned unchanged.
   * @returns The `innerHTML` of the parsed body after processing.
   */
  translateHTMLString(html: string) {
    if (html === '') return html;
    const doc = parseFromString(html);
    if (HTMLProcessor.hasChildTextNode(doc.body)) {
      // Text directly under <body> needs an element of its own so that the
      // paragraph element is part of the returned HTML.
      const wrapper = doc.createElement('span');
      wrapper.append(...doc.body.childNodes);
      doc.body.append(wrapper);
    }
    // Process every top-level element, not just the first child: the body may
    // be empty, may start with a non-element node such as a comment, or may
    // hold several sibling elements. Iterate a snapshot since the list is
    // live.
    for (const child of Array.from(doc.body.childNodes)) {
      if (child.nodeType === NodeType.ELEMENT_NODE) {
        this.applyToElement(child as HTMLElement);
      }
    }
    return doc.body.innerHTML;
  }
}
|