UNPKG

20.8 kBJavaScriptView Raw
1import { defaults } from './defaults.js';
2import {
3 rtrim,
4 splitCells,
5 escape,
6 findClosingBracket
7} from './helpers.js';
8
/**
 * Build a `link` or `image` token from a matched link capture.
 *
 * @param {Array<string>} cap - Regex capture. `cap[0]` is the full match (a
 *   leading `!` selects the image form) and `cap[1]` is the bracketed label.
 * @param {{href: string, title?: string}} link - Resolved destination; a falsy
 *   `title` becomes `null` on the token, otherwise it is passed through `escape`.
 * @param {string} raw - Source text stored on the token as `raw`.
 * @param {object} lexer - Object exposing `state.inLink` and
 *   `inlineTokens(text, tokens)`.
 * @returns {object} A `link` token (with child `tokens`) or an `image` token.
 */
function outputLink(cap, link, raw, lexer) {
  const href = link.href;
  const title = link.title ? escape(link.title) : null;
  // Unescape `\[` and `\]` inside the label.
  const text = cap[1].replace(/\\([\[\]])/g, '$1');

  if (cap[0].charAt(0) === '!') {
    return {
      type: 'image',
      raw,
      href,
      title,
      text: escape(text)
    };
  }

  // The flag is raised while the label is tokenized. `finally` guarantees it is
  // lowered again even if `inlineTokens` throws, so one failed parse cannot
  // leave the lexer permanently "inside a link".
  lexer.state.inLink = true;
  try {
    return {
      type: 'link',
      raw,
      href,
      title,
      text,
      tokens: lexer.inlineTokens(text, [])
    };
  } finally {
    lexer.state.inLink = false;
  }
}
36
/**
 * Remove the opening fence's indentation from each line of a fenced code body.
 *
 * If `raw` starts with whitespace followed by a fence (three backticks or
 * three tildes), every line of `text` that is indented at least as far as the
 * fence has that many leading characters removed. Lines with less indentation,
 * or none, are returned untouched. When the fence is not indented, `text` is
 * returned as is.
 *
 * @param {string} raw - Full fenced block, starting at the opening fence line.
 * @param {string} text - Code body found between the fences.
 * @returns {string} The body with the fence indentation stripped where present.
 */
function indentCodeCompensation(raw, text) {
  // Both fence characters are accepted so `~~~` blocks get the same treatment
  // as backtick blocks.
  const fenceIndentMatch = raw.match(/^(\s+)(?:```|~~~)/);

  if (fenceIndentMatch === null) {
    return text;
  }

  const fenceIndentLength = fenceIndentMatch[1].length;

  return text
    .split('\n')
    .map(line => {
      const lineIndentMatch = line.match(/^\s+/);
      if (lineIndentMatch === null) {
        return line;
      }

      return lineIndentMatch[0].length >= fenceIndentLength
        ? line.slice(fenceIndentLength)
        : line;
    })
    .join('\n');
}
64
/**
 * Tokenizer
 *
 * Each method runs one regular expression taken from `this.rules` against
 * `src` and returns a token object on a match, or `undefined` otherwise.
 *
 * `this.rules` (with `block` and `inline` groups) and `this.lexer` are read by
 * the methods but are not assigned anywhere in this class; the owner must set
 * them before any method is called.
 */
export class Tokenizer {
  /**
   * @param {object} [options] - Parser options; falls back to the imported
   *   `defaults` when falsy.
   */
  constructor(options) {
    this.options = options || defaults;
  }

  /**
   * Match the `block.newline` rule.
   * @param {string} src
   * @returns {object|undefined} A `space` token when the match is non-empty.
   */
  space(src) {
    const cap = this.rules.block.newline.exec(src);
    if (cap && cap[0].length > 0) {
      return {
        type: 'space',
        raw: cap[0]
      };
    }
  }

  /**
   * Match an indented code block (`block.code`).
   * @param {string} src
   * @returns {object|undefined} A `code` token with `codeBlockStyle: 'indented'`.
   */
  code(src) {
    const cap = this.rules.block.code.exec(src);
    if (cap) {
      // Strip up to four leading spaces from every line.
      const text = cap[0].replace(/^ {1,4}/gm, '');
      return {
        type: 'code',
        raw: cap[0],
        codeBlockStyle: 'indented',
        // Trailing newlines are kept only in pedantic mode.
        text: !this.options.pedantic
          ? rtrim(text, '\n')
          : text
      };
    }
  }

  /**
   * Match a fenced code block (`block.fences`).
   * @param {string} src
   * @returns {object|undefined} A `code` token; `lang` is the trimmed info
   *   string (capture 2) and `text` is the body (capture 3) after
   *   `indentCodeCompensation`.
   */
  fences(src) {
    const cap = this.rules.block.fences.exec(src);
    if (cap) {
      const raw = cap[0];
      const text = indentCodeCompensation(raw, cap[3] || '');

      return {
        type: 'code',
        raw,
        lang: cap[2] ? cap[2].trim() : cap[2],
        text
      };
    }
  }

  /**
   * Match an ATX heading (`block.heading`).
   * @param {string} src
   * @returns {object|undefined} A `heading` token; `depth` is the length of
   *   capture 1. The text and the token's `tokens` array are handed to
   *   `this.lexer.inline`.
   */
  heading(src) {
    const cap = this.rules.block.heading.exec(src);
    if (cap) {
      let text = cap[2].trim();

      // remove trailing #s
      if (/#$/.test(text)) {
        const trimmed = rtrim(text, '#');
        if (this.options.pedantic) {
          text = trimmed.trim();
        } else if (!trimmed || / $/.test(trimmed)) {
          // CommonMark requires space before trailing #s
          text = trimmed.trim();
        }
      }

      const token = {
        type: 'heading',
        raw: cap[0],
        depth: cap[1].length,
        text,
        tokens: []
      };
      this.lexer.inline(token.text, token.tokens);
      return token;
    }
  }

  /**
   * Match a thematic break (`block.hr`).
   * @param {string} src
   * @returns {object|undefined} An `hr` token.
   */
  hr(src) {
    const cap = this.rules.block.hr.exec(src);
    if (cap) {
      return {
        type: 'hr',
        raw: cap[0]
      };
    }
  }

  /**
   * Match a blockquote (`block.blockquote`).
   * @param {string} src
   * @returns {object|undefined} A `blockquote` token whose child `tokens` come
   *   from `this.lexer.blockTokens` run on the text with `>` markers removed.
   */
  blockquote(src) {
    const cap = this.rules.block.blockquote.exec(src);
    if (cap) {
      const text = cap[0].replace(/^ *> ?/gm, '');

      return {
        type: 'blockquote',
        raw: cap[0],
        tokens: this.lexer.blockTokens(text, []),
        text
      };
    }
  }

  /**
   * Match a list (`block.list`) and split it into items.
   *
   * Items are consumed one at a time from `src`; each item's text is then
   * tokenized with `this.lexer.blockTokens`.
   *
   * @param {string} src
   * @returns {object|undefined} A `list` token, or `undefined` when the rule
   *   does not match or no item could be consumed (for example when the
   *   bullet line is really a thematic break).
   */
  list(src) {
    let cap = this.rules.block.list.exec(src);
    if (!cap) {
      return;
    }

    let bull = cap[1].trim();
    // Unordered markers are a single character; ordered ones are digits plus a delimiter.
    const isordered = bull.length > 1;

    const list = {
      type: 'list',
      raw: '',
      ordered: isordered,
      start: isordered ? +bull.slice(0, -1) : '',
      loose: false,
      items: []
    };

    bull = isordered ? `\\d{1,9}\\${bull.slice(-1)}` : `\\${bull}`;

    if (this.options.pedantic) {
      bull = isordered ? bull : '[*+-]';
    }

    // Get next list item
    const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*)?(?:\\n|$))`);

    // These two survive the loop: the last item is trimmed once the loop ends.
    let raw;
    let itemContents;
    let endsWithBlankLine = false;

    // Check if current bullet point can start a new List Item
    while (src) {
      cap = itemRegex.exec(src);
      if (!cap) {
        break;
      }

      if (this.rules.block.hr.test(src)) { // End list if bullet was actually HR (possibly move into itemRegex?)
        break;
      }

      raw = cap[0];
      src = src.substring(raw.length);

      const firstLine = cap[2].split('\n', 1)[0];
      const nextLine = src.split('\n', 1)[0];

      let indent;
      if (this.options.pedantic) {
        indent = 2;
        itemContents = firstLine.trimStart();
      } else {
        indent = cap[2].search(/[^ ]/); // Find first non-space char
        indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent
        itemContents = firstLine.slice(indent);
        indent += cap[1].length;
      }

      let blankLine = false;
      let endEarly = false;

      if (!firstLine && /^ *$/.test(nextLine)) { // Items begin with at most one blank line
        raw += nextLine + '\n';
        src = src.substring(nextLine.length + 1);
        endEarly = true;
      }

      if (!endEarly) {
        const nextBulletRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])`);

        // Check if following lines should be included in List Item
        while (src) {
          const rawLine = src.split('\n', 1)[0];
          let line = rawLine;

          // Re-align to follow commonmark nesting rules. The replacement is
          // two spaces so a re-aligned line meets the pedantic indent of 2.
          if (this.options.pedantic) {
            line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, '  ');
          }

          // End list item if found start of new bullet
          if (nextBulletRegex.test(line)) {
            break;
          }

          if (line.search(/[^ ]/) >= indent || !line.trim()) { // Dedent if possible
            itemContents += '\n' + line.slice(indent);
          } else if (!blankLine) { // Until blank line, item doesn't need indentation
            itemContents += '\n' + line;
          } else { // Otherwise, improper indentation ends this item
            break;
          }

          if (!blankLine && !line.trim()) { // Check if current line is blank
            blankLine = true;
          }

          raw += rawLine + '\n';
          src = src.substring(rawLine.length + 1);
        }
      }

      if (!list.loose) {
        // If the previous item ended with a blank line, the list is loose
        if (endsWithBlankLine) {
          list.loose = true;
        } else if (/\n *\n *$/.test(raw)) {
          endsWithBlankLine = true;
        }
      }

      // Check for task list items. Both values are scoped to this item so a
      // plain item never inherits `checked` from an earlier task item.
      let istask = null;
      let ischecked;
      if (this.options.gfm) {
        istask = /^\[[ xX]\] /.exec(itemContents);
        if (istask) {
          ischecked = istask[0] !== '[ ] ';
          itemContents = itemContents.replace(/^\[[ xX]\] +/, '');
        }
      }

      list.items.push({
        type: 'list_item',
        raw,
        task: !!istask,
        checked: ischecked,
        loose: false,
        text: itemContents
      });

      list.raw += raw;
    }

    // The loop can stop before consuming anything (the bullet line was an
    // HR). There is no list then, so report "no match" instead of
    // dereferencing a missing last item.
    if (list.items.length === 0) {
      return;
    }

    // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic
    const lastItem = list.items[list.items.length - 1];
    lastItem.raw = raw.trimEnd();
    lastItem.text = itemContents.trimEnd();
    list.raw = list.raw.trimEnd();

    // Item child tokens handled here at end because we needed to have the final item to trim it first
    for (const item of list.items) {
      this.lexer.state.top = false;
      item.tokens = this.lexer.blockTokens(item.text, []);
      const spacers = item.tokens.filter(t => t.type === 'space');
      // True when every spacer holds at least two line breaks.
      const hasMultipleLineBreaks = spacers.every(t => t.raw.split('\n').length > 2);

      if (!list.loose && spacers.length && hasMultipleLineBreaks) {
        // Having a single line break doesn't mean a list is loose. A single line break is terminating the last list item
        list.loose = true;
        item.loose = true;
      }
    }

    return list;
  }

  /**
   * Match a block of HTML (`block.html`).
   * @param {string} src
   * @returns {object|undefined} An `html` token. When `options.sanitize` is
   *   set it is turned into a `paragraph` token whose text is the output of
   *   `options.sanitizer` (if provided) or `escape`.
   */
  html(src) {
    const cap = this.rules.block.html.exec(src);
    if (cap) {
      const token = {
        type: 'html',
        raw: cap[0],
        pre: !this.options.sanitizer
          && (cap[1] === 'pre' || cap[1] === 'script' || cap[1] === 'style'),
        text: cap[0]
      };
      if (this.options.sanitize) {
        token.type = 'paragraph';
        token.text = this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0]);
        token.tokens = [];
        this.lexer.inline(token.text, token.tokens);
      }
      return token;
    }
  }

  /**
   * Match a link reference definition (`block.def`).
   * @param {string} src
   * @returns {object|undefined} A `def` token. `tag` is the lower-cased label
   *   with whitespace runs collapsed; `title` has its first and last
   *   character removed.
   */
  def(src) {
    const cap = this.rules.block.def.exec(src);
    if (cap) {
      if (cap[3]) cap[3] = cap[3].substring(1, cap[3].length - 1);
      const tag = cap[1].toLowerCase().replace(/\s+/g, ' ');
      return {
        type: 'def',
        tag,
        raw: cap[0],
        href: cap[2],
        title: cap[3]
      };
    }
  }

  /**
   * Match a table (`block.table`).
   * @param {string} src
   * @returns {object|undefined} A `table` token, or `undefined` when the rule
   *   does not match or the header and delimiter rows have different column
   *   counts.
   */
  table(src) {
    const cap = this.rules.block.table.exec(src);
    if (cap) {
      const item = {
        type: 'table',
        header: splitCells(cap[1]).map(c => ({ text: c })),
        align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */),
        rows: cap[3] ? cap[3].replace(/\n[ \t]*$/, '').split('\n') : []
      };

      if (item.header.length === item.align.length) {
        item.raw = cap[0];

        // Translate each delimiter cell into an alignment keyword (or null).
        for (let i = 0; i < item.align.length; i++) {
          if (/^ *-+: *$/.test(item.align[i])) {
            item.align[i] = 'right';
          } else if (/^ *:-+: *$/.test(item.align[i])) {
            item.align[i] = 'center';
          } else if (/^ *:-+ *$/.test(item.align[i])) {
            item.align[i] = 'left';
          } else {
            item.align[i] = null;
          }
        }

        for (let i = 0; i < item.rows.length; i++) {
          item.rows[i] = splitCells(item.rows[i], item.header.length).map(c => ({ text: c }));
        }

        // parse child tokens inside headers and cells

        // header child tokens
        for (const headerCell of item.header) {
          headerCell.tokens = [];
          this.lexer.inlineTokens(headerCell.text, headerCell.tokens);
        }

        // cell child tokens
        for (const row of item.rows) {
          for (const cell of row) {
            cell.tokens = [];
            this.lexer.inlineTokens(cell.text, cell.tokens);
          }
        }

        return item;
      }
    }
  }

  /**
   * Match a setext heading (`block.lheading`).
   * @param {string} src
   * @returns {object|undefined} A `heading` token; depth 1 when the underline
   *   starts with `=`, otherwise 2.
   */
  lheading(src) {
    const cap = this.rules.block.lheading.exec(src);
    if (cap) {
      const token = {
        type: 'heading',
        raw: cap[0],
        depth: cap[2].charAt(0) === '=' ? 1 : 2,
        text: cap[1],
        tokens: []
      };
      this.lexer.inline(token.text, token.tokens);
      return token;
    }
  }

  /**
   * Match a paragraph (`block.paragraph`).
   * @param {string} src
   * @returns {object|undefined} A `paragraph` token; one trailing newline is
   *   removed from the text.
   */
  paragraph(src) {
    const cap = this.rules.block.paragraph.exec(src);
    if (cap) {
      const token = {
        type: 'paragraph',
        raw: cap[0],
        text: cap[1].charAt(cap[1].length - 1) === '\n'
          ? cap[1].slice(0, -1)
          : cap[1],
        tokens: []
      };
      this.lexer.inline(token.text, token.tokens);
      return token;
    }
  }

  /**
   * Match block-level text (`block.text`).
   * @param {string} src
   * @returns {object|undefined} A `text` token.
   */
  text(src) {
    const cap = this.rules.block.text.exec(src);
    if (cap) {
      const token = {
        type: 'text',
        raw: cap[0],
        text: cap[0],
        tokens: []
      };
      this.lexer.inline(token.text, token.tokens);
      return token;
    }
  }

  /**
   * Match an inline escape sequence (`inline.escape`).
   * @param {string} src
   * @returns {object|undefined} An `escape` token whose text is capture 1
   *   passed through `escape`.
   */
  escape(src) {
    const cap = this.rules.inline.escape.exec(src);
    if (cap) {
      return {
        type: 'escape',
        raw: cap[0],
        text: escape(cap[1])
      };
    }
  }

  /**
   * Match an inline HTML tag (`inline.tag`) and update `lexer.state.inLink` /
   * `lexer.state.inRawBlock` when the tag opens or closes an anchor or a
   * pre/code/kbd/script element.
   * @param {string} src
   * @returns {object|undefined} An `html` token, or a `text` token when
   *   `options.sanitize` is set.
   */
  tag(src) {
    const cap = this.rules.inline.tag.exec(src);
    if (cap) {
      if (!this.lexer.state.inLink && /^<a /i.test(cap[0])) {
        this.lexer.state.inLink = true;
      } else if (this.lexer.state.inLink && /^<\/a>/i.test(cap[0])) {
        this.lexer.state.inLink = false;
      }
      if (!this.lexer.state.inRawBlock && /^<(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
        this.lexer.state.inRawBlock = true;
      } else if (this.lexer.state.inRawBlock && /^<\/(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
        this.lexer.state.inRawBlock = false;
      }

      return {
        type: this.options.sanitize
          ? 'text'
          : 'html',
        raw: cap[0],
        inLink: this.lexer.state.inLink,
        inRawBlock: this.lexer.state.inRawBlock,
        text: this.options.sanitize
          ? (this.options.sanitizer
            ? this.options.sanitizer(cap[0])
            : escape(cap[0]))
          : cap[0]
      };
    }
  }

  /**
   * Match an inline link or image (`inline.link`).
   * @param {string} src
   * @returns {object|undefined} A `link` or `image` token built by
   *   `outputLink`, or `undefined` when the rule does not match or an
   *   angle-bracketed destination is malformed (non-pedantic mode).
   */
  link(src) {
    const cap = this.rules.inline.link.exec(src);
    if (cap) {
      const trimmedUrl = cap[2].trim();
      if (!this.options.pedantic && /^</.test(trimmedUrl)) {
        // commonmark requires matching angle brackets
        if (!(/>$/.test(trimmedUrl))) {
          return;
        }

        // ending angle bracket cannot be escaped
        const rtrimSlash = rtrim(trimmedUrl.slice(0, -1), '\\');
        if ((trimmedUrl.length - rtrimSlash.length) % 2 === 0) {
          return;
        }
      } else {
        // find closing parenthesis
        const lastParenIndex = findClosingBracket(cap[2], '()');
        if (lastParenIndex > -1) {
          const start = cap[0].indexOf('!') === 0 ? 5 : 4;
          const linkLen = start + cap[1].length + lastParenIndex;
          cap[2] = cap[2].substring(0, lastParenIndex);
          cap[0] = cap[0].substring(0, linkLen).trim();
          cap[3] = '';
        }
      }
      let href = cap[2];
      let title = '';
      if (this.options.pedantic) {
        // split pedantic href and title
        const link = /^([^'"]*[^\s])\s+(['"])(.*)\2/.exec(href);

        if (link) {
          href = link[1];
          title = link[3];
        }
      } else {
        title = cap[3] ? cap[3].slice(1, -1) : '';
      }

      href = href.trim();
      if (/^</.test(href)) {
        if (this.options.pedantic && !(/>$/.test(trimmedUrl))) {
          // pedantic allows starting angle bracket without ending angle bracket
          href = href.slice(1);
        } else {
          href = href.slice(1, -1);
        }
      }
      return outputLink(cap, {
        href: href ? href.replace(this.rules.inline._escapes, '$1') : href,
        title: title ? title.replace(this.rules.inline._escapes, '$1') : title
      }, cap[0], this.lexer);
    }
  }

  /**
   * Match a reference-style link (`inline.reflink`, then `inline.nolink`).
   * @param {string} src
   * @param {object} links - Definitions looked up by lower-cased label with
   *   whitespace runs collapsed.
   * @returns {object|undefined} A token from `outputLink`, or a one-character
   *   `text` token when the label has no definition with an `href`.
   */
  reflink(src, links) {
    let cap;
    if ((cap = this.rules.inline.reflink.exec(src))
        || (cap = this.rules.inline.nolink.exec(src))) {
      let link = (cap[2] || cap[1]).replace(/\s+/g, ' ');
      link = links[link.toLowerCase()];
      if (!link || !link.href) {
        // Unknown reference: emit only the first character of the match as text.
        const text = cap[0].charAt(0);
        return {
          type: 'text',
          raw: text,
          text
        };
      }
      return outputLink(cap, link, cap[0], this.lexer);
    }
  }

  /**
   * Match emphasis or strong emphasis.
   * @param {string} src
   * @param {string} maskedSrc - Copy of the source that the right-delimiter
   *   rules are run against; it is clipped to line up with `src`.
   * @param {string} [prevChar=''] - Character immediately before `src`.
   * @returns {object|undefined} An `em` or `strong` token, or `undefined`
   *   when no closing delimiter run balances the opening one.
   */
  emStrong(src, maskedSrc, prevChar = '') {
    let match = this.rules.inline.emStrong.lDelim.exec(src);
    if (!match) return;

    // _ can't be between two alphanumerics. \p{L}\p{N} includes non-english alphabet/numbers as well
    if (match[3] && prevChar.match(/[\p{L}\p{N}]/u)) return;

    const nextChar = match[1] || match[2] || '';

    if (!nextChar || (nextChar && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) {
      const lLength = match[0].length - 1;
      let rDelim, rLength, delimTotal = lLength, midDelimTotal = 0;

      const endReg = match[0][0] === '*' ? this.rules.inline.emStrong.rDelimAst : this.rules.inline.emStrong.rDelimUnd;
      // Start scanning from the beginning on every call.
      endReg.lastIndex = 0;

      // Clip maskedSrc to same section of string as src (move to lexer?)
      maskedSrc = maskedSrc.slice(-1 * src.length + lLength);

      while ((match = endReg.exec(maskedSrc)) != null) {
        rDelim = match[1] || match[2] || match[3] || match[4] || match[5] || match[6];

        if (!rDelim) continue; // skip single * in __abc*abc__

        rLength = rDelim.length;

        if (match[3] || match[4]) { // found another Left Delim
          delimTotal += rLength;
          continue;
        } else if (match[5] || match[6]) { // either Left or Right Delim
          if (lLength % 3 && !((lLength + rLength) % 3)) {
            midDelimTotal += rLength;
            continue; // CommonMark Emphasis Rules 9-10
          }
        }

        delimTotal -= rLength;

        if (delimTotal > 0) continue; // Haven't found enough closing delimiters

        // Remove extra characters. *a*** -> *a*
        rLength = Math.min(rLength, rLength + delimTotal + midDelimTotal);

        // Create `em` if smallest delimiter has odd char count. *a***
        if (Math.min(lLength, rLength) % 2) {
          const text = src.slice(1, lLength + match.index + rLength);
          return {
            type: 'em',
            raw: src.slice(0, lLength + match.index + rLength + 1),
            text,
            tokens: this.lexer.inlineTokens(text, [])
          };
        }

        // Create 'strong' if smallest delimiter has even char count. **a***
        const text = src.slice(2, lLength + match.index + rLength - 1);
        return {
          type: 'strong',
          raw: src.slice(0, lLength + match.index + rLength + 1),
          text,
          tokens: this.lexer.inlineTokens(text, [])
        };
      }
    }
  }

  /**
   * Match an inline code span (`inline.code`).
   * @param {string} src
   * @returns {object|undefined} A `codespan` token. Newlines become spaces,
   *   and one space is stripped from each end when the content has a space on
   *   both ends and is not only spaces.
   */
  codespan(src) {
    const cap = this.rules.inline.code.exec(src);
    if (cap) {
      let text = cap[2].replace(/\n/g, ' ');
      const hasNonSpaceChars = /[^ ]/.test(text);
      const hasSpaceCharsOnBothEnds = /^ /.test(text) && / $/.test(text);
      if (hasNonSpaceChars && hasSpaceCharsOnBothEnds) {
        text = text.substring(1, text.length - 1);
      }
      text = escape(text, true);
      return {
        type: 'codespan',
        raw: cap[0],
        text
      };
    }
  }

  /**
   * Match a hard line break (`inline.br`).
   * @param {string} src
   * @returns {object|undefined} A `br` token.
   */
  br(src) {
    const cap = this.rules.inline.br.exec(src);
    if (cap) {
      return {
        type: 'br',
        raw: cap[0]
      };
    }
  }

  /**
   * Match strikethrough text (`inline.del`).
   * @param {string} src
   * @returns {object|undefined} A `del` token with child `tokens`.
   */
  del(src) {
    const cap = this.rules.inline.del.exec(src);
    if (cap) {
      return {
        type: 'del',
        raw: cap[0],
        text: cap[2],
        tokens: this.lexer.inlineTokens(cap[2], [])
      };
    }
  }

  /**
   * Match an angle-bracket autolink (`inline.autolink`).
   * @param {string} src
   * @param {function(string): string} mangle - Applied to e-mail addresses
   *   when `options.mangle` is set.
   * @returns {object|undefined} A `link` token with a single `text` child.
   */
  autolink(src, mangle) {
    const cap = this.rules.inline.autolink.exec(src);
    if (cap) {
      let text, href;
      if (cap[2] === '@') {
        text = escape(this.options.mangle ? mangle(cap[1]) : cap[1]);
        href = 'mailto:' + text;
      } else {
        text = escape(cap[1]);
        href = text;
      }

      return {
        type: 'link',
        raw: cap[0],
        text,
        href,
        tokens: [
          {
            type: 'text',
            raw: text,
            text
          }
        ]
      };
    }
  }

  /**
   * Match a bare URL or e-mail address (`inline.url`).
   * @param {string} src
   * @param {function(string): string} mangle - Applied to e-mail addresses
   *   when `options.mangle` is set.
   * @returns {object|undefined} A `link` token with a single `text` child.
   */
  url(src, mangle) {
    const cap = this.rules.inline.url.exec(src);
    if (cap) {
      let text, href;
      if (cap[2] === '@') {
        text = escape(this.options.mangle ? mangle(cap[0]) : cap[0]);
        href = 'mailto:' + text;
      } else {
        // do extended autolink path validation
        // Re-apply the backpedal rule until the match stops changing.
        let prevCapZero;
        do {
          prevCapZero = cap[0];
          cap[0] = this.rules.inline._backpedal.exec(cap[0])[0];
        } while (prevCapZero !== cap[0]);
        text = escape(cap[0]);
        if (cap[1] === 'www.') {
          href = 'http://' + text;
        } else {
          href = text;
        }
      }
      return {
        type: 'link',
        raw: cap[0],
        text,
        href,
        tokens: [
          {
            type: 'text',
            raw: text,
            text
          }
        ]
      };
    }
  }

  /**
   * Match plain inline text (`inline.text`).
   * @param {string} src
   * @param {function(string): string} smartypants - Applied outside raw
   *   blocks when `options.smartypants` is set.
   * @returns {object|undefined} A `text` token.
   */
  inlineText(src, smartypants) {
    const cap = this.rules.inline.text.exec(src);
    if (cap) {
      let text;
      if (this.lexer.state.inRawBlock) {
        // Inside a raw block the text is left alone unless sanitizing.
        text = this.options.sanitize ? (this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0])) : cap[0];
      } else {
        text = escape(this.options.smartypants ? smartypants(cap[0]) : cap[0]);
      }
      return {
        type: 'text',
        raw: cap[0],
        text
      };
    }
  }
}