UNPKG

21.5 kBJavaScriptView Raw
1import { defaults } from './defaults.js';
2import {
3 rtrim,
4 splitCells,
5 escape,
6 findClosingBracket
7} from './helpers.js';
8
/**
 * Build a `link` or `image` token from a matched link construct.
 *
 * A capture whose full match starts with `!` becomes an image token;
 * anything else becomes a link token whose label is tokenized through
 * `lexer.inlineTokens`.
 *
 * @param {Array<string>} cap - Regex capture; `cap[0]` is the full match and
 *   `cap[1]` is the label between the square brackets.
 * @param {{href: string, title?: string}} link - Resolved destination and
 *   optional title.
 * @param {string} raw - Exact source text consumed by this token.
 * @param {object} lexer - Lexer providing `state.inLink` and `inlineTokens()`.
 * @returns {object} A `link` token (with child `tokens`) or an `image` token.
 */
function outputLink(cap, link, raw, lexer) {
  const href = link.href;
  const title = link.title ? escape(link.title) : null;
  // Un-escape `\[` and `\]` inside the label.
  const text = cap[1].replace(/\\([\[\]])/g, '$1');

  if (cap[0].charAt(0) === '!') {
    return {
      type: 'image',
      raw,
      href,
      title,
      text: escape(text)
    };
  }

  // Flag the lexer as "inside a link" while the label is tokenized. The flag
  // is cleared in `finally` so that an exception thrown by `inlineTokens`
  // cannot leave the lexer permanently stuck in link mode.
  lexer.state.inLink = true;
  try {
    return {
      type: 'link',
      raw,
      href,
      title,
      text,
      tokens: lexer.inlineTokens(text, [])
    };
  } finally {
    lexer.state.inLink = false;
  }
}
35
/**
 * Remove from each line of a fenced code block's content the indentation
 * that belongs to the (indented) opening fence.
 *
 * CommonMark: if the opening fence is indented N spaces, up to N spaces of
 * indentation are removed from every content line. This applies to both
 * backtick and tilde fences, and a line indented by fewer than N characters
 * loses whatever indentation it has.
 *
 * @param {string} raw - Full source of the fenced block, starting at the
 *   opening fence line.
 * @param {string} text - The content between the fences.
 * @returns {string} `text` with the fence indentation compensated.
 */
function indentCodeCompensation(raw, text) {
  const fenceIndentMatch = raw.match(/^(\s+)(?:```|~~~)/);

  // Opening fence is not indented: nothing to compensate.
  if (fenceIndentMatch === null) {
    return text;
  }

  const fenceIndentLength = fenceIndentMatch[1].length;

  return text
    .split('\n')
    .map(line => {
      const lineIndentMatch = line.match(/^\s+/);
      if (lineIndentMatch === null) {
        return line;
      }

      // Strip at most the fence's indentation, and never more than the line
      // actually has.
      return line.slice(Math.min(lineIndentMatch[0].length, fenceIndentLength));
    })
    .join('\n');
}
63
/**
 * Tokenizer
 *
 * Turns the beginning of a Markdown source string into a single token.
 * Every method runs one rule from `this.rules` against `src` and returns a
 * token object on a match, or `undefined` when the rule does not apply.
 *
 * The constructor only stores `options`; `this.rules` (with `block` and
 * `inline` rule sets) and `this.lexer` are read by the methods and must be
 * assigned by the owner of the instance before any method is called.
 */
export class Tokenizer {
  /**
   * @param {object} [options] - Parser options; falls back to `defaults`.
   */
  constructor(options) {
    this.options = options || defaults;
  }

  /**
   * Match a run of blank lines.
   * @param {string} src
   * @returns {{type: 'space', raw: string}|undefined}
   */
  space(src) {
    const cap = this.rules.block.newline.exec(src);
    if (cap && cap[0].length > 0) {
      return {
        type: 'space',
        raw: cap[0]
      };
    }
  }

  /**
   * Match an indented code block.
   * @param {string} src
   * @returns {object|undefined} A `code` token with `codeBlockStyle: 'indented'`.
   */
  code(src) {
    const cap = this.rules.block.code.exec(src);
    if (cap) {
      // Drop up to four leading spaces from every line.
      const text = cap[0].replace(/^ {1,4}/gm, '');
      return {
        type: 'code',
        raw: cap[0],
        codeBlockStyle: 'indented',
        text: !this.options.pedantic
          ? rtrim(text, '\n')
          : text
      };
    }
  }

  /**
   * Match a fenced code block.
   * @param {string} src
   * @returns {object|undefined} A `code` token with `lang` and `text`.
   */
  fences(src) {
    const cap = this.rules.block.fences.exec(src);
    if (cap) {
      const raw = cap[0];
      const text = indentCodeCompensation(raw, cap[3] || '');

      return {
        type: 'code',
        raw,
        lang: cap[2] ? cap[2].trim() : cap[2],
        text
      };
    }
  }

  /**
   * Match an ATX heading (`# Title`).
   * @param {string} src
   * @returns {object|undefined} A `heading` token with inline child tokens.
   */
  heading(src) {
    const cap = this.rules.block.heading.exec(src);
    if (cap) {
      let text = cap[2].trim();

      // remove trailing #s
      if (/#$/.test(text)) {
        const trimmed = rtrim(text, '#');
        if (this.options.pedantic) {
          text = trimmed.trim();
        } else if (!trimmed || / $/.test(trimmed)) {
          // CommonMark requires space before trailing #s
          text = trimmed.trim();
        }
      }

      const token = {
        type: 'heading',
        raw: cap[0],
        depth: cap[1].length,
        text,
        tokens: []
      };
      this.lexer.inline(token.text, token.tokens);
      return token;
    }
  }

  /**
   * Match a horizontal rule.
   * @param {string} src
   * @returns {{type: 'hr', raw: string}|undefined}
   */
  hr(src) {
    const cap = this.rules.block.hr.exec(src);
    if (cap) {
      return {
        type: 'hr',
        raw: cap[0]
      };
    }
  }

  /**
   * Match a blockquote and tokenize its content as blocks.
   * @param {string} src
   * @returns {object|undefined} A `blockquote` token.
   */
  blockquote(src) {
    const cap = this.rules.block.blockquote.exec(src);
    if (cap) {
      // Strip the leading `>` marker (and one following space/tab) per line.
      const text = cap[0].replace(/^ *>[ \t]?/gm, '');

      return {
        type: 'blockquote',
        raw: cap[0],
        tokens: this.lexer.blockTokens(text, []),
        text
      };
    }
  }

  /**
   * Match a bullet or ordered list and split it into items.
   *
   * Returns `undefined` when the list rule matches but no item could be
   * collected (for example when the bullet line is actually a horizontal
   * rule), so the caller can try other rules.
   *
   * @param {string} src
   * @returns {object|undefined} A `list` token whose `items` are `list_item`
   *   tokens with block child tokens.
   */
  list(src) {
    let cap = this.rules.block.list.exec(src);
    if (cap) {
      let raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine,
        line, nextLine, rawLine, itemContents, endEarly;

      let bull = cap[1].trim();
      const isordered = bull.length > 1;

      const list = {
        type: 'list',
        raw: '',
        ordered: isordered,
        start: isordered ? +bull.slice(0, -1) : '',
        loose: false,
        items: []
      };

      // Regex source matching the same kind of bullet as the first item.
      bull = isordered ? `\\d{1,9}\\${bull.slice(-1)}` : `\\${bull}`;

      if (this.options.pedantic) {
        bull = isordered ? bull : '[*+-]';
      }

      // Get next list item
      const itemRegex = new RegExp(`^( {0,3}${bull})((?:[\t ][^\\n]*)?(?:\\n|$))`);

      // Check if current bullet point can start a new List Item
      while (src) {
        endEarly = false;
        // Task state is per item; reset it so a plain item following a task
        // item does not inherit the previous item's `checked` value.
        istask = null;
        ischecked = undefined;

        if (!(cap = itemRegex.exec(src))) {
          break;
        }

        if (this.rules.block.hr.test(src)) { // End list if bullet was actually HR (possibly move into itemRegex?)
          break;
        }

        raw = cap[0];
        src = src.substring(raw.length);

        line = cap[2].split('\n', 1)[0];
        nextLine = src.split('\n', 1)[0];

        if (this.options.pedantic) {
          indent = 2;
          itemContents = line.trimLeft();
        } else {
          indent = cap[2].search(/[^ ]/); // Find first non-space char
          indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent
          itemContents = line.slice(indent);
          indent += cap[1].length;
        }

        blankLine = false;

        if (!line && /^ *$/.test(nextLine)) { // Items begin with at most one blank line
          raw += nextLine + '\n';
          src = src.substring(nextLine.length + 1);
          endEarly = true;
        }

        if (!endEarly) {
          // Constructs indented less than the item's content end the item.
          const maxLeadingSpaces = Math.min(3, indent - 1);
          const nextBulletRegex = new RegExp(`^ {0,${maxLeadingSpaces}}(?:[*+-]|\\d{1,9}[.)])((?: [^\\n]*)?(?:\\n|$))`);
          const hrRegex = new RegExp(`^ {0,${maxLeadingSpaces}}((?:- *){3,}|(?:_ *){3,}|(?:\\* *){3,})(?:\\n+|$)`);
          const fencesBeginRegex = new RegExp(`^ {0,${maxLeadingSpaces}}(?:\`\`\`|~~~)`);
          const headingBeginRegex = new RegExp(`^ {0,${maxLeadingSpaces}}#`);

          // Check if following lines should be included in List Item
          while (src) {
            rawLine = src.split('\n', 1)[0];
            line = rawLine;

            // Re-align to follow commonmark nesting rules. Pedantic mode
            // fixes `indent` at 2, so the re-aligned line must carry two
            // leading spaces to pass the dedent test below; with a single
            // space it would be mistaken for a sibling bullet.
            if (this.options.pedantic) {
              line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, '  ');
            }

            // End list item if found code fences
            if (fencesBeginRegex.test(line)) {
              break;
            }

            // End list item if found start of new heading
            if (headingBeginRegex.test(line)) {
              break;
            }

            // End list item if found start of new bullet
            if (nextBulletRegex.test(line)) {
              break;
            }

            // Horizontal rule found
            if (hrRegex.test(src)) {
              break;
            }

            if (line.search(/[^ ]/) >= indent || !line.trim()) { // Dedent if possible
              itemContents += '\n' + line.slice(indent);
            } else if (!blankLine) { // Until blank line, item doesn't need indentation
              itemContents += '\n' + line;
            } else { // Otherwise, improper indentation ends this item
              break;
            }

            if (!blankLine && !line.trim()) { // Check if current line is blank
              blankLine = true;
            }

            raw += rawLine + '\n';
            src = src.substring(rawLine.length + 1);
          }
        }

        if (!list.loose) {
          // If the previous item ended with a blank line, the list is loose
          if (endsWithBlankLine) {
            list.loose = true;
          } else if (/\n *\n *$/.test(raw)) {
            endsWithBlankLine = true;
          }
        }

        // Check for task list items
        if (this.options.gfm) {
          istask = /^\[[ xX]\] /.exec(itemContents);
          if (istask) {
            ischecked = istask[0] !== '[ ] ';
            itemContents = itemContents.replace(/^\[[ xX]\] +/, '');
          }
        }

        list.items.push({
          type: 'list_item',
          raw,
          task: !!istask,
          checked: ischecked,
          loose: false,
          text: itemContents
        });

        list.raw += raw;
      }

      // The loop can stop before collecting any item (e.g. the bullet line
      // is a horizontal rule). There is no list to return in that case.
      if (list.items.length === 0) {
        return;
      }

      // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic
      const lastItem = list.items[list.items.length - 1];
      lastItem.raw = raw.trimRight();
      lastItem.text = itemContents.trimRight();
      list.raw = list.raw.trimRight();

      const l = list.items.length;

      // Item child tokens handled here at end because we needed to have the final item to trim it first
      for (i = 0; i < l; i++) {
        this.lexer.state.top = false;
        list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);
        const spacers = list.items[i].tokens.filter(t => t.type === 'space');
        // True when every `space` token spans more than one line break
        // (splitting on '\n' yields more than two parts).
        const hasMultipleLineBreaks = spacers.every(t => t.raw.split('\n').length > 2);

        if (!list.loose && spacers.length && hasMultipleLineBreaks) {
          // Having a single line break doesn't mean a list is loose. A single line break is terminating the last list item
          list.loose = true;
          list.items[i].loose = true;
        }
      }

      return list;
    }
  }

  /**
   * Match a block of raw HTML. With the `sanitize` option the block is
   * turned into a `paragraph` token holding sanitized/escaped text.
   * @param {string} src
   * @returns {object|undefined} An `html` (or `paragraph`) token.
   */
  html(src) {
    const cap = this.rules.block.html.exec(src);
    if (cap) {
      const token = {
        type: 'html',
        raw: cap[0],
        pre: !this.options.sanitizer
          && (cap[1] === 'pre' || cap[1] === 'script' || cap[1] === 'style'),
        text: cap[0]
      };
      if (this.options.sanitize) {
        token.type = 'paragraph';
        token.text = this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0]);
        token.tokens = [];
        this.lexer.inline(token.text, token.tokens);
      }
      return token;
    }
  }

  /**
   * Match a link reference definition (`[tag]: href "title"`).
   * @param {string} src
   * @returns {object|undefined} A `def` token; `tag` is lower-cased with
   *   whitespace runs collapsed to one space.
   */
  def(src) {
    const cap = this.rules.block.def.exec(src);
    if (cap) {
      // Strip the delimiters surrounding the title.
      if (cap[3]) cap[3] = cap[3].substring(1, cap[3].length - 1);
      const tag = cap[1].toLowerCase().replace(/\s+/g, ' ');
      return {
        type: 'def',
        tag,
        raw: cap[0],
        href: cap[2],
        title: cap[3]
      };
    }
  }

  /**
   * Match a table. Returns `undefined` when the header and the alignment
   * row do not have the same number of columns.
   * @param {string} src
   * @returns {object|undefined} A `table` token with `header`, `align`
   *   and `rows`; every cell carries `text` and inline `tokens`.
   */
  table(src) {
    const cap = this.rules.block.table.exec(src);
    if (cap) {
      const item = {
        type: 'table',
        header: splitCells(cap[1]).map(c => { return { text: c }; }),
        align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */),
        rows: cap[3] && cap[3].trim() ? cap[3].replace(/\n[ \t]*$/, '').split('\n') : []
      };

      if (item.header.length === item.align.length) {
        item.raw = cap[0];

        let l = item.align.length;
        let i, j, k, row;
        // Translate each delimiter cell (`:--`, `--:`, `:-:`) to an alignment.
        for (i = 0; i < l; i++) {
          if (/^ *-+: *$/.test(item.align[i])) {
            item.align[i] = 'right';
          } else if (/^ *:-+: *$/.test(item.align[i])) {
            item.align[i] = 'center';
          } else if (/^ *:-+ *$/.test(item.align[i])) {
            item.align[i] = 'left';
          } else {
            item.align[i] = null;
          }
        }

        l = item.rows.length;
        for (i = 0; i < l; i++) {
          item.rows[i] = splitCells(item.rows[i], item.header.length).map(c => { return { text: c }; });
        }

        // parse child tokens inside headers and cells

        // header child tokens
        l = item.header.length;
        for (j = 0; j < l; j++) {
          item.header[j].tokens = [];
          this.lexer.inline(item.header[j].text, item.header[j].tokens);
        }

        // cell child tokens
        l = item.rows.length;
        for (j = 0; j < l; j++) {
          row = item.rows[j];
          for (k = 0; k < row.length; k++) {
            row[k].tokens = [];
            this.lexer.inline(row[k].text, row[k].tokens);
          }
        }

        return item;
      }
    }
  }

  /**
   * Match a setext heading (text underlined with `=` or `-`).
   * @param {string} src
   * @returns {object|undefined} A `heading` token of depth 1 (`=`) or 2.
   */
  lheading(src) {
    const cap = this.rules.block.lheading.exec(src);
    if (cap) {
      const token = {
        type: 'heading',
        raw: cap[0],
        depth: cap[2].charAt(0) === '=' ? 1 : 2,
        text: cap[1],
        tokens: []
      };
      this.lexer.inline(token.text, token.tokens);
      return token;
    }
  }

  /**
   * Match a paragraph; a single trailing newline is dropped from `text`.
   * @param {string} src
   * @returns {object|undefined} A `paragraph` token with inline child tokens.
   */
  paragraph(src) {
    const cap = this.rules.block.paragraph.exec(src);
    if (cap) {
      const token = {
        type: 'paragraph',
        raw: cap[0],
        text: cap[1].charAt(cap[1].length - 1) === '\n'
          ? cap[1].slice(0, -1)
          : cap[1],
        tokens: []
      };
      this.lexer.inline(token.text, token.tokens);
      return token;
    }
  }

  /**
   * Match block-level text.
   * @param {string} src
   * @returns {object|undefined} A `text` token with inline child tokens.
   */
  text(src) {
    const cap = this.rules.block.text.exec(src);
    if (cap) {
      const token = {
        type: 'text',
        raw: cap[0],
        text: cap[0],
        tokens: []
      };
      this.lexer.inline(token.text, token.tokens);
      return token;
    }
  }

  /**
   * Match a backslash escape.
   * @param {string} src
   * @returns {object|undefined} An `escape` token with HTML-escaped text.
   */
  escape(src) {
    const cap = this.rules.inline.escape.exec(src);
    if (cap) {
      return {
        type: 'escape',
        raw: cap[0],
        text: escape(cap[1])
      };
    }
  }

  /**
   * Match an inline HTML tag and keep the lexer's `inLink` / `inRawBlock`
   * state in sync with opening and closing `a` / `pre|code|kbd|script` tags.
   * @param {string} src
   * @returns {object|undefined} An `html` token (`text` when sanitizing).
   */
  tag(src) {
    const cap = this.rules.inline.tag.exec(src);
    if (cap) {
      if (!this.lexer.state.inLink && /^<a /i.test(cap[0])) {
        this.lexer.state.inLink = true;
      } else if (this.lexer.state.inLink && /^<\/a>/i.test(cap[0])) {
        this.lexer.state.inLink = false;
      }
      if (!this.lexer.state.inRawBlock && /^<(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
        this.lexer.state.inRawBlock = true;
      } else if (this.lexer.state.inRawBlock && /^<\/(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
        this.lexer.state.inRawBlock = false;
      }

      return {
        type: this.options.sanitize
          ? 'text'
          : 'html',
        raw: cap[0],
        inLink: this.lexer.state.inLink,
        inRawBlock: this.lexer.state.inRawBlock,
        text: this.options.sanitize
          ? (this.options.sanitizer
            ? this.options.sanitizer(cap[0])
            : escape(cap[0]))
          : cap[0]
      };
    }
  }

  /**
   * Match an inline link or image (`[text](href "title")`).
   * @param {string} src
   * @returns {object|undefined} A `link` or `image` token.
   */
  link(src) {
    const cap = this.rules.inline.link.exec(src);
    if (cap) {
      const trimmedUrl = cap[2].trim();
      if (!this.options.pedantic && /^</.test(trimmedUrl)) {
        // commonmark requires matching angle brackets
        if (!(/>$/.test(trimmedUrl))) {
          return;
        }

        // ending angle bracket cannot be escaped
        const rtrimSlash = rtrim(trimmedUrl.slice(0, -1), '\\');
        if ((trimmedUrl.length - rtrimSlash.length) % 2 === 0) {
          return;
        }
      } else {
        // find closing parenthesis
        const lastParenIndex = findClosingBracket(cap[2], '()');
        if (lastParenIndex > -1) {
          // Length of the `[` `](` (plus `!` for images) punctuation that
          // precedes the destination in the full match.
          const start = cap[0].indexOf('!') === 0 ? 5 : 4;
          const linkLen = start + cap[1].length + lastParenIndex;
          cap[2] = cap[2].substring(0, lastParenIndex);
          cap[0] = cap[0].substring(0, linkLen).trim();
          cap[3] = '';
        }
      }
      let href = cap[2];
      let title = '';
      if (this.options.pedantic) {
        // split pedantic href and title
        const link = /^([^'"]*[^\s])\s+(['"])(.*)\2/.exec(href);

        if (link) {
          href = link[1];
          title = link[3];
        }
      } else {
        title = cap[3] ? cap[3].slice(1, -1) : '';
      }

      href = href.trim();
      if (/^</.test(href)) {
        if (this.options.pedantic && !(/>$/.test(trimmedUrl))) {
          // pedantic allows starting angle bracket without ending angle bracket
          href = href.slice(1);
        } else {
          href = href.slice(1, -1);
        }
      }
      return outputLink(cap, {
        href: href ? href.replace(this.rules.inline._escapes, '$1') : href,
        title: title ? title.replace(this.rules.inline._escapes, '$1') : title
      }, cap[0], this.lexer);
    }
  }

  /**
   * Match a reference link (`[text][ref]` or `[ref]`) and resolve it via
   * `links`. An unresolved reference yields a `text` token for the first
   * character only.
   * @param {string} src
   * @param {object} links - Map from normalized reference label to `{href, title}`.
   * @returns {object|undefined} A `link`, `image` or `text` token.
   */
  reflink(src, links) {
    let cap;
    if ((cap = this.rules.inline.reflink.exec(src))
        || (cap = this.rules.inline.nolink.exec(src))) {
      let link = (cap[2] || cap[1]).replace(/\s+/g, ' ');
      link = links[link.toLowerCase()];
      if (!link || !link.href) {
        const text = cap[0].charAt(0);
        return {
          type: 'text',
          raw: text,
          text
        };
      }
      return outputLink(cap, link, cap[0], this.lexer);
    }
  }

  /**
   * Match emphasis (`*a*`, `_a_`) or strong emphasis (`**a**`, `__a__`).
   * @param {string} src
   * @param {string} maskedSrc - Source in which the closing delimiter is
   *   searched; clipped here to the same section as `src`.
   * @param {string} [prevChar=''] - Character preceding `src`.
   * @returns {object|undefined} An `em` or `strong` token.
   */
  emStrong(src, maskedSrc, prevChar = '') {
    let match = this.rules.inline.emStrong.lDelim.exec(src);
    if (!match) return;

    // _ can't be between two alphanumerics. \p{L}\p{N} includes non-english alphabet/numbers as well
    if (match[3] && prevChar.match(/[\p{L}\p{N}]/u)) return;

    const nextChar = match[1] || match[2] || '';

    if (!nextChar || (nextChar && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) {
      const lLength = match[0].length - 1;
      let rDelim, rLength, delimTotal = lLength, midDelimTotal = 0;

      const endReg = match[0][0] === '*' ? this.rules.inline.emStrong.rDelimAst : this.rules.inline.emStrong.rDelimUnd;
      // Reset the regex position before scanning with repeated exec() calls.
      endReg.lastIndex = 0;

      // Clip maskedSrc to same section of string as src (move to lexer?)
      maskedSrc = maskedSrc.slice(-1 * src.length + lLength);

      while ((match = endReg.exec(maskedSrc)) != null) {
        rDelim = match[1] || match[2] || match[3] || match[4] || match[5] || match[6];

        if (!rDelim) continue; // skip single * in __abc*abc__

        rLength = rDelim.length;

        if (match[3] || match[4]) { // found another Left Delim
          delimTotal += rLength;
          continue;
        } else if (match[5] || match[6]) { // either Left or Right Delim
          if (lLength % 3 && !((lLength + rLength) % 3)) {
            midDelimTotal += rLength;
            continue; // CommonMark Emphasis Rules 9-10
          }
        }

        delimTotal -= rLength;

        if (delimTotal > 0) continue; // Haven't found enough closing delimiters

        // Remove extra characters. *a*** -> *a*
        rLength = Math.min(rLength, rLength + delimTotal + midDelimTotal);

        // Create `em` if smallest delimiter has odd char count. *a***
        if (Math.min(lLength, rLength) % 2) {
          const text = src.slice(1, lLength + match.index + rLength);
          return {
            type: 'em',
            raw: src.slice(0, lLength + match.index + rLength + 1),
            text,
            tokens: this.lexer.inlineTokens(text, [])
          };
        }

        // Create 'strong' if smallest delimiter has even char count. **a***
        const text = src.slice(2, lLength + match.index + rLength - 1);
        return {
          type: 'strong',
          raw: src.slice(0, lLength + match.index + rLength + 1),
          text,
          tokens: this.lexer.inlineTokens(text, [])
        };
      }
    }
  }

  /**
   * Match an inline code span.
   * @param {string} src
   * @returns {object|undefined} A `codespan` token with escaped text.
   */
  codespan(src) {
    const cap = this.rules.inline.code.exec(src);
    if (cap) {
      let text = cap[2].replace(/\n/g, ' ');
      const hasNonSpaceChars = /[^ ]/.test(text);
      const hasSpaceCharsOnBothEnds = /^ /.test(text) && / $/.test(text);
      // Strip one space from each end, unless the span is spaces only.
      if (hasNonSpaceChars && hasSpaceCharsOnBothEnds) {
        text = text.substring(1, text.length - 1);
      }
      text = escape(text, true);
      return {
        type: 'codespan',
        raw: cap[0],
        text
      };
    }
  }

  /**
   * Match a hard line break.
   * @param {string} src
   * @returns {{type: 'br', raw: string}|undefined}
   */
  br(src) {
    const cap = this.rules.inline.br.exec(src);
    if (cap) {
      return {
        type: 'br',
        raw: cap[0]
      };
    }
  }

  /**
   * Match strikethrough text.
   * @param {string} src
   * @returns {object|undefined} A `del` token with inline child tokens.
   */
  del(src) {
    const cap = this.rules.inline.del.exec(src);
    if (cap) {
      return {
        type: 'del',
        raw: cap[0],
        text: cap[2],
        tokens: this.lexer.inlineTokens(cap[2], [])
      };
    }
  }

  /**
   * Match an autolink in angle brackets (`<https://…>` or `<user@host>`).
   * @param {string} src
   * @param {function(string): string} mangle - Obfuscates an e-mail address;
   *   only called when `options.mangle` is set.
   * @returns {object|undefined} A `link` token with one `text` child.
   */
  autolink(src, mangle) {
    const cap = this.rules.inline.autolink.exec(src);
    if (cap) {
      let text, href;
      if (cap[2] === '@') {
        text = escape(this.options.mangle ? mangle(cap[1]) : cap[1]);
        href = 'mailto:' + text;
      } else {
        text = escape(cap[1]);
        href = text;
      }

      return {
        type: 'link',
        raw: cap[0],
        text,
        href,
        tokens: [
          {
            type: 'text',
            raw: text,
            text
          }
        ]
      };
    }
  }

  /**
   * Match a bare URL or e-mail address (extended autolink).
   * @param {string} src
   * @param {function(string): string} mangle - Obfuscates an e-mail address;
   *   only called when `options.mangle` is set.
   * @returns {object|undefined} A `link` token with one `text` child.
   */
  url(src, mangle) {
    const cap = this.rules.inline.url.exec(src);
    if (cap) {
      let text, href;
      if (cap[2] === '@') {
        text = escape(this.options.mangle ? mangle(cap[0]) : cap[0]);
        href = 'mailto:' + text;
      } else {
        // do extended autolink path validation
        // Re-apply the backpedal rule until the match stops changing.
        let prevCapZero;
        do {
          prevCapZero = cap[0];
          cap[0] = this.rules.inline._backpedal.exec(cap[0])[0];
        } while (prevCapZero !== cap[0]);
        text = escape(cap[0]);
        if (cap[1] === 'www.') {
          href = 'http://' + text;
        } else {
          href = text;
        }
      }
      return {
        type: 'link',
        raw: cap[0],
        text,
        href,
        tokens: [
          {
            type: 'text',
            raw: text,
            text
          }
        ]
      };
    }
  }

  /**
   * Match plain inline text. Inside a raw block the text is passed through
   * (or sanitized when `options.sanitize` is set); elsewhere it is escaped,
   * after `smartypants` when that option is enabled.
   * @param {string} src
   * @param {function(string): string} smartypants - Typographic replacer;
   *   only called when `options.smartypants` is set.
   * @returns {object|undefined} A `text` token.
   */
  inlineText(src, smartypants) {
    const cap = this.rules.inline.text.exec(src);
    if (cap) {
      let text;
      if (this.lexer.state.inRawBlock) {
        text = this.options.sanitize ? (this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0])) : cap[0];
      } else {
        text = escape(this.options.smartypants ? smartypants(cap[0]) : cap[0]);
      }
      return {
        type: 'text',
        raw: cap[0],
        text
      };
    }
  }
}