UNPKG

20.2 kBJavaScriptView Raw
1const { defaults } = require('./defaults.js');
2const {
3 rtrim,
4 splitCells,
5 escape,
6 findClosingBracket
7} = require('./helpers.js');
8
/**
 * Build a `link` or `image` token from a link-like regex capture.
 *
 * @param {Array<string>} cap - Capture array; `cap[0]` is the full match
 *   (a leading `!` marks an image) and `cap[1]` is the bracketed label.
 * @param {{href: string, title: ?string}} link - Resolved destination and title.
 * @param {string} raw - Source text consumed by the token.
 * @param {object} lexer - Lexer exposing `state.inLink` and `inlineTokens(text, tokens)`.
 * @returns {object} A `link` token (with nested `tokens`) or an `image` token.
 */
function outputLink(cap, link, raw, lexer) {
  const href = link.href;
  const title = link.title ? escape(link.title) : null;
  // Unescape `\[` and `\]` inside the label.
  const text = cap[1].replace(/\\([\[\]])/g, '$1');

  if (cap[0].charAt(0) === '!') {
    return {
      type: 'image',
      raw,
      href,
      title,
      text: escape(text)
    };
  }

  // Flag the lexer as "inside a link" only while the label is tokenized,
  // then restore the previous value so the flag does not leak into the
  // text that follows the link.
  const wasInLink = lexer.state.inLink;
  lexer.state.inLink = true;
  let tokens;
  try {
    tokens = lexer.inlineTokens(text, []);
  } finally {
    lexer.state.inLink = wasInLink;
  }

  return {
    type: 'link',
    raw,
    href,
    title,
    text,
    tokens
  };
}
34
/**
 * Remove from each line of a fenced code body the indentation that
 * precedes the opening fence.
 *
 * @param {string} raw - Raw fenced block, including the opening fence.
 * @param {string} text - Code body found between the fences.
 * @returns {string} `text` with up to the fence's indent width sliced off
 *   every line that is indented at least that much; other lines are kept.
 */
function indentCodeCompensation(raw, text) {
  const fenceIndent = raw.match(/^(\s+)(?:```)/);
  if (fenceIndent === null) {
    // The fence is not indented: nothing to compensate.
    return text;
  }

  const width = fenceIndent[1].length;
  const dedented = [];

  for (const row of text.split('\n')) {
    const leading = row.match(/^\s+/);
    const canDedent = leading !== null && leading[0].length >= width;
    dedented.push(canDedent ? row.slice(width) : row);
  }

  return dedented.join('\n');
}
62
63/**
64 * Tokenizer
65 */
66module.exports = class Tokenizer {
67 constructor(options) {
68 this.options = options || defaults;
69 }
70
71 space(src) {
72 const cap = this.rules.block.newline.exec(src);
73 if (cap) {
74 if (cap[0].length > 1) {
75 return {
76 type: 'space',
77 raw: cap[0]
78 };
79 }
80 return { raw: '\n' };
81 }
82 }
83
84 code(src) {
85 const cap = this.rules.block.code.exec(src);
86 if (cap) {
87 const text = cap[0].replace(/^ {1,4}/gm, '');
88 return {
89 type: 'code',
90 raw: cap[0],
91 codeBlockStyle: 'indented',
92 text: !this.options.pedantic
93 ? rtrim(text, '\n')
94 : text
95 };
96 }
97 }
98
99 fences(src) {
100 const cap = this.rules.block.fences.exec(src);
101 if (cap) {
102 const raw = cap[0];
103 const text = indentCodeCompensation(raw, cap[3] || '');
104
105 return {
106 type: 'code',
107 raw,
108 lang: cap[2] ? cap[2].trim() : cap[2],
109 text
110 };
111 }
112 }
113
114 heading(src) {
115 const cap = this.rules.block.heading.exec(src);
116 if (cap) {
117 let text = cap[2].trim();
118
119 // remove trailing #s
120 if (/#$/.test(text)) {
121 const trimmed = rtrim(text, '#');
122 if (this.options.pedantic) {
123 text = trimmed.trim();
124 } else if (!trimmed || / $/.test(trimmed)) {
125 // CommonMark requires space before trailing #s
126 text = trimmed.trim();
127 }
128 }
129
130 const token = {
131 type: 'heading',
132 raw: cap[0],
133 depth: cap[1].length,
134 text: text,
135 tokens: []
136 };
137 this.lexer.inline(token.text, token.tokens);
138 return token;
139 }
140 }
141
142 hr(src) {
143 const cap = this.rules.block.hr.exec(src);
144 if (cap) {
145 return {
146 type: 'hr',
147 raw: cap[0]
148 };
149 }
150 }
151
152 blockquote(src) {
153 const cap = this.rules.block.blockquote.exec(src);
154 if (cap) {
155 const text = cap[0].replace(/^ *> ?/gm, '');
156
157 return {
158 type: 'blockquote',
159 raw: cap[0],
160 tokens: this.lexer.blockTokens(text, []),
161 text
162 };
163 }
164 }
165
166 list(src) {
167 let cap = this.rules.block.list.exec(src);
168 if (cap) {
169 let raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine,
170 line, lines, itemContents;
171
172 let bull = cap[1].trim();
173 const isordered = bull.length > 1;
174
175 const list = {
176 type: 'list',
177 raw: '',
178 ordered: isordered,
179 start: isordered ? +bull.slice(0, -1) : '',
180 loose: false,
181 items: []
182 };
183
184 bull = isordered ? `\\d{1,9}\\${bull.slice(-1)}` : `\\${bull}`;
185
186 if (this.options.pedantic) {
187 bull = isordered ? bull : '[*+-]';
188 }
189
190 // Get next list item
191 const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))`);
192
193 // Get each top-level item
194 while (src) {
195 if (this.rules.block.hr.test(src)) { // End list if we encounter an HR (possibly move into itemRegex?)
196 break;
197 }
198
199 if (!(cap = itemRegex.exec(src))) {
200 break;
201 }
202
203 lines = cap[2].split('\n');
204
205 if (this.options.pedantic) {
206 indent = 2;
207 itemContents = lines[0].trimLeft();
208 } else {
209 indent = cap[2].search(/[^ ]/); // Find first non-space char
210 indent = cap[1].length + (indent > 4 ? 1 : indent); // intented code blocks after 4 spaces; indent is always 1
211 itemContents = lines[0].slice(indent - cap[1].length);
212 }
213
214 blankLine = false;
215 raw = cap[0];
216
217 if (!lines[0] && /^ *$/.test(lines[1])) { // items begin with at most one blank line
218 raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';
219 list.loose = true;
220 lines = [];
221 }
222
223 const nextBulletRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])`);
224
225 for (i = 1; i < lines.length; i++) {
226 line = lines[i];
227
228 if (this.options.pedantic) { // Re-align to follow commonmark nesting rules
229 line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, ' ');
230 }
231
232 // End list item if found start of new bullet
233 if (nextBulletRegex.test(line)) {
234 raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
235 break;
236 }
237
238 // Until we encounter a blank line, item contents do not need indentation
239 if (!blankLine) {
240 if (!line.trim()) { // Check if current line is empty
241 blankLine = true;
242 }
243
244 // Dedent if possible
245 if (line.search(/[^ ]/) >= indent) {
246 itemContents += '\n' + line.slice(indent);
247 } else {
248 itemContents += '\n' + line;
249 }
250 continue;
251 }
252
253 // Dedent this line
254 if (line.search(/[^ ]/) >= indent || !line.trim()) {
255 itemContents += '\n' + line.slice(indent);
256 continue;
257 } else { // Line was not properly indented; end of this item
258 raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
259 break;
260 }
261 }
262
263 if (!list.loose) {
264 // If the previous item ended with a blank line, the list is loose
265 if (endsWithBlankLine) {
266 list.loose = true;
267 } else if (/\n *\n *$/.test(raw)) {
268 endsWithBlankLine = true;
269 }
270 }
271
272 // Check for task list items
273 if (this.options.gfm) {
274 istask = /^\[[ xX]\] /.exec(itemContents);
275 if (istask) {
276 ischecked = istask[0] !== '[ ] ';
277 itemContents = itemContents.replace(/^\[[ xX]\] +/, '');
278 }
279 }
280
281 list.items.push({
282 type: 'list_item',
283 raw: raw,
284 task: !!istask,
285 checked: ischecked,
286 loose: false,
287 text: itemContents
288 });
289
290 list.raw += raw;
291 src = src.slice(raw.length);
292 }
293
294 // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic
295 list.items[list.items.length - 1].raw = raw.trimRight();
296 list.items[list.items.length - 1].text = itemContents.trimRight();
297 list.raw = list.raw.trimRight();
298
299 const l = list.items.length;
300
301 // Item child tokens handled here at end because we needed to have the final item to trim it first
302 for (i = 0; i < l; i++) {
303 this.lexer.state.top = false;
304 list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);
305 if (list.items[i].tokens.some(t => t.type === 'space')) {
306 list.loose = true;
307 list.items[i].loose = true;
308 }
309 }
310
311 return list;
312 }
313 }
314
315 html(src) {
316 const cap = this.rules.block.html.exec(src);
317 if (cap) {
318 const token = {
319 type: 'html',
320 raw: cap[0],
321 pre: !this.options.sanitizer
322 && (cap[1] === 'pre' || cap[1] === 'script' || cap[1] === 'style'),
323 text: cap[0]
324 };
325 if (this.options.sanitize) {
326 token.type = 'paragraph';
327 token.text = this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0]);
328 token.tokens = [];
329 this.lexer.inline(token.text, token.tokens);
330 }
331 return token;
332 }
333 }
334
335 def(src) {
336 const cap = this.rules.block.def.exec(src);
337 if (cap) {
338 if (cap[3]) cap[3] = cap[3].substring(1, cap[3].length - 1);
339 const tag = cap[1].toLowerCase().replace(/\s+/g, ' ');
340 return {
341 type: 'def',
342 tag,
343 raw: cap[0],
344 href: cap[2],
345 title: cap[3]
346 };
347 }
348 }
349
350 table(src) {
351 const cap = this.rules.block.table.exec(src);
352 if (cap) {
353 const item = {
354 type: 'table',
355 header: splitCells(cap[1]).map(c => { return { text: c }; }),
356 align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */),
357 rows: cap[3] ? cap[3].replace(/\n$/, '').split('\n') : []
358 };
359
360 if (item.header.length === item.align.length) {
361 item.raw = cap[0];
362
363 let l = item.align.length;
364 let i, j, k, row;
365 for (i = 0; i < l; i++) {
366 if (/^ *-+: *$/.test(item.align[i])) {
367 item.align[i] = 'right';
368 } else if (/^ *:-+: *$/.test(item.align[i])) {
369 item.align[i] = 'center';
370 } else if (/^ *:-+ *$/.test(item.align[i])) {
371 item.align[i] = 'left';
372 } else {
373 item.align[i] = null;
374 }
375 }
376
377 l = item.rows.length;
378 for (i = 0; i < l; i++) {
379 item.rows[i] = splitCells(item.rows[i], item.header.length).map(c => { return { text: c }; });
380 }
381
382 // parse child tokens inside headers and cells
383
384 // header child tokens
385 l = item.header.length;
386 for (j = 0; j < l; j++) {
387 item.header[j].tokens = [];
388 this.lexer.inlineTokens(item.header[j].text, item.header[j].tokens);
389 }
390
391 // cell child tokens
392 l = item.rows.length;
393 for (j = 0; j < l; j++) {
394 row = item.rows[j];
395 for (k = 0; k < row.length; k++) {
396 row[k].tokens = [];
397 this.lexer.inlineTokens(row[k].text, row[k].tokens);
398 }
399 }
400
401 return item;
402 }
403 }
404 }
405
406 lheading(src) {
407 const cap = this.rules.block.lheading.exec(src);
408 if (cap) {
409 const token = {
410 type: 'heading',
411 raw: cap[0],
412 depth: cap[2].charAt(0) === '=' ? 1 : 2,
413 text: cap[1],
414 tokens: []
415 };
416 this.lexer.inline(token.text, token.tokens);
417 return token;
418 }
419 }
420
421 paragraph(src) {
422 const cap = this.rules.block.paragraph.exec(src);
423 if (cap) {
424 const token = {
425 type: 'paragraph',
426 raw: cap[0],
427 text: cap[1].charAt(cap[1].length - 1) === '\n'
428 ? cap[1].slice(0, -1)
429 : cap[1],
430 tokens: []
431 };
432 this.lexer.inline(token.text, token.tokens);
433 return token;
434 }
435 }
436
437 text(src) {
438 const cap = this.rules.block.text.exec(src);
439 if (cap) {
440 const token = {
441 type: 'text',
442 raw: cap[0],
443 text: cap[0],
444 tokens: []
445 };
446 this.lexer.inline(token.text, token.tokens);
447 return token;
448 }
449 }
450
451 escape(src) {
452 const cap = this.rules.inline.escape.exec(src);
453 if (cap) {
454 return {
455 type: 'escape',
456 raw: cap[0],
457 text: escape(cap[1])
458 };
459 }
460 }
461
462 tag(src) {
463 const cap = this.rules.inline.tag.exec(src);
464 if (cap) {
465 if (!this.lexer.state.inLink && /^<a /i.test(cap[0])) {
466 this.lexer.state.inLink = true;
467 } else if (this.lexer.state.inLink && /^<\/a>/i.test(cap[0])) {
468 this.lexer.state.inLink = false;
469 }
470 if (!this.lexer.state.inRawBlock && /^<(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
471 this.lexer.state.inRawBlock = true;
472 } else if (this.lexer.state.inRawBlock && /^<\/(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
473 this.lexer.state.inRawBlock = false;
474 }
475
476 return {
477 type: this.options.sanitize
478 ? 'text'
479 : 'html',
480 raw: cap[0],
481 inLink: this.lexer.state.inLink,
482 inRawBlock: this.lexer.state.inRawBlock,
483 text: this.options.sanitize
484 ? (this.options.sanitizer
485 ? this.options.sanitizer(cap[0])
486 : escape(cap[0]))
487 : cap[0]
488 };
489 }
490 }
491
492 link(src) {
493 const cap = this.rules.inline.link.exec(src);
494 if (cap) {
495 const trimmedUrl = cap[2].trim();
496 if (!this.options.pedantic && /^</.test(trimmedUrl)) {
497 // commonmark requires matching angle brackets
498 if (!(/>$/.test(trimmedUrl))) {
499 return;
500 }
501
502 // ending angle bracket cannot be escaped
503 const rtrimSlash = rtrim(trimmedUrl.slice(0, -1), '\\');
504 if ((trimmedUrl.length - rtrimSlash.length) % 2 === 0) {
505 return;
506 }
507 } else {
508 // find closing parenthesis
509 const lastParenIndex = findClosingBracket(cap[2], '()');
510 if (lastParenIndex > -1) {
511 const start = cap[0].indexOf('!') === 0 ? 5 : 4;
512 const linkLen = start + cap[1].length + lastParenIndex;
513 cap[2] = cap[2].substring(0, lastParenIndex);
514 cap[0] = cap[0].substring(0, linkLen).trim();
515 cap[3] = '';
516 }
517 }
518 let href = cap[2];
519 let title = '';
520 if (this.options.pedantic) {
521 // split pedantic href and title
522 const link = /^([^'"]*[^\s])\s+(['"])(.*)\2/.exec(href);
523
524 if (link) {
525 href = link[1];
526 title = link[3];
527 }
528 } else {
529 title = cap[3] ? cap[3].slice(1, -1) : '';
530 }
531
532 href = href.trim();
533 if (/^</.test(href)) {
534 if (this.options.pedantic && !(/>$/.test(trimmedUrl))) {
535 // pedantic allows starting angle bracket without ending angle bracket
536 href = href.slice(1);
537 } else {
538 href = href.slice(1, -1);
539 }
540 }
541 return outputLink(cap, {
542 href: href ? href.replace(this.rules.inline._escapes, '$1') : href,
543 title: title ? title.replace(this.rules.inline._escapes, '$1') : title
544 }, cap[0], this.lexer);
545 }
546 }
547
548 reflink(src, links) {
549 let cap;
550 if ((cap = this.rules.inline.reflink.exec(src))
551 || (cap = this.rules.inline.nolink.exec(src))) {
552 let link = (cap[2] || cap[1]).replace(/\s+/g, ' ');
553 link = links[link.toLowerCase()];
554 if (!link || !link.href) {
555 const text = cap[0].charAt(0);
556 return {
557 type: 'text',
558 raw: text,
559 text
560 };
561 }
562 return outputLink(cap, link, cap[0], this.lexer);
563 }
564 }
565
566 emStrong(src, maskedSrc, prevChar = '') {
567 let match = this.rules.inline.emStrong.lDelim.exec(src);
568 if (!match) return;
569
570 // _ can't be between two alphanumerics. \p{L}\p{N} includes non-english alphabet/numbers as well
571 if (match[3] && prevChar.match(/[\p{L}\p{N}]/u)) return;
572
573 const nextChar = match[1] || match[2] || '';
574
575 if (!nextChar || (nextChar && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) {
576 const lLength = match[0].length - 1;
577 let rDelim, rLength, delimTotal = lLength, midDelimTotal = 0;
578
579 const endReg = match[0][0] === '*' ? this.rules.inline.emStrong.rDelimAst : this.rules.inline.emStrong.rDelimUnd;
580 endReg.lastIndex = 0;
581
582 // Clip maskedSrc to same section of string as src (move to lexer?)
583 maskedSrc = maskedSrc.slice(-1 * src.length + lLength);
584
585 while ((match = endReg.exec(maskedSrc)) != null) {
586 rDelim = match[1] || match[2] || match[3] || match[4] || match[5] || match[6];
587
588 if (!rDelim) continue; // skip single * in __abc*abc__
589
590 rLength = rDelim.length;
591
592 if (match[3] || match[4]) { // found another Left Delim
593 delimTotal += rLength;
594 continue;
595 } else if (match[5] || match[6]) { // either Left or Right Delim
596 if (lLength % 3 && !((lLength + rLength) % 3)) {
597 midDelimTotal += rLength;
598 continue; // CommonMark Emphasis Rules 9-10
599 }
600 }
601
602 delimTotal -= rLength;
603
604 if (delimTotal > 0) continue; // Haven't found enough closing delimiters
605
606 // Remove extra characters. *a*** -> *a*
607 rLength = Math.min(rLength, rLength + delimTotal + midDelimTotal);
608
609 // Create `em` if smallest delimiter has odd char count. *a***
610 if (Math.min(lLength, rLength) % 2) {
611 const text = src.slice(1, lLength + match.index + rLength);
612 return {
613 type: 'em',
614 raw: src.slice(0, lLength + match.index + rLength + 1),
615 text,
616 tokens: this.lexer.inlineTokens(text, [])
617 };
618 }
619
620 // Create 'strong' if smallest delimiter has even char count. **a***
621 const text = src.slice(2, lLength + match.index + rLength - 1);
622 return {
623 type: 'strong',
624 raw: src.slice(0, lLength + match.index + rLength + 1),
625 text,
626 tokens: this.lexer.inlineTokens(text, [])
627 };
628 }
629 }
630 }
631
632 codespan(src) {
633 const cap = this.rules.inline.code.exec(src);
634 if (cap) {
635 let text = cap[2].replace(/\n/g, ' ');
636 const hasNonSpaceChars = /[^ ]/.test(text);
637 const hasSpaceCharsOnBothEnds = /^ /.test(text) && / $/.test(text);
638 if (hasNonSpaceChars && hasSpaceCharsOnBothEnds) {
639 text = text.substring(1, text.length - 1);
640 }
641 text = escape(text, true);
642 return {
643 type: 'codespan',
644 raw: cap[0],
645 text
646 };
647 }
648 }
649
650 br(src) {
651 const cap = this.rules.inline.br.exec(src);
652 if (cap) {
653 return {
654 type: 'br',
655 raw: cap[0]
656 };
657 }
658 }
659
660 del(src) {
661 const cap = this.rules.inline.del.exec(src);
662 if (cap) {
663 return {
664 type: 'del',
665 raw: cap[0],
666 text: cap[2],
667 tokens: this.lexer.inlineTokens(cap[2], [])
668 };
669 }
670 }
671
672 autolink(src, mangle) {
673 const cap = this.rules.inline.autolink.exec(src);
674 if (cap) {
675 let text, href;
676 if (cap[2] === '@') {
677 text = escape(this.options.mangle ? mangle(cap[1]) : cap[1]);
678 href = 'mailto:' + text;
679 } else {
680 text = escape(cap[1]);
681 href = text;
682 }
683
684 return {
685 type: 'link',
686 raw: cap[0],
687 text,
688 href,
689 tokens: [
690 {
691 type: 'text',
692 raw: text,
693 text
694 }
695 ]
696 };
697 }
698 }
699
700 url(src, mangle) {
701 let cap;
702 if (cap = this.rules.inline.url.exec(src)) {
703 let text, href;
704 if (cap[2] === '@') {
705 text = escape(this.options.mangle ? mangle(cap[0]) : cap[0]);
706 href = 'mailto:' + text;
707 } else {
708 // do extended autolink path validation
709 let prevCapZero;
710 do {
711 prevCapZero = cap[0];
712 cap[0] = this.rules.inline._backpedal.exec(cap[0])[0];
713 } while (prevCapZero !== cap[0]);
714 text = escape(cap[0]);
715 if (cap[1] === 'www.') {
716 href = 'http://' + text;
717 } else {
718 href = text;
719 }
720 }
721 return {
722 type: 'link',
723 raw: cap[0],
724 text,
725 href,
726 tokens: [
727 {
728 type: 'text',
729 raw: text,
730 text
731 }
732 ]
733 };
734 }
735 }
736
737 inlineText(src, smartypants) {
738 const cap = this.rules.inline.text.exec(src);
739 if (cap) {
740 let text;
741 if (this.lexer.state.inRawBlock) {
742 text = this.options.sanitize ? (this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0])) : cap[0];
743 } else {
744 text = escape(this.options.smartypants ? smartypants(cap[0]) : cap[0]);
745 }
746 return {
747 type: 'text',
748 raw: cap[0],
749 text
750 };
751 }
752 }
753};