Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BREAKING CHANGE: Move all regexes to rules #3519

Merged
5 commits merged on Nov 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/Lexer.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { _Tokenizer } from './Tokenizer.ts';
import { _defaults } from './defaults.ts';
import { block, inline } from './rules.ts';
import { other, block, inline } from './rules.ts';
import type { Token, TokensList, Tokens } from './Tokens.ts';
import type { MarkedOptions, TokenizerExtension } from './MarkedOptions.ts';

Expand Down Expand Up @@ -36,6 +36,7 @@ export class _Lexer {
};

const rules = {
other,
block: block.normal,
inline: inline.normal,
};
Expand Down Expand Up @@ -85,7 +86,7 @@ export class _Lexer {
*/
lex(src: string) {
src = src
.replace(/\r\n|\r/g, '\n');
.replace(other.carriageReturn, '\n');

this.blockTokens(src, this.tokens);

Expand All @@ -105,7 +106,7 @@ export class _Lexer {
blockTokens(src: string, tokens?: TokensList, lastParagraphClipped?: boolean): TokensList;
blockTokens(src: string, tokens: Token[] = [], lastParagraphClipped = false) {
if (this.options.pedantic) {
src = src.replace(/\t/g, ' ').replace(/^ +$/gm, '');
src = src.replace(other.tabCharGlobal, ' ').replace(other.spaceLine, '');
}

let token: Tokens.Generic | undefined;
Expand Down
5 changes: 3 additions & 2 deletions src/Renderer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
cleanUrl,
escape,
} from './helpers.ts';
import { other } from './rules.ts';
import type { MarkedOptions } from './MarkedOptions.ts';
import type { Tokens } from './Tokens.ts';
import type { _Parser } from './Parser.ts';
Expand All @@ -22,9 +23,9 @@ export class _Renderer {
}

code({ text, lang, escaped }: Tokens.Code): string {
const langString = (lang || '').match(/^\S*/)?.[0];
const langString = (lang || '').match(other.notSpaceStart)?.[0];

const code = text.replace(/\n$/, '') + '\n';
const code = text.replace(other.endingNewline, '') + '\n';

if (!langString) {
return '<pre><code>'
Expand Down
106 changes: 53 additions & 53 deletions src/Tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ import type { _Lexer } from './Lexer.ts';
import type { Links, Tokens, Token } from './Tokens.ts';
import type { MarkedOptions } from './MarkedOptions.ts';

function outputLink(cap: string[], link: Pick<Tokens.Link, 'href' | 'title'>, raw: string, lexer: _Lexer): Tokens.Link | Tokens.Image {
function outputLink(cap: string[], link: Pick<Tokens.Link, 'href' | 'title'>, raw: string, lexer: _Lexer, rules: Rules): Tokens.Link | Tokens.Image {
const href = link.href;
const title = link.title || null;
const text = cap[1].replace(/\\([\[\]])/g, '$1');
const text = cap[1].replace(rules.other.outputLinkReplace, '$1');

if (cap[0].charAt(0) !== '!') {
lexer.state.inLink = true;
Expand All @@ -36,8 +36,8 @@ function outputLink(cap: string[], link: Pick<Tokens.Link, 'href' | 'title'>, ra
};
}

function indentCodeCompensation(raw: string, text: string) {
const matchIndentToCode = raw.match(/^(\s+)(?:```)/);
function indentCodeCompensation(raw: string, text: string, rules: Rules) {
const matchIndentToCode = raw.match(rules.other.indentCodeCompensation);

if (matchIndentToCode === null) {
return text;
Expand All @@ -48,7 +48,7 @@ function indentCodeCompensation(raw: string, text: string) {
return text
.split('\n')
.map(node => {
const matchIndentInNode = node.match(/^\s+/);
const matchIndentInNode = node.match(rules.other.beginningSpace);
if (matchIndentInNode === null) {
return node;
}
Expand Down Expand Up @@ -89,7 +89,7 @@ export class _Tokenizer {
code(src: string): Tokens.Code | undefined {
const cap = this.rules.block.code.exec(src);
if (cap) {
const text = cap[0].replace(/^(?: {1,4}| {0,3}\t)/gm, '');
const text = cap[0].replace(this.rules.other.codeRemoveIndent, '');
return {
type: 'code',
raw: cap[0],
Expand All @@ -105,7 +105,7 @@ export class _Tokenizer {
const cap = this.rules.block.fences.exec(src);
if (cap) {
const raw = cap[0];
const text = indentCodeCompensation(raw, cap[3] || '');
const text = indentCodeCompensation(raw, cap[3] || '', this.rules);

return {
type: 'code',
Expand All @@ -122,11 +122,11 @@ export class _Tokenizer {
let text = cap[2].trim();

// remove trailing #s
if (/#$/.test(text)) {
if (this.rules.other.endingHash.test(text)) {
const trimmed = rtrim(text, '#');
if (this.options.pedantic) {
text = trimmed.trim();
} else if (!trimmed || / $/.test(trimmed)) {
} else if (!trimmed || this.rules.other.endingSpaceChar.test(trimmed)) {
// CommonMark requires space before trailing #s
text = trimmed.trim();
}
Expand Down Expand Up @@ -167,7 +167,7 @@ export class _Tokenizer {
let i;
for (i = 0; i < lines.length; i++) {
// get lines up to a continuation
if (/^ {0,3}>/.test(lines[i])) {
if (this.rules.other.blockquoteStart.test(lines[i])) {
currentLines.push(lines[i]);
inBlockquote = true;
} else if (!inBlockquote) {
Expand All @@ -181,8 +181,8 @@ export class _Tokenizer {
const currentRaw = currentLines.join('\n');
const currentText = currentRaw
// precede setext continuation with 4 spaces so it isn't a setext
.replace(/\n {0,3}((?:=+|-+) *)(?=\n|$)/g, '\n $1')
.replace(/^ {0,3}>[ \t]?/gm, '');
.replace(this.rules.other.blockquoteSetextReplace, '\n $1')
.replace(this.rules.other.blockquoteSetextReplace2, '');
raw = raw ? `${raw}\n${currentRaw}` : currentRaw;
text = text ? `${text}\n${currentText}` : currentText;

Expand Down Expand Up @@ -258,7 +258,7 @@ export class _Tokenizer {
}

// Get next list item
const itemRegex = new RegExp(`^( {0,3}${bull})((?:[\t ][^\\n]*)?(?:\\n|$))`);
const itemRegex = this.rules.other.listItemRegex(bull);
let endsWithBlankLine = false;
// Check if current bullet point can start a new List Item
while (src) {
Expand All @@ -276,7 +276,7 @@ export class _Tokenizer {
raw = cap[0];
src = src.substring(raw.length);

let line = cap[2].split('\n', 1)[0].replace(/^\t+/, (t: string) => ' '.repeat(3 * t.length));
let line = cap[2].split('\n', 1)[0].replace(this.rules.other.listReplaceTabs, (t: string) => ' '.repeat(3 * t.length));
let nextLine = src.split('\n', 1)[0];
let blankLine = !line.trim();

Expand All @@ -287,24 +287,24 @@ export class _Tokenizer {
} else if (blankLine) {
indent = cap[1].length + 1;
} else {
indent = cap[2].search(/[^ ]/); // Find first non-space char
indent = cap[2].search(this.rules.other.nonSpaceChar); // Find first non-space char
indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent
itemContents = line.slice(indent);
indent += cap[1].length;
}

if (blankLine && /^[ \t]*$/.test(nextLine)) { // Items begin with at most one blank line
if (blankLine && this.rules.other.blankLine.test(nextLine)) { // Items begin with at most one blank line
raw += nextLine + '\n';
src = src.substring(nextLine.length + 1);
endEarly = true;
}

if (!endEarly) {
const nextBulletRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])((?:[ \t][^\\n]*)?(?:\\n|$))`);
const hrRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}((?:- *){3,}|(?:_ *){3,}|(?:\\* *){3,})(?:\\n+|$)`);
const fencesBeginRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:\`\`\`|~~~)`);
const headingBeginRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}#`);
const htmlBeginRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}<(?:[a-z].*>|!--)`, 'i');
const nextBulletRegex = this.rules.other.nextBulletRegex(indent);
const hrRegex = this.rules.other.hrRegex(indent);
const fencesBeginRegex = this.rules.other.fencesBeginRegex(indent);
const headingBeginRegex = this.rules.other.headingBeginRegex(indent);
const htmlBeginRegex = this.rules.other.htmlBeginRegex(indent);

// Check if following lines should be included in List Item
while (src) {
Expand All @@ -314,10 +314,10 @@ export class _Tokenizer {

// Re-align to follow commonmark nesting rules
if (this.options.pedantic) {
nextLine = nextLine.replace(/^ {1,4}(?=( {4})*[^ ])/g, ' ');
nextLine = nextLine.replace(this.rules.other.listReplaceNesting, ' ');
nextLineWithoutTabs = nextLine;
} else {
nextLineWithoutTabs = nextLine.replace(/\t/g, ' ');
nextLineWithoutTabs = nextLine.replace(this.rules.other.tabCharGlobal, ' ');
}

// End list item if found code fences
Expand Down Expand Up @@ -345,7 +345,7 @@ export class _Tokenizer {
break;
}

if (nextLineWithoutTabs.search(/[^ ]/) >= indent || !nextLine.trim()) { // Dedent if possible
if (nextLineWithoutTabs.search(this.rules.other.nonSpaceChar) >= indent || !nextLine.trim()) { // Dedent if possible
itemContents += '\n' + nextLineWithoutTabs.slice(indent);
} else {
// not enough indentation
Expand All @@ -354,7 +354,7 @@ export class _Tokenizer {
}

// paragraph continuation unless last line was a different block level element
if (line.replace(/\t/g, ' ').search(/[^ ]/) >= 4) { // indented code block
if (line.replace(this.rules.other.tabCharGlobal, ' ').search(this.rules.other.nonSpaceChar) >= 4) { // indented code block
break;
}
if (fencesBeginRegex.test(line)) {
Expand Down Expand Up @@ -384,7 +384,7 @@ export class _Tokenizer {
// If the previous item ended with a blank line, the list is loose
if (endsWithBlankLine) {
list.loose = true;
} else if (/\n[ \t]*\n[ \t]*$/.test(raw)) {
} else if (this.rules.other.doubleBlankLine.test(raw)) {
endsWithBlankLine = true;
}
}
Expand All @@ -393,10 +393,10 @@ export class _Tokenizer {
let ischecked: boolean | undefined;
// Check for task list items
if (this.options.gfm) {
istask = /^\[[ xX]\] /.exec(itemContents);
istask = this.rules.other.listIsTask.exec(itemContents);
if (istask) {
ischecked = istask[0] !== '[ ] ';
itemContents = itemContents.replace(/^\[[ xX]\] +/, '');
itemContents = itemContents.replace(this.rules.other.listReplaceTask, '');
}
}

Expand Down Expand Up @@ -426,7 +426,7 @@ export class _Tokenizer {
if (!list.loose) {
// Check if list should be loose
const spacers = list.items[i].tokens.filter(t => t.type === 'space');
const hasMultipleLineBreaks = spacers.length > 0 && spacers.some(t => /\n.*\n/.test(t.raw));
const hasMultipleLineBreaks = spacers.length > 0 && spacers.some(t => this.rules.other.anyLine.test(t.raw));

list.loose = hasMultipleLineBreaks;
}
Expand Down Expand Up @@ -460,8 +460,8 @@ export class _Tokenizer {
def(src: string): Tokens.Def | undefined {
const cap = this.rules.block.def.exec(src);
if (cap) {
const tag = cap[1].toLowerCase().replace(/\s+/g, ' ');
const href = cap[2] ? cap[2].replace(/^<(.*)>$/, '$1').replace(this.rules.inline.anyPunctuation, '$1') : '';
const tag = cap[1].toLowerCase().replace(this.rules.other.multipleSpaceGlobal, ' ');
const href = cap[2] ? cap[2].replace(this.rules.other.hrefBrackets, '$1').replace(this.rules.inline.anyPunctuation, '$1') : '';
const title = cap[3] ? cap[3].substring(1, cap[3].length - 1).replace(this.rules.inline.anyPunctuation, '$1') : cap[3];
return {
type: 'def',
Expand All @@ -479,14 +479,14 @@ export class _Tokenizer {
return;
}

if (!/[:|]/.test(cap[2])) {
if (!this.rules.other.tableDelimiter.test(cap[2])) {
// delimiter row must have a pipe (|) or colon (:) otherwise it is a setext heading
return;
}

const headers = splitCells(cap[1]);
const aligns = cap[2].replace(/^\||\| *$/g, '').split('|');
const rows = cap[3] && cap[3].trim() ? cap[3].replace(/\n[ \t]*$/, '').split('\n') : [];
const aligns = cap[2].replace(this.rules.other.tableAlignChars, '').split('|');
const rows = cap[3] && cap[3].trim() ? cap[3].replace(this.rules.other.tableRowBlankLine, '').split('\n') : [];

const item: Tokens.Table = {
type: 'table',
Expand All @@ -502,11 +502,11 @@ export class _Tokenizer {
}

for (const align of aligns) {
if (/^ *-+: *$/.test(align)) {
if (this.rules.other.tableAlignRight.test(align)) {
item.align.push('right');
} else if (/^ *:-+: *$/.test(align)) {
} else if (this.rules.other.tableAlignCenter.test(align)) {
item.align.push('center');
} else if (/^ *:-+ *$/.test(align)) {
} else if (this.rules.other.tableAlignLeft.test(align)) {
item.align.push('left');
} else {
item.align.push(null);
Expand Down Expand Up @@ -590,14 +590,14 @@ export class _Tokenizer {
tag(src: string): Tokens.Tag | undefined {
const cap = this.rules.inline.tag.exec(src);
if (cap) {
if (!this.lexer.state.inLink && /^<a /i.test(cap[0])) {
if (!this.lexer.state.inLink && this.rules.other.startATag.test(cap[0])) {
this.lexer.state.inLink = true;
} else if (this.lexer.state.inLink && /^<\/a>/i.test(cap[0])) {
} else if (this.lexer.state.inLink && this.rules.other.endATag.test(cap[0])) {
this.lexer.state.inLink = false;
}
if (!this.lexer.state.inRawBlock && /^<(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
if (!this.lexer.state.inRawBlock && this.rules.other.startPreScriptTag.test(cap[0])) {
this.lexer.state.inRawBlock = true;
} else if (this.lexer.state.inRawBlock && /^<\/(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
} else if (this.lexer.state.inRawBlock && this.rules.other.endPreScriptTag.test(cap[0])) {
this.lexer.state.inRawBlock = false;
}

Expand All @@ -616,9 +616,9 @@ export class _Tokenizer {
const cap = this.rules.inline.link.exec(src);
if (cap) {
const trimmedUrl = cap[2].trim();
if (!this.options.pedantic && /^</.test(trimmedUrl)) {
if (!this.options.pedantic && this.rules.other.startAngleBracket.test(trimmedUrl)) {
// commonmark requires matching angle brackets
if (!(/>$/.test(trimmedUrl))) {
if (!(this.rules.other.endAngleBracket.test(trimmedUrl))) {
return;
}

Expand All @@ -642,7 +642,7 @@ export class _Tokenizer {
let title = '';
if (this.options.pedantic) {
// split pedantic href and title
const link = /^([^'"]*[^\s])\s+(['"])(.*)\2/.exec(href);
const link = this.rules.other.pedanticHrefTitle.exec(href);

if (link) {
href = link[1];
Expand All @@ -653,8 +653,8 @@ export class _Tokenizer {
}

href = href.trim();
if (/^</.test(href)) {
if (this.options.pedantic && !(/>$/.test(trimmedUrl))) {
if (this.rules.other.startAngleBracket.test(href)) {
if (this.options.pedantic && !(this.rules.other.endAngleBracket.test(trimmedUrl))) {
// pedantic allows starting angle bracket without ending angle bracket
href = href.slice(1);
} else {
Expand All @@ -664,15 +664,15 @@ export class _Tokenizer {
return outputLink(cap, {
href: href ? href.replace(this.rules.inline.anyPunctuation, '$1') : href,
title: title ? title.replace(this.rules.inline.anyPunctuation, '$1') : title,
}, cap[0], this.lexer);
}, cap[0], this.lexer, this.rules);
}
}

reflink(src: string, links: Links): Tokens.Link | Tokens.Image | Tokens.Text | undefined {
let cap;
if ((cap = this.rules.inline.reflink.exec(src))
|| (cap = this.rules.inline.nolink.exec(src))) {
const linkString = (cap[2] || cap[1]).replace(/\s+/g, ' ');
const linkString = (cap[2] || cap[1]).replace(this.rules.other.multipleSpaceGlobal, ' ');
const link = links[linkString.toLowerCase()];
if (!link) {
const text = cap[0].charAt(0);
Expand All @@ -682,7 +682,7 @@ export class _Tokenizer {
text,
};
}
return outputLink(cap, link, cap[0], this.lexer);
return outputLink(cap, link, cap[0], this.lexer, this.rules);
}
}

Expand All @@ -691,7 +691,7 @@ export class _Tokenizer {
if (!match) return;

// _ can't be between two alphanumerics. \p{L}\p{N} includes non-english alphabet/numbers as well
if (match[3] && prevChar.match(/[\p{L}\p{N}]/u)) return;
if (match[3] && prevChar.match(this.rules.other.unicodeAlphaNumeric)) return;

const nextChar = match[1] || match[2] || '';

Expand Down Expand Up @@ -759,9 +759,9 @@ export class _Tokenizer {
codespan(src: string): Tokens.Codespan | undefined {
const cap = this.rules.inline.code.exec(src);
if (cap) {
let text = cap[2].replace(/\n/g, ' ');
const hasNonSpaceChars = /[^ ]/.test(text);
const hasSpaceCharsOnBothEnds = /^ /.test(text) && / $/.test(text);
let text = cap[2].replace(this.rules.other.newLineCharGlobal, ' ');
const hasNonSpaceChars = this.rules.other.nonSpaceChar.test(text);
const hasSpaceCharsOnBothEnds = this.rules.other.startingSpaceChar.test(text) && this.rules.other.endingSpaceChar.test(text);
if (hasNonSpaceChars && hasSpaceCharsOnBothEnds) {
text = text.substring(1, text.length - 1);
}
Expand Down
Loading