Skip to content

Commit

Permalink
more strict RST inline markup parsing (nim-lang#17827)
Browse files Browse the repository at this point in the history
* more strict RST inline markup parsing
* add test for unexpected bonus
* introduce `roPreferMarkdown`
a-mr authored Apr 29, 2021
1 parent e61381a commit 1640508
Showing 3 changed files with 207 additions and 13 deletions.
7 changes: 5 additions & 2 deletions compiler/docgen.nim
Original file line number Diff line number Diff line change
@@ -178,7 +178,8 @@ proc newDocumentor*(filename: AbsoluteFile; cache: IdentCache; conf: ConfigRef,
result.outDir = conf.outDir.string
initRstGenerator(result[], (if conf.cmd != cmdRst2tex: outHtml else: outLatex),
conf.configVars, filename.string,
{roSupportRawDirective, roSupportMarkdown, roNimFile},
{roSupportRawDirective, roSupportMarkdown,
roPreferMarkdown, roNimFile},
docgenFindFile, compilerMsgHandler)

if conf.configVars.hasKey("doc.googleAnalytics"):
@@ -1380,7 +1381,9 @@ proc commandRstAux(cache: IdentCache, conf: ConfigRef;
d.isPureRst = true
var rst = parseRst(readFile(filen.string), filen.string,
line=LineRstInit, column=ColRstInit,
d.hasToc, {roSupportRawDirective, roSupportMarkdown}, conf)
d.hasToc,
{roSupportRawDirective, roSupportMarkdown, roPreferMarkdown},
conf)
var modDesc = newStringOfCap(30_000)
renderRstToOut(d[], rst, modDesc)
d.modDesc = rope(modDesc)
52 changes: 41 additions & 11 deletions lib/packages/docutils/rst.nim
Original file line number Diff line number Diff line change
@@ -11,9 +11,9 @@
## rst
## ==================================
##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Nim-flavored reStructuredText
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Nim-flavored reStructuredText and Markdown
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##
## This module implements a `reStructuredText`:idx: (RST) parser.
## A large subset is implemented with some limitations_ and
@@ -177,6 +177,8 @@ type
roSupportRawDirective, ## support the ``raw`` directive (don't support
## it for sandboxing)
roSupportMarkdown, ## support additional features of Markdown
roPreferMarkdown, ## parse as Markdown (keeping RST as "extension"
## to Markdown) -- implies `roSupportMarkdown`
roNimFile ## set for Nim files where default interpreted
## text role should be :nim:

@@ -277,6 +279,7 @@ type
line*, col*, baseIndent*: int
skipPounds*: bool
adornmentLine*: bool
escapeNext*: bool

proc getThing(L: var Lexer, tok: var Token, s: set[char]) =
tok.kind = tkWord
@@ -314,10 +317,18 @@ proc getPunctAdornment(L: var Lexer, tok: var Token) =
tok.col = L.col
var pos = L.bufpos
let c = L.buf[pos]
while true:
if not L.escapeNext and (c != '\\' or L.adornmentLine):
while true:
tok.symbol.add(L.buf[pos])
inc pos
if L.buf[pos] != c: break
elif L.escapeNext:
tok.symbol.add(L.buf[pos])
inc pos
if L.buf[pos] != c: break
else: # not L.escapeNext and c == '\\' and not L.adornmentLine
tok.symbol.add '\\'
inc pos
L.escapeNext = true
inc L.col, pos - L.bufpos
L.bufpos = pos
if tok.symbol == "\\": tok.kind = tkPunct
@@ -429,7 +440,9 @@ proc getTokens(buffer: string, skipPounds: bool, tokens: var TokenSeq): int =
while true:
inc length
setLen(tokens, length)
let toEscape = L.escapeNext
rawGetTok(L, tokens[length - 1])
if toEscape: L.escapeNext = false
if tokens[length - 1].kind == tkEof: break
if tokens[0].kind == tkWhite:
# BUGFIX
@@ -981,16 +994,24 @@ proc expect(p: var RstParser, tok: string) =
if currentTok(p).symbol == tok: inc p.idx
else: rstMessage(p, meExpected, tok)

proc isInlineMarkupEnd(p: RstParser, markup: string): bool =
proc isInlineMarkupEnd(p: RstParser, markup: string, exact: bool): bool =
# rst rules: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
result = currentTok(p).symbol == markup
if exact:
result = currentTok(p).symbol == markup
else:
result = currentTok(p).symbol.endsWith markup
if (not result) and markup == "``":
# check whether escaping has split `` into 2 tokens: ` and `
result = currentTok(p).symbol == "`" and prevTok(p).symbol == "`"
if not result: return
# Rule 2:
result = prevTok(p).kind notin {tkIndent, tkWhite}
if not result: return
# Rule 7:
result = nextTok(p).kind in {tkIndent, tkWhite, tkEof} or
markup in ["``", "`"] and nextTok(p).kind in {tkIndent, tkWhite, tkWord, tkEof} or
(roPreferMarkdown in p.s.options and
markup in ["``", "`"] and
nextTok(p).kind in {tkIndent, tkWhite, tkWord, tkEof}) or
nextTok(p).symbol[0] in
{'\'', '\"', ')', ']', '}', '>', '-', '/', '\\', ':', '.', ',', ';', '!', '?', '_'}
if not result: return
@@ -1130,7 +1151,8 @@ proc toOtherRole(n: PRstNode, kind: RstNodeKind, roleName: string): PRstNode =
proc parsePostfix(p: var RstParser, n: PRstNode): PRstNode =
var newKind = n.kind
var newSons = n.sons
if isInlineMarkupEnd(p, "_") or isInlineMarkupEnd(p, "__"):
if isInlineMarkupEnd(p, "_", exact=true) or
isInlineMarkupEnd(p, "__", exact=true):
inc p.idx
if p.tok[p.idx-2].symbol == "`" and p.tok[p.idx-3].symbol == ">":
var a = newRstNode(rnInner)
@@ -1215,7 +1237,7 @@ proc parseWordOrRef(p: var RstParser, father: PRstNode) =
inc p.idx
while currentTok(p).kind in {tkWord, tkPunct}:
if currentTok(p).kind == tkPunct:
if isInlineMarkupEnd(p, "_"):
if isInlineMarkupEnd(p, "_", exact=true):
isRef = true
break
if not validRefnamePunct(currentTok(p).symbol):
@@ -1253,7 +1275,15 @@ proc parseUntil(p: var RstParser, father: PRstNode, postfix: string,
while true:
case currentTok(p).kind
of tkPunct:
if isInlineMarkupEnd(p, postfix):
if isInlineMarkupEnd(p, postfix, exact=false):
let l = currentTok(p).symbol.len
if l > postfix.len:
# handle cases like *emphasis with stars****. (It's valid RST!)
father.add newLeaf(currentTok(p).symbol[0 ..< l - postfix.len])
elif postfix == "``" and currentTok(p).symbol == "`" and
prevTok(p).symbol == "`":
# handle cases like ``literal\`` - delete ` already added after \
father.sons.setLen(father.sons.len - 1)
inc p.idx
break
else:
161 changes: 161 additions & 0 deletions tests/stdlib/trst.nim
Original file line number Diff line number Diff line change
@@ -4,6 +4,10 @@ discard """
[Suite] RST indentation
[Suite] RST include directive
[Suite] RST escaping
[Suite] RST inline markup
'''
"""

@@ -267,3 +271,160 @@ And this should **NOT** be visible in `docs.html`
"""
doAssert "<em>Visible</em>" == rstTohtml(input, {}, defaultConfig())
removeFile("other.rst")

# Checks which AST leaves are produced for one to four leading backslashes
# followed by a space and the word this.
# NOTE(review): toAst is not defined in the visible part of this file.
suite "RST escaping":
# NOTE(review): the test name reads backspaces, but every case below feeds
# backslashes -- looks like a typo; confirm before renaming.
test "backspaces":
# One backslash + space: the expected AST is the single leaf this.
check("""\ this""".toAst == dedent"""
rnLeaf 'this'
""")

# Two backslashes + space: one backslash leaf, then the space and the word.
check("""\\ this""".toAst == dedent"""
rnInner
rnLeaf '\'
rnLeaf ' '
rnLeaf 'this'
""")

# Three backslashes + space: one backslash leaf and the word, no space leaf.
check("""\\\ this""".toAst == dedent"""
rnInner
rnLeaf '\'
rnLeaf 'this'
""")

# Four backslashes + space: two backslash leaves, then the space and the word.
check("""\\\\ this""".toAst == dedent"""
rnInner
rnLeaf '\'
rnLeaf '\'
rnLeaf ' '
rnLeaf 'this'
""")

# Inline markup cases: emphasis, strong emphasis, inline literals,
# interpreted text and trailing-underscore references.
suite "RST inline markup":
# The closing markup may be preceded by more of the same character; the
# expected ASTs below keep those extra characters as leaves inside the node.
test "end-string has repeating symbols":
# Four trailing stars after single-star emphasis: three stay as a leaf.
check("*emphasis content****".toAst == dedent"""
rnEmphasis
rnLeaf 'emphasis'
rnLeaf ' '
rnLeaf 'content'
rnLeaf '***'
""")

# Same input with a backslash before the stars: still three stars of content.
check("""*emphasis content\****""".toAst == dedent"""
rnEmphasis
rnLeaf 'emphasis'
rnLeaf ' '
rnLeaf 'content'
rnLeaf '*'
rnLeaf '**'
""") # how the stars are split across leaves is not essential,
# only the total number of * characters matters

# Four trailing stars after strong emphasis: two stay as a leaf.
check("**strong content****".toAst == dedent"""
rnStrongEmphasis
rnLeaf 'strong'
rnLeaf ' '
rnLeaf 'content'
rnLeaf '**'
""")

# A backslash in the middle of the trailing stars: three single-star leaves.
check("""**strong content*\****""".toAst == dedent"""
rnStrongEmphasis
rnLeaf 'strong'
rnLeaf ' '
rnLeaf 'content'
rnLeaf '*'
rnLeaf '*'
rnLeaf '*'
""")

# Five trailing backquotes after an inline literal: three stay as a leaf.
check("``lit content`````".toAst == dedent"""
rnInlineLiteral
rnLeaf 'lit'
rnLeaf ' '
rnLeaf 'content'
rnLeaf '```'
""")


# An escaped backquote inside single-backquote interpreted text is kept as
# content instead of ending the markup.
test """interpreted text can be ended with \` """:
# With the default-role directive set to literal, the node is rnInlineLiteral;
# only the tail of the rendered AST is compared.
let output = (".. default-role:: literal\n" & """`\``""").toAst
check(output.endsWith """
rnParagraph
rnInlineLiteral
rnLeaf '`'""" & "\n")

# Without the directive the same input is expected to give rnInlineCode
# with a nim argument and a literal block holding the backquote.
let output2 = """`\``""".toAst
check(output2 == dedent"""
rnInlineCode
rnDirArg
rnLeaf 'nim'
[nil]
rnLiteralBlock
rnLeaf '`'
""")

# Two escaped backquotes around + end up unescaped in a single leaf.
let output3 = """`proc \`+\``""".toAst
check(output3 == dedent"""
rnInlineCode
rnDirArg
rnLeaf 'nim'
[nil]
rnLiteralBlock
rnLeaf 'proc `+`'
""")

# Inside double-backquote literals every backslash is expected to appear as
# its own leaf, and backquotes beyond the closing pair are kept as leaves.
test """inline literals can contain \ anywhere""":
# A single backslash as the whole literal content.
check("""``\``""".toAst == dedent"""
rnInlineLiteral
rnLeaf '\'
""")

# Two backslashes give two separate backslash leaves.
check("""``\\``""".toAst == dedent"""
rnInlineLiteral
rnLeaf '\'
rnLeaf '\'
""")

# One backslash and three backquotes: one backquote is content.
check("""``\```""".toAst == dedent"""
rnInlineLiteral
rnLeaf '\'
rnLeaf '`'
""")

# Two backslashes and three backquotes: one backquote is content.
check("""``\\```""".toAst == dedent"""
rnInlineLiteral
rnLeaf '\'
rnLeaf '\'
rnLeaf '`'
""")

# One backslash and four backquotes: two backquotes are content.
check("""``\````""".toAst == dedent"""
rnInlineLiteral
rnLeaf '\'
rnLeaf '`'
rnLeaf '`'
""")

# A word followed by exactly one underscore, with a matching target defined
# above it, is expected to become an rnHyperlink node.
test "references with _ at the end":
check(dedent"""
.. _lnk: https
lnk_""".toAst ==
dedent"""
rnHyperlink
rnInner
rnLeaf 'lnk'
rnInner
rnLeaf 'https'
""")

# Three trailing underscores are not reference markup: the expected AST is
# plain text, the word leaf followed by a leaf with the three underscores.
test "not a hyper link":
check(dedent"""
.. _lnk: https
lnk___""".toAst ==
dedent"""
rnInner
rnLeaf 'lnk'
rnLeaf '___'
""")

0 comments on commit 1640508

Please sign in to comment.