Skip to content

Commit

Permalink
regexp: of clearer notation
Browse files Browse the repository at this point in the history
  • Loading branch information
dg committed Jan 16, 2025
1 parent d744b49 commit 4109166
Show file tree
Hide file tree
Showing 14 changed files with 285 additions and 46 deletions.
27 changes: 24 additions & 3 deletions src/Texy/Modules/BlockModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,20 @@ public function __construct(Texy\Texy $texy)

$texy->registerBlockPattern(
$this->pattern(...),
'~^/--++\ *+(.*)' . Texy\Patterns::MODIFIER_H . '?$((?:\n(?0)|\n.*+)*)(?:\n\\\--.*$|\z)~mUix',
'~^
/--++ \ *+ # opening tag /--
(.*) # content type
' . Texy\Patterns::MODIFIER_H . '?
$
((?:
\n (?0) | # recursive nested blocks
\n.*+ # or any content
)*)
(?:
\n \\\--.* $ | # closing tag \--
\z # or end of input
)
~mUix',
'blocks',
);
}
Expand All @@ -52,8 +65,16 @@ private function beforeBlockParse(Texy\BlockParser $parser, string &$text): void
// autoclose exclusive blocks
$text = Texy\Regexp::replace(
$text,
'~^(/--++\ *+(?!div|texysource).*)$((?:\n.*+)*?)(?:\n\\\--.*$|(?=(\n/--.*$)))~mi',
"\$1\$2\n\\--",
'~^
( /--++ \ *+ (?!div|texysource) .* ) # opening tag except div/texysource
$
((?: \n.*+ )*?) # content
(?:
\n \\\--.* $ | # closing tag
(?= (\n /--.* $)) # or next block starts
)
~mix',
"\$1\$2\n\\--", // add closing tag
);
}

Expand Down
7 changes: 6 additions & 1 deletion src/Texy/Modules/BlockQuoteModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ public function __construct(Texy\Texy $texy)

$texy->registerBlockPattern(
$this->pattern(...),
'~^(?:' . Texy\Patterns::MODIFIER_H . '\n)?\>([\ \t]++|:)(\S.*+)$~mUx', // original
'~^
(?:' . Texy\Patterns::MODIFIER_H . '\n)?
\> # blockquote char
( [\ \t]++ | : ) # space/tab or colon
( \S.*+ ) # content
$~mUx',
'blockquote',
);
}
Expand Down
5 changes: 4 additions & 1 deletion src/Texy/Modules/EmoticonModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ private function beforeParse(): void

$this->texy->registerLinePattern(
$this->pattern(...),
'~(?<=^|[\x00-\x20])(' . implode('|', $pattern) . ')~',
'~
(?<= ^ | [\x00-\x20] )
(' . implode('|', $pattern) . ')
~x',
'emoticon',
'~' . implode('|', $pattern) . '~',
);
Expand Down
20 changes: 18 additions & 2 deletions src/Texy/Modules/FigureModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,26 @@ public function __construct(Texy\Texy $texy)

$texy->addHandler('figure', $this->solve(...));

// [* urls .(title)[class]{style} >]
$texy->registerBlockPattern(
$this->pattern(...),
'~^\[\*\ *+([^\n' . Patterns::MARK . ']{1,1000})' . Patterns::MODIFIER . '?\ *+(\*|(?<!<)>|<)\]' // [* urls .(title)[class]{style} >]
. '(?::(' . Patterns::LINK_URL . '|:))??\ ++\*\*\*\ ++(.{0,2000})' . Patterns::MODIFIER_H . '?()$~mUx',
'~^
\[\*\ *+ # opening bracket with asterisk
([^\n' . Patterns::MARK . ']{1,1000}) # URLs
' . Patterns::MODIFIER . '?
\ *+
(\*|(?<!<)>|<) # alignment
\]
(?:
:( # link delimiter
' . Patterns::LINK_URL . ' | # link
: # or just colon
)
)??
\ ++ \*\*\* \ ++ # separator
(.{0,2000}) # figure content
' . Patterns::MODIFIER_H . '?
()$~mUx',
'figure',
);
}
Expand Down
14 changes: 11 additions & 3 deletions src/Texy/Modules/HeadingModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,22 @@ public function __construct(Texy\Texy $texy)

$texy->registerBlockPattern(
$this->patternUnderline(...),
'~^(\S.{0,1000})' . Texy\Patterns::MODIFIER_H . '?\n'
. '(\#{3,}+|\*{3,}+|={3,}+|-{3,}+)$~mUx',
'~^
( \S .{0,1000} ) # heading text
' . Texy\Patterns::MODIFIER_H . '?
\n
( \#{3,}+ | \*{3,}+ | ={3,}+ | -{3,}+ ) # underline characters
$~mUx',
'heading/underlined',
);

$texy->registerBlockPattern(
$this->patternSurround(...),
'~^(\#{2,}+|={2,}+)(.+)' . Texy\Patterns::MODIFIER_H . '?()$~mUx',
'~^
( \#{2,}+ | ={2,}+ ) # opening characters
(.+) # heading text
' . Texy\Patterns::MODIFIER_H . '?
()$~mUx',
'heading/surrounded',
);
}
Expand Down
6 changes: 5 additions & 1 deletion src/Texy/Modules/HorizLineModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@ public function __construct(Texy\Texy $texy)

$texy->registerBlockPattern(
$this->pattern(...),
'~^(\*{3,}+|-{3,}+)[\ \t]*' . Texy\Patterns::MODIFIER . '?()$~mUx',
'~^
( \*{3,}+ | -{3,}+ ) # three or more * or -
[\ \t]* # optional spaces
' . Texy\Patterns::MODIFIER . '?
()$~mUx',
'horizline',
);
}
Expand Down
36 changes: 32 additions & 4 deletions src/Texy/Modules/HtmlModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,31 @@ public function __construct(Texy\Texy $texy)

$texy->registerLinePattern(
$this->patternTag(...),
'~<(/?)([a-z][a-z0-9_:-]{0,50})((?:\s++[a-z0-9\_:-]++|=\s*+"[^"' . Patterns::MARK . ']*+"|=\s*+\'[^\'' . Patterns::MARK . ']*+\'|=[^\s>' . Patterns::MARK . ']++)*)\s*+(/?)>~isx',
'~
< (/?) # tag begin
([a-z][a-z0-9_:-]{0,50}) # tag name
(
(?:
\s++ [a-z0-9\_:-]++ | # attribute name
= \s*+ " [^"' . Patterns::MARK . ']*+ " | # attribute value in double quotes
= \s*+ \' [^\'' . Patterns::MARK . ']*+ \' | # attribute value in single quotes
= [^\s>' . Patterns::MARK . ']++ # attribute value without quotes
)*
)
\s*+
(/?) # self-closing slash
>
~isx',
'html/tag',
);

$texy->registerLinePattern(
$this->patternComment(...),
'~<!--([^' . Patterns::MARK . ']*?)-->~isx',
'~
<!--
( [^' . Patterns::MARK . ']*? )
-->
~isx',
'html/comment',
);
}
Expand Down Expand Up @@ -288,12 +306,22 @@ private function parseAttributes(string $attrs): array
{
$matches = $res = [];
preg_match_all(
'~([a-z0-9\_:-]+)\s*(?:=\s*(\'[^\']*\'|"[^"]*"|[^\'"\s]+))?()~isux',
'~
([a-z0-9\_:-]+) # attribute name
\s*
(?:
= \s* # equals sign
(
\' [^\']* \' | # single quoted value
" [^"]* " | # double quoted value
[^\'"\s]+ # unquoted value
)
)?
()~isux',
$attrs,
$matches,
PREG_SET_ORDER,
);

foreach ($matches as $m) {
$key = strtolower($m[1]);
$value = $m[2];
Expand Down
5 changes: 4 additions & 1 deletion src/Texy/Modules/HtmlOutputModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,10 @@ private function postProcess(Texy\Texy $texy, string &$s): void
// wellform and reformat
$s = Regexp::replace(
$s . '</end/>',
'~([^<]*+)<(?:(!--.*--)|(/?)([a-z][a-z0-9._:-]*)(|[ \n].*)\s*(/?))>()~Uis',
'~
( [^<]*+ )
< (?: (!--.*--) | (/?) ([a-z][a-z0-9._:-]*) (|[ \n].*) \s* (/?) ) >
()~Uisx',
$this->cb(...),
);

Expand Down
25 changes: 22 additions & 3 deletions src/Texy/Modules/ImageModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,20 @@ public function __construct(Texy\Texy $texy)
// [*image*]:LINK
$texy->registerLinePattern(
$this->patternImage(...),
'~\[\*\ *+([^\n' . Patterns::MARK . ']{1,1000})' . Patterns::MODIFIER . '?\ *+(\*|(?<!<)>|<)\]' // [* urls .(title)[class]{style} >]
. '(?::(' . Patterns::LINK_URL . '|:))??()~Ux',
'~
\[\* \ *+ # opening bracket with asterisk
([^\n' . Patterns::MARK . ']{1,1000}) # URLs
' . Patterns::MODIFIER . '?
\ *+
(\*|(?<!<)>|<) # alignment
\]
(?:
:(' . Patterns::LINK_URL . ' | : ) # link or just colon
)??
()~Ux',
'image',
);

}


Expand All @@ -69,7 +79,16 @@ private function beforeParse(Texy\Texy $texy, &$text): void
// [*image*]: urls .(title)[class]{style}
$text = Texy\Regexp::replace(
$text,
'~^\[\*([^\n]{1,100})\*\]:[\ \t]+(.{1,1000})[\ \t]*' . Patterns::MODIFIER . '?\s*()$~mUx',
'~^
\[\* # opening [*
( [^\n]{1,100} ) # reference name
\*\] # closing *]
: [\ \t]+
(.{1,1000}) # URL
[\ \t]*
' . Patterns::MODIFIER . '?
\s*
()$~mUx',
$this->patternReferenceDef(...),
);
}
Expand Down
55 changes: 45 additions & 10 deletions src/Texy/Modules/LinkModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -56,25 +56,45 @@ public function __construct(Texy\Texy $texy)
// [reference]
$texy->registerLinePattern(
$this->patternReference(...),
'~(\[[^\[\]\*\n' . Patterns::MARK . ']++\])~Ux',
'~(
\[
[^\[\]\*\n' . Patterns::MARK . ']++ # reference text
\]
)~Ux',
'link/reference',
);

// direct url; charaters not allowed in URL <>[\]^`{|}
// direct url; characters not allowed in URL <>[\]^`{|}
$texy->registerLinePattern(
$this->patternUrlEmail(...),
'~(?<=^|[\s([<:\x17])(?:https?://|www\.|ftp://)[0-9.' . Patterns::CHAR . '-][/\d' . Patterns::CHAR . '+\.\~%&?@=_:;#$!,*()\x{ad}-]{1,1000}[/\d' . Patterns::CHAR . '+\~?@=_#$*]~x',
'~
(?<= ^ | [\s([<:\x17] ) # must be preceded by these chars
(?: https?:// | www\. | ftp:// ) # protocol or www
[0-9.' . Patterns::CHAR . '-] # first char
[/\d' . Patterns::CHAR . '+\.\~%&?@=_:;#$!,*()\x{ad}-]{1,1000} # URL body
[/\d' . Patterns::CHAR . '+\~?@=_#$*] # last char
~x',
'link/url',
'~(?:https?://|www\.|ftp://)~',
);

// direct email
self::$EMAIL = '[' . Patterns::CHAR . '][0-9.+_' . Patterns::CHAR . '-]{0,63}@[0-9.+_' . Patterns::CHAR . '\x{ad}-]{1,252}\.[' . Patterns::CHAR . '\x{ad}]{2,19}';
self::$EMAIL = '
[' . Patterns::CHAR . '] # first char
[0-9.+_' . Patterns::CHAR . '-]{0,63} # local part
@
[0-9.+_' . Patterns::CHAR . '\x{ad}-]{1,252} # domain
\.
[' . Patterns::CHAR . '\x{ad}]{2,19} # TLD
';
$texy->registerLinePattern(
$this->patternUrlEmail(...),
'~(?<=^|[\s([<\x17])' . self::$EMAIL . '~x',
'~
(?<= ^ | [\s([<\x17] ) # must be preceded by these chars
' . self::$EMAIL . '
~x',
'link/email',
'~' . self::$EMAIL . '~',
'~' . self::$EMAIL . '~x',
);
}

Expand All @@ -90,7 +110,15 @@ private function beforeParse(Texy\Texy $texy, &$text): void
if (!empty($texy->allowed['link/definition'])) {
$text = Texy\Regexp::replace(
$text,
'~^\[([^\[\]#\?\*\n]{1,100})\]:\ ++(\S{1,1000})([\ \t].{1,1000})?' . Patterns::MODIFIER . '?\s*()$~mUx',
'~^
\[
( [^\[\]#\?\*\n]{1,100} ) # reference name
\] : \ ++
( \S{1,1000} ) # URL
( [\ \t] .{1,1000} )? # optional description
' . Patterns::MODIFIER . '?
\s*
()$~mUx',
$this->patternReferenceDef(...),
);
}
Expand Down Expand Up @@ -334,7 +362,7 @@ private function checkLink(Link $link): void
// special supported case
$link->URL = 'http://' . $link->URL;

} elseif (preg_match('~' . self::$EMAIL . '$~Au', $link->URL)) {
} elseif (preg_match('~' . self::$EMAIL . '$~Aux', $link->URL)) {
// email
$link->URL = 'mailto:' . $link->URL;

Expand All @@ -352,7 +380,7 @@ private function checkLink(Link $link): void
*/
private function textualUrl(Link $link): string
{
if ($this->texy->obfuscateEmail && preg_match('~^' . self::$EMAIL . '$~u', $link->raw)) { // email
if ($this->texy->obfuscateEmail && preg_match('~^' . self::$EMAIL . '$~ux', $link->raw)) { // email
return str_replace('@', '&#64;<!-- -->', $link->raw);
}

Expand All @@ -362,7 +390,14 @@ private function textualUrl(Link $link): string
: $link->raw;

// parse_url() in PHP damages UTF-8 - use regular expression
if (!preg_match('~^(?:(?P<scheme>[a-z]+):)?(?://(?P<host>[^/?#]+))?(?P<path>(?:/|^)(?!/)[^?#]*)?(?:\?(?P<query>[^#]*))?(?:\#(?P<fragment>.*))?()$~ux', $raw, $parts)) {
if (!preg_match('~^
(?: (?P<scheme> [a-z]+ ) : )?
(?: // (?P<host> [^/?#]+ ) )?
(?P<path> (?: / | ^ ) (?! /) [^?#]* )?
(?: \? (?P<query> [^#]* ) )?
(?: \# (?P<fragment> .* ) )?
()$
~ux', $raw, $parts)) {
return $link->raw;
}

Expand Down
Loading

0 comments on commit 4109166

Please sign in to comment.