Skip to content

Commit

Permalink
regexp: more readable notation
Browse files (browse the repository at this point in the history)
  • Loading branch information
dg committed Jan 21, 2025
1 parent c257263 commit 1d845e8
Show file tree
Hide file tree
Showing 18 changed files with 533 additions and 96 deletions.
2 changes: 1 addition & 1 deletion src/Texy/Modifier.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public function setProperties(?string $s): void
$ch = $s[$p];

if ($ch === '(') { // title
$m = Regexp::match($s, '~(?:\\\\\)|[^)\n])++\)~', offset: $p);
$m = Regexp::match($s, '~(?: \\\\\) | [^)\n] )++\)~', offset: $p);
$this->title = Helpers::unescapeHtml(str_replace('\)', ')', trim(substr($m[0], 1, -1))));
$p += strlen($m[0]);

Expand Down
27 changes: 24 additions & 3 deletions src/Texy/Modules/BlockModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,20 @@ public function __construct(Texy\Texy $texy)

$texy->registerBlockPattern(
$this->pattern(...),
'~^/--++\ *+(.*)' . Texy\Patterns::MODIFIER_H . '?$((?:\n(?0)|\n.*+)*)(?:\n\\\--.*$|\z)~mUi',
'~^
/--++ \ *+ # opening tag /--
(.*) # content type
' . Texy\Patterns::MODIFIER_H . '?
$
((?:
\n (?0) | # recursive nested blocks
\n.*+ # or any content
)*)
(?:
\n \\\--.* $ | # closing tag \--
\z # or end of input
)
~mUi',
'blocks',
);
}
Expand All @@ -52,8 +65,16 @@ private function beforeBlockParse(Texy\BlockParser $parser, string &$text): void
// autoclose exclusive blocks
$text = Texy\Regexp::replace(
$text,
'~^(/--++\ *+(?!div|texysource).*)$((?:\n.*+)*?)(?:\n\\\--.*$|(?=(\n/--.*$)))~mi',
"\$1\$2\n\\--",
'~^
( /--++ \ *+ (?! div|texysource ) .* ) # opening tag except div/texysource
$
((?: \n.*+ )*?) # content
(?:
\n \\\--.* $ | # closing tag
(?= (\n /--.* $)) # or next block starts
)
~mi',
"\$1\$2\n\\--", // add closing tag
);
}

Expand Down
9 changes: 7 additions & 2 deletions src/Texy/Modules/BlockQuoteModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ public function __construct(Texy\Texy $texy)

$texy->registerBlockPattern(
$this->pattern(...),
'~^(?:' . Texy\Patterns::MODIFIER_H . '\n)?\>([\ \t]++|:)(\S.*+)$~mU', // original
'~^
(?: ' . Texy\Patterns::MODIFIER_H . '\n)?
\> # blockquote char
( [\ \t]++ | : ) # space/tab or colon
( \S.*+ ) # content
$~mU',
'blockquote',
);
}
Expand Down Expand Up @@ -59,7 +64,7 @@ public function pattern(Texy\BlockParser $parser, array $matches): Texy\HtmlElem
}
$content .= $mContent . "\n";

if (!$parser->next("~^>(?:|([\\ \\t]{1,$spaces}|:)(.*))()$~mA", $matches)) {
if (!$parser->next("~^>(?: | ([\\ \\t]{1,$spaces} | :) (.*))()$~mA", $matches)) {
break;
}

Expand Down
5 changes: 4 additions & 1 deletion src/Texy/Modules/EmoticonModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ private function beforeParse(): void

$this->texy->registerLinePattern(
$this->pattern(...),
'~(?<=^|[\x00-\x20])(' . implode('|', $pattern) . ')~',
'~
(?<= ^ | [\x00-\x20] )
(' . implode('|', $pattern) . ')
~',
'emoticon',
'~' . implode('|', $pattern) . '~',
);
Expand Down
20 changes: 18 additions & 2 deletions src/Texy/Modules/FigureModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,26 @@ public function __construct(Texy\Texy $texy)

$texy->addHandler('figure', $this->solve(...));

// [* urls .(title)[class]{style} >]
$texy->registerBlockPattern(
$this->pattern(...),
'~^\[\*\ *+([^\n' . Patterns::MARK . ']{1,1000})' . Patterns::MODIFIER . '?\ *+(\*|(?<!<)>|<)\]' // [* urls .(title)[class]{style} >]
. '(?::(' . Patterns::LINK_URL . '|:))??\ ++\*\*\*\ ++(.{0,2000})' . Patterns::MODIFIER_H . '?()$~mU',
'~^
\[\*\ *+ # opening bracket with asterisk
([^\n' . Patterns::MARK . ']{1,1000}) # URLs
' . Patterns::MODIFIER . '?
\ *+
( \* | (?<! < ) > | < ) # alignment
\]
(?:
:( # link delimiter
' . Patterns::LINK_URL . ' | # link
: # or just colon
)
)??
\ ++ \*\*\* \ ++ # separator
(.{0,2000}) # figure content
' . Patterns::MODIFIER_H . '?
()$~mU',
'figure',
);
}
Expand Down
14 changes: 11 additions & 3 deletions src/Texy/Modules/HeadingModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,22 @@ public function __construct(Texy\Texy $texy)

$texy->registerBlockPattern(
$this->patternUnderline(...),
'~^(\S.{0,1000})' . Texy\Patterns::MODIFIER_H . '?\n'
. '(\#{3,}+|\*{3,}+|={3,}+|-{3,}+)$~mU',
'~^
( \S .{0,1000} ) # heading text
' . Texy\Patterns::MODIFIER_H . '?
\n
( \#{3,}+ | \*{3,}+ | ={3,}+ | -{3,}+ ) # underline characters
$~mU',
'heading/underlined',
);

$texy->registerBlockPattern(
$this->patternSurround(...),
'~^(\#{2,}+|={2,}+)(.+)' . Texy\Patterns::MODIFIER_H . '?()$~mU',
'~^
( \#{2,}+ | ={2,}+ ) # opening characters
(.+) # heading text
' . Texy\Patterns::MODIFIER_H . '?
()$~mU',
'heading/surrounded',
);
}
Expand Down
6 changes: 5 additions & 1 deletion src/Texy/Modules/HorizLineModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@ public function __construct(Texy\Texy $texy)

$texy->registerBlockPattern(
$this->pattern(...),
'~^(\*{3,}+|-{3,}+)[\ \t]*' . Texy\Patterns::MODIFIER . '?()$~mU',
'~^
( \*{3,}+ | -{3,}+ ) # three or more * or -
[\ \t]* # optional spaces
' . Texy\Patterns::MODIFIER . '?
()$~mU',
'horizline',
);
}
Expand Down
37 changes: 34 additions & 3 deletions src/Texy/Modules/HtmlModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,31 @@ public function __construct(Texy\Texy $texy)

$texy->registerLinePattern(
$this->patternTag(...),
'~<(/?)([a-z][a-z0-9_:-]{0,50})((?:\s++[a-z0-9\_:-]++|=\s*+"[^"' . Patterns::MARK . ']*+"|=\s*+\'[^\'' . Patterns::MARK . ']*+\'|=[^\s>' . Patterns::MARK . ']++)*)\s*+(/?)>~is',
'~
< (/?) # tag begin
([a-z][a-z0-9_:-]{0,50}) # tag name
(
(?:
\s++ [a-z0-9\_:-]++ | # attribute name
= \s*+ " [^"' . Patterns::MARK . ']*+ " | # attribute value in double quotes
= \s*+ \' [^\'' . Patterns::MARK . ']*+ \' | # attribute value in single quotes
= [^\s>' . Patterns::MARK . ']++ # attribute value without quotes
)*
)
\s*+
(/?) # self-closing slash
>
~is',
'html/tag',
);

$texy->registerLinePattern(
$this->patternComment(...),
'~<!--([^' . Patterns::MARK . ']*?)-->~is',
'~
<!--
( [^' . Patterns::MARK . ']*? )
-->
~is',
'html/comment',
);
}
Expand Down Expand Up @@ -290,7 +308,20 @@ private function parseAttributes(string $attrs): array
$res = [];
$matches = Regexp::matchAll(
$attrs,
'~([a-z0-9\_:-]+)\s*(?:=\s*(\'[^\']*\'|"[^"]*"|[^\'"\s]+))?()~is',
<<<'X'
~
([a-z0-9\_:-]+) # attribute name
\s*
(?:
= \s* # equals sign
(
' [^']* ' | # single quoted value
" [^"]* " | # double quoted value
[^'"\s]+ # unquoted value
)
)?
()~is
X,
);

foreach ($matches as $m) {
Expand Down
5 changes: 4 additions & 1 deletion src/Texy/Modules/HtmlOutputModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,10 @@ private function postProcess(Texy\Texy $texy, string &$s): void
// wellform and reformat
$s = Regexp::replace(
$s . '</end/>',
'~([^<]*+)<(?:(!--.*--)|(/?)([a-z][a-z0-9._:-]*)(|[ \n].*)\s*(/?))>()~Uis',
'~
( [^<]*+ )
< (?: (!--.*--) | (/?) ([a-z][a-z0-9._:-]*) (|[ \n].*) \s* (/?) ) >
()~Uis',
$this->cb(...),
);

Expand Down
25 changes: 22 additions & 3 deletions src/Texy/Modules/ImageModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,20 @@ public function __construct(Texy\Texy $texy)
// [*image*]:LINK
$texy->registerLinePattern(
$this->patternImage(...),
'~\[\*\ *+([^\n' . Patterns::MARK . ']{1,1000})' . Patterns::MODIFIER . '?\ *+(\*|(?<!<)>|<)\]' // [* urls .(title)[class]{style} >]
. '(?::(' . Patterns::LINK_URL . '|:))??()~U',
'~
\[\* \ *+ # opening bracket with asterisk
([^\n' . Patterns::MARK . ']{1,1000}) # URLs
' . Patterns::MODIFIER . '?
\ *+
( \* | (?<! < ) > | < ) # alignment
\]
(?:
:(' . Patterns::LINK_URL . ' | : ) # link or just colon
)??
()~U',
'image',
);

}


Expand All @@ -69,7 +79,16 @@ private function beforeParse(Texy\Texy $texy, &$text): void
// [*image*]: urls .(title)[class]{style}
$text = Texy\Regexp::replace(
$text,
'~^\[\*([^\n]{1,100})\*\]:[\ \t]+(.{1,1000})[\ \t]*' . Patterns::MODIFIER . '?\s*()$~mU',
'~^
\[\* # opening [*
( [^\n]{1,100} ) # URL
\*\] # closing *]
: [\ \t]+
(.{1,1000}) # URL
[\ \t]*
' . Patterns::MODIFIER . '?
\s*
()$~mU',
$this->patternReferenceDef(...),
);
}
Expand Down
51 changes: 43 additions & 8 deletions src/Texy/Modules/LinkModule.php
Original file line number Diff line number Diff line change
Expand Up @@ -57,23 +57,43 @@ public function __construct(Texy\Texy $texy)
// [reference]
$texy->registerLinePattern(
$this->patternReference(...),
'~(\[[^\[\]\*\n' . Patterns::MARK . ']++\])~U',
'~(
\[
[^\[\]\*\n' . Patterns::MARK . ']++ # reference text
\]
)~U',
'link/reference',
);

// direct url; charaters not allowed in URL <>[\]^`{|}
// direct url; characters not allowed in URL <>[\]^`{|}
$texy->registerLinePattern(
$this->patternUrlEmail(...),
'~(?<=^|[\s([<:\x17])(?:https?://|www\.|ftp://)[0-9.' . Patterns::CHAR . '-][/\d' . Patterns::CHAR . '+\.\~%&?@=_:;#$!,*()\x{ad}-]{1,1000}[/\d' . Patterns::CHAR . '+\~?@=_#$*]~',
'~
(?<= ^ | [\s([<:\x17] ) # must be preceded by these chars
(?: https?:// | www\. | ftp:// ) # protocol or www
[0-9.' . Patterns::CHAR . '-] # first char
[/\d' . Patterns::CHAR . '+\.\~%&?@=_:;#$!,*()\x{ad}-]{1,1000} # URL body
[/\d' . Patterns::CHAR . '+\~?@=_#$*] # last char
~',
'link/url',
'~(?:https?://|www\.|ftp://)~',
'~(?: https?:// | www\. | ftp://)~',
);

// direct email
self::$EMAIL = '[' . Patterns::CHAR . '][0-9.+_' . Patterns::CHAR . '-]{0,63}@[0-9.+_' . Patterns::CHAR . '\x{ad}-]{1,252}\.[' . Patterns::CHAR . '\x{ad}]{2,19}';
self::$EMAIL = '
[' . Patterns::CHAR . '] # first char
[0-9.+_' . Patterns::CHAR . '-]{0,63} # local part
@
[0-9.+_' . Patterns::CHAR . '\x{ad}-]{1,252} # domain
\.
[' . Patterns::CHAR . '\x{ad}]{2,19} # TLD
';
$texy->registerLinePattern(
$this->patternUrlEmail(...),
'~(?<=^|[\s([<\x17])' . self::$EMAIL . '~',
'~
(?<= ^ | [\s([<\x17] ) # must be preceded by these chars
' . self::$EMAIL . '
~',
'link/email',
'~' . self::$EMAIL . '~',
);
Expand All @@ -91,7 +111,15 @@ private function beforeParse(Texy\Texy $texy, &$text): void
if (!empty($texy->allowed['link/definition'])) {
$text = Texy\Regexp::replace(
$text,
'~^\[([^\[\]#\?\*\n]{1,100})\]:\ ++(\S{1,1000})([\ \t].{1,1000})?' . Patterns::MODIFIER . '?\s*()$~mU',
'~^
\[
( [^\[\]#\?\*\n]{1,100} ) # reference name
\] : \ ++
( \S{1,1000} ) # URL
( [\ \t] .{1,1000} )? # optional description
' . Patterns::MODIFIER . '?
\s*
()$~mU',
$this->patternReferenceDef(...),
);
}
Expand Down Expand Up @@ -363,7 +391,14 @@ private function textualUrl(Link $link): string
: $link->raw;

// parse_url() in PHP damages UTF-8 - use regular expression
if (!($parts = Regexp::match($raw, '~^(?:(?P<scheme>[a-z]+):)?(?://(?P<host>[^/?#]+))?(?P<path>(?:/|^)(?!/)[^?#]*)?(?:\?(?P<query>[^#]*))?(?:\#(?P<fragment>.*))?()$~'))) {
if (!($parts = Regexp::match($raw, '~^
(?: (?P<scheme> [a-z]+ ) : )?
(?: // (?P<host> [^/?#]+ ) )?
(?P<path> (?: / | ^ ) (?! / ) [^?#]* )?
(?: \? (?P<query> [^#]* ) )?
(?: \# (?P<fragment> .* ) )?
()$
~'))) {
return $link->raw;
}

Expand Down
Loading

0 comments on commit 1d845e8

Please sign in to comment.