Skip to content

Commit 77506a9

Browse files
committed
Add support for optional capture ranges to reg::Ex class.
1 parent 0bb03e6 commit 77506a9

File tree

2 files changed

+77
-13
lines changed

2 files changed

+77
-13
lines changed

src/regex.cpp

Lines changed: 72 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/******************************************************************************
22
*
3-
* Copyright (C) 1997-2021 by Dimitri van Heesch.
3+
* Copyright (C) 1997-2025 by Dimitri van Heesch.
44
*
55
* Permission to use, copy, modify, and distribute this software and its
66
* documentation under the terms of the GNU General Public License is hereby
@@ -520,38 +520,102 @@ bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,M
520520
size_t startIndex = index;
521521
size_t len = str.length();
522522
PToken tok = data[++tokenPos];
523-
if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's
523+
524+
// Special handling for an optional capture group: (...)?
525+
if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
526+
{
527+
size_t groupId = tok.value();
528+
size_t innerStart = tokenPos + 1;
529+
530+
// Find matching EndCapture, accounting for nesting depth
531+
size_t tp = innerStart;
532+
int depth = 1;
533+
while (tp<tokenLen && depth>0)
534+
{
535+
if (data[tp].kind()==PToken::Kind::BeginCapture) depth++;
536+
else if (data[tp].kind()==PToken::Kind::EndCapture) depth--;
537+
tp++;
538+
}
539+
if (depth!=0) return false; // malformed: ran out of tokens before finding the EndCapture that closes this group's '('
540+
size_t endCapturePos = tp - 1; // position of EndCapture
541+
size_t afterSeqPos = endCapturePos + 2; // skip EndCapture and End marker
542+
543+
// Try with the group present
544+
Match tmp;
545+
tmp.init(str, /*captureCount*/ captureCount);
546+
bool innerOk = matchAt(innerStart,endCapturePos,str,tmp,index,level+1);
547+
if (innerOk)
548+
{
549+
size_t capLen = tmp.length();
550+
551+
// Copy nested captures from tmp (they may exist inside the group)
552+
for (size_t gid=1; gid<tmp.size(); gid++)
553+
{
554+
size_t sp = tmp[gid].position();
555+
size_t sl = tmp[gid].length();
556+
if (sp!=std::string::npos && sl!=std::string::npos)
557+
{
558+
match.startCapture(gid,sp);
559+
match.endCapture(gid,sp+sl);
560+
}
561+
}
562+
// Set the outer group's capture
563+
match.startCapture(groupId,index);
564+
match.endCapture(groupId,index+capLen);
565+
566+
bool ok = matchAt(afterSeqPos,tokenLen,str,match,index+capLen,level+1);
567+
if (ok)
568+
{
569+
match.setMatch(pos,(index+capLen)-pos+match.length());
570+
return true;
571+
}
572+
}
573+
574+
// Try with the group absent (empty capture)
575+
match.startCapture(groupId,index);
576+
match.endCapture(groupId,index); // zero-length
577+
578+
bool ok2 = matchAt(afterSeqPos,tokenLen,str,match,index,level+1);
579+
if (ok2)
580+
{
581+
match.setMatch(pos,index-pos+match.length());
582+
return true;
583+
}
584+
return false;
585+
}
586+
587+
if (tok.kind()==PToken::Kind::Character) // 'x*' or 'x?'
524588
{
525589
char c_tok = tok.asciiValue();
526590
while (index<len && str[index]==c_tok) { index++; if (type==Optional) break; }
527591
tokenPos++;
528592
}
529-
else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters
593+
else if (tok.isCharClass()) // '[a-f0-4]*' or '[...]?' -> eat matching characters
530594
{
531595
while (index<len && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
532596
tokenPos+=tok.value()+1; // skip over character ranges + end token
533597
}
534-
else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters
598+
else if (tok.kind()==PToken::Kind::Alpha) // '\a*' or '\a?' -> eat start id characters
535599
{
536600
while (index<len && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
537601
tokenPos++;
538602
}
539-
else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters
603+
else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' or '\w?' -> eat id characters
540604
{
541605
while (index<len && isIdChar(str[index])) { index++; if (type==Optional) break; }
542606
tokenPos++;
543607
}
544-
else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces
608+
else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' or '\s?' -> eat spaces
545609
{
546610
while (index<len && isspace(str[index])) { index++; if (type==Optional) break; }
547611
tokenPos++;
548612
}
549-
else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits
613+
else if (tok.kind()==PToken::Kind::Digit) // '\d*' or '\d?' -> eat digits
550614
{
551615
while (index<len && isdigit(str[index])) { index++; if (type==Optional) break; }
552616
tokenPos++;
553617
}
554-
else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all
618+
else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all remaining characters, '.?' -> eat a single character
555619
{
556620
if (type==Optional) index++; else index = str.length();
557621
tokenPos++;

src/regex.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/******************************************************************************
22
*
3-
* Copyright (C) 1997-2021 by Dimitri van Heesch.
3+
* Copyright (C) 1997-2025 by Dimitri van Heesch.
44
*
55
* Permission to use, copy, modify, and distribute this software and its
66
* documentation under the terms of the GNU General Public License is hereby
@@ -83,8 +83,8 @@ class Ex
8383
* @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
8484
* meaning in a character range. `^` only has a special meaning as the first character.
8585
*
86-
* @note capture ranges can be nested. Quantifiers (`*`, `+`, `?`) on entire capture ranges
87-
* are not supported.
86+
* @note capture ranges can be nested. Optional capture ranges `(...)?` are supported but
87+
* repeated ranges `(...)*` or `(...)+` are not.
8888
*
8989
* In Wildcard mode `*` is used to match any sequence of zero or more characters.
9090
* The character `?` can be used to match an optional character. Character ranges are
@@ -208,7 +208,7 @@ class Match
208208
{
209209
if (groupId < m_subMatches.size())
210210
{
211-
if (index>m_subMatches[groupId].position())
211+
if (index>=m_subMatches[groupId].position())
212212
{
213213
m_subMatches[groupId].setEnd(index);
214214
}
@@ -333,4 +333,4 @@ std::string replace(std::string_view str,const Ex &re,std::string_view replaceme
333333

334334
} // namespace
335335

336-
#endif
336+
#endif

0 commit comments

Comments
 (0)