Skip to content

Commit 0bb03e6

Browse files
committed
Add support for nested capture groups to reg::Ex
1 parent d49be63 commit 0bb03e6

File tree

2 files changed

+63
-35
lines changed

2 files changed

+63
-35
lines changed

src/regex.cpp

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,9 @@ class Ex::Private
189189

190190
/** The pattern string as passed by the user */
191191
std::string pattern;
192+
193+
/** Number of capture groups in the pattern (excluding the whole match) */
194+
size_t captureCount = 0;
192195
};
193196

194197
/** Compiles a regular expression passed as a string into a stream of tokens that can be used for
@@ -198,6 +201,7 @@ void Ex::Private::compile()
198201
{
199202
error = false;
200203
data.clear();
204+
captureCount = 0;
201205
if (pattern.empty()) return;
202206
const char *start = pattern.c_str();
203207
const char *ps = start;
@@ -206,6 +210,10 @@ void Ex::Private::compile()
206210
int prevTokenPos=-1;
207211
int tokenPos=0;
208212

213+
// capture group assignment
214+
std::vector<size_t> captureStack;
215+
size_t nextCaptureId = 0;
216+
209217
auto addToken = [&](PToken tok)
210218
{
211219
tokenPos++;
@@ -274,12 +282,27 @@ void Ex::Private::compile()
274282
addToken(PToken(PToken::Kind::Any));
275283
break;
276284
case '(': // begin of capture group
277-
prevTokenPos = tokenPos;
278-
addToken(PToken(PToken::Kind::BeginCapture));
285+
{
286+
prevTokenPos = tokenPos;
287+
addToken(PToken(PToken::Kind::BeginCapture));
288+
size_t id = ++nextCaptureId; // groups start at 1, 0 is whole match
289+
data.back().setValue(id);
290+
captureStack.push_back(id);
291+
}
279292
break;
280293
case ')': // end of capture group
281-
prevTokenPos = tokenPos;
282-
addToken(PToken(PToken::Kind::EndCapture));
294+
{
295+
prevTokenPos = tokenPos;
296+
if (captureStack.empty())
297+
{
298+
error=true;
299+
return;
300+
}
301+
size_t id = captureStack.back();
302+
captureStack.pop_back();
303+
addToken(PToken(PToken::Kind::EndCapture));
304+
data.back().setValue(id);
305+
}
283306
break;
284307
case '[': // character class
285308
{
@@ -402,6 +425,12 @@ void Ex::Private::compile()
402425
}
403426
ps++;
404427
}
428+
if (!captureStack.empty()) // Unmatched '('?
429+
{
430+
error=true;
431+
return;
432+
}
433+
captureCount = nextCaptureId;
405434
//addToken(PToken(PToken::Kind::End));
406435
}
407436

@@ -412,6 +441,7 @@ void Ex::Private::dump()
412441
size_t l = data.size();
413442
size_t i =0;
414443
DBG("==== compiled token stream for pattern '%s' ===\n",pattern.c_str());
444+
DBG("captureCount=%zu\n",captureCount);
415445
while (i<l)
416446
{
417447
DBG("[%s:%04x]\n",data[i].kindStr(),data[i].value());
@@ -531,7 +561,7 @@ bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,M
531561
size_t tokenStart = ++tokenPos;
532562
while (tokenPos<tokenLen && data[tokenPos].kind()!=PToken::Kind::EndCapture) { tokenPos++; }
533563
Match rangeMatch;
534-
rangeMatch.init(str);
564+
rangeMatch.init(str,0);
535565
bool found = matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);
536566
if (found)
537567
{
@@ -614,12 +644,12 @@ bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,M
614644
(isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
615645
break;
616646
case PToken::Kind::BeginCapture:
617-
DBG("BeginCapture(%zu)\n",index);
618-
match.startCapture(index);
647+
DBG("BeginCapture(%zu) gid=%u\n",index,tok.value());
648+
match.startCapture(tok.value(),index);
619649
break;
620650
case PToken::Kind::EndCapture:
621-
DBG("EndCapture(%zu)\n",index);
622-
match.endCapture(index);
651+
DBG("EndCapture(%zu) gid=%u\n",index,tok.value());
652+
match.endCapture(tok.value(),index);
623653
break;
624654
case PToken::Kind::Any:
625655
if (index>=str.length()) return false;
@@ -707,7 +737,7 @@ bool Ex::match(std::string_view str,Match &match,size_t pos) const
707737
{
708738
bool found=false;
709739
if (p->data.size()==0 || p->error) return found;
710-
match.init(str);
740+
match.init(str,p->captureCount);
711741

712742
PToken tok = p->data[0];
713743
if (tok.kind()==PToken::Kind::BeginOfLine) // only test match at the given position
@@ -721,10 +751,10 @@ bool Ex::match(std::string_view str,Match &match,size_t pos) const
721751
size_t index = str.find(tok.asciiValue(),pos);
722752
if (index==std::string::npos)
723753
{
724-
DBG("Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n",str.c_str(),pos,tok.asciiValue());
754+
DBG("Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n",std::string(str).c_str(),pos,tok.asciiValue());
725755
return false;
726756
}
727-
DBG("pos=%zu str='%s' char='%c' index=%zu\n",index,str.c_str(),tok.asciiValue(),index);
757+
DBG("pos=%zu str='%s' char='%c' index=%zu\n",index,std::string(str).c_str(),tok.asciiValue(),index);
728758
pos=index;
729759
}
730760
while (pos<str.length()) // search for a match starting at pos
@@ -734,7 +764,7 @@ bool Ex::match(std::string_view str,Match &match,size_t pos) const
734764
pos++;
735765
}
736766
}
737-
DBG("Ex::match(str='%s',pos=%zu)=%d\n",str.c_str(),pos,found);
767+
DBG("Ex::match(str='%s',pos=%zu)=%d\n",std::string(str).c_str(),pos,found);
738768
return found;
739769
}
740770

src/regex.h

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,8 @@ class Ex
8383
* @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
8484
* meaning in a character range. `^` only has a special meaning as the first character.
8585
*
86-
* @note that capture ranges cannot be nested, and `*`, `+`, and `?` do not work on
87-
* capture ranges. e.g. `(abd)?` is not valid. If multiple capture ranges are
88-
* specified then some character has to be in between them,
89-
* e.g. this does not work `(.*)(a.*)`, but this does `(.*)a(.*)`.
86+
* @note capture ranges can be nested. Quantifiers (`*`, `+`, `?`) on entire capture ranges
87+
* are not supported.
9088
*
9189
* In Wildcard mode `*` is used to match any sequence of zero or more characters.
9290
* The character `?` can be used to match an optional character. Character ranges are
@@ -189,41 +187,41 @@ class Match
189187

190188
private:
191189
friend class Ex;
192-
void init(std::string_view str)
190+
void init(std::string_view str,size_t captureCount)
193191
{
194192
m_subMatches.clear();
195-
m_subMatches.emplace_back(str);
193+
m_subMatches.reserve(captureCount+1);
194+
for (size_t i=0;i<captureCount+1;i++)
195+
{
196+
m_subMatches.emplace_back(str);
197+
}
196198
m_str = str;
197199
}
198-
void startCapture(size_t index)
200+
void startCapture(size_t groupId,size_t index)
199201
{
200-
if (!m_insideCapture) // when backtracking we can re-entry the capture multiple times
201-
// only update the index, example `\s*(x)`
202+
if (groupId < m_subMatches.size())
202203
{
203-
m_captureIndex = m_subMatches.size();
204-
m_subMatches.emplace_back(m_str);
205-
m_insideCapture = true;
204+
m_subMatches[groupId].setStart(index);
206205
}
207-
m_subMatches.back().setStart(index);
208206
}
209-
void endCapture(size_t index)
207+
void endCapture(size_t groupId,size_t index)
210208
{
211-
if (index>m_subMatches.back().position())
209+
if (groupId < m_subMatches.size())
212210
{
213-
m_captureIndex=0;
214-
m_subMatches.back().setEnd(index);
215-
m_insideCapture = false;
211+
if (index>m_subMatches[groupId].position())
212+
{
213+
m_subMatches[groupId].setEnd(index);
214+
}
216215
}
217216
}
218217
void setMatch(size_t pos,size_t len)
219218
{
220-
m_subMatches[m_captureIndex].setMatch(pos,len);
219+
// Always set the whole match
220+
m_subMatches[0].setMatch(pos,len);
221221
}
222222

223223
std::vector<SubMatch> m_subMatches;
224-
size_t m_captureIndex=0;
225224
std::string_view m_str;
226-
bool m_insideCapture=false;
227225
};
228226

229227
/** Class to iterate through matches.
@@ -335,4 +333,4 @@ std::string replace(std::string_view str,const Ex &re,std::string_view replaceme
335333

336334
} // namespace
337335

338-
#endif
336+
#endif

0 commit comments

Comments
 (0)