Add support for optional capture ranges to reg::Ex class.

doxygen · doxygen · commit 77506a939fdb · 2025-09-19T21:03:39.000+02:00
diff --git a/src/regex.cpp b/src/regex.cpp
@@ -1,6 +1,6 @@
 /******************************************************************************
  *
- * Copyright (C) 1997-2021 by Dimitri van Heesch.
+ * Copyright (C) 1997-2025 by Dimitri van Heesch.
  *
  * Permission to use, copy, modify, and distribute this software and its
  * documentation under the terms of the GNU General Public License is hereby
@@ -520,38 +520,102 @@ bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,M
     size_t startIndex = index;
     size_t len = str.length();
     PToken tok = data[++tokenPos];
-    if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's
+
+    // Special handling for an optional capture group: (...)?
+    if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
+    {
+      size_t groupId = tok.value();
+      size_t innerStart = tokenPos + 1;
+
+      // Find matching EndCapture, accounting for nesting depth
+      size_t tp = innerStart;
+      int depth = 1;
+      while (tp<tokenLen && depth>0)
+      {
+        if (data[tp].kind()==PToken::Kind::BeginCapture) depth++;
+        else if (data[tp].kind()==PToken::Kind::EndCapture) depth--;
+        tp++;
+      }
+      if (depth!=0) return false; // malformed: no EndCapture closes the opening '('
+      size_t endCapturePos = tp - 1;            // position of EndCapture
+      size_t afterSeqPos   = endCapturePos + 2; // skip EndCapture and End marker
+
+      // Try with the group present
+      Match tmp;
+      tmp.init(str, /*captureCount*/ captureCount);
+      bool innerOk = matchAt(innerStart,endCapturePos,str,tmp,index,level+1);
+      if (innerOk)
+      {
+        size_t capLen = tmp.length();
+
+        // Copy nested captures from tmp (they may exist inside the group)
+        for (size_t gid=1; gid<tmp.size(); gid++)
+        {
+          size_t sp = tmp[gid].position();
+          size_t sl = tmp[gid].length();
+          if (sp!=std::string::npos && sl!=std::string::npos)
+          {
+            match.startCapture(gid,sp);
+            match.endCapture(gid,sp+sl);
+          }
+        }
+        // Set the outer group's capture
+        match.startCapture(groupId,index);
+        match.endCapture(groupId,index+capLen);
+
+        bool ok = matchAt(afterSeqPos,tokenLen,str,match,index+capLen,level+1);
+        if (ok)
+        {
+          match.setMatch(pos,(index+capLen)-pos+match.length());
+          return true;
+        }
+      }
+
+      // Try with the group absent (empty capture)
+      match.startCapture(groupId,index);
+      match.endCapture(groupId,index); // zero-length
+
+      bool ok2 = matchAt(afterSeqPos,tokenLen,str,match,index,level+1);
+      if (ok2)
+      {
+        match.setMatch(pos,index-pos+match.length());
+        return true;
+      }
+      return false;
+    }
+
+    if (tok.kind()==PToken::Kind::Character) // 'x*' or 'x?'
     {
       char c_tok = tok.asciiValue();
       while (index<len && str[index]==c_tok) { index++; if (type==Optional) break; }
       tokenPos++;
     }
-    else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters
+    else if (tok.isCharClass()) // '[a-f0-4]*' or '[...]?' -> eat matching characters
     {
       while (index<len && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
       tokenPos+=tok.value()+1; // skip over character ranges + end token
     }
-    else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters
+    else if (tok.kind()==PToken::Kind::Alpha) // '\a*' or '\a?' -> eat start id characters
     {
       while (index<len && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
       tokenPos++;
     }
-    else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters
+    else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' or '\w?' -> eat id characters
     {
       while (index<len && isIdChar(str[index])) { index++; if (type==Optional) break; }
       tokenPos++;
     }
-    else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces
+    else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' or '\s?' -> eat spaces
     {
       while (index<len && isspace(str[index])) { index++; if (type==Optional) break; }
       tokenPos++;
     }
-    else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits
+    else if (tok.kind()==PToken::Kind::Digit) // '\d*' or '\d?' -> eat digits
     {
       while (index<len && isdigit(str[index])) { index++; if (type==Optional) break; }
       tokenPos++;
     }
-    else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all
+    else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all, '.?' -> eat one character
     {
       if (type==Optional) index++; else index = str.length();
       tokenPos++;
diff --git a/src/regex.h b/src/regex.h
@@ -1,6 +1,6 @@
 /******************************************************************************
  *
- * Copyright (C) 1997-2021 by Dimitri van Heesch.
+ * Copyright (C) 1997-2025 by Dimitri van Heesch.
  *
  * Permission to use, copy, modify, and distribute this software and its
  * documentation under the terms of the GNU General Public License is hereby
@@ -83,8 +83,8 @@ class Ex
      *  @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
      *  meaning in a character range. `^` only has a special meaning as the first character.
      *
-     *  @note capture ranges can be nested. Quantifiers (`*`, `+`, `?`) on entire capture ranges
-     *  are not supported.
+     *  @note capture ranges can be nested. Optional capture ranges `(...)?` are supported but
+     *  repeated ranges `(...)*` or `(...)+` are not.
      *
      *  In Wildcard mode `*` is used to match any sequence of zero or more characters.
      *  The character `?` can be used to match an optional character. Character ranges are
@@ -208,7 +208,7 @@ class Match
     {
       if (groupId < m_subMatches.size())
       {
-        if (index>m_subMatches[groupId].position())
+        if (index>=m_subMatches[groupId].position())
         {
           m_subMatches[groupId].setEnd(index);
         }
@@ -333,4 +333,4 @@ std::string replace(std::string_view str,const Ex &re,std::string_view replaceme
 
 } // namespace
 
-#endif
+#endif

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`/******************************************************************************`
`2`	`2`	`*`
`3`		`- * Copyright (C) 1997-2021 by Dimitri van Heesch.`
	`3`	`+ * Copyright (C) 1997-2025 by Dimitri van Heesch.`
`4`	`4`	`*`
`5`	`5`	`* Permission to use, copy, modify, and distribute this software and its`
`6`	`6`	`* documentation under the terms of the GNU General Public License is hereby`
`@@ -83,8 +83,8 @@ class Ex`
`83`	`83`	* @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
`84`	`84`	* meaning in a character range. `^` only has a special meaning as the first character.
`85`	`85`	`*`
`86`		- * @note capture ranges can be nested. Quantifiers (`*`, `+`, `?`) on entire capture ranges
`87`		`- * are not supported.`
	`86`	+ * @note capture ranges can be nested. Optional capture ranges `(...)?` are supported but
	`87`	+ * repeated ranges `(...)*` or `(...)+` are not.
`88`	`88`	`*`
`89`	`89`	* In Wildcard mode `*` is used to match any sequence of zero or more characters.
`90`	`90`	* The character `?` can be used to match an optional character. Character ranges are
`@@ -208,7 +208,7 @@ class Match`
`208`	`208`	`{`
`209`	`209`	`if (groupId < m_subMatches.size())`
`210`	`210`	`{`
`211`		`- if (index>m_subMatches[groupId].position())`
	`211`	`+ if (index>=m_subMatches[groupId].position())`
`212`	`212`	`{`
`213`	`213`	`m_subMatches[groupId].setEnd(index);`
`214`	`214`	`}`
`@@ -333,4 +333,4 @@ std::string replace(std::string_view str,const Ex &re,std::string_view replaceme`
`333`	`333`
`334`	`334`	`} // namespace`
`335`	`335`
`336`		`-#endif`
	`336`	`+#endif`