Speed up datacard parsing

Two regular expressions were applied to every line in the card, and both were slower than doing the equivalent transformation by hand in Python:
- `\\s*#.*` removes comments and whitespace at the end of the line
- `(?<=\\s)-+(\\s|$)` converts any ` -` entries to ` 0`

The latter was applied universally, whereas now it is only checked for parameter effect size arguments. This means that the interpretation of arguments for nuisances such as `gmN` is now more restricted.
cms-analysis · Aug 23, 2022 · 840d8e0 · 840d8e0
1 parent da84905
commit 840d8e0
Showing 1 changed file with 13 additions and 3 deletions.
diff --git a/python/DatacardParser.py b/python/DatacardParser.py
@@ -282,6 +282,14 @@ def addDatacardParserOptions(parser):
     )
 
 
+def strip(l):
+    """Strip comments and whitespace from end of line"""
+    idx = l.find("#")
+    if idx > 0:
+        return l[:idx].rstrip()
+    return l.rstrip()
+
+
 def isVetoed(name, vetoList):
     for pattern in vetoList:
         if not pattern:
@@ -453,10 +461,10 @@ def parseCard(file, options):
                 break  # rate is the last line before nuisances
         # parse nuisances
         for lineNumber2, l in enumerate(file):
-            if l.startswith("--"):
+            if l.startswith("--") or l.startswith("#"):
                 continue
-            l = re.sub("\\s*#.*", "", l)
-            l = re.sub("(?<=\\s)-+(\\s|$)", " 0\\1", l)
+
+            l = strip(l)
             f = l.split()
             if len(f) <= 1:
                 continue
@@ -623,6 +631,8 @@ def parseCard(file, options):
                         if v <= 0.00:
                             raise ValueError('Found "%s" in the nuisances affecting %s for %s. This would lead to NANs later on, so please fix it.' % (r, p, b))
                 else:
+                    if r == "-" * len(r):
+                        r = 0.0
                     errline[b][p] = float(r)
                     # values of 0.0 are treated as 1.0; scrap negative values.
                     if pdf not in ["trG", "dFD", "dFD2"] and errline[b][p] < 0: