From 0a207528150c1204ad0d1e71900bfd3e4b3188cc Mon Sep 17 00:00:00 2001
From: Mathis Frahm <49306645+mafrahm@users.noreply.github.com>
Date: Fri, 16 Feb 2024 16:53:06 +0100
Subject: [PATCH] updates to the CreateYieldTable task (#396)

* updates to the CreateYieldTable task

* rename suffix parameter to output_suffix

---------

Co-authored-by: Marcel Rieger <riga@users.noreply.github.com>
---
 columnflow/tasks/yields.py | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/columnflow/tasks/yields.py b/columnflow/tasks/yields.py
index 2e959dbb6..00985ea16 100644
--- a/columnflow/tasks/yields.py
+++ b/columnflow/tasks/yields.py
@@ -57,6 +57,10 @@ class CreateYieldTable(
         description="string parameter to define the normalization of the yields; "
         "choices: '', per_process, per_category, all; empty default",
     )
+    output_suffix = luigi.Parameter(
+        default=law.NO_STR,
+        description="Adds a suffix to the output name of the yields table; empty default",
+    )
 
     # upstream requirements
     reqs = Requirements(
@@ -105,9 +109,13 @@ def resolve_param_values(cls, params):
         return params
 
     def output(self):
-        # TODO: add a "yield" output that gives the raw numbers, e.g. as a csv
+        suffix = ""
+        if self.output_suffix and self.output_suffix != law.NO_STR:
+            suffix = f"__{self.output_suffix}"
+
         return {
-            "table": self.target(f"yields__proc_{self.processes_repr}__cat_{self.categories_repr}.txt"),
+            "table": self.target(f"table__proc_{self.processes_repr}__cat_{self.categories_repr}{suffix}.txt"),
+            "yields": self.target(f"yields__proc_{self.processes_repr}__cat_{self.categories_repr}{suffix}.json"),
         }
 
     @law.decorator.log
@@ -176,7 +184,7 @@ def run(self):
 
             # read out yields per category and per process
             for process_inst, h in hists.items():
-                processes.append(process_inst.label)
+                processes.append(process_inst)
 
                 for category_inst in category_insts:
                     leaf_category_insts = category_inst.get_leaf_categories() or [category_inst]
@@ -195,7 +203,7 @@ def run(self):
                             f"mcstat_{process_inst.name}_{category_inst.name}",
                             math.sqrt(h_cat.variance),
                         )
-                    yields[category_inst.label].append(value)
+                    yields[category_inst].append(value)
 
             # obtain normalizaton factors
             norm_factors = 1
@@ -207,7 +215,7 @@ def run(self):
             elif self.normalize_yields == "per_process":
                 norm_factors = [
                     sum(yields[category][i] for category in yields.keys())
-                    for i in range(len(yields[category_insts[0].label]))
+                    for i in range(len(yields[category_insts[0]]))
                 ]
             elif self.normalize_yields == "per_category":
                 norm_factors = {
@@ -215,7 +223,9 @@ def run(self):
                     for category, category_yields in yields.items()
                 }
 
-            yields_str = defaultdict(list, {"Process": processes})
+            # initialize dicts
+            yields_str = defaultdict(list, {"Process": [proc.label for proc in processes]})
+            raw_yields = defaultdict(dict, {})
 
             # apply normalization and format
             for category, category_yields in yields.items():
@@ -228,15 +238,22 @@ def run(self):
                     else:
                         norm_factor = norm_factors
 
+                    raw_yield = (value / norm_factor).nominal
+                    raw_yields[category.name][processes[i].name] = raw_yield
+
                     # format yields into strings
-                    yields_str[category].append((value / norm_factor).str(
+                    yield_str = (value / norm_factor).str(
                         combine_uncs="all",
                         format=self.number_format,
                         style="latex" if "latex" in self.table_format else "plain",
-                    ))
+                    )
+                    if "latex" in self.table_format:
+                        yield_str = f"${yield_str}$"
+                    yields_str[category.label].append(yield_str)
 
             # create, print and save the yield table
             yield_table = tabulate(yields_str, headers="keys", tablefmt=self.table_format)
             self.publish_message(yield_table)
 
             outputs["table"].dump(yield_table, formatter="text")
+            outputs["yields"].dump(raw_yields, formatter="json")