Merge branch 'master' of github.com:columnflow/columnflow into docs_e…

…xample_py Conflicts: columnflow/tasks/framework/mixins.py
haddadanas · May 23, 2024 · d60f8ec · d60f8ec
2 parents 61bf212 + da009d3
commit d60f8ec
Show file tree

Hide file tree

Showing 26 changed files with 929 additions and 183 deletions.
diff --git a/analysis_templates/cms_minimal/__cf_module_name__/config/analysis___cf_short_name_lc__.py b/analysis_templates/cms_minimal/__cf_module_name__/config/analysis___cf_short_name_lc__.py
@@ -4,17 +4,14 @@
 Configuration of the __cf_analysis_name__ analysis.
 """
 
-import functools
-
 import law
 import order as od
 from scinum import Number
 
 from columnflow.util import DotDict, maybe_import
 from columnflow.columnar_util import EMPTY_FLOAT, ColumnCollection
 from columnflow.config_util import (
-    get_root_processes_from_campaign, add_shift_aliases, get_shifts_from_sources, add_category,
-    verify_config_processes,
+    get_root_processes_from_campaign, add_shift_aliases, add_category, verify_config_processes,
 )
 
 ak = maybe_import("awkward")
@@ -112,6 +109,7 @@
 cfg.x.default_calibrator = "example"
 cfg.x.default_selector = "example"
 cfg.x.default_producer = "example"
+cfg.x.default_weight_producer = "example"
 cfg.x.default_ml_model = None
 cfg.x.default_inference_model = "example"
 cfg.x.default_categories = ("incl",)
@@ -258,13 +256,6 @@
     },
 })
 
-# event weight columns as keys in an OrderedDict, mapped to shift instances they depend on
-get_shifts = functools.partial(get_shifts_from_sources, cfg)
-cfg.x.event_weights = DotDict({
-    "normalization_weight": [],
-    "muon_weight": get_shifts("mu"),
-})
-
 # versions per task family, either referring to strings or to callables receving the invoking
 # task instance and parameters to be passed to the task family
 cfg.x.versions = {

diff --git a/analysis_templates/cms_minimal/__cf_module_name__/weight/__init__.py b/analysis_templates/cms_minimal/__cf_module_name__/weight/__init__.py
@@ -0,0 +1 @@
+# coding: utf-8
diff --git a/analysis_templates/cms_minimal/__cf_module_name__/weight/example.py b/analysis_templates/cms_minimal/__cf_module_name__/weight/example.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+
+"""
+Example event weight producer.
+"""
+
+from columnflow.weight import WeightProducer, weight_producer
+from columnflow.util import maybe_import
+from columnflow.config_util import get_shifts_from_sources
+from columnflow.columnar_util import Route
+
+ak = maybe_import("awkward")
+np = maybe_import("numpy")
+
+
+@weight_producer(
+    # both used columns and dependent shifts are defined in init below
+    # only run on mc
+    mc_only=True,
+)
+def example(self: WeightProducer, events: ak.Array, **kwargs) -> ak.Array:
+    # build the full event weight
+    weight = ak.Array(np.ones(len(events), dtype=np.float32))
+    for column in self.weight_columns:
+        weight = weight * Route(column).apply(events)
+
+    return events, weight
+
+
+@example.init
+def example_init(self: WeightProducer) -> None:
+    # store column names referring to weights to multiply
+    self.weight_columns = {
+        "normalization_weight",
+        "muon_weight",
+    }
+    self.uses |= self.weight_columns
+
+    # declare shifts that the produced event weight depends on
+    shift_sources = {
+        "mu",
+    }
+    self.shifts |= set(get_shifts_from_sources(self.config_inst, *shift_sources))
diff --git a/columnflow/__init__.py b/columnflow/__init__.py
@@ -82,6 +82,12 @@
             logger.debug(f"loading production module '{m}'")
             maybe_import(m.strip())
 
+    import columnflow.weight  # noqa
+    if law.config.has_option("analysis", "weight_production_modules"):
+        for m in law.config.get_expanded("analysis", "weight_production_modules", [], split_csv=True):
+            logger.debug(f"loading weight production module '{m}'")
+            maybe_import(m.strip())
+
     import columnflow.calibration  # noqa
     if law.config.has_option("analysis", "calibration_modules"):
         for m in law.config.get_expanded("analysis", "calibration_modules", [], split_csv=True):

diff --git a/columnflow/plotting/plot_util.py b/columnflow/plotting/plot_util.py
@@ -14,7 +14,7 @@
 
 import order as od
 
-from columnflow.util import maybe_import, try_int
+from columnflow.util import maybe_import, try_int, try_complex
 
 math = maybe_import("math")
 hist = maybe_import("hist")
@@ -122,6 +122,17 @@ def apply_variable_settings(
                 h = h[{var_inst.name: hist.rebin(rebin_factor)}]
                 hists[proc_inst] = h
 
+        slices = getattr(var_inst, "slice", None) or var_inst.x("slice", None)
+        if (
+            slices and isinstance(slices, Iterable) and len(slices) >= 2 and
+            try_complex(slices[0]) and try_complex(slices[1])
+        ):
+            slice_0 = int(slices[0]) if try_int(slices[0]) else complex(slices[0])
+            slice_1 = int(slices[1]) if try_int(slices[1]) else complex(slices[1])
+            for proc_inst, h in list(hists.items()):
+                h = h[{var_inst.name: slice(slice_0, slice_1)}]
+                hists[proc_inst] = h
+
     return hists
 
 

diff --git a/columnflow/production/categories.py b/columnflow/production/categories.py
@@ -89,3 +89,6 @@ def category_ids_init(self: Producer) -> None:
             self.produces.add(categorizer)
 
             self.categorizer_map[cat_inst].append(categorizer)
+
+    # cast to normal dict to prevent silent failures on KeyError
+    self.categorizer_map = dict(self.categorizer_map)
diff --git a/columnflow/production/cms/btag.py b/columnflow/production/cms/btag.py
@@ -6,17 +6,21 @@
 
 from __future__ import annotations
 
+import law
+
 from columnflow.production import Producer, producer
 from columnflow.util import maybe_import, InsertableDict
 from columnflow.columnar_util import set_ak_column, flat_np_view, layout_ak_array
 
 np = maybe_import("numpy")
 ak = maybe_import("awkward")
 
+logger = law.logger.get_logger(__name__)
+
 
 @producer(
     uses={
-        "Jet.hadronFlavour", "Jet.eta", "Jet.pt", "Jet.btagDeepFlavB",
+        "Jet.hadronFlavour", "Jet.eta", "Jet.pt",
     },
     # only run on mc
     mc_only=True,
@@ -29,6 +33,8 @@ def btag_weights(
     self: Producer,
     events: ak.Array,
     jet_mask: ak.Array | type(Ellipsis) = Ellipsis,
+    negative_b_score_action: str = "ignore",
+    negative_b_score_log_mode: str = "warning",
     **kwargs,
 ) -> ak.Array:
     """
@@ -44,31 +50,90 @@ def btag_weights(
     *get_btag_file* can be adapted in a subclass in case it is stored differently in the external
     files.
 
-    The name of the correction set as well as a list of JEC uncertainty sources which should be
-    propagated through the weight calculation should be given as an auxiliary entry in the config:
+    The name of the correction set, a list of JEC uncertainty sources which should be
+    propagated through the weight calculation, and the column used for b-tagging should
+    be given as an auxiliary entry in the config:
 
     .. code-block:: python
 
-        cfg.x.btag_sf = ("deepJet_shape", ["Absolute", "FlavorQCD", ...])
+        cfg.x.btag_sf = ("deepJet_shape", ["Absolute", "FlavorQCD", ...], "btagDeepFlavB")
 
     *get_btag_config* can be adapted in a subclass in case it is stored differently in the config.
 
     Optionally, a *jet_mask* can be supplied to compute the scale factor weight based only on a
     subset of jets.
 
+    The *negative_b_score_action* defines the procedure of how to handle jets with a negative b-tag.
+    Supported modes are:
+
+        - "ignore": the *jet_mask* is extended to exclude jets with b_score < 0
+        - "remove": the btag_weight is set to 0 for jets with b_score < 0
+        - "raise": an exception is raised
+
+    The verbosity of the handling of jets with negative b-score can be
+    set via *negative_b_score_log_mode*, which offers the following options:
+
+        - ``"none"``: no message is given
+        - ``"info"``: a `logger.info` message is given
+        - ``"debug"``: a `logger.debug` message is given
+        - ``"warning"``: a `logger.warning` message is given
+
     Resources:
 
        - https://twiki.cern.ch/twiki/bin/view/CMS/BTagShapeCalibration?rev=26
        - https://indico.cern.ch/event/1096988/contributions/4615134/attachments/2346047/4000529/Nov21_btaggingSFjsons.pdf
     """
+    known_actions = ("ignore", "remove", "raise")
+    if negative_b_score_action not in known_actions:
+        raise ValueError(
+            f"unknown negative_b_score_action '{negative_b_score_action}', "
+            f"known values are {','.join(known_actions)}",
+        )
+
+    known_log_modes = ("none", "info", "debug", "warning")
+    if negative_b_score_log_mode not in known_log_modes:
+        raise ValueError(
+            f"unknown negative_b_score_log_mode '{negative_b_score_log_mode}', "
+            f"known values are {','.join(known_log_modes)}",
+        )
+
     # get the total number of jets in the chunk
     n_jets_all = ak.sum(ak.num(events.Jet, axis=1))
 
+    # check that the b-tag score is not negative for all jets considered in the SF calculation
+    jets_negative_b_score = events.Jet[self.b_score_column][jet_mask] < 0
+    if ak.any(jets_negative_b_score):
+        msg_func = {
+            "none": lambda msg: None,
+            "info": logger.info,
+            "warning": logger.warning,
+            "debug": logger.debug,
+        }[negative_b_score_log_mode]
+        msg = f"In dataset {self.dataset_inst.name}, {ak.sum(jets_negative_b_score)} jets have a negative b-tag score."
+
+        if negative_b_score_action == "ignore":
+            msg_func(
+                f"{msg} The *jet_mask* will be adjusted to exclude these jets, resulting in a "
+                "*btag_weight* of 1 for these jets.",
+            )
+        elif negative_b_score_action == "remove":
+            msg_func(
+                f"{msg} The *btag_weight* will be set to 0 for these jets.",
+            )
+        elif negative_b_score_action == "raise":
+            raise Exception(msg)
+
+        # set jet mask to False when b_score is negative
+        if jet_mask is Ellipsis:
+            jet_mask = (events.Jet[self.b_score_column] >= 0)
+        else:
+            jet_mask = jet_mask & (events.Jet[self.b_score_column] >= 0)
+
     # get flat inputs, evaluated at jet_mask
     flavor = flat_np_view(events.Jet.hadronFlavour[jet_mask], axis=1)
     abs_eta = flat_np_view(abs(events.Jet.eta[jet_mask]), axis=1)
     pt = flat_np_view(events.Jet.pt[jet_mask], axis=1)
-    b_discr = flat_np_view(events.Jet.btagDeepFlavB[jet_mask], axis=1)
+    b_discr = flat_np_view(events.Jet[self.b_score_column][jet_mask], axis=1)
 
     # helper to create and store the weight
     def add_weight(syst_name, syst_direction, column_name):
@@ -102,6 +167,11 @@ def add_weight(syst_name, syst_direction, column_name):
 
         # enforce the correct shape and create the product over all jets per event
         sf = layout_ak_array(sf_flat_all, events.Jet.pt)
+
+        if negative_b_score_log_mode == "remove":
+            # set the weight to 0 for jets with negative btag score
+            sf = ak.where(jets_negative_b_score, 0, sf)
+
         weight = ak.prod(sf, axis=1, mask_identity=False)
 
         # save the new column
@@ -121,6 +191,14 @@ def add_weight(syst_name, syst_direction, column_name):
                     direction,
                     f"btag_weight_{name}_{direction}",
                 )
+                if syst_name in ["cferr1", "cferr2"]:
+                    # for c flavor uncertainties, multiply the uncertainty with the nominal btag weight
+                    events = set_ak_column(
+                        events,
+                        f"btag_weight_{name}_{direction}",
+                        events.btag_weight * events[f"btag_weight_{name}_{direction}"],
+                        value_type=np.float32,
+                    )
     elif self.shift_is_known_jec_source:
         # TODO: year dependent jec variations fully covered?
         events = add_weight(
@@ -135,6 +213,40 @@ def add_weight(syst_name, syst_direction, column_name):
     return events
 
 
+def get_b_score_column(btag_config: tuple[str, list, str] | tuple[str, list]) -> str:
+    """
+    Helper function to resolve the btag score column from the btag configuration.
+
+    :param btag_config: Entry in auxiliary `config_inst.x.btag_sf`, see example
+    :py:meth:`~columflow.production.cms.btag.btag_weights`. If tuple has less
+    than 3 entries, the column name is derived from the name of the correction set.
+    :returns: Name of column that is required for the calculation of this set of corrections.
+    """
+    corrector_name = btag_config[0]
+    if len(btag_config) >= 3:
+        b_score_column = btag_config[2]
+    else:
+        # resolve the column name from the corrector name
+        if "deepjet" in corrector_name.lower():
+            b_score_column = "btagDeepFlavB"
+        elif "particlenet" in corrector_name.lower():
+            b_score_column = "btagPNetB"
+        else:
+            raise NotImplementedError(f"Cannot automatically determine btag column for Corrector '{corrector_name}'")
+        logger.info(f"No btag column specified; defaulting to column '{b_score_column}'")
+
+    # warn about potentially wrong column usage
+    if (
+        "deepjet" in corrector_name.lower() and "pnet" in b_score_column or
+        "particlenet" in corrector_name.lower() and "deepflav" in b_score_column
+    ):
+        logger.warning(
+            f"Using btag column '{b_score_column}' for BTag Corrector '{corrector_name}' is highly discouraged",
+        )
+
+    return b_score_column
+
+
 @btag_weights.init
 def btag_weights_init(self: Producer) -> None:
     # depending on the requested shift_inst, there are three cases to handle:
@@ -143,6 +255,11 @@ def btag_weights_init(self: Producer) -> None:
     #   2. when the nominal shift is requested, the central weight and all variations related to the
     #      method-intrinsic shifts are produced
     #   3. when any other shift is requested, only create the central weight column
+    btag_config = self.get_btag_config()
+    self.b_score_column = get_b_score_column(btag_config)
+
+    self.uses.add(f"Jet.{self.b_score_column}")
+
     shift_inst = getattr(self, "local_shift_inst", None)
     if not shift_inst:
         return
@@ -152,7 +269,7 @@ def btag_weights_init(self: Producer) -> None:
     btag_sf_jec_source = "" if self.jec_source == "Total" else self.jec_source
     self.shift_is_known_jec_source = (
         self.jec_source and
-        btag_sf_jec_source in self.get_btag_config()[1]
+        btag_sf_jec_source in btag_config[1]
     )
 
     # save names of method-intrinsic uncertainties
@@ -210,7 +327,3 @@ def btag_weights_setup(
     )
     corrector_name = self.get_btag_config()[0]
     self.btag_sf_corrector = correction_set[corrector_name]
-
-    # check versions
-    if self.btag_sf_corrector.version not in (3,):
-        raise Exception(f"unsuppprted btag sf corrector version {self.btag_sf_corrector.version}")