Merge branch 'master' of github.com:haddadanas/columnflow into Docs

haddadanas · Dec 19, 2023 · 2099233 · 2099233
2 parents 9bb6809 + f901c9e
commit 2099233
Show file tree

Hide file tree

Showing 20 changed files with 929 additions and 148 deletions.
diff --git a/.gitignore b/.gitignore
@@ -34,3 +34,5 @@ data
 .data
 .law
 .setups
+.mypy_cache
+.vscode
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -8,7 +8,6 @@ build:
     pre_create_environment:
       - bash setup.sh ""
 
-
 submodules:
   include: all
   recursive: true

diff --git a/analysis_templates/cms_minimal/.gitignore b/analysis_templates/cms_minimal/.gitignore
@@ -34,3 +34,5 @@ data
 .data
 .law
 .setups
+.mypy_cache
+.vscode
diff --git a/analysis_templates/cms_minimal/setup.sh b/analysis_templates/cms_minimal/setup.sh
@@ -70,7 +70,7 @@ setup___cf_short_name_lc__() {
     CF_SKIP_SETUP="1" source "${CF_BASE}/setup.sh" "" || return "$?"
 
     # interactive setup
-    if [ "${CF_REMOTE_JOB}" != "1" ]; then
+    if [ "${CF_REMOTE_ENV}" != "1" ]; then
         cf_setup_interactive_body() {
             # pre-export the CF_FLAVOR which will be cms
             export CF_FLAVOR="cms"
@@ -88,7 +88,6 @@ setup___cf_short_name_lc__() {
     export CF_CONDA_BASE="${CF_CONDA_BASE:-${CF_SOFTWARE_BASE}/conda}"
     export CF_VENV_BASE="${CF_VENV_BASE:-${CF_SOFTWARE_BASE}/venvs}"
     export CF_CMSSW_BASE="${CF_CMSSW_BASE:-${CF_SOFTWARE_BASE}/cmssw}"
-    export CF_CI_JOB="$( [ "${GITHUB_ACTIONS}" = "true" ] && echo 1 || echo 0 )"
 
 
     #

diff --git a/columnflow/__init__.py b/columnflow/__init__.py
@@ -23,7 +23,31 @@
 m = re.match(r"^(\d+)\.(\d+)\.(\d+)(-.+)?$", __version__)
 version = tuple(map(int, m.groups()[:3])) + (m.group(4),)
 
-# cf flavor
+#: Boolean denoting whether the environment is in a local environment (based on ``CF_LOCAL_ENV``).
+env_is_local = law.util.flag_to_bool(os.getenv("CF_LOCAL_ENV", "0"))
+
+#: Boolean denoting whether the environment is in a remote job (based on ``CF_REMOTE_ENV``).
+env_is_remote = law.util.flag_to_bool(os.getenv("CF_REMOTE_ENV", "0"))
+
+#: Boolean denoting whether the environment is in a remote job on the WLCG (based on ``CF_ON_GRID``).
+env_is_grid = law.util.flag_to_bool(os.getenv("CF_ON_GRID", "0"))
+
+#: Boolean denoting whether the environment is in a remote job on a HTCondor cluster (based on ``CF_ON_HTCONDOR``).
+env_is_htcondor = law.util.flag_to_bool(os.getenv("CF_ON_HTCONDOR", "0"))
+
+#: Boolean denoting whether the environment is in a remote job on a Slurm cluster (based on ``CF_ON_SLURM``).
+env_is_slurm = law.util.flag_to_bool(os.getenv("CF_ON_SLURM", "0"))
+
+#: Boolean denoting whether the environment is in a CI env (based on ``CF_CI_ENV``).
+env_is_ci = law.util.flag_to_bool(os.getenv("CF_CI_ENV", "0"))
+
+#: Boolean denoting whether the environment is in a readthedocs env (based on ``CF_RTD_ENV``, or ``READTHEDOCS``).
+env_is_rtd = law.util.flag_to_bool(os.getenv("CF_RTD_ENV" if "CF_RTD_ENV" in os.environ else "READTHEDOCS", "0"))
+
+#: Boolean denoting whether the environment is used for development (based on ``CF_DEV``).
+env_is_dev = not env_is_remote and law.util.flag_to_bool(os.getenv("CF_DEV", "0"))
+
+#: String referring to the "flavor" of the cf setup.
 flavor = os.getenv("CF_FLAVOR")
 if isinstance(flavor, str):
     flavor = flavor.lower()
@@ -40,57 +64,59 @@
 # some core tasks (BundleCMSSW) need the cms contrib package, to be refactored, see #155
 law.contrib.load("cms")
 
-# initialize wlcg file systems once so that their cache cleanup is triggered if configured
-if law.config.has_option("outputs", "wlcg_file_systems"):
-    wlcg_file_systems = [
-        law.wlcg.WLCGFileSystem(fs.strip())
-        for fs in law.config.get_expanded("outputs", "wlcg_file_systems", [], split_csv=True)
-    ]
-
-# initialize producers, calibrators, selectors, categorizers, ml models and stat models
-from columnflow.util import maybe_import
-
-import columnflow.production  # noqa
-if law.config.has_option("analysis", "production_modules"):
-    for m in law.config.get_expanded("analysis", "production_modules", [], split_csv=True):
-        logger.debug(f"loading production module '{m}'")
-        maybe_import(m.strip())
-
-import columnflow.calibration  # noqa
-if law.config.has_option("analysis", "calibration_modules"):
-    for m in law.config.get_expanded("analysis", "calibration_modules", [], split_csv=True):
-        logger.debug(f"loading calibration module '{m}'")
-        maybe_import(m.strip())
-
-import columnflow.selection  # noqa
-if law.config.has_option("analysis", "selection_modules"):
-    for m in law.config.get_expanded("analysis", "selection_modules", [], split_csv=True):
-        logger.debug(f"loading selection module '{m}'")
-        maybe_import(m.strip())
-
-import columnflow.categorization  # noqa
-if law.config.has_option("analysis", "categorization_modules"):
-    for m in law.config.get_expanded("analysis", "categorization_modules", [], split_csv=True):
-        logger.debug(f"loading categorization module '{m}'")
-        maybe_import(m.strip())
-
-import columnflow.ml  # noqa
-if law.config.has_option("analysis", "ml_modules"):
-    for m in law.config.get_expanded("analysis", "ml_modules", [], split_csv=True):
-        logger.debug(f"loading ml module '{m}'")
-        maybe_import(m.strip())
-
-import columnflow.inference  # noqa
-if law.config.has_option("analysis", "inference_modules"):
-    for m in law.config.get_expanded("analysis", "inference_modules", [], split_csv=True):
-        logger.debug(f"loading inference module '{m}'")
-        maybe_import(m.strip())
-
-# preload all task modules so that task parameters are globally known and accepted
-if law.config.has_section("modules"):
-    for m in law.config.options("modules"):
-        logger.debug(f"loading task module '{m}'")
-        maybe_import(m.strip())
-
-# cleanup
-del m
+# initialize various objects
+if not env_is_rtd:
+    # initialize wlcg file systems once so that their cache cleanup is triggered if configured
+    if law.config.has_option("outputs", "wlcg_file_systems"):
+        wlcg_file_systems = [
+            law.wlcg.WLCGFileSystem(fs.strip())
+            for fs in law.config.get_expanded("outputs", "wlcg_file_systems", [], split_csv=True)
+        ]
+
+    # initialize producers, calibrators, selectors, categorizers, ml models and stat models
+    from columnflow.util import maybe_import
+
+    import columnflow.production  # noqa
+    if law.config.has_option("analysis", "production_modules"):
+        for m in law.config.get_expanded("analysis", "production_modules", [], split_csv=True):
+            logger.debug(f"loading production module '{m}'")
+            maybe_import(m.strip())
+
+    import columnflow.calibration  # noqa
+    if law.config.has_option("analysis", "calibration_modules"):
+        for m in law.config.get_expanded("analysis", "calibration_modules", [], split_csv=True):
+            logger.debug(f"loading calibration module '{m}'")
+            maybe_import(m.strip())
+
+    import columnflow.selection  # noqa
+    if law.config.has_option("analysis", "selection_modules"):
+        for m in law.config.get_expanded("analysis", "selection_modules", [], split_csv=True):
+            logger.debug(f"loading selection module '{m}'")
+            maybe_import(m.strip())
+
+    import columnflow.categorization  # noqa
+    if law.config.has_option("analysis", "categorization_modules"):
+        for m in law.config.get_expanded("analysis", "categorization_modules", [], split_csv=True):
+            logger.debug(f"loading categorization module '{m}'")
+            maybe_import(m.strip())
+
+    import columnflow.ml  # noqa
+    if law.config.has_option("analysis", "ml_modules"):
+        for m in law.config.get_expanded("analysis", "ml_modules", [], split_csv=True):
+            logger.debug(f"loading ml module '{m}'")
+            maybe_import(m.strip())
+
+    import columnflow.inference  # noqa
+    if law.config.has_option("analysis", "inference_modules"):
+        for m in law.config.get_expanded("analysis", "inference_modules", [], split_csv=True):
+            logger.debug(f"loading inference module '{m}'")
+            maybe_import(m.strip())
+
+    # preload all task modules so that task parameters are globally known and accepted
+    if law.config.has_section("modules"):
+        for m in law.config.options("modules"):
+            logger.debug(f"loading task module '{m}'")
+            maybe_import(m.strip())
+
+    # cleanup
+    del m
diff --git a/columnflow/config_util.py b/columnflow/config_util.py
@@ -25,8 +25,50 @@
 import law
 import order as od
 
+from columnflow.util import maybe_import
 from columnflow.types import Callable, Any, Sequence
 
+ak = maybe_import("awkward")
+np = maybe_import("numpy")
+
+
+def get_events_from_categories(
+    events: ak.Array,
+    categories: Sequence[str | od.Category],
+    config_inst: od.Config | None = None,
+) -> ak.Array:
+    """
+    Helper function that returns all events from an awkward array *events* that are categorized
+    into one of the leaf categories of one of the *categories*.
+
+    :param events: Awkward array. Requires the 'category_ids' field to be present.
+    :param categories: Sequence of category instances. Can also be a sequence of strings when passing a
+        *config_inst*.
+    :param config_inst: Optional config instance to load category instances.
+    :raises ValueError: If "category_ids" is not present in the *events* fields.
+    :return: Awkward array of all events that are categorized into a leaf category of one of the
+        *categories*. No event is selected when *categories* is empty.
+    """
+    if "category_ids" not in events.fields:
+        raise ValueError(
+            f"{get_events_from_categories.__name__} requires the 'category_ids' field to be present",
+        )
+
+    categories = law.util.make_list(categories)
+    if config_inst:
+        # get category insts
+        categories = [config_inst.get_category(cat) for cat in categories]
+
+    # unique leaf categories (a category without leafs counts itself); set().union() tolerates no input
+    leaf_category_insts = set().union(*(cat.get_leaf_categories() or [cat] for cat in categories))
+
+    # do the "or" of all leaf categories
+    mask = np.zeros(len(events), dtype=bool)
+    for cat in leaf_category_insts:
+        cat_mask = ak.any(events.category_ids == cat.id, axis=1)
+        mask = cat_mask | mask
+
+    return events[mask]
+
 
 def get_root_processes_from_campaign(campaign: od.Campaign) -> od.UniqueObjectIndex:
     """
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,3 +34,5 @@ data @@
     .data
     .law
     .setups
+    .mypy_cache
+    .vscode