From f7d04eb2048f7445454bf1c21589e0d8016bf13c Mon Sep 17 00:00:00 2001 From: rkansal47 Date: Wed, 6 Nov 2024 17:43:34 -0800 Subject: [PATCH 1/2] bug fix --- src/boostedhh/processors/SkimmerABC.py | 3 ++- src/boostedhh/processors/__init__.py | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/boostedhh/processors/SkimmerABC.py b/src/boostedhh/processors/SkimmerABC.py index 474b7f0..4fcde44 100644 --- a/src/boostedhh/processors/SkimmerABC.py +++ b/src/boostedhh/processors/SkimmerABC.py @@ -1,5 +1,6 @@ """ Skimmer Base Class - common functions for all skimmers. + Author(s): Raghav Kansal """ @@ -12,7 +13,7 @@ import numpy as np import pandas as pd from coffea import processor -from HH4b.hh_vars import LUMI +from boostedhh.hh_vars import LUMI from . import corrections diff --git a/src/boostedhh/processors/__init__.py b/src/boostedhh/processors/__init__.py index 9ca2393..6c43ea2 100644 --- a/src/boostedhh/processors/__init__.py +++ b/src/boostedhh/processors/__init__.py @@ -1,4 +1 @@ -from __future__ import annotations - -from .bbbbSkimmer import * # noqa: F403 -from .ttSkimmer import * # noqa: F403 +from __future__ import annotations \ No newline at end of file From 726d45a525c1f2ce51af6c2259ba362cd6d24d19 Mon Sep 17 00:00:00 2001 From: rkansal47 Date: Wed, 6 Nov 2024 17:51:29 -0800 Subject: [PATCH 2/2] corrections --- src/boostedhh/processors/__init__.py | 4 +- src/boostedhh/processors/corrections.py | 413 ++++++++++++++++++++++++ 2 files changed, 416 insertions(+), 1 deletion(-) create mode 100644 src/boostedhh/processors/corrections.py diff --git a/src/boostedhh/processors/__init__.py b/src/boostedhh/processors/__init__.py index 6c43ea2..e41c8b3 100644 --- a/src/boostedhh/processors/__init__.py +++ b/src/boostedhh/processors/__init__.py @@ -1 +1,3 @@ -from __future__ import annotations \ No newline at end of file +from __future__ import annotations + +from .SkimmerABC import SkimmerABC \ No newline at end of file diff --git a/src/boostedhh/processors/corrections.py b/src/boostedhh/processors/corrections.py new file mode 100644 index 0000000..aa7c932 --- /dev/null +++ b/src/boostedhh/processors/corrections.py @@ -0,0 +1,413 @@ +""" +Collection of utilities for corrections and systematics in processors. 
+
+Loosely based on https://github.com/jennetd/hbb-coffea/blob/master/boostedhiggs/corrections.py
+
+Most corrections retrieved from the cms-nanoAOD repo:
+See https://cms-nanoaod-integration.web.cern.ch/commonJSONSFs/
+
+Authors: Raghav Kansal, Cristina Suarez
+"""
+
+from __future__ import annotations
+
+import gzip
+import pathlib
+import pickle
+import random
+
+import awkward as ak
+import correctionlib
+import numpy as np
+import uproot
+from coffea.analysis_tools import Weights
+from coffea.nanoevents.methods import vector
+from coffea.nanoevents.methods.base import NanoEventsArray
+from coffea.nanoevents.methods.nanoaod import FatJetArray, JetArray
+
+from .utils import pad_val
+
+ak.behavior.update(vector.behavior)
+package_path = str(pathlib.Path(__file__).parent.parent.resolve())
+
+# Important Run 3 run-number boundaries
+FirstRun_2022C = 355794
+FirstRun_2022D = 357487
+LastRun_2022D = 359021
+FirstRun_2022E = 359022
+LastRun_2022F = 362180
+
+"""
+CorrectionLib files are available from: /cvmfs/cms.cern.ch/rsync/cms-nanoAOD/jsonpog-integration - synced daily
+"""
+pog_correction_path = "/cvmfs/cms.cern.ch/rsync/cms-nanoAOD/jsonpog-integration/"
+pog_jsons = {
+    "muon": ["MUO", "muon_Z.json.gz"],
+    "electron": ["EGM", "electron.json.gz"],
+    "pileup": ["LUM", "puWeights.json.gz"],
+    "fatjet_jec": ["JME", "fatJet_jerc.json.gz"],
+    "jet_jec": ["JME", "jet_jerc.json.gz"],
+    "jetveto": ["JME", "jetvetomaps.json.gz"],
+    "btagging": ["BTV", "btagging.json.gz"],
+}
+
+
+def get_UL_year(year: str) -> str:
+    return f"{year}_UL"
+
+
+def get_pog_json(obj: str, year: str) -> str:
+    try:
+        pog_json = pog_jsons[obj]
+    except KeyError as e:
+        # fail loudly instead of continuing with an undefined pog_json
+        raise ValueError(f"No json for {obj}") from e
+
+    year = get_UL_year(year) if year == "2018" else year
+    if "2022" in year or "2023" in year:
+        year = {
+            "2022": "2022_Summer22",
+            "2022EE": "2022_Summer22EE",
+            "2023": "2023_Summer23",
+            "2023BPix": "2023_Summer23BPix",
+        }[year]
+    return f"{pog_correction_path}/POG/{pog_json[0]}/{year}/{pog_json[1]}"
+
+
+def add_pileup_weight(weights: Weights, year: str, nPU: np.ndarray, dataset: str | None = None):
+    # clip nPU to [0, 99] to stay within the 100 bins of the pileup histograms
+    nPU = np.clip(nPU, 0, 99)
+
+    # dedicated high-pileup (Pu60 / Pu70) samples use custom pileup profiles
+    if dataset is not None and ("Pu60" in dataset or "Pu70" in dataset):
+        # pileup profile from data
+        path_pileup = package_path + "/corrections/data/MyDataPileupHistogram2022FG.root"
+        pileup_profile = uproot.open(path_pileup)["pileup"]
+        pileup_profile = pileup_profile.to_numpy()[0]
+        # normalise
+        pileup_profile /= pileup_profile.sum()
+
+        # https://indico.cern.ch/event/695872/contributions/2877123/attachments/1593469/2522749/pileup_ppd_feb_2018.pdf
+        # pileup profile from MC
+        pu_name = "Pu60" if "Pu60" in dataset else "Pu70"
+        path_pileup_dataset = package_path + f"/corrections/data/pileup/{pu_name}.npy"
+        pileup_MC = np.load(path_pileup_dataset)
+
+        # avoid division by zero in empty bins
+        pileup_MC[pileup_MC == 0.0] = 1
+        pileup_correction = pileup_profile / pileup_MC
+        # remove large MC reweighting factors to prevent artifacts
+        pileup_correction[pileup_correction > 10] = 10
+        sf = pileup_correction[nPU]
+        # no uncertainties
+        weights.add("pileup", sf)
+
+    else:
+        # https://twiki.cern.ch/twiki/bin/view/CMS/LumiRecommendationsRun3
+        values = {}
+
+        cset = correctionlib.CorrectionSet.from_file(get_pog_json("pileup", year))
+        corr = {
+            "2018": "Collisions18_UltraLegacy_goldenJSON",
+            "2022": "Collisions2022_355100_357900_eraBCD_GoldenJson",
+            "2022EE": "Collisions2022_359022_362760_eraEFG_GoldenJson",
+            "2023": "Collisions2023_366403_369802_eraBC_GoldenJson",
+            "2023BPix": "Collisions2023_369803_370790_eraD_GoldenJson",
+        }[year]
+        # evaluate and clip at 10 to avoid large weights
+        values["nominal"] = np.clip(cset[corr].evaluate(nPU, "nominal"), 0, 10)
+        values["up"] = np.clip(cset[corr].evaluate(nPU, "up"), 0, 10)
+        values["down"] = np.clip(cset[corr].evaluate(nPU, "down"), 0, 10)
+
+        weights.add("pileup", values["nominal"], values["up"], values["down"])
+
+
+def get_vpt(genpart, check_offshell=False):
+    """Return the gen-level V boson pT. Only the leptonic samples can lack a resonance in the
+    decay tree, and only when M is beyond the configured Breit-Wigner cutoff (usually 15 * width).
+    """
+    boson = ak.firsts(
+        genpart[
+            ((genpart.pdgId == 23) | (abs(genpart.pdgId) == 24))
+            & genpart.hasFlags(["fromHardProcess", "isLastCopy"])
+        ]
+    )
+    if check_offshell:
+        offshell = genpart[
+            genpart.hasFlags(["fromHardProcess", "isLastCopy"])
+            & ak.is_none(boson)
+            & (abs(genpart.pdgId) >= 11)
+            & (abs(genpart.pdgId) <= 16)
+        ].sum()
+        return ak.where(ak.is_none(boson.pt), offshell.pt, boson.pt)
+    return np.array(ak.fill_none(boson.pt, 0.0))
+
+
+def add_ps_weight(weights, ps_weights):
+    """
+    Parton Shower Weights (FSR and ISR)
+    """
+
+    nweights = len(weights.weight())
+    nom = np.ones(nweights)
+
+    up_isr = np.ones(nweights)
+    down_isr = np.ones(nweights)
+    up_fsr = np.ones(nweights)
+    down_fsr = np.ones(nweights)
+
+    if len(ps_weights[0]) == 4:
+        up_isr = ps_weights[:, 0]  # ISR=2, FSR=1
+        down_isr = ps_weights[:, 2]  # ISR=0.5, FSR=1
+
+        up_fsr = ps_weights[:, 1]  # ISR=1, FSR=2
+        down_fsr = ps_weights[:, 3]  # ISR=1, FSR=0.5
+
+    elif len(ps_weights[0]) > 1:
+        print("PS weight vector has length ", len(ps_weights[0]))
+
+    weights.add("ISRPartonShower", nom, up_isr, down_isr)
+    weights.add("FSRPartonShower", nom, up_fsr, down_fsr)
+
+
+def get_pdf_weights(events):
+    """
+    For the PDF acceptance uncertainty:
+    - store 103 variations: indices 0-100 are the PDF eigenvector variations,
+      and the last two are the alpha_s variations.
+    - sum the yield differences from the nominal in quadrature to get the total uncertainty,
+      e.g. https://github.com/LPC-HH/HHLooper/blob/master/python/prepare_card_SR_final.py#L258
+      and https://github.com/LPC-HH/HHLooper/blob/master/app/HHLooper.cc#L1488
+
+    Some references:
+    Scale/PDF weights in MC https://twiki.cern.ch/twiki/bin/view/CMS/HowToPDF
+    https://twiki.cern.ch/twiki/bin/viewauth/CMS/TopSystematics#PDF
+    """
+    return events.LHEPdfWeight.to_numpy()
+
+
+def get_scale_weights(events):
+    """
+    QCD scale variations; the best explanation I found is here:
+    https://twiki.cern.ch/twiki/bin/viewauth/CMS/TopSystematics#Factorization_and_renormalizatio
+
+    TL;DR: we vary the renormalization and factorization scales by factors of 0.5 and 2,
+    and then take the envelope of the variations on our final observable as the up/down uncertainties.
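+
+    For example (illustrative numbers): if the six kept variations change a selection's yield by
+    -3%, -1%, +1%, +2%, +4%, and +5% relative to nominal, the envelope assigns -3% / +5% as the
+    down / up uncertainty.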
+
+    Importantly, we need to keep track of the normalization for each variation,
+    so that this uncertainty takes into account the acceptance effects of our selections.
+
+    LHE scale variation weights (w_var / w_nominal) (from https://cms-nanoaod-integration.web.cern.ch/autoDoc/NanoAODv9/2018UL/doc_TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8_RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1.html#LHEScaleWeight)
+    [0] is renscfact=0.5d0 facscfact=0.5d0 ; <=
+    [1] is renscfact=0.5d0 facscfact=1d0 ; <=
+    [2] is renscfact=0.5d0 facscfact=2d0 ;
+    [3] is renscfact=1d0 facscfact=0.5d0 ; <=
+    [4] is renscfact=1d0 facscfact=1d0 ;
+    [5] is renscfact=1d0 facscfact=2d0 ; <=
+    [6] is renscfact=2d0 facscfact=0.5d0 ;
+    [7] is renscfact=2d0 facscfact=1d0 ; <=
+    [8] is renscfact=2d0 facscfact=2d0 ; <=
+
+    See also https://git.rwth-aachen.de/3pia/cms_analyses/common/-/blob/11e0c5225416a580d27718997a11dc3f1ec1e8d1/processor/generator.py#L93 for an example.
+    """
+    if len(events[0].LHEScaleWeight) > 0:
+        if len(events[0].LHEScaleWeight) == 9:
+            variations = events.LHEScaleWeight[:, [0, 1, 3, 5, 7, 8]].to_numpy()
+            nominal = events.LHEScaleWeight[:, 4].to_numpy()[:, np.newaxis]
+            variations /= nominal
+        else:
+            variations = events.LHEScaleWeight[:, [0, 1, 3, 4, 6, 7]].to_numpy()
+        return np.clip(variations, 0.0, 4.0)
+    else:
+        return None
+
+
+class JECs:
+    def __init__(self, year):
+        if year in ["2022", "2022EE", "2023", "2023BPix"]:
+            jec_compiled = package_path + "/corrections/jec_compiled.pkl.gz"
+        elif year in ["2016", "2016APV", "2017", "2018"]:
+            jec_compiled = package_path + "/corrections/jec_compiled_run2.pkl.gz"
+        else:
+            jec_compiled = None
+
+        # default to None so get_jec_jets can check availability for unsupported years
+        self.jet_factory = {"ak4": None, "ak8": None}
+        self.met_factory = None
+
+        if jec_compiled is not None:
+            with gzip.open(jec_compiled, "rb") as filehandler:
+                jmestuff = pickle.load(filehandler)
+
+            self.jet_factory["ak4"] = jmestuff["jet_factory"]
+            self.jet_factory["ak8"] = jmestuff["fatjet_factory"]
+            self.met_factory = jmestuff["met_factory"]
+
+    def _add_jec_variables(self, jets: JetArray, event_rho: ak.Array, isData: bool) -> JetArray:
+        """Add variables needed for JECs"""
+        jets["pt_raw"] = (1 - jets.rawFactor) * jets.pt
+        jets["mass_raw"] = (1 - jets.rawFactor) * jets.mass
+        jets["event_rho"] = ak.broadcast_arrays(event_rho, jets.pt)[0]
+        if not isData:
+            # gen pT needed for smearing
+            jets["pt_gen"] = ak.values_astype(ak.fill_none(jets.matched_gen.pt, 0), np.float32)
+        return jets
+
+    def get_jec_jets(
+        self,
+        events: NanoEventsArray,
+        jets: FatJetArray,
+        year: str,
+        isData: bool = False,
+        jecs: dict[str, str] | None = None,
+        fatjets: bool = True,
+        applyData: bool = False,
+        dataset: str | None = None,
+        nano_version: str = "v12",
+    ) -> tuple[FatJetArray, dict | None]:
+        """
+        Applies JECs to the given jets; if ``jecs`` is not None, also returns the shifted values
+        of the variables affected by JECs.
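+
+        For example (hypothetical mapping; each value must be the name of a shift field
+        produced by the applied jet factory): ``jecs={"JES": "JES_jes", "JER": "JER"}``
+        yields shifted ``pt`` values under the keys ``"JES_up"``, ``"JES_down"``,
+        ``"JER_up"``, and ``"JER_down"``.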
+ """ + + rho = ( + events.Rho.fixedGridRhoFastjetAll + if "Rho" in events.fields + else events.fixedGridRhoFastjetAll + ) + jets = self._add_jec_variables(jets, rho, isData) + + apply_jecs = ak.any(jets.pt) if (applyData or not isData) else False + if "v12" not in nano_version: + apply_jecs = False + if not apply_jecs: + return jets, None + + jec_vars = ["pt"] # variables we are saving that are affected by JECs + jet_factory_str = "ak4" + if fatjets: + jet_factory_str = "ak8" + + if self.jet_factory[jet_factory_str] is None: + print("No factory available") + return jets, None + + import cachetools + + jec_cache = cachetools.Cache(np.inf) + + if isData: + if year == "2022": + corr_key = f"{year}_runCD" + elif year == "2022EE" and "Run2022E" in dataset: + corr_key = f"{year}_runE" + elif year == "2022EE" and "Run2022F" in dataset: + corr_key = f"{year}_runF" + elif year == "2022EE" and "Run2022G" in dataset: + corr_key = f"{year}_runG" + elif year == "2023": + corr_key = "2023_runCv4" if "Run2023Cv4" in dataset else "2023_runCv123" + elif year == "2023BPix": + corr_key = "2023BPix_runD" + else: + print(dataset, year) + print("warning, no valid dataset, JECs won't be applied to data") + applyData = False + else: + corr_key = f"{year}mc" + + # fatjet_factory.build gives an error if there are no jets in event + if apply_jecs: + jets = self.jet_factory[jet_factory_str][corr_key].build(jets, jec_cache) + + # return only jets if no variations are given + if jecs is None or isData: + return jets, None + + jec_shifted_vars = {} + for jec_var in jec_vars: + tdict = {"": jets[jec_var]} + if apply_jecs: + for key, shift in jecs.items(): + for var in ["up", "down"]: + if shift in ak.fields(jets): + tdict[f"{key}_{var}"] = jets[shift][var][jec_var] + jec_shifted_vars[jec_var] = tdict + + return jets, jec_shifted_vars + + +def get_jmsr( + fatjets: FatJetArray, + num_jets: int, + jmsr_vars: list[str], + jms_values: dict, + jmr_values: dict, + isData: bool = False, + seed: int = 42, +) -> dict: + """Calculates post JMS/R masses and shifts""" + + jmsr_shifted_vars = {} + + for mkey in jmsr_vars: + tdict = {} + + mass = pad_val(fatjets[mkey], num_jets, axis=1) + jms = jms_values[mkey] + jmr = jmr_values[mkey] + + if isData: + tdict[""] = mass + else: + rng = np.random.default_rng(seed) + smearing = rng.normal(size=mass.shape) + # scale to JMR nom, down, up (minimum at 0) + jmr_nom, jmr_down, jmr_up = ((smearing * max(jmr[i] - 1, 0) + 1) for i in range(3)) + jms_nom, jms_down, jms_up = jms + + corr_mass_JMRUp = random.gauss(0.0, jmr[2] - 1.0) + corr_mass = max(jmr[0] - 1.0, 0.0) / (jmr[2] - 1.0) * corr_mass_JMRUp + + mass_jms = mass * jms_nom + mass_jmr = mass * jmr_nom + + tdict[""] = mass * jms_nom * (1.0 + corr_mass) + tdict["JMS_down"] = mass_jmr * jms_down + tdict["JMS_up"] = mass_jmr * jms_up + tdict["JMR_down"] = mass_jms * jmr_down + tdict["JMR_up"] = mass_jms * jmr_up + + jmsr_shifted_vars[mkey] = tdict + + return jmsr_shifted_vars + + +# Jet Veto Maps +# the JERC group recommends ALL analyses use these maps, as the JECs are derived excluding these zones. 
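+# an illustrative application in a processor (hypothetical variable names; see get_jetveto_event below):
+#   veto_selection = get_jetveto_event(events.Jet, year)
+#   events = events[veto_selection]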
+# apply to both Data and MC +# https://cms-talk.web.cern.ch/t/jet-veto-maps-for-run3-data/18444?u=anmalara +# https://cms-talk.web.cern.ch/t/jes-for-2022-re-reco-cde-and-prompt-fg/32873 +def get_jetveto_event(jets: JetArray, year: str): + """ + Get event selection that rejects events with jets in the veto map + """ + + # correction: Non-zero value for (eta, phi) indicates that the region is vetoed + cset = correctionlib.CorrectionSet.from_file(get_pog_json("jetveto", year)) + j, nj = ak.flatten(jets), ak.num(jets) + + def get_veto(j, nj, csetstr): + j_phi = np.clip(np.array(j.phi), -3.1415, 3.1415) + j_eta = np.clip(np.array(j.eta), -4.7, 4.7) + veto = cset[csetstr].evaluate("jetvetomap", j_eta, j_phi) + return ak.unflatten(veto, nj) + + corr_str = { + "2022": "Summer22_23Sep2023_RunCD_V1", + "2022EE": "Summer22EE_23Sep2023_RunEFG_V1", + "2023": "Summer23Prompt23_RunC_V1", + "2023BPix": "Summer23BPixPrompt23_RunD_V1", + }[year] + + jet_veto = get_veto(j, nj, corr_str) > 0 + + event_sel = ~(ak.any((jets.pt > 15) & jet_veto, axis=1)) + return event_sel \ No newline at end of file