Add filter files
achiefa committed Dec 10, 2024
1 parent c3563e2 commit efcd072
Showing 3 changed files with 327 additions and 45 deletions.
18 changes: 18 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py
@@ -0,0 +1,18 @@
'''
Filter script for CMS_WCHARM_7TEV
'''

import logging

from filter_utils import Extractor
import numpy as np

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')


if __name__ == "__main__":
CMS_WCHARM_TOT = Extractor("./metadata.yaml", "WPWM-TOT", mult_factor=1000)
_, _, _ = CMS_WCHARM_TOT.generate_data(variant='default', save_to_yaml=True)

CMS_WCHARM_RATIO = Extractor("./metadata.yaml", "WPWM-RATIO", mult_factor=1000)
_, _, _ = CMS_WCHARM_RATIO.generate_data(variant='default', save_to_yaml=True)
280 changes: 280 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py
@@ -0,0 +1,280 @@
import logging

import numpy as np
import yaml

from nnpdf_data.filter_utils.utils import prettify_float

yaml.add_representer(float, prettify_float)
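# prettify_float is used so that floats are written to the dumped yaml files in a
# consistent, human-readable format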

SQRTS = 7000
MW2 = 80.385**2
CMSLUMI13 = 2.5

# List of systematic uncertainties that should
# be considered uncorrelated
UNCORR_SYS_UNC = ['UnfoldMCstat', 'UnfoldOtherGen', 'UnfoldReweight']
ART_LABEL = 'art_corr'
STAT_LABEL = 'stat_uncorr'
TABLE_TOKEN = 'Table'


class Extractor:
"""
Extracts kinematics, central data, and uncertainties for a given dataset
Parameters
----------
metadata_file: str
Path to the metadata file
observable: str
The name of the observable for which the data is extracted. The name must
be listed in the metadata file.
"""

def __init__(self, metadata_file, observable, mult_factor=1):

# Open metadata and select process
with open(metadata_file, 'r') as file:
metadata = yaml.safe_load(file)
self.metadata = next(
(
md
for md in metadata["implemented_observables"]
if md['observable_name'] == observable
),
None,
)
if self.metadata is None:
raise Exception(f"{observable} is not listed in the metadata file.")

# Initialise dict of tables
self.tables = {}
self.observable = observable
self.mult_factor = mult_factor
self.kin_labels = self.metadata['kinematic_coverage']
self.ndata = self.metadata['ndata']

def __retrieve_table(self, table_id):
"""
Implementation of the lazy loading for the tables. If the table
is loaded for the first time, it is stored into an internal
container of the class, so that it will not be loaded each time.
When called, this functions checks if the table has already been stored
and, if that is the case, returns the stored table.
Parameters
----------
table_id: int
Index that specifies the table
Return
------
The table specified by `table_id`. If not previously loaded, it is also
stored into the internal container for future use.
"""
try:
table = self.tables[str(table_id)]
except KeyError:
logging.debug(f'Table {table_id} not yet loaded. Loading and storing it...')
with open(f'./rawdata/{TABLE_TOKEN}{table_id}.yaml', 'r') as tab:
tab_dict = yaml.safe_load(tab)
self.tables[str(table_id)] = tab_dict
table = tab_dict
return table

def __extract_kinematics(self, table: dict):
"""
Extracts the kinematic variables of the single differential
distribution given a table.
For each bin, it computes the max, min, and mid value of the transverse
momentum of the boson.
Parameters
----------
table: dict
Dictionary containing the bins in the transverse momentum
tab_number: int
Index to select the range of the second kinematic variable
Return
------
List of bins containing min, max, and mid values for each of the kinematic
observables listed in the `kinematic_coverage` of the metadata file.
"""
data = table['independent_variables'][0]
label = self.kin_labels
kinematics = []
for bin in data['values']:
abs_eta_min = bin['low']
abs_eta_max = bin['high']
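# The second kinematic variable has no bin range: it is fixed to the
# squared W-boson mass for every |eta| bin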
kin_bin = {
label[0]: {
'min': abs_eta_min,
'mid': (abs_eta_max + abs_eta_min) / 2,
'max': abs_eta_max,
},
label[1]: {'min': None, 'mid': MW2, 'max': None},
}
kinematics.append(kin_bin)
return kinematics

def generate_kinematics(self):
"""
Function that generates the kinematics by looping over all the
tables specified in the metadata file. The resulting kinematics
is then saved to a yaml file. It relies on the method
`__extract_kinematics`.
"""

logging.info(f"Generating kinematics for ATLAS_{self.observable}...")

# Initialise kinematics list
kinematics = []
ndata = 0
table = self.metadata["tables"][0]
tab_dict = self.__retrieve_table(table)
kin = self.__extract_kinematics(tab_dict)
kinematics = np.concatenate([kinematics, kin])
ndata += len(kin)

# Check number of data agrees with metadata
try:
assert self.metadata['ndata'] is not None
assert self.metadata['ndata'] == ndata
except AssertionError as e:
logging.warning(
f"The number of data in the metafile is either wrong or unspecified."
f" The correct number is {ndata}. Please, update the metafile."
)
return
return kinematics.tolist()

def generate_data_and_unc(self, mult_factor=1.0):
"""
Same as `generate_kinematics`, but for central data points.
"""
logging.info(f"Generating central data for CMS_{self.observable}...")
dat_central = []
stat_unc = []
asy_sys_unc = []
table = self.metadata['tables'][0]
tab_dict = self.__retrieve_table(table)

# Select the values of the (only) dependent variable from the table
tab_dict = tab_dict['dependent_variables'][0]['values']

# Loop over bins
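# For each bin, errors[0] is assumed to be the statistical uncertainty and
# errors[1] the (symmetrised) systematic one, as laid out in the HepData table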
for rap_bin in tab_dict:
dat_central.append(rap_bin['value'] * mult_factor)
stat_unc.append(rap_bin['errors'][0]['symerror'] * mult_factor)
asy_sys_unc.append(rap_bin['errors'][1]['symerror'] * mult_factor)
return dat_central, stat_unc, asy_sys_unc

def __build_unc_definitions(self, variant='default'):
unc_definitions = {}

# Statistical uncertainty
unc_definitions[STAT_LABEL] = {
'description': 'Statistical uncertainty',
'treatment': 'ADD',
'type': 'UNCORR',
}
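# The ratio observable carries a single correlated systematic per bin, while
# for the total cross section the correlations are provided as a covariance
# matrix that is decomposed into `ndata` artificial uncertainties
# (see `generate_data`)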

if self.observable == 'WPWM-RATIO':
unc_definitions[ART_LABEL] = {
'description': 'Correlated systematic uncertainty',
'treatment': 'MULT',
'type': 'CORR',
}
elif self.observable == 'WPWM-TOT':
for idx in range(self.ndata):
unc_definitions[f'{ART_LABEL}_{idx+1}'] = {
'description': f'Correlated systematic uncertainty {idx+1}',
'treatment': 'MULT',
'type': 'CORR',
}

if variant != 'default':
raise ValueError(f'The variant {variant} is not implemented yet.')

return unc_definitions

def generate_covmat(self, diag_stat_uncs=None):
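"""
Builds a covariance matrix by combining the matrix stored in the second
table listed in the metadata (treated as a correlation matrix) with the
uncertainties `diag_stat_uncs`, i.e. cov[i, j] = corr[i, j] * unc[i] * unc[j].
"""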
table = self.metadata["tables"][1]
tab_dict = self.__retrieve_table(table)
matlist = tab_dict['dependent_variables'][0]['values']
matlist = [d['value'] for d in matlist]
covmat = np.zeros((self.ndata, self.ndata))
for i in range(self.ndata):
for j in range(self.ndata):
covmat[i, j] = matlist[i + self.ndata * j] * diag_stat_uncs[i] * diag_stat_uncs[j]
return covmat

def generate_data(self, variant='default', save_to_yaml=False, path='./'):
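"""
Collects kinematics, central values, and uncertainties for the selected
observable and, if `save_to_yaml` is True, dumps them to the yaml files
specified in the metadata.
"""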
# Get central data and kinematics
central_data, stat_unc, sys_unc = self.generate_data_and_unc(self.mult_factor)
kinematics = self.generate_kinematics()

# Uncertainty definitions
unc_definitions = self.__build_unc_definitions(variant=variant)
sys_artificial = [] # Initialize vector of artificial uncertainties

if self.observable == 'WPWM-TOT':
covmat = self.generate_covmat(stat_unc)
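# Decompose the covariance matrix as C = V diag(eigvals) V^T: each column of
# art_unc = V diag(sqrt(eigvals)) is one artificial correlated systematic,
# so that art_unc @ art_unc.T reproduces the covariance matrix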
eigvals, eigvecs = np.linalg.eig(covmat)
art_unc = np.sqrt(eigvals) * eigvecs

# Loop over bins
for data_idx, data in enumerate(central_data):
# Statistical uncertainty
unc_dict = {STAT_LABEL: stat_unc[data_idx]}
for sys_idx, art_sys in enumerate(art_unc[data_idx, :]):
unc_dict[f'{ART_LABEL}_{sys_idx+1}'] = float(art_sys)
sys_artificial.append(unc_dict)

elif self.observable == 'WPWM-RATIO':
for data_idx, data in enumerate(central_data):
# Statistical uncertainty
unc_dict = {STAT_LABEL: stat_unc[data_idx]}
unc_dict[ART_LABEL] = sys_unc[data_idx]
sys_artificial.append(unc_dict)

if save_to_yaml:
# Save kinematics into file
logging.info("Dumping kinematics to file...")
kinematics_yaml = {'bins': kinematics}
with open(path + self.metadata['kinematics']['file'], 'w') as kin_out_file:
yaml.dump(kinematics_yaml, kin_out_file, sort_keys=False)
logging.info("Done!")

# Save central data into file
logging.info("Dumping kinematics to file...")
dat_central_yaml = {'data_central': central_data}
file_name = self.metadata['data_central']
with open(path + file_name, 'w') as dat_out_file:
yaml.dump(dat_central_yaml, dat_out_file, sort_keys=False)
logging.info("Done!")

# Save uncertainties
logging.info("Dumping uncertainties to file...")
uncertainties_yaml = {'definitions': unc_definitions, 'bins': sys_artificial}
file_name = (
self.metadata['data_uncertainties'][0]
if variant == 'default'
else self.metadata['variants'][variant]['data_uncertainties'][0]
)
with open(path + file_name, 'w') as dat_out_file:
yaml.dump(uncertainties_yaml, dat_out_file, sort_keys=False)
logging.info("Done!")
return kinematics, central_data, sys_artificial

def get_table(self, table_id):
return self.__retrieve_table(table_id)