first version of the nnpdf data package, with versioning
add utility function to read metadata just from dataset name

deprecate a bunch of functions
scarlehoff committed Dec 5, 2024
1 parent 1bc9d9f commit 1f845a0
Showing 24 changed files with 664 additions and 525 deletions.
138 changes: 138 additions & 0 deletions deprecated_functions.py
@@ -0,0 +1,138 @@
"""
Note: this module will be removed after the next tag, don't use anything from here
"""

import dataclasses
import logging
from operator import attrgetter

import pandas as pd

from nnpdf_data.coredata import CommonData

log = logging.getLogger(__name__)

log.warning(
    "You are loading deprecated functionality that uses the old commondata parser. "
    "This is no longer supported and will be removed in the near future."
)


### Old commondata:
### All code below this line is deprecated and will be removed
def load_commondata_old(commondatafile, systypefile, setname):
    """Parse a commondata file and a systype file into a CommonData.

    Parameters
    ----------
    commondatafile : file or path to file
    systypefile : file or path to file

    Returns
    -------
    commondata : CommonData
        An object containing the data and information from the commondata
        and systype files.
    """
    # First parse commondata file
    commondatatable = pd.read_csv(commondatafile, sep=r"\s+", skiprows=1, header=None)
    # Remove NaNs
    # TODO: replace commondata files with bad formatting
    # Build header
    commondataheader = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"]
    nsys = (commondatatable.shape[1] - len(commondataheader)) // 2

    commondataheader += ["ADD", "MULT"] * nsys
    commondatatable.columns = commondataheader
    commondatatable.set_index("entry", inplace=True)
    ndata = len(commondatatable)
    commondataproc = commondatatable["process"][1]
    # Check for consistency with commondata metadata
    cdmetadata = peek_commondata_metadata(commondatafile)
    if (nsys, ndata) != attrgetter("nsys", "ndata")(cdmetadata):
        raise ValueError(f"Commondata table information does not match metadata for {setname}")

    # Now parse the systype file
    systypetable = parse_systypes(systypefile)

    # Populate CommonData object
    return CommonData(
        setname=setname,
        ndata=ndata,
        commondataproc=commondataproc,
        nkin=3,
        nsys=nsys,
        commondata_table=commondatatable,
        systype_table=systypetable,
        legacy=True,
    )
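
# Example (illustrative sketch, not part of the committed file): calling the
# deprecated loader on an old-format pair. The file names follow the old
# NNPDF conventions (DATA_<set>.dat / SYSTYPE_<set>_DEFAULT.dat) but are
# placeholders here; point them at a real pair in your checkout.
#
#     cd = load_commondata_old(
#         "DATA_CMSZDIFF12.dat", "SYSTYPE_CMSZDIFF12_DEFAULT.dat", "CMSZDIFF12"
#     )
#     print(cd.ndata, cd.nsys)  # filled from the parsed table and validated header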


def parse_systypes(systypefile):
    """Parses a systype file and returns a pandas dataframe."""
    systypeheader = ["sys_index", "treatment", "name"]
    try:
        systypetable = pd.read_csv(
            systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None
        )
        systypetable.dropna(axis="columns", inplace=True)
    # Some datasets e.g. CMSWCHARMRAT have no systematics
    except pd.errors.EmptyDataError:
        systypetable = pd.DataFrame(columns=systypeheader)

    systypetable.set_index("sys_index", inplace=True)

    return systypetable
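
# Example (illustrative, not part of the committed file): as the parser above
# implies, a systype file is a whitespace-separated table whose first line is
# skipped, followed by "sys_index treatment name" rows. ADD/MULT and
# UNCORR/CORR are the usual treatment/name values in old commondata files.
#
#     import io
#     fake_systype = io.StringIO("3\n1 ADD UNCORR\n2 MULT CORR\n3 MULT LUMI\n")
#     print(parse_systypes(fake_systype))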


@dataclasses.dataclass(frozen=True)
class CommonDataMetadata:
    """Contains metadata information about the data being read"""

    name: str
    nsys: int
    ndata: int
    process_type: str


def peek_commondata_metadata(commondatafilename):
    """Read some of the properties of the commondata object as a CommonData Metadata"""
    with open(commondatafilename) as f:
        try:
            l = f.readline()
            name, nsys_str, ndata_str = l.split()
            l = f.readline()
            process_type_str = l.split()[1]
        except Exception:
            log.error(f"Error processing {commondatafilename}")
            raise

    return CommonDataMetadata(
        name, int(nsys_str), int(ndata_str), get_kinlabel_key(process_type_str)
    )
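
# Example (illustrative, not part of the committed file): the peek consumes
# only two header lines, "name nsys ndata" and a line whose second token is
# the process type. Assumes the module is fully loaded and that "DIS" is a
# valid KINLABEL_LATEX key; the values below are made up.
#
#     import tempfile
#     with tempfile.NamedTemporaryFile("w", suffix=".dat", delete=False) as tmp:
#         tmp.write("TOYSET 2 10\n1 DIS 0.1 2.0 1.0\n")
#     print(peek_commondata_metadata(tmp.name))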


def get_plot_kinlabels(commondata):
    """Return the LaTeX kinematic labels for a given Commondata"""
    # NOTE: KINLABEL_LATEX is not defined or imported in this module; it must
    # be supplied by the importing code (historically it lived in validphys).
    key = commondata.process_type

    # TODO: the keys in KINLABEL_LATEX need to be updated for the new commondata
    return KINLABEL_LATEX.get(key, key)


def get_kinlabel_key(process_label):
    """
    Since there is no 1:1 correspondence between LaTeX keys and the old libNNPDF names
    we match the longest key such that the proc label starts with it.
    """
    l = process_label
    try:
        if process_label == "EWK_RAP_ASY":
            # TODO this function is disappearing in this PR
            l = "EWK_RAP"
        return next(k for k in sorted(KINLABEL_LATEX, key=len, reverse=True) if l.startswith(k))
    except StopIteration as e:
        raise ValueError(
            "Could not find a set of kinematic "
            "variables matching the process %s. Check the "
            "labels defined in commondata.cc." % (l)
        ) from e
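
# Example (illustrative, not part of the committed file): the longest-key
# matching above, demonstrated with a toy label table. The real KINLABEL_LATEX
# (from validphys) is much larger; these entries are made up.
toy_labels = {"EWK": "...", "EWK_RAP": "...", "DIS": "..."}
label = "EWK_RAP_ASY"
# Longer keys are tried first, so the most specific matching prefix wins
print(next(k for k in sorted(toy_labels, key=len, reverse=True) if label.startswith(k)))
# -> "EWK_RAP"
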
6 changes: 3 additions & 3 deletions doc/sphinx/source/vp/customplots.rst
@@ -65,7 +65,7 @@ There are two ways to take advantage of resources produced using the
* Using extra modules: Additional Python modules or files can be passed to
``validphys`` using the ``--extra-modules`` (or ``-x``) flag. The
functions in these modules then act as ``validphys`` providers and can take
resources from ``validphys`` as input. This approach allows the
immediate use of runcards or the default styles. One limitation is that
there is currently no way of adding production rules or parsers in this
way. Prefer this for actions that are too difficult to upstream to
@@ -76,7 +76,7 @@ There are two ways to take advantage of resources produced using the
    from matplotlib.figure import Figure
    from reportengine.figure import figure

    from validphys.commondataparser import load_commondata
    from nnpdf_data.commondataparser import load_commondata

    # A simple plot that probably should be in validphys to begin with.

@@ -103,7 +103,7 @@ There are two ways to take advantage of resources produced using the



Note that both of these come at the cost of risking future breakage
somewhat as we don't guarantee any sort of stability on the internal
interfaces.

12 changes: 6 additions & 6 deletions doc/sphinx/source/vp/pydataobjs.rst
@@ -143,8 +143,8 @@ Loading CommonData
------------------

The underlying functions for loading CommonData can be found in
:py:mod:`validphys.commondataparser`. The data is loaded
as :py:class:`validphys.coredata.CommonData`, which uses the
:py:mod:`nnpdf_data.commondataparser`. The data is loaded
as :py:class:`nnpdf_data.coredata.CommonData`, which uses the
`dataclasses <https://docs.python.org/3/library/dataclasses.html>`_ module
which automatically generates some special methods for the class. The
underlying data is stored as DataFrames, and so can be used
@@ -153,7 +153,7 @@ with the standard pandas machinery::
    import pandas as pd

    from validphys.api import API
    from validphys.commondataparser import load_commondata
    from nnpdf_data.commondataparser import load_commondata
    # define dataset settings
    ds_input={'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}
    # first get the CommonDataSpec
@@ -162,11 +162,11 @@ with the standard pandas machinery::
    assert isinstance(lcd.central_values, pd.Series)
    assert isinstance(lcd.systematics_table, pd.DataFrame)

The :py:class:`validphys.coredata.CommonData` class has a method which returns
The :py:class:`nnpdf_data.coredata.CommonData` class has a method which returns
a new instance of the class with cuts applied::

    from validphys.api import API
    from validphys.commondataparser import load_commondata
    from nnpdf_data.commondataparser import load_commondata
    # define dataset and additional settings
    ds_input={'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}
    inp = {
@@ -193,7 +193,7 @@ more convenient than calling the underlying functions::
Loading Covariance Matrices
---------------------------

Functions which take :py:class:`validphys.coredata.CommonData` s and return
Functions which take :py:class:`nnpdf_data.coredata.CommonData` s and return
covariance matrices can be found in
:py:mod:`validphys.covmats`. As with the commondata
the functions can be called in scripts directly::
34 changes: 34 additions & 0 deletions nnpdf_data/examples_of_use.py
@@ -0,0 +1,34 @@
"""
This file contains examples of use of ``nnpdf_data`` as a library.
This library is currently in pre-alpha form and should not be considered stable.
The functions and examples in this file will eventually be removed but might become
part of the library as an external user-facing interface.
There is currently no user-facing interface so no stability is expected.
"""

from nnpdf_data import path_commondata
from nnpdf_data.commondataparser import parse_new_metadata


def parse_dataset(dataset, variant=None):
    """Given a dataset name, read the observable metadata.
    A variant can be given.

    The output is an ``ObservableMetaData`` object, with references to all files
    that form the dataset, but none of them is loaded.
    This can then be used to _load_ the dataset using load_commondata.

    Example
    -------
    >>> from nnpdf_data.commondataparser import load_commondata
    >>> cd_meta = parse_dataset("LHCB_Z0_7TEV_DIELECTRON_Y")
    >>> cd = load_commondata(cd_meta)
    >>> print(cd)
    CommonData(setname='LHCB_Z0_7TEV_DIELECTRON_Y', ndata=9, commondataproc='DY_Z_Y', nkin=3, nsys=11, legacy=False, legacy_names=['LHCBZ940PB'], kin_variables=['y', 'm_Z2', 'sqrts'])
    """
    setname, observable = dataset.rsplit("_", 1)
    metadata_file = path_commondata / setname / "metadata.yaml"
    metadata = parse_new_metadata(metadata_file, observable, variant=variant)
    return metadata
78 changes: 12 additions & 66 deletions nnpdf_data/nnpdf_data/__init__.py
@@ -1,76 +1,22 @@
from functools import lru_cache
import pathlib

import yaml
from ._version import __version__
from .commondataparser import parse_new_metadata
from .validphys_compatibility import legacy_to_new_map, legacy_to_new_mapping, new_to_legacy_map

path_vpdata = pathlib.Path(__file__).parent
path_commondata = path_vpdata / "commondata"

# VP should not have access to this file, only to the products
_path_legacy_mapping = path_commondata / "dataset_names.yml"
theory_cards = path_vpdata / "theory_cards"

with open(_path_legacy_mapping) as file:
    _legacy_to_new_mapping_raw = yaml.load(file, yaml.Loader)
# Convert strings into a dictionary
legacy_to_new_mapping = {
    k: ({"dataset": v} if isinstance(v, str) else v) for k, v in _legacy_to_new_mapping_raw.items()
}


@lru_cache
def legacy_to_new_map(dataset_name, sys=None):
    """Find the new dataset name and variant corresponding to an old dataset
    and systematics choice"""
    if dataset_name not in legacy_to_new_mapping:
        return dataset_name, None

    new_name = legacy_to_new_mapping[dataset_name]
    variant = new_name.get("variant")
    new_name = new_name["dataset"]
    if sys is not None:
        if variant is None:
            raise KeyError(
                f"I cannot translate the combination of {dataset_name} and sys: {sys}. Please report this."
            )
        variant += f"_{sys}"

    return new_name, variant
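
# Example (illustrative, not part of the committed file): translating an old
# dataset name. "LHCBZ940PB" is the legacy name listed for
# LHCB_Z0_7TEV_DIELECTRON_Y in this commit's examples file; the exact variant
# returned depends on the contents of dataset_names.yml.
#
#     new_name, variant = legacy_to_new_map("LHCBZ940PB")
#     # -> ("LHCB_Z0_7TEV_DIELECTRON_Y", <variant or None>)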


@lru_cache
def new_to_legacy_map(dataset_name, variant_used):
    """Loop over the dictionary and find the right dataset.

    Since it is possible to have more than 1 dataset mapped to the same new one,
    returns a list of everything that matches.

    This function will loop over the entire dictionary of mappings and selects
        1. All datasets that match exactly what's in the runcard (dataset & variant): exact_matches
        2. All datasets that match the dataset name: matches
    If there are any `exact_matches`, it will return only those; otherwise, return all `matches`;
    if there are no `matches` at all, return None.
    """

    matches = []
    exact_matches = []

    for old_name, new_info in legacy_to_new_mapping.items():
        new_name = new_info["dataset"]
        variant = new_info.get("variant")

        if new_name == dataset_name:
            matches.append(old_name)
            # if it's a nuclear DIS data promote legacy to be legacy_dw
            if "_DW_" in old_name and variant_used == "legacy":
                variant = "legacy_dw"

            if variant_used == variant:
                exact_matches.append(old_name)

    # If we found exact matches, return those and stop looking
    if exact_matches:
        return exact_matches
    elif matches:
        return matches
    return None


def load_dataset_metadata(dataset_name, variant=None):
    """Given a dataset name, return the metadata"""

    # Compatibility with old nnpdf names, these two lines
    # might disappear at any given point
    if variant is None:
        dataset_name, variant = legacy_to_new_map(dataset_name)

    setname, observable = dataset_name.rsplit("_", 1)
    metadata_file = path_commondata / setname / "metadata.yaml"
    return parse_new_metadata(metadata_file, observable, variant=variant)
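
# Example (illustrative, not part of the committed file): the new utility
# reads metadata straight from a dataset name; legacy names are translated
# via legacy_to_new_map when no variant is given. Assumes the dataset name
# used in this commit's examples file.
#
#     from nnpdf_data import load_dataset_metadata
#     meta = load_dataset_metadata("LHCB_Z0_7TEV_DIELECTRON_Y")
#     # meta is an ObservableMetaData; no data files are loaded yet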