From 34654b26aac8599d9acedc2b1a827f9e8230bd18 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Wed, 17 Jul 2024 12:07:01 -0500 Subject: [PATCH 01/32] Edit/fix docstrings for some files --- src/coffea/analysis_tools.py | 48 ++++++++++++++++++++- src/coffea/btag_tools/btagscalefactor.py | 12 +++--- src/coffea/dataset_tools/apply_processor.py | 2 + src/coffea/dataset_tools/manipulations.py | 7 +++ src/coffea/dataset_tools/preprocess.py | 1 + 5 files changed, 63 insertions(+), 7 deletions(-) diff --git a/src/coffea/analysis_tools.py b/src/coffea/analysis_tools.py index 24bdf7edb..4f9c777c7 100644 --- a/src/coffea/analysis_tools.py +++ b/src/coffea/analysis_tools.py @@ -22,6 +22,23 @@ class WeightStatistics: + """ + Container for statistics about the weight, including the sum of squared weights + and number of entries. + + Parameters + ---------- + sumw: float + The sum of weights + sumw2: float + The sum of squared weights + minw: float + The minimum weight + maxw: float + The maximum weight + n: int + The number of entries + """ def __init__(self, sumw=0.0, sumw2=0.0, minw=numpy.inf, maxw=-numpy.inf, n=0): self.sumw = sumw self.sumw2 = sumw2 @@ -36,6 +53,17 @@ def identity(self): return WeightStatistics() def add(self, other): + """Add two WeightStatistics objects together. + + Adds the sum of weights, the sum of squared weights, and the number of entries. + Takes the minimum and maximum across the two WeightStatistics objects. Modifies + this object in place. + + Parameters + ---------- + other: WeightStatistics + The other WeightStatistics object to add to this one + """ self.sumw += other.sumw self.sumw2 += other.sumw2 self.minw = min(self.minw, other.minw) @@ -76,6 +104,8 @@ def __init__(self, size, storeIndividual=False): @property def weightStatistics(self): + """Statistics about the weight, including the sum of squared weights + and number of entries.""" return self._weightStats def __add_eager(self, name, weight, weightUp, weightDown, shift): @@ -349,7 +379,7 @@ def __add_variation( @lru_cache def weight(self, modifier=None): - """Current event weight vector + """Returns the current event weight vector Parameters ---------- @@ -1102,6 +1132,14 @@ def names(self): @property def delayed_mode(self): + """ + Is the PackedSelection in delayed mode? + + Returns + ------- + res: bool + True if the PackedSelection is in delayed mode + """ if isinstance(self._data, dask_awkward.Array): return True elif isinstance(self._data, numpy.ndarray): @@ -1114,6 +1152,14 @@ def delayed_mode(self): @property def maxitems(self): + """ + What is the maximum supported number of selections in this PackedSelection? 
+ + Returns + ------- + res: bool + The maximum supported number of selections + """ return PackedSelection._supported_types[self._dtype] def __add_delayed(self, name, selection, fill_value): diff --git a/src/coffea/btag_tools/btagscalefactor.py b/src/coffea/btag_tools/btagscalefactor.py index 6c91588e4..47785731a 100644 --- a/src/coffea/btag_tools/btagscalefactor.py +++ b/src/coffea/btag_tools/btagscalefactor.py @@ -21,11 +21,11 @@ class BTagScaleFactor: If set true, keep the parsed dataframe as an attribute (.df) for later inspection """ - LOOSE, MEDIUM, TIGHT, RESHAPE = range(4) - FLAV_B, FLAV_C, FLAV_UDSG = range(3) + _LOOSE, _MEDIUM, _TIGHT, _RESHAPE = range(4) + _FLAV_B, _FLAV_C, _FLAV_UDSG = range(3) _flavor = numpy.array([0, 4, 5, 6]) - _flavor2btvflavor = {0: FLAV_UDSG, 4: FLAV_C, 5: FLAV_B} - _wpString = {"loose": LOOSE, "medium": MEDIUM, "tight": TIGHT, "reshape": RESHAPE} + _flavor2btvflavor = {0: _FLAV_UDSG, 4: _FLAV_C, 5: _FLAV_B} + _wpString = {"loose": _LOOSE, "medium": _MEDIUM, "tight": _TIGHT, "reshape": _RESHAPE} _expectedColumns = [ "OperatingPoint", "measurementType", @@ -165,7 +165,7 @@ def findbin(flavor, eta, pt, discr): flavor, eta, pt, discr = (x[idx] for x in bin_low_edges) mapping[idx] = findbin(flavor, eta, pt, discr) - if self.workingpoint == BTagScaleFactor.RESHAPE: + if self.workingpoint == BTagScaleFactor._RESHAPE: self._corrections[syst] = dense_mapped_lookup( (self._flavor, edges_eta, edges_pt, edges_discr), mapping, @@ -209,7 +209,7 @@ def eval(self, systematic, flavor, eta, pt, discr=None, ignore_missing=False): """ if systematic not in self._corrections: raise ValueError("Unrecognized systematic: %s" % systematic) - if self.workingpoint == BTagScaleFactor.RESHAPE: + if self.workingpoint == BTagScaleFactor._RESHAPE: if discr is None: raise ValueError("RESHAPE scale factor requires a discriminant array") return self._corrections[systematic]( diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 489b8b6f0..cd301d3ce 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -40,6 +40,7 @@ def apply_to_dataset( ) -> DaskOutputType | tuple[DaskOutputType, dask_awkward.Array]: """ Apply the supplied function or processor to the supplied dataset. + Parameters ---------- data_manipulation : ProcessorABC or GenericHEPAnalysis @@ -97,6 +98,7 @@ def apply_to_fileset( ) -> dict[str, DaskOutputType] | tuple[dict[str, DaskOutputType], dask_awkward.Array]: """ Apply the supplied function or processor to the supplied fileset (set of datasets). + Parameters ---------- data_manipulation : ProcessorABC or GenericHEPAnalysis diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index e515ce2bf..da6623f47 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -12,6 +12,7 @@ def max_chunks(fileset: FilesetSpec, maxchunks: int | None = None) -> FilesetSpec: """ Modify the input dataset so that only the first "maxchunks" chunks of each file will be processed. + Parameters ---------- fileset: FilesetSpec @@ -30,6 +31,7 @@ def max_chunks(fileset: FilesetSpec, maxchunks: int | None = None) -> FilesetSpe def slice_chunks(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSpec: """ Modify the input dataset so that only the chunks of each file specified by the input slice are processed. 
+ Parameters ---------- fileset: FilesetSpec @@ -56,6 +58,7 @@ def slice_chunks(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSp def max_files(fileset: FilesetSpec, maxfiles: int | None = None) -> FilesetSpec: """ Modify the input dataset so that only the first "maxfiles" files of each dataset will be processed. + Parameters ---------- fileset: FilesetSpec @@ -74,6 +77,7 @@ def max_files(fileset: FilesetSpec, maxfiles: int | None = None) -> FilesetSpec: def slice_files(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSpec: """ Modify the input dataset so that only the files of each dataset specified by the input slice are processed. + Parameters ---------- fileset: FilesetSpec @@ -111,6 +115,7 @@ def filter_files( ) -> FilesetSpec: """ Modify the input dataset so that only the files of each dataset that pass the filter remain. + Parameters ---------- fileset: FilesetSpec @@ -134,6 +139,7 @@ def get_failed_steps_for_dataset( ) -> DatasetSpec: """ Modify an input dataset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied report. + Parameters ---------- dataset: DatasetSpec @@ -190,6 +196,7 @@ def get_failed_steps_for_fileset( ): """ Modify an input dataset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied report. + Parameters ---------- fileset: FilesetSpec diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index cef72ad5d..455fd0725 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -32,6 +32,7 @@ def get_steps( ) -> awkward.Array | dask_awkward.Array: """ Given a list of normalized file and object paths (defined in uproot), determine the steps for each file according to the supplied processing options. + Parameters ---------- normed_files: awkward.Array | dask_awkward.Array From f59cc8c6e16ec92141cfc8cd723524790e988c68 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Fri, 19 Jul 2024 16:39:20 -0500 Subject: [PATCH 02/32] DataDiscoveryCLI methods docstrings --- src/coffea/dataset_tools/__init__.py | 11 ++ src/coffea/dataset_tools/dataset_query.py | 173 ++++++++++++++++++---- 2 files changed, 159 insertions(+), 25 deletions(-) diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index 647fd336c..7c85f0d5b 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -9,8 +9,19 @@ slice_files, ) from coffea.dataset_tools.preprocess import preprocess +from coffea.dataset_tools.dataset_query import DataDiscoveryCLI, print_dataset_query +from coffea.dataset_tools.rucio_utils import ( + get_rucio_client, + get_dataset_files_replicas, + query_dataset, +) __all__ = [ + "get_rucio_client", + "get_dataset_files_replicas", + "query_dataset", + "DataDiscoveryCLI", + "print_dataset_query", "preprocess", "apply_to_dataset", "apply_to_fileset", diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index fa423688f..2eae8e3fe 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -4,7 +4,7 @@ import os import random from collections import defaultdict -from typing import List +from typing import List, Any, Hashable, Dict import yaml from dask.distributed import Client @@ -15,10 +15,29 @@ from rich.tree import Tree from . 
import rucio_utils -from .preprocess import preprocess - - -def print_dataset_query(query, dataset_list, console, selected=[]): +from .preprocess import preprocess, FilesetSpecOptional + + +def print_dataset_query( + query: str, + dataset_list: Dict[str, Dict[str,list[str]]], + console: Console, + selected: list[str] = [] +) -> None: + """ + Pretty-print the results of a rucio query in a table. + + Parameters + ---------- + query: str + The query given to rucio + dataset_list: dict[str, dict[str,list[str]]] + The second output of a call to query_dataset with tree=True + console: Console + A Console object to print to + selected: list[str], default [] + A list of selected datasets + """ table = Table(title=f"Query: [bold red]{query}") table.add_column("Name", justify="left", style="cyan", no_wrap=True) table.add_column("Tag", style="magenta", no_wrap=True) @@ -88,6 +107,11 @@ def get_indices_query(input_str: str, maxN: int) -> List[int]: class DataDiscoveryCLI: + """ + Simplifies dataset query, replicas, filters, and uproot preprocessing with Dask. + It can be accessed in a Python script or interpreter via this class, or from the + command line (as in `python -m coffea.dataset_tools.dataset_query --help`). + """ def __init__(self): self.console = Console() self.rucio_client = None @@ -140,7 +164,7 @@ def start_cli(self): - [bold cyan]query-results[/]: List the results of the last dataset query - [bold cyan]list-selected[/]: Print a list of the selected datasets - [bold cyan]list-replicas[/]: Print the selected files replicas for the selected dataset - - [bold cyan]sites-filters[/]: show the active sites filters and ask to clear them + - [bold cyan]sites-filters[/]: Show the active sites filters and ask to clear them - [bold cyan]allow-sites[/]: Restrict the grid sites available for replicas query only to the requested list - [bold cyan]block-sites[/]: Exclude grid sites from the available sites for replicas query - [bold cyan]regex-sites[/]: Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" @@ -198,7 +222,14 @@ def do_whoami(self): print(self.rucio_client.whoami()) def do_query(self, query=None): - # Your code here + """ + Look for datasets with * wildcards (like in DAS) + + Parameters + ---------- + query: str | None, default None + The query to pass to rucio. If None, will prompt the user for an input. + """ if query is None: query = Prompt.ask( "[yellow bold]Query for[/]", @@ -218,6 +249,7 @@ def do_query(self, query=None): print("Use the command [bold red]select[/] to selected the datasets") def do_query_results(self): + """List the results of the last dataset query""" if self.last_query_list: print_dataset_query( self.last_query, @@ -229,8 +261,19 @@ def do_query_results(self): print("First [bold red]query (Q)[/] for a dataset") def do_select(self, selection=None, metadata=None): - """Selected the datasets from the list of query results. Input a list of indices - also with range 4-6 or "all".""" + """ + Selected the datasets from the list of query results. Input a list of indices + also with range 4-6 or "all". + + Parameters + ---------- + selection: list[str] | None, default None + A list of indices corresponding to selected datasets. Should be a + string, with indices separated by spaces. Can include ranges (like "4-6") + or "all". + metadata: dict[Hashable,Any], default None + Metadata to store in associated with selected datasets. 
+ """ if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return @@ -260,6 +303,7 @@ def do_select(self, selection=None, metadata=None): ) def do_list_selected(self): + """Print a list of the selected datasets""" print("[cyan]Selected datasets:") table = Table(title="Selected datasets") table.add_column("Index", justify="left", style="cyan", no_wrap=True) @@ -280,12 +324,20 @@ def do_list_selected(self): self.console.print(table) def do_replicas(self, mode=None, selection=None): - """Query Rucio for replicas. - mode: - None: ask the user about the mode - - round-robin (take files randomly from available sites), - - choose: ask the user to choose from a list of sites - - first: take the first site from the rucio query - selection: list of indices or 'all' to select all the selected datasets for replicas query + """ + Query Rucio for replicas. + + Parameters + ---------- + mode: str, default None + One of the following + - None: ask the user about the mode + - round-robin (take files randomly from available sites), + - choose: ask the user to choose from a list of sites + - first: take the first site from the rucio query + selection: str, default None + list of indices or 'all' to select all the selected datasets for + replicas query """ if selection is None: selection = Prompt.ask( @@ -433,6 +485,16 @@ def as_dict(self): return self.final_output def do_allowlist_sites(self, sites=None): + """ + Restrict the grid sites available for replicas query only to the requested list + + Parameters + ---------- + sites: list[str] | None, default None + The sites to allow the replicas query to look at. If passing in a list, + elements of the list are sites. If passing in None, the prompt requires + a single string containing a comma-separated listing. + """ if sites is None: sites = Prompt.ask( "[yellow]Restrict the available sites to (comma-separated list)" @@ -446,6 +508,16 @@ def do_allowlist_sites(self, sites=None): print(f"- {s}") def do_blocklist_sites(self, sites=None): + """ + Exclude grid sites from the available sites for replicas query + + Parameters + ---------- + sites: list[str] | None, default None + The sites to prevent the replicas query from looking at. If passing in a + list elements of the list are sites. If passing in None, the prompt + requires a single string containing a comma-separated listing. + """ if sites is None: sites = Prompt.ask( "[yellow]Exclude the sites (comma-separated list)" @@ -459,6 +531,14 @@ def do_blocklist_sites(self, sites=None): print(f"- {s}") def do_regex_sites(self, regex=None): + """ + Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" + + Parameters + ---------- + regex: str | None, default None + Sites to use for replica queries, described with a regex string. + """ if regex is None: regex = Prompt.ask("[yellow]Regex to restrict the available sites") if len(regex): @@ -466,6 +546,16 @@ def do_regex_sites(self, regex=None): print(f"New sites regex: [cyan]{self.sites_regex}") def do_sites_filters(self, ask_clear=True): + """ + Show the active sites filters (allowed, disallowed, and regex) and ask to clear + them + + Parameters + ---------- + ask_clear: bool, default True + If True, ask the user via prompt if allow, disallow, and regex filters + should be cleared. 
+ """ print("[green bold]Allow-listed sites:") if self.sites_allowlist: for s in self.sites_allowlist: @@ -479,13 +569,14 @@ def do_sites_filters(self, ask_clear=True): print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") if ask_clear: - if Confirm.ask("Clear sites restrinction?", default=False): + if Confirm.ask("Clear sites restriction?", default=False): self.sites_allowlist = None self.sites_blocklist = None self.sites_regex = None print("[bold green]Sites filters cleared") def do_list_replicas(self): + """Print the selected files replicas for the selected dataset""" selection = Prompt.ask( "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" ) @@ -507,7 +598,13 @@ def do_list_replicas(self): self.console.print(tree) def do_save(self, filename=None): - """Save the replica information in yaml format""" + """ + Save the replica information in yaml format + + Parameters: + filename: str | None, default None + The name of the file to save the information into + """ if not filename: filename = Prompt.ask( "[yellow bold]Output file name (.yaml or .json)", default="output.json" @@ -529,8 +626,21 @@ def do_preprocess( align_to_clusters=None, scheduler_url=None, ): - """Perform preprocessing for concrete fileset extraction. - Args: output_file [step_size] [align to file cluster boundaries] [dask scheduler url] + """ + Perform preprocessing for concrete fileset extraction into a file, compressed + with gzip. + + Parameters + ---------- + output_file: str | None, default None + The name of the file to write the preprocessed file into + step_size: int | None, default None + The chunk size for file splitting + align_to_clusters: bool | None, default None + Whether or not round to the cluster size in a root file. See + align_clusters parameter in coffea.dataset_tools.preprocess. + scheduler_url: str | None, default None + Dask scheduler URL where the preprocessing should take place """ if not output_file: output_file = Prompt.ask( @@ -572,12 +682,25 @@ def load_dataset_definition( Initialize the DataDiscoverCLI by querying a set of datasets defined in `dataset_definitions` and selected results and replicas following the options. - - query_results_strategy: "all" or "manual" to be prompt for selection - - replicas_strategy: - - "round-robin": select randomly from the available sites for each file - - "choose": filter the sites with a list of indices for all the files - - "first": take the first result returned by rucio - - "manual": to be prompt for manual decision dataset by dataset + Parameters + ---------- + dataset_definition: Dict[str,Dict[Hashable,Any]] + Keys are dataset queries (ie: something that can be passed to do_query()) + query_results_strategy: str, default "all" + How to decide which datasets to select. If "manual", user will be prompted + for selection + replicas_strategy: str, default "round-robin" + Options are: + - "round-robin": select randomly from the available sites for each file + - "choose": filter the sites with a list of indices for all the files + - "first": take the first result returned by rucio + - "manual": to be prompt for manual decision dataset by dataset + + Returns + ------- + out_replicas: FilesetSpecOptional + An uproot-readable fileset. At this point, the fileset is not fully + preprocessed, but this can be done with do_preprocess(). 
""" for dataset_query, dataset_meta in dataset_definition.items(): print(f"\nProcessing query: {dataset_query}") From e91cea27122a58decd120defb378e3ee9cfb72fc Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Wed, 31 Jul 2024 11:30:05 -0500 Subject: [PATCH 03/32] Pre-commit changes --- src/coffea/analysis_tools.py | 7 +++--- src/coffea/btag_tools/btagscalefactor.py | 7 +++++- src/coffea/dataset_tools/__init__.py | 4 +-- src/coffea/dataset_tools/apply_processor.py | 2 +- src/coffea/dataset_tools/dataset_query.py | 27 +++++++++++---------- src/coffea/dataset_tools/manipulations.py | 2 +- src/coffea/dataset_tools/preprocess.py | 2 +- 7 files changed, 29 insertions(+), 22 deletions(-) diff --git a/src/coffea/analysis_tools.py b/src/coffea/analysis_tools.py index 4f9c777c7..65f366219 100644 --- a/src/coffea/analysis_tools.py +++ b/src/coffea/analysis_tools.py @@ -25,7 +25,7 @@ class WeightStatistics: """ Container for statistics about the weight, including the sum of squared weights and number of entries. - + Parameters ---------- sumw: float @@ -39,6 +39,7 @@ class WeightStatistics: n: int The number of entries """ + def __init__(self, sumw=0.0, sumw2=0.0, minw=numpy.inf, maxw=-numpy.inf, n=0): self.sumw = sumw self.sumw2 = sumw2 @@ -54,7 +55,7 @@ def identity(self): def add(self, other): """Add two WeightStatistics objects together. - + Adds the sum of weights, the sum of squared weights, and the number of entries. Takes the minimum and maximum across the two WeightStatistics objects. Modifies this object in place. @@ -1134,7 +1135,7 @@ def names(self): def delayed_mode(self): """ Is the PackedSelection in delayed mode? - + Returns ------- res: bool diff --git a/src/coffea/btag_tools/btagscalefactor.py b/src/coffea/btag_tools/btagscalefactor.py index 47785731a..97131ebe0 100644 --- a/src/coffea/btag_tools/btagscalefactor.py +++ b/src/coffea/btag_tools/btagscalefactor.py @@ -25,7 +25,12 @@ class BTagScaleFactor: _FLAV_B, _FLAV_C, _FLAV_UDSG = range(3) _flavor = numpy.array([0, 4, 5, 6]) _flavor2btvflavor = {0: _FLAV_UDSG, 4: _FLAV_C, 5: _FLAV_B} - _wpString = {"loose": _LOOSE, "medium": _MEDIUM, "tight": _TIGHT, "reshape": _RESHAPE} + _wpString = { + "loose": _LOOSE, + "medium": _MEDIUM, + "tight": _TIGHT, + "reshape": _RESHAPE, + } _expectedColumns = [ "OperatingPoint", "measurementType", diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index 7c85f0d5b..b5e97e547 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,4 +1,5 @@ from coffea.dataset_tools.apply_processor import apply_to_dataset, apply_to_fileset +from coffea.dataset_tools.dataset_query import DataDiscoveryCLI, print_dataset_query from coffea.dataset_tools.manipulations import ( filter_files, get_failed_steps_for_dataset, @@ -9,10 +10,9 @@ slice_files, ) from coffea.dataset_tools.preprocess import preprocess -from coffea.dataset_tools.dataset_query import DataDiscoveryCLI, print_dataset_query from coffea.dataset_tools.rucio_utils import ( - get_rucio_client, get_dataset_files_replicas, + get_rucio_client, query_dataset, ) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index cd301d3ce..71612210b 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -98,7 +98,7 @@ def apply_to_fileset( ) -> dict[str, DaskOutputType] | tuple[dict[str, DaskOutputType], dask_awkward.Array]: """ Apply the supplied function or processor to the supplied fileset 
(set of datasets). - + Parameters ---------- data_manipulation : ProcessorABC or GenericHEPAnalysis diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 2eae8e3fe..41fcad96a 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -4,7 +4,7 @@ import os import random from collections import defaultdict -from typing import List, Any, Hashable, Dict +from typing import Dict, List import yaml from dask.distributed import Client @@ -15,14 +15,14 @@ from rich.tree import Tree from . import rucio_utils -from .preprocess import preprocess, FilesetSpecOptional +from .preprocess import preprocess def print_dataset_query( query: str, - dataset_list: Dict[str, Dict[str,list[str]]], + dataset_list: Dict[str, Dict[str, list[str]]], console: Console, - selected: list[str] = [] + selected: list[str] = [], ) -> None: """ Pretty-print the results of a rucio query in a table. @@ -112,6 +112,7 @@ class DataDiscoveryCLI: It can be accessed in a Python script or interpreter via this class, or from the command line (as in `python -m coffea.dataset_tools.dataset_query --help`). """ + def __init__(self): self.console = Console() self.rucio_client = None @@ -224,7 +225,7 @@ def do_whoami(self): def do_query(self, query=None): """ Look for datasets with * wildcards (like in DAS) - + Parameters ---------- query: str | None, default None @@ -264,7 +265,7 @@ def do_select(self, selection=None, metadata=None): """ Selected the datasets from the list of query results. Input a list of indices also with range 4-6 or "all". - + Parameters ---------- selection: list[str] | None, default None @@ -326,7 +327,7 @@ def do_list_selected(self): def do_replicas(self, mode=None, selection=None): """ Query Rucio for replicas. - + Parameters ---------- mode: str, default None @@ -487,7 +488,7 @@ def as_dict(self): def do_allowlist_sites(self, sites=None): """ Restrict the grid sites available for replicas query only to the requested list - + Parameters ---------- sites: list[str] | None, default None @@ -510,7 +511,7 @@ def do_allowlist_sites(self, sites=None): def do_blocklist_sites(self, sites=None): """ Exclude grid sites from the available sites for replicas query - + Parameters ---------- sites: list[str] | None, default None @@ -531,9 +532,9 @@ def do_blocklist_sites(self, sites=None): print(f"- {s}") def do_regex_sites(self, regex=None): - """ + r""" Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" - + Parameters ---------- regex: str | None, default None @@ -549,7 +550,7 @@ def do_sites_filters(self, ask_clear=True): """ Show the active sites filters (allowed, disallowed, and regex) and ask to clear them - + Parameters ---------- ask_clear: bool, default True @@ -600,7 +601,7 @@ def do_list_replicas(self): def do_save(self, filename=None): """ Save the replica information in yaml format - + Parameters: filename: str | None, default None The name of the file to save the information into diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index da6623f47..c037e7f5c 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -196,7 +196,7 @@ def get_failed_steps_for_fileset( ): """ Modify an input dataset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied report. 
- + Parameters ---------- fileset: FilesetSpec diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index 455fd0725..2e0f1de62 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -32,7 +32,7 @@ def get_steps( ) -> awkward.Array | dask_awkward.Array: """ Given a list of normalized file and object paths (defined in uproot), determine the steps for each file according to the supplied processing options. - + Parameters ---------- normed_files: awkward.Array | dask_awkward.Array From 0b421a57922e208d2e5e3df27764bd7ccff8cafa Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Wed, 31 Jul 2024 16:12:38 -0500 Subject: [PATCH 04/32] Make LOOSE, MEDIUM, etc. public attributes --- src/coffea/btag_tools/btagscalefactor.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/coffea/btag_tools/btagscalefactor.py b/src/coffea/btag_tools/btagscalefactor.py index 97131ebe0..3c8ad4f83 100644 --- a/src/coffea/btag_tools/btagscalefactor.py +++ b/src/coffea/btag_tools/btagscalefactor.py @@ -21,15 +21,15 @@ class BTagScaleFactor: If set true, keep the parsed dataframe as an attribute (.df) for later inspection """ - _LOOSE, _MEDIUM, _TIGHT, _RESHAPE = range(4) + LOOSE, MEDIUM, TIGHT, RESHAPE = range(4) _FLAV_B, _FLAV_C, _FLAV_UDSG = range(3) _flavor = numpy.array([0, 4, 5, 6]) _flavor2btvflavor = {0: _FLAV_UDSG, 4: _FLAV_C, 5: _FLAV_B} _wpString = { - "loose": _LOOSE, - "medium": _MEDIUM, - "tight": _TIGHT, - "reshape": _RESHAPE, + "loose": LOOSE, + "medium": MEDIUM, + "tight": TIGHT, + "reshape": RESHAPE, } _expectedColumns = [ "OperatingPoint", @@ -96,13 +96,13 @@ def __init__(self, filename, workingpoint, methods="comb,comb,incl", keep_df=Fal f"The BTag csv file {filename} is in the new UL format which is not supported by coffea.btag_tools.\n" "Instead one can use correctionlib for UL scale factors." ) - cut = (df["jetFlavor"] == self.FLAV_B) & (df["measurementType"] == methods[0]) + cut = (df["jetFlavor"] == self._FLAV_B) & (df["measurementType"] == methods[0]) if len(methods) > 1: - cut |= (df["jetFlavor"] == self.FLAV_C) & ( + cut |= (df["jetFlavor"] == self._FLAV_C) & ( df["measurementType"] == methods[1] ) if len(methods) > 2: - cut |= (df["jetFlavor"] == self.FLAV_UDSG) & ( + cut |= (df["jetFlavor"] == self._FLAV_UDSG) & ( df["measurementType"] == methods[2] ) cut &= df["OperatingPoint"] == workingpoint @@ -170,7 +170,7 @@ def findbin(flavor, eta, pt, discr): flavor, eta, pt, discr = (x[idx] for x in bin_low_edges) mapping[idx] = findbin(flavor, eta, pt, discr) - if self.workingpoint == BTagScaleFactor._RESHAPE: + if self.workingpoint == BTagScaleFactor.RESHAPE: self._corrections[syst] = dense_mapped_lookup( (self._flavor, edges_eta, edges_pt, edges_discr), mapping, @@ -214,7 +214,7 @@ def eval(self, systematic, flavor, eta, pt, discr=None, ignore_missing=False): """ if systematic not in self._corrections: raise ValueError("Unrecognized systematic: %s" % systematic) - if self.workingpoint == BTagScaleFactor._RESHAPE: + if self.workingpoint == BTagScaleFactor.RESHAPE: if discr is None: raise ValueError("RESHAPE scale factor requires a discriminant array") return self._corrections[systematic]( From 019ca310db3f0118da90d4e7509d5c3aa3adc40f Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Thu, 29 Aug 2024 10:23:40 -0500 Subject: [PATCH 05/32] Make FLAV_B etc. 
public --- src/coffea/btag_tools/btagscalefactor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/coffea/btag_tools/btagscalefactor.py b/src/coffea/btag_tools/btagscalefactor.py index 47785731a..6c91588e4 100644 --- a/src/coffea/btag_tools/btagscalefactor.py +++ b/src/coffea/btag_tools/btagscalefactor.py @@ -21,11 +21,11 @@ class BTagScaleFactor: If set true, keep the parsed dataframe as an attribute (.df) for later inspection """ - _LOOSE, _MEDIUM, _TIGHT, _RESHAPE = range(4) - _FLAV_B, _FLAV_C, _FLAV_UDSG = range(3) + LOOSE, MEDIUM, TIGHT, RESHAPE = range(4) + FLAV_B, FLAV_C, FLAV_UDSG = range(3) _flavor = numpy.array([0, 4, 5, 6]) - _flavor2btvflavor = {0: _FLAV_UDSG, 4: _FLAV_C, 5: _FLAV_B} - _wpString = {"loose": _LOOSE, "medium": _MEDIUM, "tight": _TIGHT, "reshape": _RESHAPE} + _flavor2btvflavor = {0: FLAV_UDSG, 4: FLAV_C, 5: FLAV_B} + _wpString = {"loose": LOOSE, "medium": MEDIUM, "tight": TIGHT, "reshape": RESHAPE} _expectedColumns = [ "OperatingPoint", "measurementType", @@ -165,7 +165,7 @@ def findbin(flavor, eta, pt, discr): flavor, eta, pt, discr = (x[idx] for x in bin_low_edges) mapping[idx] = findbin(flavor, eta, pt, discr) - if self.workingpoint == BTagScaleFactor._RESHAPE: + if self.workingpoint == BTagScaleFactor.RESHAPE: self._corrections[syst] = dense_mapped_lookup( (self._flavor, edges_eta, edges_pt, edges_discr), mapping, @@ -209,7 +209,7 @@ def eval(self, systematic, flavor, eta, pt, discr=None, ignore_missing=False): """ if systematic not in self._corrections: raise ValueError("Unrecognized systematic: %s" % systematic) - if self.workingpoint == BTagScaleFactor._RESHAPE: + if self.workingpoint == BTagScaleFactor.RESHAPE: if discr is None: raise ValueError("RESHAPE scale factor requires a discriminant array") return self._corrections[systematic]( From 5c930fd2c10942dbcba0829eb5d4fd0707a0fe85 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Thu, 29 Aug 2024 10:59:38 -0500 Subject: [PATCH 06/32] Add docstrings to parameters in get_dataset_files_replicas --- src/coffea/dataset_tools/rucio_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index a4d3bc9ce..d98ac935a 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -157,13 +157,22 @@ def get_dataset_files_replicas( ---------- dataset: str + The dataset to search for. allowlist_sites: list + List of sites to select from. If the file is not found there, raise an Exception. blocklist_sites: list + List of sites to avoid. If the file has no left site, raise an Exception. regex_sites: list + Regex expression to restrict the list of sites. mode: str, default "full" + One of "full", "first", "best", or "roundrobin". Behavior of each described above. client: rucio Client, optional + The rucio client to use. If not provided, one will be generated for you. partial_allowed: bool, default False + If False, throws an exception if any file in the dataset cannot be found. If True, + will find as many files from the dataset as it can. scope: rucio scope, "cms" + The scope for rucio to search through. 
Returns ------- From 2174c1c7072f920c558c0b411026cd6db2e46ef8 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Thu, 29 Aug 2024 11:44:39 -0500 Subject: [PATCH 07/32] Reformat docstrings for query_dataset --- src/coffea/dataset_tools/rucio_utils.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index d98ac935a..dce5b4abf 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -301,18 +301,21 @@ def query_dataset( Parameters --------- - query: str = query to filter datasets / containers with the rucio list_dids functions - client: rucio client - tree: bool = if True return the results splitting the dataset name in parts parts - datatype: "container/dataset": rucio terminology. "Container"==CMS dataset. "Dataset" == CMS block. - scope: "cms". Rucio instance + query: str + Query to filter datasets / containers with the rucio list_dids functions + client: rucio Client + The rucio client to use. If not provided, one will be generated for you + tree: bool, default False + If True, return the results splitting the dataset name in parts + datatype: str, default "container" + Options are "container", "datset". rucio terminology. "Container"==CMS dataset. "Dataset" == CMS block. + scope: str, defualt "cms" + Rucio instance Returns ------- - list of containers/datasets - - if tree==True, returns the list of dataset and also a dictionary decomposing the datasets - names in the 1st command part and a list of available 2nd parts. + List of containers/datasets. If tree==True, returns the list of dataset and also a dictionary decomposing + the datasets names in the 1st command part and a list of available 2nd parts. """ client = client if client else get_rucio_client() From 63c7e268046aec61c0b5cc15c354e6a5483cddcb Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Thu, 29 Aug 2024 15:43:23 -0500 Subject: [PATCH 08/32] Update docstrings in extractor --- src/coffea/lookup_tools/extractor.py | 60 ++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/src/coffea/lookup_tools/extractor.py b/src/coffea/lookup_tools/extractor.py index 3d73e5586..3ba3c8aaa 100644 --- a/src/coffea/lookup_tools/extractor.py +++ b/src/coffea/lookup_tools/extractor.py @@ -65,7 +65,18 @@ def __init__(self): self._finalized = False def add_weight_set(self, local_name, thetype, weights): - """adds one extracted weight to the extractor""" + """ + Adds one extracted weight to the extractor. + + Parameters + ---------- + local_name: str + The name of the weight. + thetype: str + The type of weight (eg: jme_standard_function). + weights: Varies + The weights themselves. Type and structure depends on thetype. + """ if self._finalized: raise Exception("extractor is finalized cannot add new weights!") if local_name in self._names.keys(): @@ -76,8 +87,13 @@ def add_weight_set(self, local_name, thetype, weights): def add_weight_sets(self, weightsdescs): """ - expects a list of text lines to be formatted as ' ' - allows * * and * to do easy imports of whole file + Add multiple weight sets at once, coming from one or more files. + + Parameters + ---------- + weightsdescs: Iterable[str] + Expects a list of text lines to be formatted as ' '. + Allows * * and * to do easy imports of whole file. 
""" for weightdesc in weightsdescs: if weightdesc[0] == "#": @@ -110,7 +126,14 @@ def add_weight_sets(self, weightsdescs): self._names[local_name] = 0 def import_file(self, thefile): - """cache the whole contents of a file for later processing""" + """ + Cache the whole contents of a file for later processing + + Parameters + ---------- + thefile: str + The path to the file to be imported + """ if thefile not in self._filecache.keys(): drop_gz = thefile.replace(".gz", "") file_dots = os.path.basename(drop_gz).split(".") @@ -127,7 +150,16 @@ def import_file(self, thefile): self._filecache[thefile] = file_converters[theformat][thetype](thefile) def extract_from_file(self, thefile, name): - """import a file and then extract a lookup set""" + """ + Import a file and then extract a lookup set + + Parameters + ---------- + thefile: str + The path to the file to import + name: str + The name of the weights to extract, as named in the file + """ self.import_file(thefile) weights = self._filecache[thefile] names = {key[0]: key[1] for key in weights.keys()} @@ -137,8 +169,14 @@ def extract_from_file(self, thefile, name): def finalize(self, reduce_list=None): """ - stop any further imports and if provided pare down - the stored histograms to those specified in reduce_list + Stop any further imports and, if requested, pare down + the stored histograms to those specified in reduce_list. + + Parameters + ---------- + reduce_list: list[str], optional + Reduce the weights contained in this extractor to only those with names + in reduce_list. If not provided, no such reduction takes place. """ if self._finalized: raise Exception("extractor is already finalized!") @@ -159,7 +197,13 @@ def finalize(self, reduce_list=None): self._finalized = True def make_evaluator(self): - """produce an evaluator based on the finalized extractor""" + """ + Produce an evaluator based on the finalized extractor + + Returns + ------- + An evaluator based on the names, weight types, and weights of the finalized extractor. + """ if self._finalized: return evaluator(self._names, self._types, self._weights) else: From 91f718d760fc477b38bdd5af2bd40250ada9273b Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Thu, 29 Aug 2024 16:20:09 -0500 Subject: [PATCH 09/32] Update docstrings in evaluator --- src/coffea/lookup_tools/evaluator.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/coffea/lookup_tools/evaluator.py b/src/coffea/lookup_tools/evaluator.py index 20a1ab31a..f4cc888f1 100644 --- a/src/coffea/lookup_tools/evaluator.py +++ b/src/coffea/lookup_tools/evaluator.py @@ -35,9 +35,23 @@ class evaluator: evaluator = extractor.make_evaluator() out = evaluator["testSF2d"](eta, pt) - The returned value has the same shape as the input arguments. + The returned value has the same shape as the input arguments. `lookup_types` is a map of possible + constructors for extracted data. The arguments used when calling the evaluator depend on which named + weight is being used (eg. in the above example, the "testSF2d" weight requires `eta` and `pt` be + passed when calling the evaluator). - lookup_types is a map of possible constructors for extracted data + It is recommended to construct an evaluator from an extractor, so ensure that inputs to the + constructor are properly ordered and formatted. + + Parameters + ---------- + names: dict[str, int] + A dictionary mapping the names of weights to the index of that weight in `primitives`. 
+ types: list[str] + A list of the types of weights, ordered in the same way as `primitives`. + primitives: list[Varies] + A list of primitives, whose type and structure depend on types. Should be order in the + same way as `primitives`. """ def __init__(self, names, types, primitives): From 3094a5ced6b9c47801df2f372d95ec0e9e0a2cb6 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Wed, 4 Sep 2024 14:08:28 -0500 Subject: [PATCH 10/32] Update docstrings for jetmet_tools classes --- .../jetmet_tools/FactorizedJetCorrector.py | 16 ++++++++++------ .../jetmet_tools/JetCorrectionUncertainty.py | 10 +++++++++- src/coffea/jetmet_tools/JetResolution.py | 14 +++++++++----- .../jetmet_tools/JetResolutionScaleFactor.py | 14 +++++++++----- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/src/coffea/jetmet_tools/FactorizedJetCorrector.py b/src/coffea/jetmet_tools/FactorizedJetCorrector.py index 0f752e49e..1bea71179 100644 --- a/src/coffea/jetmet_tools/FactorizedJetCorrector.py +++ b/src/coffea/jetmet_tools/FactorizedJetCorrector.py @@ -62,15 +62,19 @@ class FactorizedJetCorrector: You can use this class as follows:: fjc = FactorizedJetCorrector(name1=corrL1,...) - jetCorrs = fjc(JetParameter1=jet.parameter1,...) - + jetCorrs = fjc.getCorrection(JetParameter1=jet.parameter1,...) + + in which `jetCorrs` are the corrected jet scaled factors, with the same shape as + the input parameters. In order to see what parameters must be passed to + `getCorrection()`, one can do `fjc.signature`. + + You construct a FactorizedJetCorrector by passing in a dict of names and functions. + Names must be formatted as '____'. You + can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from + some input files. """ def __init__(self, **kwargs): - """ - You construct a FactorizedJetCorrector by passing in a dict of names and functions. - Names must be formatted as '____'. - """ jettype = None levels = [] funcs = [] diff --git a/src/coffea/jetmet_tools/JetCorrectionUncertainty.py b/src/coffea/jetmet_tools/JetCorrectionUncertainty.py index 26f43f873..4532873d9 100644 --- a/src/coffea/jetmet_tools/JetCorrectionUncertainty.py +++ b/src/coffea/jetmet_tools/JetCorrectionUncertainty.py @@ -67,8 +67,16 @@ class JetCorrectionUncertainty: You can use this class as follows:: jcu = JetCorrectionUncertainty(name1=corrL1,...) - jetUncs = jcu(JetParameter1=jet.parameter1,...) + jetUncs = jcu.getUncertainty(JetParameter1=jet.parameter1,...) + in which `jetUncs` are the uncertainties, with the same shape as the input parameters. + In order to see which parameters must be passed to `getUncertainty`, one can do + `jcu.signature`. + + You construct a JetCorrectionUncertainty by passing in a dict of names and functions. + Names must be formatted as '____'. You + can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from + some input files. """ def __init__(self, **kwargs): diff --git a/src/coffea/jetmet_tools/JetResolution.py b/src/coffea/jetmet_tools/JetResolution.py index 18cb0ca02..59c5c4bd4 100644 --- a/src/coffea/jetmet_tools/JetResolution.py +++ b/src/coffea/jetmet_tools/JetResolution.py @@ -45,15 +45,19 @@ class JetResolution: You can use this class as follows:: jr = JetResolution(name1=corrL1,...) - jetRes = jr(JetParameter1=jet.parameter1,...) + jetRes = jr.getResolution(JetParameter1=jet.parameter1,...) + in which `jetRes` are the resolutions, with the same shape as the input parameters. 
+ In order to see what parameters must be passed to `getResolution`, one can do + `jr.signature`. + + You construct a JetResolution object by passing in a dict of names and functions. + Names must be formatted as '____'. You + can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from + some input files. """ def __init__(self, **kwargs): - """ - You construct a JetResolution by passing in a dict of names and functions. - Names must be formatted as '____'. - """ jettype = None levels = [] funcs = [] diff --git a/src/coffea/jetmet_tools/JetResolutionScaleFactor.py b/src/coffea/jetmet_tools/JetResolutionScaleFactor.py index 40f7c3b5d..db1d6a555 100644 --- a/src/coffea/jetmet_tools/JetResolutionScaleFactor.py +++ b/src/coffea/jetmet_tools/JetResolutionScaleFactor.py @@ -45,15 +45,19 @@ class JetResolutionScaleFactor: You can use this class as follows:: jersf = JetResolutionScaleFactor(name1=corrL1,...) - jetResSF = jersf(JetParameter1=jet.parameter1,...) + jetResSF = jersf.getScaleFactor(JetParameter1=jet.parameter1,...) + in which `jetResSF` are the scale factors, with the same shape as the input paramters. + In order to see which parameters must be passed to `getScaleFactor`, one can do + `jersf.signature`. + + You construct a JetResolutionScaleFactor by passing in a dict of names and functions. + Names must be formatted as '____'. You + can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from + some input files. """ def __init__(self, **kwargs): - """ - You construct a JetResolutionScaleFactor by passing in a dict of names and functions. - Names must be formatted as '____'. - """ jettype = None levels = [] funcs = [] From bf13ed7a37efabece36785088d6f152f1ae98ce6 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Wed, 4 Sep 2024 15:03:16 -0500 Subject: [PATCH 11/32] Update JECStack docstring, add parameters --- src/coffea/jetmet_tools/JECStack.py | 32 ++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/coffea/jetmet_tools/JECStack.py b/src/coffea/jetmet_tools/JECStack.py index d76124f65..c308c465a 100644 --- a/src/coffea/jetmet_tools/JECStack.py +++ b/src/coffea/jetmet_tools/JECStack.py @@ -8,12 +8,34 @@ class JECStack: + """ + Mostly used as an input to `CorrectedJetsFactory`. Hosts and organizes multiple + corrections under one object. + + jec, junc, etc. can be explicitly set by passing in the appropriate corrector class + (eg: FactorizedJetCorrector). If they are not set, correctors will be created, using + the info in `corrections` as input. + + Paramters + --------- + corrections: dict[str,lookup_base] + A dict-like of function names and functions. The function depends on the type + of correction (eg: for JEC, should be jme_standard_function). We expect JEC + names to be formatted as their filenames. + jec: FactorizedJetCorrector, optional + If provided, overrides the jec that would be created from `corrections` in + the stack. + junc: JetCorrectionUncertainty, optional + If provided, overrides the junc that would be created from `corrections` in + the stack. + jer: JetResolution, optional + If provided, overrides the jer that would be created from `corrections` in + the stack. + jersf: JetResolutionScaleFactor, optional + If provided, overrides the jersf that would be created from `corrections` in + the stack. 
+ """ def __init__(self, corrections, jec=None, junc=None, jer=None, jersf=None): - """ - corrections is a dict-like of function names and functions - we expect JEC names to be formatted as their filenames - jecs, etc. can be overridden by passing in the appropriate corrector class. - """ self._jec = None self._junc = None self._jer = None From d152528ff76264a5f808fb9262352895cc7941ed Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Wed, 4 Sep 2024 16:12:47 -0500 Subject: [PATCH 12/32] Add docstrings to attributes --- .../jetmet_tools/CorrectedJetsFactory.py | 41 +++++++++++++++++++ src/coffea/jetmet_tools/JECStack.py | 18 +++++++- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/coffea/jetmet_tools/CorrectedJetsFactory.py b/src/coffea/jetmet_tools/CorrectedJetsFactory.py index a7637c69b..a47db9a8b 100644 --- a/src/coffea/jetmet_tools/CorrectedJetsFactory.py +++ b/src/coffea/jetmet_tools/CorrectedJetsFactory.py @@ -129,6 +129,26 @@ def getfunction(layout, depth, **kwargs): class CorrectedJetsFactory: + """ + Factory class for applying corrections to jets, including organizing variations + (eg: JES up and down). It is constructed from a name map, which translates between + field names for corrections and field names in inputs, and a JECStack, which + contains the actual correction functions. + + Once a CorrectedJetsFactory is constructed, the `build` method can produce corrected + jets from an input array of jets. + + Parameters + ---------- + name_map: dict[str,str] + Keys are argument names in the various corrections' signatures (eg: the `signature` + attribute of a `FactorizedJJetCorrector` object). Values are the names of the + corresponding fields as they would appear in the jet array passed to the `build` + method. + jec_stack: JECStack + Contains the corrections that will be applied to the input jet array when calling + `build`. + """ def __init__(self, name_map, jec_stack): # from PhysicsTools/PatUtils/interface/SmearedJetProducerT.h#L283 self.forceStochastic = False @@ -173,12 +193,33 @@ def __init__(self, name_map, jec_stack): self.jec_stack = jec_stack def uncertainties(self): + """ + Returns a list of the sources of uncertainty included in the stack. + + Returns + ------- + list[str] + A list of the sources of uncertainty. + """ out = ["JER"] if self.jec_stack.jer is not None else [] if self.jec_stack.junc is not None: out.extend([f"JES_{unc}" for unc in self.jec_stack.junc.levels]) return out def build(self, injets): + """ + Apply the corrections to the array of jets, returning an array of corrected + jets. + + Parameters + ---------- + injets: (Awkward array[jets]) + An array of uncorrected jets, to which we want to apply corrections. + + Returns + ------- + Awkward array of jets, representing the corrected jets. + """ if not isinstance(injets, (awkward.highlevel.Array, dask_awkward.Array)): raise Exception("input jets must be an (dask_)awkward array of some kind!") diff --git a/src/coffea/jetmet_tools/JECStack.py b/src/coffea/jetmet_tools/JECStack.py index c308c465a..f6c1dce89 100644 --- a/src/coffea/jetmet_tools/JECStack.py +++ b/src/coffea/jetmet_tools/JECStack.py @@ -16,7 +16,7 @@ class JECStack: (eg: FactorizedJetCorrector). If they are not set, correctors will be created, using the info in `corrections` as input. - Paramters + Parameters --------- corrections: dict[str,lookup_base] A dict-like of function names and functions. 
The function depends on the type @@ -121,6 +121,10 @@ def __init__(self, corrections, jec=None, junc=None, jer=None, jersf=None): @property def blank_name_map(self): + """ + A dictionary in the form of the `name_map` input parameter for + `CorrectedJetsFactory`, with all keys mapped to None. + """ out = { "massRaw", "ptRaw", @@ -148,16 +152,28 @@ def blank_name_map(self): @property def jec(self): + """ + The stack's FactorizedJetCorrector object. + """ return self._jec @property def junc(self): + """ + The stack's JetCorrectionUncertainty object. + """ return self._junc @property def jer(self): + """ + The stack's JetResolution object. + """ return self._jer @property def jersf(self): + """ + The stack's JetResolutionScaleFactor object. + """ return self._jersf From 36ad043e43474feb72990da573c850441a55474a Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Wed, 4 Sep 2024 16:49:46 -0500 Subject: [PATCH 13/32] Add docs to CMF, tweak docs in CJF --- .../jetmet_tools/CorrectedJetsFactory.py | 3 +- .../jetmet_tools/CorrectedMETFactory.py | 46 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/coffea/jetmet_tools/CorrectedJetsFactory.py b/src/coffea/jetmet_tools/CorrectedJetsFactory.py index a47db9a8b..e6621a4f3 100644 --- a/src/coffea/jetmet_tools/CorrectedJetsFactory.py +++ b/src/coffea/jetmet_tools/CorrectedJetsFactory.py @@ -218,7 +218,8 @@ def build(self, injets): Returns ------- - Awkward array of jets, representing the corrected jets. + Awkward array of jets, representing the corrected jets, with shape matching + `injets`. """ if not isinstance(injets, (awkward.highlevel.Array, dask_awkward.Array)): raise Exception("input jets must be an (dask_)awkward array of some kind!") diff --git a/src/coffea/jetmet_tools/CorrectedMETFactory.py b/src/coffea/jetmet_tools/CorrectedMETFactory.py index 5853dc5b2..ded176cf9 100644 --- a/src/coffea/jetmet_tools/CorrectedMETFactory.py +++ b/src/coffea/jetmet_tools/CorrectedMETFactory.py @@ -19,6 +19,29 @@ def corrected_polar_met( class CorrectedMETFactory: + """ + Factory class for propagating corrections made to jets into a corrected value + of MET. This includes organizing different variations associated with uncertainties + in MET from unclustered energy. + + Once the `CorrectedMETFactory` is constructed, an array of corrected MET values and + variations can be produced with the `build` method, which requires an array of + uncorrected MET and an array of corrected jets. + + Parameters + ---------- + name_map: dict[str,str] + Keys must include at least the following: + - METpt + - METphi + - JetPt + - JetPhi + - ptRaw + - UnClusteredEnergyDeltaX + - UnClusteredEnergyDeltaY + and each of those must be mapped to the corresponding field name of the input + arrays `in_MET` and `in_corrected_jets` for the `build` method. + """ def __init__(self, name_map): for name in [ "METpt", @@ -37,6 +60,21 @@ def __init__(self, name_map): self.name_map = name_map def build(self, in_MET, in_corrected_jets): + """ + Produce an array of corrected MET values from an array of uncorrected MET + values and an array of corrected jets. + + Parameters + ---------- + in_MET: (Awkward array[float]) + An array of raw (uncorrected) MET values. + in_corrected_jets: (Awkward array[jets]) + An array of corrected jets, as produced by `CorrectedJetsFactory`. + + Returns + ------- + Awkward array of corrected MET values, with shape matching `in_MET`. 
+ """ if not isinstance( in_MET, (awkward.highlevel.Array, dask_awkward.Array) ) or not isinstance( @@ -171,4 +209,12 @@ def create_variants(raw_met, corrected_jets_or_variants, dx, dy): return out def uncertainties(self): + """ + Returns a list of the sources of uncertainty included in the stack. + + Returns + ------- + list[str] + A list of the sources of uncertainty. + """ return ["MET_UnclusteredEnergy"] From dbc9687e1758094e759b266fd62b56d9f82488c0 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Thu, 5 Sep 2024 15:29:09 -0500 Subject: [PATCH 14/32] Add docstrings to lumitools --- src/coffea/lumi_tools/lumi_tools.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/coffea/lumi_tools/lumi_tools.py b/src/coffea/lumi_tools/lumi_tools.py index 0dfc3aae2..29170d554 100644 --- a/src/coffea/lumi_tools/lumi_tools.py +++ b/src/coffea/lumi_tools/lumi_tools.py @@ -33,14 +33,18 @@ class LumiData: Parameters ---------- lumi_csv : str - The path the the luminosity csv output file + The path to the luminosity csv file to read from. Generally, this is the output file from brilcalc. + is_inst_lumi: bool, default False + If True, treats the values read in from `lumi_csv` as average instantaneous luminosities, instead of integrated luminosities. - The values are extracted from the csv output as returned by brilcalc, e.g. with a command such as:: + The values are extracted from the csv output as returned by brilcalc_, e.g. with a command such as:: brilcalc lumi -c /cvmfs/cms.cern.ch/SITECONF/local/JobConfig/site-local-config.xml \ -b "STABLE BEAMS" --normtag=/cvmfs/cms-bril.cern.ch/cms-lumi-pog/Normtags/normtag_PHYSICS.json \ -u /pb --byls --output-style csv -i Cert_294927-306462_13TeV_PromptReco_Collisions17_JSON.txt > lumi2017.csv + .. _brilcalc: https://cms-service-lumi.web.cern.ch/cms-service-lumi/brilwsdoc.html + Note that some brilcalc files may be in different units than inverse picobarns, including possibly average instantaneous luminosity. You should make sure that you understand the units of the LumiData file you are using before calculating luminosity with this tool. If you are using a LumiData file containing avg. inst. luminosity, make sure to set is_inst_lumi=True in the constructor of this class. @@ -73,7 +77,12 @@ def get_lumi(self, runlumis): ---------- runlumis : numpy.ndarray or LumiList A 2d numpy array of ``[[run,lumi], [run,lumi], ...]`` or `LumiList` object - of the lumiSections to integrate over. + of the lumiSections to integrate over, where `run` is a run number and `lumi` is a + lumisection number. + + Returns + ------- + (float) The total integrated luminosity of the runs and lumisections indicated in `runlumis`. """ if self.index is None: self.index = Dict.empty( @@ -132,14 +141,15 @@ def _get_lumi_kernel(runs, lumis, index, tot_lumi): class LumiMask: - """Holds a luminosity mask index, and provides vectorized lookup + """ + Holds a luminosity mask index, and provides vectorized lookup, retaining only valid (run,lumisection) pairs. Parameters ---------- jsonfile : str Path the the 'golden json' file or other valid lumiSection database in json format. - This class parses a CMS lumi json into an efficient valid lumiSection lookup table + This class parses a CMS lumi json into an efficient valid lumiSection lookup table. 
""" def __init__(self, jsonfile): @@ -154,7 +164,8 @@ def __init__(self, jsonfile): self._masks[numpy.uint32(run)] = mask def __call__(self, runs, lumis): - """Check if run and lumi are valid + """ + Check pairs of runs and lumis for validity, and produce a mask retaining the valid pairs. Parameters ---------- From 0b5b2b682ed25804a0b9cd0ac00965550e5d9c7f Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Thu, 5 Sep 2024 15:45:29 -0500 Subject: [PATCH 15/32] Tweak docstrings in numpy_call_wrapper --- src/coffea/ml_tools/helper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/coffea/ml_tools/helper.py b/src/coffea/ml_tools/helper.py index 15d282879..db3ed2bb3 100644 --- a/src/coffea/ml_tools/helper.py +++ b/src/coffea/ml_tools/helper.py @@ -136,13 +136,13 @@ class numpy_call_wrapper(abc.ABC): For tools outside the coffea package (like for ML inference), the inputs typically expect a numpy-like input. This class wraps up the user-level - awkward->numpy data mangling and the underling numpy evaluation calls to + awkward->numpy data mangling and the underlying numpy evaluation calls to recognizable to dask. For the class to be fully functional, the user must overload these methods: - - numpy_call: How the evaluation using all numpy tool be performed - - prepare_awkward: How awkward arrays should be translated to the a numpy + - numpy_call: How the evaluation using all-numpy tool be performed + - prepare_awkward: How awkward arrays should be translated to a numpy format that is compatible with the numpy_call Additionally, the following helper functions can be omitted, but will help @@ -150,7 +150,7 @@ class numpy_call_wrapper(abc.ABC): - validate_numpy_input: makes sure the computation routine understand the input. - - numpy_to_awkward: Additional translation to convert numpy outputs to + - postprocess_awkward: Additional translation to convert numpy outputs to awkward (defaults to a simple `awkward.from_numpy` conversion) """ @@ -232,7 +232,7 @@ def postprocess_awkward(self, return_array, *args, **kwargs): def _call_awkward(self, *args, **kwargs): """ The common routine of prepare_awkward conversion, numpy evaluation, - then numpy_to_awkward conversion. + then postprocess_awkward conversion. """ ak_args, ak_kwargs = self.prepare_awkward(*args, **kwargs) (np_args, np_kwargs), _ = self._ak_to_np_(*ak_args, **ak_kwargs) @@ -245,7 +245,7 @@ def _call_dask(self, *args, **kwargs): Wrapper required for dask awkward calls. Here we create a new callable class (_callable_wrap) that packs the - prepare_awkward/numpy_call/numpy_to_awkward call routines to be + prepare_awkward/numpy_call/postprocess_awkward call routines to be passable to the dask_awkward.map_partition method. In addition, because map_partition by default expects the callable's From 55da601c934da17973c2a2b459e58c3047893254 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Mon, 9 Sep 2024 14:26:47 -0500 Subject: [PATCH 16/32] Expand docstring for torch_wrapper --- src/coffea/ml_tools/torch_wrapper.py | 42 ++++++++++++++++++---------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/src/coffea/ml_tools/torch_wrapper.py b/src/coffea/ml_tools/torch_wrapper.py index 36625d244..e09f92afb 100644 --- a/src/coffea/ml_tools/torch_wrapper.py +++ b/src/coffea/ml_tools/torch_wrapper.py @@ -14,24 +14,38 @@ class torch_wrapper(nonserializable_attribute, numpy_call_wrapper): """ Wrapper for running pytorch with awkward/dask-awkward inputs. 
- """ - def __init__(self, torch_jit: str): - """ - As torch models are not guaranteed to be serializable we load the model - using torch save-state files. Notice that we only support TorchScript - files for this wrapper class [1]. If the user is attempting to run on - the clusters, the TorchScript file will need to be passed to the worker - nodes in a way which preserves the file path. + As torch models are not guaranteed to be serializable we load the model + using torch save-state files. Notice that we only support TorchScript + files for this wrapper class [1]. If the user is attempting to run on + the clusters, the TorchScript file will need to be passed to the worker + nodes in a way which preserves the file path. - [1] - https://pytorch.org/tutorials/beginner/saving_loading_models.html#export-load-model-in-torchscript-format + Once an instance `wrapper` of this class is created, it can be called on inputs + like `wrapper(*args)`, where `args` are the inputs to `prepare_awkward` (see + next paragraph). - Parameters - ---------- + In order to actually use the class, the user must override the method + `prepare_awkward`. The input to this method is an arbitrary number of awkward + arrays or dask awkward arrays (but never a mix of dask/non-dask array). The + output is two objects: a tuple `a` and a dictionary `b` such that the underlying + `pytorch` model instance calls like `model(*a,**b)`. The contents of a and b + should be numpy-compatible awkward-like arrays: if the inputs are non-dask awkward + arrays, the return should also be non-dask awkward arrays that can be trivially + converted to numpy arrays via a ak.to_numpy call; if the inputs are dask awkward + arrays, the return should be still be dask awkward arrays that can be trivially + converted via a to_awkward().to_numpy() call. - - torch_jit: Path to the TorchScript file to load - """ + [1] + https://pytorch.org/tutorials/beginner/saving_loading_models.html#export-load-model-in-torchscript-format + + Parameters + ---------- + torch_jit: str + Path to the TorchScript file to load + """ + + def __init__(self, torch_jit: str): if _torch_import_error is not None: warnings.warn( "Users should make sure the torch package is installed before proceeding!\n" From a4cac801f4e2c1c75626043e12303b4aca4f1132 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Mon, 9 Sep 2024 15:38:23 -0500 Subject: [PATCH 17/32] Update docstrings for triton_wrapper and xgboost_wrapper --- src/coffea/ml_tools/triton_wrapper.py | 102 +++++++++++++++++-------- src/coffea/ml_tools/xgboost_wrapper.py | 35 ++++++++- 2 files changed, 102 insertions(+), 35 deletions(-) diff --git a/src/coffea/ml_tools/triton_wrapper.py b/src/coffea/ml_tools/triton_wrapper.py index 29b78986d..626bb0751 100644 --- a/src/coffea/ml_tools/triton_wrapper.py +++ b/src/coffea/ml_tools/triton_wrapper.py @@ -21,9 +21,37 @@ class triton_wrapper(nonserializable_attribute, numpy_call_wrapper): Wrapper for running triton inference. The target of this class is such that all triton specific operations are - wrapped and abstracted-away from the users. The users should then only needs + wrapped and abstracted-away from the users. The user should then only need to handle awkward-level operations to mangle the arrays into the expected - input format required by the the model of interest. + input format required by the the model of interest. This must be done by + overriding the `prepare_awkward` method. 
+ + Once an instance `wrapper` of this class is created, it can be called on inputs + like `wrapper(*args)`, where `args` are the inputs to `prepare_awkward` (see + next paragraph). + + In order to actually use the class, the user must override the method + `prepare_awkward`. The input to this method is an arbitrary number of awkward + arrays or dask awkward arrays (but never a mix of dask/non-dask array). The + output is two objects: a tuple `a` and a dictionary `b` such that the underlying + `tritonclient` instance calls like `client(*a,**b)`. The contents of a and b + should be numpy-compatible awkward-like arrays: if the inputs are non-dask awkward + arrays, the return should also be non-dask awkward arrays that can be trivially + converted to numpy arrays via a ak.to_numpy call; if the inputs are dask awkward + arrays, the return should be still be dask awkward arrays that can be trivially + converted via a to_awkward().to_numpy() call. + + Parameters + ---------- + model_url: str + A string in the format of: `triton+://
//` + + client_args: dict[str,str], optional + Optional keyword arguments to pass to the underlying `InferenceServerClient` objects. + + batch_size: int, default -1 + How the input arrays should be split up for analysis processing. Leave negative to + have this automatically resolved. """ batch_size_fallback = 10 # Fall back should batch size not be determined. @@ -32,19 +60,6 @@ class triton_wrapper(nonserializable_attribute, numpy_call_wrapper): def __init__( self, model_url: str, client_args: Optional[Dict] = None, batch_size=-1 ): - """ - Parameters - ---------- - - - model_url: A string in the format of: - triton+://
// - - - client_args: optional keyword arguments to pass to the underlying - `InferenceServerClient` objects. - - - batch_size: How the input arrays should be split up for analysis - processing. Leave negative to have this automatically resolved. - """ if _triton_import_error is not None: warnings.warn( "Users should make sure the tritonclient package is installed before proceeding!\n" @@ -157,8 +172,28 @@ def validate_numpy_input( self, output_list: List[str], input_dict: Dict[str, numpy.array] ) -> None: """ - tritonclient can return the expected input array dimensions and - available output values. + Check that tritonclient can return the expected input array dimensions and + available output values. Can be useful when ensuring that data is being properly + mangled for Triton. This method is called just before passing to the Triton client + when an inference request is made. + + If no errors are raised, it is understood that the input is validated by this function. + + Parameters + ---------- + output_list: list[str] + List of string corresponding to the name of the outputs + of interest. These strings will be automatically translated into the + required `tritonclient.InferRequestedOutput` objects. This is identical + to the first argument the user passes in when calling the `triton_wrapper` + instance. + + input_dict: dict[str,np.array] + Dictionary with the model's input-names as the key and the + appropriate numpy array as the dictionary value. This dictionary is + automatically translated into a list of `tritonclient.InferInput` + objects. This is identical to the second argument the user passes in when + calling the `triton_wrapper` instance. """ # Input value checking for iname, iarr in input_dict.items(): @@ -213,22 +248,23 @@ def numpy_call( """ Parameters ---------- - - - output_list: List of string corresponding to the name of the outputs - of interest. These strings will be automatically translated into the - required `tritonclient.InferRequestedOutput` objects. - - - input_dict: Dictionary with the model's input-names as the key and the - appropriate numpy array as the dictionary value. This dictionary is - automatically translated into a list of `tritonclient.InferInput` - objects. - - - Return - ------ - - The return will be the dictionary of numpy arrays that have the - output_list arguments as keys. + output_list: list[str] + List of string corresponding to the name of the outputs + of interest. These strings will be automatically translated into the + required `tritonclient.InferRequestedOutput` objects. + + input_dict: dict[str,np.array] + Dictionary with the model's input-names as the key and the + appropriate numpy array as the dictionary value. This dictionary is + automatically translated into a list of `tritonclient.InferInput` + objects. + + + Returns + ------- + dict[str,np.array] + The return will be the dictionary of numpy arrays that have the + output_list arguments as keys. """ # Setting up the inference input containers diff --git a/src/coffea/ml_tools/xgboost_wrapper.py b/src/coffea/ml_tools/xgboost_wrapper.py index 6c8271914..a66ec559a 100644 --- a/src/coffea/ml_tools/xgboost_wrapper.py +++ b/src/coffea/ml_tools/xgboost_wrapper.py @@ -16,6 +16,12 @@ class xgboost_wrapper(numpy_call_wrapper, nonserializable_attribute): """ Very simple wrapper for xgbooster inference. The xgboost.Booster object is nonserializable, so the users should pass in the xgboost model file. 
+ + Parameters + ---------- + fname: str + Path to the xgboost model file, such that an `xgbooster` can be created + via `xgboost.Booster(model_file=fname)`. """ def __init__(self, fname): @@ -43,11 +49,25 @@ def validate_numpy_input( predict_args: Optional[Dict] = None, ): """ + Check that the arguments to be passed into the actual xgboost inference + request are valid. + The inner most dimension of the data array should be smaller than the number of features of the xgboost model. (Will raise a warning if mismatched). We will not attempt to parse the kwargs passed to the construction of a DMatrix, or the predict call, as those advanced features are expected to be properly handled by the user. + + Parameters + ---------- + data: np.ndarray + The data to pass into the `xgboost.DMatrix` construction. + dmat_args: dict[str,str], optional + Keyword arguments to pass into the `xgboost.DMatrix` construction. + predict_args: dict[str,str], optional + Keyword arguments to pass to the actual prediction step of `xgboost`, + ie: the `predict` method of `xgbooster.Booster.predict`. Note that the + first argument of that method is handled by this method. """ ndims = data.shape[-1] nfeat = self.xgbooster.num_features() @@ -68,10 +88,21 @@ def numpy_call( predict_args: Optional[Dict] = None, ): """ - Passing the numpy array data as-is to the construction of an + Pass the numpy array data as-is to the construction of an xgboost.DMatrix constructor (with additional keyword arguments should - they be specified), the run the xgboost.Booster.predict method (with + they be specified), then run the xgboost.Booster.predict method (with additional keyword arguments). + + Parameters + ---------- + data: np.ndarray + The data to pass into the `xgboost.DMatrix` construction. + dmat_args: dict[str,str], optional + Keyword arguments to pass into the `xgboost.DMatrix` construction. + predict_args: dict[str,str], optional + Keyword arguments to pass to the actual prediction step of `xgboost`, + ie: the `predict` method of `xgbooster.Booster.predict`. Note that the + first argument of that method is handled by this method. """ if dmat_args is None: dmat_args = {} From 023bf546231fbfb54f09f90ed29177e9320554da Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Tue, 10 Sep 2024 14:26:36 -0500 Subject: [PATCH 18/32] Add returns docstrings to factory methods --- src/coffea/nanoevents/factory.py | 35 ++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/coffea/nanoevents/factory.py b/src/coffea/nanoevents/factory.py index 34866a11b..58dbd6cb2 100644 --- a/src/coffea/nanoevents/factory.py +++ b/src/coffea/nanoevents/factory.py @@ -207,7 +207,12 @@ def __call__(self, form): class NanoEventsFactory: - """A factory class to build NanoEvents objects""" + """ + A factory class to build NanoEvents objects. + + For most users, it is advisable to construct instances via methods like `from_root` so that + the constructor args are properly set. + """ def __init__(self, schema, mapping, partition_key, cache=None, is_dask=False): self._is_dask = is_dask @@ -292,6 +297,11 @@ def from_root( see: https://github.com/scikit-hep/uproot5/blob/main/src/uproot/_dask.py#L109 interpretation_executor (None or Executor with a ``submit`` method): see: https://github.com/scikit-hep/uproot5/blob/main/src/uproot/_dask.py#L113 + + Returns + ------- + out: NanoEventsFactory + A NanoEventsFactory instance built from the file at `file`. 
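+
+        A minimal usage sketch (illustrative; the file name is a placeholder)::
+
+            from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
+
+            factory = NanoEventsFactory.from_root(
+                {"nano_dy.root": "Events"},
+                schemaclass=NanoAODSchema,
+            )
+            events = factory.events()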
""" if treepath is not uproot._util.unset and not isinstance( @@ -442,6 +452,11 @@ def from_parquet( Pass a list instance to record which branches were lazily accessed by this instance delayed: Nanoevents will use dask as a backend to construct a delayed task graph representing your analysis. + + Returns + ------- + out: NanoEventsFactory + A NanoEventsFactory instance built from the file at `file`. """ import pyarrow import pyarrow.dataset as ds @@ -581,6 +596,11 @@ def from_preloaded( Arbitrary metadata to add to the `base.NanoEvents` object access_log : list, optional Pass a list instance to record which branches were lazily accessed by this instance + + Returns + ------- + out: NanoEventsFactory + A NanoEventsFactory instance built from information in `array_source`. """ if not isinstance(array_source, Mapping): raise TypeError( @@ -679,7 +699,18 @@ def __len__(self): return stop - start def events(self): - """Build events""" + """ + Build events + + Returns + ------- + out: + If the NanoEventsFactory is running in delayed mode (Dask), this is + a Dask awkward array of the events. If the mapping also produces a + report, the output will be a tuple (events, report). + If the factory is not running in delayed mode, this is an awkward + array of the events. + """ if self._is_dask: events = self._mapping(form_mapping=self._schema) report = None From c9d4d103cb1f5e6665de680164f8811ce279afd7 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Tue, 10 Sep 2024 15:08:22 -0500 Subject: [PATCH 19/32] Update docstrings in schemas --- src/coffea/nanoevents/schemas/base.py | 4 ++-- src/coffea/nanoevents/schemas/delphes.py | 12 +++++++++- src/coffea/nanoevents/schemas/nanoaod.py | 27 ++++++++++++++++++---- src/coffea/nanoevents/schemas/physlite.py | 2 +- src/coffea/nanoevents/schemas/treemaker.py | 13 ++++++++++- 5 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/coffea/nanoevents/schemas/base.py b/src/coffea/nanoevents/schemas/base.py index 59a4b4285..4b1705c9b 100644 --- a/src/coffea/nanoevents/schemas/base.py +++ b/src/coffea/nanoevents/schemas/base.py @@ -122,12 +122,12 @@ def __init__(self, base_form, *args, **kwargs): @property def form(self): - """Awkward form of this schema""" + """Awkward form of this schema (dict)""" return self._form @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import base return base.behavior diff --git a/src/coffea/nanoevents/schemas/delphes.py b/src/coffea/nanoevents/schemas/delphes.py index 5b1095ee6..0e708e806 100644 --- a/src/coffea/nanoevents/schemas/delphes.py +++ b/src/coffea/nanoevents/schemas/delphes.py @@ -68,6 +68,9 @@ class DelphesSchema(BaseSchema): "Rho": "Rho", "ScalarHT": "ScalarHT", } + """ + Default configuration for mixin types, based on the collection name. + """ # These are stored as length-1 vectors unnecessarily singletons = [ @@ -79,6 +82,10 @@ class DelphesSchema(BaseSchema): "ScalarHT", "MissingET", ] + """ + Fields that are stored as length-1 vectors in Delphes, to be flattened out in nanoevents + (removing an unnecessary level of nesting). 
+ """ docstrings = { "AlphaQCD": "value of the QCD coupling used in the event, see hep-ph/0109068", @@ -197,6 +204,9 @@ class DelphesSchema(BaseSchema): "ZOuter": "position (z component) at the edge", "Zd": "Z coordinate of point of closest approach to vertex", } + """ + The docstrings for each field in the resulting nanoevents + """ def __init__(self, base_form, version="latest", *args, **kwargs): super().__init__(base_form) @@ -310,7 +320,7 @@ def _preprocess_branch_form(objname, form): @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import delphes return delphes.behavior diff --git a/src/coffea/nanoevents/schemas/nanoaod.py b/src/coffea/nanoevents/schemas/nanoaod.py index ecf9bbaee..d456d7232 100644 --- a/src/coffea/nanoevents/schemas/nanoaod.py +++ b/src/coffea/nanoevents/schemas/nanoaod.py @@ -43,8 +43,8 @@ class NanoAODSchema(BaseSchema): """ __dask_capable__ = True - warn_missing_crossrefs = True - error_missing_event_ids = True + warn_missing_crossrefs = True #If True, issues a warning when a missing global index cross-ref target is encountered + error_missing_event_ids = True #If True, raises an exception when 'run', 'event', or 'luminosityBlock' fields are missing event_ids = ["run", "luminosityBlock", "event"] """List of NanoAOD event IDs @@ -189,17 +189,34 @@ def v7(cls, base_form): For example, one can use ``NanoEventsFactory.from_root("file.root", schemaclass=NanoAODSchema.v7)`` to ensure NanoAODv7 compatibility. + + Returns + ------- + out: NanoAODSchema + Schema assuming NanoAODv7 """ return cls(base_form, version="7") @classmethod def v6(cls, base_form): - """Build the NanoEvents assuming NanoAODv6""" + """Build the NanoEvents assuming NanoAODv6 + + Returns + ------- + out: NanoAODSchema + Schema assuming NanoAODv6 + """ return cls(base_form, version="6") @classmethod def v5(cls, base_form): - """Build the NanoEvents assuming NanoAODv5""" + """Build the NanoEvents assuming NanoAODv5 + + Returns + ------- + out: NanoAODSchema + Schema assuming NanoAODv5 + """ return cls(base_form, version="5") def _build_collections(self, field_names, input_contents): @@ -327,7 +344,7 @@ def _build_collections(self, field_names, input_contents): @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import nanoaod return nanoaod.behavior diff --git a/src/coffea/nanoevents/schemas/physlite.py b/src/coffea/nanoevents/schemas/physlite.py index 2a17e1892..42e5f692e 100644 --- a/src/coffea/nanoevents/schemas/physlite.py +++ b/src/coffea/nanoevents/schemas/physlite.py @@ -159,7 +159,7 @@ def _create_eventindex_form(base_form, key): @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import physlite return physlite.behavior diff --git a/src/coffea/nanoevents/schemas/treemaker.py b/src/coffea/nanoevents/schemas/treemaker.py index a90bd609b..c420307f5 100644 --- a/src/coffea/nanoevents/schemas/treemaker.py +++ b/src/coffea/nanoevents/schemas/treemaker.py @@ -166,7 +166,7 @@ def _build_collections(self, branch_forms): @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import base, vector behavior = {} @@ -181,6 
+181,17 @@ def uproot_writeable(cls, events): writeable. Based off the discussion thread here [1], but added specific cased to handled the nested structures define for TreeMaker n-tuples. [1] https://github.com/CoffeaTeam/coffea/discussions/735 + + Parameters + ---------- + events: TreeMakerSchema events + The TreeMakerSchema events to be turned into something uproot-writeable + + Returns + ------- + out: dict + An uproot-writeable dictionary representing the same information as the input + TreeMakerSchema events """ import awkward as ak From 57ebec1f62230f9c10990d6681ffc136d9929202 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Wed, 9 Oct 2024 11:40:29 -0500 Subject: [PATCH 20/32] Update docstring for Systematic --- src/coffea/nanoevents/methods/base.py | 29 +++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/coffea/nanoevents/methods/base.py b/src/coffea/nanoevents/methods/base.py index 896f53a25..3e06162da 100644 --- a/src/coffea/nanoevents/methods/base.py +++ b/src/coffea/nanoevents/methods/base.py @@ -26,14 +26,22 @@ def __call__(self, coll: awkward.Array, *args: Any, **kwargs: Any) -> awkward.Ar @awkward.mixin_class(behavior) class Systematic: - """A base mixin class to describe and build variations on a feature of an nanoevents object.""" + """A base mixin class to describe and build variations on a feature of a nanoevents object.""" _systematic_kinds = set() @classmethod def add_kind(cls, kind: str): """ - Register a type of systematic variation, it must fulfill the base class interface. + Register a type of systematic variation, which must fulfill the base class interface. Types of + systematic variations must be registered here before an actual systematic of that type can be + added. For example, by default an up/down systematic is registered, as described in + `coffea.nanoevents.methods.systematics.UpDownSystematic`. + + Parameters + ---------- + kind: str + The name of the type of systematic described by this class """ cls._systematic_kinds.add(kind) @@ -96,10 +104,19 @@ def add_systematic( varying_function: Callable, ): """ - name: str, name of the systematic variation / uncertainty source - kind: str, the name of the kind of systematic variation - what: Union[str, List[str], Tuple[str]], name what gets varied, this could be a list or tuple of column names - varying_function: Union[function, bound method], a function that describes how 'what' is varied, it must close over all non-event-data arguments. + Add a systematic to the nanoevents object's `systematics` field, with field name `name`, of kind `kind` (must be registered + with `add_kind` already), and varying the objects under field(s) `what` with a function `varying_function`. + + Parameters + ---------- + name: str + Name of the systematic variation / uncertainty source + kind: str + The name of the kind of systematic variation + what: Union[str, List[str], Tuple[str]] + Name what gets varied, this could be a list or tuple of column names + varying_function: Union[function, bound method] + A function that describes how 'what' is varied, it must close over all non-event-data arguments. 
""" self._ensure_systematics() From 35b24267b10561ec2e93e577ad8e2bdd66d40b8a Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Mon, 14 Oct 2024 10:44:30 -0500 Subject: [PATCH 21/32] Add docstrings to children, distinctChildren --- src/coffea/nanoevents/methods/nanoaod.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/coffea/nanoevents/methods/nanoaod.py b/src/coffea/nanoevents/methods/nanoaod.py index 13b1782ff..3cd2d4378 100644 --- a/src/coffea/nanoevents/methods/nanoaod.py +++ b/src/coffea/nanoevents/methods/nanoaod.py @@ -126,6 +126,10 @@ def distinctParent(self, dask_array): @dask_property def children(self): + """ + Accessor to direct children of this particle (not grandchildren). Includes particles + with the same PDG ID as this particle. + """ return self._events().GenPart._apply_global_index(self.childrenIdxG) @children.dask @@ -134,6 +138,12 @@ def children(self, dask_array): @dask_property def distinctChildren(self): + """ + Accessor to direct children of this particle which do not have the same PDG ID as + this particle. Note that this implies the summed four-momentum of the distinctChildren + may not sum to the four-momentum of this particle (for example, if this particle + radiates another particle type). If that behavior is desired, see `distinctChildrenDeep`. + """ return self._events().GenPart._apply_global_index(self.distinctChildrenIdxG) @distinctChildren.dask @@ -144,7 +154,12 @@ def distinctChildren(self, dask_array): @dask_property def distinctChildrenDeep(self): - """Accessor to distinct child particles with different PDG id, or last ones in the chain""" + """ + Accessor to distinct child particles with different PDG id, or last ones in the chain. + Note that this does not always find the correct children, since this sometimes depends + on the MC generator! See `here ` for more + information. + """ warnings.warn( "distinctChildrenDeep may not give correct answers for all generators!" ) From 3167d1c9c8dc8e219f0b5ea6432fab63aa7a5aae Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Mon, 14 Oct 2024 10:50:44 -0500 Subject: [PATCH 22/32] Add docstrings to parent classes --- src/coffea/nanoevents/methods/nanoaod.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/coffea/nanoevents/methods/nanoaod.py b/src/coffea/nanoevents/methods/nanoaod.py index 3cd2d4378..d48e4f46d 100644 --- a/src/coffea/nanoevents/methods/nanoaod.py +++ b/src/coffea/nanoevents/methods/nanoaod.py @@ -106,6 +106,9 @@ def hasFlags(self, *flags): @dask_property def parent(self): + """ + Accessor to the direct parent of this particle. + """ return self._events().GenPart._apply_global_index(self.genPartIdxMotherG) @parent.dask @@ -116,6 +119,9 @@ def parent(self, dask_array): @dask_property def distinctParent(self): + """ + Accessor to distinct (different PDG id) parent particle. 
+ """ return self._events().GenPart._apply_global_index(self.distinctParentIdxG) @distinctParent.dask From c116d9c9fea91b9d3dd31cad2811c7d4f1137942 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Mon, 14 Oct 2024 15:04:02 -0500 Subject: [PATCH 23/32] Add matched_* docstrings --- src/coffea/nanoevents/methods/nanoaod.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/coffea/nanoevents/methods/nanoaod.py b/src/coffea/nanoevents/methods/nanoaod.py index d48e4f46d..52a98dc85 100644 --- a/src/coffea/nanoevents/methods/nanoaod.py +++ b/src/coffea/nanoevents/methods/nanoaod.py @@ -261,6 +261,7 @@ def isTight(self): @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -269,6 +270,7 @@ def matched_gen(self, dask_array): @dask_property def matched_jet(self): + """The matched jet as determined by the NanoAOD branch jetIdx""" return self._events().Jet._apply_global_index(self.jetIdxG) @matched_jet.dask @@ -277,6 +279,7 @@ def matched_jet(self, dask_array): @dask_property def matched_photon(self): + """The associated photon as determined by the NanoAOD branch photonIdx""" return self._events().Photon._apply_global_index(self.photonIdxG) @matched_photon.dask @@ -302,6 +305,7 @@ class LowPtElectron(candidate.PtEtaPhiMCandidate, base.NanoCollection, base.Syst @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -310,6 +314,7 @@ def matched_gen(self, dask_array): @dask_property def matched_electron(self): + """The matched gen-level electron as determined by the NanoAOD branch electronIdx""" return self._events().Electron._apply_global_index(self.electronIdxG) @matched_electron.dask @@ -320,6 +325,7 @@ def matched_electron(self, dask_array): @dask_property def matched_photon(self): + """The associated photon as determined by the NanoAOD branch photonIdx""" return self._events().Photon._apply_global_index(self.photonIdxG) @matched_photon.dask @@ -343,6 +349,7 @@ class Muon(candidate.PtEtaPhiMCandidate, base.NanoCollection, base.Systematic): @dask_property def matched_fsrPhoton(self): + """The matched FSR photon with the lowest dR/ET2. 
Accessed via the NanoAOD branch fsrPhotonIdx""" return self._events().FsrPhoton._apply_global_index(self.fsrPhotonIdxG) @matched_fsrPhoton.dask @@ -353,6 +360,7 @@ def matched_fsrPhoton(self, dask_array): @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -361,6 +369,7 @@ def matched_gen(self, dask_array): @dask_property def matched_jet(self): + """The matched jet as determined by the NanoAOD branch jetIdx""" return self._events().Jet._apply_global_index(self.jetIdxG) @matched_jet.dask @@ -384,6 +393,7 @@ class Tau(candidate.PtEtaPhiMCandidate, base.NanoCollection, base.Systematic): @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -392,6 +402,7 @@ def matched_gen(self, dask_array): @dask_property def matched_jet(self): + """The matched jet as determined by the NanoAOD branch jetIdx""" return self._events().Jet._apply_global_index(self.jetIdxG) @matched_jet.dask @@ -456,6 +467,7 @@ def isTight(self): @dask_property def matched_electron(self): + """The matched electron as determined by the NanoAOD branch electronIdx""" return self._events().Electron._apply_global_index(self.electronIdxG) @matched_electron.dask @@ -466,6 +478,7 @@ def matched_electron(self, dask_array): @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -474,6 +487,7 @@ def matched_gen(self, dask_array): @dask_property def matched_jet(self): + """The matched jet as determined by the NanoAOD branch jetIdx""" return self._events().Jet._apply_global_index(self.jetIdxG) @matched_jet.dask @@ -499,6 +513,7 @@ class FsrPhoton(candidate.PtEtaPhiMCandidate, base.NanoCollection): @dask_property def matched_muon(self): + """The matched muon as determined by the NanoAOD branch muonIdx""" return self._events().Muon._apply_global_index(self.muonIdxG) @matched_muon.dask @@ -548,6 +563,11 @@ def isTightLeptonVeto(self): @dask_property def matched_electrons(self): + """ + The matched electrons as determined by the NanoAOD branch electronIdx. The resulting awkward + array has two entries per jet, where if there are fewer than 2 electrons matched to a jet, the + innermost dimensions are padded with None to be of size 2. + """ return self._events().Electron._apply_global_index(self.electronIdxG) @matched_electrons.dask @@ -558,6 +578,11 @@ def matched_electrons(self, dask_array): @dask_property def matched_muons(self): + """ + The matched muons as determined by the NanoAOD branch muonIdx. The resulting awkward + array has two entries per jet, where if there are fewer than 2 muons matched to a jet, the + innermost dimensions are padded with None to be of size 2. 
+ """ return self._events().Muon._apply_global_index(self.muonIdxG) @matched_muons.dask @@ -566,6 +591,9 @@ def matched_muons(self, dask_array): @dask_property def matched_gen(self): + """ + AK4 jets made with visible genparticles, matched to this jet via the NanoAOD branch genJetIdx + """ return self._events().GenJet._apply_global_index(self.genJetIdxG) @matched_gen.dask @@ -637,6 +665,7 @@ def subjets(self, dask_array): @dask_property def matched_gen(self): + """AK8 jets made of visible genparticles, matched via the NanoAOD branch genJetAK8Idx""" return self._events().GenJetAK8._apply_global_index(self.genJetAK8IdxG) @matched_gen.dask From 128e4a3b9c76982b7a9ea0a40236a64622e0e9bd Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Mon, 14 Oct 2024 15:47:59 -0500 Subject: [PATCH 24/32] Modify docstrings --- src/coffea/btag_tools/btagscalefactor.py | 17 +++++++++++++++++ src/coffea/nanoevents/methods/nanoaod.py | 1 + 2 files changed, 18 insertions(+) diff --git a/src/coffea/btag_tools/btagscalefactor.py b/src/coffea/btag_tools/btagscalefactor.py index 6c91588e4..72ac8b995 100644 --- a/src/coffea/btag_tools/btagscalefactor.py +++ b/src/coffea/btag_tools/btagscalefactor.py @@ -19,6 +19,23 @@ class BTagScaleFactor: Defaults to 'comb,comb,incl' keep_df : bool, optional If set true, keep the parsed dataframe as an attribute (.df) for later inspection + + Attributes + ---------- + LOOSE: int + Value is 0. This is the integer for the loose WP + MEDIUM: int + Value is 1. This is the integer for the medium WP + TIGHT: int + Value is 2. This is the integer for the tight WP + RESHAPE: int + Value is 3. This is the integer for the reshape WP + FLAV_B: int + Value is 0. This is the integer to represent the b flavor. Input choice to some methods. + FLAV_C: int + Value is 1. This is the integer to represent the c flavor. Input choice to some methods. + FLAV_UDSG: int + Value is 2. This is the integer to represent u, d, and s flavors, as well as gluons. Input choice to some methods. 
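+
+    A minimal usage sketch (illustrative; the csv file name is a placeholder)::
+
+        btag_sf = BTagScaleFactor("DeepCSV_102XSF_V1.csv", "medium")
+        jets = events.Jet
+        sf = btag_sf.eval("central", jets.hadronFlavour, abs(jets.eta), jets.pt)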
""" LOOSE, MEDIUM, TIGHT, RESHAPE = range(4) diff --git a/src/coffea/nanoevents/methods/nanoaod.py b/src/coffea/nanoevents/methods/nanoaod.py index 52a98dc85..539e940e4 100644 --- a/src/coffea/nanoevents/methods/nanoaod.py +++ b/src/coffea/nanoevents/methods/nanoaod.py @@ -705,6 +705,7 @@ class MissingET(vector.PolarTwoVector, base.NanoCollection, base.Systematic): @property def r(self): + """Distance from origin in XY plane""" return self["pt"] From 36af6429b793e96790980484021566f808d68955 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Mon, 14 Oct 2024 15:52:25 -0500 Subject: [PATCH 25/32] Remove DDT from default dependencies --- src/coffea/dataset_tools/__init__.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index b5e97e547..647fd336c 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,5 +1,4 @@ from coffea.dataset_tools.apply_processor import apply_to_dataset, apply_to_fileset -from coffea.dataset_tools.dataset_query import DataDiscoveryCLI, print_dataset_query from coffea.dataset_tools.manipulations import ( filter_files, get_failed_steps_for_dataset, @@ -10,18 +9,8 @@ slice_files, ) from coffea.dataset_tools.preprocess import preprocess -from coffea.dataset_tools.rucio_utils import ( - get_dataset_files_replicas, - get_rucio_client, - query_dataset, -) __all__ = [ - "get_rucio_client", - "get_dataset_files_replicas", - "query_dataset", - "DataDiscoveryCLI", - "print_dataset_query", "preprocess", "apply_to_dataset", "apply_to_fileset", From 02b19ec516320c44920256e7c359fb15b460ac15 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Mon, 14 Oct 2024 16:32:11 -0500 Subject: [PATCH 26/32] Add dataset_query and rucio_utils to API ref page --- docs/source/reference.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/reference.rst b/docs/source/reference.rst index e5b5a9a17..4a33e0ed6 100644 --- a/docs/source/reference.rst +++ b/docs/source/reference.rst @@ -28,3 +28,8 @@ and/or heavy dependencies. Below lists the packages available in the ``coffea`` coffea.nanoevents.methods.vector coffea.processor coffea.util + +.. automodule:: coffea.dataset_tools.dataset_query + :members: +.. automodule:: coffea.dataset_tools.rucio_utils + :members: From 26153d47352f4fe29a88037e1da9675b6b3a5e14 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Tue, 15 Oct 2024 16:03:41 -0500 Subject: [PATCH 27/32] Move dataset_tools DDT docs to their own page --- docs/source/dataset_tools.rst | 11 +++++++++++ docs/source/reference.rst | 15 +++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 docs/source/dataset_tools.rst diff --git a/docs/source/dataset_tools.rst b/docs/source/dataset_tools.rst new file mode 100644 index 000000000..206c0fb32 --- /dev/null +++ b/docs/source/dataset_tools.rst @@ -0,0 +1,11 @@ +Dataset Tools +************* + +This page contains documentation for parts of the ``coffea.dataset_tools`` +package that are not included in the ``coffea`` namespace. That is, they +must be explicitly imported. + +.. automodule:: coffea.dataset_tools.dataset_query + :members: +.. 
automodule:: coffea.dataset_tools.rucio_utils + :members: \ No newline at end of file diff --git a/docs/source/reference.rst b/docs/source/reference.rst index 4a33e0ed6..246b067ed 100644 --- a/docs/source/reference.rst +++ b/docs/source/reference.rst @@ -9,6 +9,11 @@ When executing a subset of the full coffea package is imported into the python environment. Some packages must be imported explicitly, so as to avoid importing unnecessary and/or heavy dependencies. Below lists the packages available in the ``coffea`` namespace. +Under that, we list documentation for some of the coffea packages that need to be +imported explicitly. + +In ``coffea`` Namespace +----------------------- .. autosummary:: :toctree: modules @@ -29,7 +34,9 @@ and/or heavy dependencies. Below lists the packages available in the ``coffea`` coffea.processor coffea.util -.. automodule:: coffea.dataset_tools.dataset_query - :members: -.. automodule:: coffea.dataset_tools.rucio_utils - :members: +Not in ``coffea`` Namespace +--------------------------- +Here is documentation for some of the packages that are not automatically +imported on a call to ``import coffea``. + +* :doc:`dataset_tools.rst` \ No newline at end of file From fe2f2b6d2ea22113f771e1f979e9e3a2008c91ee Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Tue, 15 Oct 2024 16:19:06 -0500 Subject: [PATCH 28/32] Fix linking to DDT docs --- docs/source/dataset_tools.rst | 2 ++ docs/source/reference.rst | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/dataset_tools.rst b/docs/source/dataset_tools.rst index 206c0fb32..523774c53 100644 --- a/docs/source/dataset_tools.rst +++ b/docs/source/dataset_tools.rst @@ -1,3 +1,5 @@ +.. _dataset-tools: + Dataset Tools ************* diff --git a/docs/source/reference.rst b/docs/source/reference.rst index 246b067ed..7fc49794e 100644 --- a/docs/source/reference.rst +++ b/docs/source/reference.rst @@ -12,7 +12,7 @@ and/or heavy dependencies. Below lists the packages available in the ``coffea`` Under that, we list documentation for some of the coffea packages that need to be imported explicitly. -In ``coffea`` Namespace +In coffea Namespace ----------------------- .. autosummary:: @@ -34,9 +34,9 @@ In ``coffea`` Namespace coffea.processor coffea.util -Not in ``coffea`` Namespace +Not in coffea Namespace --------------------------- Here is documentation for some of the packages that are not automatically imported on a call to ``import coffea``. -* :doc:`dataset_tools.rst` \ No newline at end of file +* :ref:`dataset-tools`. 
\ No newline at end of file From 17180d41e81276a0e8e7e1f72c36775e3038bd9c Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Tue, 15 Oct 2024 16:24:13 -0500 Subject: [PATCH 29/32] Pre-commit changes --- src/coffea/jetmet_tools/CorrectedMETFactory.py | 3 ++- src/coffea/jetmet_tools/FactorizedJetCorrector.py | 2 +- src/coffea/jetmet_tools/JECStack.py | 3 ++- src/coffea/jetmet_tools/JetCorrectionUncertainty.py | 2 +- src/coffea/lookup_tools/extractor.py | 6 +++--- src/coffea/ml_tools/triton_wrapper.py | 4 ++-- src/coffea/nanoevents/factory.py | 2 +- src/coffea/nanoevents/schemas/nanoaod.py | 8 ++++---- src/coffea/nanoevents/schemas/treemaker.py | 2 +- 9 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/coffea/jetmet_tools/CorrectedMETFactory.py b/src/coffea/jetmet_tools/CorrectedMETFactory.py index ded176cf9..8eb74eab0 100644 --- a/src/coffea/jetmet_tools/CorrectedMETFactory.py +++ b/src/coffea/jetmet_tools/CorrectedMETFactory.py @@ -42,6 +42,7 @@ class CorrectedMETFactory: and each of those must be mapped to the corresponding field name of the input arrays `in_MET` and `in_corrected_jets` for the `build` method. """ + def __init__(self, name_map): for name in [ "METpt", @@ -70,7 +71,7 @@ def build(self, in_MET, in_corrected_jets): An array of raw (uncorrected) MET values. in_corrected_jets: (Awkward array[jets]) An array of corrected jets, as produced by `CorrectedJetsFactory`. - + Returns ------- Awkward array of corrected MET values, with shape matching `in_MET`. diff --git a/src/coffea/jetmet_tools/FactorizedJetCorrector.py b/src/coffea/jetmet_tools/FactorizedJetCorrector.py index 1bea71179..22946d6df 100644 --- a/src/coffea/jetmet_tools/FactorizedJetCorrector.py +++ b/src/coffea/jetmet_tools/FactorizedJetCorrector.py @@ -67,7 +67,7 @@ class FactorizedJetCorrector: in which `jetCorrs` are the corrected jet scaled factors, with the same shape as the input parameters. In order to see what parameters must be passed to `getCorrection()`, one can do `fjc.signature`. - + You construct a FactorizedJetCorrector by passing in a dict of names and functions. Names must be formatted as '____'. You can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from diff --git a/src/coffea/jetmet_tools/JECStack.py b/src/coffea/jetmet_tools/JECStack.py index f6c1dce89..9e98ce602 100644 --- a/src/coffea/jetmet_tools/JECStack.py +++ b/src/coffea/jetmet_tools/JECStack.py @@ -11,7 +11,7 @@ class JECStack: """ Mostly used as an input to `CorrectedJetsFactory`. Hosts and organizes multiple corrections under one object. - + jec, junc, etc. can be explicitly set by passing in the appropriate corrector class (eg: FactorizedJetCorrector). If they are not set, correctors will be created, using the info in `corrections` as input. @@ -35,6 +35,7 @@ class JECStack: If provided, overrides the jersf that would be created from `corrections` in the stack. """ + def __init__(self, corrections, jec=None, junc=None, jer=None, jersf=None): self._jec = None self._junc = None diff --git a/src/coffea/jetmet_tools/JetCorrectionUncertainty.py b/src/coffea/jetmet_tools/JetCorrectionUncertainty.py index 4532873d9..4bc8868cd 100644 --- a/src/coffea/jetmet_tools/JetCorrectionUncertainty.py +++ b/src/coffea/jetmet_tools/JetCorrectionUncertainty.py @@ -76,7 +76,7 @@ class JetCorrectionUncertainty: You construct a JetCorrectionUncertainty by passing in a dict of names and functions. Names must be formatted as '____'. 
You can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from - some input files. + some input files. """ def __init__(self, **kwargs): diff --git a/src/coffea/lookup_tools/extractor.py b/src/coffea/lookup_tools/extractor.py index 3ba3c8aaa..7b217601f 100644 --- a/src/coffea/lookup_tools/extractor.py +++ b/src/coffea/lookup_tools/extractor.py @@ -128,7 +128,7 @@ def add_weight_sets(self, weightsdescs): def import_file(self, thefile): """ Cache the whole contents of a file for later processing - + Parameters ---------- thefile: str @@ -152,7 +152,7 @@ def import_file(self, thefile): def extract_from_file(self, thefile, name): """ Import a file and then extract a lookup set - + Parameters ---------- thefile: str @@ -199,7 +199,7 @@ def finalize(self, reduce_list=None): def make_evaluator(self): """ Produce an evaluator based on the finalized extractor - + Returns ------- An evaluator based on the names, weight types, and weights of the finalized extractor. diff --git a/src/coffea/ml_tools/triton_wrapper.py b/src/coffea/ml_tools/triton_wrapper.py index 626bb0751..c92180b1c 100644 --- a/src/coffea/ml_tools/triton_wrapper.py +++ b/src/coffea/ml_tools/triton_wrapper.py @@ -45,7 +45,7 @@ class triton_wrapper(nonserializable_attribute, numpy_call_wrapper): ---------- model_url: str A string in the format of: `triton+://
//` - + client_args: dict[str,str], optional Optional keyword arguments to pass to the underlying `InferenceServerClient` objects. @@ -187,7 +187,7 @@ def validate_numpy_input( required `tritonclient.InferRequestedOutput` objects. This is identical to the first argument the user passes in when calling the `triton_wrapper` instance. - + input_dict: dict[str,np.array] Dictionary with the model's input-names as the key and the appropriate numpy array as the dictionary value. This dictionary is diff --git a/src/coffea/nanoevents/factory.py b/src/coffea/nanoevents/factory.py index 58dbd6cb2..9ef58a409 100644 --- a/src/coffea/nanoevents/factory.py +++ b/src/coffea/nanoevents/factory.py @@ -596,7 +596,7 @@ def from_preloaded( Arbitrary metadata to add to the `base.NanoEvents` object access_log : list, optional Pass a list instance to record which branches were lazily accessed by this instance - + Returns ------- out: NanoEventsFactory diff --git a/src/coffea/nanoevents/schemas/nanoaod.py b/src/coffea/nanoevents/schemas/nanoaod.py index 1fc15fd17..8e731731b 100644 --- a/src/coffea/nanoevents/schemas/nanoaod.py +++ b/src/coffea/nanoevents/schemas/nanoaod.py @@ -43,8 +43,8 @@ class NanoAODSchema(BaseSchema): """ __dask_capable__ = True - warn_missing_crossrefs = True #If True, issues a warning when a missing global index cross-ref target is encountered - error_missing_event_ids = True #If True, raises an exception when 'run', 'event', or 'luminosityBlock' fields are missing + warn_missing_crossrefs = True # If True, issues a warning when a missing global index cross-ref target is encountered + error_missing_event_ids = True # If True, raises an exception when 'run', 'event', or 'luminosityBlock' fields are missing event_ids = ["run", "luminosityBlock", "event"] """List of NanoAOD event IDs @@ -200,7 +200,7 @@ def v7(cls, base_form): @classmethod def v6(cls, base_form): """Build the NanoEvents assuming NanoAODv6 - + Returns ------- out: NanoAODSchema @@ -211,7 +211,7 @@ def v6(cls, base_form): @classmethod def v5(cls, base_form): """Build the NanoEvents assuming NanoAODv5 - + Returns ------- out: NanoAODSchema diff --git a/src/coffea/nanoevents/schemas/treemaker.py b/src/coffea/nanoevents/schemas/treemaker.py index c420307f5..754ad186d 100644 --- a/src/coffea/nanoevents/schemas/treemaker.py +++ b/src/coffea/nanoevents/schemas/treemaker.py @@ -186,7 +186,7 @@ def uproot_writeable(cls, events): ---------- events: TreeMakerSchema events The TreeMakerSchema events to be turned into something uproot-writeable - + Returns ------- out: dict From 110ddecc7eb684388405800f7069656f183870b9 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Tue, 15 Oct 2024 16:33:53 -0500 Subject: [PATCH 30/32] Change _FLAV_* to public attributes --- src/coffea/btag_tools/btagscalefactor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coffea/btag_tools/btagscalefactor.py b/src/coffea/btag_tools/btagscalefactor.py index b2b5caa31..72ac8b995 100644 --- a/src/coffea/btag_tools/btagscalefactor.py +++ b/src/coffea/btag_tools/btagscalefactor.py @@ -108,13 +108,13 @@ def __init__(self, filename, workingpoint, methods="comb,comb,incl", keep_df=Fal f"The BTag csv file {filename} is in the new UL format which is not supported by coffea.btag_tools.\n" "Instead one can use correctionlib for UL scale factors." 
) - cut = (df["jetFlavor"] == self._FLAV_B) & (df["measurementType"] == methods[0]) + cut = (df["jetFlavor"] == self.FLAV_B) & (df["measurementType"] == methods[0]) if len(methods) > 1: - cut |= (df["jetFlavor"] == self._FLAV_C) & ( + cut |= (df["jetFlavor"] == self.FLAV_C) & ( df["measurementType"] == methods[1] ) if len(methods) > 2: - cut |= (df["jetFlavor"] == self._FLAV_UDSG) & ( + cut |= (df["jetFlavor"] == self.FLAV_UDSG) & ( df["measurementType"] == methods[2] ) cut &= df["OperatingPoint"] == workingpoint From 83377fdc6cbe6339b44f71dceb69737ac1e723b3 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Tue, 15 Oct 2024 16:35:15 -0500 Subject: [PATCH 31/32] Stash changes before merging --- docs/source/dataset_tools.rst | 2 +- docs/source/reference.rst | 2 +- src/coffea/jetmet_tools/CorrectedJetsFactory.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/dataset_tools.rst b/docs/source/dataset_tools.rst index 523774c53..15ef21295 100644 --- a/docs/source/dataset_tools.rst +++ b/docs/source/dataset_tools.rst @@ -10,4 +10,4 @@ must be explicitly imported. .. automodule:: coffea.dataset_tools.dataset_query :members: .. automodule:: coffea.dataset_tools.rucio_utils - :members: \ No newline at end of file + :members: diff --git a/docs/source/reference.rst b/docs/source/reference.rst index 7fc49794e..155f45c64 100644 --- a/docs/source/reference.rst +++ b/docs/source/reference.rst @@ -39,4 +39,4 @@ Not in coffea Namespace Here is documentation for some of the packages that are not automatically imported on a call to ``import coffea``. -* :ref:`dataset-tools`. \ No newline at end of file +* :ref:`dataset-tools`. diff --git a/src/coffea/jetmet_tools/CorrectedJetsFactory.py b/src/coffea/jetmet_tools/CorrectedJetsFactory.py index e6621a4f3..5e118f536 100644 --- a/src/coffea/jetmet_tools/CorrectedJetsFactory.py +++ b/src/coffea/jetmet_tools/CorrectedJetsFactory.py @@ -149,6 +149,7 @@ class CorrectedJetsFactory: Contains the corrections that will be applied to the input jet array when calling `build`. """ + def __init__(self, name_map, jec_stack): # from PhysicsTools/PatUtils/interface/SmearedJetProducerT.h#L283 self.forceStochastic = False @@ -215,7 +216,7 @@ def build(self, injets): ---------- injets: (Awkward array[jets]) An array of uncorrected jets, to which we want to apply corrections. - + Returns ------- Awkward array of jets, representing the corrected jets, with shape matching From e09779a2ec80af7d8e311474c8f58777e513d484 Mon Sep 17 00:00:00 2001 From: Ryan Simeon Date: Tue, 15 Oct 2024 16:39:41 -0500 Subject: [PATCH 32/32] Pre-commit changes --- src/coffea/dataset_tools/rucio_utils.py | 4 ++-- src/coffea/jetmet_tools/JetResolutionScaleFactor.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index dce5b4abf..ea6e8eb0d 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -308,8 +308,8 @@ def query_dataset( tree: bool, default False If True, return the results splitting the dataset name in parts datatype: str, default "container" - Options are "container", "datset". rucio terminology. "Container"==CMS dataset. "Dataset" == CMS block. - scope: str, defualt "cms" + Options are "container", "dataset". rucio terminology. "Container"==CMS dataset. "Dataset" == CMS block. 
+ scope: str, default "cms" Rucio instance Returns diff --git a/src/coffea/jetmet_tools/JetResolutionScaleFactor.py b/src/coffea/jetmet_tools/JetResolutionScaleFactor.py index db1d6a555..73311991c 100644 --- a/src/coffea/jetmet_tools/JetResolutionScaleFactor.py +++ b/src/coffea/jetmet_tools/JetResolutionScaleFactor.py @@ -47,7 +47,7 @@ class JetResolutionScaleFactor: jersf = JetResolutionScaleFactor(name1=corrL1,...) jetResSF = jersf.getScaleFactor(JetParameter1=jet.parameter1,...) - in which `jetResSF` are the scale factors, with the same shape as the input paramters. + in which `jetResSF` are the scale factors, with the same shape as the input parameters. In order to see which parameters must be passed to `getScaleFactor`, one can do `jersf.signature`.