From 0e24cb7db1c0fb3baef524175d2a1f90f8a78ec2 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 12 Jun 2023 18:09:01 +0200 Subject: [PATCH 01/80] some first movement on dask-task-graph style runner/executor --- src/coffea/processor/executor.py | 129 +++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/src/coffea/processor/executor.py b/src/coffea/processor/executor.py index 618b1c741..3bf6a735b 100644 --- a/src/coffea/processor/executor.py +++ b/src/coffea/processor/executor.py @@ -362,6 +362,107 @@ def copy(self, **kwargs): return type(self)(**tmp) +@dataclass +class DaskExecutorBase(ExecutorBase): + """This base class for dak-based processors + synthesizes all analysis inputs into one + task graph that's then executed by derived + classes. + """ + + def prepare_dataset_graph(self, items, function, accumulator): + accumulator = None + for dset, info in items.items(): + if isinstance(items, dict) and "object_path" not in list(items.values()): + raise ValueError( + "items should be normalized to uproot spec in prepare_dataset_graph" + ) + + metadata = info["metadata"].copy() + metadata["dataset"] = dset + + temp = function(info["files"], metadata=metadata) + if accumulator is None: + accumulator = temp + else: + accumulator = accumulate((accumulator, temp)) + + return accumulator + + +@dataclass +class DaskSyncExecutor(DaskExecutorBase): + """Execute dask task graph in one thread + + Parameters + ---------- + items : list + List of input arguments + function : callable + A function to be called on each input, which returns an accumulator instance + accumulator : Accumulatable + An accumulator to collect the output of the function + status : bool + If true (default), enable progress bar + unit : str + Label of progress bar unit + desc : str + Label of progress bar description + compression : int, optional + Ignored for iterative executor + """ + + def __call__( + self, + items: Iterable, + function: Callable, + accumulator: Accumulatable, + ): + import dask + + to_compute = self.prepare_dataset_graph(items, function, None) + computed = dask.compute(to_compute, scheduler="sync") + return computed[0] if len(computed) == 1 else computed + + +@dataclass +class DaskProcessesExecutor(DaskExecutorBase): + """Execute dask task graph in a multiprocessing pool + + Parameters + ---------- + items : list + List of input arguments + function : callable + A function to be called on each input, which returns an accumulator instance + accumulator : Accumulatable + An accumulator to collect the output of the function + status : bool + If true (default), enable progress bar + unit : str + Label of progress bar unit + desc : str + Label of progress bar description + compression : int, optional + Ignored for iterative executor + """ + + workers = 1 + + def __call__( + self, + items: Iterable, + function: Callable, + accumulator: Accumulatable, + ): + import dask + + to_compute = self.prepare_dataset_graph(items, function, None) + with dask.config.set(num_workers=self.workers): + computed = dask.compute(to_compute, scheduler="processes") + return computed[0] if len(computed) == 1 else computed + + def _watcher( FH: _FuturesHolder, executor: ExecutorBase, @@ -1764,6 +1865,8 @@ def __call__( processor_instance : ProcessorABC An instance of a class deriving from ProcessorABC """ + if isinstance(self.executor, DaskExecutorBase): + return self.run_dask(fileset, processor_instance, treename) wrapped_out = self.run(fileset, processor_instance, treename) if self.use_dataframes: @@ -1819,6 
+1922,32 @@ def preprocess( return self._chunk_generator(fileset, treename) + def run_dask( + self, + fileset: Union[Dict, str, List[WorkItem], Generator], + processor_instance: ProcessorABC, + treename: str = None, + ) -> Accumulatable: + """Run the processor_instance on a given fileset + + Parameters + ---------- + fileset : dict | str | List[WorkItem] | Generator + - A dictionary ``{dataset: [file, file], }`` + Optionally, if some files' tree name differ, the dictionary can be specified: + ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` + - A single file name + - File chunks for self.preprocess() + - Chunk generator + treename : str, optional + name of tree inside each root file, can be ``None``; + treename can also be defined in fileset, which will override the passed treename + Not needed if processing premade chunks + processor_instance : ProcessorABC + An instance of a class deriving from ProcessorABC + """ + pass + def run( self, fileset: Union[Dict, str, List[WorkItem], Generator], From dcb6f742f2071331c1e2c3de00870de56f5e238b Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 27 Jun 2023 11:35:55 -0500 Subject: [PATCH 02/80] changes for preprocessing prototype --- src/coffea/nanoevents/factory.py | 11 ++++++++++- src/coffea/processor/accumulator.py | 6 +++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/coffea/nanoevents/factory.py b/src/coffea/nanoevents/factory.py index 38e06d601..c12485249 100644 --- a/src/coffea/nanoevents/factory.py +++ b/src/coffea/nanoevents/factory.py @@ -232,7 +232,7 @@ def from_root( treepath="/Events", entry_start=None, entry_stop=None, - chunks_per_file=1, + chunks_per_file=None, runtime_cache=None, persistent_cache=None, schemaclass=NanoAODSchema, @@ -327,6 +327,15 @@ def from_root( filter_branch=_remove_not_interpretable, steps_per_file=chunks_per_file, ) + elif chunks_per_file is None: + opener = partial( + uproot.dask, + file, + full_paths=True, + open_files=False, + ak_add_doc=True, + filter_branch=_remove_not_interpretable, + ) else: opener = partial( uproot.dask, diff --git a/src/coffea/processor/accumulator.py b/src/coffea/processor/accumulator.py index 8ad12dab1..282214a6c 100644 --- a/src/coffea/processor/accumulator.py +++ b/src/coffea/processor/accumulator.py @@ -57,14 +57,14 @@ def add(a: Accumulatable, b: Accumulatable) -> Accumulatable: out[key] = ( copy.deepcopy(a[key]) if not isinstance(a[key], DaskMethodsMixin) - else copy.copy(a[key]) + else a[key] ) for key in b: if key not in lhs: out[key] = ( copy.deepcopy(b[key]) if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) + else b[key] ) return out raise ValueError( @@ -93,7 +93,7 @@ def iadd(a: Accumulatable, b: Accumulatable) -> Accumulatable: a[key] = ( copy.deepcopy(b[key]) if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) + else b[key] ) return a raise ValueError( From a03beb9be03ce10eff9b42649c1ae581ac88a829 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 22 Aug 2023 17:21:43 -0500 Subject: [PATCH 03/80] new dask-based dataset pre-processor --- src/coffea/dataset_tools/__init__.py | 3 + src/coffea/dataset_tools/preprocess.py | 159 +++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 src/coffea/dataset_tools/__init__.py create mode 100644 src/coffea/dataset_tools/preprocess.py diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py new file mode 100644 index 000000000..c02a96e48 --- /dev/null +++ b/src/coffea/dataset_tools/__init__.py @@ -0,0 
+1,3 @@ +from coffea.dataset_tools.preprocess import preprocess + +__all__ = ["preprocess"] diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py new file mode 100644 index 000000000..7fdc46857 --- /dev/null +++ b/src/coffea/dataset_tools/preprocess.py @@ -0,0 +1,159 @@ +import math + +import awkward +import dask +import dask_awkward +import numpy +import uproot + + +def _get_steps( + normed_files, + maybe_step_size=None, + align_clusters=False, + recalculate_seen_steps=False, +): + nf_backend = awkward.backend(normed_files) + lz_or_nf = awkward.typetracer.length_zero_if_typetracer(normed_files) + + array = [] if nf_backend != "typetracer" else lz_or_nf + for arg in lz_or_nf: + try: + the_file = uproot.open({arg.file: None}) + except FileNotFoundError: + array.append(None) + continue + + tree = the_file[arg.object_path] + num_entries = tree.num_entries + + target_step_size = num_entries if maybe_step_size is None else maybe_step_size + + file_uuid = str(the_file.file.uuid) + + out_uuid = arg.uuid + out_steps = arg.steps + + if out_uuid != file_uuid or recalculate_seen_steps: + if align_clusters: + clusters = tree.common_entry_offsets() + out = [0] + for c in clusters: + if c >= out[-1] + target_step_size: + out.append(c) + if clusters[-1] != out[-1]: + out.append(clusters[-1]) + out = numpy.array(out, dtype="int64") + out = numpy.stack((out[:-1], out[1:]), axis=1) + else: + n_steps = num_entries // target_step_size + out = numpy.array( + [ + [ + i * target_step_size, + min((i + 1) * target_step_size, num_entries), + ] + for i in range(n_steps) + ], + dtype="int64", + ) + + out_uuid = file_uuid + out_steps = out.tolist() + + array.append( + { + "file": arg.file, + "object_path": arg.object_path, + "steps": out_steps, + "uuid": out_uuid, + } + ) + + if len(array) == 0: + array = awkward.Array( + [ + {"file": "junk", "object_path": "junk", "steps": [[]], "uuid": "junk"}, + None, + ] + ) + array = awkward.Array(array.layout.form.length_zero_array(highlevel=False)) + else: + array = awkward.Array(array) + + if nf_backend == "typetracer": + array = awkward.Array( + array.layout.to_typetracer(forget_length=True), + ) + + return array + + +def preprocess( + fileset, + maybe_step_size=None, + align_clusters=False, + recalculate_seen_steps=False, + files_per_batch=1, +): + out_updated = fileset.copy() + out_available = fileset.copy() + all_ak_norm_files = {} + files_to_preprocess = {} + for name, info in fileset.items(): + norm_files = uproot._util.regularize_files(info["files"], steps_allowed=True) + for ifile in range(len(norm_files)): + the_file_info = norm_files[ifile] + maybe_finfo = info["files"].get(the_file_info[0], None) + maybe_uuid = ( + None + if not isinstance(maybe_finfo, dict) + else maybe_finfo.get("uuid", None) + ) + norm_files[ifile] += (3 - len(norm_files[ifile])) * (None,) + (maybe_uuid,) + fields = ["file", "object_path", "steps", "uuid"] + ak_norm_files = awkward.from_iter(norm_files) + ak_norm_files = awkward.Array( + {field: ak_norm_files[str(ifield)] for ifield, field in enumerate(fields)} + ) + all_ak_norm_files[name] = ak_norm_files + + dak_norm_files = dask_awkward.from_awkward( + ak_norm_files, math.ceil(len(ak_norm_files) / files_per_batch) + ) + + files_to_preprocess[name] = dask_awkward.map_partitions( + _get_steps, + dak_norm_files, + maybe_step_size=maybe_step_size, + align_clusters=align_clusters, + recalculate_seen_steps=recalculate_seen_steps, + ) + + all_processed_files = dask.compute(files_to_preprocess)[0] + + for name, 
processed_files in all_processed_files.items(): + files_available = { + item["file"]: { + "object_path": item["object_path"], + "steps": item["steps"], + "uuid": item["uuid"], + } + for item in awkward.drop_none(processed_files).to_list() + } + + files_out = {} + for proc_item, orig_item in zip( + processed_files.to_list(), all_ak_norm_files[name].to_list() + ): + item = orig_item if proc_item is None else proc_item + files_out[item["file"]] = { + "object_path": item["object_path"], + "steps": item["steps"], + "uuid": item["uuid"], + } + + out_updated[name]["files"] = files_out + out_available[name]["files"] = files_available + + return out_available, out_updated From 95c5fc80a79195e354f80a072f3ebebd631aa677 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 23 Aug 2023 11:41:29 +0200 Subject: [PATCH 04/80] Added the rucio utils functions from pocketcoffea --- pyproject.toml | 1 + src/coffea/dataset_tools/rucio_utils.py | 266 ++++++++++++++++++++++++ 2 files changed, 267 insertions(+) create mode 100644 src/coffea/dataset_tools/rucio_utils.py diff --git a/pyproject.toml b/pyproject.toml index 454ed3319..1ddcd9fe6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dependencies = [ "pandas", "hist>=2", "cachetools", + "rucio>=32.2.0" ] dynamic = ["version"] diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py new file mode 100644 index 000000000..545884962 --- /dev/null +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -0,0 +1,266 @@ +import os +import getpass +import re +import json +from rucio.client import Client +from collections import defaultdict +import subprocess + +# Rucio needs the default configuration --> taken from CMS cvmfs defaults +os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" + + +def get_proxy_path() -> str: + """ + Checks if the VOMS proxy exists and if it is valid + for at least 1 hour. + If it exists, returns the path of it""" + try: + subprocess.run("voms-proxy-info -exists -hours 1", shell=True, check=True) + except subprocess.CalledProcessError: + raise Exception( + "VOMS proxy expirend or non-existing: please run `voms-proxy-init -voms cms -rfc --valid 168:0`" + ) + + # Now get the path of the certificate + proxy = subprocess.check_output( + "voms-proxy-info -path", shell=True, text=True + ).strip() + return proxy + + +def get_rucio_client(proxy=None) -> Client: + """ + Open a client to the CMS rucio server using x509 proxy. + + Parameters + ---------- + proxy : str, optional + Use the provided proxy file if given, if not use `voms-proxy-info` to get the current active one. + + Returns + ------- + nativeClient: rucio.Client + Rucio client + """ + try: + if not proxy: + proxy = get_proxy_path() + + nativeClient = Client( + rucio_host="https://cms-rucio.cern.ch", + auth_host="https://cms-rucio-auth.cern.ch", + account=getpass.getuser(), + creds={"client_cert": proxy, "client_key": proxy}, + auth_type="x509", + ) + return nativeClient + + except Exception as e: + print("Wrong Rucio configuration, impossible to create client") + raise e + + +def get_xrootd_sites_map(): + """ + The mapping beetween RSE (sites) and the xrootd prefix rules is read + from `/cvmfs/cms/cern.ch/SITECONF/*site*/storage.json`. + + This function returns the list of xrootd prefix rules for each site. + """ + sites_xrootd_access = defaultdict(dict) + # TODO Do not rely on local sites_map cache. Just reload it? 
+ if not os.path.exists(".sites_map.json"): + print("Loading SITECONF info") + sites = [ + (s, "/cvmfs/cms.cern.ch/SITECONF/" + s + "/storage.json") + for s in os.listdir("/cvmfs/cms.cern.ch/SITECONF/") + if s.startswith("T") + ] + for site_name, conf in sites: + if not os.path.exists(conf): + continue + try: + data = json.load(open(conf)) + except: + continue + for site in data: + if site["type"] != "DISK": + continue + if site["rse"] == None: + continue + for proc in site["protocols"]: + if proc["protocol"] == "XRootD": + if proc["access"] not in ["global-ro", "global-rw"]: + continue + if "prefix" not in proc: + if "rules" in proc: + for rule in proc["rules"]: + sites_xrootd_access[site["rse"]][ + rule["lfn"] + ] = rule["pfn"] + else: + sites_xrootd_access[site["rse"]] = proc["prefix"] + json.dump(sites_xrootd_access, open(".sites_map.json", "w")) + + return json.load(open(".sites_map.json")) + + +def _get_pfn_for_site(path, rules): + """ + Utility function that converts the file path to a valid pfn matching + the file path with the site rules (regexes). + """ + if isinstance(rules, dict): + for rule, pfn in rules.items(): + if m := re.match(rule, path): + grs = m.groups() + for i in range(len(grs)): + pfn = pfn.replace(f"${i+1}", grs[i]) + return pfn + else: + return rules + "/" + path + + +def get_dataset_files_replicas( + dataset, whitelist_sites=None, blacklist_sites=None, regex_sites=None, mode="full" +): + """ + This function queries the Rucio server to get information about the location + of all the replicas of the files in a CMS dataset. + + The sites can be filtered in 3 different ways: + - `whilist_sites`: list of sites to select from. If the file is not found there, raise an Expection. + - `blacklist_sites`: list of sites to avoid. If the file has no left site, raise an Exception + - `regex_sites`: regex expression to restrict the list of sites. + + The fileset returned by the function is controlled by the `mode` parameter: + - "full": returns the full set of replicas and sites (passing the filtering parameters) + - "first": returns the first replica found for each file + - "best": to be implemented (ServiceX..) + - "roundrobin": try to distribute the replicas over different sites + + Parameters + ---------- + + dataset: str + whilelist_sites: list + blacklist_sites: list + regex_sites: list + mode: str, default "full" + + Returns + ------- + files: list + depending on the `mode` option. + - If `mode=="full"`, returns the complete list of replicas for each file in the dataset + - If `mode=="first"`, returns only the first replica for each file. + + sites: list + depending on the `mode` option. + - If `mode=="full"`, returns the list of sites where the file replica is available for each file in the dataset + - If `mode=="first"`, returns a list of sites for the first replica of each file. 
+ + """ + sites_xrootd_prefix = get_xrootd_sites_map() + client = get_rucio_client() + outsites = [] + outfiles = [] + for filedata in client.list_replicas([{"scope": "cms", "name": dataset}]): + outfile = [] + outsite = [] + rses = filedata["rses"] + found = False + if whitelist_sites: + for site in whitelist_sites: + if site in rses: + # Check actual availability + meta = filedata["pfns"][rses[site][0]] + if ( + meta["type"] != "DISK" + or meta["volatile"] == True + or filedata["states"][site] != "AVAILABLE" + or site not in sites_xrootd_prefix + ): + continue + outfile.append( + _get_pfn_for_site(filedata["name"], sites_xrootd_prefix[site]) + ) + outsite.append(site) + found = True + + if not found: + raise Exception( + f"No SITE available in the whitelist for file {filedata['name']}" + ) + else: + possible_sites = list(rses.keys()) + if blacklist_sites: + possible_sites = list( + filter(lambda key: key not in blacklist_sites, possible_sites) + ) + + if len(possible_sites) == 0: + raise Exception(f"No SITE available for file {filedata['name']}") + + # now check for regex + for site in possible_sites: + if regex_sites: + if re.search(regex_sites, site): + # Check actual availability + meta = filedata["pfns"][rses[site][0]] + if ( + meta["type"] != "DISK" + or meta["volatile"] == True + or filedata["states"][site] != "AVAILABLE" + or site not in sites_xrootd_prefix + ): + continue + outfile.append( + _get_pfn_for_site( + filedata["name"], sites_xrootd_prefix[site] + ) + ) + outsite.append(site) + found = True + else: + # Just take the first one + # Check actual availability + meta = filedata["pfns"][rses[site][0]] + if ( + meta["type"] != "DISK" + or meta["volatile"] == True + or filedata["states"][site] != "AVAILABLE" + or site not in sites_xrootd_prefix + ): + continue + outfile.append( + _get_pfn_for_site(filedata["name"], sites_xrootd_prefix[site]) + ) + outsite.append(site) + found = True + + if not found: + raise Exception(f"No SITE available for file {filedata['name']}") + else: + if mode == "full": + outfiles.append(outfile) + outsites.append(outsite) + elif mode == "first": + outfiles.append(outfile[0]) + outsites.append(outsite[0]) + else: + raise NotImplemented(f"Mode {mode} not yet implemented!") + + # Computing replicas by site: + totfiles = len(outfiles) + sites_counts = defaultdict(float) + if mode == "full": + for sites_by_file in outsites: + for site in sites_by_file: + sites_counts[site] += 1 / totfiles + elif mode == "first": + for site_by_file in outsites: + sites_counts[site] += 1 / totfiles + + return outfiles, outsites, sites_counts From 1f870c2dfdb013d0c9cd6c4118a23bb51ac71dfd Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 23 Aug 2023 18:14:43 +0200 Subject: [PATCH 05/80] Added dataset querying function --- src/coffea/dataset_tools/rucio_utils.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 545884962..36954ea6e 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -123,7 +123,12 @@ def _get_pfn_for_site(path, rules): def get_dataset_files_replicas( - dataset, whitelist_sites=None, blacklist_sites=None, regex_sites=None, mode="full" + dataset, + whitelist_sites=None, + blacklist_sites=None, + regex_sites=None, + mode="full", + client=None, ): """ This function queries the Rucio server to get information about the location @@ -148,6 +153,7 @@ def get_dataset_files_replicas( 
blacklist_sites: list regex_sites: list mode: str, default "full" + client: rucio Client, optional Returns ------- @@ -163,7 +169,7 @@ def get_dataset_files_replicas( """ sites_xrootd_prefix = get_xrootd_sites_map() - client = get_rucio_client() + client = client if client else get_rucio_client() outsites = [] outfiles = [] for filedata in client.list_replicas([{"scope": "cms", "name": dataset}]): @@ -264,3 +270,9 @@ def get_dataset_files_replicas( sites_counts[site] += 1 / totfiles return outfiles, outsites, sites_counts + + +def query_dataset(query, client=None): + client = client if client else get_rucio_client() + return list(client.list_dids(scope="cms", filters={"name": query, "type":"container"},long=False)) + From c5d2d579287537f3017c3bd3dd45a5b35fc5e2cd Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 10:09:36 +0200 Subject: [PATCH 06/80] Working on interface for datasets query --- src/coffea/dataset_tools/dataset_query.py | 109 ++++++++++++++++++++++ src/coffea/dataset_tools/rucio_utils.py | 17 +++- 2 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 src/coffea/dataset_tools/dataset_query.py diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py new file mode 100644 index 000000000..b2e782095 --- /dev/null +++ b/src/coffea/dataset_tools/dataset_query.py @@ -0,0 +1,109 @@ +from cmd2 import Cmd +import cmd2 +from rich import print +from rich.pretty import pprint +from rich.console import Console +from rich.table import Table +from rich.tree import Tree +import rucio_utils + + +def print_dataset_query(query, dataset_list, selected, console): + table = Table(title=f"Query: [bold red]{query}") + table.add_column("name", justify="left", style="cyan", no_wrap=True) + table.add_column("tag", style="magenta", no_wrap=True) + table.add_column("selected", justify="center") + table.row_styles = ["dim", "none"] + j = 1 + for name, conds in dataset_list.items(): + ic = 0 + ncond = len(conds) + for c, tiers in conds.items(): + dataset = f"/{name}/{c}/{tiers[0]}" + sel = dataset in selected + if ic ==0: + table.add_row(name, f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + f"[green]Y" if sel else f"[red]N", + end_section = ic==ncond-1) + else: + table.add_row("", f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + f"[green]Y" if sel else f"[red]N", + end_section = ic==ncond-1) + ic+=1 + j+=1 + + console.print(table) + + +class MyCmdApp(cmd2.Cmd): + + prompt = "\033[1;34m" + "cms-datasets" + "\033[0m > " + + def __init__(self): + shortcuts = cmd2.DEFAULT_SHORTCUTS + shortcuts.update({ 'L': 'login', 'Q': 'query', 'R': 'replicas', + 'S': 'select', + 'lr': 'list_results'}) + self.console = Console() + self.rucio_client = None + self.selected_datasets = [ ] + self.last_query = "" + self.last_query_results = None + super().__init__(shortcuts=shortcuts) + + def do_login(self, args): + '''Login to the rucio client. Optionally a specific proxy file can be passed to the command. 
+ If the proxy file is not specified, `voms-proxy-info` is used''' + if args: + self.rucio_client = rucio_utils.get_rucio_client(args[0]) + else: + self.rucio_client = rucio_utils.get_rucio_client() + + print(self.rucio_client) + #pprint(self.rucio_client.whoami()) + + def do_whoami(self, args): + # Your code here + if not self.rucio_client: + print("First [bold]login (L)[/] to the rucio server") + return + print(self.rucio_client.whoami()) + + def do_query(self, args): + # Your code here + with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): + out = rucio_utils.query_dataset(args.arg_list[0], + client=self.rucio_client, + tree=True) + # Now let's print the results as a tree + print_dataset_query(args, out, + self.selected_datasets, + self.console) + self.last_query = args + self.last_query_results = out + + def do_list_results(self, args): + if self.last_query_results: + print_dataset_query(self.last_query, self.last_query_results, + self.selected_datasets, self.console) + else: + print("First [bold red]query (Q)[/] for a dataset") + + def do_select(self, args): + if not self.last_query_results: + print("First [bold red]query (Q)[/] for a dataset") + return + + for s in map(int, args.arg_list): + print(s) + + + + + def do_replicas(self, args): + # Your code here + self.poutput("Replicas command executed") + +if __name__ == "__main__": + app = MyCmdApp() + app.cmdloop() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 36954ea6e..70f301a5b 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -272,7 +272,18 @@ def get_dataset_files_replicas( return outfiles, outsites, sites_counts -def query_dataset(query, client=None): +def query_dataset(query, client=None, tree=False): client = client if client else get_rucio_client() - return list(client.list_dids(scope="cms", filters={"name": query, "type":"container"},long=False)) - + out = list(client.list_dids( + scope="cms", filters={"name": query, "type":"container"}, + long=False)) + if tree: + outdict = {} + for dataset in out: + split = dataset[1:].split("/") + if split[0] not in outdict: + outdict[split[0]] = defaultdict(list) + outdict[split[0]][split[1]].append(split[2]) + return outdict + else: + return out From d4d371c094a0e5f980f6366d6c237405fd9936a5 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 10:34:17 +0200 Subject: [PATCH 07/80] Querying and listing implemented: selected of results --- pyproject.toml | 3 +- src/coffea/dataset_tools/dataset_query.py | 45 +++++++++++++++-------- src/coffea/dataset_tools/rucio_utils.py | 2 +- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1ddcd9fe6..82fa7e7e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,8 @@ dependencies = [ "pandas", "hist>=2", "cachetools", - "rucio>=32.2.0" + "rucio>=32.2.0", + "cmd2" ] dynamic = ["version"] diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index b2e782095..299f03c72 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -23,11 +23,11 @@ def print_dataset_query(query, dataset_list, selected, console): sel = dataset in selected if ic ==0: table.add_row(name, f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green]Y" if sel else f"[red]N", + f"[green bold]Y" if sel else f"[red]N", end_section = ic==ncond-1) else: table.add_row("", f"[bold]({j})[/bold] 
{c}/{'-'.join(tiers)}", - f"[green]Y" if sel else f"[red]N", + f"[green bold]Y" if sel else f"[red]N", end_section = ic==ncond-1) ic+=1 j+=1 @@ -42,13 +42,14 @@ class MyCmdApp(cmd2.Cmd): def __init__(self): shortcuts = cmd2.DEFAULT_SHORTCUTS shortcuts.update({ 'L': 'login', 'Q': 'query', 'R': 'replicas', - 'S': 'select', + 'S': 'select', "LS": 'list_selected', 'lr': 'list_results'}) self.console = Console() self.rucio_client = None self.selected_datasets = [ ] self.last_query = "" - self.last_query_results = None + self.last_query_tree = None + self.last_query_list = None super().__init__(shortcuts=shortcuts) def do_login(self, args): @@ -72,37 +73,51 @@ def do_whoami(self, args): def do_query(self, args): # Your code here with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): - out = rucio_utils.query_dataset(args.arg_list[0], + outlist, outtree = rucio_utils.query_dataset(args.arg_list[0], client=self.rucio_client, tree=True) # Now let's print the results as a tree - print_dataset_query(args, out, + print_dataset_query(args, outtree, self.selected_datasets, self.console) self.last_query = args - self.last_query_results = out + self.last_query_list = outlist + self.last_query_tree = outtree + print("Use the command [bold red]select (S)[/] to selected the datasets") def do_list_results(self, args): - if self.last_query_results: - print_dataset_query(self.last_query, self.last_query_results, + if self.last_query_list: + print_dataset_query(self.last_query, self.last_query_tree, self.selected_datasets, self.console) else: print("First [bold red]query (Q)[/] for a dataset") def do_select(self, args): - if not self.last_query_results: + if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return + Nresults = len(self.last_query_list) + print("[cyan]Selected datasets:") for s in map(int, args.arg_list): - print(s) - - + if s <= Nresults: + self.selected_datasets.append(self.last_query_list[s-1]) + print(f"- ({s}) {self.last_query_list[s-1]}") + else: + print(f"[red]The requested dataset is not in the list. 
Please insert a position <={Nresults}") + def do_list_selected(self, args): + print("[cyan]Selected datasets:") + for i, ds in enumerate(self.selected_datasets): + print(f"- [{i}] [blue]{ds}") def do_replicas(self, args): - # Your code here - self.poutput("Replicas command executed") + if len(args.arg_list)==0: + print("[red] Please provide the index of the [bold]selected[/bold] dataset to analyze") + return + + + if __name__ == "__main__": app = MyCmdApp() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 70f301a5b..884b25039 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -284,6 +284,6 @@ def query_dataset(query, client=None, tree=False): if split[0] not in outdict: outdict[split[0]] = defaultdict(list) outdict[split[0]][split[1]].append(split[2]) - return outdict + return out, outdict else: return out From e1f11cfff29e983893aae1b1d1c7f6365fb5f499 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 11:07:41 +0200 Subject: [PATCH 08/80] Printing sites availability for replicas --- src/coffea/dataset_tools/dataset_query.py | 42 ++++++++++++++++++++--- src/coffea/dataset_tools/rucio_utils.py | 9 +++-- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 299f03c72..60fab9973 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -6,13 +6,14 @@ from rich.table import Table from rich.tree import Tree import rucio_utils +from collections import defaultdict def print_dataset_query(query, dataset_list, selected, console): table = Table(title=f"Query: [bold red]{query}") - table.add_column("name", justify="left", style="cyan", no_wrap=True) - table.add_column("tag", style="magenta", no_wrap=True) - table.add_column("selected", justify="center") + table.add_column("Name", justify="left", style="cyan", no_wrap=True) + table.add_column("Tag", style="magenta", no_wrap=True) + table.add_column("Selected", justify="center") table.row_styles = ["dim", "none"] j = 1 for name, conds in dataset_list.items(): @@ -50,6 +51,10 @@ def __init__(self): self.last_query = "" self.last_query_tree = None self.last_query_list = None + self.sites_whitelist = None + self.sites_blacklist = None + self.sites_regex = None + self.replicas_results = defaultdict(list) super().__init__(shortcuts=shortcuts) def do_login(self, args): @@ -113,10 +118,37 @@ def do_list_selected(self, args): def do_replicas(self, args): if len(args.arg_list)==0: - print("[red] Please provide the index of the [bold]selected[/bold] dataset to analyze") - return + print("[red] Please provide the index of the [bold]selected[/bold] dataset to analyze or the [bold]full dataset name[/bold]") + + if args.isdigit(): + if int(args) <= len(self.selected_datasets): + dataset = self.selected_datasets[int(args)-1] + else: + print(f"[red]The requested dataset is not in the list. 
Please insert a position <={len(self.selected_datasets)}") + else: + dataset = args + with self.console.status(f"Querying rucio for replicas: [bold red]{dataset}[/]"): + outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas(dataset, + whitelist_sites=self.sites_whitelist, + blacklist_sites=self.sites_blacklist, + regex_sites=self.sites_regex, + mode="full", + client=self.rucio_client) + table = Table(title=f"[cyan]Sites availability for dataset: [red]{dataset}") + table.add_column("Site", justify="left", style="cyan", no_wrap=True) + table.add_column("Files", style="magenta", no_wrap=True) + table.add_column("Availability", justify="center") + table.row_styles = ["dim", "none"] + Nfiles = len(outfiles) + + sorted_sites = dict(sorted(sites_counts.items(), key=lambda x:x[1], reverse=True)) + for site, stat in sorted_sites.items(): + table.add_row(site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%") + + self.console.print(table) + if __name__ == "__main__": diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 884b25039..20d3f7955 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -167,6 +167,9 @@ def get_dataset_files_replicas( - If `mode=="full"`, returns the list of sites where the file replica is available for each file in the dataset - If `mode=="first"`, returns a list of sites for the first replica of each file. + sites_counts: dict + Metadata couting the coverage of the dataset by site + """ sites_xrootd_prefix = get_xrootd_sites_map() client = client if client else get_rucio_client() @@ -260,14 +263,14 @@ def get_dataset_files_replicas( # Computing replicas by site: totfiles = len(outfiles) - sites_counts = defaultdict(float) + sites_counts = defaultdict(int) if mode == "full": for sites_by_file in outsites: for site in sites_by_file: - sites_counts[site] += 1 / totfiles + sites_counts[site] += 1 elif mode == "first": for site_by_file in outsites: - sites_counts[site] += 1 / totfiles + sites_counts[site] += 1 return outfiles, outsites, sites_counts From 1e191bf002625798970ddfe1bc48e53d2488c7b9 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 14:23:00 +0200 Subject: [PATCH 09/80] Added replica site selection --- src/coffea/dataset_tools/dataset_query.py | 253 +++++++++++++++++----- src/coffea/dataset_tools/rucio_utils.py | 8 +- 2 files changed, 204 insertions(+), 57 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 60fab9973..5359c2c1c 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -5,8 +5,10 @@ from rich.console import Console from rich.table import Table from rich.tree import Tree +from rich.prompt import Prompt import rucio_utils from collections import defaultdict +import random def print_dataset_query(query, dataset_list, selected, console): @@ -22,51 +24,65 @@ def print_dataset_query(query, dataset_list, selected, console): for c, tiers in conds.items(): dataset = f"/{name}/{c}/{tiers[0]}" sel = dataset in selected - if ic ==0: - table.add_row(name, f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green bold]Y" if sel else f"[red]N", - end_section = ic==ncond-1) + if ic == 0: + table.add_row( + name, + f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + f"[green bold]Y" if sel else f"[red]N", + end_section=ic == ncond - 1, + ) else: - table.add_row("", f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green bold]Y" if 
sel else f"[red]N", - end_section = ic==ncond-1) - ic+=1 - j+=1 + table.add_row( + "", + f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + f"[green bold]Y" if sel else f"[red]N", + end_section=ic == ncond - 1, + ) + ic += 1 + j += 1 console.print(table) class MyCmdApp(cmd2.Cmd): - prompt = "\033[1;34m" + "cms-datasets" + "\033[0m > " - + def __init__(self): shortcuts = cmd2.DEFAULT_SHORTCUTS - shortcuts.update({ 'L': 'login', 'Q': 'query', 'R': 'replicas', - 'S': 'select', "LS": 'list_selected', - 'lr': 'list_results'}) + shortcuts.update( + { + "L": "login", + "Q": "query", + "R": "replicas", + "S": "select", + "LS": "list_selected", + "lr": "list_results", + } + ) self.console = Console() self.rucio_client = None - self.selected_datasets = [ ] + self.selected_datasets = [] self.last_query = "" self.last_query_tree = None self.last_query_list = None self.sites_whitelist = None self.sites_blacklist = None self.sites_regex = None - self.replicas_results = defaultdict(list) + self.last_replicas_results = None + + self.replica_results = defaultdict(list) super().__init__(shortcuts=shortcuts) def do_login(self, args): - '''Login to the rucio client. Optionally a specific proxy file can be passed to the command. - If the proxy file is not specified, `voms-proxy-info` is used''' + """Login to the rucio client. Optionally a specific proxy file can be passed to the command. + If the proxy file is not specified, `voms-proxy-info` is used""" if args: self.rucio_client = rucio_utils.get_rucio_client(args[0]) else: self.rucio_client = rucio_utils.get_rucio_client() - + print(self.rucio_client) - #pprint(self.rucio_client.whoami()) + # pprint(self.rucio_client.whoami()) def do_whoami(self, args): # Your code here @@ -74,17 +90,15 @@ def do_whoami(self, args): print("First [bold]login (L)[/] to the rucio server") return print(self.rucio_client.whoami()) - + def do_query(self, args): # Your code here - with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): - outlist, outtree = rucio_utils.query_dataset(args.arg_list[0], - client=self.rucio_client, - tree=True) + with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): + outlist, outtree = rucio_utils.query_dataset( + args.arg_list[0], client=self.rucio_client, tree=True + ) # Now let's print the results as a tree - print_dataset_query(args, outtree, - self.selected_datasets, - self.console) + print_dataset_query(args, outtree, self.selected_datasets, self.console) self.last_query = args self.last_query_list = outlist self.last_query_tree = outtree @@ -92,8 +106,12 @@ def do_query(self, args): def do_list_results(self, args): if self.last_query_list: - print_dataset_query(self.last_query, self.last_query_tree, - self.selected_datasets, self.console) + print_dataset_query( + self.last_query, + self.last_query_tree, + self.selected_datasets, + self.console, + ) else: print("First [bold red]query (Q)[/] for a dataset") @@ -106,50 +124,177 @@ def do_select(self, args): print("[cyan]Selected datasets:") for s in map(int, args.arg_list): if s <= Nresults: - self.selected_datasets.append(self.last_query_list[s-1]) + self.selected_datasets.append(self.last_query_list[s - 1]) print(f"- ({s}) {self.last_query_list[s-1]}") else: - print(f"[red]The requested dataset is not in the list. Please insert a position <={Nresults}") - + print( + f"[red]The requested dataset is not in the list. 
Please insert a position <={Nresults}" + ) + def do_list_selected(self, args): print("[cyan]Selected datasets:") for i, ds in enumerate(self.selected_datasets): print(f"- [{i}] [blue]{ds}") - + def do_replicas(self, args): - if len(args.arg_list)==0: - print("[red] Please provide the index of the [bold]selected[/bold] dataset to analyze or the [bold]full dataset name[/bold]") + if len(args.arg_list) == 0: + print( + "[red] Please provide the index of the [bold]selected[/bold] dataset to analyze or the [bold]full dataset name[/bold]" + ) + return if args.isdigit(): if int(args) <= len(self.selected_datasets): - dataset = self.selected_datasets[int(args)-1] + dataset = self.selected_datasets[int(args) - 1] else: - print(f"[red]The requested dataset is not in the list. Please insert a position <={len(self.selected_datasets)}") + print( + f"[red]The requested dataset is not in the list. Please insert a position <={len(self.selected_datasets)}" + ) else: - dataset = args - - with self.console.status(f"Querying rucio for replicas: [bold red]{dataset}[/]"): - outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas(dataset, - whitelist_sites=self.sites_whitelist, - blacklist_sites=self.sites_blacklist, - regex_sites=self.sites_regex, - mode="full", - client=self.rucio_client) - - table = Table(title=f"[cyan]Sites availability for dataset: [red]{dataset}") + dataset = args.arg_list[0] + + with self.console.status( + f"Querying rucio for replicas: [bold red]{dataset}[/]" + ): + outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas( + dataset, + whitelist_sites=self.sites_whitelist, + blacklist_sites=self.sites_blacklist, + regex_sites=self.sites_regex, + mode="full", + client=self.rucio_client, + ) + self.last_replicas_results = (outfiles, outsites, sites_counts) + print(f"[cyan]Sites availability for dataset: [red]{dataset}") + table = Table(title="Available replicas") + table.add_column("Index", justify="center") table.add_column("Site", justify="left", style="cyan", no_wrap=True) table.add_column("Files", style="magenta", no_wrap=True) table.add_column("Availability", justify="center") table.row_styles = ["dim", "none"] Nfiles = len(outfiles) - sorted_sites = dict(sorted(sites_counts.items(), key=lambda x:x[1], reverse=True)) - for site, stat in sorted_sites.items(): - table.add_row(site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%") + sorted_sites = dict( + sorted(sites_counts.items(), key=lambda x: x[1], reverse=True) + ) + for i, (site, stat) in enumerate(sorted_sites.items()): + table.add_row(str(i), site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%") self.console.print(table) - - + strategy = Prompt.ask( + "Select sites", + choices=["round-robin", "choice", "quit"], + default="round-robin", + ) + + files_by_site = defaultdict(list) + + if strategy == "choice": + ind = list( + map(int, Prompt.ask("Enter list of sites index to be used").split(" ")) + ) + sites_to_use = [list(sorted_sites.keys())[i] for i in ind] + print(f"Filtering replicas with [green]: {' '.join(sites_to_use)}") + + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + random.shuffle(sites_to_use) + found = False + # loop on shuffled selected sites until one is found + for site in sites_to_use: + try: + iS = sites.index(site) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + found = True + break # keep only one replica + except ValueError: + # if the index is not found just go to the next site + pass + + if not found: + print( + 
f"[bold red]No replica found compatible with sites selection for file #{ifile}. The available sites are:" + ) + for f, s in zip(files, sites): + print(f"\t- [green]{s} [cyan]{f}") + return + + self.replica_results[dataset] = output + + elif strategy == "round-robin": + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + # selecting randomly from the sites + iS = random.randint(0, len(sites) - 1) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + self.replica_results[dataset] = output + + elif strategy == "quit": + print("[orange]Doing nothing...") + return + + # Now let's print the results + tree = Tree(label=f"Replicas for [green]{dataset}") + for site, files in files_by_site.items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") + + print("Final replicas selection") + self.console.print(tree) + + def do_whitelist_sites(self, args): + if self.sites_whitelist == None: + self.sites_whitelist = args.arg_list + else: + self.sites_whitelist += args.arg_list + print("[green]Whitelisted sites:") + for s in self.sites_whitelist: + print(f"- {s}") + + def do_blacklist_sites(self, args): + if self.sites_blacklist == None: + self.sites_blacklist = args.arg_list + else: + self.sites_blacklist += args.arg_list + print("[red]Blacklisted sites:") + for s in self.sites_blacklist: + print(f"- {s}") + + def do_regex_sites(self, args): + if args.startswith('"'): + args = args[1:] + if args.endswith('"'): + args = args[:-1] + self.sites_regex = r"{}".format(args) + print(f"New sites regex: [cyan]{self.sites_regex}") + + def do_sites_filters(self, args): + if args == "": + print("[green bold]Whitelisted sites:") + if self.sites_whitelist: + for s in self.sites_whitelist: + print(f"- {s}") + + print("[bold red]Blacklisted sites:") + if self.sites_blacklist: + for s in self.sites_blacklist: + print(f"- {s}") + + print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") + if args == "clear": + self.sites_whitelist = None + self.sites_blacklist = None + self.sites_regex = None + print("[bold green]Sites filters cleared") + + def do_list_replicas(self, args): + print("Datasets with selected replicas: ") + for dataset in self.replica_results: + print(f"\t-[cyan]{dataset}") + if __name__ == "__main__": app = MyCmdApp() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 20d3f7955..eb6a3ae0b 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -277,9 +277,11 @@ def get_dataset_files_replicas( def query_dataset(query, client=None, tree=False): client = client if client else get_rucio_client() - out = list(client.list_dids( - scope="cms", filters={"name": query, "type":"container"}, - long=False)) + out = list( + client.list_dids( + scope="cms", filters={"name": query, "type": "container"}, long=False + ) + ) if tree: outdict = {} for dataset in out: From 02cbbbece82dd61fe5319072eed26c2088eb9029 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 14:58:40 +0200 Subject: [PATCH 10/80] Added saving --- src/coffea/dataset_tools/dataset_query.py | 58 ++++++++++++++++++----- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 5359c2c1c..6f77fcce4 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -9,6 +9,7 @@ import rucio_utils from collections import defaultdict import 
random +import yaml def print_dataset_query(query, dataset_list, selected, console): @@ -53,10 +54,11 @@ def __init__(self): { "L": "login", "Q": "query", + "QR": "query_results", "R": "replicas", "S": "select", "LS": "list_selected", - "lr": "list_results", + "LR": "list_replicas", } ) self.console = Console() @@ -71,6 +73,7 @@ def __init__(self): self.last_replicas_results = None self.replica_results = defaultdict(list) + self.replica_results_bysite = {} super().__init__(shortcuts=shortcuts) def do_login(self, args): @@ -104,7 +107,7 @@ def do_query(self, args): self.last_query_tree = outtree print("Use the command [bold red]select (S)[/] to selected the datasets") - def do_list_results(self, args): + def do_query_results(self, args): if self.last_query_list: print_dataset_query( self.last_query, @@ -133,9 +136,16 @@ def do_select(self, args): def do_list_selected(self, args): print("[cyan]Selected datasets:") + table = Table(title="Selected datasets") + table.add_column("Index", justify="left", style="cyan", no_wrap=True) + table.add_column("Dataset", style="magenta", no_wrap=True) + table.add_column("Replicas selected", justify="center") + table.add_column("N. of files", justify="center") for i, ds in enumerate(self.selected_datasets): - print(f"- [{i}] [blue]{ds}") - + table.add_row(str(i+1), ds, "[green bold]Y" if ds in self.replica_results else "[red]N", + str(len(self.replica_results[ds])) if ds in self.replica_results else "-") + self.console.print(table) + def do_replicas(self, args): if len(args.arg_list) == 0: print( @@ -152,6 +162,8 @@ def do_replicas(self, args): ) else: dataset = args.arg_list[0] + # adding it to the selected datasets + self.selected_datasets.append(dataset) with self.console.status( f"Querying rucio for replicas: [bold red]{dataset}[/]" @@ -235,14 +247,14 @@ def do_replicas(self, args): print("[orange]Doing nothing...") return + self.replica_results_bysite[dataset] = files_by_site + # Now let's print the results - tree = Tree(label=f"Replicas for [green]{dataset}") + tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") for site, files in files_by_site.items(): T = tree.add(f"[green]{site}") for f in files: T.add(f"[cyan]{f}") - - print("Final replicas selection") self.console.print(tree) def do_whitelist_sites(self, args): @@ -291,10 +303,34 @@ def do_sites_filters(self, args): print("[bold green]Sites filters cleared") def do_list_replicas(self, args): - print("Datasets with selected replicas: ") - for dataset in self.replica_results: - print(f"\t-[cyan]{dataset}") - + if len(args.arg_list)==0: + print("[red]Please call the command with the index of a selected dataset") + else: + if int(args) > len(self.selected_datasets): + print(f"[red] Select the replica with index < {len(self.selected_datasets)}") + return + else: + dataset = self.selected_datasets[int(args)-1] + if dataset not in self.replica_results: + print(f"[red bold]No replica info for dataset {dataset}. 
You need to selected the replicas with [cyan] replicas {args}") + tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") + + for site, files in self.replica_results_bysite[dataset].items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") + + self.console.print(tree) + + def do_save(self, args): + '''Save the replica information in yaml format''' + if not len(args): + print("[red]Please provide an output filename") + else: + with open(args, "w") as file: + yaml.dump(dict(self.replica_results), file, + default_flow_style=False) + print(f"[green]File {args} saved!") if __name__ == "__main__": app = MyCmdApp() From daf5e52c3b8dd00aac3338a17b7d4d3ea503d076 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Aug 2023 13:04:00 +0000 Subject: [PATCH 11/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 51 ++++++++++++++--------- src/coffea/dataset_tools/rucio_utils.py | 9 ++-- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 6f77fcce4..3a0c8da59 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,15 +1,16 @@ -from cmd2 import Cmd +import random +from collections import defaultdict + import cmd2 +import rucio_utils +import yaml +from cmd2 import Cmd from rich import print -from rich.pretty import pprint from rich.console import Console +from rich.pretty import pprint +from rich.prompt import Prompt from rich.table import Table from rich.tree import Tree -from rich.prompt import Prompt -import rucio_utils -from collections import defaultdict -import random -import yaml def print_dataset_query(query, dataset_list, selected, console): @@ -142,10 +143,16 @@ def do_list_selected(self, args): table.add_column("Replicas selected", justify="center") table.add_column("N. 
of files", justify="center") for i, ds in enumerate(self.selected_datasets): - table.add_row(str(i+1), ds, "[green bold]Y" if ds in self.replica_results else "[red]N", - str(len(self.replica_results[ds])) if ds in self.replica_results else "-") + table.add_row( + str(i + 1), + ds, + "[green bold]Y" if ds in self.replica_results else "[red]N", + str(len(self.replica_results[ds])) + if ds in self.replica_results + else "-", + ) self.console.print(table) - + def do_replicas(self, args): if len(args.arg_list) == 0: print( @@ -248,7 +255,7 @@ def do_replicas(self, args): return self.replica_results_bysite[dataset] = files_by_site - + # Now let's print the results tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") for site, files in files_by_site.items(): @@ -280,7 +287,7 @@ def do_regex_sites(self, args): args = args[1:] if args.endswith('"'): args = args[:-1] - self.sites_regex = r"{}".format(args) + self.sites_regex = fr"{args}" print(f"New sites regex: [cyan]{self.sites_regex}") def do_sites_filters(self, args): @@ -303,18 +310,22 @@ def do_sites_filters(self, args): print("[bold green]Sites filters cleared") def do_list_replicas(self, args): - if len(args.arg_list)==0: + if len(args.arg_list) == 0: print("[red]Please call the command with the index of a selected dataset") else: if int(args) > len(self.selected_datasets): - print(f"[red] Select the replica with index < {len(self.selected_datasets)}") + print( + f"[red] Select the replica with index < {len(self.selected_datasets)}" + ) return else: - dataset = self.selected_datasets[int(args)-1] + dataset = self.selected_datasets[int(args) - 1] if dataset not in self.replica_results: - print(f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas {args}") + print( + f"[red bold]No replica info for dataset {dataset}. 
You need to selected the replicas with [cyan] replicas {args}" + ) tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") - + for site, files in self.replica_results_bysite[dataset].items(): T = tree.add(f"[green]{site}") for f in files: @@ -323,15 +334,15 @@ def do_list_replicas(self, args): self.console.print(tree) def do_save(self, args): - '''Save the replica information in yaml format''' + """Save the replica information in yaml format""" if not len(args): print("[red]Please provide an output filename") else: with open(args, "w") as file: - yaml.dump(dict(self.replica_results), file, - default_flow_style=False) + yaml.dump(dict(self.replica_results), file, default_flow_style=False) print(f"[green]File {args} saved!") + if __name__ == "__main__": app = MyCmdApp() app.cmdloop() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index eb6a3ae0b..3ddb0fcf8 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -1,10 +1,11 @@ -import os import getpass -import re import json -from rucio.client import Client -from collections import defaultdict +import os +import re import subprocess +from collections import defaultdict + +from rucio.client import Client # Rucio needs the default configuration --> taken from CMS cvmfs defaults os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" From c9101bb8f975b2f9131e809f831da055e3b7167d Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 15:19:11 +0200 Subject: [PATCH 12/80] Formatting and flake8 --- src/coffea/dataset_tools/dataset_query.py | 46 +++++++++++++---------- src/coffea/dataset_tools/rucio_utils.py | 13 +++---- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 6f77fcce4..ed88bebaf 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,12 +1,10 @@ -from cmd2 import Cmd import cmd2 from rich import print -from rich.pretty import pprint from rich.console import Console from rich.table import Table from rich.tree import Tree from rich.prompt import Prompt -import rucio_utils +from . import rucio_utils from collections import defaultdict import random import yaml @@ -29,14 +27,14 @@ def print_dataset_query(query, dataset_list, selected, console): table.add_row( name, f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green bold]Y" if sel else f"[red]N", + "[green bold]Y" if sel else "[red]N", end_section=ic == ncond - 1, ) else: table.add_row( "", f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green bold]Y" if sel else f"[red]N", + "[green bold]Y" if sel else "[red]N", end_section=ic == ncond - 1, ) ic += 1 @@ -142,10 +140,16 @@ def do_list_selected(self, args): table.add_column("Replicas selected", justify="center") table.add_column("N. 
of files", justify="center") for i, ds in enumerate(self.selected_datasets): - table.add_row(str(i+1), ds, "[green bold]Y" if ds in self.replica_results else "[red]N", - str(len(self.replica_results[ds])) if ds in self.replica_results else "-") + table.add_row( + str(i + 1), + ds, + "[green bold]Y" if ds in self.replica_results else "[red]N", + str(len(self.replica_results[ds])) + if ds in self.replica_results + else "-", + ) self.console.print(table) - + def do_replicas(self, args): if len(args.arg_list) == 0: print( @@ -248,7 +252,7 @@ def do_replicas(self, args): return self.replica_results_bysite[dataset] = files_by_site - + # Now let's print the results tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") for site, files in files_by_site.items(): @@ -258,7 +262,7 @@ def do_replicas(self, args): self.console.print(tree) def do_whitelist_sites(self, args): - if self.sites_whitelist == None: + if self.sites_whitelist is None: self.sites_whitelist = args.arg_list else: self.sites_whitelist += args.arg_list @@ -267,7 +271,7 @@ def do_whitelist_sites(self, args): print(f"- {s}") def do_blacklist_sites(self, args): - if self.sites_blacklist == None: + if self.sites_blacklist is None: self.sites_blacklist = args.arg_list else: self.sites_blacklist += args.arg_list @@ -303,18 +307,22 @@ def do_sites_filters(self, args): print("[bold green]Sites filters cleared") def do_list_replicas(self, args): - if len(args.arg_list)==0: + if len(args.arg_list) == 0: print("[red]Please call the command with the index of a selected dataset") else: if int(args) > len(self.selected_datasets): - print(f"[red] Select the replica with index < {len(self.selected_datasets)}") + print( + f"[red] Select the replica with index < {len(self.selected_datasets)}" + ) return else: - dataset = self.selected_datasets[int(args)-1] + dataset = self.selected_datasets[int(args) - 1] if dataset not in self.replica_results: - print(f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas {args}") + print( + f"[red bold]No replica info for dataset {dataset}. 
You need to selected the replicas with [cyan] replicas {args}" + ) tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") - + for site, files in self.replica_results_bysite[dataset].items(): T = tree.add(f"[green]{site}") for f in files: @@ -323,15 +331,15 @@ def do_list_replicas(self, args): self.console.print(tree) def do_save(self, args): - '''Save the replica information in yaml format''' + """Save the replica information in yaml format""" if not len(args): print("[red]Please provide an output filename") else: with open(args, "w") as file: - yaml.dump(dict(self.replica_results), file, - default_flow_style=False) + yaml.dump(dict(self.replica_results), file, default_flow_style=False) print(f"[green]File {args} saved!") + if __name__ == "__main__": app = MyCmdApp() app.cmdloop() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index eb6a3ae0b..ce2825c0f 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -82,12 +82,12 @@ def get_xrootd_sites_map(): continue try: data = json.load(open(conf)) - except: + except Exception: continue for site in data: if site["type"] != "DISK": continue - if site["rse"] == None: + if site["rse"] is None: continue for proc in site["protocols"]: if proc["protocol"] == "XRootD": @@ -187,7 +187,7 @@ def get_dataset_files_replicas( meta = filedata["pfns"][rses[site][0]] if ( meta["type"] != "DISK" - or meta["volatile"] == True + or meta["volatile"] or filedata["states"][site] != "AVAILABLE" or site not in sites_xrootd_prefix ): @@ -220,7 +220,7 @@ def get_dataset_files_replicas( meta = filedata["pfns"][rses[site][0]] if ( meta["type"] != "DISK" - or meta["volatile"] == True + or meta["volatile"] or filedata["states"][site] != "AVAILABLE" or site not in sites_xrootd_prefix ): @@ -238,7 +238,7 @@ def get_dataset_files_replicas( meta = filedata["pfns"][rses[site][0]] if ( meta["type"] != "DISK" - or meta["volatile"] == True + or meta["volatile"] or filedata["states"][site] != "AVAILABLE" or site not in sites_xrootd_prefix ): @@ -259,10 +259,9 @@ def get_dataset_files_replicas( outfiles.append(outfile[0]) outsites.append(outsite[0]) else: - raise NotImplemented(f"Mode {mode} not yet implemented!") + raise NotImplementedError(f"Mode {mode} not yet implemented!") # Computing replicas by site: - totfiles = len(outfiles) sites_counts = defaultdict(int) if mode == "full": for sites_by_file in outsites: From b78f6403f1b05ad5197baa0c8efaf00d234480c1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Aug 2023 13:27:01 +0000 Subject: [PATCH 13/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 99c41579b..cab5c7ef8 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,14 +1,14 @@ import random from collections import defaultdict + import cmd2 +import rucio_utils import yaml from rich import print from rich.console import Console from rich.prompt import Prompt from rich.table import Table from rich.tree import Tree -from rich.prompt import Prompt -import rucio_utils def print_dataset_query(query, dataset_list, selected, console): From 21cd240117f351d933a4874ed5474e0835c6d992 Mon Sep 17 00:00:00 
2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 15:31:17 +0200 Subject: [PATCH 14/80] Fixed comments spelling --- src/coffea/dataset_tools/rucio_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 8fa8dd906..96b17b454 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -64,7 +64,7 @@ def get_rucio_client(proxy=None) -> Client: def get_xrootd_sites_map(): """ - The mapping beetween RSE (sites) and the xrootd prefix rules is read + The mapping between RSE (sites) and the xrootd prefix rules is read from `/cvmfs/cms/cern.ch/SITECONF/*site*/storage.json`. This function returns the list of xrootd prefix rules for each site. @@ -136,7 +136,7 @@ def get_dataset_files_replicas( of all the replicas of the files in a CMS dataset. The sites can be filtered in 3 different ways: - - `whilist_sites`: list of sites to select from. If the file is not found there, raise an Expection. + - `whilist_sites`: list of sites to select from. If the file is not found there, raise an Exception. - `blacklist_sites`: list of sites to avoid. If the file has no left site, raise an Exception - `regex_sites`: regex expression to restrict the list of sites. @@ -169,7 +169,7 @@ def get_dataset_files_replicas( - If `mode=="first"`, returns a list of sites for the first replica of each file. sites_counts: dict - Metadata couting the coverage of the dataset by site + Metadata counting the coverage of the dataset by site """ sites_xrootd_prefix = get_xrootd_sites_map() From dbed7d4a5506233632b8b050084f3faf5d24c32f Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 28 Aug 2023 09:15:12 -0500 Subject: [PATCH 15/80] py 3.9 for cirrus --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 42c13adb8..223b900c4 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -11,7 +11,7 @@ task: cpu: 2 memory: 7G matrix: - - image: python:3.8 + - image: python:3.9 - image: python:3.11 create_venv_script: | From 153c0ae1c7a9c394a2a61c20ad3c52fb4acdcf6d Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 30 Aug 2023 14:06:50 +0200 Subject: [PATCH 16/80] Switched to rucio-clients --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 82fa7e7e1..f6521fe64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,8 @@ dependencies = [ "pandas", "hist>=2", "cachetools", - "rucio>=32.2.0", + "rucio-clients>=32;python_version>'3.8'", + "rucio-clients<32;python_version<'3.9'", "cmd2" ] dynamic = ["version"] From 5a45e4a270cbd73c05adf9fe74fdf5d005823d98 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 30 Aug 2023 14:42:14 +0200 Subject: [PATCH 17/80] Added some docs to the cli --- src/coffea/dataset_tools/dataset_query.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index cab5c7ef8..17a599096 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -2,7 +2,7 @@ from collections import defaultdict import cmd2 -import rucio_utils +from . 
import rucio_utils import yaml from rich import print from rich.console import Console @@ -342,5 +342,24 @@ def do_save(self, args): if __name__ == "__main__": + intro_msg = """[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] +Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. +Some basic commands: + - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) + - [bold cyan]select (S)[/]: Select datasets to process further from query results + - [bold cyan]replicas (R)[/]: Query rucio to look for files replica and then select the preferred sites + - [bold cyan]list_selected (LS)[/]: Print a list of the selected datasets + - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset + - [bold cyan]sites_filters[/]: show the active sites filters + - [bold cyan]sites_filters clear[/]: clear all the active sites filters + - [bold cyan]whitelist_sites[/]: Select sites to whitelist for replica queries + - [bold cyan]blacklist_sites[/]: Select sites to blacklist for replica queries + - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" + - [bold cyan]save (S) file.yaml[/]: Save the replicas results to file for further processing + - [bold cyan]help[/]: get help! +""" + console = Console() + console.print(intro_msg, justify="left") + app = DatasetQueryApp() app.cmdloop() From 00fb57c2ea23e75046669ebeaf85c827bfe9fa59 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Aug 2023 12:47:43 +0000 Subject: [PATCH 18/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pyproject.toml | 2 +- src/coffea/dataset_tools/dataset_query.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f6521fe64..4adec6b30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ dependencies = [ "hist>=2", "cachetools", "rucio-clients>=32;python_version>'3.8'", - "rucio-clients<32;python_version<'3.9'", + "rucio-clients<32;python_version<'3.9'", "cmd2" ] dynamic = ["version"] diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 17a599096..91b3910c7 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -2,7 +2,6 @@ from collections import defaultdict import cmd2 -from . import rucio_utils import yaml from rich import print from rich.console import Console @@ -10,6 +9,8 @@ from rich.table import Table from rich.tree import Tree +from . import rucio_utils + def print_dataset_query(query, dataset_list, selected, console): table = Table(title=f"Query: [bold red]{query}") @@ -342,7 +343,7 @@ def do_save(self, args): if __name__ == "__main__": - intro_msg = """[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] + intro_msg = r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. 
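[Editor's note, not part of the patch] For readers who want the same replica lookup outside the interactive prompt, a rough, untested sketch of the underlying rucio_utils call is below. The keyword names follow the docstring shown earlier (whitelist/blacklist/regex site filters, mode "full" or "first"); the dataset name is a placeholder and the (files, sites, sites_counts) return layout is an assumption.

from coffea.dataset_tools import rucio_utils

dataset = "/MyPrimaryDataset/MyCampaign-v1/NANOAODSIM"  # placeholder DAS-style name

# assumed keyword names and return tuple, per the docstring above
files, sites, sites_counts = rucio_utils.get_dataset_files_replicas(
    dataset,
    regex_sites=r"T[123]_(FR|IT|BE|CH|DE)_\w+",  # regex form taken from the CLI help text
    mode="full",  # "first" would instead keep a single replica per file
)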
Some basic commands: - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) From 6d5d8005506efbeccf78fe401204c934413c15c8 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 30 Aug 2023 12:43:17 -0500 Subject: [PATCH 19/80] roll back test to py3.8 --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 223b900c4..42c13adb8 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -11,7 +11,7 @@ task: cpu: 2 memory: 7G matrix: - - image: python:3.9 + - image: python:3.8 - image: python:3.11 create_venv_script: | From e0f17266bfae5fdce7784465d3d7eeccf4d74cf6 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 30 Aug 2023 12:46:01 -0500 Subject: [PATCH 20/80] math.ceil instead of integer division --- src/coffea/dataset_tools/preprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index 7fdc46857..ca5400492 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -3,6 +3,7 @@ import awkward import dask import dask_awkward +import math import numpy import uproot @@ -46,7 +47,7 @@ def _get_steps( out = numpy.array(out, dtype="int64") out = numpy.stack((out[:-1], out[1:]), axis=1) else: - n_steps = num_entries // target_step_size + n_steps = math.ceil(num_entries // target_step_size) out = numpy.array( [ [ From 885bbf0aa153f639f7bf71f7c686cf9564f97e5d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Aug 2023 17:46:31 +0000 Subject: [PATCH 21/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/preprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index ca5400492..18f4e615c 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -3,7 +3,6 @@ import awkward import dask import dask_awkward -import math import numpy import uproot From 3283256e46c9144f8ad8b43197c715e8ded80fb3 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 30 Aug 2023 14:00:56 -0500 Subject: [PATCH 22/80] Forgot to remove the slash. 
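[Editor's note] A quick numeric check of why the slash matters (illustrative values only): with integer division inside ceil the truncation has already happened, so the step count comes out one short whenever the entries do not divide evenly.

import math

num_entries, target_step_size = 1_000, 300
math.ceil(num_entries // target_step_size)  # 3: too few steps to cover 1000 entries in chunks of <= 300
math.ceil(num_entries / target_step_size)   # 4: the intended number of steps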
--- src/coffea/dataset_tools/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index 18f4e615c..f97153c92 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -46,7 +46,7 @@ def _get_steps( out = numpy.array(out, dtype="int64") out = numpy.stack((out[:-1], out[1:]), axis=1) else: - n_steps = math.ceil(num_entries // target_step_size) + n_steps = math.ceil(num_entries / target_step_size) out = numpy.array( [ [ From 4c8d917ab28307d844b650faa3da7ba9537f749f Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 17 Oct 2023 13:39:40 -0500 Subject: [PATCH 23/80] make rucio-clients an extra --- pyproject.toml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9bc350125..d832f945b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,9 +58,6 @@ dependencies = [ "pandas", "hist>=2", "cachetools", - "rucio-clients>=32;python_version>'3.8'", - "rucio-clients<32;python_version<'3.9'", - "cmd2" ] dynamic = ["version"] @@ -88,6 +85,11 @@ servicex = [ "servicex>=2.5.3", "func-adl_servicex", ] +rucio = [ + "rucio-clients>=32;python_version>'3.8'", + "rucio-clients<32;python_version<'3.9'", + "cmd2", +] dev = [ "pre-commit", "flake8", From 9c598bba509caec6a489b8fbf13aff51202e9849 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Fri, 10 Nov 2023 07:05:26 -0600 Subject: [PATCH 24/80] two wrappers to apply processor wrapped code to datasets --- src/coffea/dataset_tools/__init__.py | 3 ++- src/coffea/dataset_tools/apply_processor.py | 23 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 src/coffea/dataset_tools/apply_processor.py diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index c02a96e48..445481c3c 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,3 +1,4 @@ +from coffea.dataset_tools.apply_processor import apply_to_fileset, apply_to_one_dataset from coffea.dataset_tools.preprocess import preprocess -__all__ = ["preprocess"] +__all__ = ["preprocess", "apply_to_one_dataset", "apply_to_fileset"] diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py new file mode 100644 index 000000000..1bcd7d18a --- /dev/null +++ b/src/coffea/dataset_tools/apply_processor.py @@ -0,0 +1,23 @@ +from coffea.nanoevents import NanoAODSchema, NanoEventsFactory +from coffea.processor import ProcessorABC + + +def apply_to_one_dataset( + proc: ProcessorABC, dataset, schemaclass=NanoAODSchema, metadata={} +): + files = dataset["files"] + events = NanoEventsFactory.from_root( + files, + metadata=metadata, + schemaclass=NanoAODSchema, + ) + return proc.process(events) + + +def apply_to_fileset(proc: ProcessorABC, fileset, schemaclass=NanoAODSchema): + out = {} + for name, dataset in fileset.items(): + metadata = dataset.get("metadata", {}) + metadata["dataset"] = name + out[name] = apply_to_one_dataset(proc, dataset, schemaclass, metadata) + return out From 20697862a40cf95132fa72730b7984dbc0396e58 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Fri, 10 Nov 2023 08:31:45 -0600 Subject: [PATCH 25/80] let preprocess deal with missing files in a configurable way, add in helper to replicate maxchunks functionality --- src/coffea/dataset_tools/__init__.py | 9 ++++++++- src/coffea/dataset_tools/manipulations.py | 14 ++++++++++++++ 
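[Editor's note, not part of the patch] A minimal usage sketch for the two wrappers introduced above, assuming a fileset in the "files"/"metadata" layout used in this series (dataset name, file path, tree name, and the toy processor are all placeholders) and a processor whose process method builds dask-awkward outputs.

import dask
import dask_awkward
from coffea import processor
from coffea.dataset_tools import apply_to_fileset


class CountEvents(processor.ProcessorABC):
    """Toy processor for illustration: counts events per dataset."""

    def process(self, events):
        return {"entries": dask_awkward.num(events, axis=0)}

    def postprocess(self, accumulator):
        return accumulator


fileset = {
    "ZJets": {  # hypothetical dataset entry
        "files": {"zjets.root": "Events"},
        "metadata": {"is_mc": True},
    },
}

to_compute = apply_to_fileset(CountEvents(), fileset)  # one delayed output per dataset
(out,) = dask.compute(to_compute)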
src/coffea/dataset_tools/preprocess.py | 15 ++++++++++++--- 3 files changed, 34 insertions(+), 4 deletions(-) create mode 100644 src/coffea/dataset_tools/manipulations.py diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index 445481c3c..888df7eaf 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,4 +1,11 @@ from coffea.dataset_tools.apply_processor import apply_to_fileset, apply_to_one_dataset +from coffea.dataset_tools.manipulations import max_chunks, slice_chunks from coffea.dataset_tools.preprocess import preprocess -__all__ = ["preprocess", "apply_to_one_dataset", "apply_to_fileset"] +__all__ = [ + "preprocess", + "apply_to_one_dataset", + "apply_to_fileset", + "max_chunks", + "slice_chunks", +] diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py new file mode 100644 index 000000000..b7e40e785 --- /dev/null +++ b/src/coffea/dataset_tools/manipulations.py @@ -0,0 +1,14 @@ +def max_chunks(fileset, maxchunks=None): + return slice_chunks(fileset, slice(maxchunks)) + + +def slice_chunks(fileset, theslice=slice(None)): + if not isinstance(theslice, slice): + theslice = slice(theslice) + + out = fileset.copy() + for name, entry in fileset.items(): + for fname, finfo in entry["files"].items(): + out[name]["files"][fname]["steps"] = finfo["steps"][theslice] + + return out diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index f97153c92..eef96b549 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -12,6 +12,8 @@ def _get_steps( maybe_step_size=None, align_clusters=False, recalculate_seen_steps=False, + skip_bad_files=False, + file_exceptions=(FileNotFoundError, OSError), ): nf_backend = awkward.backend(normed_files) lz_or_nf = awkward.typetracer.length_zero_if_typetracer(normed_files) @@ -20,9 +22,12 @@ def _get_steps( for arg in lz_or_nf: try: the_file = uproot.open({arg.file: None}) - except FileNotFoundError: - array.append(None) - continue + except file_exceptions as e: + if skip_bad_files: + array.append(None) + continue + else: + raise e tree = the_file[arg.object_path] num_entries = tree.num_entries @@ -95,6 +100,8 @@ def preprocess( align_clusters=False, recalculate_seen_steps=False, files_per_batch=1, + skip_bad_files=False, + file_exceptions=(FileNotFoundError, OSError), ): out_updated = fileset.copy() out_available = fileset.copy() @@ -128,6 +135,8 @@ def preprocess( maybe_step_size=maybe_step_size, align_clusters=align_clusters, recalculate_seen_steps=recalculate_seen_steps, + skip_bad_files=skip_bad_files, + file_exceptions=file_exceptions, ) all_processed_files = dask.compute(files_to_preprocess)[0] From e3959f4252114ba8bbd477a179e4fbe64756f241 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 16:46:11 -0600 Subject: [PATCH 26/80] remove old tests and switch to new tools in tests --- src/coffea/dataset_tools/apply_processor.py | 6 +- src/coffea/dataset_tools/manipulations.py | 5 +- src/coffea/dataset_tools/preprocess.py | 5 +- src/coffea/processor/__init__.py | 141 +- src/coffea/processor/accumulator.py | 6 +- src/coffea/processor/dask/__init__.py | 77 - src/coffea/processor/dataframe.py | 117 - src/coffea/processor/executor.py | 2220 ----------------- src/coffea/processor/helpers.py | 273 -- src/coffea/processor/parsl/__init__.py | 0 src/coffea/processor/parsl/condor_config.py | 77 - src/coffea/processor/parsl/detail.py | 89 - 
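[Editor's note, not part of the patch] To make the new helpers concrete, a small illustrative example: slice_chunks only trims each file's precomputed "steps" list, and max_chunks(fileset, n) is shorthand for slice_chunks(fileset, slice(n)). The fileset below is hypothetical.

from coffea.dataset_tools import max_chunks, slice_chunks

fileset = {
    "demo": {  # hypothetical preprocessed dataset entry
        "files": {
            "f.root": {
                "object_path": "Events",
                "steps": [[0, 100], [100, 200], [200, 300]],
            }
        }
    }
}

trimmed = slice_chunks(fileset, slice(2))
assert trimmed["demo"]["files"]["f.root"]["steps"] == [[0, 100], [100, 200]]
# max_chunks is the maxchunks-style shortcut for the same operation
assert max_chunks(fileset, 2) == trimmed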
src/coffea/processor/parsl/slurm_config.py | 67 - src/coffea/processor/parsl/timeout.py | 21 - src/coffea/processor/servicex/__init__.py | 39 - src/coffea/processor/servicex/analysis.py | 45 - .../processor/servicex/dask_executor.py | 89 - src/coffea/processor/servicex/data_source.py | 116 - src/coffea/processor/servicex/executor.py | 183 -- .../processor/servicex/local_executor.py | 69 - src/coffea/processor/spark/__init__.py | 0 src/coffea/processor/spark/detail.py | 133 - src/coffea/processor/spark/spark_executor.py | 195 -- src/coffea/processor/templates/__init__.py | 0 src/coffea/processor/templates/spark.py.tmpl | 24 - tests/test_local_executors.py | 125 - tests/test_workitem.py | 29 - tests/wq.py | 66 - 28 files changed, 16 insertions(+), 4201 deletions(-) delete mode 100644 src/coffea/processor/dask/__init__.py delete mode 100644 src/coffea/processor/dataframe.py delete mode 100644 src/coffea/processor/executor.py delete mode 100644 src/coffea/processor/helpers.py delete mode 100644 src/coffea/processor/parsl/__init__.py delete mode 100644 src/coffea/processor/parsl/condor_config.py delete mode 100644 src/coffea/processor/parsl/detail.py delete mode 100644 src/coffea/processor/parsl/slurm_config.py delete mode 100644 src/coffea/processor/parsl/timeout.py delete mode 100644 src/coffea/processor/servicex/__init__.py delete mode 100644 src/coffea/processor/servicex/analysis.py delete mode 100644 src/coffea/processor/servicex/dask_executor.py delete mode 100644 src/coffea/processor/servicex/data_source.py delete mode 100644 src/coffea/processor/servicex/executor.py delete mode 100644 src/coffea/processor/servicex/local_executor.py delete mode 100644 src/coffea/processor/spark/__init__.py delete mode 100644 src/coffea/processor/spark/detail.py delete mode 100644 src/coffea/processor/spark/spark_executor.py delete mode 100644 src/coffea/processor/templates/__init__.py delete mode 100644 src/coffea/processor/templates/spark.py.tmpl delete mode 100644 tests/test_local_executors.py delete mode 100644 tests/test_workitem.py delete mode 100755 tests/wq.py diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 1bcd7d18a..610bae8fa 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -9,15 +9,15 @@ def apply_to_one_dataset( events = NanoEventsFactory.from_root( files, metadata=metadata, - schemaclass=NanoAODSchema, - ) + schemaclass=schemaclass, + ).events() return proc.process(events) def apply_to_fileset(proc: ProcessorABC, fileset, schemaclass=NanoAODSchema): out = {} for name, dataset in fileset.items(): - metadata = dataset.get("metadata", {}) + metadata = dataset.get("metadata", {}).copy() metadata["dataset"] = name out[name] = apply_to_one_dataset(proc, dataset, schemaclass, metadata) return out diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index b7e40e785..e963a067c 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -1,3 +1,6 @@ +import copy + + def max_chunks(fileset, maxchunks=None): return slice_chunks(fileset, slice(maxchunks)) @@ -6,7 +9,7 @@ def slice_chunks(fileset, theslice=slice(None)): if not isinstance(theslice, slice): theslice = slice(theslice) - out = fileset.copy() + out = copy.deepcopy(fileset) for name, entry in fileset.items(): for fname, finfo in entry["files"].items(): out[name]["files"][fname]["steps"] = finfo["steps"][theslice] diff --git 
a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index eef96b549..ad4d4bc6d 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -1,3 +1,4 @@ +import copy import math import awkward @@ -103,8 +104,8 @@ def preprocess( skip_bad_files=False, file_exceptions=(FileNotFoundError, OSError), ): - out_updated = fileset.copy() - out_available = fileset.copy() + out_updated = copy.deepcopy(fileset) + out_available = copy.deepcopy(fileset) all_ak_norm_files = {} files_to_preprocess = {} for name, info in fileset.items(): diff --git a/src/coffea/processor/__init__.py b/src/coffea/processor/__init__.py index 54aea102d..91b6f357d 100644 --- a/src/coffea/processor/__init__.py +++ b/src/coffea/processor/__init__.py @@ -2,146 +2,11 @@ """ -# deprecated run_uproot_job & executor usage: -from functools import partial - -from coffea.nanoevents.schemas import NanoAODSchema, TreeMakerSchema - -from .accumulator import ( - Accumulatable, - AccumulatorABC, - accumulate, - column_accumulator, - defaultdict_accumulator, - dict_accumulator, - list_accumulator, - set_accumulator, - value_accumulator, -) -from .dataframe import LazyDataFrame -from .executor import ( - DaskExecutor, - FuturesExecutor, - IterativeExecutor, - ParslExecutor, - Runner, - WorkQueueExecutor, - run_spark_job, -) -from .helpers import PackedSelection, Weights +from .accumulator import AccumulatorABC, dict_accumulator from .processor import ProcessorABC - -def _run_x_job( - fileset, - treename, - processor_instance, - executor, - executor_args={}, - pre_executor=None, - pre_args=None, - chunksize=100000, - maxchunks=None, - metadata_cache=None, - dynamic_chunksize=None, - format="root", -): - """ - Please use instead, e.g.: - - executor = IterativeExecutor() - run = processor.Runner( - executor=executor, - schema=processor.NanoAODSchema, - ) - hists = run(filelist, "Events", processor_instance=processor_instance) - """ - - # turn this deprecation warning on from coffea.__version__ >= 0.8 on - # from coffea.util import deprecate - # deprecate( - # RuntimeError(f"This method is deprecated, please use directly the new: {executor} and {Runner} classes.\n {_run_x_job.__doc__}"), # noqa: E501 - # 0.9, - # ) - - # extract executor kwargs - exe_args = {} - exe_fields = executor.__dataclass_fields__.keys() - exe_keys = list(executor_args.keys()) - for k in exe_keys: - if k in exe_fields: - exe_args[k] = executor_args.pop(k) - - executor = executor(**exe_args) - - # extract preexecutor kwargs - if pre_executor is not None and pre_args is not None: - pre_exe_args = {} - pre_exe_fields = pre_executor.__dataclass_fields__.keys() - pre_exe_keys = list(pre_args.keys()) - for k in pre_exe_keys: - if k in pre_exe_fields: - pre_exe_args[k] = pre_args.pop(k) - - pre_executor = pre_executor(**pre_exe_args) - - # make Runner instance, assume other args are for _work_function & co. 
- run = Runner( - executor=executor, - chunksize=chunksize, - maxchunks=maxchunks, - metadata_cache=metadata_cache, - dynamic_chunksize=dynamic_chunksize, - format=format, - **executor_args, - ) - - return run( - fileset, - treename, - processor_instance=processor_instance, - ) - - -run_uproot_job = partial(_run_x_job, format="root") -run_parquet_job = partial(_run_x_job, format="parquet") - -iterative_executor = IterativeExecutor -futures_executor = FuturesExecutor -dask_executor = DaskExecutor -parsl_executor = ParslExecutor -work_queue_executor = WorkQueueExecutor - - __all__ = [ - "ProcessorABC", - "LazyDataFrame", - "Weights", - "PackedSelection", - "IterativeExecutor", - "FuturesExecutor", - "DaskExecutor", - "ParslExecutor", - "WorkQueueExecutor", - "Runner", - "run_spark_job", - "accumulate", - "Accumulatable", - "AccumulatorABC", - "value_accumulator", - "list_accumulator", - "set_accumulator", "dict_accumulator", - "defaultdict_accumulator", - "column_accumulator", - "NanoAODSchema", - "TreeMakerSchema", - # following methods are deprecated - "run_uproot_job", - "run_parquet_job", - "iterative_executor", - "futures_executor", - "dask_executor", - "parsl_executor", - "work_queue_executor", + "AccumulatorABC", + "ProcessorABC", ] diff --git a/src/coffea/processor/accumulator.py b/src/coffea/processor/accumulator.py index 282214a6c..8ad12dab1 100644 --- a/src/coffea/processor/accumulator.py +++ b/src/coffea/processor/accumulator.py @@ -57,14 +57,14 @@ def add(a: Accumulatable, b: Accumulatable) -> Accumulatable: out[key] = ( copy.deepcopy(a[key]) if not isinstance(a[key], DaskMethodsMixin) - else a[key] + else copy.copy(a[key]) ) for key in b: if key not in lhs: out[key] = ( copy.deepcopy(b[key]) if not isinstance(b[key], DaskMethodsMixin) - else b[key] + else copy.copy(b[key]) ) return out raise ValueError( @@ -93,7 +93,7 @@ def iadd(a: Accumulatable, b: Accumulatable) -> Accumulatable: a[key] = ( copy.deepcopy(b[key]) if not isinstance(b[key], DaskMethodsMixin) - else b[key] + else copy.copy(b[key]) ) return a raise ValueError( diff --git a/src/coffea/processor/dask/__init__.py b/src/coffea/processor/dask/__init__.py deleted file mode 100644 index fc73f0d8e..000000000 --- a/src/coffea/processor/dask/__init__.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -from collections.abc import MutableMapping -from threading import Lock - -import blosc -from distributed import WorkerPlugin, get_worker -from zict import LRU, Buffer, File, Func - - -class ColumnCache(WorkerPlugin, MutableMapping): - name = "columncache" - - def __init__(self, maxmem=5e8, maxcompressed=2e9, maxdisk=1e10): - self._maxmem = maxmem - self._maxcompressed = maxcompressed - self._maxdisk = maxdisk - - def setup(self, worker): - self.cache = Buffer( - fast={}, - slow=Func( - dump=blosc.pack_array, - load=blosc.unpack_array, - d=Buffer( - fast={}, - slow=LRU( - n=self._maxdisk, - d=File(os.path.join(worker.local_directory, "cache")), - weight=lambda k, v: len(v), - ), - n=self._maxcompressed, - weight=lambda k, v: len(v), - ), - ), - n=self._maxmem, - weight=lambda k, v: v.nbytes, - ) - self.lock = Lock() - self.hits = 0 - self.misses = 0 - - def teardown(self, worker): - pass - - def __getitem__(self, key): - with self.lock: - try: - out = self.cache[key] - self.hits += 1 - return out - except KeyError: - self.misses += 1 - raise - - def __setitem__(self, key, value): - with self.lock: - self.cache[key] = value - - def __delitem__(self, key): - with self.lock: - del self.cache[key] - - def __iter__(self): - with 
self.lock: - return iter(self.cache) - - def __len__(self): - with self.lock: - return len(self.cache) - - -def register_columncache(client): - plugins = set() - for p in client.run(lambda: set(get_worker().plugins)).values(): - plugins |= p - if ColumnCache.name not in plugins: - client.register_worker_plugin(ColumnCache()) diff --git a/src/coffea/processor/dataframe.py b/src/coffea/processor/dataframe.py deleted file mode 100644 index 20fa48445..000000000 --- a/src/coffea/processor/dataframe.py +++ /dev/null @@ -1,117 +0,0 @@ -from collections.abc import MutableMapping - -import uproot - - -class LazyDataFrame(MutableMapping): - """Simple delayed uproot reader (a la lazyarrays) - - One can access branches either through ``df["bname"]`` or ``df.bname``, although - the latter is restricted to branches that do not start with a leading underscore. - Keeps track of values accessed, in the `materialized` attribute. - - Parameters - ---------- - tree : uproot.TTree - Tree to read - entrystart : int, optional - First entry to read, default: 0 - entrystop : int, optional - Last entry to read, default None (read to end) - preload_items : iterable - Force preloading of a set of columns from the tree - metadata : Mapping - Additional metadata for the dataframe - """ - - def __init__( - self, tree, entrystart=None, entrystop=None, preload_items=None, metadata=None - ): - self._tree = tree - self._branchargs = { - "decompression_executor": uproot.source.futures.TrivialExecutor(), - "interpretation_executor": uproot.source.futures.TrivialExecutor(), - } - if entrystart is None or entrystart < 0: - entrystart = 0 - if entrystop is None or entrystop > tree.num_entries: - entrystop = tree.num_entries - self._branchargs["entry_start"] = entrystart - self._branchargs["entry_stop"] = entrystop - self._available = {k for k in self._tree.keys()} - self._dict = {} - self._materialized = set() - if preload_items: - self.preload(preload_items) - self._metadata = metadata - - def __delitem__(self, key): - del self._dict[key] - - def __getitem__(self, key): - if key in self._dict: - return self._dict[key] - elif key in self._tree: - self._materialized.add(key) - array = self._tree[key].array(**self._branchargs) - self._dict[key] = array - return self._dict[key] - else: - raise KeyError(key) - - def __getattr__(self, key): - if key.startswith("_"): - raise AttributeError(key) - try: - return self.__getitem__(key) - except KeyError: - raise AttributeError(key) - - def __iter__(self): - yield from self._available - - def __len__(self): - return len(self._dict) - - def __setitem__(self, key, value): - self._dict[key] = value - - def __contains__(self, key): - # by default, MutableMapping uses __getitem__ to test, but we want to avoid materialization - return key in self._dict or key in self._tree - - @property - def available(self): - """Set of available columns""" - return self._available - - @property - def columns(self): - """Set of available columns""" - return self._available - - @property - def materialized(self): - """Set of columns read from tree""" - return self._materialized - - @property - def size(self): - """Length of column vector""" - return self._branchargs["entry_stop"] - self._branchargs["entry_start"] - - @property - def metadata(self): - return self._metadata - - def preload(self, columns): - """Force loading of several columns - - Parameters - ---------- - columns : iterable - A list of columns to load - """ - for name in columns: - if name in self._tree: - _ = self[name] diff --git 
a/src/coffea/processor/executor.py b/src/coffea/processor/executor.py deleted file mode 100644 index d61735078..000000000 --- a/src/coffea/processor/executor.py +++ /dev/null @@ -1,2220 +0,0 @@ -import concurrent.futures -import json -import math -import os -import pickle -import shutil -import sys -import time -import traceback -import uuid -import warnings -from collections import defaultdict -from collections.abc import Mapping, MutableMapping -from contextlib import ExitStack -from dataclasses import asdict, dataclass, field -from functools import partial -from io import BytesIO -from itertools import repeat -from typing import ( - Awaitable, - Callable, - Dict, - Generator, - Iterable, - List, - Optional, - Set, - Tuple, - Union, -) - -import cloudpickle -import lz4.frame as lz4f -import toml -import uproot -from cachetools import LRUCache - -from ..nanoevents import NanoEventsFactory, schemas -from ..util import _exception_chain, _hash, rich_bar -from .accumulator import Accumulatable, accumulate, set_accumulator -from .dataframe import LazyDataFrame -from .processor import ProcessorABC - -try: - from typing import Literal -except ImportError: - from typing_extensions import Literal - - -try: - from functools import cached_property -except ImportError: - cached_property = property - - -_PICKLE_PROTOCOL = pickle.HIGHEST_PROTOCOL -DEFAULT_METADATA_CACHE: MutableMapping = LRUCache(100000) - -_PROTECTED_NAMES = { - "dataset", - "filename", - "treename", - "metadata", - "entrystart", - "entrystop", - "fileuuid", - "numentries", - "uuid", - "clusters", -} - - -class UprootMissTreeError(uproot.exceptions.KeyInFileError): - pass - - -class FileMeta: - __slots__ = ["dataset", "filename", "treename", "metadata"] - - def __init__(self, dataset, filename, treename, metadata=None): - self.dataset = dataset - self.filename = filename - self.treename = treename - self.metadata = metadata - - def __str__(self): - return f"FileMeta({self.filename}:{self.treename})" - - def __hash__(self): - # As used to lookup metadata, no need for dataset - return _hash((self.filename, self.treename)) - - def __eq__(self, other): - # In case of hash collisions - return self.filename == other.filename and self.treename == other.treename - - def maybe_populate(self, cache): - if cache and self in cache: - self.metadata = cache[self] - - def populated(self, clusters=False): - """Return true if metadata is populated - - By default, only require bare minimum metadata (numentries, uuid) - If clusters is True, then require cluster metadata to be populated - """ - if self.metadata is None: - return False - elif "numentries" not in self.metadata or "uuid" not in self.metadata: - return False - elif clusters and "clusters" not in self.metadata: - return False - return True - - def chunks(self, target_chunksize, align_clusters): - if not self.populated(clusters=align_clusters): - raise RuntimeError - user_keys = set(self.metadata.keys()) - _PROTECTED_NAMES - user_meta = {k: self.metadata[k] for k in user_keys} - if align_clusters: - chunks = [0] - for c in self.metadata["clusters"]: - if c >= chunks[-1] + target_chunksize: - chunks.append(c) - if self.metadata["clusters"][-1] != chunks[-1]: - chunks.append(self.metadata["clusters"][-1]) - for start, stop in zip(chunks[:-1], chunks[1:]): - yield WorkItem( - self.dataset, - self.filename, - self.treename, - start, - stop, - self.metadata["uuid"], - user_meta, - ) - return target_chunksize - else: - numentries = self.metadata["numentries"] - update = True - start = 0 - while 
start < numentries: - if update: - n = max(round((numentries - start) / target_chunksize), 1) - actual_chunksize = math.ceil((numentries - start) / n) - stop = min(numentries, start + actual_chunksize) - next_chunksize = yield WorkItem( - self.dataset, - self.filename, - self.treename, - start, - stop, - self.metadata["uuid"], - user_meta, - ) - start = stop - if next_chunksize and next_chunksize != target_chunksize: - target_chunksize = next_chunksize - update = True - else: - update = False - return target_chunksize - - -@dataclass(unsafe_hash=True, frozen=True) -class WorkItem: - dataset: str - filename: str - treename: str - entrystart: int - entrystop: int - fileuuid: str - usermeta: Optional[Dict] = field(default=None, compare=False) - - def __len__(self) -> int: - return self.entrystop - self.entrystart - - -def _compress(item, compression): - if item is None or compression is None: - return item - else: - with BytesIO() as bf: - with lz4f.open(bf, mode="wb", compression_level=compression) as f: - pickle.dump(item, f, protocol=_PICKLE_PROTOCOL) - result = bf.getvalue() - return result - - -def _decompress(item): - if isinstance(item, bytes): - # warning: if item is not exactly of type bytes, BytesIO(item) will - # make a copy of it, increasing the memory usage. - with BytesIO(item) as bf: - with lz4f.open(bf, mode="rb") as f: - return pickle.load(f) - else: - return item - - -class _compression_wrapper: - def __init__(self, level, function, name=None): - self.level = level - self.function = function - self.name = name - - def __str__(self): - if self.name is not None: - return self.name - try: - name = self.function.__name__ - if name == "": - return "lambda" - return name - except AttributeError: - return str(self.function) - - # no @wraps due to pickle - def __call__(self, *args, **kwargs): - out = self.function(*args, **kwargs) - return _compress(out, self.level) - - -class _reduce: - def __init__(self, compression): - self.compression = compression - - def __str__(self): - return "reduce" - - def __call__(self, items): - items = list(it for it in items if it is not None) - if len(items) == 0: - raise ValueError("Empty list provided to reduction") - if self.compression is not None: - out = _decompress(items.pop()) - out = accumulate(map(_decompress, items), out) - return _compress(out, self.compression) - return accumulate(items) - - -class _FuturesHolder: - def __init__(self, futures: Set[Awaitable], refresh=2): - self.futures = set(futures) - self.merges = set() - self.completed = set() - self.done = {"futures": 0, "merges": 0} - self.running = len(self.futures) - self.refresh = refresh - - def update(self, refresh: int = None): - if refresh is None: - refresh = self.refresh - if self.futures: - completed, self.futures = concurrent.futures.wait( - self.futures, - timeout=refresh, - return_when=concurrent.futures.FIRST_COMPLETED, - ) - self.completed.update(completed) - self.done["futures"] += len(completed) - - if self.merges: - completed, self.merges = concurrent.futures.wait( - self.merges, - timeout=refresh, - return_when=concurrent.futures.FIRST_COMPLETED, - ) - self.completed.update(completed) - self.done["merges"] += len(completed) - self.running = len(self.futures) + len(self.merges) - - def add_merge(self, merges: Awaitable[Accumulatable]): - self.merges.add(merges) - self.running = len(self.futures) + len(self.merges) - - def fetch(self, N: int) -> List[Accumulatable]: - _completed = [self.completed.pop() for _ in range(min(N, len(self.completed)))] - if 
all(_good_future(future) for future in _completed): - return [future.result() for future in _completed if _good_future(future)] - else: # Make recoverable - good_futures = [future for future in _completed if _good_future(future)] - bad_futures = [future for future in _completed if not _good_future(future)] - self.completed.update(good_futures) - raise bad_futures[0].exception() - - -def _good_future(future: Awaitable) -> bool: - return future.done() and not future.cancelled() and future.exception() is None - - -def _futures_handler(futures, timeout): - """Essentially the same as concurrent.futures.as_completed - but makes sure not to hold references to futures any longer than strictly necessary, - which is important if the future holds a large result. - """ - futures = set(futures) - try: - while futures: - try: - done, futures = concurrent.futures.wait( - futures, - timeout=timeout, - return_when=concurrent.futures.FIRST_COMPLETED, - ) - if len(done) == 0: - warnings.warn( - f"No finished jobs after {timeout}s, stopping remaining {len(futures)} jobs early" - ) - break - while done: - try: - yield done.pop().result() - except concurrent.futures.CancelledError: - pass - except KeyboardInterrupt as e: - for job in futures: - try: - job.cancel() - # this is not implemented with parsl AppFutures - except NotImplementedError: - raise e from None - running = sum(job.running() for job in futures) - warnings.warn( - f"Early stop: cancelled {len(futures) - running} jobs, will wait for {running} running jobs to complete" - ) - finally: - running = sum(job.running() for job in futures) - if running: - warnings.warn( - f"Cancelling {running} running jobs (likely due to an exception)" - ) - try: - while futures: - futures.pop().cancel() - except NotImplementedError: - pass - - -@dataclass -class ExecutorBase: - # shared by all executors - status: bool = True - unit: str = "items" - desc: str = "Processing" - compression: Optional[int] = 1 - function_name: Optional[str] = None - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - raise NotImplementedError( - "This class serves as a base class for executors, do not instantiate it!" - ) - - def copy(self, **kwargs): - tmp = self.__dict__.copy() - tmp.update(kwargs) - return type(self)(**tmp) - - -@dataclass -class DaskExecutorBase(ExecutorBase): - """This base class for dak-based processors - synthesizes all analysis inputs into one - task graph that's then executed by derived - classes. 
- """ - - def prepare_dataset_graph(self, items, function, accumulator): - accumulator = None - for dset, info in items.items(): - if isinstance(items, dict) and "object_path" not in list(items.values()): - raise ValueError( - "items should be normalized to uproot spec in prepare_dataset_graph" - ) - - metadata = info["metadata"].copy() - metadata["dataset"] = dset - - temp = function(info["files"], metadata=metadata) - if accumulator is None: - accumulator = temp - else: - accumulator = accumulate((accumulator, temp)) - - return accumulator - - -@dataclass -class DaskSyncExecutor(DaskExecutorBase): - """Execute dask task graph in one thread - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Ignored for iterative executor - """ - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - import dask - - to_compute = self.prepare_dataset_graph(items, function, None) - computed = dask.compute(to_compute, scheduler="sync") - return computed[0] if len(computed) == 1 else computed - - -@dataclass -class DaskProcessesExecutor(DaskExecutorBase): - """Execute dask task graph in a multiprocessing pool - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Ignored for iterative executor - """ - - workers = 1 - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - import dask - - to_compute = self.prepare_dataset_graph(items, function, None) - with dask.config.set(num_workers=self.workers): - computed = dask.compute(to_compute, scheduler="processes") - return computed[0] if len(computed) == 1 else computed - - -def _watcher( - FH: _FuturesHolder, - executor: ExecutorBase, - merge_fcn: Callable, - pool: Optional[Callable] = None, -) -> Accumulatable: - with rich_bar() as progress: - p_id = progress.add_task(executor.desc, total=FH.running, unit=executor.unit) - desc_m = "Merging" if executor.merging else "Merging (local)" - p_idm = progress.add_task(desc_m, total=0, unit="merges") - - merged = None - while FH.running > 0: - FH.update() - progress.update(p_id, completed=FH.done["futures"], refresh=True) - - if executor.merging: # Merge jobs - merge_size = executor._merge_size(len(FH.completed)) - progress.update(p_idm, completed=FH.done["merges"]) - while len(FH.completed) > 1: - if FH.running > 0 and len(FH.completed) < executor.merging[1]: - break - batch = FH.fetch(merge_size) - # Add debug for batch mem size? TODO with logging? 
- if isinstance(executor, FuturesExecutor) and pool is not None: - FH.add_merge(pool.submit(merge_fcn, batch)) - elif isinstance(executor, ParslExecutor): - FH.add_merge(merge_fcn(batch)) - else: - raise RuntimeError("Invalid executor") - progress.update( - p_idm, - total=progress._tasks[p_idm].total + 1, - refresh=True, - ) - else: # Merge within process - batch = FH.fetch(len(FH.completed)) - merged = _compress( - accumulate( - progress.track( - map(_decompress, (c for c in batch)), - task_id=p_idm, - total=progress._tasks[p_idm].total + len(batch), - ), - _decompress(merged), - ), - executor.compression, - ) - # Add checkpointing - - if executor.merging: - progress.refresh() - merged = FH.completed.pop().result() - if len(FH.completed) > 0 or len(FH.futures) > 0 or len(FH.merges) > 0: - raise RuntimeError("Not all futures are added.") - return merged - - -def _wait_for_merges(FH: _FuturesHolder, executor: ExecutorBase) -> Accumulatable: - with rich_bar() as progress: - if executor.merging: - to_finish = len(FH.merges) - p_id_w = progress.add_task( - "Waiting for merge jobs", - total=to_finish, - unit=executor.unit, - ) - while len(FH.merges) > 0: - FH.update() - progress.update( - p_id_w, - completed=(to_finish - len(FH.merges)), - refresh=True, - ) - - FH.update() - recovered = [future.result() for future in FH.completed if _good_future(future)] - p_id_m = progress.add_task("Merging finished jobs", unit="merges") - return _compress( - accumulate( - progress.track( - map(_decompress, (c for c in recovered)), - task_id=p_id_m, - total=len(recovered), - ) - ), - executor.compression, - ) - - -@dataclass -class WorkQueueExecutor(ExecutorBase): - """Execute using Work Queue - - For more information, see :ref:`intro-coffea-wq` - - Parameters - ---------- - items : sequence or generator - Sequence of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 9) - `None`` sets level to 1 (minimal compression) - # work queue specific options: - cores : int - Maximum number of cores for work queue task. If unset, use a whole worker. - memory : int - Maximum amount of memory (in MB) for work queue task. If unset, use a whole worker. - disk : int - Maximum amount of disk space (in MB) for work queue task. If unset, use a whole worker. - gpus : int - Number of GPUs to allocate to each task. If unset, use zero. - resource_monitor : str - If given, one of 'off', 'measure', or 'watchdog'. Default is 'off'. - - 'off': turns off resource monitoring. Overridden to 'watchdog' if resources_mode - is not set to 'fixed'. - - 'measure': turns on resource monitoring for Work Queue. The - resources used per task are measured. - - 'watchdog': in addition to measuring resources, tasks are terminated if they - go above the cores, memory, or disk specified. - resources_mode : str - one of 'fixed', 'max-seen', or 'max-throughput'. Default is 'max-seen'. - Sets the strategy to automatically allocate resources to tasks. - - 'fixed': allocate cores, memory, and disk specified for each task. 
- - 'max-seen' or 'auto': use the cores, memory, and disk given as maximum values to allocate, - but first try each task by allocating the maximum values seen. Leads - to a good compromise between parallelism and number of retries. - - 'max-throughput': Like max-seen, but first tries the task with an - allocation that maximizes overall throughput. - If resources_mode is other than 'fixed', preprocessing and - accumulation tasks always use the 'max-seen' strategy, as the - former tasks always use the same resources, the latter has a - distribution of resources that increases over time. - split_on_exhaustion: bool - Whether to split a processing task in half according to its chunksize when it exhausts its - the cores, memory, or disk allocated to it. If False, a task that exhausts resources - permanently fails. Default is True. - fast_terminate_workers: int - Terminate workers on which tasks have been running longer than average. - The time limit is computed by multiplying the average runtime of tasks - by the value of 'fast_terminate_workers'. Since there are - legitimately slow tasks, no task may trigger fast termination in - two distinct workers. Less than 1 disables it. - - manager_name : str - Name to refer to this work queue manager. - Sets port to 0 (any available port) if port not given. - port : int or tuple(int, int) - Port number or range (inclusive of ports )for work queue manager program. - Defaults to 9123 if manager_name not given. - password_file: str - Location of a file containing a password used to authenticate workers. - ssl: bool or tuple(str, str) - Enable ssl encryption between manager and workers. If a tuple, then it - should be of the form (key, cert), where key and cert are paths to the files - containing the key and certificate in pem format. If True, auto-signed temporary - key and cert are generated for the session. - - extra_input_files: list - A list of files in the current working directory to send along with each task. - Useful for small custom libraries and configuration files needed by the processor. - x509_proxy : str - Path to the X509 user proxy. If None (the default), use the value of the - environment variable X509_USER_PROXY, or fallback to the file /tmp/x509up_u${UID} if - exists. If False, disables the default behavior and no proxy is sent. - - environment_file : optional, str - Conda python environment tarball to use. If not given, assume that - the python environment is already setup at the execution site. - wrapper : str - Wrapper script to run/open python environment tarball. Defaults to python_package_run found in PATH. - - treereduction : int - Number of processed chunks per accumulation task. Defaults is 20. - - verbose : bool - If true, emit a message on each task submission and completion. - Default is false. - print_stdout : bool - If true (default), print the standard output of work queue task on completion. - - debug_log : str - Filename for debug output - stats_log : str - Filename for tasks statistics output - transactions_log : str - Filename for tasks lifetime reports output - tasks_accum_log : str - Filename for the log of tasks that have been processed and accumulated. - - filepath: str - Path to the parent directory where to create the staging directory. - Default is "." (current working directory). - - custom_init : function, optional - A function that takes as an argument the queue's WorkQueue object. - The function is called just before the first work unit is submitted - to the queue. 
- """ - - # Standard executor options: - compression: Optional[int] = 9 # as recommended by lz4 - retries: int = 2 # task executes at most 3 times - # wq executor options: - manager_name: Optional[str] = None - port: Optional[Union[int, Tuple[int, int]]] = None - filepath: str = "." - events_total: Optional[int] = None - x509_proxy: Optional[str] = None - verbose: bool = False - print_stdout: bool = False - status_display_interval: Optional[int] = 10 - debug_log: Optional[str] = None - stats_log: Optional[str] = None - transactions_log: Optional[str] = None - tasks_accum_log: Optional[str] = None - password_file: Optional[str] = None - ssl: Union[bool, Tuple[str, str]] = False - environment_file: Optional[str] = None - extra_input_files: List = field(default_factory=list) - wrapper: Optional[str] = shutil.which("poncho_package_run") - resource_monitor: Optional[str] = "off" - resources_mode: Optional[str] = "max-seen" - split_on_exhaustion: Optional[bool] = True - fast_terminate_workers: Optional[int] = None - cores: Optional[int] = None - memory: Optional[int] = None - disk: Optional[int] = None - gpus: Optional[int] = None - treereduction: int = 20 - chunksize: int = 100000 - dynamic_chunksize: Optional[Dict] = None - custom_init: Optional[Callable] = None - - # deprecated - bar_format: Optional[str] = None - chunks_accum_in_mem: Optional[int] = None - master_name: Optional[str] = None - chunks_per_accum: Optional[int] = None - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - from .work_queue_tools import run - - return ( - run( - self, - items, - function, - accumulator, - ), - 0, - ) - - -@dataclass -class IterativeExecutor(ExecutorBase): - """Execute in one thread iteratively - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Ignored for iterative executor - """ - - workers: int = 1 - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - with rich_bar() as progress: - p_id = progress.add_task( - self.desc, total=len(items), unit=self.unit, disable=not self.status - ) - return ( - accumulate( - progress.track( - map(function, (c for c in items)), - total=len(items), - task_id=p_id, - ), - accumulator, - ), - 0, - ) - - -@dataclass -class FuturesExecutor(ExecutorBase): - """Execute using multiple local cores using python futures - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - pool : concurrent.futures.Executor class or instance, optional - The type of futures executor to use, defaults to ProcessPoolExecutor. 
- You can pass an instance instead of a class to reuse an executor - workers : int, optional - Number of parallel processes for futures (default 1) - status : bool, optional - If true (default), enable progress bar - desc : str, optional - Label of progress description (default: 'Processing') - unit : str, optional - Label of progress bar bar unit (default: 'items') - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 1) - Set to ``None`` for no compression. - recoverable : bool, optional - Instead of raising Exception right away, the exception is captured and returned - up for custom parsing. Already completed items will be returned as well. - checkpoints : bool - To do - merging : bool | tuple(int, int, int), optional - Enables submitting intermediate merge jobs to the executor. Format is - (n_batches, min_batch_size, max_batch_size). Passing ``True`` will use default: (5, 4, 100), - aka as they are returned try to split completed jobs into 5 batches, but of at least 4 and at most 100 items. - Default is ``False`` - results get merged as they finish in the main process. - nparts : int, optional - Number of merge jobs to create at a time. Also pass via ``merging(X, ..., ...)'' - minred : int, optional - Minimum number of items to merge in one job. Also pass via ``merging(..., X, ...)'' - maxred : int, optional - maximum number of items to merge in one job. Also pass via ``merging(..., ..., X)'' - mergepool : concurrent.futures.Executor class or instance | int, optional - Supply an additional executor to process merge jobs independently. - An ``int`` will be interpreted as ``ProcessPoolExecutor(max_workers=int)``. - tailtimeout : int, optional - Timeout requirement on job tails. Cancel all remaining jobs if none have finished - in the timeout window. - """ - - pool: Union[ - Callable[..., concurrent.futures.Executor], concurrent.futures.Executor - ] = concurrent.futures.ProcessPoolExecutor # fmt: skip - mergepool: Optional[ - Union[ - Callable[..., concurrent.futures.Executor], - concurrent.futures.Executor, - bool, - ] - ] = None - recoverable: bool = False - merging: Union[bool, Tuple[int, int, int]] = False - workers: int = 1 - tailtimeout: Optional[int] = None - - def __post_init__(self): - if not ( - isinstance(self.merging, bool) - or (isinstance(self.merging, tuple) and len(self.merging) == 3) - ): - raise ValueError( - f"merging={self.merging} not understood. 
Required format is " - "(n_batches, min_batch_size, max_batch_size)" - ) - elif self.merging is True: - self.merging = (5, 4, 100) - - def _merge_size(self, size: int): - return min(self.merging[2], max(size // self.merging[0] + 1, self.merging[1])) - - def __getstate__(self): - return dict(self.__dict__, pool=None) - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - if self.compression is not None: - function = _compression_wrapper(self.compression, function) - reducer = _reduce(self.compression) - - def _processwith(pool, mergepool): - FH = _FuturesHolder( - {pool.submit(function, item) for item in items}, refresh=2 - ) - - try: - if mergepool is None: - merged = _watcher(FH, self, reducer, pool) - else: - merged = _watcher(FH, self, reducer, mergepool) - return accumulate([_decompress(merged), accumulator]), 0 - - except Exception as e: - traceback.print_exc() - if self.recoverable: - print("Exception occurred, recovering progress...") - for job in FH.futures: - job.cancel() - - merged = _wait_for_merges(FH, self) - return accumulate([_decompress(merged), accumulator]), e - else: - raise e from None - - if isinstance(self.pool, concurrent.futures.Executor): - return _processwith(pool=self.pool, mergepool=self.mergepool) - else: - # assume its a class then - with ExitStack() as stack: - poolinstance = stack.enter_context(self.pool(max_workers=self.workers)) - if self.mergepool is not None: - if isinstance(self.mergepool, int): - self.mergepool = concurrent.futures.ProcessPoolExecutor( - max_workers=self.mergepool - ) - mergepoolinstance = stack.enter_context(self.mergepool) - else: - mergepoolinstance = None - return _processwith(pool=poolinstance, mergepool=mergepoolinstance) - - -@dataclass -class DaskExecutor(ExecutorBase): - """Execute using dask futures - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - client : distributed.client.Client - A dask distributed client instance - treereduction : int, optional - Tree reduction factor for output accumulators (default: 20) - status : bool, optional - If true (default), enable progress bar - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 1) - Set to ``None`` for no compression. - priority : int, optional - Task priority, default 0 - retries : int, optional - Number of retries for failed tasks (default: 3) - heavy_input : serializable, optional - Any value placed here will be broadcast to workers and joined to input - items in a tuple (item, heavy_input) that is passed to function. - function_name : str, optional - Name of the function being passed - use_dataframes: bool, optional - Retrieve output as a distributed Dask DataFrame (default: False). - The outputs of individual tasks must be Pandas DataFrames. - - .. note:: If ``heavy_input`` is set, ``function`` is assumed to be pure. 
- """ - - client: Optional["dask.distributed.Client"] = None # noqa - treereduction: int = 20 - priority: int = 0 - retries: int = 3 - heavy_input: Optional[bytes] = None - use_dataframes: bool = False - # secret options - worker_affinity: bool = False - - def __getstate__(self): - return dict(self.__dict__, client=None) - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - - import dask.dataframe as dd - from dask.distributed import Client - from distributed.scheduler import KilledWorker - - if self.client is None: - self.client = Client(threads_per_worker=1) - - if self.use_dataframes: - self.compression = None - - reducer = _reduce(self.compression) - if self.compression is not None: - function = _compression_wrapper( - self.compression, function, name=self.function_name - ) - - if self.heavy_input is not None: - # client.scatter is not robust against adaptive clusters - # https://github.com/CoffeaTeam/coffea/issues/465 - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Large object of size") - items = list( - zip( - items, repeat(self.client.submit(lambda x: x, self.heavy_input)) - ) - ) - - work = [] - key_to_item = {} - if self.worker_affinity: - workers = list(self.client.run(lambda: 0)) - - def belongsto(heavy_input, workerindex, item): - if heavy_input is not None: - item = item[0] - hashed = _hash( - (item.fileuuid, item.treename, item.entrystart, item.entrystop) - ) - return hashed % len(workers) == workerindex - - for workerindex, worker in enumerate(workers): - items_worker = [ - item - for item in items - if belongsto(self.heavy_input, workerindex, item) - ] - work_worker = self.client.map( - function, - items_worker, - pure=(self.heavy_input is not None), - priority=self.priority, - retries=self.retries, - workers={worker}, - allow_other_workers=False, - ) - work.extend(work_worker) - key_to_item.update( - { - future.key: item - for future, item in zip(work_worker, items_worker) - } - ) - else: - work = self.client.map( - function, - items, - pure=(self.heavy_input is not None), - priority=self.priority, - retries=self.retries, - ) - key_to_item.update({future.key: item for future, item in zip(work, items)}) - if (self.function_name == "get_metadata") or not self.use_dataframes: - while len(work) > 1: - work = self.client.map( - reducer, - [ - work[i : i + self.treereduction] - for i in range(0, len(work), self.treereduction) - ], - pure=True, - priority=self.priority, - retries=self.retries, - ) - key_to_item.update({future.key: "(output reducer)" for future in work}) - work = work[0] - try: - if self.status: - from distributed import progress - - # FIXME: fancy widget doesn't appear, have to live with boring pbar - progress(work, multi=True, notebook=False) - return ( - accumulate( - [ - work.result() - if self.compression is None - else _decompress(work.result()) - ], - accumulator, - ), - 0, - ) - except KilledWorker as ex: - baditem = key_to_item[ex.task] - if self.heavy_input is not None and isinstance(baditem, tuple): - baditem = baditem[0] - raise RuntimeError( - f"Work item {baditem} caused a KilledWorker exception (likely a segfault or out-of-memory issue)" - ) - else: - if self.status: - from distributed import progress - - progress(work, multi=True, notebook=False) - return {"out": dd.from_delayed(work)}, 0 - - -@dataclass -class ParslExecutor(ExecutorBase): - """Execute using parsl pyapp wrapper - - Parameters - ---------- - items : list - List of 
input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - config : parsl.config.Config, optional - A parsl DataFlow configuration object. Necessary if there is no active kernel - - .. note:: In general, it is safer to construct the DFK with ``parsl.load(config)`` prior to calling this function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 1) - Set to ``None`` for no compression. - recoverable : bool, optional - Instead of raising Exception right away, the exception is captured and returned - up for custom parsing. Already completed items will be returned as well. - merging : bool | tuple(int, int, int), optional - Enables submitting intermediate merge jobs to the executor. Format is - (n_batches, min_batch_size, max_batch_size). Passing ``True`` will use default: (5, 4, 100), - aka as they are returned try to split completed jobs into 5 batches, but of at least 4 and at most 100 items. - Default is ``False`` - results get merged as they finish in the main process. - jobs_executors : list | "all" optional - Labels of the executors (from dfk.config.executors) that will process main jobs. - Default is 'all'. Recommended is ``['jobs']``, while passing ``label='jobs'`` to the primary executor. - merges_executors : list | "all" optional - Labels of the executors (from dfk.config.executors) that will process main jobs. - Default is 'all'. Recommended is ``['merges']``, while passing ``label='merges'`` to the executor dedicated towards merge jobs. - tailtimeout : int, optional - Timeout requirement on job tails. Cancel all remaining jobs if none have finished - in the timeout window. - """ - - tailtimeout: Optional[int] = None - config: Optional["parsl.config.Config"] = None # noqa - recoverable: bool = False - merging: Optional[Union[bool, Tuple[int, int, int]]] = False - jobs_executors: Union[str, List] = "all" - merges_executors: Union[str, List] = "all" - - def __post_init__(self): - if not ( - isinstance(self.merging, bool) - or (isinstance(self.merging, tuple) and len(self.merging) == 3) - ): - raise ValueError( - f"merging={self.merging} not understood. 
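[editor's note] The ``jobs_executors``/``merges_executors`` options above assume the parsl config defines executors with matching labels. A hedged sketch of that setup, following the docstring's recommendation to load the DataFlowKernel first (the HighThroughputExecutor settings are placeholders):

import parsl
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from coffea.processor import ParslExecutor

config = Config(
    executors=[
        HighThroughputExecutor(label="jobs", max_workers=8),
        HighThroughputExecutor(label="merges", max_workers=2),
    ]
)
parsl.load(config)  # construct the DFK before invoking the executor

executor = ParslExecutor(jobs_executors=["jobs"], merges_executors=["merges"])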
Required format is " - "(n_batches, min_batch_size, max_batch_size)" - ) - elif self.merging is True: - self.merging = (5, 4, 100) - - def _merge_size(self, size: int): - return min(self.merging[2], max(size // self.merging[0] + 1, self.merging[1])) - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - import parsl - from parsl.app.app import python_app - - from .parsl.timeout import timeout - - if self.compression is not None: - function = _compression_wrapper(self.compression, function) - - # Parse config if passed - cleanup = False - try: - parsl.dfk() - except RuntimeError: - cleanup = True - pass - if cleanup and self.config is None: - raise RuntimeError( - "No active parsl DataFlowKernel, must specify a config to construct one" - ) - elif not cleanup and self.config is not None: - raise RuntimeError("An active parsl DataFlowKernel already exists") - elif self.config is not None: - parsl.clear() - parsl.load(self.config) - - # Check config/executors - _exec_avail = [exe.label for exe in parsl.dfk().config.executors] - _execs_tried = ( - [] if self.jobs_executors == "all" else [e for e in self.jobs_executors] - ) - _execs_tried += ( - [] if self.merges_executors == "all" else [e for e in self.merges_executors] - ) - if not all([_e in _exec_avail for _e in _execs_tried]): - raise RuntimeError( - f"Executors: [{','.join(_e for _e in _execs_tried if _e not in _exec_avail)}] not available in the config." - ) - - # Apps - app = timeout(python_app(function, executors=self.jobs_executors)) - reducer = timeout( - python_app(_reduce(self.compression), executors=self.merges_executors) - ) - - FH = _FuturesHolder(set(map(app, items)), refresh=2) - try: - merged = _watcher(FH, self, reducer) - return accumulate([_decompress(merged), accumulator]), 0 - - except Exception as e: - traceback.print_exc() - if self.recoverable: - print("Exception occurred, recovering progress...") - # for job in FH.futures: # NotImplemented in parsl - # job.cancel() - - merged = _wait_for_merges(FH, self) - return accumulate([_decompress(merged), accumulator]), e - else: - raise e from None - finally: - if cleanup: - parsl.dfk().cleanup() - parsl.clear() - - -class ParquetFileUprootShim: - def __init__(self, table, name): - self.table = table - self.name = name - - def array(self, **kwargs): - import awkward - - return awkward.Array(self.table[self.name]) - - -class ParquetFileContext: - def __init__(self, filename): - self.filename = filename - self.filehandle = None - self.branchnames = None - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - pass - - def _get_handle(self): - import pyarrow.parquet as pq - - if self.filehandle is None: - self.filehandle = pq.ParquetFile(self.filename) - self.branchnames = { - item.path.split(".")[0] for item in self.filehandle.schema - } - - @property - def num_entries(self): - self._get_handle() - return self.filehandle.metadata.num_rows - - def keys(self): - self._get_handle() - return self.branchnames - - def __iter__(self): - self._get_handle() - return iter(self.branchnames) - - def __getitem__(self, name): - self._get_handle() - if name in self.branchnames: - return ParquetFileUprootShim( - self.filehandle.read([name], use_threads=False), name - ) - else: - return KeyError(name) - - def __contains__(self, name): - self._get_handle() - return name in self.branchnames - - -@dataclass -class Runner: - """A tool to run a processor using uproot 
for data delivery - - A convenience wrapper to submit jobs for a file set, which is a - dictionary of dataset: [file list] entries. Supports only uproot TTree - reading, via NanoEvents or LazyDataFrame. For more customized processing, - e.g. to read other objects from the files and pass them into data frames, - one can write a similar function in their user code. - - Parameters - ---------- - executor : ExecutorBase instance - Executor, which implements a callable with inputs: items, function, accumulator - and performs some action equivalent to: - ``for item in items: accumulator += function(item)`` - pre_executor : ExecutorBase instance - Executor, used to calculate fileset metadata - Defaults to executor - chunksize : int, optional - Maximum number of entries to process at a time in the data frame, default: 100k - maxchunks : int, optional - Maximum number of chunks to process per dataset - Defaults to processing the whole dataset - metadata_cache : mapping, optional - A dict-like object to use as a cache for (file, tree) metadata that is used to - determine chunking. Defaults to a in-memory LRU cache that holds 100k entries - (about 1MB depending on the length of filenames, etc.) If you edit an input file - (please don't) during a session, the session can be restarted to clear the cache. - dynamic_chunksize : dict, optional - Whether to adapt the chunksize for units of work to run in the targets given. - Currently supported are 'wall_time' (in seconds), and 'memory' (in MB). - E.g., with {"wall_time": 120, "memory": 2048}, the chunksize will - be dynamically adapted so that processing jobs each run in about - two minutes, using two GB of memory. (Currently only for the WorkQueueExecutor.) - """ - - executor: ExecutorBase - pre_executor: Optional[ExecutorBase] = None - chunksize: int = 100000 - maxchunks: Optional[int] = None - metadata_cache: Optional[MutableMapping] = None - dynamic_chunksize: Optional[Dict] = None - skipbadfiles: bool = False - xrootdtimeout: Optional[int] = 60 - align_clusters: bool = False - savemetrics: bool = False - mmap: bool = False - schema: Optional[schemas.BaseSchema] = schemas.BaseSchema - cachestrategy: Optional[ - Union[Literal["dask-worker"], Callable[..., MutableMapping]] - ] = None # fmt: skip - processor_compression: int = 1 - use_skyhook: Optional[bool] = False - skyhook_options: Optional[Dict] = field(default_factory=dict) - format: str = "root" - - @staticmethod - def read_coffea_config(): - config_path = None - if "HOME" in os.environ: - config_path = os.path.join(os.environ["HOME"], ".coffea.toml") - elif "_CONDOR_SCRATCH_DIR" in os.environ: - config_path = os.path.join( - os.environ["_CONDOR_SCRATCH_DIR"], ".coffea.toml" - ) - - if config_path is not None and os.path.exists(config_path): - with open(config_path) as f: - return toml.loads(f.read()) - else: - return dict() - - def __post_init__(self): - if self.pre_executor is None: - self.pre_executor = self.executor - - assert isinstance( - self.executor, ExecutorBase - ), "Expected executor to derive from ExecutorBase" - assert isinstance( - self.pre_executor, ExecutorBase - ), "Expected pre_executor to derive from ExecutorBase" - - if self.metadata_cache is None: - self.metadata_cache = DEFAULT_METADATA_CACHE - - if self.align_clusters and self.dynamic_chunksize: - raise RuntimeError( - "align_clusters and dynamic_chunksize cannot be used simultaneously" - ) - if self.maxchunks and self.dynamic_chunksize: - raise RuntimeError( - "maxchunks and dynamic_chunksize cannot be used simultaneously" 
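[editor's note] Putting the ``Runner`` fields above together, a minimal sketch of a typical configuration; ``MyProcessor`` and the file names are placeholders, not part of this patch:

from coffea import processor
from coffea.nanoevents import NanoAODSchema

run = processor.Runner(
    executor=processor.IterativeExecutor(),
    schema=NanoAODSchema,
    chunksize=100_000,
    skipbadfiles=True,
)

fileset = {"ZJets": ["zjets_nano.root"], "Data": ["data_nano.root"]}
out = run(fileset, treename="Events", processor_instance=MyProcessor())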
- ) - if self.dynamic_chunksize and not isinstance(self.executor, WorkQueueExecutor): - raise RuntimeError( - "dynamic_chunksize currently only supported by the WorkQueueExecutor" - ) - - assert self.format in ("root", "parquet") - - @property - def retries(self): - if isinstance(self.executor, DaskExecutor): - retries = 0 - else: - retries = getattr(self.executor, "retries", 0) - assert retries >= 0 - return retries - - @property - def use_dataframes(self): - if isinstance(self.executor, DaskExecutor): - return self.executor.use_dataframes - else: - return False - - @staticmethod - def get_cache(cachestrategy): - cache = None - if cachestrategy == "dask-worker": - from distributed import get_worker - - from coffea.processor.dask import ColumnCache - - worker = get_worker() - try: - cache = worker.plugins[ColumnCache.name] - except KeyError: - # emit warning if not found? - pass - elif callable(cachestrategy): - cache = cachestrategy() - return cache - - @staticmethod - def automatic_retries(retries: int, skipbadfiles: bool, func, *args, **kwargs): - """This should probably defined on Executor-level.""" - import warnings - - retry_count = 0 - while retry_count <= retries: - try: - return func(*args, **kwargs) - # catch xrootd errors and optionally skip - # or retry to read the file - except Exception as e: - chain = _exception_chain(e) - if skipbadfiles and any( - isinstance(c, (FileNotFoundError, UprootMissTreeError)) - for c in chain - ): - warnings.warn(str(e)) - break - if ( - skipbadfiles - and (retries == retry_count) - and any( - e in str(c) - for c in chain - for e in [ - "Invalid redirect URL", - "Operation expired", - "Socket timeout", - ] - ) - ): - warnings.warn(str(e)) - break - if ( - not skipbadfiles - or any("Auth failed" in str(c) for c in chain) - or retries == retry_count - ): - raise e - warnings.warn("Attempt %d of %d." % (retry_count + 1, retries + 1)) - retry_count += 1 - - @staticmethod - def _normalize_fileset( - fileset: Dict, - treename: str, - ) -> Generator[FileMeta, None, None]: - if isinstance(fileset, str): - with open(fileset) as fin: - fileset = json.load(fin) - elif not isinstance(fileset, Mapping): - raise ValueError("Expected fileset to be a path string or mapping") - reserved_metakeys = _PROTECTED_NAMES - for dataset, filelist in fileset.items(): - user_meta = None - if isinstance(filelist, dict): - user_meta = filelist["metadata"] if "metadata" in filelist else None - if user_meta is not None: - for rkey in reserved_metakeys: - if rkey in user_meta.keys(): - raise ValueError( - f'Reserved word "{rkey}" in metadata section of fileset dictionary, please rename this entry!' 
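[editor's note] ``_normalize_fileset`` accepts either a plain file list per dataset or a dict carrying ``files`` plus optional ``treename`` and ``metadata`` (whose keys must avoid the reserved names checked above). A hedged sketch of both forms with placeholder file names:

fileset = {
    # plain list: the tree name must then be supplied to the Runner call
    "Data": ["data_2018A.root", "data_2018B.root"],
    # dict form: per-dataset tree name and user metadata
    "ZJets": {
        "treename": "Events",
        "files": ["zjets_1.root", "zjets_2.root"],
        "metadata": {"xsec": 6077.22},  # keys must not collide with the protected names
    },
}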
- ) - if "treename" not in filelist and treename is None: - raise ValueError( - "treename must be specified if the fileset does not contain tree names" - ) - local_treename = ( - filelist["treename"] if "treename" in filelist else treename - ) - filelist = filelist["files"] - elif isinstance(filelist, list): - if treename is None: - raise ValueError( - "treename must be specified if the fileset does not contain tree names" - ) - local_treename = treename - else: - raise ValueError( - "list of filenames in fileset must be a list or a dict" - ) - for filename in filelist: - yield FileMeta(dataset, filename, local_treename, user_meta) - - @staticmethod - def metadata_fetcher_root( - xrootdtimeout: int, align_clusters: bool, item: FileMeta - ) -> Accumulatable: - with uproot.open({item.filename: None}, timeout=xrootdtimeout) as file: - try: - tree = file[item.treename] - except uproot.exceptions.KeyInFileError as e: - raise UprootMissTreeError(str(e)) from e - - metadata = {} - if item.metadata: - metadata.update(item.metadata) - metadata.update({"numentries": tree.num_entries, "uuid": file.file.fUUID}) - if align_clusters: - metadata["clusters"] = tree.common_entry_offsets() - out = set_accumulator( - [FileMeta(item.dataset, item.filename, item.treename, metadata)] - ) - return out - - @staticmethod - def metadata_fetcher_parquet(item: FileMeta): - with ParquetFileContext(item.filename) as file: - metadata = {} - if item.metadata: - metadata.update(item.metadata) - metadata.update( - {"numentries": file.num_entries, "uuid": b"NO_UUID_0000_000"} - ) - out = set_accumulator( - [FileMeta(item.dataset, item.filename, item.treename, metadata)] - ) - return out - - def _preprocess_fileset_root(self, fileset: Dict) -> None: - # this is a bit of an abuse of map-reduce but ok - to_get = { - filemeta - for filemeta in fileset - if not filemeta.populated(clusters=self.align_clusters) - } - if len(to_get) > 0: - out = set_accumulator() - pre_arg_override = { - "function_name": "get_metadata", - "desc": "Preprocessing", - "unit": "file", - "compression": None, - } - if isinstance(self.pre_executor, (FuturesExecutor, ParslExecutor)): - pre_arg_override.update({"tailtimeout": None}) - if isinstance(self.pre_executor, (DaskExecutor)): - self.pre_executor.heavy_input = None - pre_arg_override.update({"worker_affinity": False}) - pre_executor = self.pre_executor.copy(**pre_arg_override) - closure = partial( - self.automatic_retries, - self.retries, - self.skipbadfiles, - partial( - self.metadata_fetcher_root, self.xrootdtimeout, self.align_clusters - ), - ) - out, _ = pre_executor(to_get, closure, out) - while out: - item = out.pop() - self.metadata_cache[item] = item.metadata - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - def _preprocess_fileset_parquet(self, fileset: Dict) -> None: - # this is a bit of an abuse of map-reduce but ok - to_get = { - filemeta - for filemeta in fileset - if not filemeta.populated(clusters=self.align_clusters) - } - if len(to_get) > 0: - out = set_accumulator() - pre_arg_override = { - "function_name": "get_metadata", - "desc": "Preprocessing", - "unit": "file", - "compression": None, - } - if isinstance(self.pre_executor, (FuturesExecutor, ParslExecutor)): - pre_arg_override.update({"tailtimeout": None}) - if isinstance(self.pre_executor, (DaskExecutor)): - self.pre_executor.heavy_input = None - pre_arg_override.update({"worker_affinity": False}) - pre_executor = self.pre_executor.copy(**pre_arg_override) - closure = partial( - 
self.automatic_retries, - self.retries, - self.skipbadfiles, - self.metadata_fetcher_parquet, - ) - out, _ = pre_executor(to_get, closure, out) - while out: - item = out.pop() - self.metadata_cache[item] = item.metadata - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - def _filter_badfiles(self, fileset: Dict) -> List: - final_fileset = [] - for filemeta in fileset: - if filemeta.populated(clusters=self.align_clusters): - final_fileset.append(filemeta) - elif not self.skipbadfiles: - raise RuntimeError( - f"Metadata for file {filemeta.filename} could not be accessed." - ) - return final_fileset - - def _chunk_generator(self, fileset: Dict, treename: str) -> Generator: - config = None - if self.use_skyhook: - config = Runner.read_coffea_config() - if not self.use_skyhook and (self.format == "root" or self.format == "parquet"): - if self.maxchunks is None: - last_chunksize = self.chunksize - for filemeta in fileset: - last_chunksize = yield from filemeta.chunks( - last_chunksize, - self.align_clusters, - ) - else: - # get just enough file info to compute chunking - nchunks = defaultdict(int) - chunks = [] - for filemeta in fileset: - if nchunks[filemeta.dataset] >= self.maxchunks: - continue - for chunk in filemeta.chunks(self.chunksize, self.align_clusters): - chunks.append(chunk) - nchunks[filemeta.dataset] += 1 - if nchunks[filemeta.dataset] >= self.maxchunks: - break - yield from (c for c in chunks) - else: - if self.use_skyhook and not config.get("skyhook", None): - print("No skyhook config found, using defaults") - config["skyhook"] = dict() - - dataset_filelist_map = {} - if self.use_skyhook: - import pyarrow.dataset as ds - - for dataset, basedir in fileset.items(): - ds_ = ds.dataset(basedir, format="parquet") - dataset_filelist_map[dataset] = ds_.files - else: - for dataset, maybe_filelist in fileset.items(): - if isinstance(maybe_filelist, list): - dataset_filelist_map[dataset] = maybe_filelist - elif isinstance(maybe_filelist, dict): - if "files" not in maybe_filelist: - raise ValueError( - "Dataset definition must have key 'files' defined!" 
- ) - dataset_filelist_map[dataset] = maybe_filelist["files"] - else: - raise ValueError( - "Dataset definition in fileset must be dict[str: list[str]] or dict[str: dict[str: Any]]" - ) - chunks = [] - for dataset, filelist in dataset_filelist_map.items(): - for filename in filelist: - # If skyhook config is provided and is not empty, - if self.use_skyhook: - ceph_config_path = config["skyhook"].get( - "ceph_config_path", "/etc/ceph/ceph.conf" - ) - ceph_data_pool = config["skyhook"].get( - "ceph_data_pool", "cephfs_data" - ) - filename = f"{ceph_config_path}:{ceph_data_pool}:{filename}" - chunks.append( - WorkItem( - dataset, - filename, - treename, - 0, - 0, - "", - fileset[dataset]["metadata"] - if "metadata" in fileset[dataset] - else None, - ) - ) - yield from iter(chunks) - - @staticmethod - def _work_function( - format: str, - xrootdtimeout: int, - mmap: bool, - schema: schemas.BaseSchema, - cache_function: Callable[[], MutableMapping], - use_dataframes: bool, - savemetrics: bool, - item: WorkItem, - processor_instance: ProcessorABC, - ) -> Dict: - if processor_instance == "heavy": - item, processor_instance = item - if not isinstance(processor_instance, ProcessorABC): - processor_instance = cloudpickle.loads(lz4f.decompress(processor_instance)) - - if format == "root": - filecontext = uproot.open( - {item.filename: None}, - timeout=xrootdtimeout, - file_handler=uproot.MemmapSource - if mmap - else uproot.MultithreadedFileSource, - ) - elif format == "parquet": - filecontext = ParquetFileContext(item.filename) - - metadata = { - "dataset": item.dataset, - "filename": item.filename, - "treename": item.treename, - "entrystart": item.entrystart, - "entrystop": item.entrystop, - "fileuuid": str(uuid.UUID(bytes=item.fileuuid)) - if len(item.fileuuid) > 0 - else "", - } - if item.usermeta is not None: - metadata.update(item.usermeta) - - with filecontext as file: - if schema is None: - # To deprecate - tree = None - events = None - if format == "root": - tree = file[item.treename] - events = uproot.dask(tree, ak_add_doc=True)[ - item.entrystart : item.entrystop - ] - setattr(events, "metadata", metadata) - elif format == "parquet": - import dask_awkward - - tree = file - events = dask_awkward.from_parquet(item.filename)[ - item.entrystart : item.entrystop - ] - setattr(events, "metadata", metadata) - else: - raise ValueError("Format can only be root or parquet!") - elif issubclass(schema, schemas.BaseSchema): - # change here - if format == "root": - materialized = [] - factory = NanoEventsFactory.from_root( - file=file, - treepath=item.treename, - persistent_cache=cache_function(), - schemaclass=schema, - metadata=metadata, - access_log=materialized, - delayed=True, - ) - events = factory.events()[item.entrystart : item.entrystop] - elif format == "parquet": - skyhook_options = {} - if ":" in item.filename: - ( - ceph_config_path, - ceph_data_pool, - filename, - ) = item.filename.split(":") - # patch back filename into item - item = WorkItem(**dict(asdict(item), filename=filename)) - skyhook_options["ceph_config_path"] = ceph_config_path - skyhook_options["ceph_data_pool"] = ceph_data_pool - - factory = NanoEventsFactory.from_parquet( - file=item.filename, - treepath=item.treename, - schemaclass=schema, - metadata=metadata, - skyhook_options=skyhook_options, - permit_dask=True, - ) - events = factory.events()[item.entrystart : item.entrystop] - else: - raise ValueError( - "Expected schema to derive from nanoevents.BaseSchema, instead got %r" - % schema - ) - tic = time.time() - try: - out 
= None - if isinstance(events, LazyDataFrame): - out = processor_instance.process(events) - else: - import dask - import dask_awkward - - to_compute = processor_instance.process(events) - # materialized = dask_awkward.report_necessary_buffers(to_compute) - out = dask.compute(to_compute, scheduler="single-threaded")[0] - except Exception as e: - raise Exception(f"Failed processing file: {item!r}") from e - if out is None: - raise ValueError( - "Output of process() should not be None. Make sure your processor's process() function returns an accumulator." - ) - toc = time.time() - if use_dataframes: - return out - else: - if savemetrics: - metrics = {} - if isinstance(file, uproot.ReadOnlyDirectory): - metrics["bytesread"] = file.file.source.num_requested_bytes - # metrics["data_and_shape_buffers"] = set(materialized) - # metrics["shape_only_buffers"] = set(materialized) - if schema is not None and issubclass(schema, schemas.BaseSchema): - metrics["entries"] = len(events) - else: - metrics["entries"] = events.size - metrics["processtime"] = toc - tic - return {"out": out, "metrics": metrics, "processed": {item}} - return {"out": out, "processed": {item}} - - def __call__( - self, - fileset: Dict, - treename: str, - processor_instance: ProcessorABC, - ) -> Accumulatable: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - treename : str - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - """ - if isinstance(self.executor, DaskExecutorBase): - return self.run_dask(fileset, processor_instance, treename) - - wrapped_out = self.run(fileset, processor_instance, treename) - if self.use_dataframes: - return wrapped_out # not wrapped anymore - if self.savemetrics: - return wrapped_out["out"], wrapped_out["metrics"] - return wrapped_out["out"] - - def preprocess( - self, - fileset: Dict, - treename: str, - ) -> Generator: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - treename : str - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - """ - - if not isinstance(fileset, (Mapping, str)): - raise ValueError( - "Expected fileset to be a mapping dataset: list(files) or filename" - ) - if self.format == "root": - fileset = list(self._normalize_fileset(fileset, treename)) - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - self._preprocess_fileset_root(fileset) - fileset = self._filter_badfiles(fileset) - - # reverse fileset list to match the order of files as presented in version - # v0.7.4. This fixes tests using maxchunks. 
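[editor's note] As the ``__call__`` body above shows, ``savemetrics=True`` changes the return value to an ``(output, metrics)`` pair; a hedged sketch reusing the placeholder ``fileset`` and ``MyProcessor`` names from the earlier sketches:

from coffea import processor
from coffea.nanoevents import NanoAODSchema

run = processor.Runner(
    executor=processor.IterativeExecutor(),
    schema=NanoAODSchema,
    savemetrics=True,
)
out, metrics = run(fileset, treename="Events", processor_instance=MyProcessor())
# metrics carries e.g. 'entries', 'processtime' and, for uproot reads, 'bytesread'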
- fileset.reverse() - elif self.format == "parquet": - fileset = list(self._normalize_fileset(fileset, treename)) - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - self._preprocess_fileset_parquet(fileset) - fileset = self._filter_badfiles(fileset) - - # reverse fileset list to match the order of files as presented in version - # v0.7.4. This fixes tests using maxchunks. - fileset.reverse() - - return self._chunk_generator(fileset, treename) - - def run_dask( - self, - fileset: Union[Dict, str, List[WorkItem], Generator], - processor_instance: ProcessorABC, - treename: str = None, - ) -> Accumulatable: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict | str | List[WorkItem] | Generator - - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - - A single file name - - File chunks for self.preprocess() - - Chunk generator - treename : str, optional - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - Not needed if processing premade chunks - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - """ - pass - - def run( - self, - fileset: Union[Dict, str, List[WorkItem], Generator], - processor_instance: ProcessorABC, - treename: str = None, - ) -> Accumulatable: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict | str | List[WorkItem] | Generator - - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - - A single file name - - File chunks for self.preprocess() - - Chunk generator - treename : str, optional - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - Not needed if processing premade chunks - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - """ - - meta = False - if not isinstance(fileset, (Mapping, str)): - if isinstance(fileset, Generator) or isinstance(fileset[0], WorkItem): - meta = True - else: - raise ValueError( - "Expected fileset to be a mapping dataset: list(files) or filename" - ) - if not isinstance(processor_instance, ProcessorABC): - raise ValueError("Expected processor_instance to derive from ProcessorABC") - - if meta: - chunks = fileset - else: - chunks = self.preprocess(fileset, treename) - - if self.processor_compression is None: - pi_to_send = processor_instance - else: - pi_to_send = lz4f.compress( - cloudpickle.dumps(processor_instance), - compression_level=self.processor_compression, - ) - # hack around dask/dask#5503 which is really a silly request but here we are - if isinstance(self.executor, DaskExecutor): - self.executor.heavy_input = pi_to_send - closure = partial( - self._work_function, - self.format, - self.xrootdtimeout, - self.mmap, - self.schema, - partial(self.get_cache, self.cachestrategy), - self.use_dataframes, - self.savemetrics, - processor_instance="heavy", - ) - else: - closure = partial( - self._work_function, - self.format, - self.xrootdtimeout, - self.mmap, - self.schema, - partial(self.get_cache, self.cachestrategy), - self.use_dataframes, - self.savemetrics, - processor_instance=pi_to_send, - ) - - if self.format == 
"root" and isinstance(self.executor, WorkQueueExecutor): - # keep chunks in generator, use a copy to count number of events - # this is cheap, as we are reading from the cache - chunks_to_count = self.preprocess(fileset, treename) - else: - # materialize chunks to list, then count that list - chunks = list(chunks) - chunks_to_count = chunks - - events_total = sum(len(c) for c in chunks_to_count) - - exe_args = { - "unit": "chunk", - "function_name": type(processor_instance).__name__, - } - if isinstance(self.executor, WorkQueueExecutor): - exe_args.update( - { - "unit": "event", - "events_total": events_total, - "dynamic_chunksize": self.dynamic_chunksize, - "chunksize": self.chunksize, - } - ) - - closure = partial( - self.automatic_retries, self.retries, self.skipbadfiles, closure - ) - - executor = self.executor.copy(**exe_args) - wrapped_out, e = executor(chunks, closure, None) - if wrapped_out is None: - raise ValueError( - "No chunks returned results, verify ``processor`` instance structure.\n\ - if you used skipbadfiles=True, it is possible all your files are bad." - ) - wrapped_out["exception"] = e - - if not self.use_dataframes: - processor_instance.postprocess(wrapped_out["out"]) - - if "metrics" in wrapped_out.keys(): - wrapped_out["metrics"]["chunks"] = len(chunks) - for k, v in wrapped_out["metrics"].items(): - if isinstance(v, set): - wrapped_out["metrics"][k] = list(v) - if self.use_dataframes: - return wrapped_out["out"] - else: - return wrapped_out - - -def run_spark_job( - fileset, - processor_instance, - executor, - executor_args={}, - spark=None, - partitionsize=200000, - thread_workers=16, -): - """A wrapper to submit spark jobs - - A convenience wrapper to submit jobs for spark datasets, which is a - dictionary of dataset: [file list] entries. Presently supports reading of - parquet files converted from root. For more customized processing, - e.g. to read other objects from the files and pass them into data frames, - one can write a similar function in their user code. - - Parameters - ---------- - fileset : dict - dictionary {dataset: [file, file], } - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - - .. 
note:: The processor instance must define all the columns in data and MC that it reads as ``.columns`` - executor: - anything that inherits from `SparkExecutor` like `spark_executor` - - In general, a function that takes 3 arguments: items, function accumulator - and performs some action equivalent to: - for item in items: accumulator += function(item) - executor_args: - arguments to send to the creation of a spark session - spark: - an optional already created spark instance - - if ``None`` then we create an ephemeral spark instance using a config - partitionsize: - partition size to try to aim for (coalescese only, repartition too expensive) - thread_workers: - how many spark jobs to let fly in parallel during processing steps - """ - - try: - import pyspark - except ImportError as e: - print( - "you must have pyspark installed to call run_spark_job()!", file=sys.stderr - ) - raise e - - import warnings - - import pyarrow as pa - from packaging import version - - arrow_env = ("ARROW_PRE_0_15_IPC_FORMAT", "1") - if version.parse(pa.__version__) >= version.parse("0.15.0") and version.parse( - pyspark.__version__ - ) < version.parse("3.0.0"): - import os - - if arrow_env[0] not in os.environ or os.environ[arrow_env[0]] != arrow_env[1]: - warnings.warn( - "If you are using pyarrow >= 0.15.0, make sure to set %s=%s in your environment!" - % arrow_env - ) - - import pyspark.sql - - from .spark.detail import _spark_initialize, _spark_make_dfs, _spark_stop - from .spark.spark_executor import SparkExecutor - - if not isinstance(fileset, Mapping): - raise ValueError("Expected fileset to be a mapping dataset: list(files)") - if not isinstance(processor_instance, ProcessorABC): - raise ValueError("Expected processor_instance to derive from ProcessorABC") - if not isinstance(executor, SparkExecutor): - raise ValueError("Expected executor to derive from SparkExecutor") - - executor_args.setdefault("config", None) - executor_args.setdefault("file_type", "parquet") - executor_args.setdefault("laurelin_version", "1.1.1") - executor_args.setdefault("treeName", "Events") - executor_args.setdefault("schema", None) - executor_args.setdefault("cache", True) - executor_args.setdefault("skipbadfiles", False) - executor_args.setdefault("retries", 0) - executor_args.setdefault("xrootdtimeout", None) - file_type = executor_args["file_type"] - treeName = executor_args["treeName"] - schema = executor_args["schema"] - if "flatten" in executor_args: - raise ValueError( - "Executor argument 'flatten' is deprecated, please refactor your processor to accept awkward arrays" - ) - if "nano" in executor_args: - raise ValueError( - "Awkward0 NanoEvents no longer supported.\n" - "Please use 'schema': processor.NanoAODSchema to enable awkward NanoEvents processing." 
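[editor's note] A hedged sketch of the ``run_spark_job`` entry point documented above; ``MyProcessor`` and the fileset are placeholders, and the module-level ``spark_executor`` instance is assumed to be importable as in coffea 0.7:

from coffea.nanoevents import NanoAODSchema
from coffea.processor import run_spark_job
from coffea.processor.spark.spark_executor import spark_executor

out = run_spark_job(
    fileset={"ZJets": ["zjets.parquet"]},
    processor_instance=MyProcessor(),
    executor=spark_executor,
    executor_args={"schema": NanoAODSchema, "file_type": "parquet"},
    partitionsize=200_000,
    thread_workers=8,
)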
- ) - use_cache = executor_args["cache"] - - if executor_args["config"] is None: - executor_args.pop("config") - - # initialize spark if we need to - # if we initialize, then we deconstruct - # when we're done - killSpark = False - if spark is None: - spark = _spark_initialize(**executor_args) - killSpark = True - use_cache = False # if we always kill spark then we cannot use the cache - else: - if not isinstance(spark, pyspark.sql.session.SparkSession): - raise ValueError( - "Expected 'spark' to be a pyspark.sql.session.SparkSession" - ) - - dfslist = {} - if executor._cacheddfs is None: - dfslist = _spark_make_dfs( - spark, - fileset, - partitionsize, - processor_instance.columns, - thread_workers, - file_type, - treeName, - ) - - output = executor( - spark, dfslist, processor_instance, None, thread_workers, use_cache, schema - ) - processor_instance.postprocess(output) - - if killSpark: - _spark_stop(spark) - del spark - spark = None - - return output diff --git a/src/coffea/processor/helpers.py b/src/coffea/processor/helpers.py deleted file mode 100644 index dcdf03ba2..000000000 --- a/src/coffea/processor/helpers.py +++ /dev/null @@ -1,273 +0,0 @@ -import numpy - -from coffea.util import deprecate - - -class Weights: - """Container for event weights and associated systematic shifts - - This container keeps track of correction factors and systematic - effects that can be encoded as multiplicative modifiers to the event weight. - All weights are stored in vector form. - - Parameters - ---------- - size : int - size of the weight arrays to be handled (i.e. the number of events / instances). - storeIndividual : bool, optional - store not only the total weight + variations, but also each individual weight. - Default is false. - """ - - def __init__(self, size, storeIndividual=False): - deprecate( - RuntimeError( - "This utility has moved to the `coffea.analysis_tools` subpackage and has new features, check it out!" - ), - 0.8, - ) - self._weight = numpy.ones(size) - self._weights = {} - self._modifiers = {} - self._weightStats = {} - self._storeIndividual = storeIndividual - - def add(self, name, weight, weightUp=None, weightDown=None, shift=False): - """Add a new weight - - Adds a named correction to the event weight, and optionally also associated - systematic uncertainties. - - Parameters - ---------- - name : str - name of correction - weight : numpy.ndarray - the nominal event weight associated with the correction - weightUp : numpy.ndarray, optional - weight with correction uncertainty shifted up (if available) - weightDown : numpy.ndarray, optional - weight with correction uncertainty shifted down. If ``weightUp`` is supplied, and - the correction uncertainty is symmetric, this can be set to None to auto-calculate - the down shift as ``1 / weightUp``. - shift : bool, optional - if True, interpret weightUp and weightDown as a relative difference (additive) to the - nominal value - - .. 
note:: ``weightUp`` and ``weightDown`` are assumed to be rvalue-like and may be modified in-place by this function - """ - if name.endswith("Up") or name.endswith("Down"): - raise ValueError( - "Avoid using 'Up' and 'Down' in weight names, instead pass appropriate shifts to add() call" - ) - weight = numpy.array(weight) - self._weight = self._weight * weight - if self._storeIndividual: - self._weights[name] = weight - if weightUp is not None: - weightUp = numpy.array(weightUp) - if shift: - weightUp += weight - weightUp[weight != 0.0] /= weight[weight != 0.0] - self._modifiers[name + "Up"] = weightUp - if weightDown is not None: - weightDown = numpy.array(weightDown) - if shift: - weightDown = weight - weightDown - weightDown[weight != 0.0] /= weight[weight != 0.0] - self._modifiers[name + "Down"] = weightDown - self._weightStats[name] = { - "sumw": weight.sum(), - "sumw2": (weight**2).sum(), - "min": weight.min(), - "max": weight.max(), - "n": weight.size, - } - - def weight(self, modifier=None): - """Current event weight vector - - Parameters - ---------- - modifier : str, optional - if supplied, provide event weight corresponding to a particular - systematic uncertainty shift, of form ``str(name + 'Up')`` or (Down) - - Returns - ------- - weight : numpy.ndarray - The weight vector, possibly modified by the effect of a given systematic variation. - """ - if modifier is None: - return self._weight - elif "Down" in modifier and modifier not in self._modifiers: - return self._weight / self._modifiers[modifier.replace("Down", "Up")] - return self._weight * self._modifiers[modifier] - - def partial_weight(self, include=[], exclude=[]): - """Partial event weight vector - - Return a partial weight by multiplying a subset of all weights. - Can be operated either by specifying weights to include or - weights to exclude, but not both at the same time. The method - can only be used if the individual weights are stored via the - ``storeIndividual`` argument in the `Weights` initializer. - - Parameters - ---------- - include : list | set - Weight names to include, defaults to [] - exclude : list | set - Weight names to exclude, defaults to [] - Returns - ------- - weight : numpy.ndarray - The weight vector, corresponding to only the effect of the - corrections specified. - """ - if not self._storeIndividual: - raise ValueError( - "To be able to request weight exclusion, use storeIndividual=True when creating Weights object." - ) - if (include and exclude) or not (include or exclude): - raise ValueError( - "Need to specify exactly one of the 'exclude' or 'include' arguments." 
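[editor's note] A hedged sketch of the ``Weights`` interface documented above (the numbers are illustrative); as the deprecation notice says, the maintained version of this API lives in ``coffea.analysis_tools``:

import numpy as np
from coffea.analysis_tools import Weights

weights = Weights(size=3, storeIndividual=True)
weights.add(
    "pileup",
    weight=np.array([1.02, 0.98, 1.00]),
    weightUp=np.array([1.05, 1.01, 1.03]),
    weightDown=np.array([0.99, 0.95, 0.97]),
)
nominal = weights.weight()
pileup_up = weights.weight("pileupUp")
partial = weights.partial_weight(include=["pileup"])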
- ) - if include and not isinstance(include, (list, set)): - raise ValueError("'include' should be a list or set of weight names") - if exclude and not isinstance(exclude, (list, set)): - raise ValueError("'exclude' should be a list or set of weight names") - - names = set(self._weights.keys()) - if include: - names = names & set(include) - if exclude: - names = names - set(exclude) - - w = numpy.ones(self._weight.size) - for name in names: - w = w * self._weights[name] - - return w - - @property - def variations(self): - """List of available modifiers""" - keys = set(self._modifiers.keys()) - # add any missing 'Down' variation - for k in self._modifiers.keys(): - keys.add(k.replace("Up", "Down")) - return keys - - -class PackedSelection: - """Store boolean mask vectors in a compact manner - - This class can store several boolean masks (cuts, selections) and - evaluate arbitrary combinations of the requirements in an CPU-efficient way - - Parameters - ---------- - dtype : str - internal bitwidth of mask vector, which governs the maximum - number of boolean masks storable in this object. - By default, up to 64 masks can be stored, but smaller values - for the `numpy.dtype` may be more efficient. - """ - - def __init__(self, dtype="uint64"): - """ - TODO: extend to multi-column for arbitrary bit depth - """ - deprecate( - RuntimeError( - "This utility has moved to the `coffea.analysis_tools` subpackage and has new features, check it out!" - ), - 0.8, - ) - self._dtype = numpy.dtype(dtype) - self._names = [] - self._mask = None - - @property - def names(self): - """Current list of mask names available""" - return self._names - - def add(self, name, selection): - """Add a named mask - - Parameters - ---------- - name : str - name of the mask - selection : numpy.ndarray - a flat array of dtype bool. - If not the first mask added, it must also have - the same shape as previously added masks. - """ - if isinstance(selection, numpy.ndarray) and selection.dtype == numpy.dtype( - "bool" - ): - if len(self._names) == 0: - self._mask = numpy.zeros(shape=selection.shape, dtype=self._dtype) - elif len(self._names) == 64: - raise RuntimeError( - "Exhausted all slots for %r, consider a larger dtype or fewer selections" - % self._dtype - ) - elif self._mask.shape != selection.shape: - raise ValueError( - "New selection '%s' has different shape than existing ones (%r vs. %r)" - % (name, selection.shape, self._mask.shape) - ) - self._mask |= selection.astype(self._dtype) << len(self._names) - self._names.append(name) - else: - raise ValueError( - "PackedSelection only understands numpy boolean arrays, got %r" - % selection - ) - - def require(self, **names): - """Return a mask vector corresponding to specific requirements - - Specify an exact requirement on an arbitrary subset of the masks - - Parameters - ---------- - ``**names`` : kwargs - Each argument to require specific value for, in form ``arg=True`` - or ``arg=False``. - - Examples - -------- - If - - >>> selection.names - ['cut1', 'cut2', 'cut3'] - - then - - >>> selection.require(cut1=True, cut2=False) - array([True, False, True, ...]) - - returns a boolean array where each entry passes if the corresponding entry has - ``cut1 == True``, ``cut2 == False``, and ``cut3`` arbitrary. 
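[editor's note] A hedged sketch of the ``PackedSelection`` usage implied by the docstring above; as with ``Weights``, the maintained implementation is in ``coffea.analysis_tools``:

import numpy as np
from coffea.analysis_tools import PackedSelection

selection = PackedSelection()
selection.add("two_jets", np.array([True, True, False, True]))
selection.add("met_cut", np.array([True, False, False, True]))

signal = selection.all("two_jets", "met_cut")              # both cuts required
control = selection.require(two_jets=True, met_cut=False)  # inverted MET cut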
- """ - mask = 0 - require = 0 - for name, val in names.items(): - if not isinstance(val, bool): - raise ValueError( - "Please use only booleans in PackedSelection.require(), received %r for %s" - % (val, name) - ) - idx = self._names.index(name) - mask |= 1 << idx - require |= int(val) << idx - return (self._mask & mask) == require - - def all(self, *names): - """Shorthand for `require`, where all the values are True""" - return self.require(**{name: True for name in names}) diff --git a/src/coffea/processor/parsl/__init__.py b/src/coffea/processor/parsl/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/coffea/processor/parsl/condor_config.py b/src/coffea/processor/parsl/condor_config.py deleted file mode 100644 index 49c066c81..000000000 --- a/src/coffea/processor/parsl/condor_config.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -import os.path as osp - -from parsl.addresses import address_by_hostname -from parsl.channels import LocalChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import CondorProvider - -x509_proxy = f"x509up_u{os.getuid()}" - - -def condor_config( - cores_per_job=4, - mem_per_core=2048, - total_workers=24, - max_workers=200, - pyenv_dir="{}/.local".format(os.environ["HOME"]), - grid_proxy_dir="/tmp", - htex_label="coffea_parsl_condor_htex", - wrk_init=None, - condor_cfg=None, -): - pyenv_relpath = pyenv_dir.split("/")[-1] - - if wrk_init is None: - wrk_init = """ - source /cvmfs/sft.cern.ch/lcg/views/LCG_95apython3/x86_64-centos7-gcc7-opt/setup.sh - export PATH=`pwd`/{}:$PATH - export PYTHONPATH=`pwd`/{}:$PYTHONPATH - - export X509_USER_PROXY=`pwd`/{} - mkdir -p ./{} - """.format( - "%s/bin" % pyenv_relpath, - "%s/lib/python3.6/site-packages" % pyenv_relpath, - x509_proxy, - htex_label, - ) - - if condor_cfg is None: - condor_cfg = """ - transfer_output_files = %s - RequestMemory = %d - RequestCpus = %d - """ % ( - htex_label, - mem_per_core * cores_per_job, - cores_per_job, - ) - - xfer_files = [pyenv_dir, osp.join(grid_proxy_dir, x509_proxy)] - - condor_htex = Config( - executors=[ - HighThroughputExecutor( - label=htex_label, - address=address_by_hostname(), - prefetch_capacity=0, - cores_per_worker=1, - max_workers=cores_per_job, - worker_logdir_root="./", - provider=CondorProvider( - channel=LocalChannel(), - init_blocks=total_workers, - max_blocks=max_workers, - nodes_per_block=1, - worker_init=wrk_init, - transfer_input_files=xfer_files, - scheduler_options=condor_cfg, - ), - ) - ], - strategy=None, - ) - - return condor_htex diff --git a/src/coffea/processor/parsl/detail.py b/src/coffea/processor/parsl/detail.py deleted file mode 100644 index a618d99f5..000000000 --- a/src/coffea/processor/parsl/detail.py +++ /dev/null @@ -1,89 +0,0 @@ -import parsl -from parsl.app.app import python_app -from parsl.channels import LocalChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import LocalProvider - -from ..executor import _futures_handler -from .timeout import timeout - -_default_cfg = Config( - executors=[ - HighThroughputExecutor( - label="coffea_parsl_default", - cores_per_worker=1, - provider=LocalProvider( - channel=LocalChannel(), - init_blocks=1, - max_blocks=1, - ), - ) - ], - strategy=None, -) - - -def _parsl_initialize(config=None): - parsl.clear() - parsl.load(config) - - -def _parsl_stop(): - parsl.dfk().cleanup() - parsl.clear() - - -@timeout -@python_app -def derive_chunks(filename, treename, 
chunksize, ds, timeout=10): - from collections.abc import Sequence - - import uproot - - uproot.XRootDSource.defaults["parallel"] = False - - a_file = uproot.open({filename: None}) - - tree = None - if isinstance(treename, str): - tree = a_file[treename] - elif isinstance(treename, Sequence): - for name in reversed(treename): - if name in a_file: - tree = a_file[name] - else: - raise Exception( - "treename must be a str or Sequence but is a %s!" % repr(type(treename)) - ) - - if tree is None: - raise Exception( - "No tree found, out of possible tree names: %s" % repr(treename) - ) - - nentries = tree.numentries - return ( - ds, - treename, - [(filename, chunksize, index) for index in range(nentries // chunksize + 1)], - ) - - -def _parsl_get_chunking(filelist, chunksize, status=True, timeout=10): - futures = { - derive_chunks(fn, tn, chunksize, ds, timeout=timeout) for ds, fn, tn in filelist - } - - items = [] - - def chunk_accumulator(total, result): - ds, treename, chunks = result - for chunk in chunks: - total.append((ds, chunk[0], treename, chunk[1], chunk[2])) - - _futures_handler( - futures, items, status, "files", "Preprocessing", chunk_accumulator, None - ) - - return items diff --git a/src/coffea/processor/parsl/slurm_config.py b/src/coffea/processor/parsl/slurm_config.py deleted file mode 100644 index 48c33ae01..000000000 --- a/src/coffea/processor/parsl/slurm_config.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -import os.path as osp -import shutil - -from parsl.addresses import address_by_hostname -from parsl.channels import LocalChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.launchers import SrunLauncher -from parsl.providers import SlurmProvider - -x509_proxy = "x509up_u%s" % (os.getuid()) - - -def slurm_config( - cores_per_job=16, - mem_per_core=2048, - jobs_per_worker=1, - initial_workers=4, - max_workers=8, - work_dir="./", - grid_proxy_dir="/tmp", - partition="", - walltime="02:00:00", - htex_label="coffea_parsl_slurm_htex", -): - shutil.copy2(osp.join(grid_proxy_dir, x509_proxy), osp.join(work_dir, x509_proxy)) - - wrk_init = """ - export XRD_RUNFORKHANDLER=1 - export X509_USER_PROXY=%s - """ % ( - osp.join(work_dir, x509_proxy) - ) - - sched_opts = """ - #SBATCH --cpus-per-task=%d - #SBATCH --mem-per-cpu=%d - """ % ( - cores_per_job, - mem_per_core, - ) - - slurm_htex = Config( - executors=[ - HighThroughputExecutor( - label=htex_label, - address=address_by_hostname(), - prefetch_capacity=0, - max_workers=cores_per_job, - provider=SlurmProvider( - channel=LocalChannel(), - launcher=SrunLauncher(), - init_blocks=initial_workers, - max_blocks=max_workers, - nodes_per_block=jobs_per_worker, - partition=partition, - scheduler_options=sched_opts, # Enter scheduler_options if needed - worker_init=wrk_init, # Enter worker_init if needed - walltime=walltime, - ), - ) - ], - strategy=None, - ) - - return slurm_htex diff --git a/src/coffea/processor/parsl/timeout.py b/src/coffea/processor/parsl/timeout.py deleted file mode 100644 index 35c7b42dc..000000000 --- a/src/coffea/processor/parsl/timeout.py +++ /dev/null @@ -1,21 +0,0 @@ -from functools import wraps - - -def timeout(func): - @wraps(func) - def wrapper(*args, **kwargs): - import signal - - def _timeout_handler(signum, frame): - raise Exception("Timeout hit") - - signal.signal(signal.SIGALRM, _timeout_handler) - if kwargs.get("timeout"): - signal.alarm(max(1, int(kwargs["timeout"]))) - try: - result = func(*args, **kwargs) - finally: - signal.alarm(0) - return result 
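[editor's note] The ``condor_config`` and ``slurm_config`` helpers above both return a parsl ``Config`` meant to be loaded before processing; a hedged sketch using the module path that this patch removes (site-specific values are placeholders, and a valid grid proxy is expected in ``grid_proxy_dir``):

import parsl
from coffea.processor.parsl.slurm_config import slurm_config

parsl.load(
    slurm_config(
        cores_per_job=16,
        initial_workers=4,
        max_workers=8,
        partition="short",
        walltime="02:00:00",
    )
)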
- - return wrapper diff --git a/src/coffea/processor/servicex/__init__.py b/src/coffea/processor/servicex/__init__.py deleted file mode 100644 index 12d26e635..000000000 --- a/src/coffea/processor/servicex/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from .analysis import * -from .dask_executor import * -from .data_source import * -from .local_executor import * - -__all__ = [ - "DataSource", - "Analysis", - "LocalExecutor", - "DaskExecutor", -] diff --git a/src/coffea/processor/servicex/analysis.py b/src/coffea/processor/servicex/analysis.py deleted file mode 100644 index 669ba8662..000000000 --- a/src/coffea/processor/servicex/analysis.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from abc import ABC, abstractmethod - -from coffea.nanoevents.methods.base import NanoEvents - - -class Analysis(ABC): - @staticmethod - @abstractmethod - def process(events: NanoEvents) -> dict: - """ - Implement this abstract method to perform the actual analysis operations. The - executor will wrap this in code to construct a NanoEvents instance and will pass - in the analysis instance's accumulator. - :param events: NanoEvents - :return: dict[str, Accumulatable] - Filled with the results from this analysis - """ - raise NotImplementedError diff --git a/src/coffea/processor/servicex/dask_executor.py b/src/coffea/processor/servicex/dask_executor.py deleted file mode 100644 index f9dcc4851..000000000 --- a/src/coffea/processor/servicex/dask_executor.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from typing import Callable, Dict, Optional - -from dask.distributed import Client - -from .executor import Executor, run_coffea_processor - - -class DaskExecutor(Executor): - def __init__( - self, - client_addr: Optional[str] = None, - provided_dask_client: Optional[Client] = None, - ): - """Create a Dask executor to process the analysis - - Args: - client_addr (Optional[str]): If `None` then create a local cluster that runs - in-process. Otherwise connect to an already - existing cluster. - provided_dask_client (Optional[Client]): Pass in an initialized Dask Client. - This client must have asynchronous=True. 
- """ - if not provided_dask_client: - self.is_local = not client_addr - - self.dask = ( - Client(threads_per_worker=10, asynchronous=True) - if self.is_local - else Client(client_addr, asynchronous=True) - ) - else: - assert provided_dask_client.asynchronous - self.dask = provided_dask_client - self.is_local = False - - def get_result_file_stream(self, datasource, title): - if self.is_local: - return datasource.stream_result_files(title) - else: - return datasource.stream_result_file_uris(title) - - def run_async_analysis( - self, - file_url: str, - tree_name: Optional[str], - data_type: str, - meta_data: Dict[str, str], - process_func: Callable, - schema, - ): - """Create a dask future for a dask task to run the analysis.""" - data_result = self.dask.submit( - run_coffea_processor, - events_url=file_url, - tree_name=tree_name, - data_type=data_type, - meta_data=meta_data, - proc=process_func, - schema=schema, - ) - - return data_result diff --git a/src/coffea/processor/servicex/data_source.py b/src/coffea/processor/servicex/data_source.py deleted file mode 100644 index 741363e67..000000000 --- a/src/coffea/processor/servicex/data_source.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2021, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from typing import AsyncGenerator, Dict, List, Optional, Tuple - -from func_adl import ObjectStream, find_EventDataset -from servicex import ServiceXDataset, StreamInfoPath, StreamInfoUrl - - -class DataSource: - def __init__( - self, - query: ObjectStream, - metadata: Dict[str, str] = {}, - datasets: List[ServiceXDataset] = [], - ): - self.query = query - self.metadata = metadata - self.schema = None - self.datasets = datasets - - async def _get_query(self) -> str: - """Return the qastle query. - - Note: To do this we have to forward-cast the object: by design, not all `func_adl` - queries are `ServiceX` queries. But this library only works with datasets that are - based in `ServiceX`. Thus some duck typing occurs in this method. 
- """ - event_dataset_ast = find_EventDataset(self.query.query_ast) - event_dataset = event_dataset_ast._eds_object # type: ignore - if not hasattr(event_dataset, "return_qastle"): - raise Exception( - f"Base func_adl query {str(event_dataset)} does not have a way to generate qastle!" - ) - event_dataset.return_qastle = True # type: ignore - return await self.query.value_async() - - async def stream_result_file_uris( - self, title: Optional[str] = None - ) -> AsyncGenerator[Tuple[str, str, StreamInfoUrl], None]: - """Launch all datasources off to servicex - - Yields: - Tuple[str, StreamInfoUrl]: List of data types and url's to process - """ - qastle = await self._get_query() - - # TODO: Make this for loop parallel - for dataset in self.datasets: - data_type = dataset.first_supported_datatype(["parquet", "root"]) - if data_type == "root": - async for file in dataset.get_data_rootfiles_uri_stream( - qastle, title=title, as_signed_url=True - ): - yield (data_type, dataset.dataset_as_name, file) - elif data_type == "parquet": - async for file in dataset.get_data_parquet_uri_stream( - qastle, title=title, as_signed_url=True - ): - yield (data_type, dataset.dataset_as_name, file) - else: - raise Exception( - f"This dataset ({str(dataset)}) supports unknown datatypes" - ) - - async def stream_result_files( - self, title: Optional[str] = None - ) -> AsyncGenerator[Tuple[str, str, StreamInfoPath], None]: - """Launch all datasources at once off to servicex - - Yields: - Tuple[str, StreamInfoPath]: List of data types and file paths to process - """ - qastle = await self._get_query() - - # TODO: Make this for loop parallel - for dataset in self.datasets: - data_type = dataset.first_supported_datatype(["parquet", "root"]) - if data_type == "root": - async for file in dataset.get_data_rootfiles_stream( - qastle, title=title - ): - yield (data_type, dataset.dataset_as_name, file) - elif data_type == "parquet": - async for file in dataset.get_data_parquet_stream(qastle, title=title): - yield (data_type, dataset.dataset_as_name, file) - else: - raise Exception( - f"This dataset ({str(dataset)}) supports unknown datatypes" - ) diff --git a/src/coffea/processor/servicex/executor.py b/src/coffea/processor/servicex/executor.py deleted file mode 100644 index 704f3da8a..000000000 --- a/src/coffea/processor/servicex/executor.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from abc import ABC, abstractmethod -from typing import Any, AsyncGenerator, Callable, Dict, Optional, Tuple - -import aiostream -import uproot -from servicex import StreamInfoUrl - -from ..accumulator import async_accumulate - -# from urllib.parse import urlparse, unquote -# from urllib.request import url2pathname - - -class Executor(ABC): - @abstractmethod - def run_async_analysis( - self, - file_url: str, - tree_name: Optional[str], - data_type: str, - meta_data: Dict[str, str], - process_func: Callable, - ): - raise NotImplementedError - - def get_result_file_stream(self, datasource, title: Optional[str] = None): - return datasource.stream_result_file_uris(title) - - async def execute( - self, analysis, datasource, title: Optional[str] = None, schema=None - ): - """ - Launch an analysis against the given dataset on the implementation's task framework - :param analysis: - The analysis to run - :param datasource: - The datasource to run against - :param schema: - The schema to apply to data, defaults to None (will then use auto_schema). - :return: - Stream of up to date histograms. Grows as each result is received - """ - # Stream transformed file references from ServiceX - result_file_stream = self.get_result_file_stream(datasource, title=title) - - # Launch a task against this file - func_results = self.launch_analysis_tasks_from_stream( - result_file_stream, datasource.metadata, analysis.process, schema=schema - ) - - # Wait for all the data to show up - async def inline_wait(r): - "This could be inline, but python 3.6" - x = await r - return x - - finished_events = aiostream.stream.map(func_results, inline_wait, ordered=False) - # Finally, accumulate! - # There is an accumulate pattern in the aiostream lib - async with finished_events.stream() as streamer: - async for results in async_accumulate(streamer): - yield results - - async def launch_analysis_tasks_from_stream( - self, - result_file_stream: AsyncGenerator[Tuple[str, str, StreamInfoUrl], None], - meta_data: Dict[str, str], - process_func: Callable, - schema, - ) -> AsyncGenerator[Any, None]: - """ - Invoke the implementation's task runner on each file from the serviceX stream. - We don't know the file's tree name in advance, so grab a sample the first time - around to inspect the tree name - :param result_file_stream: - :param accumulator: - :param process_func: - :param schema: - The schema to apply to data. 
- :return: - """ - tree_name = None - async for sx_data in result_file_stream: - file_url = sx_data[2].url - sample_md = dict(meta_data, dataset=sx_data[1]) - data_type = sx_data[0] - - # Determine the tree name if we've not gotten it already - if data_type == "root": - if tree_name is None: - with uproot.open({file_url: None}) as sample: - tree_name = sample.keys()[0] - - # Invoke the implementation's task launcher - data_result = self.run_async_analysis( - file_url=file_url, - tree_name=tree_name, - data_type=data_type, - meta_data=sample_md, - process_func=process_func, - schema=schema, - ) - - # Pass this down to the next item in the stream. - yield data_result - - -def run_coffea_processor( - events_url: str, tree_name: Optional[str], proc, data_type, meta_data, schema -): - """ - Process a single file from a tree via a coffea processor on the remote node - :param events_url: - a URL to a ROOT file that uproot4 can open - :param tree_name: - The tree in the ROOT file to use for our data. Can be null if the data isn't a root - tree! - :param accumulator: - Accumulator to store the results - :param proc: - Analysis function to execute. Must have signature - :param data_type: - What datatype is the data (root, parquet?) - :param schema: - The schema to apply to data (if None, will use auto_schema). - :return: - Populated accumulator - """ - # Since we execute remotely, explicitly include everything we need. - from coffea.nanoevents import NanoEventsFactory - - if schema is None: - from coffea.nanoevents.schemas.schema import auto_schema - - schema = auto_schema - - if data_type == "root": - # Use NanoEvents to build a 4-vector - assert tree_name is not None - events = NanoEventsFactory.from_root( - file=str(events_url), - treepath=f"/{tree_name}", - schemaclass=schema, - metadata=dict(meta_data, filename=str(events_url)), - ).events() - elif data_type == "parquet": - events = NanoEventsFactory.from_parquet( - file=str(events_url), - treepath="/", - schemaclass=schema, - metadata=dict(meta_data, filename=str(events_url)), - ).events() - else: - raise Exception(f"Unknown stream data type of {data_type} - cannot process.") - - return proc(events) diff --git a/src/coffea/processor/servicex/local_executor.py b/src/coffea/processor/servicex/local_executor.py deleted file mode 100644 index fd8670c0c..000000000 --- a/src/coffea/processor/servicex/local_executor.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from typing import Callable, Dict, Optional - -from .executor import Executor, run_coffea_processor - - -class LocalExecutor(Executor): - def __init__(self): - pass - - def get_result_file_stream(self, datasource, title): - return datasource.stream_result_files(title) - - def run_async_analysis( - self, - file_url: str, - tree_name: Optional[str], - data_type: str, - meta_data: Dict[str, str], - process_func: Callable, - schema, - ): - # TODO: Do we need a second routine here? Can we just use this one? - return self._async_analysis( - events_url=file_url, - tree_name=tree_name, - data_type=data_type, - meta_data=meta_data, - process_func=process_func, - schema=schema, - ) - - async def _async_analysis( - self, events_url, tree_name, data_type, meta_data, process_func, schema - ): - return run_coffea_processor( - events_url=events_url, - tree_name=tree_name, - data_type=data_type, - meta_data=meta_data, - proc=process_func, - schema=schema, - ) diff --git a/src/coffea/processor/spark/__init__.py b/src/coffea/processor/spark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/coffea/processor/spark/detail.py b/src/coffea/processor/spark/detail.py deleted file mode 100644 index 6e372bd2b..000000000 --- a/src/coffea/processor/spark/detail.py +++ /dev/null @@ -1,133 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor - -import pyspark.sql -import pyspark.sql.functions as fn -from pyarrow.util import guid -from tqdm import tqdm - -try: - from collections.abc import Sequence -except ImportError: - from collections.abc import Sequence - -from coffea.processor.executor import _futures_handler - -# this is a reasonable local spark configuration -_default_config = ( - pyspark.sql.SparkSession.builder.appName("coffea-analysis-%s" % guid()) - .master("local[*]") - .config("spark.sql.execution.arrow.enabled", "true") - .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000) -) - - -def _spark_initialize(config=_default_config, **kwargs): - spark_progress = False - if "spark_progress" in kwargs.keys(): - spark_progress = kwargs["spark_progress"] - - cfg_actual = config - # get spark to not complain about missing log configs - cfg_actual = cfg_actual.config( - "spark.driver.extraJavaOptions", "-Dlog4jspark.root.logger=ERROR,console" - ) - if not spark_progress: - cfg_actual = cfg_actual.config("spark.ui.showConsoleProgress", "false") - - kwargs.setdefault("bindAddress", None) - if kwargs["bindAddress"] is not None: - cfg_actual = cfg_actual.config( - "spark.driver.bindAddress", kwargs["bindAddress"] - ) - kwargs.setdefault("host", None) - if kwargs["host"] is not None: - cfg_actual = cfg_actual.config("spark.driver.host", kwargs["host"]) - - session = cfg_actual.getOrCreate() - sc = session.sparkContext - - if "log_level" in kwargs.keys(): - sc.setLogLevel(kwargs["log_level"]) - else: - sc.setLogLevel("ERROR") - - return session - - -def _read_df( - spark, dataset, files_or_dirs, ana_cols, partitionsize, file_type, treeName -): - flist = 
files_or_dirs - tname = treeName - if isinstance(files_or_dirs, dict): - tname = files_or_dirs["treename"] - flist = files_or_dirs["files"] - if not isinstance(flist, Sequence): - raise ValueError("spark dataset file list must be a Sequence (like list())") - df = ( - spark.read.format(file_type) - .option("tree", tname) - .option("threadCount", "-1") - .load(flist) - ) - count = df.count() - - df_cols = set(df.columns) - cols_in_df = ana_cols.intersection(df_cols) - df = df.select(*cols_in_df) - missing_cols = ana_cols - cols_in_df - for missing in missing_cols: - df = df.withColumn(missing, fn.lit(0.0)) - # compatibility with older pyarrow which doesn't understand array - for col, dtype in df.dtypes: - if dtype == "array": - tempcol = col + "tempbool" - df = df.withColumnRenamed(col, tempcol) - df = df.withColumn(col, df[tempcol].cast("array")).drop(tempcol) - df = df.withColumn("dataset", fn.lit(dataset)) - npartitions = (count // partitionsize) + 1 - actual_partitions = df.rdd.getNumPartitions() - avg_counts = count / actual_partitions - if actual_partitions > 1.50 * npartitions or avg_counts > partitionsize: - df = df.repartition(npartitions) - - return df, dataset, count - - -def _spark_make_dfs( - spark, - fileset, - partitionsize, - columns, - thread_workers, - file_type, - treeName, - status=True, -): - dfs = {} - ana_cols = set(columns) - - with ThreadPoolExecutor(max_workers=thread_workers) as executor: - futures = { - executor.submit( - _read_df, spark, ds, files, ana_cols, partitionsize, file_type, treeName - ) - for ds, files in fileset.items() - } - - for df, ds, count in tqdm( - _futures_handler(futures, timeout=None), - disable=not status, - unit="dataset", - total=len(fileset), - desc="loading", - ): - dfs[ds] = (df, count) - - return dfs - - -def _spark_stop(spark): - # this may do more later? 
- spark._jvm.SparkSession.clearActiveSession() - spark.stop() diff --git a/src/coffea/processor/spark/spark_executor.py b/src/coffea/processor/spark/spark_executor.py deleted file mode 100644 index 0db32f475..000000000 --- a/src/coffea/processor/spark/spark_executor.py +++ /dev/null @@ -1,195 +0,0 @@ -import pickle # noqa: F401 -from concurrent.futures import ThreadPoolExecutor - -import awkward # noqa: F401 -import lz4.frame # noqa: F401 - -# must preload these for exec calls -import numpy # noqa: F401 -import pandas # noqa: F401 -import pyspark.sql.functions as fn -from jinja2 import Environment, PackageLoader, select_autoescape -from pyspark.sql.types import StringType # noqa: F401 -from pyspark.sql.types import BinaryType, StructField, StructType -from tqdm import tqdm - -from coffea.nanoevents import NanoEventsFactory, schemas # noqa: F401 -from coffea.nanoevents.mapping import SimplePreloadedColumnSource # noqa: F401 -from coffea.processor.accumulator import accumulate -from coffea.processor.executor import _decompress, _futures_handler, _reduce - -lz4_clevel = 1 - - -# this is a UDF that takes care of summing histograms across -# various spark results where the outputs are histogram blobs -def agg_histos_raw(series, lz4_clevel): - goodlines = series[series.str.len() > 0] - if goodlines.size == 1: # short-circuit trivial aggregations - return goodlines[0] - return _reduce(lz4_clevel)(goodlines) - - -@fn.pandas_udf(BinaryType()) -def agg_histos(series: pandas.Series) -> bytes: - global lz4_clevel - return agg_histos_raw(series, lz4_clevel) - - -def reduce_histos_raw(df, lz4_clevel): - histos = df["histos"] - outhist = _reduce(lz4_clevel)(histos[histos.str.len() > 0]) - return pandas.DataFrame(data={"histos": numpy.array([outhist], dtype="O")}) - - -@fn.pandas_udf( - StructType([StructField("histos", BinaryType(), True)]), -) -def reduce_histos(df: pandas.DataFrame) -> pandas.DataFrame: - global lz4_clevel - return reduce_histos_raw(df, lz4_clevel) - - -def _get_ds_bistream(item): - global lz4_clevel - ds, bitstream = item - if bitstream is None: - raise Exception( - "No pandas dataframe returned from spark in dataset: %s, something went wrong!" - % ds - ) - if bitstream.empty: - raise Exception( - "The histogram list returned from spark is empty in dataset: %s, something went wrong!" 
- % ds - ) - out = bitstream[bitstream.columns[0]][0] - if lz4_clevel is not None: - return _decompress(out) - return out - - -class SparkExecutor: - _template_name = "spark.py.tmpl" - - def __init__(self): - self._cacheddfs = None - self._counts = None - self._env = Environment( - loader=PackageLoader("coffea.processor", "templates"), - autoescape=select_autoescape(["py"]), - ) - - @property - def counts(self): - return self._counts - - def __call__( - self, - spark, - dfslist, - theprocessor, - output, - thread_workers, - use_df_cache, - schema, - status=True, - unit="datasets", - desc="Processing", - ): - # processor needs to be a global - global processor_instance, coffea_udf, nano_schema - processor_instance = theprocessor - if schema is None: - schema = schemas.BaseSchema - if not issubclass(schema, schemas.BaseSchema): - raise ValueError( - "Expected schema to derive from BaseSchema (%s)" - % (str(schema.__name__)) - ) - nano_schema = schema - # get columns from processor - columns = processor_instance.columns - cols_w_ds = ["dataset"] + columns - # make our udf - tmpl = self._env.get_template(self._template_name) - render = tmpl.render(cols=columns) - print(render) - exec(render) - - # cache the input datasets if it's not already done - if self._counts is None: - self._counts = {} - # go through each dataset and thin down to the columns we want - for ds, (df, counts) in dfslist.items(): - self._counts[ds] = counts - - if self._cacheddfs is None: - self._cacheddfs = {} - cachedesc = "caching" if use_df_cache else "pruning" - with ThreadPoolExecutor(max_workers=thread_workers) as executor: - futures = set() - for ds, (df, counts) in dfslist.items(): - futures.add( - executor.submit( - self._pruneandcache_data, ds, df, cols_w_ds, use_df_cache - ) - ) - gen = _futures_handler(futures, timeout=None) - try: - for ds, df in tqdm( - gen, - disable=not status, - unit=unit, - total=len(dfslist), - desc=cachedesc, - ): - self._cacheddfs[ds] = df - finally: - gen.close() - - with ThreadPoolExecutor(max_workers=thread_workers) as executor: - futures = set() - for ds, df in self._cacheddfs.items(): - co_udf = coffea_udf - futures.add( - executor.submit(self._launch_analysis, ds, df, co_udf, cols_w_ds) - ) - gen = _futures_handler(futures, timeout=None) - try: - output = accumulate( - tqdm( - map(_get_ds_bistream, gen), - disable=not status, - unit=unit, - total=len(self._cacheddfs), - desc=desc, - ), - output, - ) - finally: - gen.close() - - return output - - def _pruneandcache_data(self, ds, df, columns, cacheit): - if cacheit: - return ds, df.select(*columns).cache() - return ds, df.select(*columns) - - def _launch_analysis(self, ds, df, udf, columns): - histo_map_parts = (df.rdd.getNumPartitions() // 20) + 1 - return ( - ds, - df.select(udf(*columns).alias("histos")) - .withColumn("hpid", fn.spark_partition_id() % histo_map_parts) - .repartition(histo_map_parts, "hpid") - .groupBy("hpid") - .apply(reduce_histos) - .groupBy() - .agg(agg_histos("histos")) - .toPandas(), - ) - - -spark_executor = SparkExecutor() diff --git a/src/coffea/processor/templates/__init__.py b/src/coffea/processor/templates/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/coffea/processor/templates/spark.py.tmpl b/src/coffea/processor/templates/spark.py.tmpl deleted file mode 100644 index 5e6e5f818..000000000 --- a/src/coffea/processor/templates/spark.py.tmpl +++ /dev/null @@ -1,24 +0,0 @@ -global coffea_udf - - -def coffea_udf(dataset: pd.Series, {% for col in cols %}{{col + ": 
pd.Series"}}{{ "," if not loop.last }}{% endfor %}): - global processor_instance, lz4_clevel, nano_schema - - columns = [{% for col in cols %}awkward.Array({{col}}){{ "," if not loop.last }}{% endfor %}] - names = [{% for col in cols %}{{"'"|safe+col+"'"|safe}}{{ "," if not loop.last }}{% endfor %}] - - size = len(dataset) - src = SimplePreloadedColumnSource(dict(zip(names, columns)), None, size, object_path='/Events') - - events = NanoEventsFactory \ - .from_preloaded(src, metadata={'dataset': dataset[0]}, schemaclass=nano_schema) \ - .events() - - vals = processor_instance.process(events) - - valsblob = lz4.frame.compress(pickle.dumps(vals), compression_level=lz4_clevel) - - outs = numpy.full(shape=(size, ), fill_value=b'', dtype='O') - outs[0] = valsblob - - return pandas.Series(outs) diff --git a/tests/test_local_executors.py b/tests/test_local_executors.py deleted file mode 100644 index ccc35f5f4..000000000 --- a/tests/test_local_executors.py +++ /dev/null @@ -1,125 +0,0 @@ -import os.path as osp -import sys - -import pytest - -from coffea import processor -from coffea.nanoevents import schemas -from coffea.processor.executor import UprootMissTreeError - -if sys.platform.startswith("win"): - pytest.skip("skipping tests that only function in linux", allow_module_level=True) - - -@pytest.mark.parametrize("filetype", ["root", "parquet"]) -@pytest.mark.parametrize("skipbadfiles", [True, False]) -@pytest.mark.parametrize("maxchunks", [1, None]) -@pytest.mark.parametrize("chunksize", [100000, 5]) -@pytest.mark.parametrize("schema", [None, schemas.BaseSchema]) -@pytest.mark.parametrize( - "executor", [processor.IterativeExecutor] # , processor.FuturesExecutor -) -def test_dataframe_analysis( - executor, schema, chunksize, maxchunks, skipbadfiles, filetype -): - from coffea.processor.test_items import NanoTestProcessor - - if schema is not None and filetype == "parquet": - pytest.xfail("parquet nanoevents not supported yet") - - filelist = { - "ZJets": {"files": [osp.abspath(f"tests/samples/nano_dy.{filetype}")]}, - "Data": {"files": [osp.abspath(f"tests/samples/nano_dimuon.{filetype}")]}, - } - - executor = executor() - run = processor.Runner( - executor=executor, - schema=schema, - chunksize=chunksize, - maxchunks=maxchunks, - skipbadfiles=skipbadfiles, - format=filetype, - ) - - hists = run(filelist, "Events", processor_instance=NanoTestProcessor()) - - if maxchunks is None: - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - else: - assert maxchunks == 1 - print(hists["cutflow"]["ZJets_pt"]) - assert hists["cutflow"]["ZJets_pt"] == (18 if chunksize == 100_000 else 2) - assert hists["cutflow"]["ZJets_mass"] == (6 if chunksize == 100_000 else 1) - assert hists["cutflow"]["Data_pt"] == (84 if chunksize == 100_000 else 13) - assert hists["cutflow"]["Data_mass"] == (66 if chunksize == 100_000 else 12) - - -@pytest.mark.parametrize("filetype", ["root", "parquet"]) -@pytest.mark.parametrize("skipbadfiles", [True, False]) -@pytest.mark.parametrize("maxchunks", [None, 1000]) -@pytest.mark.parametrize("compression", [None, 0, 2]) -@pytest.mark.parametrize( - "executor", [processor.IterativeExecutor] # , processor.FuturesExecutor -) -def test_nanoevents_analysis(executor, compression, maxchunks, skipbadfiles, filetype): - from coffea.processor.test_items import NanoEventsProcessor - - if filetype == "parquet": - pytest.xfail("parquet nanoevents not supported yet") - - 
filelist = { - "DummyBadMissingFile": { - "treename": "Events", - "files": [osp.abspath(f"tests/samples/non_existent.{filetype}")], - }, - "ZJetsBadMissingTree": { - "treename": "NotEvents", - "files": [ - osp.abspath(f"tests/samples/nano_dy.{filetype}"), - osp.abspath(f"tests/samples/nano_dy_SpecialTree.{filetype}"), - ], - }, - "ZJetsBadMissingTreeAllFiles": { - "treename": "NotEvents", - "files": [osp.abspath(f"tests/samples/nano_dy.{filetype}")], - }, - "ZJets": { - "treename": "Events", - "files": [osp.abspath(f"tests/samples/nano_dy.{filetype}")], - "metadata": {"checkusermeta": True, "someusermeta": "hello"}, - }, - "Data": { - "treename": "Events", - "files": [osp.abspath(f"tests/samples/nano_dimuon.{filetype}")], - "metadata": {"checkusermeta": True, "someusermeta2": "world"}, - }, - } - - executor = executor(compression=compression) - run = processor.Runner( - executor=executor, - skipbadfiles=skipbadfiles, - schema=processor.NanoAODSchema, - maxchunks=maxchunks, - format=filetype, - ) - - if skipbadfiles: - hists = run(filelist, "Events", processor_instance=NanoEventsProcessor()) - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["ZJetsBadMissingTree_pt"] == 18 - assert hists["cutflow"]["ZJetsBadMissingTree_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - else: - LookForError = (FileNotFoundError, UprootMissTreeError) - with pytest.raises(LookForError): - hists = run(filelist, "Events", processor_instance=NanoEventsProcessor()) - with pytest.raises(LookForError): - hists = run(filelist, "NotEvents", processor_instance=NanoEventsProcessor()) diff --git a/tests/test_workitem.py b/tests/test_workitem.py deleted file mode 100644 index 205c4d161..000000000 --- a/tests/test_workitem.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 - -from coffea.processor.executor import WorkItem - - -def test_work_item(): - item1 = WorkItem("TestDataSet", "/a/b/c.root", "Events", 500, 670, "abc", {}) - item2 = WorkItem( - "TestDataSet", "/a/b/c.root", "Events", 500, 670, "abc", {"meta": "data"} - ) - item3 = WorkItem("TestDataSet", "/a/b/c.root", "Events", 500, 760, "abc", {}) - - assert item1 == item1 - assert item1 == item2 - assert item1 != item3 - assert item1.dataset == "TestDataSet" - assert item1.filename == "/a/b/c.root" - assert item1.treename == "Events" - assert item1.entrystart == 500 - assert item1.entrystop == 670 - assert item1.fileuuid == "abc" - assert len(item1) == 670 - 500 - assert len(item3) == 760 - 500 - - # Test if hashable - hash(item2) - - # Test if usermeta is mutable - item1.usermeta["user"] = "meta" diff --git a/tests/wq.py b/tests/wq.py deleted file mode 100755 index 888cd4b01..000000000 --- a/tests/wq.py +++ /dev/null @@ -1,66 +0,0 @@ -import sys - -try: - import work_queue as wq - - work_queue_port = 9123 -except ImportError: - print("work_queue is not installed. 
Omitting test.") - sys.exit(0) - - -def template_analysis(environment_file, filelist, executor): - from coffea.processor import Runner - from coffea.processor.test_items import NanoTestProcessor - - executor = executor( - environment_file=environment_file, - cores=2, - memory=500, # MB - disk=1000, # MB - manager_name="coffea_test", - port=work_queue_port, - print_stdout=True, - ) - - run = Runner(executor) - - hists = run(filelist, "Events", NanoTestProcessor()) - - print(hists) - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - -def work_queue_example(environment_file): - from coffea.processor import WorkQueueExecutor - - # Work Queue does not allow absolute paths - filelist = { - "ZJets": ["./samples/nano_dy.root"], - "Data": ["./samples/nano_dimuon.root"], - } - - workers = wq.Factory( - batch_type="local", manager_host_port=f"localhost:{work_queue_port}" - ) - workers.max_workers = 1 - workers.min_workers = 1 - workers.cores = 4 - workers.memory = 1000 # MB - workers.disk = 4000 # MB - - with workers: - template_analysis(environment_file, filelist, WorkQueueExecutor) - - -if __name__ == "__main__": - try: - # see https://coffeateam.github.io/coffea/wq.html for constructing an - # environment that can be shipped with a task. - environment_file = sys.argv[1] - except IndexError: - environment_file = None - work_queue_example(environment_file) From 707195eecbaf8117aaf39591bd9ebc5164619e1d Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 17:01:29 -0600 Subject: [PATCH 27/80] disable workqueue tests (to be replaced with taskvine) --- .github/workflows/ci.yml | 54 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ea61615a7..a5263f459 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -126,33 +126,33 @@ jobs: env: GH_PAT: ${{ secrets.GITHUB_OAUTH }} - testwq: - runs-on: ubuntu-latest - needs: pre-commit - strategy: - matrix: - python-version: ["3.11"] - name: test coffea-workqueue - - steps: - - uses: actions/checkout@v4 - - name: Set up Conda - uses: conda-incubator/setup-miniconda@v2 - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - channels: conda-forge - - name: Test work_queue - shell: bash -l {0} - run: | - conda create --yes --name coffea-env -c conda-forge python=${{ matrix.python-version }} ndcctools dill conda-pack conda - conda activate coffea-env - python -m pip install --ignore-installed . - cd tests - conda-pack --output coffea-env.tar.gz - python wq.py coffea-env.tar.gz +# testwq: +# runs-on: ubuntu-latest +# needs: pre-commit +# strategy: +# matrix: +# python-version: ["3.11"] +# name: test coffea-workqueue +# +# steps: +# - uses: actions/checkout@v4 +# - name: Set up Conda +# uses: conda-incubator/setup-miniconda@v2 +# env: +# ACTIONS_ALLOW_UNSECURE_COMMANDS: true +# with: +# auto-update-conda: true +# python-version: ${{ matrix.python-version }} +# channels: conda-forge +# - name: Test work_queue +# shell: bash -l {0} +# run: | +# conda create --yes --name coffea-env -c conda-forge python=${{ matrix.python-version }} ndcctools dill conda-pack conda +# conda activate coffea-env +# python -m pip install --ignore-installed . 
+# cd tests +# conda-pack --output coffea-env.tar.gz +# python wq.py coffea-env.tar.gz # testskyhookjob: # runs-on: ubuntu-latest From 2c65b36ce3ff16b874998f3895d1d1334e817f5b Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 17:08:05 -0600 Subject: [PATCH 28/80] patch up ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5263f459..5534495d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -201,7 +201,7 @@ jobs: password: ${{ secrets.PYPI_TOKEN }} pass: - needs: [test, testwq] + needs: [test] runs-on: ubuntu-latest steps: - run: echo "All jobs passed" From cc22be89ae54dbc41758f0706460a910eed88901 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 17:09:56 -0600 Subject: [PATCH 29/80] more testwq removal --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5534495d4..cc635a4a6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -178,7 +178,7 @@ jobs: release: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: ubuntu-latest - needs: [test, testwq] + needs: [test] strategy: matrix: python-version: ["3.11"] From dfed4b9282eaf78cd59bf0f1a5f39606955cf8ed Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 17:46:47 -0600 Subject: [PATCH 30/80] adjustments to removing executor / lazydataframe --- src/coffea/processor/__init__.py | 4 +- tests/test_parsl.py | 144 ------------------------------- tests/test_processor.py | 117 ------------------------- tests/test_spark.py | 136 ----------------------------- 4 files changed, 3 insertions(+), 398 deletions(-) delete mode 100644 tests/test_parsl.py delete mode 100644 tests/test_spark.py diff --git a/src/coffea/processor/__init__.py b/src/coffea/processor/__init__.py index 91b6f357d..62b533d31 100644 --- a/src/coffea/processor/__init__.py +++ b/src/coffea/processor/__init__.py @@ -2,11 +2,13 @@ """ -from .accumulator import AccumulatorABC, dict_accumulator +from .accumulator import AccumulatorABC, accumulate, dict_accumulator, value_accumulator from .processor import ProcessorABC __all__ = [ "dict_accumulator", + "value_accumulator", + "accumulate", "AccumulatorABC", "ProcessorABC", ] diff --git a/tests/test_parsl.py b/tests/test_parsl.py deleted file mode 100644 index c879e80be..000000000 --- a/tests/test_parsl.py +++ /dev/null @@ -1,144 +0,0 @@ -import multiprocessing -import sys - -import pytest - -from coffea import processor - - -def test_parsl_start_stop(): - pytest.importorskip("parsl", minversion="0.7.2") - - from coffea.processor.parsl.detail import ( - _default_cfg, - _parsl_initialize, - _parsl_stop, - ) - - _parsl_initialize(config=_default_cfg) - - _parsl_stop() - - -def do_parsl_job(filelist, flatten=False, compression=0, config=None): - from coffea.processor.test_items import NanoTestProcessor - - executor = processor.ParslExecutor(compression=compression, config=config) - run = processor.Runner(executor=executor) - - hists = run(filelist, "Events", processor_instance=NanoTestProcessor()) - - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - -# @pytest.mark.skipif(sys.platform.startswith('darwin'), reason='parsl htex not working on osx again') -def test_parsl_htex_executor(): - 
pytest.importorskip("parsl", minversion="0.7.2") - import os - import os.path as osp - - import parsl - from parsl.channels import LocalChannel - from parsl.config import Config - from parsl.executors import HighThroughputExecutor - from parsl.providers import LocalProvider - - parsl_config = Config( - executors=[ - HighThroughputExecutor( - label="coffea_parsl_default", - address="127.0.0.1", - cores_per_worker=max(multiprocessing.cpu_count() // 2, 1), - max_workers=1, - provider=LocalProvider( - channel=LocalChannel(), - init_blocks=1, - max_blocks=1, - nodes_per_block=1, - ), - ) - ], - strategy=None, - ) - parsl.load(parsl_config) - - filelist = { - "ZJets": [osp.join(os.getcwd(), "tests/samples/nano_dy.root")], - "Data": [osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")], - } - - do_parsl_job(filelist) - do_parsl_job(filelist, compression=1) - - filelist = { - "ZJets": { - "treename": "Events", - "files": [osp.join(os.getcwd(), "tests/samples/nano_dy.root")], - }, - "Data": { - "treename": "Events", - "files": [osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")], - }, - } - - do_parsl_job(filelist) - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="signals are different on windows" -) -def test_timeout(): - import signal - - from coffea.processor.parsl.timeout import timeout - - @timeout - def too_long(timeout=None): - import time - - time.sleep(20) - - @timeout - def make_except(timeout=None): - import time - - time.sleep(1) - raise Exception("oops!") - - try: - too_long(timeout=5) - except Exception as e: - assert e.args[0] == "Timeout hit" - - try: - make_except(timeout=20) - except Exception as e: - assert e.args[0] == "oops!" - - # reset alarms for other tests, this is suspicious - signal.alarm(0) - - -def test_parsl_condor_cfg(): - pytest.importorskip("parsl", minversion="0.7.2") - - from coffea.processor.parsl.condor_config import condor_config - - print(condor_config()) - - -def test_parsl_slurm_cfg(): - pytest.importorskip("parsl", minversion="0.7.2") - import os - - x509_proxy = "x509up_u%s" % (os.getuid()) - fname = "/tmp/%s" % x509_proxy - with open(fname, "w+"): - os.utime(fname, None) - - from coffea.processor.parsl.slurm_config import slurm_config - - print(slurm_config()) diff --git a/tests/test_processor.py b/tests/test_processor.py index 732762b08..b5d836b67 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -1,6 +1,3 @@ -import os.path as osp -import sys - import pytest @@ -28,117 +25,3 @@ def postprocess(self, accumulator): acc = None super(test, proc).postprocess(acc) - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="problems with paths on windows" -) -def test_lazy_dataframe(): - import uproot - - from coffea.processor import LazyDataFrame - - tree = uproot.open(osp.abspath("tests/samples/nano_dy.root"))["Events"] - entrystart = 0 - entrystop = 100 - - df = LazyDataFrame(tree, entrystart, entrystop, preload_items=["nMuon"]) - - assert len(df) == 1 - - pt = df["Muon_pt"] - assert len(df) == 2 - df["Muon_pt_up"] = pt * 1.05 - assert len(df) == 3 - assert "Muon_pt" in df.materialized - - assert "Muon_eta" in df.available - - assert df.size == tree.num_entries - - with pytest.raises(KeyError): - df["notthere"] - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="problems with paths on windows" -) -def test_lazy_dataframe_getattr(): - import uproot - - from coffea.processor import LazyDataFrame - - tree = uproot.open(osp.abspath("tests/samples/nano_dy.root"))["Events"] - entrystart = 0 - 
entrystop = 100 - - df = LazyDataFrame(tree, entrystart, entrystop, preload_items=["nMuon"]) - - assert len(df) == 1 - - df.Muon_pt - assert len(df) == 2 - assert "Muon_pt" in df.materialized - - assert "Muon_eta" in df.available - - assert df.size == tree.num_entries - - with pytest.raises(AttributeError): - df.notthere - - import copy - - df2 = copy.copy(df) - df2.Muon_pt - - with pytest.raises(AttributeError): - df2.notthere - - -def test_processor_newaccumulator(): - from coffea.processor import ( - IterativeExecutor, - ProcessorABC, - defaultdict_accumulator, - ) - - class Test(ProcessorABC): - def process(self, item): - return {"itemsum": item} - - def postprocess(self, accumulator): - pass - - proc = Test() - - exe = IterativeExecutor() - out = exe( - range(10), - proc.process, - None, - ) - assert out == ({"itemsum": 45}, 0) - - class TestOldStyle(ProcessorABC): - @property - def accumulator(self): - return defaultdict_accumulator(int) - - def process(self, item): - out = self.accumulator.identity() - out["itemsum"] += item - return out - - def postprocess(self, accumulator): - pass - - proc = TestOldStyle() - - exe = IterativeExecutor() - out = exe( - range(10), - proc.process, - proc.accumulator, - ) - assert out[0]["itemsum"] == 45 diff --git a/tests/test_spark.py b/tests/test_spark.py deleted file mode 100644 index 25581213a..000000000 --- a/tests/test_spark.py +++ /dev/null @@ -1,136 +0,0 @@ -import pytest - - -def test_spark_imports(): - pytest.importorskip("pyspark", minversion="3.3.0") - - from coffea.processor.spark.detail import _spark_initialize, _spark_stop - - spark = _spark_initialize(bindAddress="127.0.0.1", host="127.0.0.1") - _spark_stop(spark) - - -@pytest.mark.skip(reason="pyspark executor work currently in progress") -def test_spark_executor(): - pyspark = pytest.importorskip("pyspark", minversion="3.3.0") - import os - import os.path as osp - - import pyspark.sql - from pyarrow.util import guid - - from coffea.nanoevents import schemas - from coffea.processor import run_spark_job - from coffea.processor.spark.detail import _spark_initialize, _spark_stop - - spark_config = ( - pyspark.sql.SparkSession.builder.appName("spark-executor-test-%s" % guid()) - .master("local[*]") - .config("spark.sql.execution.arrow.enabled", "true") - .config("spark.driver.host", "127.0.0.1") - .config("spark.driver.bindAddress", "127.0.0.1") - .config("spark.executor.x509proxyname", "x509_u12409") - .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000) - ) - - spark = _spark_initialize( - config=spark_config, log_level="ERROR", spark_progress=False - ) - - filelist = { - "ZJets": { - "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")], - "treename": "Events", - }, - "Data": { - "files": [ - "file:" + osp.join(os.getcwd(), "tests/samples/nano_dimuon.root") - ], - "treename": "Events", - }, - } - - from coffea.processor.spark.spark_executor import spark_executor - from coffea.processor.test_items import NanoEventsProcessor, NanoTestProcessor - - columns = ["nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"] - proc = NanoTestProcessor(columns=columns) - - hists = run_spark_job( - filelist, - processor_instance=proc, - executor=spark_executor, - spark=spark, - thread_workers=1, - executor_args={"file_type": "root"}, - ) - - assert sum(spark_executor.counts.values()) == 80 - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert 
hists["cutflow"]["Data_mass"] == 66 - - hists = run_spark_job( - filelist, - processor_instance=proc, - executor=spark_executor, - spark=spark, - thread_workers=1, - executor_args={"file_type": "root"}, - ) - - assert sum(spark_executor.counts.values()) == 80 - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - proc = NanoEventsProcessor(columns=columns) - hists = run_spark_job( - filelist, - processor_instance=proc, - executor=spark_executor, - spark=spark, - thread_workers=1, - executor_args={"file_type": "root", "schema": schemas.NanoAODSchema}, - ) - - _spark_stop(spark) - - assert sum(spark_executor.counts.values()) == 80 - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - -def test_spark_hist_adders(): - pytest.importorskip("pyspark", minversion="3.3.0") - - import pickle as pkl - - import lz4.frame as lz4f - import pandas as pd - - from coffea.processor.spark.spark_executor import agg_histos_raw, reduce_histos_raw - from coffea.processor.test_items import NanoTestProcessor - from coffea.util import numpy as np - - proc = NanoTestProcessor() - - one = proc.accumulator - two = proc.accumulator - hlist1 = [lz4f.compress(pkl.dumps(one))] - hlist2 = [lz4f.compress(pkl.dumps(one)), lz4f.compress(pkl.dumps(two))] - harray1 = np.array(hlist1, dtype="O") - harray2 = np.array(hlist2, dtype="O") - - series1 = pd.Series(harray1) - series2 = pd.Series(harray2) - df = pd.DataFrame({"histos": harray2}) - - # correctness of these functions is checked in test_spark_executor - agg_histos_raw(series1, 1) - agg_histos_raw(series2, 1) - reduce_histos_raw(df, 1) From a6619875b02a71bec69f51320938d194ff8425b9 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 18:15:32 -0600 Subject: [PATCH 31/80] accumulator tests --- src/coffea/processor/__init__.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/coffea/processor/__init__.py b/src/coffea/processor/__init__.py index 62b533d31..465fc0ed0 100644 --- a/src/coffea/processor/__init__.py +++ b/src/coffea/processor/__init__.py @@ -2,11 +2,24 @@ """ -from .accumulator import AccumulatorABC, accumulate, dict_accumulator, value_accumulator +from .accumulator import ( + AccumulatorABC, + accumulate, + column_accumulator, + defaultdict_accumulator, + dict_accumulator, + list_accumulator, + set_accumulator, + value_accumulator, +) from .processor import ProcessorABC __all__ = [ + "column_accumulator", + "defaultdict_accumulator", "dict_accumulator", + "list_accumulator", + "set_accumulator", "value_accumulator", "accumulate", "AccumulatorABC", From 080f2af1c09f287a91bf37e6f1fc4601ca29a1c8 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 22 Nov 2023 11:46:10 -0600 Subject: [PATCH 32/80] also allow Callables in apply_to_fileset --- src/coffea/dataset_tools/apply_processor.py | 25 ++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 610bae8fa..798c73623 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -1,9 +1,15 @@ +import copy +from typing import Callable, Union + from coffea.nanoevents import NanoAODSchema, NanoEventsFactory from coffea.processor import ProcessorABC def 
apply_to_one_dataset( - proc: ProcessorABC, dataset, schemaclass=NanoAODSchema, metadata={} + data_manipulation: Union[ProcessorABC, Callable], + dataset, + schemaclass=NanoAODSchema, + metadata={}, ): files = dataset["files"] events = NanoEventsFactory.from_root( @@ -11,13 +17,22 @@ def apply_to_one_dataset( metadata=metadata, schemaclass=schemaclass, ).events() - return proc.process(events) + if isinstance(data_manipulation, ProcessorABC): + return data_manipulation.process(events) + elif isinstance(data_manipulation, Callable): + return data_manipulation(events) + else: + raise ValueError("data_manipulation must either be a ProcessorABC or Callable") -def apply_to_fileset(proc: ProcessorABC, fileset, schemaclass=NanoAODSchema): +def apply_to_fileset( + data_manipulation: Union[ProcessorABC, Callable], fileset, schemaclass=NanoAODSchema +): out = {} for name, dataset in fileset.items(): - metadata = dataset.get("metadata", {}).copy() + metadata = copy.deepcopy(dataset.get("metadata", {})) metadata["dataset"] = name - out[name] = apply_to_one_dataset(proc, dataset, schemaclass, metadata) + out[name] = apply_to_one_dataset( + data_manipulation, dataset, schemaclass, metadata + ) return out From 4a600536bbb5e4f2549aab52ebbf88bdf88e8657 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 27 Nov 2023 13:43:50 -0600 Subject: [PATCH 33/80] add dataset tools test --- tests/test_dataset_tools.py | 220 ++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 tests/test_dataset_tools.py diff --git a/tests/test_dataset_tools.py b/tests/test_dataset_tools.py new file mode 100644 index 000000000..ca46c0060 --- /dev/null +++ b/tests/test_dataset_tools.py @@ -0,0 +1,220 @@ +import dask +import pytest +from distributed import Client + +from coffea.dataset_tools import apply_to_fileset, max_chunks, preprocess, slice_chunks +from coffea.nanoevents import BaseSchema, NanoAODSchema +from coffea.processor.test_items import NanoEventsProcessor, NanoTestProcessor + +_starting_fileset = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": "Events", + "tests/samples/nano_dimuon_not_there.root": "Events", + } + }, +} + +_runnable_result = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + } + } + }, +} + +_updated_result = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + }, + "tests/samples/nano_dimuon_not_there.root": { + "object_path": "Events", + "steps": None, + "uuid": None, + }, + } + }, +} + + +@pytest.mark.parametrize( + 
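For orientation, an illustrative sketch (not taken from the patch itself) of the two call styles that apply_to_fileset now supports: a ProcessorABC instance, or any bare callable acting on the events of one dataset. The simple {path: treename} fileset form and the sample file path below are assumptions made for the example.

    import dask

    from coffea.dataset_tools import apply_to_fileset
    from coffea.nanoevents import NanoAODSchema
    from coffea.processor.test_items import NanoEventsProcessor

    fileset = {
        "ZJets": {"files": {"tests/samples/nano_dy.root": "Events"}},
    }

    # 1) the existing route: a ProcessorABC instance
    to_compute = apply_to_fileset(
        NanoEventsProcessor(), fileset, schemaclass=NanoAODSchema
    )

    # 2) the new route: any callable taking the events of one dataset
    to_compute_pt = apply_to_fileset(
        lambda events: events.Muon.pt, fileset, schemaclass=NanoAODSchema
    )

    (out,) = dask.compute(to_compute)         # {"ZJets": <processor output>}
    (muon_pt,) = dask.compute(to_compute_pt)  # {"ZJets": <muon pt array>}
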
"proc_and_schema", + [(NanoTestProcessor, BaseSchema), (NanoEventsProcessor, NanoAODSchema)], +) +def test_apply_to_fileset(proc_and_schema): + proc, schemaclass = proc_and_schema + + with Client() as _: + to_compute = apply_to_fileset( + proc(), + _runnable_result, + schemaclass=schemaclass, + ) + out = dask.compute(to_compute)[0] + + assert out["ZJets"]["cutflow"]["ZJets_pt"] == 18 + assert out["ZJets"]["cutflow"]["ZJets_mass"] == 6 + assert out["Data"]["cutflow"]["Data_pt"] == 84 + assert out["Data"]["cutflow"]["Data_mass"] == 66 + + to_compute = apply_to_fileset( + proc(), + max_chunks(_runnable_result, 1), + schemaclass=schemaclass, + ) + out = dask.compute(to_compute)[0] + + assert out["ZJets"]["cutflow"]["ZJets_pt"] == 5 + assert out["ZJets"]["cutflow"]["ZJets_mass"] == 2 + assert out["Data"]["cutflow"]["Data_pt"] == 17 + assert out["Data"]["cutflow"]["Data_mass"] == 14 + + +def test_preprocess(): + with Client() as _: + starting_fileset = _starting_fileset + + dataset_runnable, dataset_updated = preprocess( + starting_fileset, + maybe_step_size=7, + align_clusters=False, + files_per_batch=10, + skip_bad_files=True, + ) + + assert dataset_runnable == _runnable_result + assert dataset_updated == _updated_result + + +def test_preprocess_failed_file(): + with Client() as _, pytest.raises(FileNotFoundError): + starting_fileset = _starting_fileset + + dataset_runnable, dataset_updated = preprocess( + starting_fileset, + maybe_step_size=7, + align_clusters=False, + files_per_batch=10, + skip_bad_files=False, + ) + + +def test_maxchunks(): + max_chunked = max_chunks(_runnable_result, 3) + + assert max_chunked == { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [[0, 7], [7, 14], [14, 21]], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [[0, 7], [7, 14], [14, 21]], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + } + } + }, + } + + +def test_slicechunks(): + slice_chunked = slice_chunks(_runnable_result, slice(None, None, 2)) + + assert slice_chunked == { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [[0, 7], [14, 21], [28, 35]], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [[0, 7], [14, 21], [28, 35]], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + } + } + }, + } From 98a719cb98a8137a8e5cd1909c2c34b9f71777c2 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Tue, 28 Nov 2023 16:33:32 +0100 Subject: [PATCH 34/80] Make scope of dataset_query less cms-only. 
Edits language --- src/coffea/dataset_tools/dataset_query.py | 54 +++++++++++------------ src/coffea/dataset_tools/rucio_utils.py | 37 +++++++++------- 2 files changed, 49 insertions(+), 42 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 91b3910c7..a4d53df20 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -67,8 +67,8 @@ def __init__(self): self.last_query = "" self.last_query_tree = None self.last_query_list = None - self.sites_whitelist = None - self.sites_blacklist = None + self.sites_allowlist = None + self.sites_blocklist = None self.sites_regex = None self.last_replicas_results = None @@ -98,7 +98,7 @@ def do_query(self, args): # Your code here with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): outlist, outtree = rucio_utils.query_dataset( - args.arg_list[0], client=self.rucio_client, tree=True + args.arg_list[0], client=self.rucio_client, tree=True, scope="cms" #TODO configure scope ) # Now let's print the results as a tree print_dataset_query(args, outtree, self.selected_datasets, self.console) @@ -176,8 +176,8 @@ def do_replicas(self, args): ): outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas( dataset, - whitelist_sites=self.sites_whitelist, - blacklist_sites=self.sites_blacklist, + allowlist_sites=self.sites_allowlist, + blocklist_sites=self.sites_blocklist, regex_sites=self.sites_regex, mode="full", client=self.rucio_client, @@ -263,22 +263,22 @@ def do_replicas(self, args): T.add(f"[cyan]{f}") self.console.print(tree) - def do_whitelist_sites(self, args): - if self.sites_whitelist is None: - self.sites_whitelist = args.arg_list + def do_allowlist_sites(self, args): + if self.sites_allowlist is None: + self.sites_allowlist = args.arg_list else: - self.sites_whitelist += args.arg_list - print("[green]Whitelisted sites:") - for s in self.sites_whitelist: + self.sites_allowlist += args.arg_list + print("[green]Allowlisted sites:") + for s in self.sites_allowlist: print(f"- {s}") - def do_blacklist_sites(self, args): - if self.sites_blacklist is None: - self.sites_blacklist = args.arg_list + def do_blocklist_sites(self, args): + if self.sites_blocklist is None: + self.sites_blocklist = args.arg_list else: - self.sites_blacklist += args.arg_list - print("[red]Blacklisted sites:") - for s in self.sites_blacklist: + self.sites_blocklist += args.arg_list + print("[red]Blocklisted sites:") + for s in self.sites_blocklist: print(f"- {s}") def do_regex_sites(self, args): @@ -291,20 +291,20 @@ def do_regex_sites(self, args): def do_sites_filters(self, args): if args == "": - print("[green bold]Whitelisted sites:") - if self.sites_whitelist: - for s in self.sites_whitelist: + print("[green bold]Allow-listed sites:") + if self.sites_allowlist: + for s in self.sites_allowlist: print(f"- {s}") - print("[bold red]Blacklisted sites:") - if self.sites_blacklist: - for s in self.sites_blacklist: + print("[bold red]Block-listed sites:") + if self.sites_blocklist: + for s in self.sites_blocklist: print(f"- {s}") print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") if args == "clear": - self.sites_whitelist = None - self.sites_blacklist = None + self.sites_allowlist = None + self.sites_blocklist = None self.sites_regex = None print("[bold green]Sites filters cleared") @@ -353,8 +353,8 @@ def do_save(self, args): - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset - [bold 
cyan]sites_filters[/]: show the active sites filters - [bold cyan]sites_filters clear[/]: clear all the active sites filters - - [bold cyan]whitelist_sites[/]: Select sites to whitelist for replica queries - - [bold cyan]blacklist_sites[/]: Select sites to blacklist for replica queries + - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries + - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" - [bold cyan]save (S) file.yaml[/]: Save the replicas results to file for further processing - [bold cyan]help[/]: get help! diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 96b17b454..f4a3f3a7d 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -4,12 +4,17 @@ import re import subprocess from collections import defaultdict +import tomli from rucio.client import Client # Rucio needs the default configuration --> taken from CMS cvmfs defaults -os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" +if not "RUCIO_HOME" in os.environ: + os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" +# with open(f"{os.environ['RUCIO_HOME']}/etc/rucio.cfg", "rb") as f: +# rucio_cfg = tomli.load(f) +# print(rucio_cfg) def get_proxy_path() -> str: """ @@ -125,19 +130,20 @@ def _get_pfn_for_site(path, rules): def get_dataset_files_replicas( dataset, - whitelist_sites=None, - blacklist_sites=None, + allowlist_sites=None, + blocklist_sites=None, regex_sites=None, mode="full", client=None, + scope="cms" ): """ This function queries the Rucio server to get information about the location of all the replicas of the files in a CMS dataset. The sites can be filtered in 3 different ways: - - `whilist_sites`: list of sites to select from. If the file is not found there, raise an Exception. - - `blacklist_sites`: list of sites to avoid. If the file has no left site, raise an Exception + - `allowlist_sites`: list of sites to select from. If the file is not found there, raise an Exception. + - `blocklist_sites`: list of sites to avoid. If the file has no left site, raise an Exception - `regex_sites`: regex expression to restrict the list of sites. 
The fileset returned by the function is controlled by the `mode` parameter: @@ -150,11 +156,12 @@ def get_dataset_files_replicas( ---------- dataset: str - whilelist_sites: list - blacklist_sites: list + allowlist_sites: list + blocklist_sites: list regex_sites: list mode: str, default "full" client: rucio Client, optional + scope: rucio scope, "cms" Returns ------- @@ -176,13 +183,13 @@ def get_dataset_files_replicas( client = client if client else get_rucio_client() outsites = [] outfiles = [] - for filedata in client.list_replicas([{"scope": "cms", "name": dataset}]): + for filedata in client.list_replicas([{"scope": scope, "name": dataset}]): outfile = [] outsite = [] rses = filedata["rses"] found = False - if whitelist_sites: - for site in whitelist_sites: + if allowlist_sites: + for site in allowlist_sites: if site in rses: # Check actual availability meta = filedata["pfns"][rses[site][0]] @@ -201,13 +208,13 @@ def get_dataset_files_replicas( if not found: raise Exception( - f"No SITE available in the whitelist for file {filedata['name']}" + f"No SITE available in the allowlist for file {filedata['name']}" ) else: possible_sites = list(rses.keys()) - if blacklist_sites: + if blocklist_sites: possible_sites = list( - filter(lambda key: key not in blacklist_sites, possible_sites) + filter(lambda key: key not in blocklist_sites, possible_sites) ) if len(possible_sites) == 0: @@ -275,11 +282,11 @@ def get_dataset_files_replicas( return outfiles, outsites, sites_counts -def query_dataset(query, client=None, tree=False): +def query_dataset(query, client=None, tree=False, scope="cms"): client = client if client else get_rucio_client() out = list( client.list_dids( - scope="cms", filters={"name": query, "type": "container"}, long=False + scope=scope, filters={"name": query, "type": "container"}, long=False ) ) if tree: From 8f669a000933280a6a61a80f5b08959d1e49f5c7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Nov 2023 15:36:21 +0000 Subject: [PATCH 35/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 5 ++++- src/coffea/dataset_tools/rucio_utils.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index a4d53df20..46236b805 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -98,7 +98,10 @@ def do_query(self, args): # Your code here with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): outlist, outtree = rucio_utils.query_dataset( - args.arg_list[0], client=self.rucio_client, tree=True, scope="cms" #TODO configure scope + args.arg_list[0], + client=self.rucio_client, + tree=True, + scope="cms", # TODO configure scope ) # Now let's print the results as a tree print_dataset_query(args, outtree, self.selected_datasets, self.console) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index f4a3f3a7d..2df9a62ec 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -4,8 +4,8 @@ import re import subprocess from collections import defaultdict -import tomli +import tomli from rucio.client import Client # Rucio needs the default configuration --> taken from CMS cvmfs defaults @@ -16,6 +16,7 @@ # rucio_cfg = tomli.load(f) # print(rucio_cfg) + def get_proxy_path() 
-> str: """ Checks if the VOMS proxy exists and if it is valid @@ -135,7 +136,7 @@ def get_dataset_files_replicas( regex_sites=None, mode="full", client=None, - scope="cms" + scope="cms", ): """ This function queries the Rucio server to get information about the location From 5e127c25911eea67178e48ec0b8b112fe259f530 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 11:08:29 -0600 Subject: [PATCH 36/80] remove accumulator concept --- src/coffea/analysis_tools.py | 13 +- src/coffea/processor/__init__.py | 18 -- src/coffea/processor/accumulator.py | 380 ---------------------------- src/coffea/processor/processor.py | 2 +- tests/test_accumulators.py | 184 -------------- 5 files changed, 10 insertions(+), 587 deletions(-) delete mode 100644 src/coffea/processor/accumulator.py delete mode 100644 tests/test_accumulators.py diff --git a/src/coffea/analysis_tools.py b/src/coffea/analysis_tools.py index facf14e97..0f88cab27 100644 --- a/src/coffea/analysis_tools.py +++ b/src/coffea/analysis_tools.py @@ -19,7 +19,7 @@ import coffea.util -class WeightStatistics(coffea.processor.AccumulatorABC): +class WeightStatistics: def __init__(self, sumw=0.0, sumw2=0.0, minw=numpy.inf, maxw=-numpy.inf, n=0): self.sumw = sumw self.sumw2 = sumw2 @@ -40,6 +40,13 @@ def add(self, other): self.maxw = max(self.maxw, other.maxw) self.n += other.n + def __add__(self, other): + temp = WeightStatistics(self.sumw, self.sumw2, self.minw, self.maxw, self.n) + return temp.add(other) + + def __iadd__(self, other): + return self.add(other) + class Weights: """Container for event weights and associated systematic shifts @@ -62,7 +69,7 @@ def __init__(self, size, storeIndividual=False): self._weight = None if size is None else numpy.ones(size) self._weights = {} self._modifiers = {} - self._weightStats = coffea.processor.dict_accumulator() + self._weightStats = {} self._storeIndividual = storeIndividual @property @@ -102,8 +109,6 @@ def __add_delayed(self, name, weight, weightUp, weightDown, shift): if self._storeIndividual: self._weights[name] = weight self.__add_variation(name, weight, weightUp, weightDown, shift) - if isinstance(self._weightStats, coffea.processor.dict_accumulator): - self._weightStats = {} self._weightStats[name] = { "sumw": dask_awkward.to_dask_array(weight).sum(), "sumw2": dask_awkward.to_dask_array(weight**2).sum(), diff --git a/src/coffea/processor/__init__.py b/src/coffea/processor/__init__.py index 465fc0ed0..888ed0cf2 100644 --- a/src/coffea/processor/__init__.py +++ b/src/coffea/processor/__init__.py @@ -2,26 +2,8 @@ """ -from .accumulator import ( - AccumulatorABC, - accumulate, - column_accumulator, - defaultdict_accumulator, - dict_accumulator, - list_accumulator, - set_accumulator, - value_accumulator, -) from .processor import ProcessorABC __all__ = [ - "column_accumulator", - "defaultdict_accumulator", - "dict_accumulator", - "list_accumulator", - "set_accumulator", - "value_accumulator", - "accumulate", - "AccumulatorABC", "ProcessorABC", ] diff --git a/src/coffea/processor/accumulator.py b/src/coffea/processor/accumulator.py deleted file mode 100644 index 8ad12dab1..000000000 --- a/src/coffea/processor/accumulator.py +++ /dev/null @@ -1,380 +0,0 @@ -import copy -import operator -from abc import ABCMeta, abstractmethod -from collections import defaultdict -from collections.abc import MutableMapping, MutableSet -from typing import Iterable, Optional, TypeVar, Union - -from dask.base import DaskMethodsMixin - -try: - from typing import Protocol, runtime_checkable # type: ignore 
-except ImportError: - from typing_extensions import Protocol # type: ignore - from typing import runtime_checkable - -import numpy - -T = TypeVar("T") - - -@runtime_checkable -class Addable(Protocol): - def __add__(self: T, other: T) -> T: - ... - - -Accumulatable = Union[Addable, MutableSet, MutableMapping] - - -def add(a: Accumulatable, b: Accumulatable) -> Accumulatable: - """Add two accumulatables together, without altering inputs - - This may make copies in certain situations - """ - if isinstance(a, Addable) and isinstance(b, Addable): - return operator.add(a, b) - if isinstance(a, MutableSet) and isinstance(b, MutableSet): - return operator.or_(a, b) - elif isinstance(a, MutableMapping) and isinstance(b, MutableMapping): - # capture type(X) by shallow copy and clear - # since we don't know the signature of type(X).__init__ - if isinstance(b, type(a)): - out = copy.copy(a) - elif isinstance(a, type(b)): - out = copy.copy(b) - else: - raise ValueError( - f"Cannot add two mappings of incompatible type ({type(a)} vs. {type(b)})" - ) - out.clear() - lhs, rhs = set(a), set(b) - # Keep the order of elements as far as possible - for key in a: - if key in rhs: - out[key] = add(a[key], b[key]) - else: - out[key] = ( - copy.deepcopy(a[key]) - if not isinstance(a[key], DaskMethodsMixin) - else copy.copy(a[key]) - ) - for key in b: - if key not in lhs: - out[key] = ( - copy.deepcopy(b[key]) - if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) - ) - return out - raise ValueError( - f"Cannot add accumulators of incompatible type ({type(a)} vs. {type(b)})" - ) - - -def iadd(a: Accumulatable, b: Accumulatable) -> Accumulatable: - """Add two accumulatables together, assuming the first is mutable""" - if isinstance(a, Addable) and isinstance(b, Addable): - return operator.iadd(a, b) - elif isinstance(a, MutableSet) and isinstance(b, MutableSet): - return operator.ior(a, b) - elif isinstance(a, MutableMapping) and isinstance(b, MutableMapping): - if not isinstance(b, type(a)): - raise ValueError( - f"Cannot add two mappings of incompatible type ({type(a)} vs. {type(b)})" - ) - lhs, rhs = set(a), set(b) - # Keep the order of elements as far as possible - for key in a: - if key in rhs: - a[key] = iadd(a[key], b[key]) - for key in b: - if key not in lhs: - a[key] = ( - copy.deepcopy(b[key]) - if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) - ) - return a - raise ValueError( - f"Cannot add accumulators of incompatible type ({type(a)} vs. {type(b)})" - ) - - -def accumulate( - items: Iterable[Optional[Accumulatable]], accum: Optional[Accumulatable] = None -) -> Optional[Accumulatable]: - gen = (x for x in items if x is not None) - try: - if accum is None: - accum = next(gen) - # we want to produce a new object so that the input is not mutated - accum = add(accum, next(gen)) - while True: - # subsequent additions can happen in-place, which may be more performant - accum = iadd(accum, next(gen)) - except StopIteration: - pass - return accum - - -async def async_accumulate(result_stream): - output = None - async for results in result_stream: - if output: - output = iadd(output, results) - else: - output = results - yield output - - -class AccumulatorABC(metaclass=ABCMeta): - """Abstract base class for an accumulator - - Accumulators are abstract objects that enable the reduce stage of the typical map-reduce - scaleout that we do in Coffea. One concrete example is a histogram. 
The idea is that an - accumulator definition holds enough information to be able to create an empty accumulator - (the ``identity()`` method) and add two compatible accumulators together (the ``add()`` method). - The former is not strictly necessary, but helps with book-keeping. Here we show an example usage - of a few accumulator types. An arbitrary-depth nesting of dictionary accumulators is supported, much - like the behavior of directories in ROOT hadd. - - After defining an accumulator:: - - from coffea.processor import dict_accumulator, column_accumulator, defaultdict_accumulator - from coffea.hist import Hist, Bin - import numpy as np - - adef = dict_accumulator({ - 'cutflow': defaultdict_accumulator(int), - 'pt': Hist("counts", Bin("pt", "$p_T$", 100, 0, 100)), - 'final_pt': column_accumulator(np.zeros(shape=(0,))), - }) - - Notice that this function does not mutate ``adef``:: - - def fill(n): - ptvals = np.random.exponential(scale=30, size=n) - cut = ptvals > 200. - acc = adef.identity() - acc['cutflow']['pt>200'] += cut.sum() - acc['pt'].fill(pt=ptvals) - acc['final_pt'] += column_accumulator(ptvals[cut]) - return acc - - As such, we can execute it several times in parallel and reduce the result:: - - import concurrent.futures - with concurrent.futures.ThreadPoolExecutor() as executor: - outputs = executor.map(fill, [2000, 2000]) - - combined = sum(outputs, adef.identity()) - - - Derived classes must implement - - ``identity()``: returns a new object of same type as self, - such that ``self + self.identity() == self`` - - ``add(other)``: adds an object of same type as self to self - - Concrete implementations are then provided for ``__add__``, ``__radd__``, and ``__iadd__``. - """ - - @abstractmethod - def identity(self): - """Identity of the accumulator - - A value such that any other value added to it will return - the other value - """ - pass - - @abstractmethod - def add(self, other): - """Add another accumulator to this one in-place""" - pass - - def __add__(self, other): - ret = self.identity() - ret.add(self) - ret.add(other) - return ret - - def __radd__(self, other): - ret = self.identity() - ret.add(other) - ret.add(self) - return ret - - def __iadd__(self, other): - self.add(other) - return self - - -class value_accumulator(AccumulatorABC): - """Holds a value of arbitrary type - - Parameters - ---------- - default_factory : callable - a function that returns an instance of the desired identity value - initial : bool, optional - an initial value, if the identity is not the desired initial value - """ - - def __init__(self, default_factory, initial=None): - self.value = default_factory() if initial is None else initial - self.default_factory = default_factory - - def __repr__(self): - if type(self.default_factory) is type: - defrepr = self.default_factory.__name__ - else: - defrepr = repr(self.default_factory) - return f"value_accumulator({defrepr}, {self.value!r})" - - def identity(self): - return value_accumulator(self.default_factory) - - def add(self, other): - if isinstance(other, value_accumulator): - self.value = self.value + other.value - else: - self.value = self.value + other - - -class list_accumulator(list, AccumulatorABC): - """A list with accumulator semantics - - See `list` for further info - """ - - def identity(self): - return list() - - def add(self, other): - """Add another accumulator to this one in-place""" - if isinstance(other, list): - list.extend(self, other) - else: - raise ValueError - - -class set_accumulator(set, AccumulatorABC): - """A set 
with accumulator semantics - - See `set` for further info - """ - - def identity(self): - return set_accumulator() - - def add(self, other): - """Add another accumulator to this one in-place - - Note - ---- - This replaces `set.add` behavior, unfortunately. - A workaround is to use `set.update`, e.g. ``a.update({'val'})`` - """ - if isinstance(other, MutableSet): - set.update(self, other) - else: - set.add(self, other) - - -class dict_accumulator(dict, AccumulatorABC): - """A dictionary with accumulator semantics - - See `dict` for further info. - It is assumed that the contents of the dict have accumulator semantics. - """ - - def identity(self): - ret = dict_accumulator() - for key, value in self.items(): - ret[key] = value.identity() - return ret - - def add(self, other): - if isinstance(other, MutableMapping): - for key, value in other.items(): - if key not in self: - if isinstance(value, AccumulatorABC): - self[key] = value.identity() - else: - raise ValueError - self[key] += value - else: - raise ValueError - - -class defaultdict_accumulator(defaultdict, AccumulatorABC): - """A defaultdict with accumulator semantics - - See `collections.defaultdict` for further info. - It is assumed that the contents of the dict have accumulator semantics - """ - - def identity(self): - return defaultdict_accumulator(self.default_factory) - - def add(self, other): - for key, value in other.items(): - self[key] += value - - -class column_accumulator(AccumulatorABC): - """An appendable numpy ndarray - - Parameters - ---------- - value : numpy.ndarray - The identity value array, which should be an empty ndarray - with the desired row shape. The column dimension will correspond to - the first index of `value` shape. - - Examples - -------- - If a set of accumulators is defined as:: - - a = column_accumulator(np.array([])) - b = column_accumulator(np.array([1., 2., 3.])) - c = column_accumulator(np.array([4., 5., 6.])) - - then: - - >>> a + b - column_accumulator(array([1., 2., 3.])) - >>> c + b + a - column_accumulator(array([4., 5., 6., 1., 2., 3.])) - """ - - def __init__(self, value): - if not isinstance(value, numpy.ndarray): - raise ValueError("column_accumulator only works with numpy arrays") - self._empty = numpy.zeros(dtype=value.dtype, shape=(0,) + value.shape[1:]) - self._value = value - - def __repr__(self): - return "column_accumulator(%r)" % self.value - - def identity(self): - return column_accumulator(self._empty) - - def add(self, other): - if not isinstance(other, column_accumulator): - raise ValueError("column_accumulator cannot be added to %r" % type(other)) - if other._empty.shape != self._empty.shape: - raise ValueError( - "Cannot add two column_accumulator objects of dissimilar shape (%r vs %r)" - % (self._empty.shape, other._empty.shape) - ) - self._value = numpy.concatenate((self._value, other._value)) - - @property - def value(self): - """The current value of the column - - Returns a numpy array where the first dimension is the column dimension - """ - return self._value diff --git a/src/coffea/processor/processor.py b/src/coffea/processor/processor.py index edc2d6162..6cbc427a8 100644 --- a/src/coffea/processor/processor.py +++ b/src/coffea/processor/processor.py @@ -22,7 +22,7 @@ def __init__(self, flag=False): self._flag = flag def process(self, events): - out = {"sumw": len(events)} + out = {"sumw": ak.num(events, axis=0)} # ... 
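With AccumulatorABC removed, processor outputs in this series become plain Python mappings whose values are ordinary (often dask-backed) objects that are combined by normal addition rather than by accumulator subclasses. The short sketch below is illustrative only and is not part of any patch in this series; merge_outputs and its inputs are hypothetical, and it assumes every value supports "+".

# Hypothetical sketch (not from these patches): combine two plain-dict
# processor outputs by adding their values key-by-key.
def merge_outputs(a, b):
    merged = dict(a)
    for key, value in b.items():
        merged[key] = merged[key] + value if key in merged else value
    return merged

print(merge_outputs({"sumw": 40, "nevents": 5}, {"sumw": 35, "nevents": 3}))
# {'sumw': 75, 'nevents': 8}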
diff --git a/tests/test_accumulators.py b/tests/test_accumulators.py deleted file mode 100644 index ccf5d1d11..000000000 --- a/tests/test_accumulators.py +++ /dev/null @@ -1,184 +0,0 @@ -from collections import defaultdict -from functools import partial - -import numpy as np -import pytest - -from coffea import processor - - -def test_accumulators(): - a = processor.value_accumulator(float) - a += 3.0 - assert a.value == 3.0 - assert a.identity().value == 0.0 - - a = processor.value_accumulator(partial(np.array, [2.0])) - a += 3.0 - assert np.array_equal(a.value, np.array([5.0])) - assert np.array_equal(a.identity().value, np.array([2.0])) - - lacc = processor.list_accumulator(range(4)) - lacc += [3] - lacc += processor.list_accumulator([1, 2]) - assert lacc == [0, 1, 2, 3, 3, 1, 2] - - b = processor.set_accumulator({"apples", "oranges"}) - b += {"pears"} - b += "grapes" - assert b == {"apples", "oranges", "pears", "grapes"} - - c = processor.dict_accumulator({"num": a, "fruit": b}) - c["num"] += 2.0 - c += processor.dict_accumulator( - { - "num2": processor.value_accumulator(int), - "fruit": processor.set_accumulator({"apples", "cherries"}), - } - ) - assert c["num2"].value == 0 - assert np.array_equal(c["num"].value, np.array([7.0])) - assert c["fruit"] == {"apples", "oranges", "pears", "grapes", "cherries"} - - d = processor.defaultdict_accumulator(float) - d["x"] = 0.0 - d["x"] += 4.0 - d["y"] += 5.0 - d["z"] += d["x"] - d["x"] += d["y"] - assert d["x"] == 9.0 - assert d["y"] == 5.0 - assert d["z"] == 4.0 - assert d["w"] == 0.0 - - f = processor.defaultdict_accumulator(lambda: 2.0) - f["x"] += 4.0 - assert f["x"] == 6.0 - - f += f - assert f["x"] == 12.0 - assert f["y"] == 2.0 - - a = processor.column_accumulator(np.arange(6).reshape(2, 3)) - b = processor.column_accumulator(np.arange(12).reshape(4, 3)) - a += b - assert a.value.sum() == 81 - - -def test_new_accumulators(): - a = processor.accumulate((0.0, 3.0)) - assert a == 3.0 - - a = processor.accumulate( - ( - np.array([2.0]), - 3.0, - ) - ) - assert np.array_equal(a, np.array([5.0])) - - lacc = processor.accumulate( - ( - list(range(4)), - [3], - [1, 2], - ) - ) - assert lacc == [0, 1, 2, 3, 3, 1, 2] - - b = processor.accumulate( - ( - {"apples", "oranges"}, - {"pears"}, - {"grapes"}, - ) - ) - assert b == {"apples", "oranges", "pears", "grapes"} - - c = processor.accumulate( - ( - {"num": a, "fruit": b}, - {"num": 2.0}, - { - "num2": 0, - "fruit": {"apples", "cherries"}, - }, - ) - ) - assert c["num2"] == 0 - assert np.array_equal(c["num"], np.array([7.0])) - assert c["fruit"] == {"apples", "oranges", "pears", "grapes", "cherries"} - - d = processor.accumulate( - ( - defaultdict(float), - {"x": 4.0, "y": 5.0}, - {"z": 4.0, "x": 5.0}, - ) - ) - assert d["x"] == 9.0 - assert d["y"] == 5.0 - assert d["z"] == 4.0 - # this is different than old style! - with pytest.raises(KeyError): - d["w"] - - f = processor.accumulate( - ( - defaultdict(lambda: 2.0), - defaultdict(lambda: 2, {"x": 4.0}), - ) - ) - assert f["x"] == 4.0 - assert f["y"] == 2.0 - - # this is different than old style! 
- f = processor.accumulate([f], f) - assert f["x"] == 8.0 - assert f["y"] == 4.0 - assert f["z"] == 2.0 - - a = processor.accumulate( - ( - processor.column_accumulator(np.arange(6).reshape(2, 3)), - processor.column_accumulator(np.arange(12).reshape(4, 3)), - ) - ) - assert a.value.sum() == 81 - - -def test_accumulator_types(): - class MyDict(dict): - pass - - out = processor.accumulate( - ( - {"x": 2}, - MyDict({"x": 3}), - ) - ) - assert type(out) is dict - - with pytest.raises(ValueError): - processor.accumulate( - ( - defaultdict(lambda: 2), - MyDict({"x": 3}), - ) - ) - - out = processor.accumulate( - ( - MyDict({"x": 3}), - {"x": 2}, - ) - ) - assert type(out) is dict - - with pytest.raises(ValueError): - processor.accumulate( - ( - MyDict({"x": 3}), - defaultdict(lambda: 2), - ) - ) From 1de78e605b2cf0576c294e137ffab7bc929315dc Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 11:55:27 -0600 Subject: [PATCH 37/80] typing for apply_to_dataset/fileset, use setdefault --- src/coffea/dataset_tools/__init__.py | 4 +-- src/coffea/dataset_tools/apply_processor.py | 33 ++++++++++++++++----- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index 888df7eaf..dfa40296c 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,10 +1,10 @@ -from coffea.dataset_tools.apply_processor import apply_to_fileset, apply_to_one_dataset +from coffea.dataset_tools.apply_processor import apply_to_dataset, apply_to_fileset from coffea.dataset_tools.manipulations import max_chunks, slice_chunks from coffea.dataset_tools.preprocess import preprocess __all__ = [ "preprocess", - "apply_to_one_dataset", + "apply_to_dataset", "apply_to_fileset", "max_chunks", "slice_chunks", diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 798c73623..d08b772dd 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -1,12 +1,29 @@ import copy -from typing import Callable, Union +from typing import Callable, Dict, Hashable, List, Set, Tuple, Union + +import dask.base +import dask_awkward from coffea.nanoevents import NanoAODSchema, NanoEventsFactory from coffea.processor import ProcessorABC +GenericHEPAnalysis = Callable[ + [dask_awkward.Array], + Tuple[ + Union[ + dask.base.DaskMethodsMixin, + Dict[Hashable, dask.base.DaskMethodsMixin], + Set[dask.base.DaskMethodsMixin], + List[dask.base.DaskMethodsMixin], + Tuple[dask.base.DaskMethodsMixin], + ], + ..., + ], # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! 
+] + -def apply_to_one_dataset( - data_manipulation: Union[ProcessorABC, Callable], +def apply_to_dataset( + data_manipulation: Union[ProcessorABC, GenericHEPAnalysis], dataset, schemaclass=NanoAODSchema, metadata={}, @@ -26,13 +43,13 @@ def apply_to_one_dataset( def apply_to_fileset( - data_manipulation: Union[ProcessorABC, Callable], fileset, schemaclass=NanoAODSchema + data_manipulation: Union[ProcessorABC, GenericHEPAnalysis], + fileset, + schemaclass=NanoAODSchema, ): out = {} for name, dataset in fileset.items(): metadata = copy.deepcopy(dataset.get("metadata", {})) - metadata["dataset"] = name - out[name] = apply_to_one_dataset( - data_manipulation, dataset, schemaclass, metadata - ) + metadata.setdefault("dataset", name) + out[name] = apply_to_dataset(data_manipulation, dataset, schemaclass, metadata) return out From 5ee0b20d025d2a70245a73dacd1468cb94edffc9 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 13:59:21 -0600 Subject: [PATCH 38/80] typing for preprocess --- src/coffea/dataset_tools/preprocess.py | 70 ++++++++++++++++++++------ 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index ad4d4bc6d..b5dbe9168 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -1,5 +1,9 @@ +from __future__ import annotations + import copy import math +from dataclasses import dataclass +from typing import Any, Hashable import awkward import dask @@ -9,13 +13,13 @@ def _get_steps( - normed_files, - maybe_step_size=None, - align_clusters=False, - recalculate_seen_steps=False, - skip_bad_files=False, - file_exceptions=(FileNotFoundError, OSError), -): + normed_files: awkward.Array | dask_awkward.Array, + maybe_step_size: None | int = None, + align_clusters: bool = False, + recalculate_seen_steps: bool = False, + skip_bad_files: bool = False, + file_exceptions: Exception | Warning = (FileNotFoundError, OSError), +) -> awkward.Array | dask_awkward.Array: nf_backend = awkward.backend(normed_files) lz_or_nf = awkward.typetracer.length_zero_if_typetracer(normed_files) @@ -95,15 +99,51 @@ def _get_steps( return array +@dataclass +class UprootFileSpec: + object_path: str + steps: list[list[int]] | list[int] | None + + +@dataclass +class CoffeaFileSpec: + object_path: str + steps: list[list[int]] + uuid: str + + +@dataclass +class CoffeaFileSpecOptional(UprootFileSpec): + uuid: str | None + + +@dataclass +class DatasetSpecOptional: + files: ( + dict[str, str] | list[str] | dict[str, UprootFileSpec | CoffeaFileSpecOptional] + ) + metadata: dict[Hashable, Any] | None + + +@dataclass +class DatasetSpec: + files: dict[str, CoffeaFileSpec] + metadata: dict[Hashable, Any] | None + + +FilesetSpecOptional = dict[str, DatasetSpecOptional] +FilesetSpec = dict[str, DatasetSpec] + + def preprocess( - fileset, - maybe_step_size=None, - align_clusters=False, - recalculate_seen_steps=False, - files_per_batch=1, - skip_bad_files=False, - file_exceptions=(FileNotFoundError, OSError), -): + fileset: FilesetSpecOptional, + maybe_step_size: None | int = None, + align_clusters: bool = False, + recalculate_seen_steps: bool = False, + files_per_batch: int = 1, + skip_bad_files: bool = False, + file_exceptions: Exception | Warning = (FileNotFoundError, OSError), +) -> tuple[FilesetSpec, FilesetSpecOptional]: out_updated = copy.deepcopy(fileset) out_available = copy.deepcopy(fileset) all_ak_norm_files = {} From 009fdd0e1013a63390cbdc8c71120981ba73ee77 Mon Sep 17 00:00:00 
2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 14:02:50 -0600 Subject: [PATCH 39/80] flake8 --- src/coffea/dataset_tools/rucio_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 2df9a62ec..76ce58971 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -5,11 +5,10 @@ import subprocess from collections import defaultdict -import tomli from rucio.client import Client # Rucio needs the default configuration --> taken from CMS cvmfs defaults -if not "RUCIO_HOME" in os.environ: +if "RUCIO_HOME" not in os.environ: os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" # with open(f"{os.environ['RUCIO_HOME']}/etc/rucio.cfg", "rb") as f: From e00e691bd4ccb5c2a320308e49fcd4e24b4d5109 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 14:05:57 -0600 Subject: [PATCH 40/80] fix up typing --- src/coffea/dataset_tools/preprocess.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index b5dbe9168..da4ca7d6a 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -3,7 +3,7 @@ import copy import math from dataclasses import dataclass -from typing import Any, Hashable +from typing import Any, Dict, Hashable import awkward import dask @@ -131,8 +131,8 @@ class DatasetSpec: metadata: dict[Hashable, Any] | None -FilesetSpecOptional = dict[str, DatasetSpecOptional] -FilesetSpec = dict[str, DatasetSpec] +FilesetSpecOptional = Dict[str, DatasetSpecOptional] +FilesetSpec = Dict[str, DatasetSpec] def preprocess( From 7644fe71de0793e744a54d2c2ab6c145fc4e3336 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 17:14:35 -0600 Subject: [PATCH 41/80] more typing for apply_processor --- src/coffea/dataset_tools/apply_processor.py | 55 ++++++++++++--------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index d08b772dd..883ec7f0b 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -1,33 +1,40 @@ +from __future__ import annotations + import copy -from typing import Callable, Dict, Hashable, List, Set, Tuple, Union +from typing import Any, Callable, Dict, Hashable, List, Set, Tuple, Union import dask.base import dask_awkward -from coffea.nanoevents import NanoAODSchema, NanoEventsFactory +from coffea.dataset_tools.preprocess import ( + DatasetSpec, + DatasetSpecOptional, + FilesetSpec, + FilesetSpecOptional, +) +from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory from coffea.processor import ProcessorABC -GenericHEPAnalysis = Callable[ - [dask_awkward.Array], - Tuple[ - Union[ - dask.base.DaskMethodsMixin, - Dict[Hashable, dask.base.DaskMethodsMixin], - Set[dask.base.DaskMethodsMixin], - List[dask.base.DaskMethodsMixin], - Tuple[dask.base.DaskMethodsMixin], - ], - ..., - ], # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! -] +DaskOutputType = Tuple[ + Union[ + dask.base.DaskMethodsMixin, + Dict[Hashable, dask.base.DaskMethodsMixin], + Set[dask.base.DaskMethodsMixin], + List[dask.base.DaskMethodsMixin], + Tuple[dask.base.DaskMethodsMixin], + ], + ..., +] # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! 
+ +GenericHEPAnalysis = Callable[[dask_awkward.Array], DaskOutputType] def apply_to_dataset( - data_manipulation: Union[ProcessorABC, GenericHEPAnalysis], - dataset, - schemaclass=NanoAODSchema, - metadata={}, -): + data_manipulation: ProcessorABC | GenericHEPAnalysis, + dataset: DatasetSpec | DatasetSpecOptional, + schemaclass: BaseSchema = NanoAODSchema, + metadata: dict[Hashable, Any] = {}, +) -> DaskOutputType: files = dataset["files"] events = NanoEventsFactory.from_root( files, @@ -43,10 +50,10 @@ def apply_to_dataset( def apply_to_fileset( - data_manipulation: Union[ProcessorABC, GenericHEPAnalysis], - fileset, - schemaclass=NanoAODSchema, -): + data_manipulation: ProcessorABC | GenericHEPAnalysis, + fileset: FilesetSpec | FilesetSpecOptional, + schemaclass: BaseSchema = NanoAODSchema, +) -> dict[str, DaskOutputType]: out = {} for name, dataset in fileset.items(): metadata = copy.deepcopy(dataset.get("metadata", {})) From 9fee7d36e712526f4afb66bb2557886ff262cc53 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 29 Nov 2023 14:52:36 -0600 Subject: [PATCH 42/80] being pedantic about types --- src/coffea/dataset_tools/apply_processor.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 883ec7f0b..186eb069e 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -15,16 +15,16 @@ from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory from coffea.processor import ProcessorABC -DaskOutputType = Tuple[ - Union[ - dask.base.DaskMethodsMixin, - Dict[Hashable, dask.base.DaskMethodsMixin], - Set[dask.base.DaskMethodsMixin], - List[dask.base.DaskMethodsMixin], - Tuple[dask.base.DaskMethodsMixin], - ], - ..., -] # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! +DaskOutputBaseType = Union[ + dask.base.DaskMethodsMixin, + Dict[Hashable, dask.base.DaskMethodsMixin], + Set[dask.base.DaskMethodsMixin], + List[dask.base.DaskMethodsMixin], + Tuple[dask.base.DaskMethodsMixin], +] + +# NOTE TO USERS: You can use nested python containers as arguments to dask.compute! +DaskOutputType = Union[DaskOutputBaseType, Tuple[DaskOutputBaseType, ...]] GenericHEPAnalysis = Callable[[dask_awkward.Array], DaskOutputType] From 9ba1442ecb110bba03c82dfabd4ace3ed8f3403a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 Nov 2023 20:53:11 +0000 Subject: [PATCH 43/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/apply_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 186eb069e..eb84580f7 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -24,7 +24,7 @@ ] # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! 
-DaskOutputType = Union[DaskOutputBaseType, Tuple[DaskOutputBaseType, ...]] +DaskOutputType = Union[DaskOutputBaseType, Tuple[DaskOutputBaseType, ...]] GenericHEPAnalysis = Callable[[dask_awkward.Array], DaskOutputType] From 059061a676c887b608acf64dfafa5c2313a26d5e Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 4 Dec 2023 16:00:07 -0600 Subject: [PATCH 44/80] taskvine test was using old location of NanoAODSchema --- tests/test_taskvine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_taskvine.py b/tests/test_taskvine.py index 8e533919e..e6f4580be 100755 --- a/tests/test_taskvine.py +++ b/tests/test_taskvine.py @@ -5,12 +5,12 @@ import pytest from coffea import processor -from coffea.nanoevents import NanoEventsFactory +from coffea.nanoevents import NanoEventsFactory, NanoAODSchema def histogram_common(): # The opendata files are non-standard NanoAOD, so some optional data columns are missing - processor.NanoAODSchema.warn_missing_crossrefs = False + NanoAODSchema.warn_missing_crossrefs = False # "file:/tmp/Run2012B_SingleMu.root", events = NanoEventsFactory.from_root( From 3385bdb313151978c48a481758d682dc721401e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Dec 2023 22:00:31 +0000 Subject: [PATCH 45/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_taskvine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_taskvine.py b/tests/test_taskvine.py index e6f4580be..a35ea741e 100755 --- a/tests/test_taskvine.py +++ b/tests/test_taskvine.py @@ -5,7 +5,7 @@ import pytest from coffea import processor -from coffea.nanoevents import NanoEventsFactory, NanoAODSchema +from coffea.nanoevents import NanoAODSchema, NanoEventsFactory def histogram_common(): From 0e2baf102d43c9eaf66a5c50fa1c8ad150c18b78 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 4 Dec 2023 16:01:56 -0600 Subject: [PATCH 46/80] lint: no longer need to import processor --- tests/test_taskvine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_taskvine.py b/tests/test_taskvine.py index a35ea741e..858a35349 100755 --- a/tests/test_taskvine.py +++ b/tests/test_taskvine.py @@ -4,7 +4,6 @@ import hist.dask as hda import pytest -from coffea import processor from coffea.nanoevents import NanoAODSchema, NanoEventsFactory From 59ef3529184f3ed7af9da241e75a4d2a0131e6eb Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 6 Dec 2023 22:11:30 +0100 Subject: [PATCH 47/80] Getting rucio client from config in environmental variable --- src/coffea/dataset_tools/rucio_utils.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 76ce58971..71ab31360 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -11,10 +11,6 @@ if "RUCIO_HOME" not in os.environ: os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" -# with open(f"{os.environ['RUCIO_HOME']}/etc/rucio.cfg", "rb") as f: -# rucio_cfg = tomli.load(f) -# print(rucio_cfg) - def get_proxy_path() -> str: """ @@ -52,14 +48,7 @@ def get_rucio_client(proxy=None) -> Client: try: if not proxy: proxy = get_proxy_path() - - nativeClient = Client( - rucio_host="https://cms-rucio.cern.ch", - auth_host="https://cms-rucio-auth.cern.ch", - account=getpass.getuser(), - creds={"client_cert": proxy, "client_key": 
proxy}, - auth_type="x509", - ) + nativeClient = Client() return nativeClient except Exception as e: From 25df8b16be6a66c1a358c2037464adb4dbabde03 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 6 Dec 2023 23:57:02 +0100 Subject: [PATCH 48/80] Added preprocess command to cli --- src/coffea/dataset_tools/dataset_query.py | 62 ++++++++++++++++++++--- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 46236b805..550f3f60d 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,8 +1,9 @@ import random from collections import defaultdict +import os import cmd2 -import yaml +import yaml, json from rich import print from rich.console import Console from rich.prompt import Prompt @@ -10,6 +11,8 @@ from rich.tree import Tree from . import rucio_utils +from .preprocess import preprocess +from dask.distributed import Client def print_dataset_query(query, dataset_list, selected, console): @@ -59,6 +62,8 @@ def __init__(self): "S": "select", "LS": "list_selected", "LR": "list_replicas", + "O": "save", + "P": "preprocess", } ) self.console = Console() @@ -326,6 +331,7 @@ def do_list_replicas(self, args): print( f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas {args}" ) + return tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") for site, files in self.replica_results_bysite[dataset].items(): @@ -338,11 +344,54 @@ def do_list_replicas(self, args): def do_save(self, args): """Save the replica information in yaml format""" if not len(args): - print("[red]Please provide an output filename") + print("[red]Please provide an output filename and format") + return + format = os.path.splitext(args)[1] + output = {} + for fileset, files in self.replica_results.items(): + output[fileset] = {"files": files, "metadata": {}} + + with open(args, "w") as file: + if format == ".yaml": + yaml.dump(output, file, default_flow_style=False) + elif format == ".json": + json.dump(output, file, indent=2) + print(f"[green]File {args} saved!") + + def do_preprocess(self, args): + """Perform preprocessing for concrete fileset extraction. 
+ Args: output_name [step_size] [dask cluster url]""" + args_list = args.split() + if len(args_list) < 1: + print( + "Please provide an output name and optionally a step size and dask cluster url" + ) + return else: - with open(args, "w") as file: - yaml.dump(dict(self.replica_results), file, default_flow_style=False) - print(f"[green]File {args} saved!") + output_file = args_list[0] + step_size = None + dask_url = None + if len(args_list) == 2: + step_size = args_list[1] + elif len(args_list) == 3: + dask_url = args_list[2] + replicas = {} + for fileset, files in self.replica_results.items(): + replicas[fileset] = {"files": {f: "Events" for f in files}, "metadata": {}} + # init a local Dask cluster + with self.console.status( + "[red] Preprocessing files to extract available chunks with dask[/]" + ): + client = Client(dask_url) if dask_url else Client() + fileset = preprocess(replicas) + out_available, out_updated = preprocess(replicas) + + with open(f"{output_file}_available.json", "w") as file: + print(f"Saved available fileset chunks to {output_file}_available.json") + json.dump(out_available, file, indent=2) + with open(f"{output_file}_all.json", "w") as file: + print(f"Saved all fileset chunks to {output_file}_all.json") + json.dump(out_updated, file, indent=2) if __name__ == "__main__": @@ -359,7 +408,8 @@ def do_save(self, args): - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" - - [bold cyan]save (S) file.yaml[/]: Save the replicas results to file for further processing + - [bold cyan]save (O) OUTPUTFILE[/]: Save the replicas results to file (json or yaml) for further processing + - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) - [bold cyan]help[/]: get help! """ console = Console() From c8a87b351936eb90264b34b91e41230c7ca46c21 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:57:23 +0000 Subject: [PATCH 49/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 550f3f60d..b9d00b4b8 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,9 +1,11 @@ +import json +import os import random from collections import defaultdict -import os import cmd2 -import yaml, json +import yaml +from dask.distributed import Client from rich import print from rich.console import Console from rich.prompt import Prompt @@ -12,7 +14,6 @@ from . 
import rucio_utils from .preprocess import preprocess -from dask.distributed import Client def print_dataset_query(query, dataset_list, selected, console): From 31e74eb0d999a25f6de17ea7ec372385906db255 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 6 Dec 2023 17:18:16 -0600 Subject: [PATCH 50/80] save json as gzipped, add some options --- src/coffea/dataset_tools/dataset_query.py | 37 +++++++++++++++-------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index b9d00b4b8..3c16108cc 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,3 +1,4 @@ +import gzip import json import os import random @@ -361,21 +362,34 @@ def do_save(self, args): def do_preprocess(self, args): """Perform preprocessing for concrete fileset extraction. - Args: output_name [step_size] [dask cluster url]""" + Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url]""" args_list = args.split() if len(args_list) < 1: print( - "Please provide an output name and optionally a step size and dask cluster url" + "Please provide an output name and optionally a step size, if you want to align to file clusters, or a dask cluster url" ) return else: output_file = args_list[0] step_size = None + align_to_clusters = False dask_url = None - if len(args_list) == 2: + if len(args_list) >= 2: step_size = args_list[1] - elif len(args_list) == 3: - dask_url = args_list[2] + if len(args_list) >= 3: + if args_list[2] == "True": + align_to_clusters = True + elif args_list[2] == "False": + align_to_clusters = False + else: + raise ValueError("align_to_clusters must be either \"True\" or \"False\"") + if len(args_list) == 4: + dask_url = args_list[3] + if len(args_list) > 4: + print( + "preprocess accepts at most 3 commandline arguments!" 
+ ) + return replicas = {} for fileset, files in self.replica_results.items(): replicas[fileset] = {"files": {f: "Events" for f in files}, "metadata": {}} @@ -383,15 +397,14 @@ def do_preprocess(self, args): with self.console.status( "[red] Preprocessing files to extract available chunks with dask[/]" ): - client = Client(dask_url) if dask_url else Client() - fileset = preprocess(replicas) - out_available, out_updated = preprocess(replicas) + with Client(dask_url) as _: + out_available, out_updated = preprocess(replicas, maybe_step_size=step_size, align_clusters=align_to_clusters, skip_bad_files=True) - with open(f"{output_file}_available.json", "w") as file: - print(f"Saved available fileset chunks to {output_file}_available.json") + with gzip.open(f"{output_file}_available.json.gz", "w") as file: + print(f"Saved available fileset chunks to {output_file}_available.json.gz") json.dump(out_available, file, indent=2) - with open(f"{output_file}_all.json", "w") as file: - print(f"Saved all fileset chunks to {output_file}_all.json") + with gzip.open(f"{output_file}_all.json.gz", "w") as file: + print(f"Saved all fileset chunks to {output_file}_all.json.gz") json.dump(out_updated, file, indent=2) From 048767195a47b1760fa186ee607c3ff7afd66756 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:18:31 +0000 Subject: [PATCH 51/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 3c16108cc..04a9f9f25 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -362,7 +362,8 @@ def do_save(self, args): def do_preprocess(self, args): """Perform preprocessing for concrete fileset extraction. - Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url]""" + Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url] + """ args_list = args.split() if len(args_list) < 1: print( @@ -382,13 +383,11 @@ def do_preprocess(self, args): elif args_list[2] == "False": align_to_clusters = False else: - raise ValueError("align_to_clusters must be either \"True\" or \"False\"") + raise ValueError('align_to_clusters must be either "True" or "False"') if len(args_list) == 4: dask_url = args_list[3] if len(args_list) > 4: - print( - "preprocess accepts at most 3 commandline arguments!" 
- ) + print("preprocess accepts at most 3 commandline arguments!") return replicas = {} for fileset, files in self.replica_results.items(): @@ -397,8 +396,13 @@ def do_preprocess(self, args): with self.console.status( "[red] Preprocessing files to extract available chunks with dask[/]" ): - with Client(dask_url) as _: - out_available, out_updated = preprocess(replicas, maybe_step_size=step_size, align_clusters=align_to_clusters, skip_bad_files=True) + with Client(dask_url) as _: + out_available, out_updated = preprocess( + replicas, + maybe_step_size=step_size, + align_clusters=align_to_clusters, + skip_bad_files=True, + ) with gzip.open(f"{output_file}_available.json.gz", "w") as file: print(f"Saved available fileset chunks to {output_file}_available.json.gz") From 9c970e4ea338cb0eac01c5cb88eaa0df9d22ed27 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 6 Dec 2023 17:19:52 -0600 Subject: [PATCH 52/80] flake: drop getpass since it is not used --- src/coffea/dataset_tools/rucio_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 71ab31360..a2cf11fe9 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -1,4 +1,4 @@ -import getpass +# import getpass import json import os import re From 681782debc32ee41b04d2e6c6ab0e75b4bb55c68 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 16:13:05 -0600 Subject: [PATCH 53/80] add failed-tail processing for uproot reports --- src/coffea/dataset_tools/manipulations.py | 40 +++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index e963a067c..e80ffb4cb 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -1,4 +1,6 @@ +import awkward import copy +import numpy def max_chunks(fileset, maxchunks=None): @@ -15,3 +17,41 @@ def slice_chunks(fileset, theslice=slice(None)): out[name]["files"][fname]["steps"] = finfo["steps"][theslice] return out + + +def get_failed_steps_for_dataset(dataset, report): + failed_dataset = {} + failures = report[~awkward.is_none(report.exception)] + + if not awkward.all(report.args[:,4] == "True"): + raise RuntimeError("step specification is not completely in starts/stops form, failed-step extraction is not available for steps_per_file.") + + for fdesc in dataset.values(): + if "steps" not in fdesc: + raise RuntimeError("steps specification not found in dataset, please specify steps in input dataset.") + + fnames = set(dataset.keys()) + rnames = set(np.unique(report.args[:, 0][:, 1:-1:])) + if not rnames.issubset(fnames): + raise RuntimeError(f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!") + + for failure in failures: + args_as_types = tuple(eval(arg) for arg in failure.args) + + fname, object_path, start, stop, is_step = args_as_types + + if fname in failed_dataset: + failed_dataset[fname]["steps"].append([start, stop]) + else: + failed_dataset[fname] = copy.deepcopy(dataset[fname]) + failed_dataset[fname]["steps"] = [[start, stop]] + + return failed_dataset + +def get_failed_steps_for_fileset(fileset, report): + failed_fileset = {} + for name, dataset in fileset.items(): + failed_dataset = get_failed_steps_for_dataset(dataset, report) + if len(failed_dataset) > 0: + failed_fileset[name] = failed_dataset + return failed_fileset From 
3ca2d96b55a4e912147e18f46bcb2def88a97113 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 16:13:42 -0600 Subject: [PATCH 54/80] add failed-tail stuff to __all__ of dataset_tools --- src/coffea/dataset_tools/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index dfa40296c..1895bf722 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -8,4 +8,6 @@ "apply_to_fileset", "max_chunks", "slice_chunks", + "get_failed_steps_for_dataset", + "get_failed_steps_for_fileset", ] From 3bd760a0a95e5542571e5965c0c881b771901627 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Dec 2023 22:15:58 +0000 Subject: [PATCH 55/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/manipulations.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index e80ffb4cb..0076b7117 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -1,5 +1,6 @@ -import awkward import copy + +import awkward import numpy @@ -23,23 +24,29 @@ def get_failed_steps_for_dataset(dataset, report): failed_dataset = {} failures = report[~awkward.is_none(report.exception)] - if not awkward.all(report.args[:,4] == "True"): - raise RuntimeError("step specification is not completely in starts/stops form, failed-step extraction is not available for steps_per_file.") + if not awkward.all(report.args[:, 4] == "True"): + raise RuntimeError( + "step specification is not completely in starts/stops form, failed-step extraction is not available for steps_per_file." + ) for fdesc in dataset.values(): if "steps" not in fdesc: - raise RuntimeError("steps specification not found in dataset, please specify steps in input dataset.") + raise RuntimeError( + "steps specification not found in dataset, please specify steps in input dataset." + ) fnames = set(dataset.keys()) rnames = set(np.unique(report.args[:, 0][:, 1:-1:])) if not rnames.issubset(fnames): - raise RuntimeError(f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!") - + raise RuntimeError( + f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!" 
+ ) + for failure in failures: args_as_types = tuple(eval(arg) for arg in failure.args) fname, object_path, start, stop, is_step = args_as_types - + if fname in failed_dataset: failed_dataset[fname]["steps"].append([start, stop]) else: @@ -48,6 +55,7 @@ def get_failed_steps_for_dataset(dataset, report): return failed_dataset + def get_failed_steps_for_fileset(fileset, report): failed_fileset = {} for name, dataset in fileset.items(): From a29e9102132c574a7e3b2c8bd2704ae0b7de9fae Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 16:17:28 -0600 Subject: [PATCH 56/80] lint --- src/coffea/dataset_tools/manipulations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index 0076b7117..49b4486fc 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -36,7 +36,7 @@ def get_failed_steps_for_dataset(dataset, report): ) fnames = set(dataset.keys()) - rnames = set(np.unique(report.args[:, 0][:, 1:-1:])) + rnames = set(numpy.unique(report.args[:, 0][:, 1:-1:])) if not rnames.issubset(fnames): raise RuntimeError( f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!" From 13921ab2a20e48a3f457d5d724e9cecc026e1d81 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 16:25:01 -0600 Subject: [PATCH 57/80] typo --- src/coffea/dataset_tools/manipulations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index 49b4486fc..89adf8169 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -39,7 +39,7 @@ def get_failed_steps_for_dataset(dataset, report): rnames = set(numpy.unique(report.args[:, 0][:, 1:-1:])) if not rnames.issubset(fnames): raise RuntimeError( - f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!" + f"Files: {rnames - fnames} are not in input dataset, please ensure report correspond to input dataset!" ) for failure in failures: From 8e4424d5abc20aa4fdb67ff2fa644caa4c515c3f Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 18:15:09 -0600 Subject: [PATCH 58/80] typo, and fileset entrypoint needs dict of reports. --- src/coffea/dataset_tools/manipulations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index 89adf8169..a749561e0 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -39,7 +39,7 @@ def get_failed_steps_for_dataset(dataset, report): rnames = set(numpy.unique(report.args[:, 0][:, 1:-1:])) if not rnames.issubset(fnames): raise RuntimeError( - f"Files: {rnames - fnames} are not in input dataset, please ensure report correspond to input dataset!" + f"Files: {rnames - fnames} are not in input dataset, please ensure report corresponds to input dataset!" 
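
A note on the failed-step extraction above: each entry of report.args carries the uproot open/read arguments as strings (file name, object path, entry start, entry stop, and a flag marking whether the step is in start/stop form), which is why the helper evals them back into Python values. A minimal sketch of that decoding, with an illustrative argument tuple — the exact string form is an assumption inferred from the slicing and eval logic in the patch, not a documented API:

    # Hypothetical contents of one failure's `args` field (values are illustrative).
    failure_args = ["'/store/mc/sample/file_1.root'", "'Events'", "0", "1000", "True"]

    # Recover the typed values the same way get_failed_steps_for_dataset does.
    fname, object_path, start, stop, is_step = tuple(eval(arg) for arg in failure_args)

    print(fname, object_path, start, stop, is_step)
    # -> /store/mc/sample/file_1.root Events 0 1000 True
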
) for failure in failures: @@ -56,10 +56,10 @@ def get_failed_steps_for_dataset(dataset, report): return failed_dataset -def get_failed_steps_for_fileset(fileset, report): +def get_failed_steps_for_fileset(fileset, report_dict): failed_fileset = {} for name, dataset in fileset.items(): - failed_dataset = get_failed_steps_for_dataset(dataset, report) + failed_dataset = get_failed_steps_for_dataset(dataset, report_dict[name]) if len(failed_dataset) > 0: failed_fileset[name] = failed_dataset return failed_fileset From fe7984f844261935c721729165545adc580c45af Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 18:21:42 -0600 Subject: [PATCH 59/80] adapt apply_processor to possibility of reports --- src/coffea/dataset_tools/apply_processor.py | 28 ++++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index eb84580f7..ed242ed4d 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -34,29 +34,49 @@ def apply_to_dataset( dataset: DatasetSpec | DatasetSpecOptional, schemaclass: BaseSchema = NanoAODSchema, metadata: dict[Hashable, Any] = {}, + uproot_options: dict[str, Any] = {}, ) -> DaskOutputType: files = dataset["files"] events = NanoEventsFactory.from_root( files, metadata=metadata, - schemaclass=schemaclass, + schemaclass=schemaclass, + uproot_options=uproot_options, ).events() + + report = None + if isinstance(events, tuple): + events, report = events + + out = None if isinstance(data_manipulation, ProcessorABC): - return data_manipulation.process(events) + out = data_manipulation.process(events) elif isinstance(data_manipulation, Callable): - return data_manipulation(events) + out = data_manipulation(events) else: raise ValueError("data_manipulation must either be a ProcessorABC or Callable") + if report is not None: + return out, report + return out + def apply_to_fileset( data_manipulation: ProcessorABC | GenericHEPAnalysis, fileset: FilesetSpec | FilesetSpecOptional, schemaclass: BaseSchema = NanoAODSchema, + uproot_options: dict[str, Any] = {}, ) -> dict[str, DaskOutputType]: out = {} + report = {} for name, dataset in fileset.items(): metadata = copy.deepcopy(dataset.get("metadata", {})) metadata.setdefault("dataset", name) - out[name] = apply_to_dataset(data_manipulation, dataset, schemaclass, metadata) + out = apply_to_dataset(data_manipulation, dataset, schemaclass, metadata) + if isinstance(out, tuple): + out[name], report[name] = out + else: + out[name] = out + if len(report) > 0: + return out, report return out From 87332b87baa5a12b207dfa7cd92f77bb3058e898 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Dec 2023 00:21:56 +0000 Subject: [PATCH 60/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/apply_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index ed242ed4d..859765fe9 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -40,7 +40,7 @@ def apply_to_dataset( events = NanoEventsFactory.from_root( files, metadata=metadata, - schemaclass=schemaclass, + schemaclass=schemaclass, uproot_options=uproot_options, ).events() From f846c7011198a95ee8f712cbb5f518120553df93 
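
Taken together with the report-aware apply_to_dataset/apply_to_fileset changes in the surrounding patches, the failed-step helpers are meant to support a retry loop: run over a fileset, collect the per-dataset read reports, and build a trimmed fileset containing only the failing files and step ranges. A rough sketch of that flow, assuming my_processor and fileset already exist (a ProcessorABC instance or callable, and an uproot-style fileset whose files carry explicit "steps" boundaries); the uproot option name shown is an assumption, and the precise fileset nesting expected by the helper may differ at this point in the series:

    import dask
    from coffea.dataset_tools.apply_processor import apply_to_fileset
    from coffea.dataset_tools.manipulations import get_failed_steps_for_fileset

    # With a report-enabled read, apply_to_fileset returns (outputs, reports),
    # two dicts keyed by dataset name.
    out_graphs, report_graphs = apply_to_fileset(
        my_processor,
        fileset,
        uproot_options={"allow_read_errors_with_report": True},  # assumed option name
    )
    outputs, reports = dask.compute(out_graphs, report_graphs)

    # Keep only the files and step ranges whose reads raised, for a retry pass.
    retry_fileset = get_failed_steps_for_fileset(fileset, reports)
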
Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 20:06:37 -0600 Subject: [PATCH 61/80] fix bugs --- src/coffea/dataset_tools/apply_processor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 859765fe9..69994b127 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -72,11 +72,13 @@ def apply_to_fileset( for name, dataset in fileset.items(): metadata = copy.deepcopy(dataset.get("metadata", {})) metadata.setdefault("dataset", name) - out = apply_to_dataset(data_manipulation, dataset, schemaclass, metadata) + dataset_out = apply_to_dataset( + data_manipulation, dataset, schemaclass, metadata + ) if isinstance(out, tuple): - out[name], report[name] = out + out[name], report[name] = dataset_out else: - out[name] = out + out[name] = dataset_out if len(report) > 0: return out, report return out From 6e5caddfddac175f21cd3fe52ae500c91693fe2c Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Fri, 8 Dec 2023 19:29:16 +0100 Subject: [PATCH 62/80] Added processing of multiple datasets to get replicas in CLI --- src/coffea/dataset_tools/dataset_query.py | 219 ++++++++++++---------- 1 file changed, 119 insertions(+), 100 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 04a9f9f25..1c74737d1 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -165,113 +165,123 @@ def do_list_selected(self, args): def do_replicas(self, args): if len(args.arg_list) == 0: print( - "[red] Please provide the index of the [bold]selected[/bold] dataset to analyze or the [bold]full dataset name[/bold]" + "[red] Please provide a list of indices of the [bold]selected[/bold] datasets to analyze or [bold]all[/bold] to loop on all the selected datasets" ) return - if args.isdigit(): - if int(args) <= len(self.selected_datasets): - dataset = self.selected_datasets[int(args) - 1] - else: - print( - f"[red]The requested dataset is not in the list. Please insert a position <={len(self.selected_datasets)}" - ) + if args == "all": + datasets = self.selected_datasets else: - dataset = args.arg_list[0] - # adding it to the selected datasets - self.selected_datasets.append(dataset) - - with self.console.status( - f"Querying rucio for replicas: [bold red]{dataset}[/]" - ): - outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas( - dataset, - allowlist_sites=self.sites_allowlist, - blocklist_sites=self.sites_blocklist, - regex_sites=self.sites_regex, - mode="full", - client=self.rucio_client, + for index in args.arg_list: + if index.isdigit(): + if int(index) <= len(self.selected_datasets): + datasets = [self.selected_datasets[int(index) - 1]] + else: + print( + f"[red]The requested dataset is not in the list. 
Please insert a position <={len(self.selected_datasets)}" + ) + + for dataset in datasets: + with self.console.status( + f"Querying rucio for replicas: [bold red]{dataset}[/]" + ): + ( + outfiles, + outsites, + sites_counts, + ) = rucio_utils.get_dataset_files_replicas( + dataset, + allowlist_sites=self.sites_allowlist, + blocklist_sites=self.sites_blocklist, + regex_sites=self.sites_regex, + mode="full", + client=self.rucio_client, + ) + self.last_replicas_results = (outfiles, outsites, sites_counts) + print(f"[cyan]Sites availability for dataset: [red]{dataset}") + table = Table(title="Available replicas") + table.add_column("Index", justify="center") + table.add_column("Site", justify="left", style="cyan", no_wrap=True) + table.add_column("Files", style="magenta", no_wrap=True) + table.add_column("Availability", justify="center") + table.row_styles = ["dim", "none"] + Nfiles = len(outfiles) + + sorted_sites = dict( + sorted(sites_counts.items(), key=lambda x: x[1], reverse=True) ) - self.last_replicas_results = (outfiles, outsites, sites_counts) - print(f"[cyan]Sites availability for dataset: [red]{dataset}") - table = Table(title="Available replicas") - table.add_column("Index", justify="center") - table.add_column("Site", justify="left", style="cyan", no_wrap=True) - table.add_column("Files", style="magenta", no_wrap=True) - table.add_column("Availability", justify="center") - table.row_styles = ["dim", "none"] - Nfiles = len(outfiles) - - sorted_sites = dict( - sorted(sites_counts.items(), key=lambda x: x[1], reverse=True) - ) - for i, (site, stat) in enumerate(sorted_sites.items()): - table.add_row(str(i), site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%") - - self.console.print(table) - strategy = Prompt.ask( - "Select sites", - choices=["round-robin", "choice", "quit"], - default="round-robin", - ) - - files_by_site = defaultdict(list) + for i, (site, stat) in enumerate(sorted_sites.items()): + table.add_row( + str(i), site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%" + ) - if strategy == "choice": - ind = list( - map(int, Prompt.ask("Enter list of sites index to be used").split(" ")) + self.console.print(table) + strategy = Prompt.ask( + "Select sites", + choices=["round-robin", "choice", "quit"], + default="round-robin", ) - sites_to_use = [list(sorted_sites.keys())[i] for i in ind] - print(f"Filtering replicas with [green]: {' '.join(sites_to_use)}") - - output = [] - for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): - random.shuffle(sites_to_use) - found = False - # loop on shuffled selected sites until one is found - for site in sites_to_use: - try: - iS = sites.index(site) - output.append(files[iS]) - files_by_site[sites[iS]].append(files[iS]) - found = True - break # keep only one replica - except ValueError: - # if the index is not found just go to the next site - pass - - if not found: - print( - f"[bold red]No replica found compatible with sites selection for file #{ifile}. 
The available sites are:" - ) - for f, s in zip(files, sites): - print(f"\t- [green]{s} [cyan]{f}") - return - - self.replica_results[dataset] = output - elif strategy == "round-robin": - output = [] - for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): - # selecting randomly from the sites - iS = random.randint(0, len(sites) - 1) - output.append(files[iS]) - files_by_site[sites[iS]].append(files[iS]) - self.replica_results[dataset] = output + files_by_site = defaultdict(list) - elif strategy == "quit": - print("[orange]Doing nothing...") - return + if strategy == "choice": + ind = list( + map( + int, + Prompt.ask("Enter list of sites index to be used").split(" "), + ) + ) + sites_to_use = [list(sorted_sites.keys())[i] for i in ind] + print(f"Filtering replicas with [green]: {' '.join(sites_to_use)}") + + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + random.shuffle(sites_to_use) + found = False + # loop on shuffled selected sites until one is found + for site in sites_to_use: + try: + iS = sites.index(site) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + found = True + break # keep only one replica + except ValueError: + # if the index is not found just go to the next site + pass + + if not found: + print( + f"[bold red]No replica found compatible with sites selection for file #{ifile}. The available sites are:" + ) + for f, s in zip(files, sites): + print(f"\t- [green]{s} [cyan]{f}") + return + + self.replica_results[dataset] = output + + elif strategy == "round-robin": + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + # selecting randomly from the sites + iS = random.randint(0, len(sites) - 1) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + self.replica_results[dataset] = output + + elif strategy == "quit": + print("[orange]Doing nothing...") + return - self.replica_results_bysite[dataset] = files_by_site + self.replica_results_bysite[dataset] = files_by_site - # Now let's print the results - tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") - for site, files in files_by_site.items(): - T = tree.add(f"[green]{site}") - for f in files: - T.add(f"[cyan]{f}") - self.console.print(tree) + # Now let's print the results + tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") + for site, files in files_by_site.items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") + self.console.print(tree) def do_allowlist_sites(self, args): if self.sites_allowlist is None: @@ -413,6 +423,14 @@ def do_preprocess(self, args): if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--cli", help="Start interactive CLI for dataset discovery", action="store_true" + ) + args = parser.parse_args() + intro_msg = r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. Some basic commands: @@ -430,8 +448,9 @@ def do_preprocess(self, args): - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) - [bold cyan]help[/]: get help! 
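
For reference, the "round-robin" strategy in the replica-selection code above does not cycle through sites in order; for every file it draws one replica at random among the sites that host it. A self-contained illustration of that selection (file and site names are made up):

    import random
    from collections import defaultdict

    outfiles = [
        ["root://siteA//store/f1.root", "root://siteB//store/f1.root"],
        ["root://siteB//store/f2.root"],
    ]
    outsites = [["T2_IT_A", "T2_DE_B"], ["T2_DE_B"]]

    output, files_by_site = [], defaultdict(list)
    for files, sites in zip(outfiles, outsites):
        iS = random.randint(0, len(sites) - 1)  # pick one replica at random
        output.append(files[iS])
        files_by_site[sites[iS]].append(files[iS])

    print(output)
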
""" - console = Console() - console.print(intro_msg, justify="left") - app = DatasetQueryApp() - app.cmdloop() + if args.cli: + console = Console() + console.print(intro_msg, justify="left") + app = DatasetQueryApp() + app.cmdloop() From f04b60f6beca291c12b5fb8d04ed1683d5a78964 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Fri, 8 Dec 2023 19:51:54 +0100 Subject: [PATCH 63/80] Added Select all options to cli --- src/coffea/dataset_tools/dataset_query.py | 24 +++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 1c74737d1..d62ec5e17 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -129,16 +129,22 @@ def do_query_results(self, args): print("First [bold red]query (Q)[/] for a dataset") def do_select(self, args): + """Selected the datasets from the list of query results. Input a list of indices of "all".""" if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return Nresults = len(self.last_query_list) print("[cyan]Selected datasets:") - for s in map(int, args.arg_list): - if s <= Nresults: - self.selected_datasets.append(self.last_query_list[s - 1]) - print(f"- ({s}) {self.last_query_list[s-1]}") + if args == "all": + indices = range(0, len(self.last_query_list)) # 1 based list + else: + indices = map(lambda k: int(k) - 1, args.arg_list) + + for s in indices: + if s < Nresults: + self.selected_datasets.append(self.last_query_list[s]) + print(f"- ({s+1}) {self.last_query_list[s]}") else: print( f"[red]The requested dataset is not in the list. Please insert a position <={Nresults}" @@ -374,7 +380,7 @@ def do_preprocess(self, args): """Perform preprocessing for concrete fileset extraction. 
Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url] """ - args_list = args.split() + args_list = args.arg_list if len(args_list) < 1: print( "Please provide an output name and optionally a step size, if you want to align to file clusters, or a dask cluster url" @@ -386,7 +392,7 @@ def do_preprocess(self, args): align_to_clusters = False dask_url = None if len(args_list) >= 2: - step_size = args_list[1] + step_size = int(args_list[1]) if len(args_list) >= 3: if args_list[2] == "True": align_to_clusters = True @@ -413,11 +419,13 @@ def do_preprocess(self, args): align_clusters=align_to_clusters, skip_bad_files=True, ) + from IPython import embed - with gzip.open(f"{output_file}_available.json.gz", "w") as file: + embed() + with gzip.open(f"{output_file}_available.json.gz", "wb") as file: print(f"Saved available fileset chunks to {output_file}_available.json.gz") json.dump(out_available, file, indent=2) - with gzip.open(f"{output_file}_all.json.gz", "w") as file: + with gzip.open(f"{output_file}_all.json.gz", "wb") as file: print(f"Saved all fileset chunks to {output_file}_all.json.gz") json.dump(out_updated, file, indent=2) From 42af84bd7242d2ac7b15dfd35c1effeebcad65c7 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Fri, 8 Dec 2023 14:15:13 -0600 Subject: [PATCH 64/80] lint --- src/coffea/dataset_tools/dataset_query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index d62ec5e17..956ad25f7 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -171,7 +171,8 @@ def do_list_selected(self, args): def do_replicas(self, args): if len(args.arg_list) == 0: print( - "[red] Please provide a list of indices of the [bold]selected[/bold] datasets to analyze or [bold]all[/bold] to loop on all the selected datasets" + "[red] Please provide a list of indices of the [bold]selected[/bold] datasets " + "to analyze or [bold]all[/bold] to loop on all the selected datasets" ) return From 4ba8c0a8626fba5aeb4cb172eb29adff7da544bb Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 13:04:41 +0100 Subject: [PATCH 65/80] Moved from cmd2 to pure rich interface for the CLI --- pyproject.toml | 1 - src/coffea/dataset_tools/dataset_query.py | 432 ++++++++++++---------- 2 files changed, 244 insertions(+), 189 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 04ede6167..66f9e43b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,6 @@ servicex = [ rucio = [ "rucio-clients>=32;python_version>'3.8'", "rucio-clients<32;python_version<'3.9'", - "cmd2", ] dev = [ "pre-commit", diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index d62ec5e17..0700879fe 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -3,13 +3,13 @@ import os import random from collections import defaultdict +from typing import List -import cmd2 import yaml from dask.distributed import Client from rich import print from rich.console import Console -from rich.prompt import Prompt +from rich.prompt import Prompt, IntPrompt, Confirm from rich.table import Table from rich.tree import Tree @@ -50,24 +50,44 @@ def print_dataset_query(query, dataset_list, selected, console): console.print(table) -class DatasetQueryApp(cmd2.Cmd): - prompt = "\033[1;34m" + "cms-datasets" + "\033[0m > " +def get_indices_query(input_str: str, maxN: 
int) -> List[int]: + tokens = input_str.strip().split(" ") + final_tokens = [] + for t in tokens: + if t.isdigit(): + if int(t) > maxN: + print( + f"[red bold]Requested index {t} larger than available elements {maxN}" + ) + return False + final_tokens.append(int(t) - 1) # index 0 + elif "-" in t: + rng = t.split("-") + try: + for i in range( + int(rng[0]), int(rng[1]) + 1 + ): # including the last index + if i > maxN: + print( + f"[red bold]Requested index {t} larger than available elements {maxN}" + ) + return False + final_tokens.append(i - 1) + except: + print( + "[red]Error! Bad formatting for selection string. Use e.g. 1 4 5-9" + ) + return False + elif t == "all": + final_tokens = list(range(0, maxN)) + else: + print("[red]Error! Bad formatting for selection string. Use e.g. 1 4 5-9") + return False + return final_tokens + +class DataDiscoveryCLI: def __init__(self): - shortcuts = cmd2.DEFAULT_SHORTCUTS - shortcuts.update( - { - "L": "login", - "Q": "query", - "QR": "query_results", - "R": "replicas", - "S": "select", - "LS": "list_selected", - "LR": "list_replicas", - "O": "save", - "P": "preprocess", - } - ) self.console = Console() self.rucio_client = None self.selected_datasets = [] @@ -81,43 +101,116 @@ def __init__(self): self.replica_results = defaultdict(list) self.replica_results_bysite = {} - super().__init__(shortcuts=shortcuts) - def do_login(self, args): + self.commands = [ + "help", + "login", + "query", + "query-results", + "select", + "list-selected", + "replicas", + "list-replicas", + "save", + "preprocess", + "allow-sites", + "block-sites", + "regex-sites", + "sites-filters", + "quit", + ] + + def start_cli(self): + while True: + command = Prompt.ask(">", choices=self.commands) + if command == "help": + print( + r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] +Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. +Some basic commands: + - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) + - [bold cyan]select (S)[/]: Select datasets to process further from query results + - [bold cyan]replicas (R)[/]: Query rucio to look for files replica and then select the preferred sites + - [bold cyan]list_selected (LS)[/]: Print a list of the selected datasets + - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset + - [bold cyan]sites_filters[/]: show the active sites filters + - [bold cyan]sites_filters clear[/]: clear all the active sites filters + - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries + - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries + - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" + - [bold cyan]save (O) OUTPUTFILE[/]: Save the replicas results to file (json or yaml) for further processing + - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) + - [bold cyan]help[/]: get help! 
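
The get_indices_query helper introduced above turns the 1-based selection strings typed at the prompt ("1 4 6-8", "all", ...) into 0-based indices, returning False on malformed or out-of-range input. Its expected behaviour, written as assertions; this assumes the module's optional dask/rucio dependencies are installed (importing dataset_query pulls them in), and the helper is internal so its location may change:

    from coffea.dataset_tools.dataset_query import get_indices_query

    assert get_indices_query("1 4 6-8", maxN=10) == [0, 3, 5, 6, 7]
    assert get_indices_query("all", maxN=3) == [0, 1, 2]
    assert get_indices_query("2-20", maxN=10) is False  # out-of-range selections are rejected
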
+ """ + ) + elif command == "login": + self.do_login() + elif command == "quit": + print("Bye!") + break + elif command == "query": + self.do_query() + elif command == "query-results": + self.do_query_results() + elif command == "select": + self.do_select() + elif command == "list-selected": + self.do_list_selected() + elif command == "replicas": + self.do_replicas() + elif command == "list-replicas": + self.do_list_replicas() + elif command == "save": + self.do_save() + elif command == "preprocess": + self.do_preprocess() + elif command == "allow-sites": + self.do_allowlist_sites() + elif command == "block-sites": + self.do_blocklist_sites() + elif command == "regex-sites": + self.do_regex_sites() + elif command == "sites-filters": + self.do_sites_filters() + else: + break + + def do_login(self, proxy=None): """Login to the rucio client. Optionally a specific proxy file can be passed to the command. If the proxy file is not specified, `voms-proxy-info` is used""" - if args: - self.rucio_client = rucio_utils.get_rucio_client(args[0]) + if proxy: + self.rucio_client = rucio_utils.get_rucio_client(proxy) else: self.rucio_client = rucio_utils.get_rucio_client() - print(self.rucio_client) - # pprint(self.rucio_client.whoami()) - def do_whoami(self, args): + def do_whoami(self): # Your code here if not self.rucio_client: print("First [bold]login (L)[/] to the rucio server") return print(self.rucio_client.whoami()) - def do_query(self, args): + def do_query(self): # Your code here - with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): + query = Prompt.ask( + "[yellow bold]Query for[/]", + ) + with self.console.status(f"Querying rucio for: [bold red]{query}[/]"): outlist, outtree = rucio_utils.query_dataset( - args.arg_list[0], + query, client=self.rucio_client, tree=True, scope="cms", # TODO configure scope ) # Now let's print the results as a tree - print_dataset_query(args, outtree, self.selected_datasets, self.console) - self.last_query = args + print_dataset_query(query, outtree, self.selected_datasets, self.console) + self.last_query = query self.last_query_list = outlist self.last_query_tree = outtree print("Use the command [bold red]select (S)[/] to selected the datasets") - def do_query_results(self, args): + def do_query_results(self): if self.last_query_list: print_dataset_query( self.last_query, @@ -128,20 +221,24 @@ def do_query_results(self, args): else: print("First [bold red]query (Q)[/] for a dataset") - def do_select(self, args): - """Selected the datasets from the list of query results. Input a list of indices of "all".""" + def do_select(self): + """Selected the datasets from the list of query results. Input a list of indices + also with range 4-6 or "all".""" if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) + final_tokens = get_indices_query(selection, len(self.last_query_list)) + if not final_tokens: + return + Nresults = len(self.last_query_list) print("[cyan]Selected datasets:") - if args == "all": - indices = range(0, len(self.last_query_list)) # 1 based list - else: - indices = map(lambda k: int(k) - 1, args.arg_list) - for s in indices: + for s in final_tokens: if s < Nresults: self.selected_datasets.append(self.last_query_list[s]) print(f"- ({s+1}) {self.last_query_list[s]}") @@ -150,7 +247,7 @@ def do_select(self, args): f"[red]The requested dataset is not in the list. 
Please insert a position <={Nresults}" ) - def do_list_selected(self, args): + def do_list_selected(self): print("[cyan]Selected datasets:") table = Table(title="Selected datasets") table.add_column("Index", justify="left", style="cyan", no_wrap=True) @@ -168,42 +265,37 @@ def do_list_selected(self, args): ) self.console.print(table) - def do_replicas(self, args): - if len(args.arg_list) == 0: - print( - "[red] Please provide a list of indices of the [bold]selected[/bold] datasets to analyze or [bold]all[/bold] to loop on all the selected datasets" - ) + def do_replicas(self): + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) + indices = get_indices_query(selection, len(self.selected_datasets)) + if not indices: return - - if args == "all": - datasets = self.selected_datasets - else: - for index in args.arg_list: - if index.isdigit(): - if int(index) <= len(self.selected_datasets): - datasets = [self.selected_datasets[int(index) - 1]] - else: - print( - f"[red]The requested dataset is not in the list. Please insert a position <={len(self.selected_datasets)}" - ) + datasets = [self.selected_datasets[ind] for ind in indices] for dataset in datasets: with self.console.status( f"Querying rucio for replicas: [bold red]{dataset}[/]" ): - ( - outfiles, - outsites, - sites_counts, - ) = rucio_utils.get_dataset_files_replicas( - dataset, - allowlist_sites=self.sites_allowlist, - blocklist_sites=self.sites_blocklist, - regex_sites=self.sites_regex, - mode="full", - client=self.rucio_client, - ) + try: + ( + outfiles, + outsites, + sites_counts, + ) = rucio_utils.get_dataset_files_replicas( + dataset, + allowlist_sites=self.sites_allowlist, + blocklist_sites=self.sites_blocklist, + regex_sites=self.sites_regex, + mode="full", + client=self.rucio_client, + ) + except Exception as e: + print(f"\n[red bold] Exception: {e}[/]") + return self.last_replicas_results = (outfiles, outsites, sites_counts) + print(f"[cyan]Sites availability for dataset: [red]{dataset}") table = Table(title="Available replicas") table.add_column("Index", justify="center") @@ -289,122 +381,118 @@ def do_replicas(self, args): T.add(f"[cyan]{f}") self.console.print(tree) - def do_allowlist_sites(self, args): + def do_allowlist_sites(self): + sites = Prompt.ask( + "[yellow]Restrict the available sites to (comma-separated list)" + ).split(",") if self.sites_allowlist is None: - self.sites_allowlist = args.arg_list + self.sites_allowlist = sites else: - self.sites_allowlist += args.arg_list + self.sites_allowlist += sites print("[green]Allowlisted sites:") for s in self.sites_allowlist: print(f"- {s}") - def do_blocklist_sites(self, args): + def do_blocklist_sites(self): + sites = Prompt.ask("[yellow]Exclude the sites (comma-separated list)").split( + "," + ) if self.sites_blocklist is None: - self.sites_blocklist = args.arg_list + self.sites_blocklist = sites else: - self.sites_blocklist += args.arg_list + self.sites_blocklist += sites print("[red]Blocklisted sites:") for s in self.sites_blocklist: print(f"- {s}") - def do_regex_sites(self, args): - if args.startswith('"'): - args = args[1:] - if args.endswith('"'): - args = args[:-1] - self.sites_regex = rf"{args}" - print(f"New sites regex: [cyan]{self.sites_regex}") - - def do_sites_filters(self, args): - if args == "": - print("[green bold]Allow-listed sites:") - if self.sites_allowlist: - for s in self.sites_allowlist: - print(f"- {s}") - - print("[bold red]Block-listed sites:") - if self.sites_blocklist: - for s in 
self.sites_blocklist: - print(f"- {s}") - - print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") - if args == "clear": + def do_regex_sites(self): + regex = Prompt.ask("[yellow]Regex to restrict the available sites") + if len(regex): + self.sites_regex = rf"{regex}" + print(f"New sites regex: [cyan]{self.sites_regex}") + + def do_sites_filters(self): + print("[green bold]Allow-listed sites:") + if self.sites_allowlist: + for s in self.sites_allowlist: + print(f"- {s}") + + print("[bold red]Block-listed sites:") + if self.sites_blocklist: + for s in self.sites_blocklist: + print(f"- {s}") + + print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") + + if Confirm.ask("Clear sites restrinction?", default=False): self.sites_allowlist = None self.sites_blocklist = None self.sites_regex = None print("[bold green]Sites filters cleared") - def do_list_replicas(self, args): - if len(args.arg_list) == 0: - print("[red]Please call the command with the index of a selected dataset") - else: - if int(args) > len(self.selected_datasets): + def do_list_replicas(self): + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) + indices = get_indices_query(selection, len(self.selected_datasets)) + datasets = [self.selected_datasets[ind] for ind in indices] + + for dataset in datasets: + if dataset not in self.replica_results: print( - f"[red] Select the replica with index < {len(self.selected_datasets)}" + f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas [/cyan] command[/]" ) return - else: - dataset = self.selected_datasets[int(args) - 1] - if dataset not in self.replica_results: - print( - f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas {args}" - ) - return - tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") - - for site, files in self.replica_results_bysite[dataset].items(): - T = tree.add(f"[green]{site}") - for f in files: - T.add(f"[cyan]{f}") + tree = Tree(label=f"[bold orange]Replicas for [/][green]{dataset}[/]") + for site, files in self.replica_results_bysite[dataset].items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") - self.console.print(tree) + self.console.print(tree) - def do_save(self, args): + def do_save(self, filename=None): """Save the replica information in yaml format""" - if not len(args): - print("[red]Please provide an output filename and format") - return - format = os.path.splitext(args)[1] + if not filename: + filename = Prompt.ask( + "[yellow bold]Output file name (.yaml or .json)", default="output.json" + ) + format = os.path.splitext(filename)[1] output = {} for fileset, files in self.replica_results.items(): output[fileset] = {"files": files, "metadata": {}} - - with open(args, "w") as file: + with open(filename, "w") as file: if format == ".yaml": yaml.dump(output, file, default_flow_style=False) elif format == ".json": json.dump(output, file, indent=2) - print(f"[green]File {args} saved!") - - def do_preprocess(self, args): + print(f"[green]File {filename} saved!") + + def do_preprocess( + self, + output_file=None, + step_size=None, + align_to_clusters=None, + dask_cluster=None, + ): """Perform preprocessing for concrete fileset extraction. 
- Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url] + Args: output_file [step_size] [align to file cluster boundaries] [dask cluster url] """ - args_list = args.arg_list - if len(args_list) < 1: - print( - "Please provide an output name and optionally a step size, if you want to align to file clusters, or a dask cluster url" + if not output_file: + output_file = Prompt.ask( + "[yellow bold]Output name", default="output_preprocessing" ) - return - else: - output_file = args_list[0] - step_size = None - align_to_clusters = False - dask_url = None - if len(args_list) >= 2: - step_size = int(args_list[1]) - if len(args_list) >= 3: - if args_list[2] == "True": - align_to_clusters = True - elif args_list[2] == "False": - align_to_clusters = False - else: - raise ValueError('align_to_clusters must be either "True" or "False"') - if len(args_list) == 4: - dask_url = args_list[3] - if len(args_list) > 4: - print("preprocess accepts at most 3 commandline arguments!") - return + if not step_size: + step_size = IntPrompt.ask("[yellow bold]Step size", default=None) + if align_to_clusters is None: + align_to_clusters = Confirm.ask( + "[yellow bold]Align to clusters", default=True + ) + if not dask_cluster: + dask_cluster = Prompt.ask("[yellow bold]Dask cluster url", default="None") + if dask_cluster == "None": + dask_cluster = None + replicas = {} for fileset, files in self.replica_results.items(): replicas[fileset] = {"files": {f: "Events" for f in files}, "metadata": {}} @@ -412,53 +500,21 @@ def do_preprocess(self, args): with self.console.status( "[red] Preprocessing files to extract available chunks with dask[/]" ): - with Client(dask_url) as _: + with Client(dask_cluster) as _: out_available, out_updated = preprocess( replicas, maybe_step_size=step_size, align_clusters=align_to_clusters, skip_bad_files=True, ) - from IPython import embed - - embed() - with gzip.open(f"{output_file}_available.json.gz", "wb") as file: + with gzip.open(f"{output_file}_available.json.gz", "wt") as file: print(f"Saved available fileset chunks to {output_file}_available.json.gz") json.dump(out_available, file, indent=2) - with gzip.open(f"{output_file}_all.json.gz", "wb") as file: + with gzip.open(f"{output_file}_all.json.gz", "wt") as file: print(f"Saved all fileset chunks to {output_file}_all.json.gz") json.dump(out_updated, file, indent=2) if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--cli", help="Start interactive CLI for dataset discovery", action="store_true" - ) - args = parser.parse_args() - - intro_msg = r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] -Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. 
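
The preprocessing step above writes two gzip-compressed JSON filesets: *_available.json.gz with only the chunks that could be opened, and *_all.json.gz with everything. Reading one back for use in an analysis is just the reverse of the dump, shown here with the default output name offered by the prompt:

    import gzip
    import json

    with gzip.open("output_preprocessing_available.json.gz", "rt") as f:
        fileset_available = json.load(f)

    # Keys are dataset names; each entry carries the per-file "steps" chunk
    # boundaries determined by preprocess(), plus the metadata saved alongside.
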
-Some basic commands: - - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) - - [bold cyan]select (S)[/]: Select datasets to process further from query results - - [bold cyan]replicas (R)[/]: Query rucio to look for files replica and then select the preferred sites - - [bold cyan]list_selected (LS)[/]: Print a list of the selected datasets - - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset - - [bold cyan]sites_filters[/]: show the active sites filters - - [bold cyan]sites_filters clear[/]: clear all the active sites filters - - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries - - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries - - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" - - [bold cyan]save (O) OUTPUTFILE[/]: Save the replicas results to file (json or yaml) for further processing - - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) - - [bold cyan]help[/]: get help! -""" - - if args.cli: - console = Console() - console.print(intro_msg, justify="left") - app = DatasetQueryApp() - app.cmdloop() + cli = DataDiscoveryCLI() + cli.start_cli() From dd21cbc42ab65b78a0319ff43024aedf1fe494b3 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 13:12:22 +0100 Subject: [PATCH 66/80] Updated help message --- src/coffea/dataset_tools/dataset_query.py | 26 +++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 0700879fe..3a28a486d 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -128,19 +128,19 @@ def start_cli(self): r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. Some basic commands: - - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) - - [bold cyan]select (S)[/]: Select datasets to process further from query results - - [bold cyan]replicas (R)[/]: Query rucio to look for files replica and then select the preferred sites - - [bold cyan]list_selected (LS)[/]: Print a list of the selected datasets - - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset - - [bold cyan]sites_filters[/]: show the active sites filters - - [bold cyan]sites_filters clear[/]: clear all the active sites filters - - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries - - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries - - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" - - [bold cyan]save (O) OUTPUTFILE[/]: Save the replicas results to file (json or yaml) for further processing - - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) - - [bold cyan]help[/]: get help! 
+ - [bold cyan]query[/]: Look for datasets with * wildcards (like in DAS) + - [bold cyan]select[/]: Select datasets to process further from query results + - [bold cyan]replicas[/]: Query rucio to look for files replica and then select the preferred sites + - [bold cyan]query-results[/]: List the results of the last dataset query + - [bold cyan]list-selected[/]: Print a list of the selected datasets + - [bold cyan]list-replicas[/]: Print the selected files replicas for the selected dataset + - [bold cyan]sites-filters[/]: show the active sites filters and aks to clear them + - [bold cyan]allow-sites[/]: Restrict the grid sites available for replicas query only to the requested list + - [bold cyan]block-sites[/]: Exclude grid sites from the available sites for replicas query + - [bold cyan]regex-sites[/]: Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" + - [bold cyan]save[/]: Save the replicas query results to file (json or yaml) for further processing + - [bold cyan]preprocess[/]: Preprocess the replicas with dask and save the fileset for further processing with uproot/coffea + - [bold cyan]help[/]: Print this help message """ ) elif command == "login": From c07963463a32940153efe2faad21413a2b3b8383 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Dec 2023 12:13:01 +0000 Subject: [PATCH 67/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 3a28a486d..8a92e8b5d 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -9,7 +9,7 @@ from dask.distributed import Client from rich import print from rich.console import Console -from rich.prompt import Prompt, IntPrompt, Confirm +from rich.prompt import Confirm, IntPrompt, Prompt from rich.table import Table from rich.tree import Tree From f09b64f4188beb42f1d6062c1027c3f72eb55879 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 13:14:58 +0100 Subject: [PATCH 68/80] linting --- src/coffea/dataset_tools/dataset_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 3a28a486d..9f73f61fb 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -73,7 +73,7 @@ def get_indices_query(input_str: str, maxN: int) -> List[int]: ) return False final_tokens.append(i - 1) - except: + except Exception: print( "[red]Error! Bad formatting for selection string. Use e.g. 
1 4 5-9" ) From 2049913a52817e3d40d744b59055da6b38adcbce Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 11 Dec 2023 07:56:34 -0600 Subject: [PATCH 69/80] typo --- src/coffea/dataset_tools/dataset_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index b5323723f..8bf24e8a9 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -134,7 +134,7 @@ def start_cli(self): - [bold cyan]query-results[/]: List the results of the last dataset query - [bold cyan]list-selected[/]: Print a list of the selected datasets - [bold cyan]list-replicas[/]: Print the selected files replicas for the selected dataset - - [bold cyan]sites-filters[/]: show the active sites filters and aks to clear them + - [bold cyan]sites-filters[/]: show the active sites filters and ask to clear them - [bold cyan]allow-sites[/]: Restrict the grid sites available for replicas query only to the requested list - [bold cyan]block-sites[/]: Exclude grid sites from the available sites for replicas query - [bold cyan]regex-sites[/]: Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" From 524011066bfdb50e8dedab305be129263c9154b7 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 16:39:39 +0100 Subject: [PATCH 70/80] Adding non-cli interaction from datacard --- src/coffea/dataset_tools/dataset_query.py | 164 ++++++++++++++++++---- 1 file changed, 139 insertions(+), 25 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index b5323723f..61c45f559 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -6,6 +6,7 @@ from typing import List import yaml +import argparse from dask.distributed import Client from rich import print from rich.console import Console @@ -91,6 +92,7 @@ def __init__(self): self.console = Console() self.rucio_client = None self.selected_datasets = [] + self.selected_datasets_metadata = [] self.last_query = "" self.last_query_tree = None self.last_query_list = None @@ -100,6 +102,7 @@ def __init__(self): self.last_replicas_results = None self.replica_results = defaultdict(list) + self.replica_results_metadata = {} self.replica_results_bysite = {} self.commands = [ @@ -191,11 +194,12 @@ def do_whoami(self): return print(self.rucio_client.whoami()) - def do_query(self): + def do_query(self, query=None): # Your code here - query = Prompt.ask( - "[yellow bold]Query for[/]", - ) + if query is None: + query = Prompt.ask( + "[yellow bold]Query for[/]", + ) with self.console.status(f"Querying rucio for: [bold red]{query}[/]"): outlist, outtree = rucio_utils.query_dataset( query, @@ -221,16 +225,17 @@ def do_query_results(self): else: print("First [bold red]query (Q)[/] for a dataset") - def do_select(self): + def do_select(self, selection=None, metadata=None): """Selected the datasets from the list of query results. 
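
With the optional arguments added in this patch, the same DataDiscoveryCLI object can be driven entirely from a script instead of the interactive prompt. A rough sketch of that scripted use; the query string is the one used elsewhere in this series, while the metadata dict and output file name are illustrative, and do_login still requires a valid grid proxy and rucio configuration:

    from coffea.dataset_tools.dataset_query import DataDiscoveryCLI

    ddc = DataDiscoveryCLI()
    ddc.do_login()  # needs a valid grid proxy / rucio configuration
    ddc.do_query(
        "/TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM"
    )
    ddc.do_select(selection="all", metadata={"process": "ttbar_semileptonic"})
    ddc.do_replicas(mode="round-robin", selection="all")
    ddc.do_save("ttbar_fileset.json")
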
Input a list of indices also with range 4-6 or "all".""" if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return - selection = Prompt.ask( - "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" - ) + if selection is None: + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) final_tokens = get_indices_query(selection, len(self.last_query_list)) if not final_tokens: return @@ -241,6 +246,10 @@ def do_select(self): for s in final_tokens: if s < Nresults: self.selected_datasets.append(self.last_query_list[s]) + if metadata: + self.selected_datasets_metadata.append(metadata) + else: + self.selected_datasets_metadata.append({}) print(f"- ({s+1}) {self.last_query_list[s]}") else: print( @@ -265,16 +274,25 @@ def do_list_selected(self): ) self.console.print(table) - def do_replicas(self): - selection = Prompt.ask( - "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" - ) + def do_replicas(self, mode=None, selection=None): + """Query Rucio for replicas. + Mode: - None: ask the user about the mode + - round-robin (take files randomly from available sites), + - choice: ask the user to choose the specific site + """ + if selection is None: + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) indices = get_indices_query(selection, len(self.selected_datasets)) if not indices: return - datasets = [self.selected_datasets[ind] for ind in indices] + datasets = [ + (self.selected_datasets[ind], self.selected_datasets_metadata[ind]) + for ind in indices + ] - for dataset in datasets: + for dataset, dataset_metadata in datasets: with self.console.status( f"Querying rucio for replicas: [bold red]{dataset}[/]" ): @@ -314,15 +332,16 @@ def do_replicas(self): ) self.console.print(table) - strategy = Prompt.ask( - "Select sites", - choices=["round-robin", "choice", "quit"], - default="round-robin", - ) + if mode is None: + mode = Prompt.ask( + "Select sites", + choices=["round-robin", "choice", "quit"], + default="round-robin", + ) files_by_site = defaultdict(list) - if strategy == "choice": + if mode == "choice": ind = list( map( int, @@ -357,8 +376,9 @@ def do_replicas(self): return self.replica_results[dataset] = output + self.replica_results_metadata[dataset] = dataset_metadata - elif strategy == "round-robin": + elif mode == "round-robin": output = [] for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): # selecting randomly from the sites @@ -366,8 +386,9 @@ def do_replicas(self): output.append(files[iS]) files_by_site[sites[iS]].append(files[iS]) self.replica_results[dataset] = output + self.replica_results_metadata[dataset] = dataset_metadata - elif strategy == "quit": + elif mode == "quit": print("[orange]Doing nothing...") return @@ -460,7 +481,10 @@ def do_save(self, filename=None): format = os.path.splitext(filename)[1] output = {} for fileset, files in self.replica_results.items(): - output[fileset] = {"files": files, "metadata": {}} + output[fileset] = { + "files": files, + "metadata": self.replica_results_metadata[fileset], + } with open(filename, "w") as file: if format == ".yaml": yaml.dump(output, file, default_flow_style=False) @@ -495,7 +519,10 @@ def do_preprocess( replicas = {} for fileset, files in self.replica_results.items(): - replicas[fileset] = {"files": {f: "Events" for f in files}, "metadata": {}} + replicas[fileset] = { + "files": {f: "Events" for f in files}, + "metadata": 
self.replica_results_metadata[fileset],
+            }
 
         # init a local Dask cluster
         with self.console.status(
             "[red] Preprocessing files to extract available chunks with dask[/]"
@@ -516,5 +543,92 @@ def do_preprocess(
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--cli", help="Start the dataset discovery CLI", action="store_true"
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset-definition",
+        help="Dataset definition file",
+        type=str,
+        required=False,
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Output name for fileset",
+        type=str,
+        required=False,
+        default="output_fileset",
+    )
+    parser.add_argument(
+        "-as",
+        "--allow-sites",
+        help="List of sites to be allowlisted",
+        nargs="+",
+        type=str,
+    )
+    parser.add_argument(
+        "-bs",
+        "--block-sites",
+        help="List of sites to be blocklisted",
+        nargs="+",
+        type=str,
+    )
+    parser.add_argument(
+        "-rs",
+        "--regex-sites",
+        help="Regex string to be used to filter the sites",
+        type=str,
+    )
+    parser.add_argument(
+        "--replicas-strategy",
+        help="Mode for selecting replicas for datasets: [manual|round-robin|choice]",
+        default="round-robin",
+        required=False,
+    )
+    args = parser.parse_args()
+
     cli = DataDiscoveryCLI()
-    cli.start_cli()
+
+    if args.dataset_definition:
+        # Load the dataset definition if present:
+        with open(args.dataset_definition, "r") as file:
+            dataset_definition = json.load(file)
+
+        for dataset_query, dataset_meta in dataset_definition.items():
+            print(f"\nProcessing query: {dataset_query}")
+            # Adding queries
+            cli.do_query(dataset_query)
+            # Now selecting the results depending on the interactive mode or not.
+            # Metadata are passed to the selection function to associated them with the selected dataset.
+            cli.do_select(selection="all", metadata=dataset_meta)
+
+        # Now list all
+        cli.do_list_selected()
+
+        if args.allow_sites:
+            cli.sites_allowlist = args.allow_sites
+        if args.block_sites:
+            cli.sites_blocklist = args.block_sites
+        if args.regex_sites:
+            cli.sites_regex = args.regex_sites
+
+        # selecting replicas
+        if args.replicas_strategy == "manual":
+            cli.do_replicas(mode=None, selection="all")
+        else:
+            if args.replicas_strategy not in ["round-robin", "choice"]:
+                print(
+                    "Invalid replicas-strategy: please choice between manual|round-robin|choice"
+                )
+                exit(1)
+            cli.do_replicas(mode=args.replicas_strategy, selection="all")
+
+        # Now list all
+        cli.do_list_selected()
+        print("CIAO")
+
+    if args.cli:
+        cli.start_cli()

From d8260301f22c88b7d06ab305598cdbe02dea41a9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 11 Dec 2023 15:40:49 +0000
Subject: [PATCH 71/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/coffea/dataset_tools/dataset_query.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py
index 4b3c26724..2ce4dc421 100644
--- a/src/coffea/dataset_tools/dataset_query.py
+++ b/src/coffea/dataset_tools/dataset_query.py
@@ -1,3 +1,4 @@
+import argparse
 import gzip
 import json
 import os
@@ -6,7 +7,6 @@
 from typing import List
 
 import yaml
-import argparse
 from dask.distributed import Client
 from rich import print
 from rich.console import Console
@@ -594,7 +594,7 @@ def do_preprocess(
 
     if args.dataset_definition:
         # Load the dataset definition if present:
-        with open(args.dataset_definition, "r") as file:
+        with open(args.dataset_definition) as file:
             dataset_definition = json.load(file)
 
         for dataset_query, dataset_meta in dataset_definition.items():

From dd31a2a9a16840d61b3da5648fe14f01ffb49f89 Mon Sep 17 00:00:00 2001
From: Davide Valsecchi
Date: Mon, 11 Dec 2023 20:06:34 +0100
Subject: [PATCH 72/80] better defaults and typos

---
 src/coffea/dataset_tools/dataset_query.py | 88 +++++++++++++++++------
 1 file changed, 65 insertions(+), 23 deletions(-)

diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py
index 2ce4dc421..80db62e53 100644
--- a/src/coffea/dataset_tools/dataset_query.py
+++ b/src/coffea/dataset_tools/dataset_query.py
@@ -278,7 +278,7 @@ def do_replicas(self, mode=None, selection=None):
         """Query Rucio for replicas.
         Mode: - None: ask the user about the mode
               - round-robin (take files randomly from available sites),
-              - choice: ask the user to choose the specific site
+              - choose: ask the user to choose the specific site
         """
         if selection is None:
             selection = Prompt.ask(
@@ -335,13 +335,13 @@ def do_replicas(self, mode=None, selection=None):
         if mode is None:
             mode = Prompt.ask(
                 "Select sites",
-                choices=["round-robin", "choice", "quit"],
+                choices=["round-robin", "choose", "quit"],
                 default="round-robin",
             )
 
         files_by_site = defaultdict(list)
 
-        if mode == "choice":
+        if mode == "choose":
             ind = list(
                 map(
                     int,
@@ -432,7 +432,7 @@ def do_regex_sites(self):
             self.sites_regex = rf"{regex}"
             print(f"New sites regex: [cyan]{self.sites_regex}")
 
-    def do_sites_filters(self):
+    def do_sites_filters(self, ask_clear=True):
         print("[green bold]Allow-listed sites:")
         if self.sites_allowlist:
             for s in self.sites_allowlist:
@@ -445,11 +445,12 @@ def do_sites_filters(self):
 
         print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}")
 
-        if Confirm.ask("Clear sites restrinction?", default=False):
-            self.sites_allowlist = None
-            self.sites_blocklist = None
-            self.sites_regex = None
-            print("[bold green]Sites filters cleared")
+        if ask_clear:
+            if Confirm.ask("Clear sites restrinction?", default=False):
+                self.sites_allowlist = None
+                self.sites_blocklist = None
+                self.sites_regex = None
+                print("[bold green]Sites filters cleared")
 
     def do_list_replicas(self):
         selection = Prompt.ask(
@@ -506,13 +507,13 @@ def do_preprocess(
             output_file = Prompt.ask(
                 "[yellow bold]Output name", default="output_preprocessing"
             )
-        if not step_size:
+        if step_size is None:
             step_size = IntPrompt.ask("[yellow bold]Step size", default=None)
         if align_to_clusters is None:
             align_to_clusters = Confirm.ask(
                 "[yellow bold]Align to clusters", default=True
             )
-        if not dask_cluster:
+        if dask_cluster is None:
             dask_cluster = Prompt.ask("[yellow bold]Dask cluster url", default="None")
         if dask_cluster == "None":
             dask_cluster = None
@@ -557,11 +558,26 @@ def do_preprocess(
     parser.add_argument(
         "-o",
         "--output",
+        help="Output name for dataset discovery output (no fileset preprocessing)",
+        type=str,
+        required=False,
+        default="output_dataset",
+    )
+    parser.add_argument(
+        "-fo",
+        "--fileset-output",
         help="Output name for fileset",
         type=str,
         required=False,
         default="output_fileset",
     )
+    parser.add_argument(
+        "-p", "--preprocess", help="Preprocess with dask", action="store_true"
+    )
+    parser.add_argument(
+        "--step-size", help="Step size for preprocessing", type=int, default=500000
+    )
+    parser.add_argument("--dask-cluster", help="Dask cluster url", type=str, default="")
     parser.add_argument(
         "-as",
         "--allow-sites",
         help="List of sites to be allowlisted",
         nargs="+",
         type=str,
     )
     parser.add_argument(
         "-bs",
         "--block-sites",
         help="List of sites to be blocklisted",
         nargs="+",
         type=str,
     )
     parser.add_argument(
         "-rs",
         "--regex-sites",
         help="Regex string to be used to filter the sites",
         type=str,
     )
     parser.add_argument(
+        "--query-results-strategy",
+        help="Mode for query results selection: [all|manual]",
+        type=str,
+        default="all",
+    )
     parser.add_argument(
         "--replicas-strategy",
-        help="Mode for selecting replicas for datasets: [manual|round-robin|choice]",
+        help="Mode for selecting replicas for datasets: [manual|round-robin|choose]",
         default="round-robin",
         required=False,
     )
@@ -592,6 +614,13 @@ def do_preprocess(
 
     cli = DataDiscoveryCLI()
 
+    if args.allow_sites:
+        cli.sites_allowlist = args.allow_sites
+    if args.block_sites:
+        cli.sites_blocklist = args.block_sites
+    if args.regex_sites:
+        cli.sites_regex = args.regex_sites
+
     if args.dataset_definition:
         # Load the dataset definition if present:
         with open(args.dataset_definition) as file:
             dataset_definition = json.load(file)
@@ -603,32 +632,45 @@ def do_preprocess(
             cli.do_query(dataset_query)
             # Now selecting the results depending on the interactive mode or not.
             # Metadata are passed to the selection function to associated them with the selected dataset.
-            cli.do_select(selection="all", metadata=dataset_meta)
+            if args.query_results_strategy not in ["all", "manual"]:
+                print(
+                    "Invalid query-results-strategy option: please choose between: manual|all"
+                )
+                exit(1)
+            elif args.query_results_strategy == "manual":
+                cli.do_select(selection=None, metadata=dataset_meta)
+            else:
+                cli.do_select(selection="all", metadata=dataset_meta)
 
         # Now list all
        cli.do_list_selected()
 
-        if args.allow_sites:
-            cli.sites_allowlist = args.allow_sites
-        if args.block_sites:
-            cli.sites_blocklist = args.block_sites
-        if args.regex_sites:
-            cli.sites_regex = args.regex_sites
-
         # selecting replicas
+        cli.do_sites_filters(ask_clear=False)
+        print("Getting replicas")
         if args.replicas_strategy == "manual":
             cli.do_replicas(mode=None, selection="all")
         else:
-            if args.replicas_strategy not in ["round-robin", "choice"]:
+            if args.replicas_strategy not in ["round-robin", "choose"]:
                 print(
-                    "Invalid replicas-strategy: please choice between manual|round-robin|choice"
+                    "Invalid replicas-strategy: please choose between manual|round-robin|choose"
                 )
                 exit(1)
             cli.do_replicas(mode=args.replicas_strategy, selection="all")
 
         # Now list all
         cli.do_list_selected()
-        print("CIAO")
+
+        # Save
+        if args.output:
+            cli.do_save(filename=args.output)
+        if args.preprocess:
+            cli.do_preprocess(
+                output_file=args.fileset_output,
+                step_size=args.step_size,
+                dask_cluster=args.dask_cluster,
+                align_to_clusters=False,
+            )
 
     if args.cli:
         cli.start_cli()

From 6986a109ff590638295501c2552ea2033eeb1b92 Mon Sep 17 00:00:00 2001
From: Davide Valsecchi
Date: Mon, 11 Dec 2023 23:17:43 +0100
Subject: [PATCH 73/80] Adding docs and dataset_discovery notebook

---
 binder/dataset_discovery.ipynb            | 1436 +++++++++++++++++++++
 src/coffea/dataset_tools/dataset_query.py |  122 +-
 src/coffea/dataset_tools/rucio_utils.py   |   25 +-
 3 files changed, 1524 insertions(+), 59 deletions(-)
 create mode 100644 binder/dataset_discovery.ipynb

diff --git a/binder/dataset_discovery.ipynb b/binder/dataset_discovery.ipynb
new file mode 100644
index 000000000..9c29063fe
--- /dev/null
+++ b/binder/dataset_discovery.ipynb
@@ -0,0 +1,1436 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c5754206-f41b-4e08-bc4d-496df85e8194",
+   "metadata": {},
+   "source": [
+    "# Dataset discovery tools"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "42242097-c04e-459e-9f3a-1d746df4e9dd",
+   "metadata": {},
+   "source": [
+    "# Using Rucio utils directly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "09103c77-b8e6-4d61-920b-b1ff8fba8791",
+   "metadata": {},
+   "outputs": [],
"source": [ + "from coffea.dataset_tools import rucio_utils\n", + "from coffea.dataset_tools.dataset_query import print_dataset_query\n", + "from rich.console import Console\n", + "from rich.table import Table" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d62b43cb-53c0-4e2d-b571-1a0683e34dc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = rucio_utils.get_rucio_client()\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0359afc0-fc98-4aa8-acf4-288ef19ac7db", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"/TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "659bee88-9fb0-4d1a-9544-a97372595f18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/TTToSemiLeptonic_TuneCP5CR1_erdON_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',\n", + " '/TTToSemiLeptonic_TuneCP5CR2_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM',\n", + " '/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',\n", + " '/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outlist, outtree = rucio_utils.query_dataset(\n", + " query,\n", + " client=client,\n", + " tree=True,\n", + " scope=\"cms\", \n", + " )\n", + "\n", + "outlist[1:5]" + ] + }, + { + "cell_type": "markdown", + "id": "9bc2a454-4915-4366-9c02-2e389e9eb6fb", + "metadata": {}, + "source": [ + "Let's now pretty-print the results in a table using an utility function in the `dataset_query` module." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4487d997-dc22-4a47-87df-4da14fa5b35a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
              Query: /TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM               \n",
+       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┓\n",
+       "┃ Name                               Tag                                                                        ┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇┩\n",
+       "│ TTToSemiLeptonic_TuneCP5CR1_13Te…  (1) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5CR1_erdO…  (2) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5CR2_13Te…  (3) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5_13TeV-p…  (4) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN… ││\n",
+       "│                                    (5) RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v… │\n",
+       "│                                    (6) RunIISummer20UL18NanoAODv9-PUForMUOVal_106X_upgrade2018_realistic_v16… ││\n",
+       "│                                    (7) RunIISummer20UL18NanoAODv9-PUForTRK_TRK_106X_upgrade2018_realistic_v1… │\n",
+       "│                                    (8) RunIISummer20UL18NanoAODv9-PUForTRKv2_TRKv2_106X_upgrade2018_realisti… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5_erdON_1…  (9) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5down_13T…  (10) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5up_13TeV…  (11) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_Vcb_TuneCP5_13T…  (12) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_hdampDOWN_TuneC…  (13) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_hdampUP_TuneCP5…  (14) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop166p5_TuneC…  (15) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop169p5_TuneC…  (16) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop171p5_TuneC…  (17) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop173p5_TuneC…  (18) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop175p5_TuneC…  (19) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop178p5_TuneC…  (20) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx0p55_Tune…  (21) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx0p7_TuneC…  (22) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx0p85_Tune…  (23) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx1p15_Tune…  (24) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx1p3_TuneC…  (25) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx1p45_Tune…  (26) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "└───────────────────────────────────┴────────────────────────────────────────────────────────────────────────────┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Query: \u001b[0m\u001b[1;3;31m/TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM\u001b[0m\u001b[3m \u001b[0m\n", + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mTag \u001b[0m\u001b[1m \u001b[0m┃┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇┩\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5CR1_13Te…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(1)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_TuneCP5CR1_erdO…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(2)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5CR2_13Te…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(3)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_TuneCP5_13TeV-p…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(4)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN…\u001b[0m\u001b[35m \u001b[0m││\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(5)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "│\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(6)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-PUForMUOVal_106X_upgrade2018_realistic_v16…\u001b[0m\u001b[35m \u001b[0m││\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(7)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-PUForTRK_TRK_106X_upgrade2018_realistic_v1…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "│\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(8)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-PUForTRKv2_TRKv2_106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5_erdON_1…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(9)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_TuneCP5down_13T…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m 
\u001b[0m\u001b[1;35m(10)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5up_13TeV…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(11)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_Vcb_TuneCP5_13T…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(12)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_hdampDOWN_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(13)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_hdampUP_TuneCP5…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(14)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_mtop166p5_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(15)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_mtop169p5_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(16)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_mtop171p5_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(17)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_mtop173p5_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(18)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_mtop175p5_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(19)\u001b[0m\u001b[2;35m 
RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_mtop178p5_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(20)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_widthx0p55_Tune…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(21)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_widthx0p7_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(22)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_widthx0p85_Tune…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(23)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_widthx1p15_Tune…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(24)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_widthx1p3_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(25)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_widthx1p45_Tune…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(26)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "└───────────────────────────────────┴────────────────────────────────────────────────────────────────────────────┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "console = Console()\n", + "print_dataset_query(query, outtree, console)" + ] + }, + { + "cell_type": "markdown", + "id": "c213d5fc-6424-4cdf-8751-88ced7987a59", + "metadata": {}, + "source": [ + "### Dataset replicas" + ] + }, + { + "cell_type": "markdown", + "id": "961b4ad8-e3d6-49b1-a2ce-7cad49b46f06", + "metadata": {}, + "source": [ + "Let's select one dataset and look for available replicas" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d08fd6ed-4b3a-4e9f-994a-d1bd529421a7", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "'/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = outlist[0]\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "id": "a605fb64-6e0b-4fbe-8807-84b9d75f2d53", + "metadata": {}, + "source": [ + "Using the option `mode='full'` in the function `rucio_utils.get_dataset_file_replicas()` one gets all the available replicas. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2d64069e-ea8f-48c2-bd33-43fc555f6ec8", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[],\n", + " blocklist_sites=[],\n", + " regex_sites=[],\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3e4fc6c2-f378-40d2-a4ea-f265b6c18887", + "metadata": {}, + "outputs": [], + "source": [ + "def print_replicas(sites_counts):\n", + " console.print(f\"[cyan]Sites availability for dataset: [red]{dataset}\")\n", + " table = Table(title=\"Available replicas\")\n", + " table.add_column(\"Index\", justify=\"center\")\n", + " table.add_column(\"Site\", justify=\"left\", style=\"cyan\", no_wrap=True)\n", + " table.add_column(\"Files\", style=\"magenta\", no_wrap=True)\n", + " table.add_column(\"Availability\", justify=\"center\")\n", + " table.row_styles = [\"dim\", \"none\"]\n", + " Nfiles = len(outfiles)\n", + " \n", + " sorted_sites = dict(\n", + " sorted(sites_counts.items(), key=lambda x: x[1], reverse=True)\n", + " )\n", + " for i, (site, stat) in enumerate(sorted_sites.items()):\n", + " table.add_row(\n", + " str(i), site, f\"{stat} / {Nfiles}\", f\"{stat*100/Nfiles:.1f}%\"\n", + " )\n", + " console.print(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "79c68044-dc3b-4dd5-a0d3-c3f6ddd0bea1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    Available replicas                    \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           294 / 294     100.0%    │\n",
+       "│   1   │ T1_DE_KIT_Disk       294 / 294 │    100.0%    │\n",
+       "│   2    T1_UK_RAL_Disk       294 / 294     100.0%    │\n",
+       "│   3   │ T1_RU_JINR_Disk      294 / 294 │    100.0%    │\n",
+       "│   4    T3_CH_PSI            294 / 294     100.0%    │\n",
+       "│   5   │ T3_KR_UOS            294 / 294 │    100.0%    │\n",
+       "│   6    T1_US_FNAL_Disk      193 / 294     65.6%     │\n",
+       "│   7   │ T2_US_Nebraska       99 / 294  │    33.7%     │\n",
+       "│   8    T1_IT_CNAF_Disk      58 / 294      19.7%     │\n",
+       "│   9   │ T2_US_Purdue         53 / 294  │    18.0%     │\n",
+       "│  10    T2_BE_IIHE           50 / 294      17.0%     │\n",
+       "│  11   │ T2_US_MIT            50 / 294  │    17.0%     │\n",
+       "│  12    T1_ES_PIC_Disk       43 / 294      14.6%     │\n",
+       "│  13   │ T2_US_Vanderbilt     40 / 294  │    13.6%     │\n",
+       "│  14    T2_BR_SPRACE         39 / 294      13.3%     │\n",
+       "│  15   │ T2_US_Florida        33 / 294  │    11.2%     │\n",
+       "│  16    T2_IT_Legnaro        28 / 294       9.5%     │\n",
+       "│  17   │ T2_US_UCSD           28 / 294  │     9.5%     │\n",
+       "│  18    T2_UA_KIPT           26 / 294       8.8%     │\n",
+       "│  19   │ T2_US_Caltech        24 / 294  │     8.2%     │\n",
+       "│  20    T2_US_Wisconsin      22 / 294       7.5%     │\n",
+       "│  21   │ T2_TR_METU           18 / 294  │     6.1%     │\n",
+       "│  22    T2_ES_CIEMAT         17 / 294       5.8%     │\n",
+       "│  23   │ T2_DE_RWTH           11 / 294  │     3.7%     │\n",
+       "│  24    T2_BR_UERJ           7 / 294        2.4%     │\n",
+       "│  25   │ T2_UK_SGrid_Bristol  3 / 294   │     1.0%     │\n",
+       "│  26    T2_ES_IFCA           2 / 294        0.7%     │\n",
+       "└───────┴─────────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_DE_KIT_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_UK_RAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT1_RU_JINR_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT3_CH_PSI \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT3_KR_UOS \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_US_FNAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m193 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 65.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT2_US_Nebraska \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m99 / 294 \u001b[0m\u001b[35m \u001b[0m│ 33.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m58 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_US_Purdue \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m53 / 294 \u001b[0m\u001b[35m \u001b[0m│ 18.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BE_IIHE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m50 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 17.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_US_MIT \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m50 / 294 \u001b[0m\u001b[35m \u001b[0m│ 17.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 12 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_ES_PIC_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m43 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 14.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 13 │\u001b[36m \u001b[0m\u001b[36mT2_US_Vanderbilt \u001b[0m\u001b[36m 
\u001b[0m│\u001b[35m \u001b[0m\u001b[35m40 / 294 \u001b[0m\u001b[35m \u001b[0m│ 13.6% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 14 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_SPRACE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 13.3% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 15 │\u001b[36m \u001b[0m\u001b[36mT2_US_Florida \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m33 / 294 \u001b[0m\u001b[35m \u001b[0m│ 11.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 16 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_IT_Legnaro \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m28 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 17 │\u001b[36m \u001b[0m\u001b[36mT2_US_UCSD \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m28 / 294 \u001b[0m\u001b[35m \u001b[0m│ 9.5% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 18 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UA_KIPT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m26 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 8.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 19 │\u001b[36m \u001b[0m\u001b[36mT2_US_Caltech \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m24 / 294 \u001b[0m\u001b[35m \u001b[0m│ 8.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 20 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Wisconsin \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m22 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 7.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 21 │\u001b[36m \u001b[0m\u001b[36mT2_TR_METU \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m18 / 294 \u001b[0m\u001b[35m \u001b[0m│ 6.1% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 22 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_CIEMAT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m17 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 5.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 23 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 294 \u001b[0m\u001b[35m \u001b[0m│ 3.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 24 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_UERJ \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m7 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 2.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 25 │\u001b[36m \u001b[0m\u001b[36mT2_UK_SGrid_Bristol\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m3 / 294 \u001b[0m\u001b[35m \u001b[0m│ 1.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 26 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_IFCA \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 0.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "└───────┴─────────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "markdown", + "id": "c9544ceb-5949-4bd3-b997-14da4aa2d956", + "metadata": {}, + "source": [ + "### Filtering sites\n", + "Grid sites can be filtered in 3 different ways\n", + "- **allowlist**: if this list of specified, only the sites in the list are considered. 
No blocklist and regex are considered\n", + "- **blocklist**: if this list is specified, those sites are excluded from the replicas\n", + "- **regex_sites**: regex filter the sites to be considered, on top of the blocklist" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1f6b586c-a8b7-40d8-a25a-b02e94f4a892", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                  Available replicas                  \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site             Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY       294 / 294     100.0%    │\n",
+       "│   1   │ T1_US_FNAL_Disk  193 / 294 │    65.6%     │\n",
+       "└───────┴─────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_US_FNAL_Disk\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m193 / 294\u001b[0m\u001b[35m \u001b[0m│ 65.6% │\n", + "└───────┴─────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Example with allowlist\n", + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[\"T2_DE_DESY\", \"T1_US_FNAL_Disk\"],\n", + " blocklist_sites=[],\n", + " regex_sites=None,\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")\n", + "\n", + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "12f7e403-67fe-42c0-a3ee-a668006b1836", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    Available replicas                    \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T1_DE_KIT_Disk       294 / 294     100.0%    │\n",
+       "│   1   │ T1_UK_RAL_Disk       294 / 294 │    100.0%    │\n",
+       "│   2    T1_RU_JINR_Disk      294 / 294     100.0%    │\n",
+       "│   3   │ T3_KR_UOS            294 / 294 │    100.0%    │\n",
+       "│   4    T1_US_FNAL_Disk      193 / 294     65.6%     │\n",
+       "│   5   │ T2_US_Nebraska       99 / 294  │    33.7%     │\n",
+       "│   6    T1_IT_CNAF_Disk      58 / 294      19.7%     │\n",
+       "│   7   │ T2_US_Purdue         53 / 294  │    18.0%     │\n",
+       "│   8    T2_BE_IIHE           50 / 294      17.0%     │\n",
+       "│   9   │ T2_US_MIT            50 / 294  │    17.0%     │\n",
+       "│  10    T1_ES_PIC_Disk       43 / 294      14.6%     │\n",
+       "│  11   │ T2_US_Vanderbilt     40 / 294  │    13.6%     │\n",
+       "│  12    T2_BR_SPRACE         39 / 294      13.3%     │\n",
+       "│  13   │ T2_US_Florida        33 / 294  │    11.2%     │\n",
+       "│  14    T2_IT_Legnaro        28 / 294       9.5%     │\n",
+       "│  15   │ T2_US_UCSD           28 / 294  │     9.5%     │\n",
+       "│  16    T2_UA_KIPT           26 / 294       8.8%     │\n",
+       "│  17   │ T2_US_Caltech        24 / 294  │     8.2%     │\n",
+       "│  18    T2_US_Wisconsin      22 / 294       7.5%     │\n",
+       "│  19   │ T2_TR_METU           18 / 294  │     6.1%     │\n",
+       "│  20    T2_ES_CIEMAT         17 / 294       5.8%     │\n",
+       "│  21   │ T2_DE_RWTH           11 / 294  │     3.7%     │\n",
+       "│  22    T2_BR_UERJ           7 / 294        2.4%     │\n",
+       "│  23   │ T2_UK_SGrid_Bristol  3 / 294   │     1.0%     │\n",
+       "│  24    T2_ES_IFCA           2 / 294        0.7%     │\n",
+       "└───────┴─────────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_DE_KIT_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_UK_RAL_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_RU_JINR_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT3_KR_UOS \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_US_FNAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m193 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 65.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_US_Nebraska \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m99 / 294 \u001b[0m\u001b[35m \u001b[0m│ 33.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m58 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT2_US_Purdue \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m53 / 294 \u001b[0m\u001b[35m \u001b[0m│ 18.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BE_IIHE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m50 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 17.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_US_MIT \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m50 / 294 \u001b[0m\u001b[35m \u001b[0m│ 17.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_ES_PIC_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m43 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 14.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_US_Vanderbilt \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m40 / 294 \u001b[0m\u001b[35m \u001b[0m│ 13.6% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 12 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_SPRACE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 13.3% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 13 │\u001b[36m \u001b[0m\u001b[36mT2_US_Florida \u001b[0m\u001b[36m 
\u001b[0m│\u001b[35m \u001b[0m\u001b[35m33 / 294 \u001b[0m\u001b[35m \u001b[0m│ 11.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 14 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_IT_Legnaro \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m28 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 15 │\u001b[36m \u001b[0m\u001b[36mT2_US_UCSD \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m28 / 294 \u001b[0m\u001b[35m \u001b[0m│ 9.5% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 16 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UA_KIPT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m26 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 8.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 17 │\u001b[36m \u001b[0m\u001b[36mT2_US_Caltech \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m24 / 294 \u001b[0m\u001b[35m \u001b[0m│ 8.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 18 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Wisconsin \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m22 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 7.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 19 │\u001b[36m \u001b[0m\u001b[36mT2_TR_METU \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m18 / 294 \u001b[0m\u001b[35m \u001b[0m│ 6.1% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 20 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_CIEMAT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m17 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 5.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 21 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 294 \u001b[0m\u001b[35m \u001b[0m│ 3.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 22 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_UERJ \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m7 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 2.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 23 │\u001b[36m \u001b[0m\u001b[36mT2_UK_SGrid_Bristol\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m3 / 294 \u001b[0m\u001b[35m \u001b[0m│ 1.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 24 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_IFCA \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 0.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "└───────┴─────────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Example with blocklist\n", + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[],\n", + " blocklist_sites=[\"T2_DE_DESY\", \"T3_CH_PSI\"],\n", + " regex_sites=None,\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")\n", + "\n", + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f5dafcc2-c32e-4e33-9878-183a8e476b73", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    Available replicas                    \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           294 / 294     100.0%    │\n",
+       "│   1   │ T1_DE_KIT_Disk       294 / 294 │    100.0%    │\n",
+       "│   2    T1_UK_RAL_Disk       294 / 294     100.0%    │\n",
+       "│   3   │ T3_CH_PSI            294 / 294 │    100.0%    │\n",
+       "│   4    T1_IT_CNAF_Disk      58 / 294      19.7%     │\n",
+       "│   5   │ T2_BE_IIHE           50 / 294  │    17.0%     │\n",
+       "│   6    T1_ES_PIC_Disk       43 / 294      14.6%     │\n",
+       "│   7   │ T2_IT_Legnaro        28 / 294  │     9.5%     │\n",
+       "│   8    T2_ES_CIEMAT         17 / 294       5.8%     │\n",
+       "│   9   │ T2_DE_RWTH           11 / 294  │     3.7%     │\n",
+       "│  10    T2_UK_SGrid_Bristol  3 / 294        1.0%     │\n",
+       "│  11   │ T2_ES_IFCA           2 / 294   │     0.7%     │\n",
+       "└───────┴─────────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_DE_KIT_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_UK_RAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT3_CH_PSI \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m58 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_BE_IIHE \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m50 / 294 \u001b[0m\u001b[35m \u001b[0m│ 17.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_ES_PIC_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m43 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 14.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT2_IT_Legnaro \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m28 / 294 \u001b[0m\u001b[35m \u001b[0m│ 9.5% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_CIEMAT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m17 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 5.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 294 \u001b[0m\u001b[35m \u001b[0m│ 3.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_SGrid_Bristol\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m3 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 1.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_ES_IFCA \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m2 / 294 \u001b[0m\u001b[35m \u001b[0m│ 0.7% │\n", + "└───────┴─────────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Example with regex\n", + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[],\n", + " blocklist_sites=[],\n", 
+ " regex_sites= r\"T[123]_(FR|IT|BE|CH|DE|ES|UK)_\\w+\",\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")\n", + "\n", + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "markdown", + "id": "0b805dde-dd38-46a4-92ad-55ab2e4a4876", + "metadata": {}, + "source": [ + "# Using the DataDiscoveryCLI\n", + "Manipulating the dataset query and replicas is simplified by the `DataDiscoveryCLI` class in `dataset_query` module." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "39846193-d6f2-4de5-ba42-a089d1b0786d", + "metadata": {}, + "outputs": [], + "source": [ + "from coffea.dataset_tools import rucio_utils\n", + "from coffea.dataset_tools.dataset_query import print_dataset_query\n", + "from rich.console import Console\n", + "from rich.table import Table\n", + "from coffea.dataset_tools.dataset_query import DataDiscoveryCLI" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "eaba3e39-c95a-4282-83e2-3aadf748adca", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_definition = {\n", + " \"/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X*/NANOAODSIM\": {\"short_name\": \"ZJets\",\n", + " \"metadata\": {\"xsec\": 100.0,\"isMC\":True}},\n", + " \"/SingleMuon/Run2018C-UL20*_MiniAODv2_NanoAODv9_GT36*/NANOAOD\": {\"short_name\": \"SingleMuon\", \"metadata\": {\"isMC\":False}}\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "716a6c0c-ea07-498a-a010-f9e7f87ba3a3", + "metadata": {}, + "outputs": [], + "source": [ + "ddc = DataDiscoveryCLI()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "51a8ef69-5d4e-4089-b3a8-d0290cc973c1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on method load_dataset_definition in module coffea.dataset_tools.dataset_query:\n", + "\n", + "load_dataset_definition(dataset_definition, query_results_strategy='all', replicas_strategy='round-robin') method of coffea.dataset_tools.dataset_query.DataDiscoveryCLI instance\n", + "\n" + ] + } + ], + "source": [ + "help(ddc.load_dataset_definition)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "db3ab214-93d3-49b1-b6e1-9374b9fcc1f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m⠧\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Sites availability for dataset: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\u001b[31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\u001b[31mNANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                   Available replicas                   \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files    Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           67 / 67     100.0%    │\n",
+       "│   1   │ T3_KR_KISTI          67 / 67 │    100.0%    │\n",
+       "│   2    T2_TW_NCHC           67 / 67     100.0%    │\n",
+       "│   3   │ T2_BE_IIHE           67 / 67 │    100.0%    │\n",
+       "│   4    T2_US_Purdue         67 / 67     100.0%    │\n",
+       "│   5   │ T2_ES_CIEMAT         67 / 67 │    100.0%    │\n",
+       "│   6    T3_FR_IPNL           67 / 67     100.0%    │\n",
+       "│   7   │ T1_US_FNAL_Disk      61 / 67 │    91.0%     │\n",
+       "│   8    T2_UK_London_IC      39 / 67     58.2%     │\n",
+       "│   9   │ T1_FR_CCIN2P3_Disk   38 / 67 │    56.7%     │\n",
+       "│  10    T2_US_Caltech        26 / 67     38.8%     │\n",
+       "│  11   │ T2_CH_CERN           25 / 67 │    37.3%     │\n",
+       "│  12    T2_DE_RWTH           22 / 67     32.8%     │\n",
+       "│  13   │ T1_IT_CNAF_Disk      20 / 67 │    29.9%     │\n",
+       "│  14    T2_US_Wisconsin      16 / 67     23.9%     │\n",
+       "│  15   │ T2_US_Florida        16 / 67 │    23.9%     │\n",
+       "│  16    T2_US_Nebraska       13 / 67     19.4%     │\n",
+       "│  17   │ T2_TR_METU           11 / 67 │    16.4%     │\n",
+       "│  18    T1_DE_KIT_Disk       11 / 67     16.4%     │\n",
+       "│  19   │ T2_UK_SGrid_RALPP    6 / 67  │     9.0%     │\n",
+       "│  20    T2_IT_Legnaro        6 / 67       9.0%     │\n",
+       "│  21   │ T2_ES_IFCA           4 / 67  │     6.0%     │\n",
+       "│  22    T2_FR_IPHC           2 / 67       3.0%     │\n",
+       "│  23   │ T2_UK_London_Brunel  1 / 67  │     1.5%     │\n",
+       "└───────┴─────────────────────┴─────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT3_KR_KISTI \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_TW_NCHC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT2_BE_IIHE \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Purdue \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_ES_CIEMAT \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT3_FR_IPNL \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT1_US_FNAL_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m61 / 67\u001b[0m\u001b[35m \u001b[0m│ 91.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_London_IC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 58.2% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT1_FR_CCIN2P3_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m38 / 67\u001b[0m\u001b[35m \u001b[0m│ 56.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Caltech \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m26 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 38.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_CH_CERN \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m25 / 67\u001b[0m\u001b[35m \u001b[0m│ 37.3% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 12 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_RWTH \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m22 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 32.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 13 │\u001b[36m \u001b[0m\u001b[36mT1_IT_CNAF_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m20 
/ 67\u001b[0m\u001b[35m \u001b[0m│ 29.9% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 14 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Wisconsin \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m16 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 23.9% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 15 │\u001b[36m \u001b[0m\u001b[36mT2_US_Florida \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m16 / 67\u001b[0m\u001b[35m \u001b[0m│ 23.9% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 16 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Nebraska \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m13 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 17 │\u001b[36m \u001b[0m\u001b[36mT2_TR_METU \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 67\u001b[0m\u001b[35m \u001b[0m│ 16.4% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 18 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_DE_KIT_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m11 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 16.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 19 │\u001b[36m \u001b[0m\u001b[36mT2_UK_SGrid_RALPP \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m6 / 67 \u001b[0m\u001b[35m \u001b[0m│ 9.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 20 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_IT_Legnaro \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m6 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 21 │\u001b[36m \u001b[0m\u001b[36mT2_ES_IFCA \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m4 / 67 \u001b[0m\u001b[35m \u001b[0m│ 6.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 22 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_FR_IPHC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 3.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 23 │\u001b[36m \u001b[0m\u001b[36mT2_UK_London_Brunel\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1 / 67 \u001b[0m\u001b[35m \u001b[0m│ 1.5% │\n", + "└───────┴─────────────────────┴─────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Replicas for /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "├── T2_US_Wisconsin\n",
+       "│   ├── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
+       "│   └── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│       -v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
+       "├── T2_DE_DESY\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
+       "│   └── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│       36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
+       "├── T3_FR_IPNL\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
+       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│       Dv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
+       "├── T1_US_FNAL_Disk\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
+       "│   └── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│       Dv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
+       "├── T2_US_Caltech\n",
+       "│   ├── root://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\n",
+       "│   │   _GT36-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
+       "│   └── root://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\n",
+       "│       _GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
+       "├── T2_TR_METU\n",
+       "│   ├── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
+       "│   │   _MiniAODv2_NanoAODv9_GT36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
+       "│   ├── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
+       "│   │   _MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
+       "│   └── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
+       "│       _MiniAODv2_NanoAODv9_GT36-v1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
+       "├── T2_UK_London_IC\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
+       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
+       "├── T1_IT_CNAF_Disk\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
+       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "├── T3_KR_KISTI\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "│   └── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│       -v1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
+       "├── T2_DE_RWTH\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
+       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│       _NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "├── T2_US_Purdue\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\n",
+       "│   └── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\n",
+       "├── T2_US_Florida\n",
+       "│   ├── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
+       "│   └── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│       520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "├── T2_TW_NCHC\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
+       "│   └── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│       1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
+       "├── T2_BE_IIHE\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
+       "│   └── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│       0000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "├── T2_US_Nebraska\n",
+       "│   └── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│       1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
+       "├── T2_ES_CIEMAT\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
+       "│   └── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│       /2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
+       "├── T1_FR_CCIN2P3_Disk\n",
+       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "├── T2_IT_Legnaro\n",
+       "│   └── root://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\n",
+       "│       v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
+       "└── T2_CH_CERN\n",
+       "    └── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "        520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
+       "
\n" + ], + "text/plain": [ + "Replicas for \u001b[32m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n", + "├── \u001b[32mT2_US_Wisconsin\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ \u001b[36m-v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", + "├── \u001b[32mT2_DE_DESY\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", + "│ └── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ \u001b[36m36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", + "├── \u001b[32mT3_FR_IPNL\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", + "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", + "├── \u001b[32mT1_US_FNAL_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", + "│ 
├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ \u001b[36mDv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", + "├── \u001b[32mT2_US_Caltech\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\u001b[0m\n", + "│ │ \u001b[36m_GT36-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\u001b[0m\n", + "│ \u001b[36m_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", + "├── \u001b[32mT2_TR_METU\u001b[0m\n", + "│ ├── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", + "│ │ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", + "│ │ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", + "│ └── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", + "│ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", + "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", + "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", + "├── \u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", + "│ └── 
\u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "├── \u001b[32mT3_KR_KISTI\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "│ └── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ \u001b[36m-v1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", + "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", + "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "├── \u001b[32mT2_US_Purdue\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\u001b[0m\n", + "│ └── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\u001b[0m\n", + "├── \u001b[32mT2_US_Florida\u001b[0m\n", + "│ ├── 
\u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ \u001b[36m520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "├── \u001b[32mT2_TW_NCHC\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", + "│ └── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ \u001b[36m1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", + "├── \u001b[32mT2_BE_IIHE\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", + "│ └── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ \u001b[36m0000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "├── \u001b[32mT2_US_Nebraska\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ \u001b[36m1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", + "├── \u001b[32mT2_ES_CIEMAT\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", + "│ └── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ \u001b[36m/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", + "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", + "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "├── \u001b[32mT2_IT_Legnaro\u001b[0m\n", + "│ └── \u001b[36mroot://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\u001b[0m\n", + "│ \u001b[36mv1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n", + "└── \u001b[32mT2_CH_CERN\u001b[0m\n", + " └── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + " \u001b[36m520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", + "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", + "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", + "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.load_dataset_definition(dataset_definition)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "dd9ca4ea-039d-4ebb-bbf2-79092ba6e7d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", + "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", + "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", + "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_list_selected()" + ] + }, + { + "cell_type": "markdown", + "id": "a6ffbefb-8276-4733-aedb-cc12898f4ed8", + "metadata": {}, + "source": [ + "### Save the replicas metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b0e3e4b8-34d4-4558-988a-edacd1df9b37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
File replicas_info.json saved!\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32mFile replicas_info.json saved!\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_save(\"replicas_info.json\")" + ] + }, + { + "cell_type": "markdown", + "id": "f7d52663-c5e3-4abe-9c2f-4bf8f08d8919", + "metadata": {}, + "source": [ + "## Preprocess the fileset with dask" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "125cd0ea-ff05-414a-9177-2be98eb88362", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m\n", + "\u001b[0mddc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_preprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0moutput_file\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mstep_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0malign_to_clusters\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdask_cluster\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m\n", + "Perform preprocessing for concrete fileset extraction.\n", + "Args: output_file [step_size] [align to file cluster boundaries] [dask cluster url]\n", + "\u001b[0;31mFile:\u001b[0m /work/dvalsecc/coffea/src/coffea/dataset_tools/dataset_query.py\n", + "\u001b[0;31mType:\u001b[0m method" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_preprocess?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04a2aeca-9c9f-4baf-b33b-b4f1b5ba4d4a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
  Preprocessing files to extract available chunks with dask\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m⠧\u001b[0m \u001b[31m Preprocessing files to extract available chunks with dask\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_preprocess(output_file=\"fileset\", \n", + " step_size=10000,\n", + " align_to_clusters=False,\n", + " dask_cluster=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1206bce-b726-43cc-b217-d74fd5516147", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 80db62e53..2dd7ca171 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -18,7 +18,7 @@ from .preprocess import preprocess -def print_dataset_query(query, dataset_list, selected, console): +def print_dataset_query(query, dataset_list, console, selected=[]): table = Table(title=f"Query: [bold red]{query}") table.add_column("Name", justify="left", style="cyan", no_wrap=True) table.add_column("Tag", style="magenta", no_wrap=True) @@ -208,19 +208,19 @@ def do_query(self, query=None): scope="cms", # TODO configure scope ) # Now let's print the results as a tree - print_dataset_query(query, outtree, self.selected_datasets, self.console) + print_dataset_query(query, outtree, self.console, self.selected_datasets) self.last_query = query self.last_query_list = outlist self.last_query_tree = outtree - print("Use the command [bold red]select (S)[/] to selected the datasets") + print("Use the command [bold red]select[/] to selected the datasets") def do_query_results(self): if self.last_query_list: print_dataset_query( self.last_query, self.last_query_tree, - self.selected_datasets, self.console, + self.selected_datasets, ) else: print("First [bold red]query (Q)[/] for a dataset") @@ -402,10 +402,11 @@ def do_replicas(self, mode=None, selection=None): T.add(f"[cyan]{f}") self.console.print(tree) - def do_allowlist_sites(self): - sites = Prompt.ask( - "[yellow]Restrict the available sites to (comma-separated list)" - ).split(",") + def do_allowlist_sites(self, sites=None): + if sites is None: + sites = Prompt.ask( + "[yellow]Restrict the available sites to (comma-separated list)" + ).split(",") if self.sites_allowlist is None: self.sites_allowlist = sites else: @@ -414,10 +415,11 @@ def do_allowlist_sites(self): for s in self.sites_allowlist: print(f"- {s}") - def do_blocklist_sites(self): - sites = Prompt.ask("[yellow]Exclude the sites (comma-separated list)").split( - "," - ) + def do_blocklist_sites(self, sites=None): + if sites is None: + sites = Prompt.ask( + "[yellow]Exclude the sites (comma-separated list)" + ).split(",") if self.sites_blocklist is None: self.sites_blocklist = sites else: @@ -426,8 +428,9 @@ def do_blocklist_sites(self): for s in self.sites_blocklist: print(f"- {s}") - def do_regex_sites(self): - regex = Prompt.ask("[yellow]Regex to restrict the available sites") + def do_regex_sites(self, regex=None): + if regex is None: + regex = Prompt.ask("[yellow]Regex to restrict the available sites") if 
len(regex): self.sites_regex = rf"{regex}" print(f"New sites regex: [cyan]{self.sites_regex}") @@ -513,10 +516,6 @@ def do_preprocess( align_to_clusters = Confirm.ask( "[yellow bold]Align to clusters", default=True ) - if dask_cluster is None: - dask_cluster = Prompt.ask("[yellow bold]Dask cluster url", default="None") - if dask_cluster == "None": - dask_cluster = None replicas = {} for fileset, files in self.replica_results.items(): @@ -541,6 +540,47 @@ def do_preprocess( with gzip.open(f"{output_file}_all.json.gz", "wt") as file: print(f"Saved all fileset chunks to {output_file}_all.json.gz") json.dump(out_updated, file, indent=2) + return out_updated + + def load_dataset_definition( + self, + dataset_definition, + query_results_strategy="all", + replicas_strategy="round-robin", + ): + for dataset_query, dataset_meta in dataset_definition.items(): + print(f"\nProcessing query: {dataset_query}") + # Adding queries + self.do_query(dataset_query) + # Now selecting the results depending on the interactive mode or not. + # Metadata are passed to the selection function to associated them with the selected dataset. + if query_results_strategy not in ["all", "manual"]: + print( + "Invalid query-results-strategy option: please choose between: manual|all" + ) + exit(1) + elif query_results_strategy == "manual": + self.do_select(selection=None, metadata=dataset_meta) + else: + self.do_select(selection="all", metadata=dataset_meta) + + # Now list all + self.do_list_selected() + + # selecting replicas + self.do_sites_filters(ask_clear=False) + print("Getting replicas") + if replicas_strategy == "manual": + self.do_replicas(mode=None, selection="all") + else: + if replicas_strategy not in ["round-robin", "choose"]: + print( + "Invalid replicas-strategy: please choose between manual|round-robin|choose" + ) + exit(1) + self.do_replicas(mode=replicas_strategy, selection="all") + # Now list all + self.do_list_selected() if __name__ == "__main__": @@ -622,49 +662,17 @@ def do_preprocess( cli.sites_regex = args.regex_sites if args.dataset_definition: - # Load the dataset definition if present: with open(args.dataset_definition) as file: - dataset_definition = json.load(file) - - for dataset_query, dataset_meta in dataset_definition.items(): - print(f"\nProcessing query: {dataset_query}") - # Adding queries - cli.do_query(dataset_query) - # Now selecting the results depending on the interactive mode or not. - # Metadata are passed to the selection function to associated them with the selected dataset. 
- if args.query_results_strategy not in ["all", "manual"]: - print( - "Invalid query-results-strategy option: please choose between: manual|all" - ) - exit(1) - elif args.query_results_strategy == "manual": - cli.do_select(selection=None, metadata=dataset_meta) - else: - cli.do_select(selection="all", metadata=dataset_meta) - - # Now list all - cli.do_list_selected() - - # selecting replicas - cli.do_sites_filters(ask_clear=False) - print("Getting replicas") - if args.replicas_strategy == "manual": - cli.do_replicas(mode=None, selection="all") - else: - if args.replicas_strategy not in ["round-robin", "choose"]: - print( - "Invalid replicas-strategy: please choose between manual|round-robin|choose" - ) - exit(1) - cli.do_replicas(mode=args.replicas_strategy, selection="all") - - # Now list all - cli.do_list_selected() - + dd = json.load(file) + cli.load_dataset_definition( + dd, + query_results_strategy=args.query_results_strategy, + replicas_strategy=args.replicas_strategy, + ) # Save if args.output: cli.do_save(filename=args.output) - if args.preprocess: + if preprocess: cli.do_preprocess( output_file=args.fileset_output, step_size=args.step_size, diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index a2cf11fe9..3940df2e1 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -271,11 +271,32 @@ def get_dataset_files_replicas( return outfiles, outsites, sites_counts -def query_dataset(query, client=None, tree=False, scope="cms"): +def query_dataset( + query: str, client=None, tree: bool = False, datatype="container", scope="cms" +): + """ + This function uses the rucio client to query for containers or datasets. + + Parameters + --------- + query: str = query to filter datasets / containers with the rucio list_dids functions + client: rucio client + tree: bool = if True return the results splitting the dataset name in parts parts + datatype: "container/dataset": rucio terminology. "Container"==CMS dataset. "Dataset" == CMS block. + scope: "cms". Rucio instance + + Returns + ------- + list of containers/datasets + + if tree==True, returns the list of dataset and also a dictionary decomposing the datasets + names in the 1st commond part and a list of available 2nd parts. + + """ client = client if client else get_rucio_client() out = list( client.list_dids( - scope=scope, filters={"name": query, "type": "container"}, long=False + scope=scope, filters={"name": query, "type": datatype}, long=False ) ) if tree: From 95b994941bf3504fb68783d1bd10f66280b5016d Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 11 Dec 2023 16:26:30 -0600 Subject: [PATCH 74/80] typo --- src/coffea/dataset_tools/rucio_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 3940df2e1..b626f518b 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -290,7 +290,7 @@ def query_dataset( list of containers/datasets if tree==True, returns the list of dataset and also a dictionary decomposing the datasets - names in the 1st commond part and a list of available 2nd parts. + names in the 1st command part and a list of available 2nd parts. 
""" client = client if client else get_rucio_client() From a166a813989040efd54dbbdeeed9d4ca612ecf7e Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 23:56:35 +0100 Subject: [PATCH 75/80] more docs in the notebook --- binder/dataset_discovery.ipynb | 1211 +++++++++++++++------ src/coffea/dataset_tools/dataset_query.py | 11 +- 2 files changed, 899 insertions(+), 323 deletions(-) diff --git a/binder/dataset_discovery.ipynb b/binder/dataset_discovery.ipynb index 9c29063fe..2b19cf1c0 100644 --- a/binder/dataset_discovery.ipynb +++ b/binder/dataset_discovery.ipynb @@ -5,7 +5,14 @@ "id": "c5754206-f41b-4e08-bc4d-496df85e8194", "metadata": {}, "source": [ - "# Dataset discovery tools" + "# Dataset discovery tools\n", + "\n", + "This notebook shows some features to make the dataset discovery for CMS analysis easier. \n", + "The rucio sytem is queried to look for dataset and access to the list of all available file replicas.\n", + "\n", + "Users can exploit these tools at 2 different levels:\n", + "- low level: use the `rucio_utils` module directly to just query rucio\n", + "- high level: use the `DataDiscoveryCLI` class to simplify dataset query, replicas filters and uproot preprocessing with dask" ] }, { @@ -753,50 +760,27 @@ ] }, { - "cell_type": "code", - "execution_count": 3, - "id": "716a6c0c-ea07-498a-a010-f9e7f87ba3a3", - "metadata": {}, - "outputs": [], - "source": [ - "ddc = DataDiscoveryCLI()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "51a8ef69-5d4e-4089-b3a8-d0290cc973c1", + "cell_type": "markdown", + "id": "ecb84b02-b85f-4037-a08d-cce001bc35c7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on method load_dataset_definition in module coffea.dataset_tools.dataset_query:\n", - "\n", - "load_dataset_definition(dataset_definition, query_results_strategy='all', replicas_strategy='round-robin') method of coffea.dataset_tools.dataset_query.DataDiscoveryCLI instance\n", - "\n" - ] - } - ], "source": [ - "help(ddc.load_dataset_definition)" + "The dataset definition is passed to a `DataDiscoveryCLI` to automatically query rucio and get replicas" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "db3ab214-93d3-49b1-b6e1-9374b9fcc1f0", + "execution_count": 11, + "id": "716a6c0c-ea07-498a-a010-f9e7f87ba3a3", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
        "
\n" ], "text/plain": [ - "\u001b[32m⠧\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" + "\u001b[32m⠇\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" ] }, "metadata": {}, @@ -898,316 +882,312 @@ "data": { "text/html": [ "
Replicas for /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
-       "├── T2_US_Wisconsin\n",
-       "│   ├── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│   │   -v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
-       "│   └── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│       -v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
        "├── T2_DE_DESY\n",
        "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│   │   36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
+       "│   │   36-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
        "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│   │   36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   │   36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
        "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│   │   36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
+       "│   │   36-v1/2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
        "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│   │   36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
+       "│   │   36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
        "│   └── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│       36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
-       "├── T3_FR_IPNL\n",
-       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
-       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
-       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
-       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
-       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│       Dv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
-       "├── T1_US_FNAL_Disk\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
-       "│   └── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│       Dv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
-       "├── T2_US_Caltech\n",
-       "│   ├── root://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\n",
-       "│   │   _GT36-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
-       "│   └── root://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\n",
-       "│       _GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
-       "├── T2_TR_METU\n",
-       "│   ├── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
-       "│   │   _MiniAODv2_NanoAODv9_GT36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
-       "│   ├── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
-       "│   │   _MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
-       "│   └── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
-       "│       _MiniAODv2_NanoAODv9_GT36-v1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
-       "├── T2_UK_London_IC\n",
-       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
-       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
-       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
-       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
-       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
-       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
-       "├── T1_IT_CNAF_Disk\n",
-       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
-       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│       2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "│       36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
        "├── T3_KR_KISTI\n",
        "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│   │   -v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   │   -v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
        "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│   │   -v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
+       "│   │   -v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
        "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│   │   -v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "│   │   -v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
        "│   └── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│       -v1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
-       "├── T2_DE_RWTH\n",
-       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
-       "│   │   _NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
-       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
-       "│   │   _NanoAODv9_GT36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
-       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
-       "│       _NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "│       -v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
+       "├── T2_ES_CIEMAT\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
+       "│   └── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│       /2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
+       "├── T1_FR_CCIN2P3_Disk\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
+       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "├── T2_BE_IIHE\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
+       "│   └── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│       0000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
        "├── T2_US_Purdue\n",
        "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
-       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
-       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
+       "│   │   2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
        "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
+       "│   │   2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
        "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
        "│   │   2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\n",
        "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
        "│   │   2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\n",
        "│   └── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
        "│       2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\n",
-       "├── T2_US_Florida\n",
-       "│   ├── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
-       "│   │   520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
-       "│   └── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
-       "│       520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "├── T2_US_Wisconsin\n",
+       "│   ├── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
+       "│   └── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│       -v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
        "├── T2_TW_NCHC\n",
        "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│   │   1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   │   1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
        "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│   │   1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
+       "│   │   1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
        "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
        "│   │   1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\n",
        "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│   │   1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
+       "│   │   1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
        "│   └── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│       1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
-       "├── T2_BE_IIHE\n",
-       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
-       "│   │   0000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
-       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
-       "│   │   0000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
-       "│   └── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
-       "│       0000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "│       1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
+       "├── T2_UK_London_IC\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
+       "├── T1_US_FNAL_Disk\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "│   └── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│       Dv9_GT36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
+       "├── T1_IT_CNAF_Disk\n",
+       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
        "├── T2_US_Nebraska\n",
+       "│   ├── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
+       "│   ├── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
        "│   └── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│       1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
-       "├── T2_ES_CIEMAT\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
-       "│   └── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│       /2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
-       "├── T1_FR_CCIN2P3_Disk\n",
-       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
-       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "│       1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
        "├── T2_IT_Legnaro\n",
        "│   └── root://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\n",
-       "│       v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
-       "└── T2_CH_CERN\n",
-       "    └── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
-       "        520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
+       "│       v1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
+       "├── T3_FR_IPNL\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
+       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│       Dv2_NanoAODv9_GT36-v1/2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "├── T2_DE_RWTH\n",
+       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│       _NanoAODv9_GT36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
+       "├── T2_TR_METU\n",
+       "│   └── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
+       "│       _MiniAODv2_NanoAODv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
+       "└── T2_US_Florida\n",
+       "    └── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "        520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
        "
\n" ], "text/plain": [ "Replicas for \u001b[32m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n", - "├── \u001b[32mT2_US_Wisconsin\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ │ \u001b[36m-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", - "│ └── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ \u001b[36m-v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", "├── \u001b[32mT2_DE_DESY\u001b[0m\n", "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ │ \u001b[36m36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ │ \u001b[36m36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ │ \u001b[36m36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ │ \u001b[36m36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", "│ └── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ \u001b[36m36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", - "├── \u001b[32mT3_FR_IPNL\u001b[0m\n", - "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", - "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", - "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", - "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", - "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", - "├── \u001b[32mT1_US_FNAL_Disk\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - 
"│ │ \u001b[36mDv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", - "│ └── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ \u001b[36mDv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", - "├── \u001b[32mT2_US_Caltech\u001b[0m\n", - "│ ├── \u001b[36mroot://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\u001b[0m\n", - "│ │ \u001b[36m_GT36-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", - "│ └── \u001b[36mroot://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\u001b[0m\n", - "│ \u001b[36m_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", - "├── \u001b[32mT2_TR_METU\u001b[0m\n", - "│ ├── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", - "│ │ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", - "│ ├── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", - "│ │ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", - "│ └── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", - "│ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", - "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", - "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", - "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", - "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", - "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", - "│ 
\u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", - "├── \u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", - "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ \u001b[36m2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", - "│ └── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ \u001b[36m2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "│ \u001b[36m36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", "├── \u001b[32mT3_KR_KISTI\u001b[0m\n", "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ │ \u001b[36m-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ │ \u001b[36m-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ │ \u001b[36m-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", "│ └── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ \u001b[36m-v1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", - "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", - "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", - "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", - "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", - "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", - "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", - "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "│ \u001b[36m-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", + "├── \u001b[32mT2_ES_CIEMAT\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ 
│ \u001b[36m/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", + "│ └── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ \u001b[36m/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", + "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", + "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "├── \u001b[32mT2_BE_IIHE\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", + "│ └── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ \u001b[36m0000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", "├── \u001b[32mT2_US_Purdue\u001b[0m\n", "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ 
\u001b[36m2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", - "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ \u001b[36m2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ │ \u001b[36m2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ \u001b[36m2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", - "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ \u001b[36m2520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", + "│ │ \u001b[36m2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", "│ │ \u001b[36m2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\u001b[0m\n", "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", "│ │ \u001b[36m2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\u001b[0m\n", "│ └── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", "│ \u001b[36m2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\u001b[0m\n", - "├── \u001b[32mT2_US_Florida\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", - "│ │ \u001b[36m520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", - "│ └── \u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", - "│ \u001b[36m520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "├── \u001b[32mT2_US_Wisconsin\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ \u001b[36m-v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n", "├── \u001b[32mT2_TW_NCHC\u001b[0m\n", "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ │ \u001b[36m1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ │ \u001b[36m1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ │ \u001b[36m1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", + "│ │ \u001b[36m1/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", "│ │ \u001b[36m1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\u001b[0m\n", "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ │ \u001b[36m1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", + "│ │ \u001b[36m1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", "│ └── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ \u001b[36m1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", - "├── \u001b[32mT2_BE_IIHE\u001b[0m\n", - "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", - "│ │ \u001b[36m0000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", - "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", - "│ │ \u001b[36m0000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", - "│ └── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", - "│ \u001b[36m0000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "│ \u001b[36m1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", + "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", + "├── \u001b[32mT1_US_FNAL_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ 
\u001b[36mDv9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ \u001b[36mDv9_GT36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", + "├── \u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", "├── \u001b[32mT2_US_Nebraska\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", "│ └── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ \u001b[36m1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", - "├── \u001b[32mT2_ES_CIEMAT\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ 
\u001b[36m/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", - "│ └── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ \u001b[36m/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", - "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", - "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", - "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "│ \u001b[36m1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", "├── \u001b[32mT2_IT_Legnaro\u001b[0m\n", "│ └── \u001b[36mroot://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\u001b[0m\n", - "│ \u001b[36mv1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n", - "└── \u001b[32mT2_CH_CERN\u001b[0m\n", - " └── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", - " \u001b[36m520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n" + "│ \u001b[36mv1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", + "├── \u001b[32mT3_FR_IPNL\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", + "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", + "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", + "├── \u001b[32mT2_TR_METU\u001b[0m\n", + "│ └── 
\u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", + "│ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", + "└── \u001b[32mT2_US_Florida\u001b[0m\n", + " └── \u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + " \u001b[36m520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n" ] }, "metadata": {}, @@ -1253,23 +1233,42 @@ } ], "source": [ - "ddc.load_dataset_definition(dataset_definition)" + "ddc = DataDiscoveryCLI()\n", + "ddc.load_dataset_definition(dataset_definition, \n", + " query_results_strategy=\"all\",\n", + " replicas_strategy=\"round-robin\")" + ] + }, + { + "cell_type": "markdown", + "id": "db7798eb-eb9f-47e5-9239-92cdea20600f", + "metadata": {}, + "source": [ + "### Filtering sites" + ] + }, + { + "cell_type": "markdown", + "id": "bd57fe7b-0642-48b8-9f9f-cd209e50d867", + "metadata": {}, + "source": [ + "Sites filtering works in a very similar way for `DataDiscoveryCLI`" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "dd9ca4ea-039d-4ebb-bbf2-79092ba6e7d0", + "execution_count": 17, + "id": "d85ca119-0a56-4c67-bb21-ebbca8164728", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Selected datasets:\n",
+       "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
        "
\n" ], "text/plain": [ - "\u001b[36mSelected datasets:\u001b[0m\n" + "\u001b[32m⠇\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" ] }, "metadata": {}, @@ -1278,20 +1277,455 @@ { "data": { "text/html": [ - "
                                                 Selected datasets                                                 \n",
-       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
-       "┃ Dataset                                                                                                   ┃\n",
-       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
-       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
-       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
-       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
-       "
\n" + "
\n"
       ],
-      "text/plain": [
-       "\u001b[3m                                                 Selected datasets                                                 \u001b[0m\n",
-       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
-       "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset                                                                                                  \u001b[0m\u001b[1m \u001b[0m┃┃┃\n",
-       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Sites availability for dataset: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\u001b[31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\u001b[31mNANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                   Available replicas                   \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files    Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           67 / 67     100.0%    │\n",
+       "│   1   │ T3_FR_IPNL           67 / 67 │    100.0%    │\n",
+       "│   2    T2_UK_London_IC      39 / 67     58.2%     │\n",
+       "│   3   │ T1_FR_CCIN2P3_Disk   38 / 67 │    56.7%     │\n",
+       "│   4    T2_CH_CERN           25 / 67     37.3%     │\n",
+       "│   5   │ T2_DE_RWTH           22 / 67 │    32.8%     │\n",
+       "│   6    T1_IT_CNAF_Disk      20 / 67     29.9%     │\n",
+       "│   7   │ T1_DE_KIT_Disk       11 / 67 │    16.4%     │\n",
+       "│   8    T2_UK_SGrid_RALPP    6 / 67       9.0%     │\n",
+       "│   9   │ T2_IT_Legnaro        6 / 67  │     9.0%     │\n",
+       "│  10    T2_FR_IPHC           2 / 67       3.0%     │\n",
+       "│  11   │ T2_UK_London_Brunel  1 / 67  │     1.5%     │\n",
+       "└───────┴─────────────────────┴─────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT3_FR_IPNL \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_London_IC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 58.2% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT1_FR_CCIN2P3_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m38 / 67\u001b[0m\u001b[35m \u001b[0m│ 56.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_CH_CERN \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m25 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 37.3% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m22 / 67\u001b[0m\u001b[35m \u001b[0m│ 32.8% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m20 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 29.9% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT1_DE_KIT_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 67\u001b[0m\u001b[35m \u001b[0m│ 16.4% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_SGrid_RALPP \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m6 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_IT_Legnaro \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m6 / 67 \u001b[0m\u001b[35m \u001b[0m│ 9.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_FR_IPHC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 3.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_UK_London_Brunel\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1 / 67 \u001b[0m\u001b[35m \u001b[0m│ 1.5% │\n", + "└───────┴─────────────────────┴─────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Replicas for /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "├── T2_CH_CERN\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
+       "│   └── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│       520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
+       "├── T3_FR_IPNL\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
+       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│       Dv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
+       "├── T2_UK_London_IC\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\n",
+       "├── T1_FR_CCIN2P3_Disk\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
+       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\n",
+       "├── T2_FR_IPHC\n",
+       "│   └── root://sbgdcache.in2p3.fr///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/25200\n",
+       "│       00/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
+       "├── T2_DE_DESY\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
+       "│   └── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│       36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
+       "├── T1_DE_KIT_Disk\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "│   └── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│       9_GT36-v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
+       "├── T2_DE_RWTH\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
+       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│       _NanoAODv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
+       "├── T1_IT_CNAF_Disk\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\n",
+       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "└── T2_UK_SGrid_RALPP\n",
+       "    ├── root://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\n",
+       "    │   AODv2_NanoAODv9_GT36-v1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
+       "    ├── root://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\n",
+       "    │   AODv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
+       "    └── root://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\n",
+       "        AODv2_NanoAODv9_GT36-v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
+       "
\n" + ], + "text/plain": [ + "Replicas for \u001b[32m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n", + "├── \u001b[32mT2_CH_CERN\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", + "│ └── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ \u001b[36m520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", + "├── \u001b[32mT3_FR_IPNL\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", 
+ "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n", + "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", + "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\u001b[0m\n", + "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", + "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\u001b[0m\n", + "├── \u001b[32mT2_FR_IPHC\u001b[0m\n", + "│ └── \u001b[36mroot://sbgdcache.in2p3.fr///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/25200\u001b[0m\n", + "│ \u001b[36m00/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", + "├── \u001b[32mT2_DE_DESY\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", + "│ └── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ \u001b[36m36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", + "├── \u001b[32mT1_DE_KIT_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ \u001b[36m9_GT36-v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", + "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", + "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", + "├── 
\u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "└── \u001b[32mT2_UK_SGrid_RALPP\u001b[0m\n", + " ├── \u001b[36mroot://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\u001b[0m\n", + " │ \u001b[36mAODv2_NanoAODv9_GT36-v1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", + " ├── \u001b[36mroot://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\u001b[0m\n", + " │ \u001b[36mAODv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", + " └── \u001b[36mroot://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\u001b[0m\n", + " \u001b[36mAODv2_NanoAODv9_GT36-v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", + "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", + "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", + "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc = DataDiscoveryCLI()\n", + "ddc.do_regex_sites(r\"T[123]_(CH|IT|UK|FR|DE)_\\w+\")\n", + "ddc.load_dataset_definition(dataset_definition, \n", + " query_results_strategy=\"all\",\n", + " replicas_strategy=\"round-robin\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dd9ca4ea-039d-4ebb-bbf2-79092ba6e7d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" @@ -1315,7 +1749,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 20, "id": "b0e3e4b8-34d4-4558-988a-edacd1df9b37", "metadata": {}, "outputs": [ @@ -1339,57 +1773,152 @@ }, { "cell_type": "markdown", - "id": "f7d52663-c5e3-4abe-9c2f-4bf8f08d8919", + "id": "f9f6a70b-0194-4b00-ab79-4fdb0b4fa0cf", "metadata": {}, "source": [ - "## Preprocess the fileset with dask" + "## DataDiscoveryCLI from shell" + ] + }, + { + "cell_type": "markdown", + "id": "7237fc9e-50b8-4cc4-9c51-9674fbf4358a", + "metadata": {}, + "source": [ + "The DataDiscoveryCLI can be used directly from CLI" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "125cd0ea-ff05-414a-9177-2be98eb88362", + "execution_count": 35, + "id": "2c075f2e-a06e-4c97-b5b6-6a6806571a9a", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m\n", - "\u001b[0mddc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_preprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0moutput_file\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mstep_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0malign_to_clusters\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdask_cluster\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m\n", - "Perform preprocessing for concrete fileset extraction.\n", - "Args: output_file [step_size] [align to file cluster boundaries] [dask cluster url]\n", - "\u001b[0;31mFile:\u001b[0m /work/dvalsecc/coffea/src/coffea/dataset_tools/dataset_query.py\n", - "\u001b[0;31mType:\u001b[0m method" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: dataset_query.py [-h] [--cli] [-d DATASET_DEFINITION] [-o OUTPUT]\n", + " [-fo FILESET_OUTPUT] [-p] [--step-size STEP_SIZE]\n", + " [--dask-cluster DASK_CLUSTER]\n", + " [-as ALLOW_SITES [ALLOW_SITES ...]]\n", + " [-bs BLOCK_SITES [BLOCK_SITES ...]] [-rs REGEX_SITES]\n", + " [--query-results-strategy QUERY_RESULTS_STRATEGY]\n", + " [--replicas-strategy REPLICAS_STRATEGY]\n", + "\n", + 
"options:\n", + " -h, --help show this help message and exit\n", + " --cli Start the dataset discovery CLI\n", + " -d DATASET_DEFINITION, --dataset-definition DATASET_DEFINITION\n", + " Dataset definition file\n", + " -o OUTPUT, --output OUTPUT\n", + " Output name for dataset discovery output (no fileset\n", + " preprocessing)\n", + " -fo FILESET_OUTPUT, --fileset-output FILESET_OUTPUT\n", + " Output name for fileset\n", + " -p, --preprocess Preprocess with dask\n", + " --step-size STEP_SIZE\n", + " Step size for preprocessing\n", + " --dask-cluster DASK_CLUSTER\n", + " Dask cluster url\n", + " -as ALLOW_SITES [ALLOW_SITES ...], --allow-sites ALLOW_SITES [ALLOW_SITES ...]\n", + " List of sites to be allowlisted\n", + " -bs BLOCK_SITES [BLOCK_SITES ...], --block-sites BLOCK_SITES [BLOCK_SITES ...]\n", + " List of sites to be blocklisted\n", + " -rs REGEX_SITES, --regex-sites REGEX_SITES\n", + " Regex string to be used to filter the sites\n", + " --query-results-strategy QUERY_RESULTS_STRATEGY\n", + " Mode for query results selection: [all|manual]\n", + " --replicas-strategy REPLICAS_STRATEGY\n", + " Mode for selecting replicas for datasets:\n", + " [manual|round-robin|choose]\n" + ] } ], "source": [ - "ddc.do_preprocess?" + "!python -m coffea.dataset_tools.dataset_query --help" ] }, { "cell_type": "code", "execution_count": null, + "id": "e93cb24c-44ed-43f1-8aae-0f6b03c88de0", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m coffea.dataset_tools.dataset_query --cli -d dataset_definition.json" + ] + }, + { + "cell_type": "markdown", + "id": "f7d52663-c5e3-4abe-9c2f-4bf8f08d8919", + "metadata": {}, + "source": [ + "## Preprocess the fileset with dask" + ] + }, + { + "cell_type": "markdown", + "id": "046a0c99-6500-41b5-9954-fa7b78061800", + "metadata": {}, + "source": [ + "The replicas metadata contain the file location in the CMS grid. \n", + "This info can be **preprocessed** with uproot and dask-awkward to extract the **fileset**. Practically a fileset is a collection of metadata about the file location, file name, chunks splitting, that can be used directly to configure the uproot reading. \n", + "\n", + "This step replaces the preprocessing step in coffea 0.7.x. The output of the preprocessing can be used directly to start an analysis with dask-awkward.\n", + "\n", + "The preprocessing is performed locally with multiple processes if `dask_cluster==None`, but a pre-existing dask cluster url can be passed." + ] + }, + { + "cell_type": "code", + "execution_count": 22, "id": "04a2aeca-9c9f-4baf-b33b-b4f1b5ba4d4a", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
  Preprocessing files to extract available chunks with dask\n",
+       "
  Preprocessing files to extract available chunks with dask\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m⠙\u001b[0m \u001b[31m Preprocessing files to extract available chunks with dask\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Saved available fileset chunks to fileset_available.json.gz\n",
        "
\n" ], "text/plain": [ - "\u001b[32m⠧\u001b[0m \u001b[31m Preprocessing files to extract available chunks with dask\u001b[0m\n" + "Saved available fileset chunks to fileset_available.json.gz\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Saved all fileset chunks to fileset_all.json.gz\n",
+       "
\n" + ], + "text/plain": [ + "Saved all fileset chunks to fileset_all.json.gz\n" ] }, "metadata": {}, @@ -1397,18 +1926,56 @@ } ], "source": [ - "ddc.do_preprocess(output_file=\"fileset\", \n", - " step_size=10000,\n", + "fileset_total = ddc.do_preprocess(output_file=\"fileset\", \n", + " step_size=10000, #chunk size for files splitting\n", " align_to_clusters=False,\n", " dask_cluster=None)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "d1206bce-b726-43cc-b217-d74fd5516147", "metadata": {}, "outputs": [], + "source": [ + "import gzip\n", + "import json\n", + "with gzip.open(\"fileset_available.json.gz\", \"rt\") as file:\n", + " fileset_available = json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "957ea9c6-783a-4932-960f-cbec5f2f0656", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root://cmsxrd.ts.infn.it:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/100000/13D0AD97-6B32-CB4C-BA87-5E37BA4CF20E.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 59081]], 'uuid': 'fbe50b00-1f7e-11ec-97b8-2bbee183beef'}\n", + "root://cmsxrd.ts.infn.it:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/00C9792D-ACD2-2547-BB04-097F0C4E47E3.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 138192]], 'uuid': '938a4fe2-1d77-11ec-bddf-59319e86beef'}\n", + "root://dcache-cms-xrootd.desy.de:1094//store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/00EA9563-5449-D24E-9566-98AE8E2A61AE.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 140000], [140000, 150000], [150000, 160000], [160000, 170000], [170000, 180000], [180000, 190000], [190000, 200000], [200000, 210000], [210000, 220000], [220000, 230000], [230000, 240000], [240000, 250000], [250000, 260000], [260000, 270000], [270000, 280000], [280000, 290000], [290000, 300000], [300000, 310000], [310000, 320000], [320000, 330000], [330000, 340000], [340000, 350000], [350000, 360000], [360000, 370000], [370000, 380000], [380000, 390000], [390000, 400000], [400000, 410000], [410000, 420000], [420000, 430000], [430000, 440000], [440000, 450000], [450000, 460000], [460000, 470000], [470000, 480000], [480000, 490000], [490000, 500000], [500000, 510000], [510000, 520000], [520000, 530000], [530000, 540000], [540000, 550000], [550000, 560000], [560000, 570000], [570000, 580000], [580000, 590000], [590000, 600000], [600000, 610000], [610000, 620000], [620000, 630000], [630000, 640000], [640000, 650000], [650000, 660000], [660000, 670000], [670000, 680000], [680000, 690000], [690000, 700000], [700000, 710000], [710000, 720000], [720000, 730000], [730000, 740000], [740000, 750000], [750000, 760000], [760000, 770000], [770000, 780000], [780000, 790000], [790000, 800000], [800000, 
810000], [810000, 820000], [820000, 830000], [830000, 840000], [840000, 850000], [850000, 860000], [860000, 870000], [870000, 880000], [880000, 890000], [890000, 900000], [900000, 910000], [910000, 920000], [920000, 930000], [930000, 940000], [940000, 950000], [950000, 960000], [960000, 970000], [970000, 980000], [980000, 990000], [990000, 1000000], [1000000, 1010000], [1010000, 1020000], [1020000, 1030000], [1030000, 1040000], [1040000, 1050000], [1050000, 1060000], [1060000, 1070000], [1070000, 1080000], [1080000, 1090000], [1090000, 1100000], [1100000, 1110000], [1110000, 1120000], [1120000, 1130000], [1130000, 1140000], [1140000, 1150000], [1150000, 1160000], [1160000, 1170000], [1170000, 1180000], [1180000, 1190000], [1190000, 1200000], [1200000, 1210000], [1210000, 1220000], [1220000, 1230000], [1230000, 1240000], [1240000, 1250000], [1250000, 1260000], [1260000, 1270000], [1270000, 1280000], [1280000, 1290000], [1290000, 1300000], [1300000, 1310000], [1310000, 1320000], [1320000, 1330000], [1330000, 1340000], [1340000, 1350000], [1350000, 1360000], [1360000, 1370000], [1370000, 1380000], [1380000, 1390000], [1390000, 1400000], [1400000, 1410000], [1410000, 1420000], [1420000, 1430000], [1430000, 1440000], [1440000, 1450000], [1450000, 1460000], [1460000, 1470000], [1470000, 1480000], [1480000, 1490000], [1490000, 1500000], [1500000, 1510000], [1510000, 1520000], [1520000, 1530000], [1530000, 1540000], [1540000, 1550000], [1550000, 1551326]], 'uuid': 'ced110a0-1b0f-11ec-b2e9-09c08e80beef'}\n", + "root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/068B0797-DEF5-9341-BBBE-EDBE50EBC6A1.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 140000], [140000, 150000], [150000, 160000], [160000, 170000], [170000, 180000], [180000, 190000], [190000, 200000], [200000, 210000], [210000, 220000], [220000, 230000], [230000, 240000], [240000, 250000], [250000, 260000], [260000, 270000], [270000, 280000], [280000, 290000], [290000, 300000], [300000, 310000], [310000, 320000], [320000, 330000], [330000, 340000], [340000, 350000], [350000, 360000], [360000, 370000], [370000, 380000], [380000, 390000], [390000, 400000], [400000, 410000], [410000, 420000], [420000, 430000], [430000, 440000], [440000, 450000], [450000, 460000], [460000, 470000], [470000, 480000], [480000, 490000], [490000, 500000], [500000, 510000], [510000, 520000], [520000, 530000], [530000, 540000], [540000, 550000], [550000, 560000], [560000, 570000], [570000, 580000], [580000, 590000], [590000, 600000], [600000, 610000], [610000, 620000], [620000, 630000], [630000, 640000], [640000, 650000], [650000, 660000], [660000, 670000], [670000, 680000], [680000, 690000], [690000, 700000], [700000, 710000], [710000, 720000], [720000, 730000], [730000, 740000], [740000, 750000], [750000, 760000], [760000, 770000], [770000, 780000], [780000, 790000], [790000, 800000], [800000, 810000], [810000, 820000], [820000, 830000], [830000, 840000], [840000, 850000], [850000, 860000], [860000, 870000], [870000, 880000], [880000, 890000], [890000, 900000], [900000, 910000], [910000, 920000], [920000, 930000], [930000, 940000], [940000, 950000], [950000, 960000], [960000, 970000], [970000, 980000], [980000, 
990000], [990000, 1000000], [1000000, 1010000], [1010000, 1020000], [1020000, 1030000], [1030000, 1040000], [1040000, 1050000], [1050000, 1060000], [1060000, 1070000], [1070000, 1080000], [1080000, 1090000], [1090000, 1100000], [1100000, 1110000], [1110000, 1120000], [1120000, 1130000], [1130000, 1138724]], 'uuid': 'd86ab2e2-1b28-11ec-8504-738a8e80beef'}\n", + "root://cmsxrd.ts.infn.it:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/0CFD79EF-41AB-4B4A-8F62-06393273EEDE.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 140000], [140000, 150000], [150000, 160000], [160000, 170000], [170000, 180000], [180000, 190000], [190000, 200000], [200000, 210000], [210000, 220000], [220000, 230000], [230000, 240000], [240000, 250000], [250000, 260000], [260000, 270000], [270000, 280000], [280000, 290000], [290000, 300000], [300000, 310000], [310000, 320000], [320000, 330000], [330000, 340000], [340000, 350000], [350000, 360000], [360000, 370000], [370000, 380000], [380000, 390000], [390000, 400000], [400000, 410000], [410000, 420000], [420000, 430000], [430000, 440000], [440000, 450000], [450000, 460000], [460000, 470000], [470000, 480000], [480000, 490000], [490000, 500000], [500000, 510000], [510000, 520000], [520000, 530000], [530000, 540000], [540000, 550000], [550000, 560000], [560000, 570000], [570000, 580000], [580000, 590000], [590000, 600000], [600000, 610000], [610000, 620000], [620000, 630000], [630000, 640000], [640000, 650000], [650000, 660000], [660000, 670000], [670000, 680000], [680000, 690000], [690000, 700000], [700000, 710000], [710000, 720000], [720000, 730000], [730000, 740000], [740000, 750000], [750000, 760000], [760000, 770000], [770000, 780000], [780000, 790000], [790000, 800000], [800000, 810000], [810000, 820000], [820000, 830000], [830000, 840000], [840000, 850000], [850000, 860000], [860000, 870000], [870000, 880000], [880000, 890000], [890000, 900000], [900000, 910000], [910000, 911868]], 'uuid': '9d799986-1ad9-11ec-9257-fc1b1e0abeef'}\n" + ] + } + ], + "source": [ + "dataset = '/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM'\n", + "for i, (file, meta) in enumerate(fileset_available[dataset][\"files\"].items()):\n", + " print(file, meta) \n", + " if i>3: break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f23bae95-8a2e-46a9-a884-714474a8ff12", + "metadata": {}, + "outputs": [], "source": [] } ], diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 2dd7ca171..cf4f328f9 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -548,6 +548,13 @@ def load_dataset_definition( query_results_strategy="all", replicas_strategy="round-robin", ): + """ + Initialize the DataDiscoverCLI by querying a set of datasets defined in `dataset_definitions` + and selected results and replicas following the options. 
+ + - query_results_strategy: "all" or "manual" to be prompted for selection + - replicas_strategy: "round-robin", "choose" (to manually choose the sites), "manual": to be prompted for a manual decision case by case + """ for dataset_query, dataset_meta in dataset_definition.items(): print(f"\nProcessing query: {dataset_query}") # Adding queries @@ -617,7 +624,9 @@ def load_dataset_definition( parser.add_argument( "--step-size", help="Step size for preprocessing", type=int, default=500000 ) - parser.add_argument("--dask-cluster", help="Dask cluster url", type=str, default="") + parser.add_argument( + "--dask-cluster", help="Dask cluster url", type=str, default=None + ) parser.add_argument( "-as", "--allow-sites", From 02001b3b8641fc88cccc6077aad1018eb9f3499d Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 05:48:22 -0600 Subject: [PATCH 76/80] actually pass uproot_options down --- src/coffea/dataset_tools/apply_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 69994b127..79386852e 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -73,7 +73,7 @@ def apply_to_fileset( metadata = copy.deepcopy(dataset.get("metadata", {})) metadata.setdefault("dataset", name) dataset_out = apply_to_dataset( - data_manipulation, dataset, schemaclass, metadata + data_manipulation, dataset, schemaclass, metadata, uproot_options ) if isinstance(out, tuple): out[name], report[name] = dataset_out From ba88fe4074f974de9c2c024d36d2539912c713e8 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 08:27:57 -0400 Subject: [PATCH 77/80] fix get_failed_steps_for_fileset/dataset --- src/coffea/dataset_tools/__init__.py | 7 ++++++- src/coffea/dataset_tools/apply_processor.py | 2 +- src/coffea/dataset_tools/manipulations.py | 22 ++++++++++++--------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index 1895bf722..8dd444189 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,5 +1,10 @@ from coffea.dataset_tools.apply_processor import apply_to_dataset, apply_to_fileset -from coffea.dataset_tools.manipulations import max_chunks, slice_chunks +from coffea.dataset_tools.manipulations import ( + get_failed_steps_for_dataset, + get_failed_steps_for_fileset, + max_chunks, + slice_chunks, +) from coffea.dataset_tools.preprocess import preprocess __all__ = [ diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 79386852e..698cd916a 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -75,7 +75,7 @@ def apply_to_fileset( dataset_out = apply_to_dataset( data_manipulation, dataset, schemaclass, metadata, uproot_options ) - if isinstance(out, tuple): + if isinstance(dataset_out, tuple): out[name], report[name] = dataset_out else: out[name] = dataset_out diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index a749561e0..3386adbe5 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -21,7 +21,8 @@ def slice_chunks(fileset, theslice=slice(None)): def get_failed_steps_for_dataset(dataset, report): - failed_dataset = {} + failed_dataset = copy.deepcopy(dataset) + failed_dataset["files"] =
{} failures = report[~awkward.is_none(report.exception)] if not awkward.all(report.args[:, 4] == "True"): @@ -30,13 +31,16 @@ def get_failed_steps_for_dataset(dataset, report): ) for fdesc in dataset.values(): - if "steps" not in fdesc: + config = list(fdesc.values())[0] + if "steps" not in config: raise RuntimeError( "steps specification not found in dataset, please specify steps in input dataset." ) - fnames = set(dataset.keys()) - rnames = set(numpy.unique(report.args[:, 0][:, 1:-1:])) + fnames = set(dataset["files"].keys()) + rnames = ( + set(numpy.unique(failures.args[:, 0][:, 1:-1:])) if len(failures) > 0 else set() + ) if not rnames.issubset(fnames): raise RuntimeError( f"Files: {rnames - fnames} are not in input dataset, please ensure report corresponds to input dataset!" @@ -47,11 +51,11 @@ def get_failed_steps_for_dataset(dataset, report): fname, object_path, start, stop, is_step = args_as_types - if fname in failed_dataset: - failed_dataset[fname]["steps"].append([start, stop]) + if fname in failed_dataset["files"]: + failed_dataset["files"][fname]["steps"].append([start, stop]) else: - failed_dataset[fname] = copy.deepcopy(dataset[fname]) - failed_dataset[fname]["steps"] = [[start, stop]] + failed_dataset["files"][fname] = copy.deepcopy(dataset["files"][fname]) + failed_dataset["files"][fname]["steps"] = [[start, stop]] return failed_dataset @@ -60,6 +64,6 @@ def get_failed_steps_for_fileset(fileset, report_dict): failed_fileset = {} for name, dataset in fileset.items(): failed_dataset = get_failed_steps_for_dataset(dataset, report_dict[name]) - if len(failed_dataset) > 0: + if len(failed_dataset["files"]) > 0: failed_fileset[name] = failed_dataset return failed_fileset From 6e22866db03753faa98e7dabc23ed71d6f7cef61 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 08:40:10 -0400 Subject: [PATCH 78/80] properly check each input file for steps --- src/coffea/dataset_tools/manipulations.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index 3386adbe5..e18e4ec4a 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -30,11 +30,11 @@ def get_failed_steps_for_dataset(dataset, report): "step specification is not completely in starts/stops form, failed-step extraction is not available for steps_per_file." ) - for fdesc in dataset.values(): - config = list(fdesc.values())[0] - if "steps" not in config: + for fname, fdesc in dataset["files"].items(): + if "steps" not in fdesc: raise RuntimeError( - "steps specification not found in dataset, please specify steps in input dataset." + f"steps specification not found in file description for {fname}, " + "please specify steps consistently in input dataset." 
) fnames = set(dataset["files"].keys()) From 6d0722cbdb7e0ccea3ac8a4f28fb83df2a237996 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 11:37:22 -0400 Subject: [PATCH 79/80] adjust uproot pins, add test --- pyproject.toml | 2 +- tests/test_dataset_tools.py | 90 ++++++++++++++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d3eaa6d95..f35d69e26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ classifiers = [ ] dependencies = [ "awkward>=2.5.1rc1", - "uproot>=5.2.0rc3", + "uproot>=5.2.0rc4", "dask[array]>=2023.4.0", "dask-awkward>=2023.12.1", "dask-histogram>=2023.10.0", diff --git a/tests/test_dataset_tools.py b/tests/test_dataset_tools.py index ca46c0060..3500fdc3f 100644 --- a/tests/test_dataset_tools.py +++ b/tests/test_dataset_tools.py @@ -2,7 +2,13 @@ import pytest from distributed import Client -from coffea.dataset_tools import apply_to_fileset, max_chunks, preprocess, slice_chunks +from coffea.dataset_tools import ( + apply_to_fileset, + get_failed_steps_for_fileset, + max_chunks, + preprocess, + slice_chunks, +) from coffea.nanoevents import BaseSchema, NanoAODSchema from coffea.processor.test_items import NanoEventsProcessor, NanoTestProcessor @@ -32,6 +38,56 @@ }, } +_starting_fileset_with_steps = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + }, + "tests/samples/nano_dimuon_not_there.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + }, + } + }, +} + _runnable_result = { "ZJets": { "files": { @@ -218,3 +274,35 @@ def test_slicechunks(): } }, } + + +def test_recover_failed_chunks(): + with Client() as _: + to_compute = apply_to_fileset( + NanoEventsProcessor(), + _starting_fileset_with_steps, + schemaclass=NanoAODSchema, + uproot_options={"allow_read_errors_with_report": True}, + ) + out, reports = dask.compute(*to_compute) + + failed_fset = get_failed_steps_for_fileset(_starting_fileset_with_steps, reports) + assert failed_fset == { + "Data": { + "files": { + "tests/samples/nano_dimuon_not_there.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + } + } + } + } From 994ed4a463d191c7b8e60fa4e801465a05e79509 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 16:08:50 -0400 Subject: [PATCH 80/80] typing and docs --- src/coffea/dataset_tools/apply_processor.py | 46 ++++++++++++- src/coffea/dataset_tools/manipulations.py | 73 +++++++++++++++++++-- src/coffea/dataset_tools/preprocess.py | 56 +++++++++++++++- 3 files changed, 167 insertions(+), 8 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 698cd916a..324dfd908 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -35,7 +35,29 @@ def apply_to_dataset( schemaclass: BaseSchema = NanoAODSchema, metadata: dict[Hashable, Any] = {}, uproot_options: dict[str, Any] = {}, -) -> DaskOutputType: +) -> DaskOutputType | tuple[DaskOutputType, 
dask_awkward.Array]: + """ + Apply the supplied function or processor to the supplied dataset. + Parameters + ---------- + data_manipulation : ProcessorABC or GenericHEPAnalysis + The user analysis code to run on the input dataset + dataset: DatasetSpec | DatasetSpecOptional + The data to be acted upon by the data manipulation passed in. + schemaclass: BaseSchema, default NanoAODSchema + The nanoevents schema to interpret the input dataset with. + metadata: dict[Hashable, Any], default {} + Metadata for the dataset that is accessible by the input analysis. Should also be dask-serializable. + uproot_options: dict[str, Any], default {} + Options to pass to uproot. Pass at least {"allow_read_errors_with_report": True} to turn on file access reports. + + Returns + ------- + out : DaskOutputType + The output of the analysis workflow applied to the dataset + report : dask_awkward.Array, optional + The file access report for running the analysis on the input dataset. Needs to be computed simultaneously with the analysis to be accurate. + """ files = dataset["files"] events = NanoEventsFactory.from_root( files, @@ -66,7 +88,27 @@ def apply_to_fileset( fileset: FilesetSpec | FilesetSpecOptional, schemaclass: BaseSchema = NanoAODSchema, uproot_options: dict[str, Any] = {}, -) -> dict[str, DaskOutputType]: +) -> dict[str, DaskOutputType] | tuple[dict[str, DaskOutputType], dask_awkward.Array]: + """ + Apply the supplied function or processor to the supplied fileset (set of datasets). + Parameters + ---------- + data_manipulation : ProcessorABC or GenericHEPAnalysis + The user analysis code to run on the input dataset + fileset: FilesetSpec | FilesetSpecOptional + The data to be acted upon by the data manipulation passed in. Metadata within the fileset should be dask-serializable. + schemaclass: BaseSchema, default NanoAODSchema + The nanoevents schema to interpret the input dataset with. + uproot_options: dict[str, Any], default {} + Options to pass to uproot. Pass at least {"allow_read_errors_with_report": True} to turn on file access reports. + + Returns + ------- + out : dict[str, DaskOutputType] + The output of the analysis workflow applied to the datasets, keyed by dataset name. + report : dask_awkward.Array, optional + The file access report for running the analysis on the input fileset. Needs to be computed simultaneously with the analysis to be accurate. + """ out = {} report = {} for name, dataset in fileset.items(): diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index e18e4ec4a..081e1d97d 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -1,14 +1,47 @@ +from __future__ import annotations + import copy +from typing import Any import awkward import numpy +from coffea.dataset_tools.preprocess import DatasetSpec, FilesetSpec + + +def max_chunks(fileset: FilesetSpec, maxchunks: int | None = None) -> FilesetSpec: + """ + Modify the input fileset so that only the first "maxchunks" chunks of each file will be processed. + Parameters + ---------- + fileset: FilesetSpec + The set of datasets to reduce to max-chunks row-ranges. + maxchunks: int | None, default None + How many chunks to keep for each file. -def max_chunks(fileset, maxchunks=None): + Returns + ------- + out : FilesetSpec + The reduced fileset with only the first maxchunks event ranges left in.
+ """ return slice_chunks(fileset, slice(maxchunks)) -def slice_chunks(fileset, theslice=slice(None)): +def slice_chunks(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSpec: + """ + Modify the input fileset so that only the chunks of each file specified by the input slice are processed. + Parameters + ---------- + fileset: FilesetSpec + The set of datasets to be sliced. + theslice: Any, default slice(None) + How to slice the array of row-ranges (steps) in the input fileset. + + Returns + ------- + out : FilesetSpec + The reduced fileset with only the row-ranges specified by theslice left. + """ if not isinstance(theslice, slice): theslice = slice(theslice) @@ -20,7 +53,23 @@ def slice_chunks(fileset, theslice=slice(None)): return out -def get_failed_steps_for_dataset(dataset, report): +def get_failed_steps_for_dataset( + dataset: DatasetSpec, report: awkward.Array +) -> DatasetSpec: + """ + Modify an input dataset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied report. + Parameters + ---------- + dataset: DatasetSpec + The dataset to be reduced to only contain files and row-ranges that have previously encountered failed file access. + report: awkward.Array + The computed file-access error report from dask-awkward. + + Returns + ------- + out : DatasetSpec + The reduced dataset with only the row-ranges and files that failed processing, according to the input report. + """ failed_dataset = copy.deepcopy(dataset) failed_dataset["files"] = {} failures = report[~awkward.is_none(report.exception)] if not awkward.all(report.args[:, 4] == "True"): @@ -60,7 +109,23 @@ def get_failed_steps_for_dataset(dataset, report): -def get_failed_steps_for_fileset(fileset, report_dict): +def get_failed_steps_for_fileset( + fileset: FilesetSpec, report_dict: dict[str, awkward.Array] +): + """ + Modify an input fileset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied reports. + Parameters + ---------- + fileset: FilesetSpec + The set of datasets to be reduced to only contain files and row-ranges that have previously encountered failed file access. + report_dict: dict[str, awkward.Array] + The computed file-access error reports from dask-awkward, indexed by dataset name. + + Returns + ------- + out : FilesetSpec + The reduced fileset with only the row-ranges and files that failed processing, according to the input reports. + """ failed_fileset = {} for name, dataset in fileset.items(): failed_dataset = get_failed_steps_for_dataset(dataset, report_dict[name]) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index da4ca7d6a..94ddf2507 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -14,12 +14,38 @@ def _get_steps( normed_files: awkward.Array | dask_awkward.Array, - maybe_step_size: None | int = None, + maybe_step_size: int | None = None, align_clusters: bool = False, recalculate_seen_steps: bool = False, skip_bad_files: bool = False, - file_exceptions: Exception | Warning = (FileNotFoundError, OSError), + file_exceptions: Exception + | Warning + | tuple[Exception | Warning] = (FileNotFoundError, OSError), ) -> awkward.Array | dask_awkward.Array: + """ + Given a list of normalized file and object paths (defined in uproot), determine the steps for each file according to the supplied processing options. + Parameters + ---------- + normed_files: awkward.Array | dask_awkward.Array + The list of normalized file descriptions to process for steps.
+ + maybe_step_size: int | None, default None + If specified, the size of the steps to make when analyzing the input files. + align_clusters: bool, default False + Round to the cluster size in a root file, when chunks are specified. Reduces data transfer in + analysis. + recalculate_seen_steps: bool, default False + If steps are present in the input normed files, force the recalculation of those steps, instead + of only recalculating the steps if the uuid has changed. + skip_bad_files: bool, default False + Instead of failing, catch exceptions specified by file_exceptions and return null data. + file_exceptions: Exception | Warning | tuple[Exception | Warning], default (FileNotFoundError, OSError) + What exceptions to catch when skipping bad files. + + Returns + ------- + array : awkward.Array | dask_awkward.Array + The normalized file descriptions, appended with the calculated steps for those files. + """ nf_backend = awkward.backend(normed_files) lz_or_nf = awkward.typetracer.length_zero_if_typetracer(normed_files) @@ -144,6 +170,32 @@ def preprocess( skip_bad_files: bool = False, file_exceptions: Exception | Warning = (FileNotFoundError, OSError), ) -> tuple[FilesetSpec, FilesetSpecOptional]: + """ + Given a fileset of datasets, determine the steps (row-ranges) for each file in each dataset according to the supplied processing options. + Parameters + ---------- + fileset: FilesetSpecOptional + The set of datasets whose files will be preprocessed. + maybe_step_size: int | None, default None + If specified, the size of the steps to make when analyzing the input files. + align_clusters: bool, default False + Round to the cluster size in a root file, when chunks are specified. Reduces data transfer in + analysis. + recalculate_seen_steps: bool, default False + If steps are present in the input normed files, force the recalculation of those steps, + instead of only recalculating the steps if the uuid has changed. + skip_bad_files: bool, default False + Instead of failing, catch exceptions specified by file_exceptions and return null data. + file_exceptions: Exception | Warning | tuple[Exception | Warning], default (FileNotFoundError, OSError) + What exceptions to catch when skipping bad files. + + Returns + ------- + out_available : FilesetSpec + The subset of files in each dataset that were successfully preprocessed, organized by dataset. + out_updated : FilesetSpecOptional + The original set of datasets including files that were not accessible, updated to include the result of preprocessing where available. + """ out_updated = copy.deepcopy(fileset) out_available = copy.deepcopy(fileset) all_ak_norm_files = {}
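
Taken together, PATCH 76-80 wire up a preprocess / apply / recover loop for the dataset_tools API. A minimal usage sketch follows, modeled on the test added in PATCH 79/80 (test_recover_failed_chunks); the example fileset contents, the use of the test-suite NanoEventsProcessor, and the preprocess keyword arguments are illustrative assumptions rather than part of the patches themselves.

import dask

from coffea.dataset_tools import (
    apply_to_fileset,
    get_failed_steps_for_fileset,
    max_chunks,
    preprocess,
)
from coffea.nanoevents import NanoAODSchema
from coffea.processor.test_items import NanoEventsProcessor

# A fileset in the uproot-normalized form used throughout these patches
# (placeholder path; dataset name -> {"files": {path: {"object_path": ...}}}).
fileset = {
    "ZJets": {
        "files": {"tests/samples/nano_dy.root": {"object_path": "Events"}},
    },
}

# 1. Determine the per-file row-ranges ("steps"), skipping files that cannot be opened.
#    Returns the runnable subset and the full updated fileset specification.
available, updated = preprocess(fileset, skip_bad_files=True)

# Optionally keep only the first two steps of each file, e.g. for a quick test run.
available = max_chunks(available, 2)

# 2. Build the task graph, with per-step access reports enabled so failures can be recovered.
to_compute = apply_to_fileset(
    NanoEventsProcessor(),
    available,
    schemaclass=NanoAODSchema,
    uproot_options={"allow_read_errors_with_report": True},
)
out, reports = dask.compute(*to_compute)

# 3. Collect the steps that failed, in the same fileset format, ready to be resubmitted.
failed_fileset = get_failed_steps_for_fileset(available, reports)

Because the failed steps come back as an ordinary fileset, they can be passed through apply_to_fileset again to retry only the row-ranges that did not complete.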